diff options
Diffstat (limited to 'src/trex_watchdog.cpp')
-rw-r--r-- | src/trex_watchdog.cpp | 289 |
1 files changed, 289 insertions, 0 deletions
diff --git a/src/trex_watchdog.cpp b/src/trex_watchdog.cpp new file mode 100644 index 00000000..9b6f5865 --- /dev/null +++ b/src/trex_watchdog.cpp @@ -0,0 +1,289 @@ +/* + Itay Marom + Cisco Systems, Inc. +*/ + +/* +Copyright (c) 2015-2015 Cisco Systems, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "trex_watchdog.h" +#include "trex_exception.h" + +#include <assert.h> +#include <unistd.h> +#include <sstream> + +#include <sys/ptrace.h> +#include <execinfo.h> +#include <cxxabi.h> +#include <dlfcn.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include <iostream> +#include <stdexcept> + + +static TrexMonitor *global_monitor; + +const char *get_exe_name(); + +std::string exec(const char* cmd) { + char buffer[128]; + std::string result = ""; + std::shared_ptr<FILE> pipe(popen(cmd, "r"), pclose); + if (!pipe) throw std::runtime_error("popen() failed!"); + while (!feof(pipe.get())) { + if (fgets(buffer, 128, pipe.get()) != NULL) { + result += buffer; + } + } + return result; +} + +// This function produces a stack backtrace with demangled function & method names. +__attribute__((noinline)) +std::string Backtrace(int skip = 1) +{ + void *callstack[128]; + const int nMaxFrames = sizeof(callstack) / sizeof(callstack[0]); + char buf[1024]; + int nFrames = backtrace(callstack, nMaxFrames); + char **symbols = backtrace_symbols(callstack, nFrames); + + std::ostringstream trace_buf; + for (int i = skip; i < nFrames; i++) { + + Dl_info info; + if (dladdr(callstack[i], &info) && info.dli_sname) { + char *demangled = NULL; + int status = -1; + if (info.dli_sname[0] == '_') + demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status); + snprintf(buf, sizeof(buf), "%-3d %*p %s + %zd\n", + i, int(2 + sizeof(void*) * 2), callstack[i], + status == 0 ? demangled : + info.dli_sname == 0 ? symbols[i] : info.dli_sname, + (char *)callstack[i] - (char *)info.dli_saddr); + free(demangled); + } else { + snprintf(buf, sizeof(buf), "%-3d %*p %s\n", + i, int(2 + sizeof(void*) * 2), callstack[i], symbols[i]); + } + trace_buf << buf; + } + free(symbols); + if (nFrames == nMaxFrames) + trace_buf << "[truncated]\n"; + + /* add the addr2line info */ + std::stringstream addr2line; + + addr2line << "/usr/bin/addr2line -s -e " << get_exe_name() << " "; + for (int i = skip; i < nFrames; i++) { + addr2line << callstack[i] << " "; + } + + trace_buf << "\n\n*** addr2line information follows ***\n\n"; + try { + trace_buf << exec(addr2line.str().c_str()); + } catch (std::runtime_error &e) { + trace_buf << "\n" << e.what(); + } + + return trace_buf.str(); +} + +__attribute__((noinline)) +static void _callstack_signal_handler(int signr, siginfo_t *info, void *secret) { + std::stringstream ss; + + double now = now_sec(); + + ss << "WATCHDOG: task '" << global_monitor->get_name() << "' has not responded for more than " << global_monitor->get_interval(now) << " seconds - timeout is " << global_monitor->get_timeout_sec() << " seconds"; + + std::string backtrace = Backtrace(); + ss << "\n\n*** traceback follows ***\n\n" << backtrace << "\n"; + + throw std::runtime_error(ss.str()); +} + +/************************************** + * Trex Monitor object + *************************************/ + +void TrexMonitor::create(const std::string &name, double timeout_sec) { + m_active = true; + m_tid = pthread_self(); + m_name = name; + m_timeout_sec = timeout_sec; + m_tickled = true; + m_ts = 0; +} + +/************************************** + * Trex watchdog + *************************************/ + +void TrexWatchDog::init(bool enable){ + m_enable = enable; + if (m_enable) { + register_signal(); + } +} + +/** + * register a monitor + * this function is thread safe + * + */ +void TrexWatchDog::register_monitor(TrexMonitor *monitor) { + if (!m_enable){ + return; + } + + /* critical section start */ + std::unique_lock<std::mutex> lock(m_lock); + + /* sanity - not a must but why not... */ + for (int i = 0; i < m_mon_count; i++) { + if ( (monitor == m_monitors[i]) || (m_monitors[i]->get_tid() == pthread_self()) ) { + std::stringstream ss; + ss << "WATCHDOG: double register detected\n\n" << Backtrace(); + throw TrexException(ss.str()); + } + } + + /* check capacity */ + if (m_mon_count == MAX_MONITORS) { + std::stringstream ss; + ss << "WATCHDOG: too many registered monitors\n\n" << Backtrace(); + throw TrexException(ss.str()); + } + + /* add monitor */ + m_monitors[m_mon_count++] = monitor; + + /* critical section end */ + lock.unlock(); + +} + +void TrexWatchDog::start() { + + if (!m_enable){ + return ; + } + + m_active = true; + m_thread = new std::thread(&TrexWatchDog::_main, this); + if (!m_thread) { + throw TrexException("unable to create watchdog thread"); + } +} + +void TrexWatchDog::stop() { + + if (!m_enable){ + return ; + } + + m_active = false; + + if (m_thread) { + m_thread->join(); + delete m_thread; + m_thread = NULL; + } +} + + + +/** + * main loop + * + */ +void TrexWatchDog::_main() { + + pthread_setname_np(pthread_self(), "Trex Watchdog"); + + assert(m_enable == true); + + /* start main loop */ + while (m_active) { + + dsec_t now = now_sec(); + + /* to be on the safe side - read the count with a lock */ + std::unique_lock<std::mutex> lock(m_lock); + int count = m_mon_count; + lock.unlock(); + + for (int i = 0; i < count; i++) { + TrexMonitor *monitor = m_monitors[i]; + + /* skip non active monitors */ + if (!monitor->is_active()) { + continue; + } + + /* if its own - turn it off and write down the time */ + if (monitor->is_tickled()) { + monitor->reset(now); + continue; + } + + /* if the monitor has expired - crash */ + if (monitor->is_expired(now)) { + global_monitor = monitor; + + pthread_kill(monitor->get_tid(), SIGALRM); + + /* nothing to do more... the other thread will terminate, but if not - we terminate */ + sleep(5); + fprintf(stderr, "\n\n*** WATCHDOG violation detected on task '%s' which have failed to response to the signal ***\n\n", monitor->get_name().c_str()); + abort(); + } + + } + + /* the internal clock - 250 ms */ + delay(250); + } +} + + +void TrexWatchDog::register_signal() { + /* do this once */ + if (g_signal_init) { + return; + } + + /* register a handler on SIG ALARM */ + struct sigaction sa; + memset (&sa, '\0', sizeof(sa)); + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = _callstack_signal_handler; + + int rc = sigaction(SIGALRM , &sa, NULL); + assert(rc == 0); + + g_signal_init = true; +} + +bool TrexWatchDog::g_signal_init = false; + |