diff options
author | imarom <imarom@cisco.com> | 2016-06-01 15:52:00 +0300 |
---|---|---|
committer | imarom <imarom@cisco.com> | 2016-06-02 13:45:12 +0300 |
commit | 3c4a29e15f3663f6413fbee2562d7d0aa4e2f80d (patch) | |
tree | c9742549ad7a8013f43077dceb5fa9eacf0aaadf /src/trex_watchdog.cpp | |
parent | b639fb458fb2388164adaf45c4e947a2af2ca0e1 (diff) |
watchdog phase 2
Diffstat (limited to 'src/trex_watchdog.cpp')
-rw-r--r-- | src/trex_watchdog.cpp | 248 |
1 files changed, 235 insertions, 13 deletions
diff --git a/src/trex_watchdog.cpp b/src/trex_watchdog.cpp
index b3a0733c..d38809fc 100644
--- a/src/trex_watchdog.cpp
+++ b/src/trex_watchdog.cpp
@@ -26,21 +26,187 @@ limitations under the License.
 #include <unistd.h>
 #include <sstream>

-int WatchDog::register_monitor(const std::string &name, double timeout_sec) {
+#include <sys/ptrace.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <pthread.h>
+#include <signal.h>
+#include <string.h>
+#include <iostream>
+#include <stdexcept>
+
+static TrexWatchDog::monitor_st *global_monitor;
+
+const char *get_exe_name();
+
+std::string exec(const char* cmd) {
+    char buffer[128];
+    std::string result = "";
+    std::shared_ptr<FILE> pipe(popen(cmd, "r"), pclose);
+    if (!pipe) throw std::runtime_error("popen() failed!");
+    while (!feof(pipe.get())) {
+        if (fgets(buffer, 128, pipe.get()) != NULL) {
+            result += buffer;
+        }
+    }
+    return result;
+}
+
+// This function produces a stack backtrace with demangled function & method names.
+__attribute__((noinline))
+std::string Backtrace(int skip = 1)
+{
+    void *callstack[128];
+    const int nMaxFrames = sizeof(callstack) / sizeof(callstack[0]);
+    char buf[1024];
+    int nFrames = backtrace(callstack, nMaxFrames);
+    char **symbols = backtrace_symbols(callstack, nFrames);
+
+    std::ostringstream trace_buf;
+    for (int i = skip; i < nFrames; i++) {
+
+        Dl_info info;
+        if (dladdr(callstack[i], &info) && info.dli_sname) {
+            char *demangled = NULL;
+            int status = -1;
+            if (info.dli_sname[0] == '_')
+                demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status);
+            snprintf(buf, sizeof(buf), "%-3d %*p %s + %zd\n",
+                     i, int(2 + sizeof(void*) * 2), callstack[i],
+                     status == 0 ? demangled :
+                     info.dli_sname == 0 ? symbols[i] : info.dli_sname,
+                     (char *)callstack[i] - (char *)info.dli_saddr);
+            free(demangled);
+        } else {
+            snprintf(buf, sizeof(buf), "%-3d %*p %s\n",
+                     i, int(2 + sizeof(void*) * 2), callstack[i], symbols[i]);
+        }
+        trace_buf << buf;
+    }
+    free(symbols);
+    if (nFrames == nMaxFrames)
+        trace_buf << "[truncated]\n";
+
+    /* add the addr2line info */
+    std::stringstream addr2line;
+
+    addr2line << "/usr/bin/addr2line -e " << get_exe_name() << " ";
+    for (int i = skip; i < nFrames; i++) {
+        addr2line << callstack[i] << " ";
+    }
+
+    trace_buf << "\n\n*** addr2line information follows ***\n\n";
+    try {
+        trace_buf << exec(addr2line.str().c_str());
+    } catch (std::runtime_error &e) {
+        trace_buf << "\n" << e.what();
+    }
+
+    return trace_buf.str();
+}
+
+__attribute__((noinline))
+static void _callstack_signal_handler(int signr, siginfo_t *info, void *secret) {
+    std::stringstream ss;
+
+    double now = now_sec();
+
+    ss << "WATCHDOG: task '" << global_monitor->name << "' has not responded for more than " << (now - global_monitor->ts) << " seconds - timeout is " << global_monitor->timeout_sec << " seconds";
+
+    std::string backtrace = Backtrace();
+    ss << "\n\n*** traceback follows ***\n\n" << backtrace << "\n";
+
+    throw std::runtime_error(ss.str());
+}
+
+void TrexWatchDog::mark_pending_monitor(int count) {
+    std::unique_lock<std::mutex> lock(m_lock);
+    m_pending += count;
+    lock.unlock();
+}
+
+void TrexWatchDog::block_on_pending(int max_block_time_ms) {
+
+    int timeout_msec = max_block_time_ms;
+
+    std::unique_lock<std::mutex> lock(m_lock);
+
+    while (m_pending > 0) {
+
+        lock.unlock();
+        delay(1);
+        lock.lock();
+
+        timeout_msec -= 1;
+        if (timeout_msec == 0) {
+            throw TrexException("WATCHDOG: block on pending monitors timed out");
+        }
+    }
+
+    /* lock will be released */
+}
+
+/**
+ * register a monitor
+ * must be called from the relevant thread
+ *
+ * this function is thread safe
+ *
+ * @author imarom (01-Jun-16)
+ *
+ * @param name
+ * @param timeout_sec
+ *
+ * @return int
+ */
+int TrexWatchDog::register_monitor(const std::string &name, double timeout_sec) {
     monitor_st monitor;

+    /* cannot add monitors while active */
+    assert(m_active == false);
+
+    monitor.tid = pthread_self();
     monitor.name = name;
     monitor.timeout_sec = timeout_sec;
     monitor.tickled = true;
     monitor.ts = 0;

+    /* critical section start */
+    std::unique_lock<std::mutex> lock(m_lock);
+
+    /* make sure no double register */
+    for (auto &m : m_monitors) {
+        if (m.tid == pthread_self()) {
+            std::stringstream ss;
+            ss << "WATCHDOG: double register detected\n\n" << Backtrace();
+            throw TrexException(ss.str());
+        }
+    }
+
     monitor.handle = m_monitors.size();
     m_monitors.push_back(monitor);

+    assert(m_pending > 0);
+    m_pending--;
+
+    /* critical section end */
+    lock.unlock();
+
     return monitor.handle;
 }

-void WatchDog::tickle(int handle) {
+/**
+ * thread safe function
+ *
+ */
+void TrexWatchDog::tickle(int handle) {
+
+    /* ignore ticks if not active */
+    if (!m_active) {
+        return;
+    }
+
     assert(handle < m_monitors.size());

     /* not nesscary but write gets cache invalidate for nothing */
@@ -51,24 +217,74 @@ void WatchDog::tickle(int handle) {
     m_monitors[handle].tickled = true;
 }

-void WatchDog::start() {
+void TrexWatchDog::register_signal() {
+
+    /* do this once */
+    if (g_signal_init) {
+        return;
+    }
+
+    /* register a handler on SIG ALARM */
+    struct sigaction sa;
+    memset (&sa, '\0', sizeof(sa));
+
+    sa.sa_flags = SA_SIGINFO;
+    sa.sa_sigaction = _callstack_signal_handler;
+
+    int rc = sigaction(SIGALRM , &sa, NULL);
+    assert(rc == 0);
+
+    g_signal_init = true;
+}
+
+void TrexWatchDog::start() {
+
+    block_on_pending();
+
+    /* no pending monitors */
+    assert(m_pending == 0);
+
+    /* under GDB - disable the watchdog */
+    if (ptrace(PTRACE_TRACEME, 0, NULL, 0) == -1) {
+        printf("\n\n*** GDB detected - disabling watchdog... ***\n\n");
+        return;
+    }
+
+    register_signal();
+
     m_active = true;
-    m_thread = new std::thread(&WatchDog::_main, this);
+    m_thread = new std::thread(&TrexWatchDog::_main, this);
     if (!m_thread) {
         throw TrexException("unable to create watchdog thread");
     }
 }

-void WatchDog::stop() {
-    m_thread->join();
-    delete m_thread;
+void TrexWatchDog::stop() {
+    m_active = false;
+
+    if (m_thread) {
+        m_thread->join();
+        delete m_thread;
+        m_thread = NULL;
+    }
+
+    m_monitors.clear();
 }

+
+
 /**
  * main loop
  *
  */
-void WatchDog::_main() {
+void TrexWatchDog::_main() {
+
+    /* reset all the monitors */
+    for (auto &monitor : m_monitors) {
+        monitor.tickled = true;
+    }
+
+    /* start main loop */
     while (m_active) {

         dsec_t now = now_sec();
@@ -83,15 +299,21 @@ void WatchDog::_main() {

             /* the bit is off - check the time first */
             if ( (now - monitor.ts) > monitor.timeout_sec ) {
-                std::stringstream ss;
-                ss << "WATCHDOG: task '" << monitor.name << "' has not responded for more than " << (now - monitor.ts) << " seconds - timeout is " << monitor.timeout_sec << " seconds";
-                throw TrexException(ss.str());
-                assert(0);
+                global_monitor = &monitor;
+
+                pthread_kill(monitor.tid, SIGALRM);
+
+                /* nothing to do more... the other thread will terminate, but if not - we terminate */
+                sleep(5);
+                printf("\n\n*** WATCHDOG violation detected on task '%s' which have failed to response to the signal ***\n\n", monitor.name.c_str());
+                exit(1);
             }

         }

-        sleep(1);
+        /* the internal clock - 250 ms */
+        delay(250);
     }
 }

+bool TrexWatchDog::g_signal_init = false; |