diff options
Diffstat (limited to 'src/trex_watchdog.cpp')
-rw-r--r-- | src/trex_watchdog.cpp | 331 |
1 files changed, 331 insertions, 0 deletions
diff --git a/src/trex_watchdog.cpp b/src/trex_watchdog.cpp new file mode 100644 index 00000000..e78e8e6d --- /dev/null +++ b/src/trex_watchdog.cpp @@ -0,0 +1,331 @@ +/* + Itay Marom + Cisco Systems, Inc. +*/ + +/* +Copyright (c) 2015-2015 Cisco Systems, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "trex_watchdog.h" +#include "trex_exception.h" + +#include <assert.h> +#include <unistd.h> +#include <sstream> + +#include <sys/ptrace.h> +#include <execinfo.h> +#include <cxxabi.h> +#include <dlfcn.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include <iostream> +#include <stdexcept> + +#define DISABLE_WATCHDOG_ON_GDB + +static TrexWatchDog::monitor_st *global_monitor; + +const char *get_exe_name(); + +std::string exec(const char* cmd) { + char buffer[128]; + std::string result = ""; + std::shared_ptr<FILE> pipe(popen(cmd, "r"), pclose); + if (!pipe) throw std::runtime_error("popen() failed!"); + while (!feof(pipe.get())) { + if (fgets(buffer, 128, pipe.get()) != NULL) { + result += buffer; + } + } + return result; +} + +// This function produces a stack backtrace with demangled function & method names. +__attribute__((noinline)) +std::string Backtrace(int skip = 1) +{ + void *callstack[128]; + const int nMaxFrames = sizeof(callstack) / sizeof(callstack[0]); + char buf[1024]; + int nFrames = backtrace(callstack, nMaxFrames); + char **symbols = backtrace_symbols(callstack, nFrames); + + std::ostringstream trace_buf; + for (int i = skip; i < nFrames; i++) { + + Dl_info info; + if (dladdr(callstack[i], &info) && info.dli_sname) { + char *demangled = NULL; + int status = -1; + if (info.dli_sname[0] == '_') + demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status); + snprintf(buf, sizeof(buf), "%-3d %*p %s + %zd\n", + i, int(2 + sizeof(void*) * 2), callstack[i], + status == 0 ? demangled : + info.dli_sname == 0 ? symbols[i] : info.dli_sname, + (char *)callstack[i] - (char *)info.dli_saddr); + free(demangled); + } else { + snprintf(buf, sizeof(buf), "%-3d %*p %s\n", + i, int(2 + sizeof(void*) * 2), callstack[i], symbols[i]); + } + trace_buf << buf; + } + free(symbols); + if (nFrames == nMaxFrames) + trace_buf << "[truncated]\n"; + + /* add the addr2line info */ + std::stringstream addr2line; + + addr2line << "/usr/bin/addr2line -e " << get_exe_name() << " "; + for (int i = skip; i < nFrames; i++) { + addr2line << callstack[i] << " "; + } + + trace_buf << "\n\n*** addr2line information follows ***\n\n"; + try { + trace_buf << exec(addr2line.str().c_str()); + } catch (std::runtime_error &e) { + trace_buf << "\n" << e.what(); + } + + return trace_buf.str(); +} + +__attribute__((noinline)) +static void _callstack_signal_handler(int signr, siginfo_t *info, void *secret) { + std::stringstream ss; + + double now = now_sec(); + + ss << "WATCHDOG: task '" << global_monitor->name << "' has not responded for more than " << (now - global_monitor->ts) << " seconds - timeout is " << global_monitor->timeout_sec << " seconds"; + + std::string backtrace = Backtrace(); + ss << "\n\n*** traceback follows ***\n\n" << backtrace << "\n"; + + throw std::runtime_error(ss.str()); +} + +void TrexWatchDog::mark_pending_monitor(int count) { + std::unique_lock<std::mutex> lock(m_lock); + m_pending += count; + lock.unlock(); +} + +void TrexWatchDog::block_on_pending(int max_block_time_ms) { + + int timeout_msec = max_block_time_ms; + + std::unique_lock<std::mutex> lock(m_lock); + + while (m_pending > 0) { + + lock.unlock(); + delay(1); + lock.lock(); + + timeout_msec -= 1; + if (timeout_msec == 0) { + throw TrexException("WATCHDOG: block on pending monitors timed out"); + } + } + + /* lock will be released */ +} + +/** + * register a monitor + * must be called from the relevant thread + * + * this function is thread safe + * + * @author imarom (01-Jun-16) + * + * @param name + * @param timeout_sec + * + * @return int + */ +int TrexWatchDog::register_monitor(const std::string &name, double timeout_sec) { + monitor_st monitor; + + /* cannot add monitors while active */ + assert(m_active == false); + + monitor.active = true; + monitor.tid = pthread_self(); + monitor.name = name; + monitor.timeout_sec = timeout_sec; + monitor.tickled = true; + monitor.ts = 0; + + /* critical section start */ + std::unique_lock<std::mutex> lock(m_lock); + + /* make sure no double register */ + for (auto &m : m_monitors) { + if (m.tid == pthread_self()) { + std::stringstream ss; + ss << "WATCHDOG: double register detected\n\n" << Backtrace(); + throw TrexException(ss.str()); + } + } + + monitor.handle = m_monitors.size(); + m_monitors.push_back(monitor); + + assert(m_pending > 0); + m_pending--; + + /* critical section end */ + lock.unlock(); + + return monitor.handle; +} + +/** + * will disable the monitor - it will no longer be watched + * + */ +void TrexWatchDog::disable_monitor(int handle) { + assert(handle < m_monitors.size()); + + m_monitors[handle].active = false; +} + +/** + * thread safe function + * + */ +void TrexWatchDog::tickle(int handle) { + + assert(handle < m_monitors.size()); + + /* not nesscary but write gets cache invalidate for nothing */ + if (m_monitors[handle].tickled) { + return; + } + + m_monitors[handle].tickled = true; +} + +void TrexWatchDog::register_signal() { + + /* do this once */ + if (g_signal_init) { + return; + } + + /* register a handler on SIG ALARM */ + struct sigaction sa; + memset (&sa, '\0', sizeof(sa)); + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = _callstack_signal_handler; + + int rc = sigaction(SIGALRM , &sa, NULL); + assert(rc == 0); + + g_signal_init = true; +} + +void TrexWatchDog::start() { + + block_on_pending(); + + /* no pending monitors */ + assert(m_pending == 0); + + /* under GDB - disable the watchdog */ + #ifdef DISABLE_WATCHDOG_ON_GDB + if (ptrace(PTRACE_TRACEME, 0, NULL, 0) == -1) { + printf("\n\n*** GDB detected - disabling watchdog... ***\n\n"); + return; + } + #endif + + m_active = true; + m_thread = new std::thread(&TrexWatchDog::_main, this); + if (!m_thread) { + throw TrexException("unable to create watchdog thread"); + } +} + +void TrexWatchDog::stop() { + m_active = false; + + if (m_thread) { + m_thread->join(); + delete m_thread; + m_thread = NULL; + } +} + + + +/** + * main loop + * + */ +void TrexWatchDog::_main() { + + /* reset all the monitors */ + for (auto &monitor : m_monitors) { + monitor.tickled = true; + } + + /* start main loop */ + while (m_active) { + + dsec_t now = now_sec(); + + for (auto &monitor : m_monitors) { + + /* skip non active monitors */ + if (!monitor.active) { + continue; + } + + /* if its own - turn it off and write down the time */ + if (monitor.tickled) { + monitor.tickled = false; + monitor.ts = now; + continue; + } + + /* the bit is off - check the time first */ + if ( (now - monitor.ts) > monitor.timeout_sec ) { + global_monitor = &monitor; + + pthread_kill(monitor.tid, SIGALRM); + + /* nothing to do more... the other thread will terminate, but if not - we terminate */ + sleep(5); + printf("\n\n*** WATCHDOG violation detected on task '%s' which have failed to response to the signal ***\n\n", monitor.name.c_str()); + exit(1); + } + + } + + /* the internal clock - 250 ms */ + delay(250); + } +} + +bool TrexWatchDog::g_signal_init = false; |