summaryrefslogtreecommitdiffstats
path: root/src/trex_watchdog.cpp
diff options
context:
space:
mode:
author imarom <imarom@cisco.com> 2016-06-01 15:52:00 +0300
committer imarom <imarom@cisco.com> 2016-06-02 13:45:12 +0300
commit 3c4a29e15f3663f6413fbee2562d7d0aa4e2f80d (patch)
tree c9742549ad7a8013f43077dceb5fa9eacf0aaadf /src/trex_watchdog.cpp
parent b639fb458fb2388164adaf45c4e947a2af2ca0e1 (diff)
watchdog phase 2
Diffstat (limited to 'src/trex_watchdog.cpp')
-rw-r--r-- src/trex_watchdog.cpp 248
1 files changed, 235 insertions, 13 deletions
diff --git a/src/trex_watchdog.cpp b/src/trex_watchdog.cpp
index b3a0733c..d38809fc 100644
--- a/src/trex_watchdog.cpp
+++ b/src/trex_watchdog.cpp
@@ -26,21 +26,187 @@ limitations under the License.
#include <unistd.h>
#include <sstream>
-int WatchDog::register_monitor(const std::string &name, double timeout_sec) {
+#include <sys/ptrace.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <pthread.h>
+#include <signal.h>
+#include <string.h>
+#include <iostream>
+#include <stdexcept>
+
+/* the monitor that tripped the watchdog: written by the watchdog main loop
+ * right before it sends SIGALRM to the monitored thread, and read by the
+ * SIGALRM handler to build the error message */
+static TrexWatchDog::monitor_st *global_monitor;
+
+/* defined outside this file; the returned string is passed to addr2line
+ * as the executable to resolve addresses against */
+const char *get_exe_name();
+
+/**
+ * run a shell command through popen() and return everything it wrote to
+ * its standard output
+ *
+ * @param cmd  command line handed to popen() as is
+ *
+ * @return the captured output (empty if the command printed nothing)
+ *
+ * throws std::runtime_error if popen() fails
+ */
+std::string exec(const char* cmd) {
+    char buffer[128];
+    std::string result = "";
+
+    /* check the stream BEFORE giving it to the shared_ptr: a shared_ptr runs
+       its deleter even when it holds a null pointer, which would end up
+       calling pclose(NULL) on the failure path */
+    FILE *stream = popen(cmd, "r");
+    if (!stream) throw std::runtime_error("popen() failed!");
+    std::shared_ptr<FILE> pipe(stream, pclose);
+
+    /* fgets() returns NULL on end-of-file and on a read error, so the loop
+       ends in both cases (testing feof() alone never ends on an error) */
+    while (fgets(buffer, sizeof(buffer), pipe.get()) != NULL) {
+        result += buffer;
+    }
+    return result;
+}
+
+/**
+ * produce a stack backtrace of the calling thread with demangled
+ * function & method names, followed by the output of addr2line for the
+ * same frames
+ *
+ * @param skip  number of innermost frames to leave out (the default of 1
+ *              drops the frame of Backtrace() itself)
+ *
+ * @return the formatted trace; if addr2line could not be launched, the
+ *         error text is appended instead of its output
+ *
+ * NOTE(review): this is also called from the SIGALRM handler while it
+ * allocates memory and spawns a process - confirm this is acceptable there
+ */
+__attribute__((noinline))
+std::string Backtrace(int skip = 1)
+{
+    void *callstack[128];
+    const int nMaxFrames = sizeof(callstack) / sizeof(callstack[0]);
+    char buf[1024];
+    int nFrames = backtrace(callstack, nMaxFrames);
+    char **symbols = backtrace_symbols(callstack, nFrames);
+
+    std::ostringstream trace_buf;
+    for (int i = skip; i < nFrames; i++) {
+
+        /* backtrace_symbols() returns NULL when it cannot allocate the table */
+        const char *raw_symbol = symbols ? symbols[i] : "<symbol unavailable>";
+
+        Dl_info info;
+        if (dladdr(callstack[i], &info) && info.dli_sname) {
+            char *demangled = NULL;
+            int status = -1;
+            /* only names starting with '_' can be mangled C++ names */
+            if (info.dli_sname[0] == '_')
+                demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status);
+            /* the offset is a pointer difference (ptrdiff_t), hence %td */
+            snprintf(buf, sizeof(buf), "%-3d %*p %s + %td\n",
+                     i, int(2 + sizeof(void*) * 2), callstack[i],
+                     status == 0 ? demangled : info.dli_sname,
+                     (char *)callstack[i] - (char *)info.dli_saddr);
+            free(demangled);
+        } else {
+            snprintf(buf, sizeof(buf), "%-3d %*p %s\n",
+                     i, int(2 + sizeof(void*) * 2), callstack[i], raw_symbol);
+        }
+        trace_buf << buf;
+    }
+    free(symbols);
+    if (nFrames == nMaxFrames)
+        trace_buf << "[truncated]\n";
+
+    /* add the addr2line info */
+    std::stringstream addr2line;
+
+    addr2line << "/usr/bin/addr2line -e " << get_exe_name() << " ";
+    for (int i = skip; i < nFrames; i++) {
+        addr2line << callstack[i] << " ";
+    }
+
+    trace_buf << "\n\n*** addr2line information follows ***\n\n";
+    try {
+        trace_buf << exec(addr2line.str().c_str());
+    } catch (std::runtime_error &e) {
+        trace_buf << "\n" << e.what();
+    }
+
+    return trace_buf.str();
+}
+
+/**
+ * SIGALRM handler of the watchdog
+ *
+ * builds a report for the monitor pointed to by global_monitor (task name,
+ * time without response, configured timeout) followed by a backtrace, and
+ * throws it as a std::runtime_error
+ *
+ * the three arguments are the standard SA_SIGINFO handler arguments and
+ * are not used
+ */
+__attribute__((noinline))
+static void _callstack_signal_handler(int signr, siginfo_t *info, void *secret) {
+    const double current_time = now_sec();
+    const double silent_for   = current_time - global_monitor->ts;
+
+    std::stringstream report;
+    report << "WATCHDOG: task '" << global_monitor->name
+           << "' has not responded for more than " << silent_for
+           << " seconds - timeout is " << global_monitor->timeout_sec << " seconds";
+
+    report << "\n\n*** traceback follows ***\n\n" << Backtrace() << "\n";
+
+    throw std::runtime_error(report.str());
+}
+
+/**
+ * add 'count' to the number of monitors that are still expected to register
+ *
+ * the counter is updated under m_lock
+ */
+void TrexWatchDog::mark_pending_monitor(int count) {
+    std::lock_guard<std::mutex> guard(m_lock);
+    m_pending += count;
+}
+
+/**
+ * block until no monitor is pending registration
+ *
+ * polls m_pending, sleeping with delay(1) between checks; m_lock is
+ * released while sleeping so that register_monitor() can make progress
+ *
+ * @param max_block_time_ms  maximum number of polling steps to wait;
+ *                           a value <= 0 means do not wait at all
+ *
+ * throws TrexException if monitors are still pending once the budget
+ * has been used up
+ */
+void TrexWatchDog::block_on_pending(int max_block_time_ms) {
+
+    int timeout_msec = max_block_time_ms;
+
+    std::unique_lock<std::mutex> lock(m_lock);
+
+    while (m_pending > 0) {
+
+        /* checked before sleeping: a non-positive budget fails right away
+           instead of counting down below zero, and a registration that
+           completed during the last sleep is not reported as a timeout */
+        if (timeout_msec <= 0) {
+            throw TrexException("WATCHDOG: block on pending monitors timed out");
+        }
+
+        lock.unlock();
+        delay(1);
+        lock.lock();
+
+        timeout_msec -= 1;
+    }
+
+    /* lock will be released */
+}
+
+/**
+ * register a monitor for the calling thread
+ * must be called from the relevant thread - the thread id stored in the
+ * monitor is taken from pthread_self()
+ *
+ * the monitor list and the pending counter are updated under m_lock;
+ * registration is only allowed while the watchdog is not active
+ *
+ * @author imarom (01-Jun-16)
+ *
+ * @param name         task name stored in the monitor
+ * @param timeout_sec  timeout in seconds stored in the monitor
+ *
+ * @return int         the monitor handle - its index in m_monitors
+ *
+ * throws TrexException if the calling thread already has a monitor
+ */
+int TrexWatchDog::register_monitor(const std::string &name, double timeout_sec) {
monitor_st monitor;
+ /* cannot add monitors while active */
+ assert(m_active == false);
+
+ monitor.tid = pthread_self();
monitor.name = name;
monitor.timeout_sec = timeout_sec;
monitor.tickled = true;
monitor.ts = 0;
+ /* critical section start */
+ std::unique_lock<std::mutex> lock(m_lock);
+
+ /* make sure no double register */
+ for (auto &m : m_monitors) {
+ if (m.tid == pthread_self()) {
+ std::stringstream ss;
+ ss << "WATCHDOG: double register detected\n\n" << Backtrace();
+ throw TrexException(ss.str());
+ }
+ }
+
monitor.handle = m_monitors.size();
m_monitors.push_back(monitor);
+ /* every registration consumes one pending slot */
+ assert(m_pending > 0);
+ m_pending--;
+
+ /* critical section end */
+ lock.unlock();
+
return monitor.handle;
}
-void WatchDog::tickle(int handle) {
+/**
+ * thread safe function
+ *
+ */
+void TrexWatchDog::tickle(int handle) {
+
+ /* ignore ticks if not active */
+ if (!m_active) {
+ return;
+ }
+
assert(handle < m_monitors.size());
/* not nesscary but write gets cache invalidate for nothing */
@@ -51,24 +217,74 @@ void WatchDog::tickle(int handle) {
m_monitors[handle].tickled = true;
}
-void WatchDog::start() {
+/**
+ * install _callstack_signal_handler as the SIGALRM handler
+ *
+ * guarded by g_signal_init, so only the first call does any work
+ */
+void TrexWatchDog::register_signal() {
+
+    /* the handler is installed on the first call only */
+    if (!g_signal_init) {
+
+        struct sigaction action;
+        memset(&action, 0, sizeof(action));
+
+        /* three-argument handler form */
+        action.sa_sigaction = _callstack_signal_handler;
+        action.sa_flags     = SA_SIGINFO;
+
+        int status = sigaction(SIGALRM, &action, NULL);
+        assert(status == 0);
+
+        g_signal_init = true;
+    }
+}
+
+/**
+ * start the watchdog
+ *
+ * waits for all pending monitors to register, installs the SIGALRM
+ * handler and launches the watchdog thread running _main()
+ *
+ * returns without starting anything if the ptrace() probe fails, which is
+ * treated as running under a debugger
+ */
+void TrexWatchDog::start() {
+
+ block_on_pending();
+
+ /* no pending monitors */
+ assert(m_pending == 0);
+
+ /* under GDB - disable the watchdog */
+ /* NOTE(review): a successful PTRACE_TRACEME leaves the process marked as */
+ /* traced - confirm a later start() is not mistaken for a debugger */
+ if (ptrace(PTRACE_TRACEME, 0, NULL, 0) == -1) {
+ printf("\n\n*** GDB detected - disabling watchdog... ***\n\n");
+ return;
+ }
+
+ register_signal();
+
m_active = true;
- m_thread = new std::thread(&WatchDog::_main, this);
+ m_thread = new std::thread(&TrexWatchDog::_main, this);
+ /* NOTE(review): plain new reports failure by throwing, so this check is */
+ /* not expected to trigger */
if (!m_thread) {
throw TrexException("unable to create watchdog thread");
}
}
-void WatchDog::stop() {
- m_thread->join();
- delete m_thread;
+/**
+ * stop the watchdog
+ *
+ * clears m_active so the main loop exits, joins and deletes the watchdog
+ * thread if one was created, and drops all registered monitors
+ *
+ * safe to call when start() never created a thread (m_thread is checked)
+ */
+void TrexWatchDog::stop() {
+ /* NOTE(review): m_active is also read by the watchdog thread - confirm */
+ /* its declaration makes this write visible to that thread */
+ m_active = false;
+
+ if (m_thread) {
+ m_thread->join();
+ delete m_thread;
+ m_thread = NULL;
+ }
+
+ m_monitors.clear();
}
+
+
/**
* main loop
*
*/
-void WatchDog::_main() {
+void TrexWatchDog::_main() {
+
+ /* reset all the monitors */
+ for (auto &monitor : m_monitors) {
+ monitor.tickled = true;
+ }
+
+ /* start main loop */
while (m_active) {
dsec_t now = now_sec();
@@ -83,15 +299,21 @@ void WatchDog::_main() {
/* the bit is off - check the time first */
if ( (now - monitor.ts) > monitor.timeout_sec ) {
- std::stringstream ss;
- ss << "WATCHDOG: task '" << monitor.name << "' has not responded for more than " << (now - monitor.ts) << " seconds - timeout is " << monitor.timeout_sec << " seconds";
- throw TrexException(ss.str());
- assert(0);
+ global_monitor = &monitor;
+
+ pthread_kill(monitor.tid, SIGALRM);
+
+ /* nothing to do more... the other thread will terminate, but if not - we terminate */
+ sleep(5);
+ printf("\n\n*** WATCHDOG violation detected on task '%s' which have failed to response to the signal ***\n\n", monitor.name.c_str());
+ exit(1);
}
}
- sleep(1);
+ /* the internal clock - 250 ms */
+ delay(250);
}
}
+/* set to true once register_signal() has installed the SIGALRM handler */
+bool TrexWatchDog::g_signal_init = false;