/* * Copyright 2013 Google Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA. */ /* * Author: ncardwell@google.com (Neal Cardwell) * * Implementation for the test script execution module. */ #include "run.h" #include #include #include #include #include #include #include #include #include #include #include #include "ip.h" #include "logging.h" #include "netdev.h" #include "wire_client_netdev.h" #include "parse.h" #include "run_command.h" #include "run_packet.h" #include "run_system_call.h" #include "script.h" #include "so_testing.h" #include "socket.h" #include "system.h" #include "tcp.h" #include "tcp_options.h" /* MAX_SPIN_USECS is the maximum amount of time (in microseconds) to * spin waiting for an event. We sleep up until this many microseconds * before a script event. We get the best results on tickless * (CONFIG_NO_HZ=y) kernels when we try to sleep until the exact jiffy * of a script event; this reduces the staleness/noise we see in * jiffies values on tickless kernels, since the kernel updates the * jiffies value at the time we wake, and then we execute the test * event shortly thereafter. The value below was chosen experimentally * based on experiences on a 2.2GHz machine for which there was a * measured overhead of roughly 15 usec for the unlock/usleep/lock * sequence that wait_for_event() must execute while waiting * for the next event. */ const int MAX_SPIN_USECS = 20; /* Global bool init_cmd_exed */ bool init_cmd_exed = false; /* Final command to always execute at end of script, in order to clean up: */ const char *cleanup_cmd; /* Path of currently-executing script, for use in cleanup command errors: */ const char *script_path; struct state *state_new(struct config *config, struct script *script, struct netdev *netdev) { struct state *state = calloc(1, sizeof(struct state)); if (pthread_mutex_init(&state->mutex, NULL) != 0) die_perror("pthread_mutex_init"); run_lock(state); state->config = config; state->script = script; state->netdev = netdev; state->packets = packets_new(state); state->syscalls = syscalls_new(state); state->code = code_new(config); state->fds = NULL; state->num_events = 0; return state; } /* Add the file descriptor to the list of run-time file descriptors. */ void state_add_fd(struct state *state, struct fd_state *fd) { fd->next = state->fds; state->fds = fd; } /* Close all sockets, free all the socket structs, and send a RST * packet to clean up kernel state for each connection. * TODO(ncardwell): centralize error handling and ensure test errors * always result in a call to these clean-up functions, so we can make * sure to reset connections in all cases. */ static void close_all_fds(struct state *state) { struct fd_state *fd = state->fds, *dead_fd = NULL; while (fd != NULL) { dead_fd = fd; fd = fd->next; dead_fd->ops->close(state, dead_fd); } } void state_free(struct state *state) { /* We have to stop the system call thread first, since it's using * sockets that we want to close and reset. */ syscalls_free(state, state->syscalls); /* Then we close the sockets and reset the connections, while * we still have a netdev for injecting reset packets to free * per-connection kernel state. */ close_all_fds(state); netdev_free(state->netdev); packets_free(state->packets); code_free(state->code); if (state->wire_client) wire_client_free(state->wire_client); if (state->so_instance) so_instance_free(state->so_instance); run_unlock(state); if (pthread_mutex_destroy(&state->mutex) != 0) die_perror("pthread_mutex_destroy"); memset(state, 0, sizeof(*state)); /* paranoia to help catch bugs */ free(state); } s64 now_usecs(struct state *state) { struct timeval tv; if (state->so_instance) { if (state->so_instance->ifc.gettimeofday( state->so_instance->ifc.userdata, &tv, NULL) < 0) die_perror("gettimeofday"); } else { if (gettimeofday(&tv, NULL) < 0) die_perror("gettimeofday"); } return timeval_to_usecs(&tv); } /* * Verify that something happened at the expected time. * WARNING: verify_time() should not be looking at state->event * because in some cases (checking the finish time for blocking system * calls) we call verify_time() at a time when state->event * points at an event other than the one whose time we're currently * checking. */ int verify_time(struct state *state, enum event_time_t time_type, s64 script_usecs, s64 script_usecs_end, s64 live_usecs, const char *description, char **error) { s64 expected_usecs = script_usecs - state->script_start_time_usecs; s64 expected_usecs_end = script_usecs_end - state->script_start_time_usecs; s64 actual_usecs = live_usecs - state->live_start_time_usecs; int tolerance_usecs = state->config->tolerance_usecs; DEBUGP("expected: %.3f actual: %.3f (secs)\n", usecs_to_secs(script_usecs), usecs_to_secs(actual_usecs)); if (time_type == ANY_TIME) return STATUS_OK; if (time_type == ABSOLUTE_RANGE_TIME || time_type == RELATIVE_RANGE_TIME) { DEBUGP("expected_usecs_end %.3f\n", usecs_to_secs(script_usecs_end)); if (actual_usecs < (expected_usecs - tolerance_usecs) || actual_usecs > (expected_usecs_end + tolerance_usecs)) { if (time_type == ABSOLUTE_RANGE_TIME) { asprintf(error, "timing error: expected " "%s in time range %.6f~%.6f sec " "but happened at %.6f sec", description, usecs_to_secs(script_usecs), usecs_to_secs(script_usecs_end), usecs_to_secs(actual_usecs)); } else if (time_type == RELATIVE_RANGE_TIME) { s64 offset_usecs = state->event->offset_usecs; asprintf(error, "timing error: expected " "%s in relative time range +%.6f~+%.6f " "sec but happened at %+.6f sec", description, usecs_to_secs(script_usecs - offset_usecs), usecs_to_secs(script_usecs_end - offset_usecs), usecs_to_secs(actual_usecs - offset_usecs)); } return STATUS_ERR; } else { return STATUS_OK; } } if ((actual_usecs < (expected_usecs - tolerance_usecs)) || (actual_usecs > (expected_usecs + tolerance_usecs))) { asprintf(error, "timing error: " "expected %s at %.6f sec but happened at %.6f sec; " "tolerance %.6f sec", description, usecs_to_secs(script_usecs), usecs_to_secs(actual_usecs), usecs_to_secs(tolerance_usecs)); return STATUS_ERR; } else { return STATUS_OK; } } /* Return a static string describing the given event, for error messages. */ static const char *event_description(struct event *event) { enum direction_t direction = DIRECTION_INVALID; if ((event->type <= INVALID_EVENT) || (event->type >= NUM_EVENT_TYPES)) { die("bogus event type: %d", event->type); } switch (event->type) { case PACKET_EVENT: direction = packet_direction(event->event.packet); if (direction == DIRECTION_INBOUND) return "inbound packet"; else if (direction == DIRECTION_OUTBOUND) return "outbound packet"; else assert(!"bad direction"); break; case SYSCALL_EVENT: return "system call start"; case COMMAND_EVENT: return "command"; case CODE_EVENT: return "data collection for code"; case INVALID_EVENT: case NUM_EVENT_TYPES: assert(!"bogus type"); break; /* We omit default case so compiler catches missing values. */ } return "invalid event"; } void check_event_time(struct state *state, s64 live_usecs) { char *error = NULL; const char *description = event_description(state->event); if (verify_time(state, state->event->time_type, state->event->time_usecs, state->event->time_usecs_end, live_usecs, description, &error)) { die("%s:%d: %s\n", state->config->script_path, state->event->line_number, error); } } /* Consecutive relative inbound packets should be anchored relative to the * packet start times, to avoid accumulating errors from CPU processing * overheads on consecutive packets. */ bool is_event_start_time_anchored(struct event *event) { return (event->type == PACKET_EVENT && packet_direction(event->event.packet) == DIRECTION_INBOUND); } /* Set the start (and end time, if applicable) for the event if it * uses wildcard or relative timing. */ void adjust_relative_event_times(struct state *state, struct event *event) { s64 offset_usecs = 0; if (event->time_type != ANY_TIME && event->time_type != RELATIVE_TIME && event->time_type != RELATIVE_RANGE_TIME) return; if (state->last_event && is_event_start_time_anchored(state->last_event) && is_event_start_time_anchored(event)) offset_usecs = state->last_event->time_usecs; else offset_usecs = now_usecs(state) - state->live_start_time_usecs; event->offset_usecs = offset_usecs; event->time_usecs += offset_usecs; if (event->time_type == RELATIVE_RANGE_TIME) event->time_usecs_end += offset_usecs; /* Adjust the end time of blocking system calls using relative times. */ if (event->time_type == RELATIVE_TIME && event->type == SYSCALL_EVENT && is_blocking_syscall(event->event.syscall)) { event->event.syscall->end_usecs += offset_usecs; } } void wait_for_event(struct state *state) { s64 event_usecs = script_time_to_live_time_usecs( state, state->event->time_usecs); DEBUGP("waiting until %lld -- now is %lld\n", event_usecs, now_usecs(state)); while (1) { const s64 wait_usecs = event_usecs - now_usecs(state); if (wait_usecs <= 0) break; /* If we're waiting a long time, and we are on an OS * that we know has a fine-grained usleep(), then * usleep() instead of spinning on the CPU. */ #ifdef linux /* Since the scheduler may not wake us up precisely * when we tell it to, sleep until just before the * event we're waiting for and then spin. */ if (state->num_events > 0 && wait_usecs > MAX_SPIN_USECS) { run_unlock(state); if (state->so_instance) { state->so_instance->ifc.usleep( state->so_instance->ifc.userdata, wait_usecs - MAX_SPIN_USECS); } else { usleep(wait_usecs - MAX_SPIN_USECS); } run_lock(state); } #endif /* At this point we should only have a millisecond or * two to wait, so we spin. */ } if (state->num_events > 0) check_event_time(state, now_usecs(state)); } int get_next_event(struct state *state, char **error) { DEBUGP("gettimeofday: %.6f\n", now_usecs(state)/1000000.0); if (state->event == NULL) { /* First event. */ state->event = state->script->event_list; state->script_start_time_usecs = state->event->time_usecs; if (state->event->time_usecs != 0) { asprintf(error, "%s:%d: first event should be at time 0\n", state->config->script_path, state->event->line_number); return STATUS_ERR; } } else { /* Move to the next event. */ state->script_last_time_usecs = state->event->time_usecs; state->last_event = state->event; state->event = state->event->next; } if (state->event == NULL) return STATUS_OK; /* script is done */ assert((state->event->type > INVALID_EVENT) && (state->event->type < NUM_EVENT_TYPES)); if (state->last_event && is_event_time_absolute(state->last_event) && is_event_time_absolute(state->event) && state->event->time_usecs < state->script_last_time_usecs) { asprintf(error, "%s:%d: time goes backward in script " "from %lld usec to %lld usec\n", state->config->script_path, state->event->line_number, state->script_last_time_usecs, state->event->time_usecs); return STATUS_ERR; } return STATUS_OK; } /* Run the given packet event; print warnings/errors, and exit on error. */ static void run_local_packet_event(struct state *state, struct event *event, struct packet *packet) { char *error = NULL; int result = STATUS_OK; result = run_packet_event(state, event, packet, &error); if (result == STATUS_WARN) { fprintf(stderr, "%s", error); free(error); } else if (result == STATUS_ERR) { die("%s", error); } } /* For more consistent timing, if there's more than one CPU on this * machine then use a real-time priority. We skip this if there's only * 1 CPU because we do not want to risk making the machine * unresponsive. */ void set_scheduling_priority(void) { /* Get the CPU count and skip this if we only have 1 CPU. */ int num_cpus = sysconf(_SC_NPROCESSORS_ONLN); if (num_cpus < 0) die_perror("sysconf(_SC_NPROCESSORS_ONLN)"); if (num_cpus <= 1) return; #if !defined(__OpenBSD__) /* Chose a real-time policy, but use SCHED_RR instead of * SCHED_FIFO, so that we round-robin among real-time threads * of the same priority. In practice this shouldn't matter, * since there will not likely be other realtime threads. */ int policy = SCHED_RR; /* Use the minimum priority, to be nice. */ int priority = sched_get_priority_min(policy); if (priority < 0) die_perror("sched_get_priority_min"); /* Set the policy and priority for our threads. */ struct sched_param param; memset(¶m, 0, sizeof(param)); param.sched_priority = priority; if (sched_setscheduler(0, policy, ¶m)) DEBUGP("sched_setscheduler failed: %s\n", strerror(errno)); #endif /* !defined(__OpenBSD__) */ } /* To ensure timing that's as consistent as possible, pull all our * pages to RAM and pin them there. */ void lock_memory(void) { if (mlockall(MCL_CURRENT | MCL_FUTURE)) die_perror("lockall(MCL_CURRENT | MCL_FUTURE)"); } /* Wait for and return the wall time at which we should start the * test, in microseconds. To make test results more reproducible, we * wait for a start time that is well into the middle of a Linux jiffy * (JIFFY_OFFSET_USECS into the jiffy). If you try to run a test * script starting at a time that is too near the edge of a jiffy, and * the test tries (as most do) to schedule events at 1-millisecond * boundaries relative to this start time, then slight CPU or * scheduling variations cause the kernel to record time measurements * that are 1 jiffy too big or too small, so the kernel gets * unexpected RTT and RTT variance values, leading to unexpected RTO * and delayed ACK timer behavior. * * To try to find the edge of a jiffy, we spin and watch the output of * times(2), which increments every time the jiffies clock has * advanced another 10ms. We wait for a few ticks * (TARGET_JIFFY_TICKS) to go by, to reduce noise from warm-up * effects. We could do fancier measuring and filtering here, but so * far this level of complexity seems sufficient. */ static s64 schedule_start_time_usecs(struct state *state) { #ifdef linux s64 start_usecs = 0; clock_t last_jiffies = times(NULL); int jiffy_ticks = 0; const int TARGET_JIFFY_TICKS = 10; while (jiffy_ticks < TARGET_JIFFY_TICKS) { clock_t jiffies = times(NULL); if (jiffies != last_jiffies) { start_usecs = now_usecs(state); ++jiffy_ticks; } last_jiffies = jiffies; } const int JIFFY_OFFSET_USECS = 250; start_usecs += JIFFY_OFFSET_USECS; return start_usecs; #else return now_usecs(state); #endif } /* Run final command we always execute at end of script, to clean up. If there * is a cleanup command at the end of a packetdrill script, we execute that no * matter whether the test passes or fails. This makes the cleanup command a * good place to undo any sysctl settings the script changed, for example. */ int run_cleanup_command(void) { if (cleanup_cmd != NULL && init_cmd_exed) { char *error = NULL; if (safe_system(cleanup_cmd, &error)) { fprintf(stderr, "%s: error executing cleanup command: %s\n", script_path, error); free(error); return STATUS_ERR; } } return STATUS_OK; } void run_script(struct config *config, struct script *script) { char *error = NULL; struct state *state = NULL; struct netdev *netdev = NULL; struct event *event = NULL; DEBUGP("run_script: running script\n"); set_scheduling_priority(); lock_memory(); /* This interpreter loop runs for local mode or wire client mode. */ assert(!config->is_wire_server); script_path = config->script_path; /* How we use the network is of course a little different in * each of the two cases.... */ if (config->is_wire_client) netdev = wire_client_netdev_new(config); else if (config->so_filename) netdev = so_netdev_new(config); else netdev = local_netdev_new(config); state = state_new(config, script, netdev); if (config->is_wire_client) { state->wire_client = wire_client_new(); wire_client_init(state->wire_client, config, script); } if (config->so_filename) { state->so_instance = so_instance_new(); so_instance_init(state->so_instance, config, script, state); } init_cmd_exed = false; if (script->init_command != NULL) { if (safe_system(script->init_command->command_line, &error)) { asprintf(&error, "%s: error executing init command: %s\n", config->script_path, error); free(error); exit(EXIT_FAILURE); } init_cmd_exed = true; } signal(SIGPIPE, SIG_IGN); /* ignore EPIPE */ state->live_start_time_usecs = schedule_start_time_usecs(state); DEBUGP("live_start_time_usecs is %lld\n", state->live_start_time_usecs); if (state->wire_client != NULL) wire_client_send_client_starting(state->wire_client); while (1) { if (get_next_event(state, &error)) die("%s", error); event = state->event; if (event == NULL) break; if (state->wire_client != NULL) wire_client_next_event(state->wire_client, event); /* In wire mode, we adjust relative times after * getting notification that previous packet events * have completed, if any. */ adjust_relative_event_times(state, event); switch (event->type) { case PACKET_EVENT: /* For wire clients, the server handles packets. */ if (!config->is_wire_client) { run_local_packet_event(state, event, event->event.packet); } break; case SYSCALL_EVENT: run_system_call_event(state, event, event->event.syscall); break; case COMMAND_EVENT: run_command_event(state, event, event->event.command); break; case CODE_EVENT: run_code_event(state, event, event->event.code->text); break; case INVALID_EVENT: case NUM_EVENT_TYPES: assert(!"bogus type"); break; /* We omit default case so compiler catches missing values. */ } state->num_events++; } /* Wait for any outstanding packet events we requested on the server. */ if (state->wire_client != NULL) wire_client_next_event(state->wire_client, NULL); if (run_cleanup_command() == STATUS_ERR) exit(EXIT_FAILURE); if (code_execute(state->code, &error)) { die("%s: error executing code: %s\n", state->config->script_path, error); free(error); } state_free(state); DEBUGP("run_script: done running\n"); } int parse_script_and_set_config(int argc, char *argv[], struct config *config, struct script *script, const char *script_path, const char *script_buffer) { struct invocation invocation = { .argc = argc, .argv = argv, .config = config, .script = script, }; DEBUGP("parse_and_run_script: %s\n", script_path); assert(script_path != NULL); init_script(script); set_default_config(config); config->script_path = strdup(script_path); if (script_buffer != NULL) copy_script(script_buffer, script); else read_script(script_path, script); return parse_script(config, script, &invocation); }