diff options
Diffstat (limited to 'vlib/vlib/unix')
-rw-r--r-- | vlib/vlib/unix/cj.c | 218 | ||||
-rw-r--r-- | vlib/vlib/unix/cj.h | 68 | ||||
-rw-r--r-- | vlib/vlib/unix/cli.c | 900 | ||||
-rw-r--r-- | vlib/vlib/unix/input.c | 244 | ||||
-rw-r--r-- | vlib/vlib/unix/main.c | 471 | ||||
-rw-r--r-- | vlib/vlib/unix/mc_socket.c | 972 | ||||
-rw-r--r-- | vlib/vlib/unix/mc_socket.h | 126 | ||||
-rw-r--r-- | vlib/vlib/unix/pci.c | 577 | ||||
-rw-r--r-- | vlib/vlib/unix/pci.h | 94 | ||||
-rw-r--r-- | vlib/vlib/unix/physmem.c | 472 | ||||
-rw-r--r-- | vlib/vlib/unix/physmem.h | 59 | ||||
-rw-r--r-- | vlib/vlib/unix/plugin.c | 210 | ||||
-rw-r--r-- | vlib/vlib/unix/plugin.h | 88 | ||||
-rw-r--r-- | vlib/vlib/unix/unix.h | 177 |
14 files changed, 4676 insertions, 0 deletions
diff --git a/vlib/vlib/unix/cj.c b/vlib/vlib/unix/cj.c new file mode 100644 index 00000000000..665a13fa4f5 --- /dev/null +++ b/vlib/vlib/unix/cj.c @@ -0,0 +1,218 @@ +/* + *------------------------------------------------------------------ + * cj.c + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <stdio.h> +#include <vlib/vlib.h> + +#include <vlib/unix/cj.h> + +cj_main_t cj_main; + +void +cj_log (u32 type, void * data0, void * data1) +{ + u64 new_tail; + cj_main_t * cjm = &cj_main; + cj_record_t * r; + + if (cjm->enable == 0) + return; + + new_tail = __sync_add_and_fetch (&cjm->tail, 1); + + r = (cj_record_t *) &(cjm->records[new_tail & (cjm->num_records - 1)]); + r->time = vlib_time_now (cjm->vlib_main); + r->cpu = os_get_cpu_number(); + r->type = type; + r->data[0] = (u64) data0; + r->data[1] = (u64) data1; +} + +void cj_stop(void) +{ + cj_main_t * cjm = &cj_main; + + cjm->enable = 0; +} + + +clib_error_t * cj_init (vlib_main_t * vm) +{ + cj_main_t * cjm = &cj_main; + + cjm->vlib_main = vm; + return 0; +} +VLIB_INIT_FUNCTION (cj_init); + +static clib_error_t * +cj_config (vlib_main_t * vm, unformat_input_t * input) +{ + cj_main_t * cjm = &cj_main; + int matched = 0; + int enable = 0; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "records %d", &cjm->num_records)) + matched = 
1; + else if (unformat (input, "on")) + enable = 1; + else + return clib_error_return (0, "cj_config: unknown input '%U'", + format_unformat_error, input); + } + + if (matched == 0) + return 0; + + cjm->num_records = max_pow2 (cjm->num_records); + vec_validate (cjm->records, cjm->num_records-1); + memset (cjm->records, 0xff, cjm->num_records * sizeof (cj_record_t)); + cjm->tail = ~0; + cjm->enable = enable; + + return 0; +} + +VLIB_CONFIG_FUNCTION (cj_config, "cj"); + +void cj_enable_disable (int is_enable) +{ + cj_main_t * cjm = &cj_main; + + if (cjm->num_records) + cjm->enable = is_enable; + else + vlib_cli_output (cjm->vlib_main, "CJ not configured..."); +} + +static inline void cj_dump_one_record (cj_record_t * r) +{ + fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n", + r->cpu, r->time, r->type, (long long unsigned int) r->data[0], + (long long unsigned int) r->data[1]); +} + +static void cj_dump_internal (u8 filter0_enable, u64 filter0, + u8 filter1_enable, u64 filter1) +{ + cj_main_t * cjm = &cj_main; + cj_record_t * r; + u32 i, index; + + if (cjm->num_records == 0) + { + fprintf (stderr, "CJ not configured...\n"); + return; + } + + if (cjm->tail == (u64)~0) + { + fprintf (stderr, "No data collected...\n"); + return; + } + + /* Has the trace wrapped? 
*/ + index = (cjm->tail+1) & (cjm->num_records - 1); + r = &(cjm->records[index]); + + if (r->cpu != (u32)~0) + { + /* Yes, dump from tail + 1 to the end */ + for (i = index; i < cjm->num_records; i++) + { + if (filter0_enable && (r->data[0] != filter0)) + goto skip; + if (filter1_enable && (r->data[1] != filter1)) + goto skip; + cj_dump_one_record (r); + skip: + r++; + } + } + /* dump from the beginning through the final tail */ + r = cjm->records; + for (i = 0; i <= cjm->tail; i++) + { + if (filter0_enable && (r->data[0] != filter0)) + goto skip2; + if (filter1_enable && (r->data[1] != filter1)) + goto skip2; + cj_dump_one_record (r); + skip2: + r++; + } +} + +void cj_dump (void) +{ + cj_dump_internal (0, 0, 0, 0); +} + +void cj_dump_filter_data0 (u64 filter0) +{ + cj_dump_internal (1/* enable f0 */, filter0, 0, 0); +} + +void cj_dump_filter_data1 (u64 filter1) +{ + cj_dump_internal (0, 0, 1 /* enable f1 */, filter1); +} + +void cj_dump_filter_data12 (u64 filter0, u64 filter1) +{ + cj_dump_internal (1, filter0, 1, filter1); +} + +static clib_error_t * +cj_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + int is_enable = -1; + int is_dump = -1; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "enable") || unformat (input, "on")) + is_enable = 1; + else if (unformat (input, "disable") || unformat (input, "off")) + is_enable = 0; + else if (unformat (input, "dump")) + is_dump = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (is_enable >= 0) + cj_enable_disable (is_enable); + + if (is_dump > 0) + cj_dump (); + + return 0; +} + +VLIB_CLI_COMMAND (cj_command,static) = { + .path = "cj", + .short_help = "cj", + .function = cj_command_fn, +}; + diff --git a/vlib/vlib/unix/cj.h b/vlib/vlib/unix/cj.h new file mode 100644 index 00000000000..3c37f2bf22f --- /dev/null +++ b/vlib/vlib/unix/cj.h @@ -0,0 +1,68 @@ +/* + 
*------------------------------------------------------------------ + * cj.h + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#ifndef __included_cj_h__ +#define __included_cj_h__ + +typedef struct { + f64 time; + u32 cpu; + u32 type; + u64 data[2]; +} cj_record_t; + +typedef struct { + volatile u64 tail; + cj_record_t * records; + u32 num_records; + volatile u32 enable; + + vlib_main_t * vlib_main; +} cj_main_t; + +void cj_log (u32 type, void * data0, void * data1); + +/* + * Supply in application main, so we can log from any library... + * Declare a weak reference in the library, off you go. 
+ */ + +#define DECLARE_CJ_GLOBAL_LOG \ +void cj_global_log (unsigned type, void * data0, void * data1) \ + __attribute__ ((weak)); \ + \ +unsigned __cj_type; \ +void * __cj_data0; \ +void * __cj_data1; \ + \ +void \ +cj_global_log (unsigned type, void * data0, void * data1) \ +{ \ + __cj_type = type; \ + __cj_data0 = data0; \ + __cj_data1 = data1; \ +} + +#define CJ_GLOBAL_LOG_PROTOTYPE +void cj_global_log (unsigned type, void * data0, void * data1) \ + __attribute__ ((weak)); \ + +void cj_stop(void); + +#endif /* __included_cj_h__ */ diff --git a/vlib/vlib/unix/cli.c b/vlib/vlib/unix/cli.c new file mode 100644 index 00000000000..3cb13fc8550 --- /dev/null +++ b/vlib/vlib/unix/cli.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli.c: Unix stdin/socket CLI. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <fcntl.h> +#include <sys/stat.h> +#include <termios.h> +#include <unistd.h> +#include <arpa/telnet.h> + +typedef struct { + u32 unix_file_index; + + /* Vector of output pending write to file descriptor. */ + u8 * output_vector; + + /* Vector of input saved by Unix input node to be processed by + CLI process. */ + u8 * input_vector; + + u8 has_history; + u8 ** command_history; + u8 * current_command; + i32 excursion; + u32 history_limit; + u8 * search_key; + int search_mode; + + u32 process_node_index; +} unix_cli_file_t; + +always_inline void +unix_cli_file_free (unix_cli_file_t * f) +{ + vec_free (f->output_vector); + vec_free (f->input_vector); +} + +typedef struct { + /* Prompt string for CLI. 
*/ + u8 * cli_prompt; + + unix_cli_file_t * cli_file_pool; + + u32 * unused_cli_process_node_indices; + + /* File pool index of current input. */ + u32 current_input_file_index; +} unix_cli_main_t; + +static unix_cli_main_t unix_cli_main; + +static void +unix_cli_add_pending_output (unix_file_t * uf, + unix_cli_file_t * cf, + u8 * buffer, + uword buffer_bytes) +{ + unix_main_t * um = &unix_main; + + vec_add (cf->output_vector, buffer, buffer_bytes); + if (vec_len (cf->output_vector) > 0) + { + int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (! skip_update) + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } +} + +static void +unix_cli_del_pending_output (unix_file_t * uf, + unix_cli_file_t * cf, + uword n_bytes) +{ + unix_main_t * um = &unix_main; + + vec_delete (cf->output_vector, n_bytes, 0); + if (vec_len (cf->output_vector) <= 0) + { + int skip_update = 0 == (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (! skip_update) + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } +} + +/* VLIB cli output function. 
*/ +static void unix_vlib_cli_output (uword cli_file_index, + u8 * buffer, + uword buffer_bytes) +{ + unix_main_t * um = &unix_main; + unix_cli_main_t * cm = &unix_cli_main; + unix_cli_file_t * cf; + unix_file_t * uf; + int n; + + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + n = 0; + if (vec_len (cf->output_vector) == 0) + n = write (uf->file_descriptor, buffer, buffer_bytes); + if (n < 0 && errno != EAGAIN) + clib_unix_warning ("write"); + + else if ((word) n < (word) buffer_bytes) + { + if (n < 0) n = 0; + unix_cli_add_pending_output (uf, cf, buffer + n, buffer_bytes - n); + } +} + +static int unix_cli_line_edit (unix_main_t * um, unix_cli_file_t * cf) +{ + unix_file_t * uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + u8 * prev; + int i, j, delta; + + for (i = 0; i < vec_len (cf->input_vector); i++) + { + switch (cf->input_vector[i]) + { + case 0: + continue; + + case '?': + /* Erase the current command (if any) plus ?*/ + for (j = 0; j < (vec_len (cf->current_command)+1); j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + + unix_cli_add_pending_output (uf, cf, (u8 *) "\r\nHistory:\r\n", 12); + + for (j = 0; j < vec_len (cf->command_history); j++) + { + unix_cli_add_pending_output (uf, cf, cf->command_history[j], + vec_len(cf->command_history[j])); + unix_cli_add_pending_output (uf, cf, (u8 *) "\r\n", 2); + } + goto crlf; + + /* ^R - reverse search */ + case 'R' - '@': + case 'S' - '@': + if (cf->search_mode == 0) + { + /* Erase the current command (if any) plus ^R */ + for (j = 0; j < (vec_len (cf->current_command)+2); j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + + vec_reset_length (cf->search_key); + vec_reset_length (cf->current_command); + if (cf->input_vector[i] == 'R' - '@') + cf->search_mode = -1; + else + cf->search_mode = 1; + } + else + { + if (cf->input_vector[i] == 'R' - '@') + cf->search_mode = -1; + else + 
cf->search_mode = 1; + + cf->excursion += cf->search_mode; + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + goto search_again; + } + break; + + /* ^U - line-kill */ + case 'U'-'@': + /* Erase the command, plus ^U */ + for (j = 0; j < (vec_len (cf->current_command)+2); j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + vec_reset_length (cf->current_command); + cf->search_mode = 0; + continue; + + /* ^P - previous, ^N - next */ + case 'P' - '@': + case 'N' - '@': + cf->search_mode = 0; + /* Erase the command, plus ^P */ + for (j = 0; j < (vec_len (cf->current_command)+2); j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + vec_reset_length (cf->current_command); + if (vec_len (cf->command_history)) + { + if (cf->input_vector[i] == 'P' - '@') + delta = -1; + else + delta = 1; + + cf->excursion += delta; + + if (cf->excursion > (i32) vec_len (cf->command_history) -1) + cf->excursion = 0; + else if (cf->excursion < 0) + cf->excursion = vec_len (cf->command_history) -1; + + prev = cf->command_history [cf->excursion]; + vec_validate (cf->current_command, vec_len(prev)-1); + + memcpy (cf->current_command, prev, vec_len(prev)); + _vec_len (cf->current_command) = vec_len(prev); + unix_cli_add_pending_output (uf, cf, cf->current_command, + vec_len (cf->current_command)); + break; + } + break; + + case 0x7f: + case 'H' - '@': + for (j = 0; j < 2; j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + if (vec_len (cf->current_command)) + { + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + _vec_len (cf->current_command)--; + } + cf->search_mode = 0; + cf->excursion = 0; + cf->search_mode = 0; + vec_reset_length (cf->search_key); + break; + + case '\r': + case '\n': + crlf: + vec_add1 (cf->current_command, '\r'); + vec_add1 (cf->current_command, '\n'); + unix_cli_add_pending_output (uf, cf, (u8 *) "\b\b \b\b\r\n", 8); + + vec_validate (cf->input_vector, vec_len(cf->current_command)-1); + memcpy (cf->input_vector, 
cf->current_command, + vec_len(cf->current_command)); + _vec_len(cf->input_vector) = _vec_len (cf->current_command); + + if (vec_len(cf->command_history) >= cf->history_limit) + { + vec_free (cf->command_history[0]); + vec_delete (cf->command_history, 1, 0); + } + /* Don't add blank lines to the cmd history */ + if (vec_len (cf->current_command) > 2) + { + _vec_len (cf->current_command) -= 2; + vec_add1 (cf->command_history, cf->current_command); + cf->current_command = 0; + } + else + vec_reset_length (cf->current_command); + cf->excursion = 0; + cf->search_mode = 0; + vec_reset_length (cf->search_key); + return 0; + + /* telnet "mode character" blort, echo but don't process. */ + case 0xff: + unix_cli_add_pending_output (uf, cf, cf->input_vector + i, + 6); + i += 6; + continue; + + default: + if (cf->search_mode) + { + int j, k, limit, offset; + u8 * item; + + vec_add1 (cf->search_key, cf->input_vector[i]); + + search_again: + for (j = 0; j < vec_len(cf->command_history); j++) + { + if (cf->excursion > (i32) vec_len (cf->command_history) -1) + cf->excursion = 0; + else if (cf->excursion < 0) + cf->excursion = vec_len (cf->command_history) -1; + + item = cf->command_history[cf->excursion]; + + limit = (vec_len(cf->search_key) > vec_len (item)) ? 
+ vec_len(item) : vec_len (cf->search_key); + + for (offset = 0; offset <= vec_len(item) - limit; offset++) + { + for (k = 0; k < limit; k++) + { + if (item[k+offset] != cf->search_key[k]) + goto next_offset; + } + goto found_at_offset; + + next_offset: + ; + } + goto next; + + found_at_offset: + for (j = 0; j < vec_len (cf->current_command)+1; j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + + vec_validate (cf->current_command, vec_len(item)-1); + + memcpy (cf->current_command, item, vec_len(item)); + _vec_len (cf->current_command) = vec_len(item); + unix_cli_add_pending_output (uf, cf, cf->current_command, + vec_len (cf->current_command)); + goto found; + + next: + cf->excursion += cf->search_mode; + } + + unix_cli_add_pending_output (uf, cf, (u8 *)"\r\nno match..", 12); + vec_reset_length (cf->search_key); + vec_reset_length (cf->current_command); + cf->search_mode = 0; + goto crlf; + } + else + vec_add1 (cf->current_command, cf->input_vector[i]); + + found: + + break; + } + } + vec_reset_length(cf->input_vector); + return 1; +} + +static void unix_cli_process_input (unix_cli_main_t * cm, uword cli_file_index) +{ + unix_main_t * um = &unix_main; + unix_file_t * uf; + unix_cli_file_t * cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + unformat_input_t input; + int vlib_parse_eval (u8 *); + + /* Try vlibplex first. Someday... */ + if (0 && vlib_parse_eval (cf->input_vector) == 0) + goto done; + + /* Line edit, echo, etc. */ + if (cf->has_history && unix_cli_line_edit (um, cf)) + return; + + if (um->log_fd) + { + static u8 * lv; + vec_reset_length (lv); + lv = format (lv, "%U[%d]: %v", + format_timeval, + 0 /* current bat-time */, + 0 /* current bat-format */, + cli_file_index, + cf->input_vector); + { + int rv __attribute__((unused)) = + write (um->log_fd, lv, vec_len(lv)); + } + } + + unformat_init_vector (&input, cf->input_vector); + + /* Remove leading white space from input. 
*/ + (void) unformat (&input, ""); + + cm->current_input_file_index = cli_file_index; + + if (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT) + vlib_cli_input (um->vlib_main, &input, unix_vlib_cli_output, cli_file_index); + + /* Re-fetch pointer since pool may have moved. */ + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + + /* Zero buffer since otherwise unformat_free will call vec_free on it. */ + input.buffer = 0; + + unformat_free (&input); + + /* Re-use input vector. */ +done: + _vec_len (cf->input_vector) = 0; + + /* Prompt. */ + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + unix_cli_add_pending_output (uf, cf, + cm->cli_prompt, + vec_len (cm->cli_prompt)); +} + +static void unix_cli_kill (unix_cli_main_t * cm, uword cli_file_index) +{ + unix_main_t * um = &unix_main; + unix_cli_file_t * cf; + unix_file_t * uf; + int i; + + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + /* Quit/EOF on stdin means quit program. 
*/ + if (uf->file_descriptor == 0) + clib_longjmp (&um->vlib_main->main_loop_exit, VLIB_MAIN_LOOP_EXIT_CLI); + + vec_free (cf->current_command); + vec_free (cf->search_key); + + for (i = 0; i < vec_len (cf->command_history); i++) + vec_free (cf->command_history[i]); + + vec_free (cf->command_history); + + unix_file_del (um, uf); + + unix_cli_file_free (cf); + pool_put (cm->cli_file_pool, cf); +} + +typedef enum { + UNIX_CLI_PROCESS_EVENT_READ_READY, + UNIX_CLI_PROCESS_EVENT_QUIT, +} unix_cli_process_event_type_t; + +static uword +unix_cli_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, + vlib_frame_t * f) +{ + unix_cli_main_t * cm = &unix_cli_main; + uword i, * data = 0; + + while (1) + { + unix_cli_process_event_type_t event_type; + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &data); + + switch (event_type) + { + case UNIX_CLI_PROCESS_EVENT_READ_READY: + for (i = 0; i < vec_len (data); i++) + unix_cli_process_input (cm, data[i]); + break; + + case UNIX_CLI_PROCESS_EVENT_QUIT: + /* Kill this process. */ + for (i = 0; i < vec_len (data); i++) + unix_cli_kill (cm, data[i]); + goto done; + } + + if (data) + _vec_len (data) = 0; + } + + done: + vec_free (data); + + vlib_node_set_state (vm, rt->node_index, VLIB_NODE_STATE_DISABLED); + + /* Add node index so we can re-use this process later. */ + vec_add1 (cm->unused_cli_process_node_indices, rt->node_index); + + return 0; +} + +static clib_error_t * unix_cli_write_ready (unix_file_t * uf) +{ + unix_cli_main_t * cm = &unix_cli_main; + unix_cli_file_t * cf; + int n; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + + /* Flush output vector. 
*/ + n = write (uf->file_descriptor, + cf->output_vector, vec_len (cf->output_vector)); + + if (n < 0 && errno != EAGAIN) + return clib_error_return_unix (0, "write"); + + else if (n > 0) + unix_cli_del_pending_output (uf, cf, n); + + return /* no error */ 0; +} + +static clib_error_t * unix_cli_read_ready (unix_file_t * uf) +{ + unix_main_t * um = &unix_main; + unix_cli_main_t * cm = &unix_cli_main; + unix_cli_file_t * cf; + uword l; + int n, n_read, n_try; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + + n = n_try = 4096; + while (n == n_try) { + l = vec_len (cf->input_vector); + vec_resize (cf->input_vector, l + n_try); + + n = read (uf->file_descriptor, cf->input_vector + l, n_try); + + /* Error? */ + if (n < 0 && errno != EAGAIN) + return clib_error_return_unix (0, "read"); + + n_read = n < 0 ? 0 : n; + _vec_len (cf->input_vector) = l + n_read; + } + + if (! (n < 0)) + vlib_process_signal_event (um->vlib_main, + cf->process_node_index, + (n_read == 0 + ? UNIX_CLI_PROCESS_EVENT_QUIT + : UNIX_CLI_PROCESS_EVENT_READ_READY), + /* event data */ uf->private_data); + + return /* no error */ 0; +} + +static u32 unix_cli_file_add (unix_cli_main_t * cm, char * name, int fd) +{ + unix_main_t * um = &unix_main; + unix_cli_file_t * cf; + unix_file_t * uf, template = {0}; + vlib_main_t * vm = um->vlib_main; + vlib_node_t * n; + + name = (char *) format (0, "unix-cli-%s", name); + + if (vec_len (cm->unused_cli_process_node_indices) > 0) + { + uword l = vec_len (cm->unused_cli_process_node_indices); + + /* Find node and give it new name. 
*/ + n = vlib_get_node (vm, cm->unused_cli_process_node_indices[l - 1]); + vec_free (n->name); + n->name = (u8 *) name; + + vlib_node_set_state (vm, n->index, VLIB_NODE_STATE_POLLING); + + _vec_len (cm->unused_cli_process_node_indices) = l - 1; + } + else + { + static vlib_node_registration_t r = { + .function = unix_cli_process, + .type = VLIB_NODE_TYPE_PROCESS, + .process_log2_n_stack_bytes = 14, + }; + + r.name = name; + vlib_register_node (vm, &r); + vec_free (name); + + n = vlib_get_node (vm, r.index); + } + + pool_get (cm->cli_file_pool, cf); + memset (cf, 0, sizeof (*cf)); + + template.read_function = unix_cli_read_ready; + template.write_function = unix_cli_write_ready; + template.file_descriptor = fd; + template.private_data = cf - cm->cli_file_pool; + + cf->process_node_index = n->index; + cf->unix_file_index = unix_file_add (um, &template); + cf->output_vector = 0; + cf->input_vector = 0; + + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + /* Prompt. */ + unix_cli_add_pending_output (uf, cf, + cm->cli_prompt, vec_len (cm->cli_prompt)); + + vlib_start_process (vm, n->runtime_index); + return cf - cm->cli_file_pool; +} + +static clib_error_t * unix_cli_listen_read_ready (unix_file_t * uf) +{ + unix_main_t * um = &unix_main; + unix_cli_main_t * cm = &unix_cli_main; + clib_socket_t * s = &um->cli_listen_socket; + clib_socket_t client; + char * client_name; + clib_error_t * error; + unix_cli_file_t * cf; + u32 cf_index; + + error = clib_socket_accept (s, &client); + if (error) + return error; + + client_name = (char *) format (0, "%U%c", format_sockaddr, &client.peer, 0); + + cf_index = unix_cli_file_add (cm, client_name, client.fd); + cf = pool_elt_at_index (cm->cli_file_pool, cf_index); + + /* No longer need CLIB version of socket. 
*/ + clib_socket_free (&client); + + vec_free (client_name); + + /* if we're supposed to run telnet session in character mode (default) */ + if (um->cli_line_mode == 0) + { + u8 charmode_option[6]; + + cf->has_history = 1; + cf->history_limit = um->cli_history_limit ? um->cli_history_limit : 50; + + /* + * Set telnet client character mode, echo on, suppress "go-ahead" + * Empirically, this sequence works. YMMV. + */ + + /* Tell the client no linemode, echo */ + charmode_option[0] = IAC; + charmode_option[1] = DONT; + charmode_option[2] = TELOPT_LINEMODE; + charmode_option[3] = IAC; + charmode_option[4] = DO; + charmode_option[5] = TELOPT_SGA; + + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + unix_cli_add_pending_output (uf, cf, charmode_option, + ARRAY_LEN(charmode_option)); + } + + return error; +} + +static clib_error_t * +unix_cli_config (vlib_main_t * vm, unformat_input_t * input) +{ + unix_main_t * um = &unix_main; + unix_cli_main_t * cm = &unix_cli_main; + int flags, standard_input_fd; + clib_error_t * error; + + /* We depend on unix flags being set. */ + if ((error = vlib_call_config_function (vm, unix_config))) + return error; + + if (um->flags & UNIX_FLAG_INTERACTIVE) + { + standard_input_fd = 0; + + /* Set stdin to be non-blocking. */ + if ((flags = fcntl (standard_input_fd, F_GETFL, 0)) < 0) + flags = 0; + fcntl (standard_input_fd, F_SETFL, flags | O_NONBLOCK); + + unix_cli_file_add (cm, "stdin", standard_input_fd); + } + + { + /* CLI listen. */ + clib_socket_t * s = &um->cli_listen_socket; + unix_file_t template = {0}; + + s->flags = SOCKET_IS_SERVER; /* listen, don't connect */ + + error = clib_socket_init (s); + if (error) + return error; + + template.read_function = unix_cli_listen_read_ready; + template.file_descriptor = s->fd; + + unix_file_add (um, &template); + } + + /* Set CLI prompt. */ + if (! 
cm->cli_prompt) + cm->cli_prompt = format (0, "VLIB: "); + + return 0; +} + +VLIB_CONFIG_FUNCTION (unix_cli_config, "unix-cli"); + +void vlib_unix_cli_set_prompt (char * prompt) +{ + char * fmt = (prompt[strlen(prompt)-1] == ' ') ? "%s" : "%s "; + unix_cli_main_t * cm = &unix_cli_main; + if (cm->cli_prompt) + vec_free (cm->cli_prompt); + cm->cli_prompt = format (0, fmt, prompt); +} + +static clib_error_t * +unix_cli_quit (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_cli_main_t * cm = &unix_cli_main; + + vlib_process_signal_event (vm, + vlib_current_process (vm), + UNIX_CLI_PROCESS_EVENT_QUIT, + cm->current_input_file_index); + return 0; +} + +VLIB_CLI_COMMAND (unix_cli_quit_command, static) = { + .path = "quit", + .short_help = "Exit CLI", + .function = unix_cli_quit, +}; + +static clib_error_t * +unix_cli_exec (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + char * file_name; + int fd; + unformat_input_t sub_input; + clib_error_t * error; + + file_name = 0; + fd = -1; + error = 0; + + if (! unformat (input, "%s", &file_name)) + { + error = clib_error_return (0, "expecting file name, got `%U'", + format_unformat_error, input); + goto done; + } + + fd = open (file_name, O_RDONLY); + if (fd < 0) + { + error = clib_error_return_unix (0, "failed to open `%s'", file_name); + goto done; + } + + /* Make sure its a regular file. */ + { + struct stat s; + + if (fstat (fd, &s) < 0) + { + error = clib_error_return_unix (0, "failed to stat `%s'", file_name); + goto done; + } + + if (! 
(S_ISREG (s.st_mode) || S_ISLNK (s.st_mode))) + { + error = clib_error_return (0, "not a regular file `%s'", file_name); + goto done; + } + } + + unformat_init_unix_file (&sub_input, fd); + + vlib_cli_input (vm, &sub_input, 0, 0); + unformat_free (&sub_input); + + done: + if (fd > 0) + close (fd); + vec_free (file_name); + + return error; +} + +VLIB_CLI_COMMAND (cli_exec, static) = { + .path = "exec", + .short_help = "Execute commands from file", + .function = unix_cli_exec, +}; + +static clib_error_t * +unix_show_errors (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_main_t * um = &unix_main; + clib_error_t * error = 0; + int i, n_errors_to_show; + unix_error_history_t * unix_errors = 0; + + n_errors_to_show = 1 << 30; + + if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (! unformat (input, "%d", &n_errors_to_show)) + { + error = clib_error_return (0, "expecting integer number of errors to show, got `%U'", + format_unformat_error, input); + goto done; + } + } + + n_errors_to_show = clib_min (ARRAY_LEN (um->error_history), n_errors_to_show); + + i = um->error_history_index > 0 ? um->error_history_index - 1 : ARRAY_LEN (um->error_history) - 1; + + while (n_errors_to_show > 0) + { + unix_error_history_t * eh = um->error_history + i; + + if (! 
eh->error) + break; + + vec_add1 (unix_errors, eh[0]); + n_errors_to_show -= 1; + if (i == 0) + i = ARRAY_LEN (um->error_history) - 1; + else + i--; + } + + if (vec_len (unix_errors) == 0) + vlib_cli_output (vm, "no Unix errors so far"); + else + { + vlib_cli_output (vm, "%Ld total errors seen", um->n_total_errors); + for (i = vec_len (unix_errors) - 1; i >= 0; i--) + { + unix_error_history_t * eh = vec_elt_at_index (unix_errors, i); + vlib_cli_output (vm, "%U: %U", + format_time_interval, "h:m:s:u", eh->time, + format_clib_error, eh->error); + } + vlib_cli_output (vm, "%U: time now", + format_time_interval, "h:m:s:u", vlib_time_now (vm)); + } + + done: + vec_free (unix_errors); + return error; +} + +VLIB_CLI_COMMAND (cli_unix_show_errors, static) = { + .path = "show unix-errors", + .short_help = "Show Unix system call error history", + .function = unix_show_errors, +}; + +static clib_error_t * +unix_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (unix_cli_init); diff --git a/vlib/vlib/unix/input.c b/vlib/vlib/unix/input.c new file mode 100644 index 00000000000..ea10e4fc354 --- /dev/null +++ b/vlib/vlib/unix/input.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * input.c: Unix file input + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <signal.h> + +/* FIXME autoconf */ +#define HAVE_LINUX_EPOLL + +#ifdef HAVE_LINUX_EPOLL + +#include <sys/epoll.h> + +typedef struct { + int epoll_fd; + struct epoll_event * epoll_events; + + /* Statistics. 
*/ + u64 epoll_files_ready; + u64 epoll_waits; +} linux_epoll_main_t; + +static linux_epoll_main_t linux_epoll_main; + +static void +linux_epoll_file_update (unix_file_t * f, + unix_file_update_type_t update_type) +{ + unix_main_t * um = &unix_main; + linux_epoll_main_t * em = &linux_epoll_main; + struct epoll_event e; + + memset (&e, 0, sizeof (e)); + + e.events = EPOLLIN; + if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE) + e.events |= EPOLLOUT; + e.data.u32 = f - um->file_pool; + + if (epoll_ctl (em->epoll_fd, + (update_type == UNIX_FILE_UPDATE_ADD + ? EPOLL_CTL_ADD + : (update_type == UNIX_FILE_UPDATE_MODIFY + ? EPOLL_CTL_MOD + : EPOLL_CTL_DEL)), + f->file_descriptor, + &e) < 0) + clib_warning ("epoll_ctl"); +} + +static uword +linux_epoll_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + unix_main_t * um = &unix_main; + linux_epoll_main_t * em = &linux_epoll_main; + struct epoll_event * e; + int n_fds_ready; + + { + vlib_node_main_t * nm = &vm->node_main; + u64 t = nm->cpu_time_next_process_ready; + f64 timeout; + int timeout_ms, max_timeout_ms = 10; + f64 vector_rate = vlib_last_vectors_per_main_loop (vm); + + if (t == ~0ULL) + { + timeout = 10e-3; + timeout_ms = max_timeout_ms; + } + else + { + timeout = + (((i64) t - (i64) clib_cpu_time_now ()) + * vm->clib_time.seconds_per_clock) + /* subtract off some slop time */ - 50e-6; + timeout_ms = timeout * 1e3; + + /* Must be between 1 and 10 ms. */ + timeout_ms = clib_max (1, timeout_ms); + timeout_ms = clib_min (max_timeout_ms, timeout_ms); + } + + /* If we still have input nodes polling (e.g. vnet packet generator) + don't sleep. */ + if (nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] > 0) + timeout_ms = 0; + + if (vector_rate > 1) + { + /* When busy don't wait & only epoll for input every 8 times + through main loop. */ + timeout_ms = 0; + node->input_main_loops_per_call = 1024; + } + else + /* We're not busy; go to sleep for a while. 
*/ + node->input_main_loops_per_call = 0; + + /* Allow any signal to wakeup our sleep. */ + { + static sigset_t unblock_all_signals; + n_fds_ready = epoll_pwait (em->epoll_fd, + em->epoll_events, + vec_len (em->epoll_events), + timeout_ms, + &unblock_all_signals); + + /* This kludge is necessary to run over absurdly old kernels */ + if (n_fds_ready < 0 && errno == ENOSYS) + { + n_fds_ready = epoll_wait (em->epoll_fd, + em->epoll_events, + vec_len (em->epoll_events), + timeout_ms); + } + } + } + + if (n_fds_ready < 0) + { + if (unix_error_is_fatal (errno)) + vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait")); + + /* non fatal error (e.g. EINTR). */ + return 0; + } + + em->epoll_waits += 1; + em->epoll_files_ready += n_fds_ready; + + for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++) + { + u32 i = e->data.u32; + unix_file_t * f = pool_elt_at_index (um->file_pool, i); + clib_error_t * errors[4]; + int n_errors = 0; + + if (PREDICT_TRUE (! (e->events & EPOLLERR))) + { + if (e->events & EPOLLIN) + { + errors[n_errors] = f->read_function (f); + n_errors += errors[n_errors] != 0; + } + if (e->events & EPOLLOUT) + { + errors[n_errors] = f->write_function (f); + n_errors += errors[n_errors] != 0; + } + } + else + { + if (f->error_function) + { + errors[n_errors] = f->error_function (f); + n_errors += errors[n_errors] != 0; + } + } + + ASSERT (n_errors < ARRAY_LEN (errors)); + for (i = 0; i < n_errors; i++) + { + unix_save_error (um, errors[i]); + } + } + + return 0; +} + +VLIB_REGISTER_NODE (linux_epoll_input_node,static) = { + .function = linux_epoll_input, + .type = VLIB_NODE_TYPE_PRE_INPUT, + .name = "unix-epoll-input", +}; + +clib_error_t * +linux_epoll_input_init (vlib_main_t * vm) +{ + linux_epoll_main_t * em = &linux_epoll_main; + unix_main_t * um = &unix_main; + + /* Allocate some events. 
*/ + vec_resize (em->epoll_events, VLIB_FRAME_SIZE); + + em->epoll_fd = epoll_create (vec_len (em->epoll_events)); + if (em->epoll_fd < 0) + return clib_error_return_unix (0, "epoll_create"); + + um->file_update = linux_epoll_file_update; + + return 0; +} + +VLIB_INIT_FUNCTION (linux_epoll_input_init); + +#endif /* HAVE_LINUX_EPOLL */ + +static clib_error_t * +unix_input_init (vlib_main_t * vm) +{ + return vlib_call_init_function (vm, linux_epoll_input_init); +} + +VLIB_INIT_FUNCTION (unix_input_init); diff --git a/vlib/vlib/unix/main.c b/vlib/vlib/unix/main.c new file mode 100644 index 00000000000..b85f3e73326 --- /dev/null +++ b/vlib/vlib/unix/main.c @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * main.c: Unix main routine + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/unix/plugin.h> + +#include <signal.h> +#include <sys/ucontext.h> +#include <syslog.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +unix_main_t unix_main; + +static clib_error_t * +unix_main_init (vlib_main_t * vm) +{ + unix_main_t * um = &unix_main; + um->vlib_main = vm; + return vlib_call_init_function (vm, unix_input_init); +} + +VLIB_INIT_FUNCTION (unix_main_init); + +static void unix_signal_handler (int signum, siginfo_t * si, ucontext_t * uc) +{ + uword fatal; + u8 * msg = 0; + + msg = format (msg, "received signal %U, PC %U", + format_signal, signum, + format_ucontext_pc, uc); + + if (signum == SIGSEGV) + msg = format (msg, ", faulting address %p", si->si_addr); + + switch (signum) + { + /* these (caught) signals cause the application to exit */ + case SIGTERM: + if (unix_main.vlib_main->main_loop_exit_set) + { + syslog (LOG_ERR | LOG_DAEMON, "received SIGTERM, exiting..."); + + clib_longjmp (&unix_main.vlib_main->main_loop_exit, + VLIB_MAIN_LOOP_EXIT_CLI); + } + case SIGQUIT: + case SIGINT: + case SIGILL: + case SIGBUS: + case SIGSEGV: + case SIGHUP: + case SIGFPE: + fatal = 1; + break; + + /* by default, print a message and continue */ + default: + fatal = 0; + break; + } + + /* Null terminate. 
*/ + vec_add1 (msg, 0); + + if (fatal) + { + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + os_exit (1); + } + else + clib_warning ("%s", msg); + + vec_free (msg); +} + +static clib_error_t * +setup_signal_handlers (unix_main_t * um) +{ + uword i; + struct sigaction sa; + + for (i = 1; i < 32; i++) + { + memset (&sa, 0, sizeof (sa)); + sa.sa_sigaction = (void *) unix_signal_handler; + sa.sa_flags = SA_SIGINFO; + + switch (i) + { + /* these signals take the default action */ + case SIGABRT: + case SIGKILL: + case SIGSTOP: + case SIGUSR1: + case SIGUSR2: + continue; + + /* ignore SIGPIPE, SIGCHLD */ + case SIGPIPE: + case SIGCHLD: + sa.sa_sigaction = (void *) SIG_IGN; + break; + + /* catch and handle all other signals */ + default: + break; + } + + if (sigaction (i, &sa, 0) < 0) + return clib_error_return_unix (0, "sigaction %U", format_signal, i); + } + + return 0; +} + +static void unix_error_handler (void * arg, u8 * msg, int msg_len) +{ + unix_main_t * um = arg; + + /* Echo to stderr when interactive. */ + if (um->flags & UNIX_FLAG_INTERACTIVE) + { + CLIB_UNUSED (int r) = write (2, msg, msg_len); + } + else + { + char save = msg[msg_len - 1]; + + /* Null Terminate. */ + msg[msg_len-1] = 0; + + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + + msg[msg_len-1] = save; + } +} + +void vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error) +{ + unix_main_t * um = &unix_main; + + if (um->flags & UNIX_FLAG_INTERACTIVE || error == 0) + return; + + { + char save; + u8 * msg; + u32 msg_len; + + msg = error->what; + msg_len = vec_len(msg); + + /* Null Terminate. 
*/ + save = msg[msg_len-1]; + msg[msg_len-1] = 0; + + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + + msg[msg_len-1] = save; + } +} + +static uword +startup_config_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, + vlib_frame_t * f) +{ + unix_main_t * um = &unix_main; + u8 * buf = 0; + uword l, n = 1; + + vlib_process_suspend (vm, 2.0); + + while (um->unix_config_complete == 0) + vlib_process_suspend (vm, 0.1); + + if (um->startup_config_filename) { + unformat_input_t sub_input; + int fd; + struct stat s; + char *fn = (char *)um->startup_config_filename; + + fd = open (fn, O_RDONLY); + if (fd < 0) { + clib_warning ("failed to open `%s'", fn); + return 0; + } + + if (fstat (fd, &s) < 0) { + clib_warning ("failed to stat `%s'", fn); + bail: + close(fd); + return 0; + } + + if (! (S_ISREG (s.st_mode) || S_ISLNK (s.st_mode))) { + clib_warning ("not a regular file: `%s'", fn); + goto bail; + } + + while (n > 0) + { + l = vec_len (buf); + vec_resize (buf, 4096); + n = read (fd, buf + l, 4096); + if (n > 0) + { + _vec_len (buf) = l + n; + if (n < 4096) + break; + } + else + break; + } + if (um->log_fd && vec_len (buf)) + { + u8 * lv = 0; + lv = format (lv, "%U: ***** Startup Config *****\n%v", + format_timeval, + 0 /* current bat-time */, + 0 /* current bat-format */, + buf); + { + int rv __attribute__((unused)) = + write (um->log_fd, lv, vec_len(lv)); + } + vec_reset_length (lv); + lv = format (lv, "%U: ***** End Startup Config *****\n", + format_timeval, + 0 /* current bat-time */, + 0 /* current bat-format */); + { + int rv __attribute__((unused)) = + write (um->log_fd, lv, vec_len(lv)); + } + vec_free (lv); + } + + if (vec_len(buf)) + { + unformat_init_vector (&sub_input, buf); + vlib_cli_input (vm, &sub_input, 0, 0); + /* frees buf for us */ + unformat_free (&sub_input); + } + close(fd); + } + return 0; +} + +VLIB_REGISTER_NODE (startup_config_node,static) = { + .function = startup_config_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = 
"startup-config-process", +}; + +static clib_error_t * +unix_config (vlib_main_t * vm, unformat_input_t * input) +{ + unix_main_t * um = &unix_main; + clib_error_t * error = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + char * cli_prompt; + if (unformat (input, "interactive")) + um->flags |= UNIX_FLAG_INTERACTIVE; + else if (unformat (input, "nodaemon")) + um->flags |= UNIX_FLAG_NODAEMON; + else if (unformat (input, "cli-prompt %s", &cli_prompt)) + vlib_unix_cli_set_prompt (cli_prompt); + else if (unformat (input, "cli-listen %s", &um->cli_listen_socket.config)) + ; + else if (unformat (input, "cli-line-mode")) + um->cli_line_mode = 1; + else if (unformat (input, "cli-history-limit %d", &um->cli_history_limit)) + ; + else if (unformat (input, "full-coredump")) + { + int fd; + + fd = open ("/proc/self/coredump_filter", O_WRONLY); + if (fd > 0) + { + if (write (fd, "0x6f\n", 5) != 5) + clib_unix_warning ("coredump filter write failed!"); + close(fd); + } + else + clib_unix_warning ("couldn't open /proc/self/coredump_filter"); + } + else if (unformat (input, "startup-config %s", + &um->startup_config_filename)) + ; + else if (unformat (input, "exec %s", + &um->startup_config_filename)) + ; + else if (unformat (input, "log %s", &um->log_filename)) + { + um->log_fd = open ((char *) um->log_filename, + O_CREAT | O_WRONLY | O_APPEND, 0644); + if (um->log_fd < 0) + { + clib_warning ("couldn't open log '%s'\n", um->log_filename); + um->log_fd = 0; + } + else + { + u8 * lv = 0; + lv = format (0, "%U: ***** Start: PID %d *****\n", + format_timeval, + 0 /* current bat-time */, + 0 /* current bat-format */, + getpid()); + { + int rv __attribute__((unused)) = + write (um->log_fd, lv, vec_len(lv)); + } + vec_free (lv); + } + } + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (! 
(um->flags & UNIX_FLAG_INTERACTIVE)) + { + error = setup_signal_handlers (um); + if (error) + return error; + + openlog (vm->name, LOG_CONS | LOG_PERROR | LOG_PID, LOG_DAEMON); + clib_error_register_handler (unix_error_handler, um); + + if (! (um->flags & UNIX_FLAG_NODAEMON) + && daemon (/* chdir to / */ 0, + /* stdin/stdout/stderr -> /dev/null */ 0) < 0) + clib_error_return (0, "daemon () fails"); + } + um->unix_config_complete = 1; + + return 0; +} + +/* unix { ... } configuration. */ +VLIB_CONFIG_FUNCTION (unix_config, "unix"); + +static clib_error_t * +unix_exit (vlib_main_t * vm) +{ + /* Close syslog connection. */ + closelog (); + return 0; +} + +VLIB_MAIN_LOOP_EXIT_FUNCTION (unix_exit); + +u8 **vlib_thread_stacks; + +static char **argv_global; + +static uword thread0 (uword arg) +{ + vlib_main_t * vm = (vlib_main_t *)arg; + unformat_input_t input; + int i; + + unformat_init_command_line (&input, argv_global); + i = vlib_main (vm, &input); + unformat_free (&input); + + return i; + } + +int vlib_unix_main (int argc, char * argv[]) +{ + vlib_main_t * vm = &vlib_global_main; /* one and only time for this! */ + + clib_smp_main_t * sm = &clib_smp_main; + vlib_thread_main_t * tm = &vlib_thread_main; + unformat_input_t input; + u8 * thread_stacks; + clib_error_t * e; + int i; + + argv_global = argv; + vm->name = argv[0]; + vm->heap_base = clib_mem_get_heap (); + ASSERT(vm->heap_base); + + i = vlib_plugin_early_init (vm); + if (i) + return i; + + unformat_init_command_line (&input, argv_global); + vm->init_functions_called = hash_create (0, /* value bytes */ 0); + e = vlib_call_all_config_functions (vm, &input, 1 /* early */); + if (e != 0) + { + clib_error_report(e); + return 1; + } + unformat_free (&input); + + /* allocate N x 1mb stacks, aligned e.g. 
to a 16mb boundary */ + thread_stacks = clib_mem_alloc_aligned + (tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE, + (VLIB_MAX_CPUS << VLIB_LOG2_THREAD_STACK_SIZE)); + + sm->vm_base = thread_stacks; + sm->log2_n_per_cpu_vm_bytes = VLIB_LOG2_THREAD_STACK_SIZE; + + vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1); + for (i = 0; i < vec_len (vlib_thread_stacks); i++) + { + vlib_thread_stacks[i] = thread_stacks; + + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. + */ + if (mprotect (thread_stacks, 4096, PROT_READ) < 0) + clib_unix_warning ("thread stack"); + + thread_stacks += VLIB_THREAD_STACK_SIZE; + } + + i = clib_calljmp (thread0, (uword) vm, + (void *)(vlib_thread_stacks[0] + VLIB_THREAD_STACK_SIZE)); + return i; +} diff --git a/vlib/vlib/unix/mc_socket.c b/vlib/vlib/unix/mc_socket.c new file mode 100644 index 00000000000..1169203f855 --- /dev/null +++ b/vlib/vlib/unix/mc_socket.c @@ -0,0 +1,972 @@ +/* + * mc_socket.c: socket based multicast for vlib mc + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/vlib.h> +#include <vlib/unix/mc_socket.h> + +#include <sys/ioctl.h> /* for FIONBIO */ +#include <netinet/tcp.h> /* for TCP_NODELAY */ +#include <net/if.h> /* for struct ifreq */ + +static u8 * format_socket_peer_id (u8 * s, va_list * args) +{ + u64 peer_id_as_u64 = va_arg (*args, u64); + mc_peer_id_t peer_id; + peer_id.as_u64 = peer_id_as_u64; + u32 a = mc_socket_peer_id_get_address (peer_id); + u32 p = mc_socket_peer_id_get_port (peer_id); + + s = format (s, "%U:%04x", format_network_address, AF_INET, &a, + ntohs (p)); + + return s; +} + +typedef void (mc_msg_handler_t) (mc_main_t * mcm, void * msg, u32 buffer_index); + +always_inline void msg_handler (mc_main_t * mcm, + u32 buffer_index, + u32 handler_frees_buffer, + void * _h) +{ + vlib_main_t * vm = mcm->vlib_main; + mc_msg_handler_t * h = _h; + vlib_buffer_t * b = vlib_get_buffer (vm, buffer_index); + void * the_msg = vlib_buffer_get_current (b); + + h (mcm, the_msg, buffer_index); + if (! handler_frees_buffer) + vlib_buffer_free_one (vm, buffer_index); +} + +static uword +append_buffer_index_to_iovec (vlib_main_t * vm, + u32 buffer_index, + struct iovec ** iovs_return) +{ + struct iovec * i; + vlib_buffer_t * b; + u32 bi = buffer_index; + u32 l = 0; + + while (1) + { + b = vlib_get_buffer (vm, bi); + vec_add2 (*iovs_return, i, 1); + i->iov_base = vlib_buffer_get_current (b); + i->iov_len = b->current_length; + l += i->iov_len; + if (! 
(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + bi = b->next_buffer; + } + + return l; +} + +static clib_error_t * +sendmsg_helper (mc_socket_main_t * msm, + int socket, + struct sockaddr_in * tx_addr, + u32 buffer_index) +{ + vlib_main_t * vm = msm->mc_main.vlib_main; + struct msghdr h; + word n_bytes, n_bytes_tx, n_retries; + + memset (&h, 0, sizeof (h)); + h.msg_name = tx_addr; + h.msg_namelen = sizeof (tx_addr[0]); + + if (msm->iovecs) + _vec_len (msm->iovecs) = 0; + + n_bytes = append_buffer_index_to_iovec (vm, buffer_index, &msm->iovecs); + ASSERT (n_bytes <= msm->mc_main.transport.max_packet_size); + if (n_bytes > msm->mc_main.transport.max_packet_size) + clib_error ("sending packet larger than interace MTU %d bytes", n_bytes); + + h.msg_iov = msm->iovecs; + h.msg_iovlen = vec_len (msm->iovecs); + + n_retries = 0; + while ((n_bytes_tx = sendmsg (socket, &h, /* flags */ 0)) != n_bytes + && errno == EAGAIN) + n_retries++; + if (n_bytes_tx != n_bytes) + { + clib_unix_warning ("sendmsg"); + return 0; + } + if (n_retries) + { + ELOG_TYPE_DECLARE (e) = { + .format = "sendmsg-helper: %d retries", + .format_args = "i4", + }; + struct { u32 retries; } * ed = 0; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->retries = n_retries; + } + return 0; +} + +static clib_error_t * +tx_buffer (void * transport, mc_transport_type_t type, u32 buffer_index) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)transport; + vlib_main_t * vm = msm->mc_main.vlib_main; + mc_multicast_socket_t * ms = &msm->multicast_sockets[type]; + clib_error_t * error; + error = sendmsg_helper (msm, ms->socket, &ms->tx_addr, buffer_index); + if (type != MC_TRANSPORT_USER_REQUEST_TO_RELAY) + vlib_buffer_free_one (vm, buffer_index); + return error; +} + +static clib_error_t * +tx_ack (void *transport, mc_peer_id_t dest_peer_id, u32 buffer_index) +{ + struct sockaddr_in tx_addr; + mc_socket_main_t *msm = (mc_socket_main_t *)transport; + vlib_main_t * vm = msm->mc_main.vlib_main; + clib_error_t * error; + + 
memset (&tx_addr, 0, sizeof (tx_addr)); + tx_addr.sin_family = AF_INET; + tx_addr.sin_addr.s_addr = mc_socket_peer_id_get_address (dest_peer_id); + tx_addr.sin_port = mc_socket_peer_id_get_port (dest_peer_id); + + error = sendmsg_helper (msm, msm->ack_socket, &tx_addr, buffer_index); + vlib_buffer_free_one (vm, buffer_index); + return error; +} + +static clib_error_t * +recvmsg_helper (mc_socket_main_t * msm, + int socket, + struct sockaddr_in * rx_addr, + u32 * buffer_index, + u32 drop_message) +{ + vlib_main_t * vm = msm->mc_main.vlib_main; + vlib_buffer_t * b; + uword n_left, n_alloc, n_mtu, i, i_rx; + const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + word n_bytes_left; + + /* Make sure we have at least a MTU worth of buffers. */ + n_mtu = msm->rx_mtu_n_buffers; + n_left = vec_len (msm->rx_buffers); + if (n_left < n_mtu) + { + uword max_alloc = 8 * n_mtu; + vec_validate (msm->rx_buffers, max_alloc - 1); + n_alloc = vlib_buffer_alloc (vm, msm->rx_buffers + n_left, max_alloc - n_left); + _vec_len (msm->rx_buffers) = n_left + n_alloc; + } + + ASSERT (vec_len (msm->rx_buffers) >= n_mtu); + vec_validate (msm->iovecs, n_mtu - 1); + + /* Allocate RX buffers from end of rx_buffers. + Turn them into iovecs to pass to readv. 
*/ + i_rx = vec_len (msm->rx_buffers) - 1; + for (i = 0; i < n_mtu; i++) + { + b = vlib_get_buffer (vm, msm->rx_buffers[i_rx - i]); + msm->iovecs[i].iov_base = b->data; + msm->iovecs[i].iov_len = buffer_size; + } + _vec_len (msm->iovecs) = n_mtu; + + { + struct msghdr h; + + memset (&h, 0, sizeof (h)); + if (rx_addr) + { + h.msg_name = rx_addr; + h.msg_namelen = sizeof (rx_addr[0]); + } + h.msg_iov = msm->iovecs; + h.msg_iovlen = vec_len (msm->iovecs); + + n_bytes_left = recvmsg (socket, &h, 0); + if (n_bytes_left < 0) + return clib_error_return_unix (0, "recvmsg"); + } + + if (drop_message) + { + *buffer_index = ~0; + return 0; + } + + *buffer_index = msm->rx_buffers[i_rx]; + while (1) + { + b = vlib_get_buffer (vm, msm->rx_buffers[i_rx]); + + b->flags = 0; + b->current_data = 0; + b->current_length = n_bytes_left < buffer_size ? n_bytes_left : buffer_size; + + n_bytes_left -= buffer_size; + + if (n_bytes_left <= 0) + break; + + i_rx--; + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + b->next_buffer = msm->rx_buffers[i_rx]; + } + + _vec_len (msm->rx_buffers) = i_rx; + + return 0 /* no error */; +} + +static clib_error_t * mastership_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t * mcm = &msm->mc_main; + mc_multicast_socket_t * ms = &msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP]; + clib_error_t * error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ 0); + if (! 
error) + msg_handler (mcm, bi, + /* handler_frees_buffer */ 0, + mc_msg_master_assert_handler); + + return error; +} + +static clib_error_t * to_relay_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t *mcm = &msm->mc_main; + vlib_main_t * vm = msm->mc_main.vlib_main; + mc_multicast_socket_t * ms_to_relay = &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY]; + mc_multicast_socket_t * ms_from_relay = &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY]; + clib_error_t * error; + u32 bi; + u32 is_master = mcm->relay_state == MC_RELAY_STATE_MASTER; + + /* Not the ordering master? Turf the msg */ + error = recvmsg_helper (msm, ms_to_relay->socket, /* rx_addr */ 0, &bi, + /* drop_message */ ! is_master); + + /* If we are the master, number and rebroadcast the msg. */ + if (! error && is_master) + { + vlib_buffer_t * b = vlib_get_buffer (vm, bi); + mc_msg_user_request_t * mp = vlib_buffer_get_current (b); + mp->global_sequence = clib_host_to_net_u32 (mcm->relay_global_sequence); + mcm->relay_global_sequence++; + error = sendmsg_helper (msm, ms_from_relay->socket, &ms_from_relay->tx_addr, bi); + vlib_buffer_free_one (vm, bi); + } + + return error; +} + +static clib_error_t * from_relay_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t * mcm = &msm->mc_main; + mc_multicast_socket_t * ms = &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY]; + clib_error_t * error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ 0); + if (! 
error) + { + msg_handler (mcm, bi, /* handler_frees_buffer */ 1, + mc_msg_user_request_handler); + } + return error; +} + +static clib_error_t * join_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t * mcm = &msm->mc_main; + vlib_main_t * vm = mcm->vlib_main; + mc_multicast_socket_t * ms = &msm->multicast_sockets[MC_TRANSPORT_JOIN]; + clib_error_t * error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ 0); + if (! error) + { + vlib_buffer_t * b = vlib_get_buffer (vm, bi); + mc_msg_join_or_leave_request_t * mp = vlib_buffer_get_current (b); + + switch (clib_host_to_net_u32 (mp->type)) + { + case MC_MSG_TYPE_join_or_leave_request: + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + mc_msg_join_or_leave_request_handler); + break; + + case MC_MSG_TYPE_join_reply: + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + mc_msg_join_reply_handler); + break; + + default: + ASSERT (0); + break; + } + } + return error; +} + +static clib_error_t * ack_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t * mcm = &msm->mc_main; + clib_error_t * error; + u32 bi; + + error = recvmsg_helper (msm, msm->ack_socket, /* rx_addr */ 0, &bi, /* drop_message */ 0); + if (! error) + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + mc_msg_user_ack_handler); + return error; +} + +static void catchup_cleanup (mc_socket_main_t *msm, + mc_socket_catchup_t *c, + unix_main_t *um, unix_file_t *uf) +{ + hash_unset (msm->catchup_index_by_file_descriptor, uf->file_descriptor); + unix_file_del (um, uf); + vec_free (c->input_vector); + vec_free (c->output_vector); + pool_put (msm->catchups, c); +} + +static mc_socket_catchup_t * +find_catchup_from_file_descriptor (mc_socket_main_t * msm, int file_descriptor) +{ + uword * p = hash_get (msm->catchup_index_by_file_descriptor, file_descriptor); + return p ? 
pool_elt_at_index (msm->catchups, p[0]) : 0; +} + +static clib_error_t * catchup_socket_read_ready (unix_file_t * uf, int is_server) +{ + unix_main_t * um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t *mcm = &msm->mc_main; + mc_socket_catchup_t * c = find_catchup_from_file_descriptor (msm, uf->file_descriptor); + word l, n, is_eof; + + l = vec_len (c->input_vector); + vec_resize (c->input_vector, 4096); + n = read (uf->file_descriptor, c->input_vector + l, vec_len (c->input_vector) - l); + is_eof = n == 0; + + if (n < 0) + { + if (errno == EAGAIN) + n = 0; + else + { + catchup_cleanup (msm, c, um, uf); + return clib_error_return_unix (0, "read"); + } + } + + _vec_len (c->input_vector) = l + n; + + if (is_eof && vec_len (c->input_vector) > 0) + { + if (is_server) + { + mc_msg_catchup_request_handler (mcm, (void *) c->input_vector, c - msm->catchups); + _vec_len (c->input_vector) = 0; + } + else + { + mc_msg_catchup_reply_handler (mcm, (void *) c->input_vector, c - msm->catchups); + c->input_vector = 0; /* reply handler is responsible for freeing vector */ + catchup_cleanup (msm, c, um, uf); + } + } + + return 0 /* no error */; +} + +static clib_error_t * catchup_server_read_ready (unix_file_t * uf) +{ return catchup_socket_read_ready (uf, /* is_server */ 1); } + +static clib_error_t * catchup_client_read_ready (unix_file_t * uf) +{ + if (MC_EVENT_LOGGING) + { + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + vlib_main_t * vm = msm->mc_main.vlib_main; + + ELOG_TYPE (e, "catchup_client_read_ready"); + ELOG (&vm->elog_main, e, 0); + } + return catchup_socket_read_ready (uf, /* is_server */ 0); +} + +static clib_error_t * +catchup_socket_write_ready (unix_file_t * uf, int is_server) +{ + unix_main_t * um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_socket_catchup_t *c = find_catchup_from_file_descriptor (msm, uf->file_descriptor); + clib_error_t * error = 0; + int n; + + if 
(c->connect_in_progress) + { + u32 len, value; + + c->connect_in_progress = 0; + len = sizeof (value); + if (getsockopt (c->socket, SOL_SOCKET, + SO_ERROR, &value, &len) < 0) + { + error = clib_error_return_unix (0, "getsockopt SO_ERROR"); + goto error_quit; + } + if (value != 0) + { + error = clib_error_return_code (0, value, CLIB_ERROR_ERRNO_VALID, "connect fails"); + goto error_quit; + } + } + + while (1) + { + u32 n_this_write; + + n_this_write = + clib_min (vec_len (c->output_vector) - c->output_vector_n_written, + msm->rx_mtu_n_bytes - 64 /* ip + tcp + option allowance */); + + if (n_this_write <= 0) + break; + + do { + n = write (uf->file_descriptor, + c->output_vector + c->output_vector_n_written, + n_this_write); + } while (n < 0 && errno == EAGAIN); + + if (n < 0) + { + error = clib_error_return_unix (0, "write"); + goto error_quit; + } + c->output_vector_n_written += n; + } + + if (c->output_vector_n_written >= vec_len (c->output_vector)) + { + if (! is_server) + { + uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + /* Send EOF to other side. 
*/ + shutdown (uf->file_descriptor, SHUT_WR); + return error; + } + else + { + error_quit: + catchup_cleanup (msm, c, um, uf); + } + } + return error; +} + +static clib_error_t * +catchup_server_write_ready (unix_file_t * uf) +{ return catchup_socket_write_ready (uf, /* is_server */ 1); } + +static clib_error_t * +catchup_client_write_ready (unix_file_t * uf) +{ return catchup_socket_write_ready (uf, /* is_server */ 0); } + +static clib_error_t *catchup_socket_error_ready (unix_file_t *uf) +{ + unix_main_t *um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_socket_catchup_t *c = find_catchup_from_file_descriptor (msm, uf->file_descriptor); + catchup_cleanup (msm, c, um, uf); + return clib_error_return (0, "error"); +} + +static clib_error_t *catchup_listen_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + struct sockaddr_in client_addr; + int client_len; + mc_socket_catchup_t *c; + unix_file_t template = {0}; + + pool_get (msm->catchups, c); + memset(c, 0, sizeof (c[0])); + + client_len = sizeof(client_addr); + + /* Acquires the non-blocking attrib from the server socket. 
*/ + c->socket = accept (uf->file_descriptor, + (struct sockaddr *)&client_addr, + (socklen_t *)&client_len); + + if (c->socket < 0) + { + pool_put (msm->catchups, c); + return clib_error_return_unix (0, "accept"); + } + + if (MC_EVENT_LOGGING) + { + mc_main_t * mcm = &msm->mc_main; + vlib_main_t * vm = mcm->vlib_main; + + ELOG_TYPE_DECLARE (e) = { + .format = "catchup accepted from 0x%lx", + .format_args = "i4", + }; + struct { u32 addr; } * ed = 0; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->addr = ntohl(client_addr.sin_addr.s_addr); + } + + /* Disable the Nagle algorithm, ship catchup pkts immediately */ + { + int one = 1; + if ((setsockopt(c->socket, IPPROTO_TCP, + TCP_NODELAY, (void *)&one, sizeof(one))) < 0) { + clib_unix_warning("catchup socket: set TCP_NODELAY"); + } + } + + template.read_function = catchup_server_read_ready; + template.write_function = catchup_server_write_ready; + template.error_function = catchup_socket_error_ready; + template.file_descriptor = c->socket; + template.private_data = pointer_to_uword (msm); + c->unix_file_index = unix_file_add (&unix_main, &template); + hash_set (msm->catchup_index_by_file_descriptor, c->socket, c - msm->catchups); + + return 0; +} + +/* Return and bind to an unused port. */ +static word find_and_bind_to_free_port (word sock, word port) +{ + for (; port < 1 << 16; port++) + { + struct sockaddr_in a; + + memset (&a, 0, sizeof(a)); /* Warnings be gone */ + + a.sin_family = PF_INET; + a.sin_addr.s_addr = INADDR_ANY; + a.sin_port = htons (port); + + if (bind (sock, (struct sockaddr *) &a, sizeof (a)) >= 0) + break; + } + + return port < 1 << 16 ? port : -1; +} + +static clib_error_t * +setup_mutlicast_socket (mc_socket_main_t * msm, + mc_multicast_socket_t * ms, + char * type, + uword udp_port) +{ + int one = 1; + struct ip_mreq mcast_req; + + if (! 
msm->multicast_ttl) + msm->multicast_ttl = 1; + + /* mastership (multicast) TX socket */ + if ((ms->socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) + return clib_error_return_unix(0, "%s socket", type); + + { + u8 ttl = msm->multicast_ttl; + + if ((setsockopt(ms->socket, IPPROTO_IP, + IP_MULTICAST_TTL, (void *)&ttl, sizeof(ttl))) < 0) + return clib_error_return_unix(0, "%s set multicast ttl", type); + } + + if (setsockopt(ms->socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) + return clib_error_return_unix (0, "%s setsockopt SO_REUSEADDR", type); + + memset (&ms->tx_addr, 0, sizeof (ms->tx_addr)); + ms->tx_addr.sin_family = AF_INET; + ms->tx_addr.sin_addr.s_addr = htonl (msm->multicast_tx_ip4_address_host_byte_order); + ms->tx_addr.sin_port = htons (udp_port); + + if (bind(ms->socket, (struct sockaddr *)&ms->tx_addr, + sizeof (ms->tx_addr)) < 0) + return clib_error_return_unix(0, "%s bind", type); + + memset (&mcast_req, 0, sizeof (mcast_req)); + mcast_req.imr_multiaddr.s_addr = htonl (msm->multicast_tx_ip4_address_host_byte_order); + mcast_req.imr_interface.s_addr = msm->if_ip4_address_net_byte_order; + + if ((setsockopt(ms->socket, IPPROTO_IP, + IP_ADD_MEMBERSHIP, (void *)&mcast_req, + sizeof (mcast_req))) < 0) + return clib_error_return_unix(0, "%s IP_ADD_MEMBERSHIP setsockopt", type); + + if (ioctl (ms->socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "%s set FIONBIO", type); + + /* FIXME remove this when we support tx_ready. */ + { + u32 len = 1 << 20; + socklen_t sl = sizeof (len); + if (setsockopt(ms->socket, SOL_SOCKET, SO_SNDBUF, &len, sl) < 0) + clib_unix_error ("setsockopt"); + } + + return 0; +} + +static clib_error_t * +socket_setup (mc_socket_main_t *msm) +{ + int one = 1; + clib_error_t * error; + u32 port; + + if (! 
msm->base_multicast_udp_port_host_byte_order) + msm->base_multicast_udp_port_host_byte_order = + 0xffff - ((MC_N_TRANSPORT_TYPE + 2 /* ack socket, catchup socket */) + - 1); + + port = msm->base_multicast_udp_port_host_byte_order; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP], + "mastership", + port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets[MC_TRANSPORT_JOIN], + "join", + port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY], + "to relay", + port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY], + "from relay", + port++); + if (error) + return error; + + /* ACK rx socket */ + msm->ack_socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (msm->ack_socket < 0) + return clib_error_return_unix(0, "ack socket"); + + msm->ack_udp_port = find_and_bind_to_free_port (msm->ack_socket, port++); + + if (ioctl (msm->ack_socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "ack socket FIONBIO"); + + msm->catchup_server_socket = socket(AF_INET, SOCK_STREAM, 0); + if (msm->catchup_server_socket < 0) + return clib_error_return_unix (0, "catchup server socket"); + + msm->catchup_tcp_port = find_and_bind_to_free_port (msm->catchup_server_socket, port++); + + if (ioctl (msm->catchup_server_socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "catchup server socket FIONBIO"); + + if (listen(msm->catchup_server_socket, 5) < 0) + return clib_error_return_unix (0, "catchup server socket listen"); + + /* epoll setup for multicast mastership socket */ + { + unix_file_t template = {0}; + + template.read_function = mastership_socket_read_ready; + template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP].socket; + template.private_data = (uword) msm; + unix_file_add 
(&unix_main, &template); + + /* epoll setup for multicast to_relay socket */ + template.read_function = to_relay_socket_read_ready; + template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for multicast from_relay socket */ + template.read_function = from_relay_socket_read_ready; + template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + template.read_function = join_socket_read_ready; + template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_JOIN].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for ack rx socket */ + template.read_function = ack_socket_read_ready; + template.file_descriptor = msm->ack_socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for TCP catchup server */ + template.read_function = catchup_listen_read_ready; + template.file_descriptor = msm->catchup_server_socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + } + + return 0; +} + +static void * +catchup_add_pending_output (mc_socket_catchup_t * c, uword n_bytes, u8 * set_output_vector) +{ + unix_file_t * uf = pool_elt_at_index (unix_main.file_pool, + c->unix_file_index); + u8 * result=0; + + if (set_output_vector) + c->output_vector = set_output_vector; + else + vec_add2 (c->output_vector, result, n_bytes); + if (vec_len (c->output_vector) > 0) + { + int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (! 
skip_update) + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } + return result; +} + +static uword catchup_request_fun (void *transport_main, + u32 stream_index, + mc_peer_id_t catchup_peer_id) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)transport_main; + mc_main_t * mcm = &msm->mc_main; + vlib_main_t * vm = mcm->vlib_main; + mc_socket_catchup_t *c; + struct sockaddr_in addr; + unix_main_t *um = &unix_main; + int one = 1; + + pool_get (msm->catchups, c); + memset (c, 0, sizeof (*c)); + + c->socket = socket(AF_INET, SOCK_STREAM, 0); + if (c->socket < 0) + { + clib_unix_warning ("socket"); + return 0; + } + + if (ioctl (c->socket, FIONBIO, &one) < 0) + { + clib_unix_warning ("FIONBIO"); + return 0; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = mc_socket_peer_id_get_address (catchup_peer_id); + addr.sin_port = mc_socket_peer_id_get_port (catchup_peer_id); + + c->connect_in_progress = 1; + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE_DECLARE (e) = { + .format = "connecting to peer 0x%Lx", + .format_args = "i8", + }; + struct { u64 peer; } * ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->peer = catchup_peer_id.as_u64; + } + + if (connect(c->socket, (const void *)&addr,sizeof(addr)) + < 0 && errno != EINPROGRESS) + { + clib_unix_warning ("connect to %U fails", + format_socket_peer_id, catchup_peer_id); + return 0; + } + + { + unix_file_t template = {0}; + + template.read_function = catchup_client_read_ready; + template.write_function = catchup_client_write_ready; + template.error_function = catchup_socket_error_ready; + template.file_descriptor = c->socket; + template.private_data = (uword) msm; + c->unix_file_index = unix_file_add (um, &template); + + hash_set (msm->catchup_index_by_file_descriptor, c->socket, c - msm->catchups); + } + + { + mc_msg_catchup_request_t * mp; + mp = catchup_add_pending_output (c, sizeof (mp[0]), /* set_output_vector */ 0); + mp->peer_id = msm->mc_main.transport.our_catchup_peer_id; + 
mp->stream_index = stream_index;
+    mc_byte_swap_msg_catchup_request (mp);
+  }
+
+  return c - msm->catchups;
+}
+
+static void catchup_send_fun (void *transport_main, uword opaque, u8 * data)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *)transport_main;
+  mc_socket_catchup_t *c = pool_elt_at_index (msm->catchups, opaque);
+  catchup_add_pending_output (c, 0, data);
+}
+
+/* Query the kernel for the IPv4 address and MTU of interface if_name.
+
+   The address returned by SIOCGIFADDR is copied unmodified into
+   *ip4_address.  SIOCGIFMTU is always issued; when mtu is non-null it
+   receives the interface MTU minus 28 bytes (IPv4 + UDP headers).
+
+   Returns 0 on success and -1 on failure.  On a SIOCGIFMTU failure
+   *ip4_address has already been written.  The temporary socket is
+   closed on every path once it has been opened. */
+static int
+find_interface_ip4_address (char * if_name, u32 * ip4_address, u32 * mtu)
+{
+  int fd;
+  int rv = -1;
+  struct ifreq ifr;
+  struct sockaddr_in * sa;
+
+  /* Dig up our IP address.  Second argument is the socket type, not an
+     address family. */
+  fd = socket (PF_INET, SOCK_DGRAM, 0);
+  if (fd < 0) {
+    clib_unix_error ("socket");
+    return -1;
+  }
+
+  /* Zero the request so ifr_name stays NUL terminated after strncpy. */
+  memset (&ifr, 0, sizeof (ifr));
+  ifr.ifr_addr.sa_family = AF_INET;
+  strncpy (ifr.ifr_name, if_name, sizeof(ifr.ifr_name)-1);
+  if (ioctl (fd, SIOCGIFADDR, &ifr) < 0) {
+    /* Report before close () so errno still describes the ioctl. */
+    clib_unix_error ("ioctl(SIOCGIFADDR)");
+    goto done;
+  }
+
+  sa = (void *) &ifr.ifr_addr;
+  memcpy (ip4_address, &sa->sin_addr.s_addr, sizeof (ip4_address[0]));
+
+  if (ioctl (fd, SIOCGIFMTU, &ifr) < 0)
+    goto done;
+  if (mtu)
+    *mtu = ifr.ifr_mtu - (/* IP4 header */ 20 + /* UDP header */ 8);
+
+  rv = 0;
+
+ done:
+  close (fd);
+  return rv;
+}
+
+clib_error_t *
+mc_socket_main_init (mc_socket_main_t * msm, char **intfc_probe_list,
+                     int n_intfcs_to_probe)
+{
+  clib_error_t * error;
+  mc_main_t * mcm;
+  u32 mtu;
+
+  mcm = &msm->mc_main;
+
+  /* 239.255.0.7 */
+  if (! msm->multicast_tx_ip4_address_host_byte_order)
+    msm->multicast_tx_ip4_address_host_byte_order = 0xefff0007;
+
+  {
+    u32 i, a, win;
+
+    win = 0;
+    if (msm->multicast_interface_name)
+      {
+        win = ! find_interface_ip4_address (msm->multicast_interface_name, &a, &mtu);
+      }
+    else
+      {
+        for (i = 0; i < n_intfcs_to_probe; i++)
+          if (! find_interface_ip4_address (intfc_probe_list[i], &a, &mtu))
+            {
+              win = 1;
+              msm->multicast_interface_name = intfc_probe_list[i];
+              break;
+            }
+      }
+
+    if (!
win) + return clib_error_return (0, "can't find interface ip4 address"); + + msm->if_ip4_address_net_byte_order = a; + } + + msm->rx_mtu_n_bytes = mtu; + msm->rx_mtu_n_buffers = msm->rx_mtu_n_bytes / VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + msm->rx_mtu_n_buffers += (msm->rx_mtu_n_bytes % VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES) != 0; + + error = socket_setup (msm); + if (error) + return error; + + mcm->transport.our_ack_peer_id = + mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order, msm->ack_udp_port); + + mcm->transport.our_catchup_peer_id = + mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order, msm->catchup_tcp_port); + + mcm->transport.tx_buffer = tx_buffer; + mcm->transport.tx_ack = tx_ack; + mcm->transport.catchup_request_fun = catchup_request_fun; + mcm->transport.catchup_send_fun = catchup_send_fun; + mcm->transport.format_peer_id = format_socket_peer_id; + mcm->transport.opaque = msm; + mcm->transport.max_packet_size = mtu; + + mc_main_init (mcm, "socket"); + + return error; +} diff --git a/vlib/vlib/unix/mc_socket.h b/vlib/vlib/unix/mc_socket.h new file mode 100644 index 00000000000..7dd6b5e27b1 --- /dev/null +++ b/vlib/vlib/unix/mc_socket.h @@ -0,0 +1,126 @@ +/* + * mc_socket.h: socket based multicast for vlib mc + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __included_mc_socket_h__ +#define __included_mc_socket_h__ + +#include <vlib/unix/unix.h> +#include <netinet/in.h> + +typedef struct { + int socket; + struct sockaddr_in tx_addr; +} mc_multicast_socket_t; + +/* TCP catchup socket */ +typedef struct { + int socket; + u32 unix_file_index; + + u8 * input_vector; + u8 * output_vector; + u32 output_vector_n_written; + + u32 connect_in_progress; +} mc_socket_catchup_t; + +typedef struct mc_socket_main_t { + mc_main_t mc_main; + + /* Multicast mastership/to-relay/from-relay sockets. */ + mc_multicast_socket_t multicast_sockets[MC_N_TRANSPORT_TYPE]; + + /* Unicast UDP ack sockets */ + int ack_socket; + + /* TCP catchup server socket */ + int catchup_server_socket; + + /* Pool of stream-private catchup sockets */ + mc_socket_catchup_t *catchups; + + uword * catchup_index_by_file_descriptor; + + u32 rx_mtu_n_bytes; + + /* Receive MTU in bytes and VLIB buffers. */ + u32 rx_mtu_n_buffers; + + /* Vector of RX VLIB buffers. */ + u32 * rx_buffers; + /* Vector of scatter/gather descriptors for sending/receiving VLIB buffers + via kernel. */ + struct iovec * iovecs; + + /* IP address of interface to use for multicast. */ + u32 if_ip4_address_net_byte_order; + + u32 ack_udp_port; + u32 catchup_tcp_port; + + /* Interface on which to listen for multicasts. */ + char * multicast_interface_name; + + /* Multicast address to use (e.g. 0xefff0000). + Host byte order. */ + u32 multicast_tx_ip4_address_host_byte_order; + + /* TTL to use for multicasts. */ + u32 multicast_ttl; + + /* Multicast ports for mastership, joins, etc. will be chosen + starting at the given port in host byte order. + A total of MC_N_TRANSPORT_TYPE ports will be used. 
*/ + u32 base_multicast_udp_port_host_byte_order; +} mc_socket_main_t; + +always_inline u32 +mc_socket_peer_id_get_address (mc_peer_id_t i) +{ + u32 a = ((i.as_u8[0] << 24) + | (i.as_u8[1] << 16) + | (i.as_u8[2] << 8) + | (i.as_u8[3] << 0)); + return clib_host_to_net_u32 (a); +} + +always_inline u32 +mc_socket_peer_id_get_port (mc_peer_id_t i) +{ return clib_host_to_net_u16 ((i.as_u8[4] << 8) | i.as_u8[5]); } + +static_always_inline mc_peer_id_t +mc_socket_set_peer_id (u32 address_net_byte_order, u32 port_host_byte_order) +{ + mc_peer_id_t i; + u32 a = ntohl (address_net_byte_order); + u32 p = port_host_byte_order; + i.as_u8[0] = (a >> 24) & 0xff; + i.as_u8[1] = (a >> 16) & 0xff; + i.as_u8[2] = (a >> 8) & 0xff; + i.as_u8[3] = (a >> 0) & 0xff; + i.as_u8[4] = (p >> 8) & 0xff; + i.as_u8[5] = (p >> 0) & 0xff; + i.as_u8[6] = 0; + i.as_u8[7] = 0; + return i; +} + +clib_error_t * +mc_socket_main_init (mc_socket_main_t * msm, char **intfc_probe_list, + int n_intfcs_to_probe); +#endif /* __included_mc_socket_h__ */ + diff --git a/vlib/vlib/unix/pci.c b/vlib/vlib/unix/pci.c new file mode 100644 index 00000000000..02c37f72707 --- /dev/null +++ b/vlib/vlib/unix/pci.c @@ -0,0 +1,577 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/pci/pci.h> +#include <vlib/unix/unix.h> +#include <vlib/unix/pci.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <dirent.h> + +linux_pci_main_t linux_pci_main; + +static clib_error_t * +foreach_directory_file (char * dir_name, + clib_error_t * (* f) (void * arg, u8 * path_name, u8 * file_name), + void * arg, + int scan_dirs) +{ + DIR * d; + struct dirent * e; + clib_error_t * error = 0; + u8 * s, * t; + + d = opendir (dir_name); + if (! d) + { + /* System has no PCI bus. */ + if (errno == ENOENT) + return 0; + return clib_error_return_unix (0, "open `%s'", dir_name); + } + + s = t = 0; + while (1) + { + e = readdir (d); + if (! e) + break; + if (scan_dirs) + { + if (e->d_type == DT_DIR + && (! strcmp (e->d_name, ".") + || ! 
strcmp (e->d_name, ".."))) + continue; + } + else + { + if (e->d_type == DT_DIR) + continue; + } + + s = format (s, "%s/%s", dir_name, e->d_name); + t = format (t, "%s", e->d_name); + error = f (arg, s, t); + _vec_len (s) = 0; + _vec_len (t) = 0; + + if (error) + break; + } + + vec_free (s); + closedir (d); + + return error; +} + +static clib_error_t * +write_sys_fs (char * file_name, char * fmt, ...) +{ + u8 * s; + int fd; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + if (write (fd, s, vec_len (s)) < 0) + return clib_error_return_unix (0, "write `%s'", file_name); + + vec_free (s); + close (fd); + return 0; +} + +static clib_error_t * +scan_uio_dir (void * arg, u8 * path_name, u8 * file_name) +{ + linux_pci_device_t * l = arg; + unformat_input_t input; + + unformat_init_string (&input, (char *) file_name, vec_len (file_name)); + + if (! unformat (&input, "uio%d", &l->uio_minor)) + abort (); + + unformat_free (&input); + return 0; +} + +static clib_error_t * linux_pci_uio_read_ready (unix_file_t * uf) +{ + linux_pci_main_t * pm = &linux_pci_main; + vlib_main_t * vm = pm->vlib_main; + linux_pci_device_t * l; + u32 li = uf->private_data; + + l = pool_elt_at_index (pm->pci_devices, li); + vlib_node_set_interrupt_pending (vm, l->device_input_node_index); + + /* Let node know which device is interrupting. 
*/ + { + vlib_node_runtime_t * rt = vlib_node_get_runtime (vm, l->device_input_node_index); + rt->runtime_data[0] |= 1 << l->device_index; + } + + return /* no error */ 0; +} + +static clib_error_t *linux_pci_uio_error_ready (unix_file_t *uf) +{ + u32 error_index = (u32) uf->private_data; + + return clib_error_return (0, "pci device %d: error", error_index); +} + +static uword pci_resource_size (uword os_handle, uword resource) +{ + linux_pci_main_t * pm = &linux_pci_main; + linux_pci_device_t * p; + u8 * file_name; + struct stat b; + uword result = 0; + + p = pool_elt_at_index (pm->pci_devices, os_handle); + + file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0); + if (stat ((char *) file_name, &b) >= 0) + result = b.st_size; + vec_free (file_name); + return result; +} + +void os_add_pci_disable_interrupts_reg (uword os_handle, u32 resource, + u32 reg_offset, u32 reg_value) +{ + linux_pci_main_t * pm = &linux_pci_main; + linux_pci_device_t * l; + char * file_name; + clib_error_t * error; + + l = pool_elt_at_index (pm->pci_devices, os_handle); + ASSERT (resource == 0); + ASSERT (reg_offset < pci_resource_size (os_handle, resource)); + file_name = (char *) format (0, "%s/disable_interrupt_regs%c", l->dev_dir_name, 0); + error = write_sys_fs (file_name, "%x %x", reg_offset, reg_value); + if (error) + clib_error_report (error); + vec_free (file_name); +} + +static void add_device (pci_device_t * dev, linux_pci_device_t * pdev) +{ + linux_pci_main_t * pm = &linux_pci_main; + linux_pci_device_t * l; + pci_config_header_t * c; + u32 x[4]; + clib_error_t * error; + + c = &dev->config0.header; + + pool_get (pm->pci_devices, l); + l[0] = pdev[0]; + + l->dev_dir_name = vec_dup (l->dev_dir_name); + + /* Parse bus, dev, function from directory name. */ + { + unformat_input_t input; + + unformat_init_string (&input, (char *) l->dev_dir_name, + vec_len (l->dev_dir_name)); + + if (! 
unformat (&input, "/sys/bus/pci/devices/%x:%x:%x.%x", + &x[0], &x[1], &x[2], &x[3])) + abort (); + + unformat_free (&input); + + l->bus_address.bus = x[1]; + l->bus_address.slot_function = (x[2] << 3) | x[3]; + dev->bus_address = l->bus_address; + } + + dev->os_handle = l - pm->pci_devices; + + error = write_sys_fs ("/sys/bus/pci/drivers/uio_pci_dma/new_id", + "%x %x", c->vendor_id, c->device_id); + if (error) + clib_error_report (error); + error = write_sys_fs ("/sys/bus/pci/drivers/uio_pci_dma/bind", + "%04x:%02x:%02x.%x", x[0], x[1], x[2], x[3]); + /* Errors happen when re-binding so just ignore them. */ + if (error) + clib_error_free (error); + + { + u8 * uio_dir = format (0, "%s/uio", l->dev_dir_name); + foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ 1); + vec_free (uio_dir); + } + + { + char * uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0); + l->uio_fd = open (uio_name, O_RDWR); + if (l->uio_fd < 0) + clib_unix_error ("open `%s'", uio_name); + vec_free (uio_name); + } + + { + unix_file_t template = {0}; + unix_main_t * um = &unix_main; + + template.read_function = linux_pci_uio_read_ready; + template.file_descriptor = l->uio_fd; + template.error_function = linux_pci_uio_error_ready; + template.private_data = l - pm->pci_devices; + + /* To be filled in by driver. */ + l->device_input_node_index = ~0; + l->device_index = 0; + + l->unix_file_index = unix_file_add (um, &template); + } +} + +static void linux_pci_device_free (linux_pci_device_t * l) +{ + int i; + for (i = 0; i < vec_len (l->resource_fds); i++) + if (l->resource_fds[i] > 0) + close (l->resource_fds[i]); + if (l->config_fd > 0) + close (l->config_fd); + if (l->uio_fd > 0) + close (l->uio_fd); + vec_free (l->resource_fds); + vec_free (l->dev_dir_name); +} + +/* Configuration space read/write. 
*/
+clib_error_t *
+os_read_write_pci_config (uword os_handle,
+                          vlib_read_or_write_t read_or_write,
+                          uword address,
+                          void * data,
+                          u32 n_bytes)
+{
+  linux_pci_main_t * pm = &linux_pci_main;
+  linux_pci_device_t * p;
+  int n;
+
+  p = pool_elt_at_index (pm->pci_devices, os_handle);
+
+  if (address != lseek (p->config_fd, address, SEEK_SET))
+    return clib_error_return_unix (0, "seek offset %d", address);
+
+  if (read_or_write == VLIB_READ)
+    n = read (p->config_fd, data, n_bytes);
+  else
+    n = write (p->config_fd, data, n_bytes);
+
+  if (n != n_bytes)
+    return clib_error_return_unix (0, "%s",
+                                   read_or_write == VLIB_READ
+                                   ? "read" : "write");
+
+  return 0;
+}
+
+/* Map the file <dev_dir_name>/resource<resource> of the device at pool
+   index os_handle: read/write, MAP_SHARED, length taken from fstat.
+   A non-null addr additionally requests MAP_FIXED at that address.
+
+   On success *result holds the mapping and the open descriptor is
+   recorded in p->resource_fds[resource].  On failure an error is
+   returned, the descriptor is closed, and nothing is recorded, so the
+   device never keeps a descriptor that has already been closed. */
+static clib_error_t *
+os_map_pci_resource_internal (uword os_handle,
+                              u32 resource,
+                              u8 *addr,
+                              void ** result)
+{
+  linux_pci_main_t * pm = &linux_pci_main;
+  linux_pci_device_t * p;
+  struct stat stat_buf;
+  u8 * file_name;
+  int fd;
+  clib_error_t * error;
+  int flags = MAP_SHARED;
+
+  error = 0;
+  p = pool_elt_at_index (pm->pci_devices, os_handle);
+
+  file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0);
+  fd = open ((char *) file_name, O_RDWR);
+  if (fd < 0)
+    {
+      error = clib_error_return_unix (0, "open `%s'", file_name);
+      goto done;
+    }
+
+  if (fstat (fd, &stat_buf) < 0)
+    {
+      error = clib_error_return_unix (0, "fstat `%s'", file_name);
+      goto done;
+    }
+
+  if (addr != 0)
+    flags |= MAP_FIXED;
+
+  *result = mmap (addr,
+                  /* size */ stat_buf.st_size,
+                  PROT_READ | PROT_WRITE,
+                  flags,
+                  /* file */ fd,
+                  /* offset */ 0);
+  if (*result == MAP_FAILED)
+    {
+      error = clib_error_return_unix (0, "mmap `%s'", file_name);
+      goto done;
+    }
+
+  /* Only a descriptor backing a live mapping is remembered. */
+  vec_validate (p->resource_fds, resource);
+  p->resource_fds[resource] = fd;
+
+ done:
+  if (error)
+    {
+      /* 0 is a valid descriptor; only a failed open yields < 0. */
+      if (fd >= 0)
+        close (fd);
+    }
+  vec_free (file_name);
+  return error;
+}
+
+clib_error_t *
+os_map_pci_resource (uword os_handle,
+                     u32 resource,
+                     void ** result)
+{
+  return (os_map_pci_resource_internal (os_handle, resource, 0 /* addr */,
+
result)); +} + +clib_error_t * +os_map_pci_resource_fixed (uword os_handle, + u32 resource, + u8 *addr, + void ** result) +{ + return (os_map_pci_resource_internal (os_handle, resource, addr, result)); +} + +void os_free_pci_device (uword os_handle) +{ + linux_pci_main_t * pm = &linux_pci_main; + linux_pci_device_t * l; + + l = pool_elt_at_index (pm->pci_devices, os_handle); + linux_pci_device_free (l); + pool_put (pm->pci_devices, l); +} + +u8 * format_os_pci_handle (u8 * s, va_list * va) +{ + linux_pci_main_t * pm = &linux_pci_main; + uword os_pci_handle = va_arg (*va, uword); + linux_pci_device_t * l; + + l = pool_elt_at_index (pm->pci_devices, os_pci_handle); + return format (s, "%x/%x/%x", l->bus_address.bus, + (l->bus_address.slot_function >> 3), + (l->bus_address.slot_function & 0x7)); +} + +static inline pci_device_registration_t * +pci_device_next_registered (pci_device_registration_t * r) +{ + uword i; + + /* Null vendor id marks end of initialized list. */ + for (i = 0; r->supported_devices[i].vendor_id != 0; i++) + ; + + return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0])); +} + +static inline u8 kernel_driver_installed (pci_device_registration_t *r) +{ + u8 * link_name; + struct stat b; + + link_name = format (0, "/sys/bus/pci/drivers/%s", r->kernel_driver); + if (stat ((char *)link_name, &b) >= 0) + r->kernel_driver_running++; + else + r->kernel_driver_running=0; + + vec_free (link_name); + return r->kernel_driver_running; +} + +static clib_error_t * +init_device_from_registered (vlib_main_t * vm, + pci_device_t * dev, + linux_pci_device_t * pdev) +{ + unix_main_t * um = vlib_unix_get_main(); + pci_device_registration_t * r; + pci_device_id_t * i; + pci_config_header_t * c; + + c = &dev->config0.header; + + r = um->pci_device_registrations; + + while (r) + { + for (i = r->supported_devices; i->vendor_id != 0; i++) + if (i->vendor_id == c->vendor_id && i->device_id == c->device_id) + { + if (r->kernel_driver && 
kernel_driver_installed(r)) + { + if (r->kernel_driver_running == 1) + { + clib_warning("PCI device type [%04x:%04x] is busy!\n" + "\tUninstall the associated linux kernel " + "driver: sudo rmmod %s", + c->vendor_id, c->device_id, r->kernel_driver); + } + continue; + } + add_device (dev, pdev); + return r->init_function (vm, dev); + } + r = r->next_registration; + } + /* No driver, close the PCI config-space FD */ + close (pdev->config_fd); + return 0; +} + +static clib_error_t * +init_device (vlib_main_t * vm, + pci_device_t * dev, + linux_pci_device_t * pdev) +{ + return init_device_from_registered (vm, dev, pdev); +} + +static clib_error_t * +scan_device (void * arg, u8 * dev_dir_name, u8 * ignored) +{ + vlib_main_t * vm = arg; + int fd; + u8 * f; + clib_error_t * error = 0; + pci_device_t dev = {0}; + linux_pci_device_t pdev = {0}; + + f = format (0, "%v/config%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDWR); + + /* Try read-only access if write fails. */ + if (fd < 0) + fd = open ((char *) f, O_RDONLY); + + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", f); + goto done; + } + + /* You can only read more that 64 bytes of config space as root; so we try to + read the full space but fall back to just the first 64 bytes. */ + if (read (fd, &dev.config_data, sizeof (dev.config_data)) != sizeof (dev.config_data) + && read (fd, &dev.config0, sizeof (dev.config0)) != sizeof (dev.config0)) + { + error = clib_error_return_unix (0, "read `%s'", f); + goto done; + } + + { + static pci_config_header_t all_ones; + if (all_ones.vendor_id == 0) + memset (&all_ones, ~0, sizeof (all_ones)); + + if (! 
memcmp (&dev.config0.header, &all_ones, sizeof (all_ones))) + { + error = clib_error_return (0, "invalid PCI config for `%s'", f); + goto done; + } + } + + if (dev.config0.header.header_type == 0) + pci_config_type0_little_to_host (&dev.config0); + else + pci_config_type1_little_to_host (&dev.config1); + + pdev.config_fd = fd; + pdev.dev_dir_name = dev_dir_name; + + error = init_device (vm, &dev, &pdev); + + done: + vec_free (f); + return error; +} + +clib_error_t * pci_bus_init (vlib_main_t * vm) +{ + linux_pci_main_t * pm = &linux_pci_main; + clib_error_t * error; + + pm->vlib_main = vm; + + if ((error = vlib_call_init_function (vm, unix_input_init))) + return error; + + error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, /* scan_dirs */ 0); + + /* Complain and continue. might not be root, etc. */ + if (error) + clib_error_report (error); + + return error; +} + +VLIB_INIT_FUNCTION (pci_bus_init); diff --git a/vlib/vlib/unix/pci.h b/vlib/vlib/unix/pci.h new file mode 100644 index 00000000000..b384250eb47 --- /dev/null +++ b/vlib/vlib/unix/pci.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * unix/pci.h: Linux specific pci state + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_unix_pci_h +#define included_unix_pci_h + +#include <vlib/pci/pci.h> + +typedef struct { + /* /sys/bus/pci/devices/... directory name for this device. */ + u8 * dev_dir_name; + + /* Resource file descriptors. */ + int * resource_fds; + + /* File descriptor for config space read/write. */ + int config_fd; + + /* PCI bus address for this devices parsed from /sys/bus/pci/devices name. */ + pci_bus_address_t bus_address; + + /* File descriptor for /dev/uio%d */ + int uio_fd; + + /* Minor device for uio device. */ + u32 uio_minor; + + /* Index given by unix_file_add. */ + u32 unix_file_index; + + /* Input node to handle interrupts for this device. */ + u32 device_input_node_index; + + /* Node runtime will be a bitmap of device indices with pending interrupts. 
*/ + u32 device_index; +} linux_pci_device_t; + +/* Pool of PCI devices. */ +typedef struct { + vlib_main_t * vlib_main; + linux_pci_device_t * pci_devices; +} linux_pci_main_t; + +extern linux_pci_main_t linux_pci_main; + +always_inline linux_pci_device_t * +pci_dev_for_linux (pci_device_t * dev) +{ + linux_pci_main_t * pm = &linux_pci_main; + return pool_elt_at_index (pm->pci_devices, dev->os_handle); +} + +/* Call to allocate/initialize the pci subsystem. + This is not an init function so that users can explicitly enable + pci only when it's needed. */ +clib_error_t * pci_bus_init (vlib_main_t * vm); + +#endif /* included_unix_pci_h */ diff --git a/vlib/vlib/unix/physmem.c b/vlib/vlib/unix/physmem.c new file mode 100644 index 00000000000..83b40be6449 --- /dev/null +++ b/vlib/vlib/unix/physmem.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * physmem.c: Unix physical memory + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/unix/physmem.h> + +static physmem_main_t physmem_main; + +static void * +unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, uword alignment) +{ + physmem_main_t * pm = &physmem_main; + uword lo_offset, hi_offset; + uword * to_free = 0; + +#if DPDK > 0 + clib_warning ("unsafe alloc!"); +#endif + + /* IO memory is always at least cache aligned. */ + alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); + + while (1) + { + mheap_get_aligned (pm->heap, n_bytes, + /* align */ alignment, + /* align offset */ 0, + &lo_offset); + + /* Allocation failed? */ + if (lo_offset == ~0) + break; + + /* Make sure allocation does not span DMA physical chunk boundary. 
*/ + hi_offset = lo_offset + n_bytes - 1; + + if ((lo_offset >> vpm->log2_n_bytes_per_page) == + (hi_offset >> vpm->log2_n_bytes_per_page)) + break; + + /* Allocation would span chunk boundary, queue it to be freed as soon as + we find suitable chunk. */ + vec_add1 (to_free, lo_offset); + } + + if (to_free != 0) + { + uword i; + for (i = 0; i < vec_len (to_free); i++) + mheap_put (pm->heap, to_free[i]); + vec_free (to_free); + } + + return lo_offset != ~0 ? pm->heap + lo_offset : 0; +} + +static void unix_physmem_free (void * x) +{ + physmem_main_t * pm = &physmem_main; + + /* Return object to region's heap. */ + mheap_put (pm->heap, x - pm->heap); +} + +static void htlb_shutdown(void) +{ + physmem_main_t * pm = &physmem_main; + + if (! pm->shmid) + return; + shmctl (pm->shmid, IPC_RMID, 0); + pm->shmid = 0; +} + +/* try to use huge TLB pgs if possible */ +static int htlb_init (vlib_main_t * vm) +{ + vlib_physmem_main_t * vpm = &vm->physmem_main; + physmem_main_t * pm = &physmem_main; + u64 hugepagesize, pagesize; + u64 pfn, seek_loc; + u64 cur, physaddr, ptbits; + int fd, i; + + pm->shmid = shmget (11 /* key, my amp goes to 11 */, pm->mem_size, + IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W); + if (pm->shmid < 0) + { + clib_unix_warning ("shmget"); + return 0; + } + + pm->mem = shmat (pm->shmid, NULL, 0 /* flags */); + if (pm->mem == 0) + { + shmctl (pm->shmid, IPC_RMID, 0); + return 0; + } + + memset (pm->mem, 0, pm->mem_size); + + /* $$$ get page size info from /proc/meminfo */ + hugepagesize = 2<<20; + pagesize = 4<<10; + vpm->log2_n_bytes_per_page = min_log2 (hugepagesize); + vec_resize (vpm->page_table, pm->mem_size / hugepagesize); + + vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); + vpm->virtual.start = pointer_to_uword (pm->mem); + vpm->virtual.size = pm->mem_size; + vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; + + fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + { + (void) shmdt (pm->mem); + return 0; + } + + pm->heap = 
mheap_alloc_with_flags + (pm->mem, pm->mem_size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM); + + cur = (u64) pm->mem; + i = 0; + + while (cur < (u64) pm->mem + pm->mem_size) + { + pfn = (u64) cur / pagesize; + seek_loc = pfn * sizeof (u64); + if (lseek (fd, seek_loc, SEEK_SET) != seek_loc) + { + clib_unix_warning ("lseek to 0x%llx", seek_loc); + shmctl (pm->shmid, IPC_RMID, 0); + close(fd); + return 0; + } + if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof(ptbits))) + { + clib_unix_warning ("read ptbits"); + shmctl (pm->shmid, IPC_RMID, 0); + close(fd); + return 0; + } + + /* bits 0-54 are the physical page number */ + physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize; + if (CLIB_DEBUG > 1) + fformat(stderr, "pm: virtual 0x%llx physical 0x%llx\n", + cur, physaddr); + vpm->page_table[i++] = physaddr; + + cur += hugepagesize; + } + close(fd); + atexit (htlb_shutdown); + return 1; +} + +int vlib_app_physmem_init (vlib_main_t * vm, + physmem_main_t * pm, int) __attribute__ ((weak)); +int vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x) +{ + return 0; +} + +clib_error_t * unix_physmem_init (vlib_main_t * vm, int physical_memory_required) +{ + vlib_physmem_main_t * vpm = &vm->physmem_main; + physmem_main_t * pm = &physmem_main; + clib_error_t * error = 0; + char * dev_uio_dma_file = "/dev/uio-dma"; + int using_fake_memory = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + pm->mem = MAP_FAILED; + + if (pm->mem_size == 0) + pm->mem_size = 16 << 20; + + /* OK, Mr. 
App, you tell us */ + if (vlib_app_physmem_init (vm, pm, physical_memory_required)) + return 0; + + if (physical_memory_required) + { + if (!pm->no_hugepages && htlb_init(vm)) + { + fformat(stderr, "%s: use huge pages\n", __FUNCTION__); + return 0; + } + pm->uio_dma_fd = open (dev_uio_dma_file, O_RDWR); + } + else + pm->uio_dma_fd = -1; + + if (pm->uio_dma_fd < 0) + { + if (physical_memory_required) + { + error = clib_error_return_unix (0, "open `%s'", dev_uio_dma_file); + goto done; + } + + using_fake_memory = 1; + pm->mem = mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (pm->mem == MAP_FAILED) + { + error = clib_error_return_unix (0, "mmap"); + goto done; + } + + pm->heap = mheap_alloc (pm->mem, pm->mem_size); + + /* Identity map with a single page. */ + vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size); + vec_add1 (vpm->page_table, pointer_to_uword (pm->mem)); + } + else + error = clib_error_return (0, "uio_dma deprecated"); + + if (using_fake_memory) + fformat(stderr, "%s: use fake dma pages\n", __FUNCTION__); + else + fformat(stderr, "%s: use uio dma pages\n", __FUNCTION__); + + done: + if (error) + { + if (pm->mem != MAP_FAILED) + munmap (pm->mem, pm->mem_size); + if (pm->uio_dma_fd >= 0) + { + close (pm->uio_dma_fd); + pm->uio_dma_fd = -1; + } + } + return error; +} + +static clib_error_t * +show_physmem (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ +#if DPDK > 0 + vlib_cli_output (vm, "Not supported with DPDK drivers."); +#else + physmem_main_t * pm = &physmem_main; + + if (pm->heap) + vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 0); + else + vlib_cli_output (vm, "No physmem allocated."); +#endif + return 0; +} + +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; + +static clib_error_t * +show_affinity (vlib_main_t * vm, + unformat_input_t * input, + 
vlib_cli_command_t * cmd) +{ + cpu_set_t set; + cpu_set_t *setp = &set; + int i, rv; + u8 *s = 0; + int first_set_bit_in_run = -1; + int last_set_bit_in_run = -1; + int output_done = 0; + + rv = sched_getaffinity (0 /* pid, 0 = this proc */, + sizeof (*setp), setp); + if (rv < 0) + { + vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", + strerror(errno)); + return 0; + } + + for (i = 0; i < 64; i++) + { + if (CPU_ISSET(i, setp)) + { + if (first_set_bit_in_run == -1) + { + first_set_bit_in_run = i; + last_set_bit_in_run = i; + if (output_done) + s = format (s, ","); + s = format (s, "%d-", i); + output_done = 1; + } + else + { + if (i == (last_set_bit_in_run+1)) + last_set_bit_in_run = i; + } + } + else + { + if (first_set_bit_in_run != -1) + { + if (first_set_bit_in_run == (i-1)) + { + _vec_len (s) -= 2 + ((first_set_bit_in_run/10)); + } + s = format (s, "%d", last_set_bit_in_run); + first_set_bit_in_run = -1; + last_set_bit_in_run = -1; + } + } + } + + if (first_set_bit_in_run != -1) + s = format (s, "%d", first_set_bit_in_run); + + vlib_cli_output (vm, "Process runs on: %v", s); + return 0; +} + +VLIB_CLI_COMMAND (show_affinity_command, static) = { + .path = "show affinity", + .short_help = "Show process cpu affinity", + .function = show_affinity, +}; + +static clib_error_t * +set_affinity (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + cpu_set_t set; + cpu_set_t *setp = &set; + int i, rv; + int another_round; + u32 first, last; + + memset (setp, 0, sizeof (*setp)); + + do { + another_round = 0; + if (unformat (input, "%d-%d,", &first, &last)) + { + if (first > 64 || last > 64) + { + barf1: + vlib_cli_output (vm, "range %d-%d invalid", first, last); + return 0; + } + + for (i = first; i <= last; i++) + CPU_SET(i, setp); + another_round = 1; + } + else if (unformat (input, "%d-%d", &first, &last)) + { + if (first > 64 || last > 64) + goto barf1; + + for (i = first; i <= last; i++) + CPU_SET(i, setp); + } + else if (unformat 
(input, "%d,", &first)) + { + if (first > 64) + { + barf2: + vlib_cli_output (vm, "cpu %d invalid", first); + return 0; + } + CPU_SET(first, setp); + another_round = 1; + } + else if (unformat (input, "%d", &first)) + { + if (first > 64) + goto barf2; + + CPU_SET(first, setp); + } + } while (another_round); + + rv = sched_setaffinity (0 /* pid, 0 = this proc */, + sizeof (*setp), setp); + + if (rv < 0) + { + vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", + strerror(errno)); + return 0; + } + return show_affinity (vm, input, cmd); +} + +VLIB_CLI_COMMAND (set_affinity_command, static) = { + .path = "set affinity", + .short_help = "Set process cpu affinity", + .function = set_affinity, +}; + +static clib_error_t * +vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input) +{ + physmem_main_t * pm = &physmem_main; + u32 size_in_mb; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "no-huge") || unformat (input, "no-huge-pages")) + pm->no_hugepages = 1; + + else if (unformat(input, "size-in-mb %d", &size_in_mb) || + unformat(input, "size %d", &size_in_mb)) + pm->mem_size = size_in_mb << 20; + else + return unformat_parse_error (input); + } + + unformat_free (input); + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem"); diff --git a/vlib/vlib/unix/physmem.h b/vlib/vlib/unix/physmem.h new file mode 100644 index 00000000000..a963be746d8 --- /dev/null +++ b/vlib/vlib/unix/physmem.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_physmem_h__ +#define __included_physmem_h__ + +/* Manage I/O physical memory. */ +#define _GNU_SOURCE +#include <sched.h> +#include <vppinfra/cache.h> +#include <vppinfra/error.h> +#include <vppinfra/mheap.h> +#include <vppinfra/os.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <sys/fcntl.h> /* for open */ +#include <sys/file.h> /* for flock */ +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/ipc.h> +#include <sys/shm.h> + +typedef struct { + /* File descriptor for /dev/uio-dma. */ + int uio_dma_fd; + + /* Virtual memory via mmaped. */ + void * mem; + + /* Size in bytes. */ + uword mem_size; + + /* Heap allocated out of virtual memory. */ + void * heap; + + /* huge TLB segment id */ + int shmid; + + /* should we try to use htlb ? */ + int no_hugepages; + +} physmem_main_t; + +#endif /* __included_physmem_h__ */ diff --git a/vlib/vlib/unix/plugin.c b/vlib/vlib/unix/plugin.c new file mode 100644 index 00000000000..3411ef340af --- /dev/null +++ b/vlib/vlib/unix/plugin.c @@ -0,0 +1,210 @@ +/* + * plugin.c: plugin handling + * + * Copyright (c) 2011 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/unix/plugin.h> +#include <dlfcn.h> +#include <dirent.h> + +plugin_main_t vlib_plugin_main; + +void vlib_set_get_handoff_structure_cb (void *cb) +{ + plugin_main_t * pm = &vlib_plugin_main; + pm->handoff_structure_get_cb = cb; +} + +static void * vnet_get_handoff_structure (void) +{ + void * (*fp)(void); + + fp = vlib_plugin_main.handoff_structure_get_cb; + if (fp == 0) + return 0; + else + return (*fp)(); +} + +static int +load_one_plugin (plugin_main_t *pm, plugin_info_t *pi, int from_early_init) +{ + void *handle, *register_handle; + clib_error_t * (*fp)(vlib_main_t *, void *, int); + clib_error_t * error; + void *handoff_structure; + + handle = dlopen ((char *)pi->name, RTLD_LAZY); + + /* + * Note: this can happen if the plugin has an undefined symbol reference, + * so print a warning. Otherwise, the poor slob won't know what happened. + * Ask me how I know that... 
+ */ + if (handle == 0) + { + clib_warning ("%s", dlerror()); + return -1; + } + + pi->handle = handle; + + register_handle = dlsym (pi->handle, "vlib_plugin_register"); + if (register_handle == 0) + { + dlclose (handle); + return 0; + } + + fp = register_handle; + + handoff_structure = vnet_get_handoff_structure(); + + if (handoff_structure == 0) + error = clib_error_return (0, "handoff structure callback returned 0"); + else + error = (*fp)(pm->vlib_main, handoff_structure, from_early_init); + + if (error) + { + clib_error_report (error); + dlclose (handle); + return 1; + } + + clib_warning ("Loaded plugin: %s", pi->name); + + return 0; +} + +static u8 **split_plugin_path (plugin_main_t *pm) +{ + int i; + u8 **rv = 0; + u8 *path = pm->plugin_path; + u8 *this = 0; + + for (i = 0; i < vec_len (pm->plugin_path); i++) + { + if (path[i] != ':') + { + vec_add1(this, path[i]); + continue; + } + vec_add1(this, 0); + vec_add1 (rv, this); + this = 0; + } + if (this) + { + vec_add1 (this, 0); + vec_add1 (rv, this); + } + return rv; +} + +int vlib_load_new_plugins (plugin_main_t *pm, int from_early_init) +{ + DIR *dp; + struct dirent *entry; + struct stat statb; + uword *p; + plugin_info_t *pi; + u8 **plugin_path; + int i; + + plugin_path = split_plugin_path (pm); + + for (i = 0; i < vec_len (plugin_path); i++) + { + dp = opendir ((char *)plugin_path[i]); + + if (dp == 0) + continue; + + while ((entry = readdir (dp))) + { + u8 *plugin_name; + + if (pm->plugin_name_filter) + { + int j; + for (j = 0; j < vec_len (pm->plugin_name_filter); j++) + if (entry->d_name[j] != pm->plugin_name_filter[j]) + goto next; + } + + plugin_name = format (0, "%s/%s%c", plugin_path[i], + entry->d_name, 0); + + /* unreadable */ + if (stat ((char *)plugin_name, &statb) < 0) + { + ignore: + vec_free (plugin_name); + continue; + } + + /* a dir or other things which aren't plugins */ + if (!S_ISREG(statb.st_mode)) + goto ignore; + + p = hash_get_mem (pm->plugin_by_name_hash, plugin_name); + if (p == 
0) + { + vec_add2 (pm->plugin_info, pi, 1); + pi->name = plugin_name; + pi->file_info = statb; + + if (load_one_plugin (pm, pi, from_early_init)) + { + vec_free (plugin_name); + _vec_len (pm->plugin_info) = vec_len (pm->plugin_info) - 1; + continue; + } + memset (pi, 0, sizeof (*pi)); + hash_set_mem (pm->plugin_by_name_hash, plugin_name, + pi - pm->plugin_info); + } + next: + ; + } + closedir (dp); + vec_free (plugin_path[i]); + } + vec_free (plugin_path); + return 0; +} +char *vlib_plugin_path __attribute__((weak)); +char *vlib_plugin_path = ""; +char *vlib_plugin_name_filter __attribute__((weak)); +char *vlib_plugin_name_filter = 0; + +int vlib_plugin_early_init (vlib_main_t *vm) +{ + plugin_main_t *pm = &vlib_plugin_main; + + pm->plugin_path = format (0, "%s%c", vlib_plugin_path, 0); + + clib_warning ("plugin path %s", pm->plugin_path); + + if (vlib_plugin_name_filter) + pm->plugin_name_filter = format (0, "%s%c", vlib_plugin_name_filter, 0); + + pm->plugin_by_name_hash = hash_create_string (0, sizeof (uword)); + pm->vlib_main = vm; + + return vlib_load_new_plugins (pm, 1 /* from_early_init */); +} diff --git a/vlib/vlib/unix/plugin.h b/vlib/vlib/unix/plugin.h new file mode 100644 index 00000000000..e7d75099ed9 --- /dev/null +++ b/vlib/vlib/unix/plugin.h @@ -0,0 +1,88 @@ +/* + * plugin.h: plugin handling + * + * Copyright (c) 2011 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __included_plugin_h__ +#define __included_plugin_h__ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +/* + * vlib plugin scheme + * + * Almost anything which can be made to work in a vlib unix + * application will also work in a vlib plugin. + * + * The elf-section magic which registers static objects + * works so long as plugins are present when the vlib unix process + * starts. But wait: there's more... + * + * If an application calls vlib_load_new_plugins() -- possibly after + * changing vlib_plugin_main.plugin_path / vlib_plugin_main.plugin_name_filter, + * -- new plugins will be loaded. That, in turn, allows considerable + * flexibility in terms of adding feature code or fixing bugs without + * requiring the data-plane process to restart. + * + * When the plugin mechanism loads a plugin, it uses dlsym to locate + * and call the plugin's function vlib_plugin_register() if it exists. + * A plugin which expects to be loaded after the vlib application + * starts uses this callback to modify the application. If vlib_plugin_register + * returns non-zero, the plugin mechanism dlclose()'s the plugin. + * + * Applications control the plugin search path and name filter by + * declaring the variables vlib_plugin_path and vlib_plugin_name_filter. + * libvlib_unix.la supplies weak references for these symbols which + * effectively disable the scheme. In order for the elf-section magic to + * work, static plugins must be loaded at the earliest possible moment. + * + * An application can change these parameters at any time and call + * vlib_load_new_plugins().
+ */ + + + +typedef struct { + u8 *name; + struct stat file_info; + void *handle; +} plugin_info_t; + +typedef struct { + /* loaded plugin info */ + plugin_info_t *plugin_info; + uword *plugin_by_name_hash; + + /* path and name filter */ + u8 *plugin_path; + u8 *plugin_name_filter; + + /* handoff structure get callback */ + void *handoff_structure_get_cb; + + /* usual */ + vlib_main_t *vlib_main; +} plugin_main_t; + +plugin_main_t vlib_plugin_main; + +int vlib_plugin_early_init (vlib_main_t *vm); +int vlib_load_new_plugins (plugin_main_t *pm, int from_early_init); + +#endif /* __included_plugin_h__ */ diff --git a/vlib/vlib/unix/unix.h b/vlib/vlib/unix/unix.h new file mode 100644 index 00000000000..0802a93baa3 --- /dev/null +++ b/vlib/vlib/unix/unix.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * unix.h: Unix specific main state + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_unix_unix_h +#define included_unix_unix_h + +#include <vppinfra/socket.h> + +struct unix_file; +typedef clib_error_t * (unix_file_function_t) (struct unix_file * f); + +typedef struct unix_file { + /* Unix file descriptor from open/socket. */ + u32 file_descriptor; + + u32 flags; +#define UNIX_FILE_DATA_AVAILABLE_TO_WRITE (1 << 0) + + /* Data available for function's use. */ + uword private_data; + + /* Functions to be called when read/write data becomes ready. 
*/ + unix_file_function_t * read_function, * write_function, * error_function; +} unix_file_t; + +typedef struct { + f64 time; + clib_error_t * error; +} unix_error_history_t; + +typedef enum { + UNIX_FILE_UPDATE_ADD, + UNIX_FILE_UPDATE_MODIFY, + UNIX_FILE_UPDATE_DELETE, +} unix_file_update_type_t; + +typedef struct { + /* Back pointer to main structure. */ + vlib_main_t * vlib_main; + + u32 flags; + /* Run interactively or as daemon (background process). */ +#define UNIX_FLAG_INTERACTIVE (1 << 0) +#define UNIX_FLAG_NODAEMON (1 << 1) + + /* Pool of files to poll for input/output. */ + unix_file_t * file_pool; + + /* CLI listen socket. */ + clib_socket_t cli_listen_socket; + + void (* file_update) (unix_file_t * file, unix_file_update_type_t update_type); + + /* Circular buffer of last unix errors. */ + unix_error_history_t error_history[128]; + u32 error_history_index; + u64 n_total_errors; + + /* startup-config filename */ + u8 *startup_config_filename; + + /* unix config complete */ + volatile int unix_config_complete; + + /* CLI log file. GIGO. */ + u8 *log_filename; + int log_fd; + /* Don't put telnet connections into character mode */ + int cli_line_mode; + u32 cli_history_limit; + +} unix_main_t; + +/* Global main structure. 
*/ +extern unix_main_t unix_main; + +always_inline uword +unix_file_add (unix_main_t * um, unix_file_t * template) +{ + unix_file_t * f; + pool_get (um->file_pool, f); + f[0] = template[0]; + um->file_update (f, UNIX_FILE_UPDATE_ADD); + return f - um->file_pool; +} + +always_inline void +unix_file_del (unix_main_t * um, unix_file_t * f) +{ + um->file_update (f, UNIX_FILE_UPDATE_DELETE); + close (f->file_descriptor); + f->file_descriptor = ~0; + pool_put (um->file_pool, f); +} + +always_inline uword +unix_file_set_data_available_to_write (u32 unix_file_index, uword is_available) +{ + unix_file_t * uf = pool_elt_at_index (unix_main.file_pool, unix_file_index); + uword was_available = (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + if ((was_available != 0) != (is_available != 0)) + { + uf->flags ^= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } + return was_available != 0; +} + +always_inline void +unix_save_error (unix_main_t * um, clib_error_t * error) +{ + unix_error_history_t * eh = um->error_history + um->error_history_index; + clib_error_free_vector (eh->error); + eh->error = error; + eh->time = vlib_time_now (um->vlib_main); + um->n_total_errors += 1; + if (++um->error_history_index >= ARRAY_LEN (um->error_history)) + um->error_history_index = 0; +} + +/* Main function for Unix VLIB. */ +int vlib_unix_main (int argc, char * argv[]); + +/* Call to allocate/initialize physical DMA memory subsystem. + This is not an init function so that users can explicitly enable/disable + physmem when its not needed. */ +clib_error_t * unix_physmem_init (vlib_main_t * vm, + int fail_if_physical_memory_not_present); + +/* Set prompt for CLI. */ +void vlib_unix_cli_set_prompt (char * prompt); + +static inline unix_main_t * vlib_unix_get_main (void) +{ + return &unix_main; +} + +/* thread stack array; vec_len = max number of threads */ +u8 **vlib_thread_stacks; + +#endif /* included_unix_unix_h */ |