diff options
author | Damjan Marion <damarion@cisco.com> | 2016-05-11 23:07:18 +0200 |
---|---|---|
committer | Damjan Marion <damarion@cisco.com> | 2016-05-19 18:14:38 +0200 |
commit | 1c80e831b728ab378949714d5059a0b5b1822a0a (patch) | |
tree | e4f3ecfee143f7dee0e9905570d41ca3ee345c83 /vppinfra | |
parent | 82e29c455833b5b12e04c89d2dec1106b499e6b0 (diff) |
Add support for multiple microarchitectures in single binary
* compiler -march= parameter is changed from native to corei7
so code is always generated with instructions which are available
on the Nehalem microarchitecture (up to SSE4.2)
* compiler -mtune= parameter is added so code is optimized for
corei7-avx, which corresponds to the Sandy Bridge microarchitecture
* set of macros is added which allows run-time detection of available
cpu instructions (e.g. clib_cpu_supports_avx())
* set of macros is added which allows us to clone graph node functions
where the cloned function is optimized for a different microarchitecture
Those macros are using following attributes:
__attribute__((flatten))
__attribute__((target("arch=core-avx2")))
I.e. if applied to foo_node_fn(), the macro will generate cloned
functions foo_node_fn_avx2() and foo_node_fn_avx512() (future)
It will also generate function void * foo_node_fn_multiarch_select()
which detects available instruction set and returns pointer to the
best matching function clone.
Change-Id: I2dce0ac92a5ede95fcb56f47f3d1f3c4c040bac0
Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'vppinfra')
-rw-r--r-- | vppinfra/vppinfra/cpu.c | 23 | ||||
-rw-r--r-- | vppinfra/vppinfra/cpu.h | 78 |
2 files changed, 95 insertions, 6 deletions
diff --git a/vppinfra/vppinfra/cpu.c b/vppinfra/vppinfra/cpu.c index f2dbaf1f7d8..9008ee3dd89 100644 --- a/vppinfra/vppinfra/cpu.c +++ b/vppinfra/vppinfra/cpu.c @@ -16,10 +16,6 @@ #include <vppinfra/format.h> #include <vppinfra/cpu.h> -#if __x86_64__ -#include <cpuid.h> -#endif - #define foreach_x86_cpu_uarch \ _(0x06, 0x4f, "Broadwell", "Broadwell-EP/EX") \ _(0x06, 0x3d, "Broadwell", "Broadwell") \ @@ -108,4 +104,21 @@ format_cpu_model_name (u8 * s, va_list * args) #else /* ! __x86_64__ */ return format (s, "unknown"); #endif -}
\ No newline at end of file +} + +u8 * +format_cpu_flags (u8 * s, va_list * args) +{ +#if __x86_64__ +#define _(flag, func, reg, bit) \ + if (clib_cpu_supports_ ## flag()) \ + s = format (s, #flag " "); + foreach_x86_64_flags + return s; +#undef _ +#else /* ! __x86_64__ */ + return format (s, "unknown"); +#endif +} + + diff --git a/vppinfra/vppinfra/cpu.h b/vppinfra/vppinfra/cpu.h index 79cdf74f84b..961af709a63 100644 --- a/vppinfra/vppinfra/cpu.h +++ b/vppinfra/vppinfra/cpu.h @@ -16,7 +16,83 @@ #ifndef included_clib_cpu_h #define included_clib_cpu_h +#include <vppinfra/format.h> + +/* + * multiarchitecture support. Adding new entry will produce + * new graph node function variant optimized for specific cpu + * microarchitecture. + * Order is important for runtime selection, as 1st match wins... + */ + +#if __x86_64__ && CLIB_DEBUG == 0 +#define foreach_march_variant(macro, x) \ + macro(avx2, x, "arch=core-avx2") +#else +#define foreach_march_variant(macro, x) +#endif + + +#if __GNUC__ > 4 && !__clang__ +#define CLIB_CPU_OPTIMIZED __attribute__ ((optimize ("tree-vectorize"))) +#else +#define CLIB_CPU_OPTIMIZED +#endif + + +#define CLIB_MULTIARCH_ARCH_CHECK(arch, fn, tgt) \ + if (clib_cpu_supports_ ## arch()) \ + return & fn ## _ ##arch; + +#define CLIB_MULTIARCH_SELECT_FN(fn,...) 
\ + __VA_ARGS__ void * fn ## _multiarch_select(void) \ +{ \ + foreach_march_variant(CLIB_MULTIARCH_ARCH_CHECK, fn) \ + return & fn; \ +} + +#if __x86_64__ +#include "cpuid.h" + +#define foreach_x86_64_flags \ +_ (sse3, 1, ecx, 0) \ +_ (ssse3, 1, ecx, 9) \ +_ (sse41, 1, ecx, 19) \ +_ (sse42, 1, ecx, 20) \ +_ (avx, 1, ecx, 28) \ +_ (avx2, 7, ebx, 5) \ +_ (avx512f, 7, ebx, 16) \ +_ (aes, 1, ecx, 25) \ +_ (sha, 7, ebx, 29) + +static inline int +clib_get_cpuid(const u32 lev, u32 * eax, u32 *ebx, u32 * ecx, u32 * edx) +{ + if ((u32) __get_cpuid_max (0x80000000 & lev, 0) < lev) + return 0; + if (lev == 7) + __cpuid_count(lev, 0, *eax, *ebx, *ecx, *edx); + else + __cpuid(lev, *eax, *ebx, *ecx, *edx); + return 1; +} + + +#define _(flag, func, reg, bit) \ +static inline int \ +clib_cpu_supports_ ## flag() \ +{ \ + u32 __attribute__((unused)) eax, ebx = 0, ecx = 0, edx = 0; \ + clib_get_cpuid (func, &eax, &ebx, &ecx, &edx); \ + \ + return ((reg & (1 << bit)) != 0); \ +} + foreach_x86_64_flags +#undef _ +#endif + format_function_t format_cpu_uarch; format_function_t format_cpu_model_name; +format_function_t format_cpu_flags; -#endif
\ No newline at end of file +#endif |