diff options
author | Adrian Villin <avillin@cisco.com> | 2024-06-17 08:51:27 +0200 |
---|---|---|
committer | Dave Wallace <dwallacelf@gmail.com> | 2024-07-08 16:27:38 +0000 |
commit | 5d171ebdc21efa4085e2c2130f595d7e0e1d2f59 (patch) | |
tree | 9b1e967d20e8c83f9780d5a76579db0edd3c5373 | |
parent | 75e8e1e948da182dbf4f6b3394f1b7fc44c1403a (diff) |
hs-test: CPU allocation improvements
- Release build runs on numa node0, debug on node1.
Using the last digit of a build number to reserve 4 cores per test
means we can run 20 jobs (10 release, 10 debug) on the same machine,
assuming we have 111 cores available (not counting core 0).
Can be increased if needed, there are still some cores left.
- Added separate numa aware cpu allocation
- Added CPU0=true|false (useful for users with 4c/8t)
Type: test
Change-Id: Iba8e492a4e01a7f457e49112303887a2a27f6af9
Signed-off-by: Adrian Villin <avillin@cisco.com>
-rw-r--r-- | extras/hs-test/Makefile | 12 | ||||
-rw-r--r-- | extras/hs-test/framework_test.go | 6 | ||||
-rw-r--r-- | extras/hs-test/hs_test.sh | 6 | ||||
-rw-r--r-- | extras/hs-test/infra/cpu.go | 195 | ||||
-rw-r--r-- | extras/hs-test/infra/hst_suite.go | 7 |
5 files changed, 179 insertions, 47 deletions
diff --git a/extras/hs-test/Makefile b/extras/hs-test/Makefile index 596acb1b57d..e326e9f500c 100644 --- a/extras/hs-test/Makefile +++ b/extras/hs-test/Makefile @@ -41,6 +41,10 @@ ifeq ($(REPEAT),) REPEAT=0 endif +ifeq ($(CPU0),) +CPU0=false +endif + ifeq ($(VPPSRC),) VPPSRC=$(shell pwd)/../.. endif @@ -81,6 +85,7 @@ help: @echo " VPPSRC=[path-to-vpp-src] - path to vpp source files (for gdb)" @echo " PARALLEL=[n-cpus] - number of test processes to spawn to run in parallel" @echo " REPEAT=[n] - repeat tests up to N times or until a failure occurs" + @echo " CPU0=[true|false] - use cpu0" @echo @echo "List of all tests:" @$(MAKE) list-tests @@ -117,7 +122,7 @@ test: .deps.ok .build.ok @# necessary so gmake won't skip executing the bash script @-bash ./hs_test.sh --persist=$(PERSIST) --verbose=$(VERBOSE) \ --unconfigure=$(UNCONFIGURE) --debug=$(DEBUG) --test=$(TEST) --cpus=$(CPUS) \ - --vppsrc=$(VPPSRC) --parallel=$(PARALLEL) --repeat=$(REPEAT) + --vppsrc=$(VPPSRC) --parallel=$(PARALLEL) --repeat=$(REPEAT) --cpu0=$(CPU0) @bash ./script/compress.sh .PHONY: test-debug @@ -126,14 +131,15 @@ test-debug: .deps.ok .build_debug.ok @# necessary so gmake won't skip executing the bash script @-bash ./hs_test.sh --persist=$(PERSIST) --verbose=$(VERBOSE) \ --unconfigure=$(UNCONFIGURE) --debug=$(DEBUG) --test=$(TEST) --cpus=$(CPUS) \ - --vppsrc=$(VPPSRC) --parallel=$(PARALLEL) --repeat=$(REPEAT) --debug_build=true + --vppsrc=$(VPPSRC) --parallel=$(PARALLEL) --repeat=$(REPEAT) --debug_build=true \ + --cpu0=$(CPU0) @bash ./script/compress.sh .PHONY: test-cov test-cov: .deps.ok .build.cov.ok @-bash ./hs_test.sh --persist=$(PERSIST) --verbose=$(VERBOSE) \ --unconfigure=$(UNCONFIGURE) --debug=$(DEBUG) --test=$(TEST-HS) --cpus=$(CPUS) \ - --vppsrc=$(VPPSRC) + --vppsrc=$(VPPSRC) --cpu0=$(CPU0) @$(MAKE) -C ../.. 
test-cov-post HS_TEST=1 @bash ./script/compress.sh diff --git a/extras/hs-test/framework_test.go b/extras/hs-test/framework_test.go index a086f75a5fc..7c8c5648d7b 100644 --- a/extras/hs-test/framework_test.go +++ b/extras/hs-test/framework_test.go @@ -5,6 +5,7 @@ import ( "os" "path/filepath" "runtime" + "strings" "testing" "time" @@ -26,6 +27,11 @@ func TestHst(t *testing.T) { SuiteTimeout = time.Minute * 5 } + output, err := os.ReadFile("/sys/devices/system/node/online") + fmt.Println(string(output)) + if err == nil && strings.Contains(string(output), "-") { + NumaAwareCpuAlloc = true + } // creates a file with PPID, used for 'make cleanup-hst' ppid := fmt.Sprint(os.Getppid()) ppid = ppid[:len(ppid)-1] diff --git a/extras/hs-test/hs_test.sh b/extras/hs-test/hs_test.sh index 107fc686176..803b8f717da 100644 --- a/extras/hs-test/hs_test.sh +++ b/extras/hs-test/hs_test.sh @@ -68,6 +68,12 @@ case "${i}" in --repeat=*) ginkgo_args="$ginkgo_args --repeat=${i#*=}" ;; + --cpu0=*) + cpu0="${i#*=}" + if [ "$cpu0" = "true" ]; then + args="$args -cpu0" + fi + ;; esac done diff --git a/extras/hs-test/infra/cpu.go b/extras/hs-test/infra/cpu.go index b5555d85b98..b26a06c98db 100644 --- a/extras/hs-test/infra/cpu.go +++ b/extras/hs-test/infra/cpu.go @@ -7,6 +7,7 @@ import ( . 
"github.com/onsi/ginkgo/v2" "os" "os/exec" + "strconv" "strings" ) @@ -18,80 +19,188 @@ type CpuContext struct { } type CpuAllocatorT struct { - cpus []int + cpus []int + runningInCi bool + buildNumber int + maxContainerCount int +} + +func iterateAndAppend(start int, end int, slice []int) []int { + for i := start; i <= end; i++ { + slice = append(slice, i) + } + return slice } var cpuAllocator *CpuAllocatorT = nil func (c *CpuAllocatorT) Allocate(containerCount int, nCpus int) (*CpuContext, error) { var cpuCtx CpuContext + // indexes, not actual cores + var minCpu, maxCpu int - // splitting cpus into equal parts; this will over-allocate cores but it's good enough for now - maxContainerCount := 4 - // skip CPU 0 - minCpu := ((GinkgoParallelProcess() - 1) * maxContainerCount * nCpus) + 1 - maxCpu := (GinkgoParallelProcess() * maxContainerCount * nCpus) + if c.runningInCi { + minCpu = ((c.buildNumber) * c.maxContainerCount * nCpus) + maxCpu = ((c.buildNumber + 1) * c.maxContainerCount * nCpus) - 1 + } else { + minCpu = ((GinkgoParallelProcess() - 1) * c.maxContainerCount * nCpus) + maxCpu = (GinkgoParallelProcess() * c.maxContainerCount * nCpus) - 1 + } if len(c.cpus)-1 < maxCpu { - err := fmt.Errorf("could not allocate %d CPUs; available: %d; attempted to allocate cores %d-%d", - nCpus*containerCount, len(c.cpus)-1, minCpu, maxCpu) + err := fmt.Errorf("could not allocate %d CPUs; available count: %d; attempted to allocate cores with index %d-%d; max index: %d;\n"+ + "available cores: %v", nCpus*containerCount, len(c.cpus), minCpu, maxCpu, len(c.cpus)-1, c.cpus) return nil, err } + if containerCount == 1 { cpuCtx.cpus = c.cpus[minCpu : minCpu+nCpus] - } else if containerCount > 1 && containerCount <= maxContainerCount { + } else if containerCount > 1 && containerCount <= c.maxContainerCount { cpuCtx.cpus = c.cpus[minCpu+(nCpus*(containerCount-1)) : minCpu+(nCpus*containerCount)] } else { - return nil, fmt.Errorf("too many containers; CPU allocation for >%d containers 
is not implemented", maxContainerCount) + return nil, fmt.Errorf("too many containers; CPU allocation for >%d containers is not implemented", c.maxContainerCount) } - cpuCtx.cpuAllocator = c return &cpuCtx, nil } func (c *CpuAllocatorT) readCpus() error { - var first, last int - - // Path depends on cgroup version. We need to check which version is in use. - // For that following command can be used: 'stat -fc %T /sys/fs/cgroup/' - // In case the output states 'cgroup2fs' then cgroups v2 is used, 'tmpfs' in case cgroups v1. - cmd := exec.Command("stat", "-fc", "%T", "/sys/fs/cgroup/") - byteOutput, err := cmd.CombinedOutput() - if err != nil { - return err - } - CpuPath := CgroupPath - if strings.Contains(string(byteOutput), "tmpfs") { - CpuPath += "cpuset/cpuset.effective_cpus" - } else if strings.Contains(string(byteOutput), "cgroup2fs") { - CpuPath += "cpuset.cpus.effective" + var first, second, third, fourth int + var file *os.File + var err error + + if c.runningInCi { + // non-debug build runs on node0, debug on node1 + if *IsDebugBuild { + file, err = os.Open("/sys/devices/system/node/node1/cpulist") + } else { + file, err = os.Open("/sys/devices/system/node/node0/cpulist") + } + if err != nil { + return err + } + defer file.Close() + + sc := bufio.NewScanner(file) + sc.Scan() + line := sc.Text() + _, err = fmt.Sscanf(line, "%d-%d,%d-%d", &first, &second, &third, &fourth) + if err != nil { + return err + } + + c.cpus = iterateAndAppend(first, second, c.cpus) + c.cpus = iterateAndAppend(third, fourth, c.cpus) + } else if NumaAwareCpuAlloc { + var fifth, sixth int + var tmpCpus []int + + file, err := os.Open("/sys/devices/system/node/online") + if err != nil { + return err + } + defer file.Close() + + sc := bufio.NewScanner(file) + sc.Scan() + line := sc.Text() + // get numa node range + _, err = fmt.Sscanf(line, "%d-%d", &first, &second) + if err != nil { + return err + } + + for i := first; i <= second; i++ { + file, err := 
os.Open("/sys/devices/system/node/node" + fmt.Sprint(i) + "/cpulist") + if err != nil { + return err + } + defer file.Close() + + // get numa node cores + sc := bufio.NewScanner(file) + sc.Scan() + line := sc.Text() + _, err = fmt.Sscanf(line, "%d-%d,%d-%d", &third, &fourth, &fifth, &sixth) + if err != nil { + return err + } + + // get numa node cores from first range + tmpCpus = iterateAndAppend(third, fourth, tmpCpus) + + // discard cpu 0 + if tmpCpus[0] == 0 && !*UseCpu0{ + tmpCpus = tmpCpus[1:] + } + + // get numa node cores from second range + tmpCpus = iterateAndAppend(fifth, sixth, tmpCpus) + + // make c.cpus divisible by maxContainerCount * nCpus, so we don't have to check which numa will be used + // and we can use offsets + count_to_remove := len(tmpCpus) % (c.maxContainerCount * *NConfiguredCpus) + c.cpus = append(c.cpus, tmpCpus[:len(tmpCpus)-count_to_remove]...) + tmpCpus = tmpCpus[:0] + } } else { - return errors.New("cgroup unknown fs: " + string(byteOutput)) - } + // Path depends on cgroup version. We need to check which version is in use. + // For that following command can be used: 'stat -fc %T /sys/fs/cgroup/' + // In case the output states 'cgroup2fs' then cgroups v2 is used, 'tmpfs' in case cgroups v1. 
+ cmd := exec.Command("stat", "-fc", "%T", "/sys/fs/cgroup/") + byteOutput, err := cmd.CombinedOutput() + if err != nil { + return err + } - file, err := os.Open(CpuPath) - if err != nil { - return err - } - defer file.Close() - - sc := bufio.NewScanner(file) - sc.Scan() - line := sc.Text() - _, err = fmt.Sscanf(line, "%d-%d", &first, &last) - if err != nil { - return err + CpuPath := CgroupPath + if strings.Contains(string(byteOutput), "tmpfs") { + CpuPath += "cpuset/cpuset.effective_cpus" + } else if strings.Contains(string(byteOutput), "cgroup2fs") { + CpuPath += "cpuset.cpus.effective" + } else { + return errors.New("cgroup unknown fs: " + string(byteOutput)) + } + + file, err := os.Open(CpuPath) + if err != nil { + return err + } + defer file.Close() + + sc := bufio.NewScanner(file) + sc.Scan() + line := sc.Text() + _, err = fmt.Sscanf(line, "%d-%d", &first, &second) + if err != nil { + return err + } + c.cpus = iterateAndAppend(first, second, c.cpus) } - for i := first; i <= last; i++ { - c.cpus = append(c.cpus, i) + + // discard cpu 0 + if c.cpus[0] == 0 && !*UseCpu0 { + c.cpus = c.cpus[1:] } return nil } func CpuAllocator() (*CpuAllocatorT, error) { if cpuAllocator == nil { + var err error cpuAllocator = new(CpuAllocatorT) - err := cpuAllocator.readCpus() + cpuAllocator.maxContainerCount = 4 + buildNumberStr := os.Getenv("BUILD_NUMBER") + + if buildNumberStr != "" { + cpuAllocator.runningInCi = true + // get last digit of build number + cpuAllocator.buildNumber, err = strconv.Atoi(buildNumberStr[len(buildNumberStr)-1:]) + if err != nil { + return nil, err + } + } + err = cpuAllocator.readCpus() if err != nil { return nil, err } diff --git a/extras/hs-test/infra/hst_suite.go b/extras/hs-test/infra/hst_suite.go index a6ba14676d0..b2e069343a1 100644 --- a/extras/hs-test/infra/hst_suite.go +++ b/extras/hs-test/infra/hst_suite.go @@ -33,6 +33,8 @@ var IsVppDebug = flag.Bool("debug", false, "attach gdb to vpp") var NConfiguredCpus = flag.Int("cpus", 1, "number of 
CPUs assigned to vpp") var VppSourceFileDir = flag.String("vppsrc", "", "vpp source file directory") var IsDebugBuild = flag.Bool("debug_build", false, "some paths are different with debug build") +var UseCpu0 = flag.Bool("cpu0", false, "use cpu0") +var NumaAwareCpuAlloc bool var SuiteTimeout time.Duration type HstSuite struct { @@ -78,7 +80,10 @@ func (s *HstSuite) SetupSuite() { func (s *HstSuite) AllocateCpus() []int { cpuCtx, err := s.CpuAllocator.Allocate(len(s.StartedContainers), s.CpuPerVpp) - s.AssertNil(err) + // using Fail instead of AssertNil to make error message more readable + if err != nil { + Fail(fmt.Sprint(err)) + } s.AddCpuContext(cpuCtx) return cpuCtx.cpus } |