diff options
Diffstat (limited to 'resources/libraries/python/jumpavg')
-rw-r--r-- | resources/libraries/python/jumpavg/__init__.py | 10 | ||||
-rw-r--r-- | resources/libraries/python/jumpavg/avg_stdev_stats.py (renamed from resources/libraries/python/jumpavg/AvgStdevStats.py) | 58 | ||||
-rw-r--r-- | resources/libraries/python/jumpavg/bit_counting_group.py (renamed from resources/libraries/python/jumpavg/BitCountingGroup.py) | 146 | ||||
-rw-r--r-- | resources/libraries/python/jumpavg/bit_counting_group_list.py (renamed from resources/libraries/python/jumpavg/BitCountingGroupList.py) | 140 | ||||
-rw-r--r-- | resources/libraries/python/jumpavg/bit_counting_stats.py (renamed from resources/libraries/python/jumpavg/BitCountingStats.py) | 131 | ||||
-rw-r--r-- | resources/libraries/python/jumpavg/classify.py | 78 |
6 files changed, 281 insertions, 282 deletions
diff --git a/resources/libraries/python/jumpavg/__init__.py b/resources/libraries/python/jumpavg/__init__.py index 4fa696c538..7f63b5ee39 100644 --- a/resources/libraries/python/jumpavg/__init__.py +++ b/resources/libraries/python/jumpavg/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: @@ -15,8 +15,8 @@ __init__ file for "jumpavg" Python package. """ -from .AvgStdevStats import AvgStdevStats -from .BitCountingStats import BitCountingStats -from .BitCountingGroup import BitCountingGroup -from .BitCountingGroupList import BitCountingGroupList +from .avg_stdev_stats import AvgStdevStats +from .bit_counting_stats import BitCountingStats +from .bit_counting_group import BitCountingGroup +from .bit_counting_group_list import BitCountingGroupList from .classify import classify diff --git a/resources/libraries/python/jumpavg/AvgStdevStats.py b/resources/libraries/python/jumpavg/avg_stdev_stats.py index 4720c10f3d..c21c50c8f8 100644 --- a/resources/libraries/python/jumpavg/AvgStdevStats.py +++ b/resources/libraries/python/jumpavg/avg_stdev_stats.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Cisco and/or its affiliates. +# Copyright (c) 2024 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: @@ -13,9 +13,12 @@ """Module holding AvgStdevStats class.""" +import dataclasses import math +import typing +@dataclasses.dataclass class AvgStdevStats: """Class for statistics which include average and stdev of a group. @@ -25,45 +28,18 @@ class AvgStdevStats: Instances are only statistics, the data itself is stored elsewhere. 
""" - def __init__(self, size=0, avg=0.0, stdev=0.0): - """Construct the stats object by storing the values needed. - - Each value has to be numeric. - The values are not sanitized depending on size, wrong initialization - can cause delayed math errors. - - :param size: Number of values participating in this group. - :param avg: Population average of the participating sample values. - :param stdev: Population standard deviation of the sample values. - :type size: int - :type avg: float - :type stdev: float - """ - self.size = size - self.avg = avg - self.stdev = stdev - - def __str__(self): - """Return string with human readable description of the group. - - :returns: Readable description. - :rtype: str - """ - return f"size={self.size} avg={self.avg} stdev={self.stdev}" - - def __repr__(self): - """Return string executable as Python constructor call. - - :returns: Executable constructor call. - :rtype: str - """ - return ( - f"AvgStdevStats(size={self.size!r},avg={self.avg!r}" - f",stdev={self.stdev!r})" - ) + size: int = 0 + """Number of scalar values (samples) participating in this group.""" + avg: float = 0.0 + """Population average of the participating sample values.""" + stdev: float = 0.0 + """Population standard deviation of the sample values.""" @classmethod - def for_runs(cls, runs): + def for_runs( + cls, + runs: typing.Iterable[typing.Union[float, "AvgStdevStats"]], + ) -> "AvgStdevStats": """Return new stats instance describing the sequence of runs. If you want to append data to existing stats object, @@ -72,8 +48,8 @@ class AvgStdevStats: Instead of a verb, "for" is used to start this method name, to signify the result contains less information than the input data. - Here, Run is a hypothetical abstract class, an union of float and cls. - Defining that as a real abstract class in Python 2 is too much hassle. + Here, run is a hypothetical abstract class, an union of float and cls. + Defining that as a real abstract class in Python is too much hassle. 
:param runs: Sequence of data to describe by the new metadata. :type runs: Iterable[Union[float, cls]] @@ -97,6 +73,8 @@ class AvgStdevStats: run_size = run.size run_avg = run.avg run_stdev = run.stdev + if run_size < 1: + continue old_total_size = total_size delta = run_avg - total_avg total_size += run_size diff --git a/resources/libraries/python/jumpavg/BitCountingGroup.py b/resources/libraries/python/jumpavg/bit_counting_group.py index f1bdc502fd..22c9337532 100644 --- a/resources/libraries/python/jumpavg/BitCountingGroup.py +++ b/resources/libraries/python/jumpavg/bit_counting_group.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: @@ -13,14 +13,16 @@ """Module holding BitCountingGroup class.""" -import copy +import collections +import dataclasses +import typing -from .AvgStdevStats import AvgStdevStats -from .BitCountingStats import BitCountingStats +from .avg_stdev_stats import AvgStdevStats +from .bit_counting_stats import BitCountingStats -class BitCountingGroup: - # TODO: Inherit from collections.abc.Sequence in Python 3. +@dataclasses.dataclass +class BitCountingGroup(collections.abc.Sequence): """Group of runs which tracks bit count in an efficient manner. This class contains methods that mutate the internal state, @@ -38,74 +40,63 @@ class BitCountingGroup: a method to add a single run in an efficient manner is provided. """ - def __init__(self, run_list=None, stats=None, bits=None, - max_value=None, prev_avg=None, comment="unknown"): - """Set the internal state and partially the stats. - - A "group" stands for an Iterable of runs, where "run" is either - a float value, or a stats-like object (only size, avg and stdev - are accessed). 
Run is a hypothetical abstract class, - defining it in Python 2 is too much hassle. - - Only a copy of the run list argument value is stored in the instance, - so it is not a problem if the value object is mutated afterwards. + run_list: typing.List[typing.Union[float, AvgStdevStats]] + """List of runs to compose into this group. + The init call takes ownership of the list, + so the caller should clone it to avoid unexpected mutations.""" + max_value: float + """Maximal sample value to expect.""" + unit: float = 1.0 + """Typical resolution of the values""" + comment: str = "normal" + """Any string giving more info, e.g. "regression".""" + prev_avg: typing.Optional[float] = None + """Average of the previous group, if any.""" + stats: AvgStdevStats = None + """Stats object used for computing bits. + Almost always recomputed, except when non-None in init.""" + cached_bits: typing.Optional[float] = None + """Cached value of information content. + Noned on edit, recomputed if needed and None.""" + + def __post_init__(self): + """Recompute stats if None. It is not verified whether the user provided values are valid, e.g. whether the stats and bits values reflect the runs. - - :param run_list: List of run to compose into this group. Default: empty. - :param stats: Stats object used for computing bits. - :param bits: Cached value of information content. - :param max_value: Maximal sample value to be used for computing. - :param prev_avg: Average of the previous group, affects bits. - :param comment: Any string giving more info, e.g. "regression". 
- :type run_list: Iterable[Run] - :type stats: Optional[AvgStdevStats] - :type bits: Optional[float] - :type max_value: float - :type prev_avg: Optional[float] - :type comment: str """ - self.run_list = copy.deepcopy(run_list) if run_list else list() - self.stats = stats - self.cached_bits = bits - self.max_value = max_value - self.prev_avg = prev_avg - self.comment = comment if self.stats is None: - self.stats = AvgStdevStats.for_runs(self.run_list) - - def __str__(self): - """Return string with human readable description of the group. + self.stats = AvgStdevStats.for_runs(runs=self.run_list) - :returns: Readable description. - :rtype: str - """ - return f"stats={self.stats} bits={self.cached_bits}" + @property + def bits(self) -> float: + """Return overall bit content of the group list. - def __repr__(self): - """Return string executable as Python constructor call. + If not cached, compute from stats and cache. - :returns: Executable constructor call. - :rtype: str + :returns: The overall information content in bits. + :rtype: float """ - return ( - f"BitCountingGroup(run_list={self.run_list!r},stats={self.stats!r}" - f",bits={self.cached_bits!r},max_value={self.max_value!r}" - f",prev_avg={self.prev_avg!r},comment={self.comment!r})" - ) + if self.cached_bits is None: + self.cached_bits = BitCountingStats.for_runs_and_params( + runs=[self.stats], + max_value=self.max_value, + unit=self.unit, + prev_avg=self.prev_avg, + ).bits + return self.cached_bits - def __getitem__(self, index): + def __getitem__(self, index: int) -> typing.Union[float, AvgStdevStats]: """Return the run at the index. :param index: Index of the run to return. :type index: int :returns: The run at the index. - :rtype: Run + :rtype: typing.Union[float, AvgStdevStats] """ return self.run_list[index] - def __len__(self): + def __len__(self) -> int: """Return the number of runs in the group. :returns: The Length of run_list. 
@@ -113,39 +104,36 @@ class BitCountingGroup: """ return len(self.run_list) - def copy(self): + def copy(self) -> "BitCountingGroup": """Return a new instance with copied internal state. + Stats are preserved to avoid re-computation. + As both float and AvgStdevStats are effectively immutable, + only a shallow copy of the runs list is performed. + :returns: The copied instance. :rtype: BitCountingGroup """ stats = AvgStdevStats.for_runs([self.stats]) return self.__class__( - run_list=self.run_list, stats=stats, bits=self.cached_bits, - max_value=self.max_value, prev_avg=self.prev_avg, - comment=self.comment) - - @property - def bits(self): - """Return overall bit content of the group list. - - If not cached, compute from stats and cache. - - :returns: The overall information content in bits. - :rtype: float - """ - if self.cached_bits is None: - self.cached_bits = BitCountingStats.for_runs( - [self.stats], self.max_value, self.prev_avg).bits - return self.cached_bits + run_list=list(self.run_list), + stats=stats, + cached_bits=self.cached_bits, + max_value=self.max_value, + unit=self.unit, + prev_avg=self.prev_avg, + comment=self.comment, + ) - def append(self, run): + def append( + self, run: typing.Union[float, AvgStdevStats] + ) -> "BitCountingGroup": """Mutate to add the new run, return self. Stats are updated, but old bits value is deleted from cache. :param run: The run value to add to the group. - :type value: Run + :type value: typing.Union[float, AvgStdevStats] :returns: The updated self. :rtype: BitCountingGroup """ @@ -154,7 +142,9 @@ class BitCountingGroup: self.cached_bits = None return self - def extend(self, runs): + def extend( + self, runs: typing.Iterable[typing.Union[float, AvgStdevStats]] + ) -> "BitCountingGroup": """Mutate to add the new runs, return self. This is saves small amount of computation @@ -163,7 +153,7 @@ class BitCountingGroup: Stats are updated, but old bits value is deleted from cache. :param runs: The runs to add to the group. 
- :type value: Iterable[Run] + :type value: typing.Iterable[typing.Union[float, AvgStdevStats]] :returns: The updated self. :rtype: BitCountingGroup """ diff --git a/resources/libraries/python/jumpavg/BitCountingGroupList.py b/resources/libraries/python/jumpavg/bit_counting_group_list.py index aed1304d97..e4d33b53a2 100644 --- a/resources/libraries/python/jumpavg/BitCountingGroupList.py +++ b/resources/libraries/python/jumpavg/bit_counting_group_list.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: @@ -13,13 +13,16 @@ """Module holding BitCountingGroupList class.""" -import copy +import collections +import dataclasses +import typing -from .BitCountingGroup import BitCountingGroup +from .avg_stdev_stats import AvgStdevStats # Just for type hints. +from .bit_counting_group import BitCountingGroup -class BitCountingGroupList: - # TODO: Inherit from collections.abc.Sequence in Python 3. +@dataclasses.dataclass +class BitCountingGroupList(collections.abc.Sequence): """List of data groups which tracks overall bit count. The Sequence-like access is related to the list of groups, @@ -41,55 +44,29 @@ class BitCountingGroupList: recalculations if the bit count is not needed. """ - def __init__(self, group_list=None, bits_except_last=0.0, max_value=None): - """Set the internal state without any calculations. - - The group list argument is copied deeply, so it is not a problem - if the value object is mutated afterwards. + max_value: float + """Maximal sample value to base bits computation on.""" + unit: float = 1.0 + """Typical resolution of the values.""" + group_list: typing.List[BitCountingGroup] = None + """List of groups to compose this group list. + Init also accepts None standing for an empty list. 
+ This class takes ownership of the list, + so caller of init should clone their copy to avoid unexpected mutations. + """ + bits_except_last: float = 0.0 + """Partial sum of all but one group bits.""" - A "group" stands for an Iterable of runs, where "run" is either - a float value, or a stats-like object (only size, avg and stdev - are accessed). Run is a hypothetical abstract class, - defining it in Python 2 is too much hassle. + def __post_init__(self): + """Turn possible None into an empty list. It is not verified whether the user provided values are valid, - e.g. whether the cached bits values make sense. - - The max_value is required and immutable, - it is recommended the callers find their maximum beforehand. - - :param group_list: List of groups to compose this group list (or empty). - :param bits_except_last: Partial sum of all but one group bits. - :param max_value: Maximal sample value to base bits computation on. - :type group_list: Iterable[BitCountingGroup] - :type bits_except_last: float - :type max_value: float - """ - self.group_list = copy.deepcopy(group_list) if group_list else list() - self.bits_except_last = bits_except_last - self.max_value = max_value - - def __str__(self): - """Return string with human readable description of the group list. - - :returns: Readable description. - :rtype: str + e.g. whether the cached bits values (and bits_except_last) make sense. """ - return u"group_list={self.group_list} bits={self.bits}" - - def __repr__(self): - """Return string executable as Python constructor call. + if self.group_list is None: + self.group_list = [] - :returns: Executable constructor call. - :rtype: str - """ - return ( - f"BitCountingGroupList(group_list={self.group_list!r}" - f",bits_except_last={self.bits_except_last!r}" - f",max_value={self.max_value!r})" - ) - - def __getitem__(self, index): + def __getitem__(self, index: int) -> BitCountingGroup: """Return the group at the index. :param index: Index of the group to return. 
@@ -99,7 +76,7 @@ class BitCountingGroupList: """ return self.group_list[index] - def __len__(self): + def __len__(self) -> int: """Return the length of the group list. :returns: The Length of group_list. @@ -107,19 +84,46 @@ class BitCountingGroupList: """ return len(self.group_list) - def copy(self): + def copy(self) -> "BitCountingGroupList": """Return a new instance with copied internal state. :returns: The copied instance. :rtype: BitCountingGroupList """ return self.__class__( - group_list=self.group_list, bits_except_last=self.bits_except_last, - max_value=self.max_value + max_value=self.max_value, + unit=self.unit, + group_list=[group.copy() for group in self.group_list], + bits_except_last=self.bits_except_last, + ) + + def copy_fast(self) -> "BitCountingGroupList": + """Return a new instance with minimally copied internal state. + + The assumption here is that only the last group will ever be mutated + (in self, probably never in the return value), + so all the previous groups can be "copied by reference". + + :returns: The copied instance. + :rtype: BitCountingGroupList + """ + group_list = list(self.group_list) + if group_list: + group_list[-1] = group_list[-1].copy() + # Further speedup is possible by keeping the last group + # as a singly linked (from end) list, + # but for CSIT sample sizes, copy of whole Python list is faster. + # TODO: Implement linked list as an option + # for users with many samples. + return self.__class__( + max_value=self.max_value, + unit=self.unit, + group_list=group_list, + bits_except_last=self.bits_except_last, ) @property - def bits(self): + def bits(self) -> float: """Return overall bit content of the group list. :returns: The overall information content in bits. @@ -130,12 +134,17 @@ class BitCountingGroupList: # TODO: Is it worth to cache the overall result? 
return self.bits_except_last + self.group_list[-1].bits - def append_group_of_runs(self, runs): + def append_group_of_runs( + self, + runs: typing.Union[ + BitCountingGroup, typing.List[typing.Union[float, AvgStdevStats]] + ], + ) -> "BitCountingGroupList": """Mutate to add a new group based on the runs, return self. - The argument is copied before adding to the group list, - so further edits do not affect the grup list. - The argument can also be a group, only runs from it are used. + The list argument is NOT copied before adding to the group list, + so further edits MAY affect the group list. + The list from BitCountingGroup is shallow copied though. :param runs: Runs to form the next group to be appended to self. :type runs: Union[Iterable[Run], BitCountingGroup] @@ -147,16 +156,23 @@ class BitCountingGroupList: # It is faster to avoid stats recalculation. new_group = runs.copy() new_group.max_value = self.max_value + # Unit is common. new_group.prev_avg = prev_avg new_group.cached_bits = None else: new_group = BitCountingGroup( - run_list=runs, max_value=self.max_value, prev_avg=prev_avg) + run_list=runs, + max_value=self.max_value, + unit=self.unit, + prev_avg=prev_avg, + ) self.bits_except_last = self.bits self.group_list.append(new_group) return self - def append_run_to_to_last_group(self, run): + def append_run_to_to_last_group( + self, run: typing.Union[float, AvgStdevStats] + ) -> "BitCountingGroupList": """Mutate to add new run at the end of the last group. Basically a one-liner, only returning group list instead of last group. @@ -170,7 +186,9 @@ class BitCountingGroupList: self.group_list[-1].append(run) return self - def extend_runs_to_last_group(self, runs): + def extend_runs_to_last_group( + self, runs: typing.Iterable[typing.Union[float, AvgStdevStats]] + ) -> "BitCountingGroupList": """Mutate to add new runs to the end of the last group. A faster alternative to appending runs one by one in a loop. 
diff --git a/resources/libraries/python/jumpavg/BitCountingStats.py b/resources/libraries/python/jumpavg/bit_counting_stats.py index 7b5e659214..3d1cb8aef0 100644 --- a/resources/libraries/python/jumpavg/BitCountingStats.py +++ b/resources/libraries/python/jumpavg/bit_counting_stats.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: @@ -13,11 +13,14 @@ """Module holding BitCountingStats class.""" +import dataclasses import math +import typing -from .AvgStdevStats import AvgStdevStats +from .avg_stdev_stats import AvgStdevStats +@dataclasses.dataclass class BitCountingStats(AvgStdevStats): """Class for statistics which include information content of a group. @@ -33,11 +36,22 @@ class BitCountingStats(AvgStdevStats): Only for_runs method calls the parent implementation, without using super(). """ - def __init__( - self, size=0, avg=None, stdev=0.0, max_value=None, prev_avg=None): - """Construct the stats object by computing from the values needed. + max_value: float = None + """Maximal sample value (real or estimated). + Default value is there just for argument ordering reasons, + leaving None leads to exceptions.""" + unit: float = 1.0 + """Typical resolution of the values.""" + prev_avg: typing.Optional[float] = None + """Population average of the previous group (if any).""" + bits: float = None + """The computed information content of the group. + It is formally an argument to init function, just to keep repr string + a valid call. But the init value is ignored and always recomputed. + """ - The values are not sanitized, faulty callers can cause math errors. + def __post_init__(self): + """Construct the stats object by computing from the values needed. 
The None values are allowed for stats for zero size data, but such stats can report arbitrary avg and max_value. @@ -54,91 +68,60 @@ class BitCountingStats(AvgStdevStats): (but not with floating point mechanic). The hope is the difference will have no real impact on the classification procedure. - - :param size: Number of values participating in this group. - :param avg: Population average of the participating sample values. - :param stdev: Population standard deviation of the sample values. - :param max_value: Maximal expected value. - TODO: This might be more optimal, - but max-invariant algorithm will be nicer. - :param prev_avg: Population average of the previous group. - If None, no previous average is taken into account. - If not None, the given previous average is used to discourage - consecutive groups with similar averages - (opposite triangle distribution is assumed). - :type avg: float - :type size: int - :type stdev: float - :type max_value: Union[float, NoneType] - :type prev_avg: Union[float, NoneType] """ - self.avg = avg - self.size = size - self.stdev = stdev - self.max_value = max_value - self.prev_avg = prev_avg # Zero size should in principle have non-zero bits (coding zero size), # but zero allows users to add empty groups without affecting bits. self.bits = 0.0 if self.size < 1: return - if avg is None: - raise ValueError(f"Avg is None: {self!r}") - if max_value is None or max_value <= 0.0: + if self.max_value <= 0.0: raise ValueError(f"Invalid max value: {self!r}") + max_value = self.max_value / self.unit + avg = self.avg / self.unit # Length of the sequence must be also counted in bits, # otherwise the message would not be decodable. # Model: probability of k samples is 1/k - 1/(k+1) == 1/k/(k+1) # This is compatible with zero size leading to zero bits. 
- self.bits += math.log(size * (size + 1), 2) - if prev_avg is None: + self.bits += math.log(self.size * (self.size + 1), 2) + if self.prev_avg is None: # Avg is considered to be uniformly distributed # from zero to max_value. - self.bits += math.log(max_value + 1.0, 2) + self.bits += math.log(max_value + 1, 2) else: # Opposite triangle distribution with minimum. - self.bits += math.log( - max_value * (max_value + 1) / (abs(avg - prev_avg) + 1), 2) + prev_avg = self.prev_avg / self.unit + norm = prev_avg * prev_avg + norm -= (prev_avg - 1) * max_value + norm += max_value * max_value / 2 + self.bits -= math.log((abs(avg - prev_avg) + 1) / norm, 2) if self.size < 2: return - # Stdev is considered to be uniformly distributed - # from zero to max_value. That is quite a bad expectation, - # but resilient to negative samples etc. - self.bits += math.log(max_value + 1.0, 2) + stdev = self.stdev / self.unit + # Stdev can be anything between zero and max value. + # For size==2, sphere surface is 2 points regardless of radius, + # we need to penalize large stdev already when encoding the stdev. + # The simplest way is to use the same distribution as with size... + self.bits += math.log((stdev + 1) * (stdev + 2), 2) + # .. just with added normalization from the max value cut-off. + self.bits += math.log(1 - 1 / (max_value + 2), 2) # Now we know the samples lie on sphere in size-1 dimensions. # So it is (size-2)-sphere, with radius^2 == stdev^2 * size. 
# https://en.wikipedia.org/wiki/N-sphere - sphere_area_ln = math.log(2) + math.log(math.pi) * ((size - 1) / 2.0) - sphere_area_ln -= math.lgamma((size - 1) / 2.0) - sphere_area_ln += math.log(stdev + 1.0) * (size - 2) - sphere_area_ln += math.log(size) * ((size - 2) / 2.0) + sphere_area_ln = math.log(2) + sphere_area_ln += math.log(math.pi) * ((self.size - 1) / 2) + sphere_area_ln -= math.lgamma((self.size - 1) / 2) + sphere_area_ln += math.log(stdev + 1) * (self.size - 2) + sphere_area_ln += math.log(self.size) * ((self.size - 2) / 2) self.bits += sphere_area_ln / math.log(2) - def __str__(self): - """Return string with human readable description of the group. - - :returns: Readable description. - :rtype: str - """ - return ( - f"size={self.size} avg={self.avg} stdev={self.stdev}" - f" bits={self.bits}" - ) - - def __repr__(self): - """Return string executable as Python constructor call. - - :returns: Executable constructor call. - :rtype: str - """ - return ( - f"BitCountingStats(size={self.size!r},avg={self.avg!r}" - f",stdev={self.stdev!r},max_value={self.max_value!r}" - f",prev_avg={self.prev_avg!r})" - ) - @classmethod - def for_runs(cls, runs, max_value=None, prev_avg=None): + def for_runs_and_params( + cls, + runs: typing.Iterable[typing.Union[float, AvgStdevStats]], + max_value: float, + unit: float = 1.0, + prev_avg: typing.Optional[float] = None, + ): """Return new stats instance describing the sequence of runs. If you want to append data to existing stats object, @@ -156,14 +139,22 @@ class BitCountingStats(AvgStdevStats): :param runs: Sequence of data to describe by the new metadata. :param max_value: Maximal expected value. + :param unit: Typical resolution of the values. :param prev_avg: Population average of the previous group, if any. :type runs: Iterable[Union[float, AvgStdevStats]] :type max_value: Union[float, NoneType] + :type unit: float :type prev_avg: Union[float, NoneType] :returns: The new stats instance. 
:rtype: cls """ asd = AvgStdevStats.for_runs(runs) - ret_obj = cls(size=asd.size, avg=asd.avg, stdev=asd.stdev, - max_value=max_value, prev_avg=prev_avg) + ret_obj = cls( + size=asd.size, + avg=asd.avg, + stdev=asd.stdev, + max_value=max_value, + unit=unit, + prev_avg=prev_avg, + ) return ret_obj diff --git a/resources/libraries/python/jumpavg/classify.py b/resources/libraries/python/jumpavg/classify.py index 252c71e8d5..cc3cdcceed 100644 --- a/resources/libraries/python/jumpavg/classify.py +++ b/resources/libraries/python/jumpavg/classify.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: @@ -13,32 +13,54 @@ """Module holding the classify function -Classification os one of primary purposes of this package. +Classification is one of primary purposes of this package. Minimal message length principle is used for grouping results into the list of groups, assuming each group is a population of different Gaussian distribution. """ -from .AvgStdevStats import AvgStdevStats -from .BitCountingGroupList import BitCountingGroupList +from typing import Iterable, Optional, Union +from .avg_stdev_stats import AvgStdevStats +from .bit_counting_group_list import BitCountingGroupList -def classify(values): + +def classify( + values: Iterable[Union[float, Iterable[float]]], + unit: Optional[float] = None, + sbps: Optional[float] = None, +) -> BitCountingGroupList: """Return the values in groups of optimal bit count. Here, a value is either a float, or an iterable of floats. Such iterables represent an undivisible sequence of floats. + Int is accepted anywhere instead of float. Internally, such sequence is replaced by AvgStdevStats after maximal value is found. 
+ If the values are smaller than expected (below one unit), + the underlying assumptions break down and the classification is wrong. + Use the "unit" parameter to hint at what the input resolution is. + + If the correct value of unit is not known beforehand, + the argument "sbps" (Significant Bits Per Sample) can be used + to set unit such that maximal sample value is this many ones in binary. + If neither "unit" nor "sbps" are given, "sbps" of 12 is used by default. + :param values: Sequence of runs to classify. + :param unit: Typical resolution of the values. + Zero and None means no unit given. + :param sbps: Significant Bits Per Sample. None or zero means 12. + If unit is not set, this is used to compute unit from max sample value. :type values: Iterable[Union[float, Iterable[float]]] + :type unit: Optional[float] + :type sbps: Optional[float] :returns: Classified group list. :rtype: BitCountingGroupList """ - processed_values = list() + processed_values = [] max_value = 0.0 for value in values: if isinstance(value, (float, int)): @@ -50,27 +72,27 @@ def classify(values): if subvalue > max_value: max_value = subvalue processed_values.append(AvgStdevStats.for_runs(value)) - open_at = list() - closed_before = [BitCountingGroupList(max_value=max_value)] - for index, value in enumerate(processed_values): - newly_open = closed_before[index].copy() - newly_open.append_group_of_runs([value]) - open_at.append(newly_open) - record_group_list = newly_open - for previous_index, old_open in enumerate(open_at[:index]): - new_open = old_open.copy().append_run_to_to_last_group(value) - open_at[previous_index] = new_open - if new_open.bits < record_group_list.bits: - record_group_list = new_open - closed_before.append(record_group_list) - partition = closed_before[-1] - previous_average = partition[0].stats.avg - for group in partition: - if group.stats.avg == previous_average: - group.comment = u"normal" - elif group.stats.avg < previous_average: - group.comment = 
u"regression" + if not unit: + if not sbps: + sbps = 12.0 + max_in_units = pow(2.0, sbps + 1.0) - 1.0 + unit = max_value / max_in_units + # Glist means group list (BitCountingGroupList). + open_glists = [] + record_glist = BitCountingGroupList(max_value=max_value, unit=unit) + for value in processed_values: + new_open_glist = record_glist.copy_fast().append_group_of_runs([value]) + record_glist = new_open_glist + for old_open_glist in open_glists: + old_open_glist.append_run_to_to_last_group(value) + if old_open_glist.bits < record_glist.bits: + record_glist = old_open_glist + open_glists.append(new_open_glist) + previous_average = record_glist[0].stats.avg + for group in record_glist: + if group.stats.avg < previous_average: + group.comment = "regression" elif group.stats.avg > previous_average: - group.comment = u"progression" + group.comment = "progression" previous_average = group.stats.avg - return partition + return record_glist |