From 079c390e0903a98182781ff5c2af2bba9902b4ed Mon Sep 17 00:00:00 2001 From: Vratko Polak Date: Fri, 2 Jun 2023 14:44:47 +0200 Subject: feat(jumpavg): support small values via unit param Previously, Jumpavg was known to give wrong results when the data contains values of order one or smaller. This change introduces a new "unit" parameter, which changes how the information content is calculated. For example if the data values are mutiplies of 0.01, the unit parameter should be set to 0.01 to compensate. For callers not knowing their correct unit value, another parameter is introduced, called "sbps" (meaning Significant Bits Per Sample). A binary integer number with this many ones is how much units should the maximal sample be. This way jumpavg computes the corresponding "unit" value to use. If neither "unit" nor "sbps" are given, the "sbps" value of 12 is applied. + Rename files to conform to snake_style naming. + Fix normalization for the "opposite triangle" distribution. + Simplify logic, all groups now start as "normal", not "unknown". + Minor style improvements as suggested by pylint. + From user perspective, this change should be backward compatible. - The normalization fix is a behavior change, but it is a bugfix and the new behavior should be better. Change-Id: I5a5ca11757f087fff13faf1d0b8e34a741400258 Signed-off-by: Vratko Polak --- .../libraries/python/jumpavg/AvgStdevStats.py | 89 --------- .../libraries/python/jumpavg/BitCountingGroup.py | 157 ---------------- .../python/jumpavg/BitCountingGroupList.py | 195 -------------------- .../libraries/python/jumpavg/BitCountingStats.py | 149 --------------- resources/libraries/python/jumpavg/__init__.py | 10 +- .../libraries/python/jumpavg/avg_stdev_stats.py | 89 +++++++++ .../libraries/python/jumpavg/bit_counting_group.py | 163 +++++++++++++++++ .../python/jumpavg/bit_counting_group_list.py | 203 +++++++++++++++++++++ .../libraries/python/jumpavg/bit_counting_stats.py | 157 ++++++++++++++++ resources/libraries/python/jumpavg/classify.py | 44 +++-- 10 files changed, 649 insertions(+), 607 deletions(-) delete mode 100644 resources/libraries/python/jumpavg/AvgStdevStats.py delete mode 100644 resources/libraries/python/jumpavg/BitCountingGroup.py delete mode 100644 resources/libraries/python/jumpavg/BitCountingGroupList.py delete mode 100644 resources/libraries/python/jumpavg/BitCountingStats.py create mode 100644 resources/libraries/python/jumpavg/avg_stdev_stats.py create mode 100644 resources/libraries/python/jumpavg/bit_counting_group.py create mode 100644 resources/libraries/python/jumpavg/bit_counting_group_list.py create mode 100644 resources/libraries/python/jumpavg/bit_counting_stats.py (limited to 'resources/libraries/python/jumpavg') diff --git a/resources/libraries/python/jumpavg/AvgStdevStats.py b/resources/libraries/python/jumpavg/AvgStdevStats.py deleted file mode 100644 index d40b316bf1..0000000000 --- a/resources/libraries/python/jumpavg/AvgStdevStats.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Module holding AvgStdevStats class.""" - -import dataclasses -import math -import typing - - -@dataclasses.dataclass -class AvgStdevStats: - """Class for statistics which include average and stdev of a group. - - Contrary to other stats types, adding values to the group - is computationally light without any caching. - - Instances are only statistics, the data itself is stored elsewhere. - """ - - size: int = 0 - """Number of scalar values (samples) participating in this group.""" - avg: float = 0.0 - """Population average of the participating sample values.""" - stdev: float = 0.0 - """Population standard deviation of the sample values.""" - - @classmethod - def for_runs( - cls, - runs: typing.Iterable[typing.Union[float, "AvgStdevStats"]], - ) -> "AvgStdevStats": - """Return new stats instance describing the sequence of runs. - - If you want to append data to existing stats object, - you can simply use the stats object as the first run. - - Instead of a verb, "for" is used to start this method name, - to signify the result contains less information than the input data. - - Here, run is a hypothetical abstract class, an union of float and cls. - Defining that as a real abstract class in Python is too much hassle. - - :param runs: Sequence of data to describe by the new metadata. - :type runs: Iterable[Union[float, cls]] - :returns: The new stats instance. - :rtype: cls - """ - # Using Welford method to be more resistant to rounding errors. - # Adapted from code for sample standard deviation at: - # https://www.johndcook.com/blog/standard_deviation/ - # The logic of plus operator is taken from - # https://www.johndcook.com/blog/skewness_kurtosis/ - total_size = 0 - total_avg = 0.0 - moment_2 = 0.0 - for run in runs: - if isinstance(run, (float, int)): - run_size = 1 - run_avg = run - run_stdev = 0.0 - else: - run_size = run.size - run_avg = run.avg - run_stdev = run.stdev - old_total_size = total_size - delta = run_avg - total_avg - total_size += run_size - total_avg += delta * run_size / total_size - moment_2 += run_stdev * run_stdev * run_size - moment_2 += delta * delta * old_total_size * run_size / total_size - if total_size < 1: - # Avoid division by zero. - return cls(size=0) - # TODO: Is it worth tracking moment_2 instead, and compute and cache - # stdev on demand, just to possibly save some sqrt calls? - total_stdev = math.sqrt(moment_2 / total_size) - ret_obj = cls(size=total_size, avg=total_avg, stdev=total_stdev) - return ret_obj diff --git a/resources/libraries/python/jumpavg/BitCountingGroup.py b/resources/libraries/python/jumpavg/BitCountingGroup.py deleted file mode 100644 index 48bea086f4..0000000000 --- a/resources/libraries/python/jumpavg/BitCountingGroup.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Module holding BitCountingGroup class.""" - -import collections -import dataclasses -import typing - -from .AvgStdevStats import AvgStdevStats -from .BitCountingStats import BitCountingStats - - -@dataclasses.dataclass -class BitCountingGroup(collections.abc.Sequence): - """Group of runs which tracks bit count in an efficient manner. - - This class contains methods that mutate the internal state, - use copy() method to save the previous state. - - The Sequence-like access is related to the list of runs, - for example group[0] returns the first run in the list. - Writable list-like methods are not implemented. - - As the group bit count depends on previous average - and overall maximal value, those values are assumed - to be known beforehand (and immutable). - - As the caller is allowed to divide runs into groups in any way, - a method to add a single run in an efficient manner is provided. - """ - - run_list: typing.List[typing.Union[float, AvgStdevStats]] - """List of run to compose into this group. - The init call takes ownership of the list, - so the caller should clone it to avoid unexpected muations.""" - max_value: float - """Maximal sample value to expect.""" - comment: str = "unknown" - """Any string giving more info, e.g. "regression".""" - prev_avg: typing.Optional[float] = None - """Average of the previous group, if any.""" - stats: AvgStdevStats = None - """Stats object used for computing bits. - Almost always recomputed, except when non-None in init.""" - cached_bits: typing.Optional[float] = None - """Cached value of information content. - Noned on edit, recomputed if needed and None.""" - - def __post_init__(self): - """Recompute stats is None. - - It is not verified whether the user provided values are valid, - e.g. whether the stats and bits values reflect the runs. - """ - if self.stats is None: - self.stats = AvgStdevStats.for_runs(self.run_list) - - @property - def bits(self) -> float: - """Return overall bit content of the group list. - - If not cached, compute from stats and cache. - - :returns: The overall information content in bits. - :rtype: float - """ - if self.cached_bits is None: - self.cached_bits = BitCountingStats.for_runs( - [self.stats], self.max_value, self.prev_avg - ).bits - return self.cached_bits - - def __getitem__(self, index: int) -> typing.Union[float, AvgStdevStats]: - """Return the run at the index. - - :param index: Index of the run to return. - :type index: int - :returns: The run at the index. - :rtype: typing.Union[float, AvgStdevStats] - """ - return self.run_list[index] - - def __len__(self) -> int: - """Return the number of runs in the group. - - :returns: The Length of run_list. - :rtype: int - """ - return len(self.run_list) - - def copy(self) -> "BitCountingGroup": - """Return a new instance with copied internal state. - - Stats are preserved to avoid re-computation. - As both float and AvgStdevStats are effectively immutable, - only a shallow copy of the runs list is performed. - - :returns: The copied instance. - :rtype: BitCountingGroup - """ - stats = AvgStdevStats.for_runs([self.stats]) - return self.__class__( - run_list=list(self.run_list), - stats=stats, - cached_bits=self.cached_bits, - max_value=self.max_value, - prev_avg=self.prev_avg, - comment=self.comment, - ) - - def append( - self, run: typing.Union[float, AvgStdevStats] - ) -> "BitCountingGroup": - """Mutate to add the new run, return self. - - Stats are updated, but old bits value is deleted from cache. - - :param run: The run value to add to the group. - :type value: typing.Union[float, AvgStdevStats] - :returns: The updated self. - :rtype: BitCountingGroup - """ - self.run_list.append(run) - self.stats = AvgStdevStats.for_runs([self.stats, run]) - self.cached_bits = None - return self - - def extend( - self, runs: typing.Iterable[typing.Union[float, AvgStdevStats]] - ) -> "BitCountingGroup": - """Mutate to add the new runs, return self. - - This is saves small amount of computation - compared to adding runs one by one in a loop. - - Stats are updated, but old bits value is deleted from cache. - - :param runs: The runs to add to the group. - :type value: typing.Iterable[typing.Union[float, AvgStdevStats]] - :returns: The updated self. - :rtype: BitCountingGroup - """ - self.run_list.extend(runs) - self.stats = AvgStdevStats.for_runs([self.stats] + runs) - self.cached_bits = None - return self diff --git a/resources/libraries/python/jumpavg/BitCountingGroupList.py b/resources/libraries/python/jumpavg/BitCountingGroupList.py deleted file mode 100644 index 468e79b236..0000000000 --- a/resources/libraries/python/jumpavg/BitCountingGroupList.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Module holding BitCountingGroupList class.""" - -import collections -import dataclasses -import typing - -from .AvgStdevStats import AvgStdevStats # Just for type hints. -from .BitCountingGroup import BitCountingGroup - - -@dataclasses.dataclass -class BitCountingGroupList(collections.abc.Sequence): - """List of data groups which tracks overall bit count. - - The Sequence-like access is related to the list of groups, - for example group_list[0] returns the first group in the list. - Writable list-like methods are not implemented. - - The overall bit count is the sum of bit counts of each group. - Group is a sequence of data samples accompanied by their stats. - Different partitioning of data samples into the groups - results in different overall bit count. - This can be used to group samples in various contexts. - - As the group bit count depends on previous average - and overall maximal value, order of groups is important. - Having the logic encapsulated here spares the caller - the effort to pass averages around. - - The data can be only added, and there is some logic to skip - recalculations if the bit count is not needed. - """ - - max_value: float - """Maximal sample value to base bits computation on.""" - group_list: typing.List[BitCountingGroup] = None - """List of groups to compose this group list. - Init also accepts None standing for an empty list. - This class takes ownership of the list, - so caller of init should clone their copy to avoid unexpected mutations. - """ - bits_except_last: float = 0.0 - """Partial sum of all but one group bits.""" - - def __post_init__(self): - """Turn possible None into an empty list. - - It is not verified whether the user provided values are valid, - e.g. whether the cached bits values (and bits_except_last) make sense. - """ - if self.group_list is None: - self.group_list = list() - - def __getitem__(self, index: int) -> BitCountingGroup: - """Return the group at the index. - - :param index: Index of the group to return. - :type index: int - :returns: The group at the index. - :rtype: BitCountingGroup - """ - return self.group_list[index] - - def __len__(self) -> int: - """Return the length of the group list. - - :returns: The Length of group_list. - :rtype: int - """ - return len(self.group_list) - - def copy(self) -> "BitCountingGroupList": - """Return a new instance with copied internal state. - - :returns: The copied instance. - :rtype: BitCountingGroupList - """ - return self.__class__( - max_value=self.max_value, - group_list=[group.copy() for group in self.group_list], - bits_except_last=self.bits_except_last, - ) - - def copy_fast(self) -> "BitCountingGroupList": - """Return a new instance with minimaly copied internal state. - - The assumption here is that only the last group will ever be mutated - (in self, probably never in the return value), - so all the previous groups can be "copied by reference". - - :returns: The copied instance. - :rtype: BitCountingGroupList - """ - group_list = list(self.group_list) - if group_list: - group_list[-1] = group_list[-1].copy() - # Further speedup is possible by keeping the last group - # as a singly linked (from end) list, - # but for CSIT sample sizes, copy of whole Python list is faster. - # TODO: Implement linked list as an option - # for users with many samples. - return self.__class__( - max_value=self.max_value, - group_list=group_list, - bits_except_last=self.bits_except_last, - ) - - @property - def bits(self) -> float: - """Return overall bit content of the group list. - - :returns: The overall information content in bits. - :rtype: float - """ - if not self.group_list: - return 0.0 - # TODO: Is it worth to cache the overall result? - return self.bits_except_last + self.group_list[-1].bits - - def append_group_of_runs( - self, - runs: typing.Union[ - BitCountingGroup, typing.List[typing.Union[float, AvgStdevStats]] - ], - ) -> "BitCountingGroupList": - """Mutate to add a new group based on the runs, return self. - - The list argument is NOT copied before adding to the group list, - so further edits MAY not affect the grup list. - The list from BitCountingGroup is shallow copied though. - - :param runs: Runs to form the next group to be appended to self. - :type runs: Union[Iterable[Run], BitCountingGroup] - :returns: The updated self. - :rtype: BitCountingGroupList - """ - prev_avg = self.group_list[-1].stats.avg if self.group_list else None - if isinstance(runs, BitCountingGroup): - # It is faster to avoid stats recalculation. - new_group = runs.copy() - new_group.max_value = self.max_value - new_group.prev_avg = prev_avg - new_group.cached_bits = None - else: - new_group = BitCountingGroup( - run_list=runs, max_value=self.max_value, prev_avg=prev_avg - ) - self.bits_except_last = self.bits - self.group_list.append(new_group) - return self - - def append_run_to_to_last_group( - self, run: typing.Union[float, AvgStdevStats] - ) -> "BitCountingGroupList": - """Mutate to add new run at the end of the last group. - - Basically a one-liner, only returning group list instead of last group. - - :param run: The run value to add to the last group. - :type run: Run - :returns: The updated self. - :rtype: BitCountingGroupList - :raises IndexError: If group list is empty, no last group to add to. - """ - self.group_list[-1].append(run) - return self - - def extend_runs_to_last_group( - self, runs: typing.Iterable[typing.Union[float, AvgStdevStats]] - ) -> "BitCountingGroupList": - """Mutate to add new runs to the end of the last group. - - A faster alternative to appending runs one by one in a loop. - - :param runs: The runs to add to the last group. - :type runs: Iterable[Run] - :returns: The updated self - :rtype: BitCountingGroupList - :raises IndexError: If group list is empty, no last group to add to. - """ - self.group_list[-1].extend(runs) - return self diff --git a/resources/libraries/python/jumpavg/BitCountingStats.py b/resources/libraries/python/jumpavg/BitCountingStats.py deleted file mode 100644 index 524ac952c8..0000000000 --- a/resources/libraries/python/jumpavg/BitCountingStats.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Module holding BitCountingStats class.""" - -import dataclasses -import math -import typing - -from .AvgStdevStats import AvgStdevStats - - -@dataclasses.dataclass -class BitCountingStats(AvgStdevStats): - """Class for statistics which include information content of a group. - - The information content is based on an assumption that the data - consists of independent random values from a normal distribution. - - Instances are only statistics, the data itself is stored elsewhere. - - The coding needs to know the previous average, and a maximal value - so both values are required as inputs. - - This is a subclass of AvgStdevStats, even though all methods are overriden. - Only for_runs method calls the parent implementation, without using super(). - """ - - max_value: float = None - """Maximal sample value (real or estimated). - Default value is there just for argument ordering reasons, - leaving None leads to exceptions.""" - prev_avg: typing.Optional[float] = None - """Population average of the previous group (if any).""" - bits: float = None - """The computed information content of the group. - It is formally an argument to init function, just to keep repr string - a valid call. ut the init value is ignored and always recomputed. - """ - - def __post_init__(self): - """Construct the stats object by computing from the values needed. - - The None values are allowed for stats for zero size data, - but such stats can report arbitrary avg and max_value. - Stats for nonzero size data cannot contain None, - else ValueError is raised. - - The max_value needs to be numeric for nonzero size, - but its relations to avg and prev_avg are not examined. - - The bit count is not real, as that would depend on numeric precision - (number of significant bits in values). - The difference is assumed to be constant per value, - which is consistent with Gauss distribution - (but not with floating point mechanic). - The hope is the difference will have - no real impact on the classification procedure. - """ - # Zero size should in principle have non-zero bits (coding zero size), - # but zero allows users to add empty groups without affecting bits. - self.bits = 0.0 - if self.size < 1: - return - if self.max_value <= 0.0: - raise ValueError(f"Invalid max value: {self!r}") - # Length of the sequence must be also counted in bits, - # otherwise the message would not be decodable. - # Model: probability of k samples is 1/k - 1/(k+1) == 1/k/(k+1) - # This is compatible with zero size leading to zero bits. - self.bits += math.log(self.size * (self.size + 1), 2) - if self.prev_avg is None: - # Avg is considered to be uniformly distributed - # from zero to max_value. - self.bits += math.log(self.max_value + 1.0, 2) - else: - # Opposite triangle distribution with minimum. - self.bits += math.log( - (self.max_value * (self.max_value + 1)) - / (abs(self.avg - self.prev_avg) + 1), - 2, - ) - if self.size < 2: - return - # Stdev is considered to be uniformly distributed - # from zero to max_value. That is quite a bad expectation, - # but resilient to negative samples etc. - self.bits += math.log(self.max_value + 1.0, 2) - # Now we know the samples lie on sphere in size-1 dimensions. - # So it is (size-2)-sphere, with radius^2 == stdev^2 * size. - # https://en.wikipedia.org/wiki/N-sphere - sphere_area_ln = math.log(2) - sphere_area_ln += math.log(math.pi) * ((self.size - 1) / 2.0) - sphere_area_ln -= math.lgamma((self.size - 1) / 2.0) - sphere_area_ln += math.log(self.stdev + 1.0) * (self.size - 2) - sphere_area_ln += math.log(self.size) * ((self.size - 2) / 2.0) - self.bits += sphere_area_ln / math.log(2) - - # TODO: Rename, so pylint stops complaining about signature change. - @classmethod - def for_runs( - cls, - runs: typing.Iterable[typing.Union[float, AvgStdevStats]], - max_value: float, - prev_avg: typing.Optional[float] = None, - ): - """Return new stats instance describing the sequence of runs. - - If you want to append data to existing stats object, - you can simply use the stats object as the first run. - - Instead of a verb, "for" is used to start this method name, - to signify the result contains less information than the input data. - - The two optional values can come from outside of the runs provided. - - The max_value cannot be None for non-zero size data. - The implementation does not check if no datapoint exceeds max_value. - - TODO: Document the behavior for zero size result. - - :param runs: Sequence of data to describe by the new metadata. - :param max_value: Maximal expected value. - :param prev_avg: Population average of the previous group, if any. - :type runs: Iterable[Union[float, AvgStdevStats]] - :type max_value: Union[float, NoneType] - :type prev_avg: Union[float, NoneType] - :returns: The new stats instance. - :rtype: cls - """ - asd = AvgStdevStats.for_runs(runs) - ret_obj = cls( - size=asd.size, - avg=asd.avg, - stdev=asd.stdev, - max_value=max_value, - prev_avg=prev_avg, - ) - return ret_obj diff --git a/resources/libraries/python/jumpavg/__init__.py b/resources/libraries/python/jumpavg/__init__.py index 4fa696c538..7f63b5ee39 100644 --- a/resources/libraries/python/jumpavg/__init__.py +++ b/resources/libraries/python/jumpavg/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: @@ -15,8 +15,8 @@ __init__ file for "jumpavg" Python package. """ -from .AvgStdevStats import AvgStdevStats -from .BitCountingStats import BitCountingStats -from .BitCountingGroup import BitCountingGroup -from .BitCountingGroupList import BitCountingGroupList +from .avg_stdev_stats import AvgStdevStats +from .bit_counting_stats import BitCountingStats +from .bit_counting_group import BitCountingGroup +from .bit_counting_group_list import BitCountingGroupList from .classify import classify diff --git a/resources/libraries/python/jumpavg/avg_stdev_stats.py b/resources/libraries/python/jumpavg/avg_stdev_stats.py new file mode 100644 index 0000000000..3d6a834919 --- /dev/null +++ b/resources/libraries/python/jumpavg/avg_stdev_stats.py @@ -0,0 +1,89 @@ +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module holding AvgStdevStats class.""" + +import dataclasses +import math +import typing + + +@dataclasses.dataclass +class AvgStdevStats: + """Class for statistics which include average and stdev of a group. + + Contrary to other stats types, adding values to the group + is computationally light without any caching. + + Instances are only statistics, the data itself is stored elsewhere. + """ + + size: int = 0 + """Number of scalar values (samples) participating in this group.""" + avg: float = 0.0 + """Population average of the participating sample values.""" + stdev: float = 0.0 + """Population standard deviation of the sample values.""" + + @classmethod + def for_runs( + cls, + runs: typing.Iterable[typing.Union[float, "AvgStdevStats"]], + ) -> "AvgStdevStats": + """Return new stats instance describing the sequence of runs. + + If you want to append data to existing stats object, + you can simply use the stats object as the first run. + + Instead of a verb, "for" is used to start this method name, + to signify the result contains less information than the input data. + + Here, run is a hypothetical abstract class, an union of float and cls. + Defining that as a real abstract class in Python is too much hassle. + + :param runs: Sequence of data to describe by the new metadata. + :type runs: Iterable[Union[float, cls]] + :returns: The new stats instance. + :rtype: cls + """ + # Using Welford method to be more resistant to rounding errors. + # Adapted from code for sample standard deviation at: + # https://www.johndcook.com/blog/standard_deviation/ + # The logic of plus operator is taken from + # https://www.johndcook.com/blog/skewness_kurtosis/ + total_size = 0 + total_avg = 0.0 + moment_2 = 0.0 + for run in runs: + if isinstance(run, (float, int)): + run_size = 1 + run_avg = run + run_stdev = 0.0 + else: + run_size = run.size + run_avg = run.avg + run_stdev = run.stdev + old_total_size = total_size + delta = run_avg - total_avg + total_size += run_size + total_avg += delta * run_size / total_size + moment_2 += run_stdev * run_stdev * run_size + moment_2 += delta * delta * old_total_size * run_size / total_size + if total_size < 1: + # Avoid division by zero. + return cls(size=0) + # TODO: Is it worth tracking moment_2 instead, and compute and cache + # stdev on demand, just to possibly save some sqrt calls? + total_stdev = math.sqrt(moment_2 / total_size) + ret_obj = cls(size=total_size, avg=total_avg, stdev=total_stdev) + return ret_obj diff --git a/resources/libraries/python/jumpavg/bit_counting_group.py b/resources/libraries/python/jumpavg/bit_counting_group.py new file mode 100644 index 0000000000..22c9337532 --- /dev/null +++ b/resources/libraries/python/jumpavg/bit_counting_group.py @@ -0,0 +1,163 @@ +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module holding BitCountingGroup class.""" + +import collections +import dataclasses +import typing + +from .avg_stdev_stats import AvgStdevStats +from .bit_counting_stats import BitCountingStats + + +@dataclasses.dataclass +class BitCountingGroup(collections.abc.Sequence): + """Group of runs which tracks bit count in an efficient manner. + + This class contains methods that mutate the internal state, + use copy() method to save the previous state. + + The Sequence-like access is related to the list of runs, + for example group[0] returns the first run in the list. + Writable list-like methods are not implemented. + + As the group bit count depends on previous average + and overall maximal value, those values are assumed + to be known beforehand (and immutable). + + As the caller is allowed to divide runs into groups in any way, + a method to add a single run in an efficient manner is provided. + """ + + run_list: typing.List[typing.Union[float, AvgStdevStats]] + """List of run to compose into this group. + The init call takes ownership of the list, + so the caller should clone it to avoid unexpected muations.""" + max_value: float + """Maximal sample value to expect.""" + unit: float = 1.0 + """Typical resolution of the values""" + comment: str = "normal" + """Any string giving more info, e.g. "regression".""" + prev_avg: typing.Optional[float] = None + """Average of the previous group, if any.""" + stats: AvgStdevStats = None + """Stats object used for computing bits. + Almost always recomputed, except when non-None in init.""" + cached_bits: typing.Optional[float] = None + """Cached value of information content. + Noned on edit, recomputed if needed and None.""" + + def __post_init__(self): + """Recompute stats is None. + + It is not verified whether the user provided values are valid, + e.g. whether the stats and bits values reflect the runs. + """ + if self.stats is None: + self.stats = AvgStdevStats.for_runs(runs=self.run_list) + + @property + def bits(self) -> float: + """Return overall bit content of the group list. + + If not cached, compute from stats and cache. + + :returns: The overall information content in bits. + :rtype: float + """ + if self.cached_bits is None: + self.cached_bits = BitCountingStats.for_runs_and_params( + runs=[self.stats], + max_value=self.max_value, + unit=self.unit, + prev_avg=self.prev_avg, + ).bits + return self.cached_bits + + def __getitem__(self, index: int) -> typing.Union[float, AvgStdevStats]: + """Return the run at the index. + + :param index: Index of the run to return. + :type index: int + :returns: The run at the index. + :rtype: typing.Union[float, AvgStdevStats] + """ + return self.run_list[index] + + def __len__(self) -> int: + """Return the number of runs in the group. + + :returns: The Length of run_list. + :rtype: int + """ + return len(self.run_list) + + def copy(self) -> "BitCountingGroup": + """Return a new instance with copied internal state. + + Stats are preserved to avoid re-computation. + As both float and AvgStdevStats are effectively immutable, + only a shallow copy of the runs list is performed. + + :returns: The copied instance. + :rtype: BitCountingGroup + """ + stats = AvgStdevStats.for_runs([self.stats]) + return self.__class__( + run_list=list(self.run_list), + stats=stats, + cached_bits=self.cached_bits, + max_value=self.max_value, + unit=self.unit, + prev_avg=self.prev_avg, + comment=self.comment, + ) + + def append( + self, run: typing.Union[float, AvgStdevStats] + ) -> "BitCountingGroup": + """Mutate to add the new run, return self. + + Stats are updated, but old bits value is deleted from cache. + + :param run: The run value to add to the group. + :type value: typing.Union[float, AvgStdevStats] + :returns: The updated self. + :rtype: BitCountingGroup + """ + self.run_list.append(run) + self.stats = AvgStdevStats.for_runs([self.stats, run]) + self.cached_bits = None + return self + + def extend( + self, runs: typing.Iterable[typing.Union[float, AvgStdevStats]] + ) -> "BitCountingGroup": + """Mutate to add the new runs, return self. + + This is saves small amount of computation + compared to adding runs one by one in a loop. + + Stats are updated, but old bits value is deleted from cache. + + :param runs: The runs to add to the group. + :type value: typing.Iterable[typing.Union[float, AvgStdevStats]] + :returns: The updated self. + :rtype: BitCountingGroup + """ + self.run_list.extend(runs) + self.stats = AvgStdevStats.for_runs([self.stats] + runs) + self.cached_bits = None + return self diff --git a/resources/libraries/python/jumpavg/bit_counting_group_list.py b/resources/libraries/python/jumpavg/bit_counting_group_list.py new file mode 100644 index 0000000000..e4d33b53a2 --- /dev/null +++ b/resources/libraries/python/jumpavg/bit_counting_group_list.py @@ -0,0 +1,203 @@ +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module holding BitCountingGroupList class.""" + +import collections +import dataclasses +import typing + +from .avg_stdev_stats import AvgStdevStats # Just for type hints. +from .bit_counting_group import BitCountingGroup + + +@dataclasses.dataclass +class BitCountingGroupList(collections.abc.Sequence): + """List of data groups which tracks overall bit count. + + The Sequence-like access is related to the list of groups, + for example group_list[0] returns the first group in the list. + Writable list-like methods are not implemented. + + The overall bit count is the sum of bit counts of each group. + Group is a sequence of data samples accompanied by their stats. + Different partitioning of data samples into the groups + results in different overall bit count. + This can be used to group samples in various contexts. + + As the group bit count depends on previous average + and overall maximal value, order of groups is important. + Having the logic encapsulated here spares the caller + the effort to pass averages around. + + The data can be only added, and there is some logic to skip + recalculations if the bit count is not needed. + """ + + max_value: float + """Maximal sample value to base bits computation on.""" + unit: float = 1.0 + """Typical resolution of the values.""" + group_list: typing.List[BitCountingGroup] = None + """List of groups to compose this group list. + Init also accepts None standing for an empty list. + This class takes ownership of the list, + so caller of init should clone their copy to avoid unexpected mutations. + """ + bits_except_last: float = 0.0 + """Partial sum of all but one group bits.""" + + def __post_init__(self): + """Turn possible None into an empty list. + + It is not verified whether the user provided values are valid, + e.g. whether the cached bits values (and bits_except_last) make sense. + """ + if self.group_list is None: + self.group_list = [] + + def __getitem__(self, index: int) -> BitCountingGroup: + """Return the group at the index. + + :param index: Index of the group to return. + :type index: int + :returns: The group at the index. + :rtype: BitCountingGroup + """ + return self.group_list[index] + + def __len__(self) -> int: + """Return the length of the group list. + + :returns: The Length of group_list. + :rtype: int + """ + return len(self.group_list) + + def copy(self) -> "BitCountingGroupList": + """Return a new instance with copied internal state. + + :returns: The copied instance. + :rtype: BitCountingGroupList + """ + return self.__class__( + max_value=self.max_value, + unit=self.unit, + group_list=[group.copy() for group in self.group_list], + bits_except_last=self.bits_except_last, + ) + + def copy_fast(self) -> "BitCountingGroupList": + """Return a new instance with minimaly copied internal state. + + The assumption here is that only the last group will ever be mutated + (in self, probably never in the return value), + so all the previous groups can be "copied by reference". + + :returns: The copied instance. + :rtype: BitCountingGroupList + """ + group_list = list(self.group_list) + if group_list: + group_list[-1] = group_list[-1].copy() + # Further speedup is possible by keeping the last group + # as a singly linked (from end) list, + # but for CSIT sample sizes, copy of whole Python list is faster. + # TODO: Implement linked list as an option + # for users with many samples. + return self.__class__( + max_value=self.max_value, + unit=self.unit, + group_list=group_list, + bits_except_last=self.bits_except_last, + ) + + @property + def bits(self) -> float: + """Return overall bit content of the group list. + + :returns: The overall information content in bits. + :rtype: float + """ + if not self.group_list: + return 0.0 + # TODO: Is it worth to cache the overall result? + return self.bits_except_last + self.group_list[-1].bits + + def append_group_of_runs( + self, + runs: typing.Union[ + BitCountingGroup, typing.List[typing.Union[float, AvgStdevStats]] + ], + ) -> "BitCountingGroupList": + """Mutate to add a new group based on the runs, return self. + + The list argument is NOT copied before adding to the group list, + so further edits MAY not affect the grup list. + The list from BitCountingGroup is shallow copied though. + + :param runs: Runs to form the next group to be appended to self. + :type runs: Union[Iterable[Run], BitCountingGroup] + :returns: The updated self. + :rtype: BitCountingGroupList + """ + prev_avg = self.group_list[-1].stats.avg if self.group_list else None + if isinstance(runs, BitCountingGroup): + # It is faster to avoid stats recalculation. + new_group = runs.copy() + new_group.max_value = self.max_value + # Unit is common. + new_group.prev_avg = prev_avg + new_group.cached_bits = None + else: + new_group = BitCountingGroup( + run_list=runs, + max_value=self.max_value, + unit=self.unit, + prev_avg=prev_avg, + ) + self.bits_except_last = self.bits + self.group_list.append(new_group) + return self + + def append_run_to_to_last_group( + self, run: typing.Union[float, AvgStdevStats] + ) -> "BitCountingGroupList": + """Mutate to add new run at the end of the last group. + + Basically a one-liner, only returning group list instead of last group. + + :param run: The run value to add to the last group. + :type run: Run + :returns: The updated self. + :rtype: BitCountingGroupList + :raises IndexError: If group list is empty, no last group to add to. + """ + self.group_list[-1].append(run) + return self + + def extend_runs_to_last_group( + self, runs: typing.Iterable[typing.Union[float, AvgStdevStats]] + ) -> "BitCountingGroupList": + """Mutate to add new runs to the end of the last group. + + A faster alternative to appending runs one by one in a loop. + + :param runs: The runs to add to the last group. + :type runs: Iterable[Run] + :returns: The updated self + :rtype: BitCountingGroupList + :raises IndexError: If group list is empty, no last group to add to. + """ + self.group_list[-1].extend(runs) + return self diff --git a/resources/libraries/python/jumpavg/bit_counting_stats.py b/resources/libraries/python/jumpavg/bit_counting_stats.py new file mode 100644 index 0000000000..caece2c8ca --- /dev/null +++ b/resources/libraries/python/jumpavg/bit_counting_stats.py @@ -0,0 +1,157 @@ +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module holding BitCountingStats class.""" + +import dataclasses +import math +import typing + +from .avg_stdev_stats import AvgStdevStats + + +@dataclasses.dataclass +class BitCountingStats(AvgStdevStats): + """Class for statistics which include information content of a group. + + The information content is based on an assumption that the data + consists of independent random values from a normal distribution. + + Instances are only statistics, the data itself is stored elsewhere. + + The coding needs to know the previous average, and a maximal value + so both values are required as inputs. + + This is a subclass of AvgStdevStats, even though all methods are overriden. + Only for_runs method calls the parent implementation, without using super(). + """ + + max_value: float = None + """Maximal sample value (real or estimated). + Default value is there just for argument ordering reasons, + leaving None leads to exceptions.""" + unit: float = 1.0 + """Typical resolution of the values.""" + prev_avg: typing.Optional[float] = None + """Population average of the previous group (if any).""" + bits: float = None + """The computed information content of the group. + It is formally an argument to init function, just to keep repr string + a valid call. ut the init value is ignored and always recomputed. + """ + + def __post_init__(self): + """Construct the stats object by computing from the values needed. + + The None values are allowed for stats for zero size data, + but such stats can report arbitrary avg and max_value. + Stats for nonzero size data cannot contain None, + else ValueError is raised. + + The max_value needs to be numeric for nonzero size, + but its relations to avg and prev_avg are not examined. + + The bit count is not real, as that would depend on numeric precision + (number of significant bits in values). + The difference is assumed to be constant per value, + which is consistent with Gauss distribution + (but not with floating point mechanic). + The hope is the difference will have + no real impact on the classification procedure. + """ + # Zero size should in principle have non-zero bits (coding zero size), + # but zero allows users to add empty groups without affecting bits. + self.bits = 0.0 + if self.size < 1: + return + if self.max_value <= 0.0: + raise ValueError(f"Invalid max value: {self!r}") + max_value = self.max_value / self.unit + avg = self.avg / self.unit + # Length of the sequence must be also counted in bits, + # otherwise the message would not be decodable. + # Model: probability of k samples is 1/k - 1/(k+1) == 1/k/(k+1) + # This is compatible with zero size leading to zero bits. + self.bits += math.log(self.size * (self.size + 1), 2) + if self.prev_avg is None: + # Avg is considered to be uniformly distributed + # from zero to max_value. + self.bits += math.log(max_value + 1, 2) + else: + # Opposite triangle distribution with minimum. + prev_avg = self.prev_avg / self.unit + norm = prev_avg * prev_avg + norm -= (prev_avg - 1) * max_value + norm += max_value * max_value / 2 + self.bits -= math.log((abs(avg - prev_avg) + 1) / norm, 2) + if self.size < 2: + return + stdev = self.stdev / self.unit + # Stdev is considered to be uniformly distributed + # from zero to max_value. That is quite a bad expectation, + # but resilient to negative samples etc. + self.bits += math.log(max_value + 1, 2) + # Now we know the samples lie on sphere in size-1 dimensions. + # So it is (size-2)-sphere, with radius^2 == stdev^2 * size. + # https://en.wikipedia.org/wiki/N-sphere + sphere_area_ln = math.log(2) + sphere_area_ln += math.log(math.pi) * ((self.size - 1) / 2) + sphere_area_ln -= math.lgamma((self.size - 1) / 2) + sphere_area_ln += math.log(stdev + 1) * (self.size - 2) + sphere_area_ln += math.log(self.size) * ((self.size - 2) / 2) + self.bits += sphere_area_ln / math.log(2) + + @classmethod + def for_runs_and_params( + cls, + runs: typing.Iterable[typing.Union[float, AvgStdevStats]], + max_value: float, + unit: float = 1.0, + prev_avg: typing.Optional[float] = None, + ): + """Return new stats instance describing the sequence of runs. + + If you want to append data to existing stats object, + you can simply use the stats object as the first run. + + Instead of a verb, "for" is used to start this method name, + to signify the result contains less information than the input data. + + The two optional values can come from outside of the runs provided. + + The max_value cannot be None for non-zero size data. + The implementation does not check if no datapoint exceeds max_value. + + TODO: Document the behavior for zero size result. + + :param runs: Sequence of data to describe by the new metadata. + :param max_value: Maximal expected value. + :param unit: Typical resolution of the values. + :param prev_avg: Population average of the previous group, if any. + :type runs: Iterable[Union[float, AvgStdevStats]] + :type max_value: Union[float, NoneType] + :type unit: float + :type prev_avg: Union[float, NoneType] + :returns: The new stats instance. + :rtype: cls + """ + asd = AvgStdevStats.for_runs(runs) + ret_obj = cls( + size=asd.size, + avg=asd.avg, + stdev=asd.stdev, + max_value=max_value, + unit=unit, + prev_avg=prev_avg, + ) + return ret_obj diff --git a/resources/libraries/python/jumpavg/classify.py b/resources/libraries/python/jumpavg/classify.py index 87d2502037..cc3cdcceed 100644 --- a/resources/libraries/python/jumpavg/classify.py +++ b/resources/libraries/python/jumpavg/classify.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: @@ -13,21 +13,23 @@ """Module holding the classify function -Classification os one of primary purposes of this package. +Classification is one of primary purposes of this package. Minimal message length principle is used for grouping results into the list of groups, assuming each group is a population of different Gaussian distribution. """ -import typing +from typing import Iterable, Optional, Union -from .AvgStdevStats import AvgStdevStats -from .BitCountingGroupList import BitCountingGroupList +from .avg_stdev_stats import AvgStdevStats +from .bit_counting_group_list import BitCountingGroupList def classify( - values: typing.Iterable[typing.Union[float, typing.Iterable[float]]] + values: Iterable[Union[float, Iterable[float]]], + unit: Optional[float] = None, + sbps: Optional[float] = None, ) -> BitCountingGroupList: """Return the values in groups of optimal bit count. @@ -38,12 +40,27 @@ def classify( Internally, such sequence is replaced by AvgStdevStats after maximal value is found. + If the values are smaller than expected (below one unit), + the underlying assumption break down and the classification is wrong. + Use the "unit" parameter to hint at what the input resolution is. + + If the correct value of unit is not known beforehand, + the argument "sbps" (Significant Bits Per Sample) can be used + to set unit such that maximal sample value is this many ones in binary. + If neither "unit" nor "sbps" are given, "sbps" of 12 is used by default. + :param values: Sequence of runs to classify. + :param unit: Typical resolution of the values. + Zero and None means no unit given. + :param sbps: Significant Bits Per Sample. None on zero means 12. + If units is not set, this is used to compute unit from max sample value. :type values: Iterable[Union[float, Iterable[float]]] + :type unit: Optional[float] + :type sbps: Optional[float] :returns: Classified group list. :rtype: BitCountingGroupList """ - processed_values = list() + processed_values = [] max_value = 0.0 for value in values: if isinstance(value, (float, int)): @@ -55,9 +72,14 @@ def classify( if subvalue > max_value: max_value = subvalue processed_values.append(AvgStdevStats.for_runs(value)) + if not unit: + if not sbps: + sbps = 12.0 + max_in_units = pow(2.0, sbps + 1.0) - 1.0 + unit = max_value / max_in_units # Glist means group list (BitCountingGroupList). - open_glists = list() - record_glist = BitCountingGroupList(max_value=max_value) + open_glists = [] + record_glist = BitCountingGroupList(max_value=max_value, unit=unit) for value in processed_values: new_open_glist = record_glist.copy_fast().append_group_of_runs([value]) record_glist = new_open_glist @@ -68,9 +90,7 @@ def classify( open_glists.append(new_open_glist) previous_average = record_glist[0].stats.avg for group in record_glist: - if group.stats.avg == previous_average: - group.comment = "normal" - elif group.stats.avg < previous_average: + if group.stats.avg < previous_average: group.comment = "regression" elif group.stats.avg > previous_average: group.comment = "progression" -- cgit 1.2.3-korg