aboutsummaryrefslogtreecommitdiffstats
path: root/resources/libraries/python/jumpavg
diff options
context:
space:
mode:
authorVratko Polak <vrpolak@cisco.com>2023-06-02 14:44:47 +0200
committerTibor Frank <tifrank@cisco.com>2023-06-07 05:53:55 +0000
commit079c390e0903a98182781ff5c2af2bba9902b4ed (patch)
tree98116eea60af0ed530548bb52dff126aef0dc9e9 /resources/libraries/python/jumpavg
parent2c9b2a4298cac5cc4d6ca60cb1da8bd72ec23c37 (diff)
feat(jumpavg): support small values via unit param
Previously, Jumpavg was known to give wrong results when the data contains values of order one or smaller. This change introduces a new "unit" parameter, which changes how the information content is calculated. For example, if the data values are multiples of 0.01, the unit parameter should be set to 0.01 to compensate. For callers not knowing their correct unit value, another parameter is introduced, called "sbps" (meaning Significant Bits Per Sample). The maximal sample value, expressed in units, should be a binary integer consisting of this many ones. This way jumpavg computes the corresponding "unit" value to use. If neither "unit" nor "sbps" are given, the "sbps" value of 12 is applied. + Rename files to conform to snake_style naming. + Fix normalization for the "opposite triangle" distribution. + Simplify logic, all groups now start as "normal", not "unknown". + Minor style improvements as suggested by pylint. + From the user perspective, this change should be backward compatible. - The normalization fix is a behavior change, but it is a bugfix and the new behavior should be better. Change-Id: I5a5ca11757f087fff13faf1d0b8e34a741400258 Signed-off-by: Vratko Polak <vrpolak@cisco.com>
Diffstat (limited to 'resources/libraries/python/jumpavg')
-rw-r--r--resources/libraries/python/jumpavg/__init__.py10
-rw-r--r--resources/libraries/python/jumpavg/avg_stdev_stats.py (renamed from resources/libraries/python/jumpavg/AvgStdevStats.py)2
-rw-r--r--resources/libraries/python/jumpavg/bit_counting_group.py (renamed from resources/libraries/python/jumpavg/BitCountingGroup.py)20
-rw-r--r--resources/libraries/python/jumpavg/bit_counting_group_list.py (renamed from resources/libraries/python/jumpavg/BitCountingGroupList.py)18
-rw-r--r--resources/libraries/python/jumpavg/bit_counting_stats.py (renamed from resources/libraries/python/jumpavg/BitCountingStats.py)38
-rw-r--r--resources/libraries/python/jumpavg/classify.py44
6 files changed, 87 insertions, 45 deletions
diff --git a/resources/libraries/python/jumpavg/__init__.py b/resources/libraries/python/jumpavg/__init__.py
index 4fa696c538..7f63b5ee39 100644
--- a/resources/libraries/python/jumpavg/__init__.py
+++ b/resources/libraries/python/jumpavg/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -15,8 +15,8 @@
__init__ file for "jumpavg" Python package.
"""
-from .AvgStdevStats import AvgStdevStats
-from .BitCountingStats import BitCountingStats
-from .BitCountingGroup import BitCountingGroup
-from .BitCountingGroupList import BitCountingGroupList
+from .avg_stdev_stats import AvgStdevStats
+from .bit_counting_stats import BitCountingStats
+from .bit_counting_group import BitCountingGroup
+from .bit_counting_group_list import BitCountingGroupList
from .classify import classify
diff --git a/resources/libraries/python/jumpavg/AvgStdevStats.py b/resources/libraries/python/jumpavg/avg_stdev_stats.py
index d40b316bf1..3d6a834919 100644
--- a/resources/libraries/python/jumpavg/AvgStdevStats.py
+++ b/resources/libraries/python/jumpavg/avg_stdev_stats.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
diff --git a/resources/libraries/python/jumpavg/BitCountingGroup.py b/resources/libraries/python/jumpavg/bit_counting_group.py
index 48bea086f4..22c9337532 100644
--- a/resources/libraries/python/jumpavg/BitCountingGroup.py
+++ b/resources/libraries/python/jumpavg/bit_counting_group.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -17,8 +17,8 @@ import collections
import dataclasses
import typing
-from .AvgStdevStats import AvgStdevStats
-from .BitCountingStats import BitCountingStats
+from .avg_stdev_stats import AvgStdevStats
+from .bit_counting_stats import BitCountingStats
@dataclasses.dataclass
@@ -46,7 +46,9 @@ class BitCountingGroup(collections.abc.Sequence):
so the caller should clone it to avoid unexpected mutations."""
max_value: float
"""Maximal sample value to expect."""
- comment: str = "unknown"
+ unit: float = 1.0
+ """Typical resolution of the values"""
+ comment: str = "normal"
"""Any string giving more info, e.g. "regression"."""
prev_avg: typing.Optional[float] = None
"""Average of the previous group, if any."""
@@ -64,7 +66,7 @@ class BitCountingGroup(collections.abc.Sequence):
e.g. whether the stats and bits values reflect the runs.
"""
if self.stats is None:
- self.stats = AvgStdevStats.for_runs(self.run_list)
+ self.stats = AvgStdevStats.for_runs(runs=self.run_list)
@property
def bits(self) -> float:
@@ -76,8 +78,11 @@ class BitCountingGroup(collections.abc.Sequence):
:rtype: float
"""
if self.cached_bits is None:
- self.cached_bits = BitCountingStats.for_runs(
- [self.stats], self.max_value, self.prev_avg
+ self.cached_bits = BitCountingStats.for_runs_and_params(
+ runs=[self.stats],
+ max_value=self.max_value,
+ unit=self.unit,
+ prev_avg=self.prev_avg,
).bits
return self.cached_bits
@@ -115,6 +120,7 @@ class BitCountingGroup(collections.abc.Sequence):
stats=stats,
cached_bits=self.cached_bits,
max_value=self.max_value,
+ unit=self.unit,
prev_avg=self.prev_avg,
comment=self.comment,
)
diff --git a/resources/libraries/python/jumpavg/BitCountingGroupList.py b/resources/libraries/python/jumpavg/bit_counting_group_list.py
index 468e79b236..e4d33b53a2 100644
--- a/resources/libraries/python/jumpavg/BitCountingGroupList.py
+++ b/resources/libraries/python/jumpavg/bit_counting_group_list.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -17,8 +17,8 @@ import collections
import dataclasses
import typing
-from .AvgStdevStats import AvgStdevStats # Just for type hints.
-from .BitCountingGroup import BitCountingGroup
+from .avg_stdev_stats import AvgStdevStats # Just for type hints.
+from .bit_counting_group import BitCountingGroup
@dataclasses.dataclass
@@ -46,6 +46,8 @@ class BitCountingGroupList(collections.abc.Sequence):
max_value: float
"""Maximal sample value to base bits computation on."""
+ unit: float = 1.0
+ """Typical resolution of the values."""
group_list: typing.List[BitCountingGroup] = None
"""List of groups to compose this group list.
Init also accepts None standing for an empty list.
@@ -62,7 +64,7 @@ class BitCountingGroupList(collections.abc.Sequence):
e.g. whether the cached bits values (and bits_except_last) make sense.
"""
if self.group_list is None:
- self.group_list = list()
+ self.group_list = []
def __getitem__(self, index: int) -> BitCountingGroup:
"""Return the group at the index.
@@ -90,6 +92,7 @@ class BitCountingGroupList(collections.abc.Sequence):
"""
return self.__class__(
max_value=self.max_value,
+ unit=self.unit,
group_list=[group.copy() for group in self.group_list],
bits_except_last=self.bits_except_last,
)
@@ -114,6 +117,7 @@ class BitCountingGroupList(collections.abc.Sequence):
# for users with many samples.
return self.__class__(
max_value=self.max_value,
+ unit=self.unit,
group_list=group_list,
bits_except_last=self.bits_except_last,
)
@@ -152,11 +156,15 @@ class BitCountingGroupList(collections.abc.Sequence):
# It is faster to avoid stats recalculation.
new_group = runs.copy()
new_group.max_value = self.max_value
+ # Unit is common.
new_group.prev_avg = prev_avg
new_group.cached_bits = None
else:
new_group = BitCountingGroup(
- run_list=runs, max_value=self.max_value, prev_avg=prev_avg
+ run_list=runs,
+ max_value=self.max_value,
+ unit=self.unit,
+ prev_avg=prev_avg,
)
self.bits_except_last = self.bits
self.group_list.append(new_group)
diff --git a/resources/libraries/python/jumpavg/BitCountingStats.py b/resources/libraries/python/jumpavg/bit_counting_stats.py
index 524ac952c8..caece2c8ca 100644
--- a/resources/libraries/python/jumpavg/BitCountingStats.py
+++ b/resources/libraries/python/jumpavg/bit_counting_stats.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -17,7 +17,7 @@ import dataclasses
import math
import typing
-from .AvgStdevStats import AvgStdevStats
+from .avg_stdev_stats import AvgStdevStats
@dataclasses.dataclass
@@ -40,6 +40,8 @@ class BitCountingStats(AvgStdevStats):
"""Maximal sample value (real or estimated).
Default value is there just for argument ordering reasons,
leaving None leads to exceptions."""
+ unit: float = 1.0
+ """Typical resolution of the values."""
prev_avg: typing.Optional[float] = None
"""Population average of the previous group (if any)."""
bits: float = None
@@ -74,6 +76,8 @@ class BitCountingStats(AvgStdevStats):
return
if self.max_value <= 0.0:
raise ValueError(f"Invalid max value: {self!r}")
+ max_value = self.max_value / self.unit
+ avg = self.avg / self.unit
# Length of the sequence must be also counted in bits,
# otherwise the message would not be decodable.
# Model: probability of k samples is 1/k - 1/(k+1) == 1/k/(k+1)
@@ -82,36 +86,37 @@ class BitCountingStats(AvgStdevStats):
if self.prev_avg is None:
# Avg is considered to be uniformly distributed
# from zero to max_value.
- self.bits += math.log(self.max_value + 1.0, 2)
+ self.bits += math.log(max_value + 1, 2)
else:
# Opposite triangle distribution with minimum.
- self.bits += math.log(
- (self.max_value * (self.max_value + 1))
- / (abs(self.avg - self.prev_avg) + 1),
- 2,
- )
+ prev_avg = self.prev_avg / self.unit
+ norm = prev_avg * prev_avg
+ norm -= (prev_avg - 1) * max_value
+ norm += max_value * max_value / 2
+ self.bits -= math.log((abs(avg - prev_avg) + 1) / norm, 2)
if self.size < 2:
return
+ stdev = self.stdev / self.unit
# Stdev is considered to be uniformly distributed
# from zero to max_value. That is quite a bad expectation,
# but resilient to negative samples etc.
- self.bits += math.log(self.max_value + 1.0, 2)
+ self.bits += math.log(max_value + 1, 2)
# Now we know the samples lie on sphere in size-1 dimensions.
# So it is (size-2)-sphere, with radius^2 == stdev^2 * size.
# https://en.wikipedia.org/wiki/N-sphere
sphere_area_ln = math.log(2)
- sphere_area_ln += math.log(math.pi) * ((self.size - 1) / 2.0)
- sphere_area_ln -= math.lgamma((self.size - 1) / 2.0)
- sphere_area_ln += math.log(self.stdev + 1.0) * (self.size - 2)
- sphere_area_ln += math.log(self.size) * ((self.size - 2) / 2.0)
+ sphere_area_ln += math.log(math.pi) * ((self.size - 1) / 2)
+ sphere_area_ln -= math.lgamma((self.size - 1) / 2)
+ sphere_area_ln += math.log(stdev + 1) * (self.size - 2)
+ sphere_area_ln += math.log(self.size) * ((self.size - 2) / 2)
self.bits += sphere_area_ln / math.log(2)
- # TODO: Rename, so pylint stops complaining about signature change.
@classmethod
- def for_runs(
+ def for_runs_and_params(
cls,
runs: typing.Iterable[typing.Union[float, AvgStdevStats]],
max_value: float,
+ unit: float = 1.0,
prev_avg: typing.Optional[float] = None,
):
"""Return new stats instance describing the sequence of runs.
@@ -131,9 +136,11 @@ class BitCountingStats(AvgStdevStats):
:param runs: Sequence of data to describe by the new metadata.
:param max_value: Maximal expected value.
+ :param unit: Typical resolution of the values.
:param prev_avg: Population average of the previous group, if any.
:type runs: Iterable[Union[float, AvgStdevStats]]
:type max_value: Union[float, NoneType]
+ :type unit: float
:type prev_avg: Union[float, NoneType]
:returns: The new stats instance.
:rtype: cls
@@ -144,6 +151,7 @@ class BitCountingStats(AvgStdevStats):
avg=asd.avg,
stdev=asd.stdev,
max_value=max_value,
+ unit=unit,
prev_avg=prev_avg,
)
return ret_obj
diff --git a/resources/libraries/python/jumpavg/classify.py b/resources/libraries/python/jumpavg/classify.py
index 87d2502037..cc3cdcceed 100644
--- a/resources/libraries/python/jumpavg/classify.py
+++ b/resources/libraries/python/jumpavg/classify.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -13,21 +13,23 @@
"""Module holding the classify function
-Classification os one of primary purposes of this package.
+Classification is one of primary purposes of this package.
Minimal message length principle is used
for grouping results into the list of groups,
assuming each group is a population of different Gaussian distribution.
"""
-import typing
+from typing import Iterable, Optional, Union
-from .AvgStdevStats import AvgStdevStats
-from .BitCountingGroupList import BitCountingGroupList
+from .avg_stdev_stats import AvgStdevStats
+from .bit_counting_group_list import BitCountingGroupList
def classify(
- values: typing.Iterable[typing.Union[float, typing.Iterable[float]]]
+ values: Iterable[Union[float, Iterable[float]]],
+ unit: Optional[float] = None,
+ sbps: Optional[float] = None,
) -> BitCountingGroupList:
"""Return the values in groups of optimal bit count.
@@ -38,12 +40,27 @@ def classify(
Internally, such sequence is replaced by AvgStdevStats
after maximal value is found.
+ If the values are smaller than expected (below one unit),
+ the underlying assumptions break down and the classification is wrong.
+ Use the "unit" parameter to hint at what the input resolution is.
+
+ If the correct value of unit is not known beforehand,
+ the argument "sbps" (Significant Bits Per Sample) can be used
+ to set unit such that maximal sample value is this many ones in binary.
+ If neither "unit" nor "sbps" are given, "sbps" of 12 is used by default.
+
:param values: Sequence of runs to classify.
+ :param unit: Typical resolution of the values.
+ Zero or None means no unit given.
+ :param sbps: Significant Bits Per Sample. None or zero means 12.
+ If unit is not set, this is used to compute unit from max sample value.
:type values: Iterable[Union[float, Iterable[float]]]
+ :type unit: Optional[float]
+ :type sbps: Optional[float]
:returns: Classified group list.
:rtype: BitCountingGroupList
"""
- processed_values = list()
+ processed_values = []
max_value = 0.0
for value in values:
if isinstance(value, (float, int)):
@@ -55,9 +72,14 @@ def classify(
if subvalue > max_value:
max_value = subvalue
processed_values.append(AvgStdevStats.for_runs(value))
+ if not unit:
+ if not sbps:
+ sbps = 12.0
+ max_in_units = pow(2.0, sbps + 1.0) - 1.0
+ unit = max_value / max_in_units
# Glist means group list (BitCountingGroupList).
- open_glists = list()
- record_glist = BitCountingGroupList(max_value=max_value)
+ open_glists = []
+ record_glist = BitCountingGroupList(max_value=max_value, unit=unit)
for value in processed_values:
new_open_glist = record_glist.copy_fast().append_group_of_runs([value])
record_glist = new_open_glist
@@ -68,9 +90,7 @@ def classify(
open_glists.append(new_open_glist)
previous_average = record_glist[0].stats.avg
for group in record_glist:
- if group.stats.avg == previous_average:
- group.comment = "normal"
- elif group.stats.avg < previous_average:
+ if group.stats.avg < previous_average:
group.comment = "regression"
elif group.stats.avg > previous_average:
group.comment = "progression"