aboutsummaryrefslogtreecommitdiffstats
path: root/resources/libraries/python/jumpavg/BitCountingStats.py
diff options
context:
space:
mode:
authorVratko Polak <vrpolak@cisco.com>2022-08-09 14:56:15 +0200
committerTibor Frank <tifrank@cisco.com>2022-08-15 10:58:57 +0000
commit4bfbd4d72ad53eb1694868c19640c8b4a17d32cb (patch)
tree0a566caa3a9ce141f8045bf22c395833355f3a7c /resources/libraries/python/jumpavg/BitCountingStats.py
parentc1b770bc71eda83468c0e2a97c851b831b76641b (diff)
feat(jumpavg): speed up, use Python 3.8 features
+ The main speedup comes from abandoning copy.deepcopy(), doing shallow list copies (at most) and introcuding copy_fast(). + Turn into dataclasses whenever possible, use type hints. + Simplify the partition search code, + It is now clearer the time complexity is O(N*(N+n)), where N is number of samples, and n is the average size of the last group of the current record glist. + Used black for formatting, so no u"" anymore. + Update metadata for 0.3.0 release. Change-Id: I302203b4d42aeb22be1128e2fe72353a44eae5d0 Signed-off-by: Vratko Polak <vrpolak@cisco.com>
Diffstat (limited to 'resources/libraries/python/jumpavg/BitCountingStats.py')
-rw-r--r--resources/libraries/python/jumpavg/BitCountingStats.py110
1 files changed, 45 insertions, 65 deletions
diff --git a/resources/libraries/python/jumpavg/BitCountingStats.py b/resources/libraries/python/jumpavg/BitCountingStats.py
index 7b5e659214..524ac952c8 100644
--- a/resources/libraries/python/jumpavg/BitCountingStats.py
+++ b/resources/libraries/python/jumpavg/BitCountingStats.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 Cisco and/or its affiliates.
+# Copyright (c) 2022 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -13,11 +13,14 @@
"""Module holding BitCountingStats class."""
+import dataclasses
import math
+import typing
from .AvgStdevStats import AvgStdevStats
+@dataclasses.dataclass
class BitCountingStats(AvgStdevStats):
"""Class for statistics which include information content of a group.
@@ -33,11 +36,20 @@ class BitCountingStats(AvgStdevStats):
Only for_runs method calls the parent implementation, without using super().
"""
- def __init__(
- self, size=0, avg=None, stdev=0.0, max_value=None, prev_avg=None):
- """Construct the stats object by computing from the values needed.
+ max_value: float = None
+ """Maximal sample value (real or estimated).
+ Default value is there just for argument ordering reasons,
+ leaving None leads to exceptions."""
+ prev_avg: typing.Optional[float] = None
+ """Population average of the previous group (if any)."""
+ bits: float = None
+ """The computed information content of the group.
+ It is formally an argument to init function, just to keep repr string
+ a valid call. ut the init value is ignored and always recomputed.
+ """
- The values are not sanitized, faulty callers can cause math errors.
+ def __post_init__(self):
+ """Construct the stats object by computing from the values needed.
The None values are allowed for stats for zero size data,
but such stats can report arbitrary avg and max_value.
@@ -54,91 +66,54 @@ class BitCountingStats(AvgStdevStats):
(but not with floating point mechanic).
The hope is the difference will have
no real impact on the classification procedure.
-
- :param size: Number of values participating in this group.
- :param avg: Population average of the participating sample values.
- :param stdev: Population standard deviation of the sample values.
- :param max_value: Maximal expected value.
- TODO: This might be more optimal,
- but max-invariant algorithm will be nicer.
- :param prev_avg: Population average of the previous group.
- If None, no previous average is taken into account.
- If not None, the given previous average is used to discourage
- consecutive groups with similar averages
- (opposite triangle distribution is assumed).
- :type avg: float
- :type size: int
- :type stdev: float
- :type max_value: Union[float, NoneType]
- :type prev_avg: Union[float, NoneType]
"""
- self.avg = avg
- self.size = size
- self.stdev = stdev
- self.max_value = max_value
- self.prev_avg = prev_avg
# Zero size should in principle have non-zero bits (coding zero size),
# but zero allows users to add empty groups without affecting bits.
self.bits = 0.0
if self.size < 1:
return
- if avg is None:
- raise ValueError(f"Avg is None: {self!r}")
- if max_value is None or max_value <= 0.0:
+ if self.max_value <= 0.0:
raise ValueError(f"Invalid max value: {self!r}")
# Length of the sequence must be also counted in bits,
# otherwise the message would not be decodable.
# Model: probability of k samples is 1/k - 1/(k+1) == 1/k/(k+1)
# This is compatible with zero size leading to zero bits.
- self.bits += math.log(size * (size + 1), 2)
- if prev_avg is None:
+ self.bits += math.log(self.size * (self.size + 1), 2)
+ if self.prev_avg is None:
# Avg is considered to be uniformly distributed
# from zero to max_value.
- self.bits += math.log(max_value + 1.0, 2)
+ self.bits += math.log(self.max_value + 1.0, 2)
else:
# Opposite triangle distribution with minimum.
self.bits += math.log(
- max_value * (max_value + 1) / (abs(avg - prev_avg) + 1), 2)
+ (self.max_value * (self.max_value + 1))
+ / (abs(self.avg - self.prev_avg) + 1),
+ 2,
+ )
if self.size < 2:
return
# Stdev is considered to be uniformly distributed
# from zero to max_value. That is quite a bad expectation,
# but resilient to negative samples etc.
- self.bits += math.log(max_value + 1.0, 2)
+ self.bits += math.log(self.max_value + 1.0, 2)
# Now we know the samples lie on sphere in size-1 dimensions.
# So it is (size-2)-sphere, with radius^2 == stdev^2 * size.
# https://en.wikipedia.org/wiki/N-sphere
- sphere_area_ln = math.log(2) + math.log(math.pi) * ((size - 1) / 2.0)
- sphere_area_ln -= math.lgamma((size - 1) / 2.0)
- sphere_area_ln += math.log(stdev + 1.0) * (size - 2)
- sphere_area_ln += math.log(size) * ((size - 2) / 2.0)
+ sphere_area_ln = math.log(2)
+ sphere_area_ln += math.log(math.pi) * ((self.size - 1) / 2.0)
+ sphere_area_ln -= math.lgamma((self.size - 1) / 2.0)
+ sphere_area_ln += math.log(self.stdev + 1.0) * (self.size - 2)
+ sphere_area_ln += math.log(self.size) * ((self.size - 2) / 2.0)
self.bits += sphere_area_ln / math.log(2)
- def __str__(self):
- """Return string with human readable description of the group.
-
- :returns: Readable description.
- :rtype: str
- """
- return (
- f"size={self.size} avg={self.avg} stdev={self.stdev}"
- f" bits={self.bits}"
- )
-
- def __repr__(self):
- """Return string executable as Python constructor call.
-
- :returns: Executable constructor call.
- :rtype: str
- """
- return (
- f"BitCountingStats(size={self.size!r},avg={self.avg!r}"
- f",stdev={self.stdev!r},max_value={self.max_value!r}"
- f",prev_avg={self.prev_avg!r})"
- )
-
+ # TODO: Rename, so pylint stops complaining about signature change.
@classmethod
- def for_runs(cls, runs, max_value=None, prev_avg=None):
+ def for_runs(
+ cls,
+ runs: typing.Iterable[typing.Union[float, AvgStdevStats]],
+ max_value: float,
+ prev_avg: typing.Optional[float] = None,
+ ):
"""Return new stats instance describing the sequence of runs.
If you want to append data to existing stats object,
@@ -164,6 +139,11 @@ class BitCountingStats(AvgStdevStats):
:rtype: cls
"""
asd = AvgStdevStats.for_runs(runs)
- ret_obj = cls(size=asd.size, avg=asd.avg, stdev=asd.stdev,
- max_value=max_value, prev_avg=prev_avg)
+ ret_obj = cls(
+ size=asd.size,
+ avg=asd.avg,
+ stdev=asd.stdev,
+ max_value=max_value,
+ prev_avg=prev_avg,
+ )
return ret_obj