aboutsummaryrefslogtreecommitdiffstats
path: root/resources/libraries/python/jumpavg/classify.py
diff options
context:
space:
mode:
authorVratko Polak <vrpolak@cisco.com>2023-06-02 14:44:47 +0200
committerTibor Frank <tifrank@cisco.com>2023-06-07 05:53:55 +0000
commit079c390e0903a98182781ff5c2af2bba9902b4ed (patch)
tree98116eea60af0ed530548bb52dff126aef0dc9e9 /resources/libraries/python/jumpavg/classify.py
parent2c9b2a4298cac5cc4d6ca60cb1da8bd72ec23c37 (diff)
feat(jumpavg): support small values via unit param
Previously, Jumpavg was known to give wrong results when the data contains values of order one or smaller. This change introduces a new "unit" parameter, which changes how the information content is calculated. For example, if the data values are multiples of 0.01, the unit parameter should be set to 0.01 to compensate. For callers not knowing their correct unit value, another parameter is introduced, called "sbps" (meaning Significant Bits Per Sample). The maximal sample is then assumed to be as many units as a binary integer number with this many ones. This way jumpavg computes the corresponding "unit" value to use. If neither "unit" nor "sbps" is given, the "sbps" value of 12 is applied. + Rename files to conform to snake_case naming. + Fix normalization for the "opposite triangle" distribution. + Simplify logic, all groups now start as "normal", not "unknown". + Minor style improvements as suggested by pylint. + From the user's perspective, this change should be backward compatible. - The normalization fix is a behavior change, but it is a bugfix and the new behavior should be better. Change-Id: I5a5ca11757f087fff13faf1d0b8e34a741400258 Signed-off-by: Vratko Polak <vrpolak@cisco.com>
Diffstat (limited to 'resources/libraries/python/jumpavg/classify.py')
-rw-r--r--resources/libraries/python/jumpavg/classify.py44
1 files changed, 32 insertions, 12 deletions
diff --git a/resources/libraries/python/jumpavg/classify.py b/resources/libraries/python/jumpavg/classify.py
index 87d2502037..cc3cdcceed 100644
--- a/resources/libraries/python/jumpavg/classify.py
+++ b/resources/libraries/python/jumpavg/classify.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 Cisco and/or its affiliates.
+# Copyright (c) 2023 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -13,21 +13,23 @@
"""Module holding the classify function
-Classification os one of primary purposes of this package.
+Classification is one of primary purposes of this package.
Minimal message length principle is used
for grouping results into the list of groups,
assuming each group is a population of different Gaussian distribution.
"""
-import typing
+from typing import Iterable, Optional, Union
-from .AvgStdevStats import AvgStdevStats
-from .BitCountingGroupList import BitCountingGroupList
+from .avg_stdev_stats import AvgStdevStats
+from .bit_counting_group_list import BitCountingGroupList
def classify(
- values: typing.Iterable[typing.Union[float, typing.Iterable[float]]]
+ values: Iterable[Union[float, Iterable[float]]],
+ unit: Optional[float] = None,
+ sbps: Optional[float] = None,
) -> BitCountingGroupList:
"""Return the values in groups of optimal bit count.
@@ -38,12 +40,27 @@ def classify(
Internally, such sequence is replaced by AvgStdevStats
after maximal value is found.
+ If the values are smaller than expected (below one unit),
+ the underlying assumption break down and the classification is wrong.
+ Use the "unit" parameter to hint at what the input resolution is.
+
+ If the correct value of unit is not known beforehand,
+ the argument "sbps" (Significant Bits Per Sample) can be used
+ to set unit such that maximal sample value is this many ones in binary.
+ If neither "unit" nor "sbps" are given, "sbps" of 12 is used by default.
+
:param values: Sequence of runs to classify.
+ :param unit: Typical resolution of the values.
+ Zero and None means no unit given.
+ :param sbps: Significant Bits Per Sample. None on zero means 12.
+ If units is not set, this is used to compute unit from max sample value.
:type values: Iterable[Union[float, Iterable[float]]]
+ :type unit: Optional[float]
+ :type sbps: Optional[float]
:returns: Classified group list.
:rtype: BitCountingGroupList
"""
- processed_values = list()
+ processed_values = []
max_value = 0.0
for value in values:
if isinstance(value, (float, int)):
@@ -55,9 +72,14 @@ def classify(
if subvalue > max_value:
max_value = subvalue
processed_values.append(AvgStdevStats.for_runs(value))
+ if not unit:
+ if not sbps:
+ sbps = 12.0
+ max_in_units = pow(2.0, sbps + 1.0) - 1.0
+ unit = max_value / max_in_units
# Glist means group list (BitCountingGroupList).
- open_glists = list()
- record_glist = BitCountingGroupList(max_value=max_value)
+ open_glists = []
+ record_glist = BitCountingGroupList(max_value=max_value, unit=unit)
for value in processed_values:
new_open_glist = record_glist.copy_fast().append_group_of_runs([value])
record_glist = new_open_glist
@@ -68,9 +90,7 @@ def classify(
open_glists.append(new_open_glist)
previous_average = record_glist[0].stats.avg
for group in record_glist:
- if group.stats.avg == previous_average:
- group.comment = "normal"
- elif group.stats.avg < previous_average:
+ if group.stats.avg < previous_average:
group.comment = "regression"
elif group.stats.avg > previous_average:
group.comment = "progression"