Diffstat (limited to 'resources/tools/presentation/utils.py')
-rw-r--r-- | resources/tools/presentation/utils.py | 144
1 file changed, 40 insertions, 104 deletions
diff --git a/resources/tools/presentation/utils.py b/resources/tools/presentation/utils.py
index ba32932187..a2aa0dc071 100644
--- a/resources/tools/presentation/utils.py
+++ b/resources/tools/presentation/utils.py
@@ -17,7 +17,6 @@ import multiprocessing
 import subprocess
 import numpy as np
-import pandas as pd
 import logging
 import csv
 import prettytable

@@ -28,6 +27,7 @@ from shutil import move, Error
 from math import sqrt

 from errors import PresentationError
+from jumpavg.BitCountingClassifier import BitCountingClassifier


 def mean(items):
@@ -71,73 +71,6 @@ def relative_change(nr1, nr2):
     return float(((nr2 - nr1) / nr1) * 100)


-def remove_outliers(input_list, outlier_const=1.5, window=14):
-    """Return list with outliers removed, using split_outliers.
-
-    :param input_list: Data from which the outliers will be removed.
-    :param outlier_const: Outlier constant.
-    :param window: How many preceding values to take into account.
-    :type input_list: list of floats
-    :type outlier_const: float
-    :type window: int
-    :returns: The input list without outliers.
-    :rtype: list of floats
-    """
-
-    data = np.array(input_list)
-    upper_quartile = np.percentile(data, 75)
-    lower_quartile = np.percentile(data, 25)
-    iqr = (upper_quartile - lower_quartile) * outlier_const
-    quartile_set = (lower_quartile - iqr, upper_quartile + iqr)
-    result_lst = list()
-    for y in input_list:
-        if quartile_set[0] <= y <= quartile_set[1]:
-            result_lst.append(y)
-    return result_lst
-
-
-def split_outliers(input_series, outlier_const=1.5, window=14):
-    """Go through the input data and generate two pandas series:
-    - input data with outliers replaced by NAN
-    - outliers.
-    The function uses IQR to detect outliers.
-
-    :param input_series: Data to be examined for outliers.
-    :param outlier_const: Outlier constant.
-    :param window: How many preceding values to take into account.
-    :type input_series: pandas.Series
-    :type outlier_const: float
-    :type window: int
-    :returns: Input data with NAN outliers and Outliers.
-    :rtype: (pandas.Series, pandas.Series)
-    """
-
-    list_data = list(input_series.items())
-    head_size = min(window, len(list_data))
-    head_list = list_data[:head_size]
-    trimmed_data = pd.Series()
-    outliers = pd.Series()
-    for item_x, item_y in head_list:
-        item_pd = pd.Series([item_y, ], index=[item_x, ])
-        trimmed_data = trimmed_data.append(item_pd)
-    for index, (item_x, item_y) in list(enumerate(list_data))[head_size:]:
-        y_rolling_list = [y for (x, y) in list_data[index - head_size:index]]
-        y_rolling_array = np.array(y_rolling_list)
-        q1 = np.percentile(y_rolling_array, 25)
-        q3 = np.percentile(y_rolling_array, 75)
-        iqr = (q3 - q1) * outlier_const
-        low = q1 - iqr
-        item_pd = pd.Series([item_y, ], index=[item_x, ])
-        if low <= item_y:
-            trimmed_data = trimmed_data.append(item_pd)
-        else:
-            outliers = outliers.append(item_pd)
-            nan_pd = pd.Series([np.nan, ], index=[item_x, ])
-            trimmed_data = trimmed_data.append(nan_pd)
-
-    return trimmed_data, outliers
-
-
 def get_files(path, extension=None, full_path=True):
     """Generates the list of files to process.

@@ -276,46 +209,49 @@ def archive_input_data(spec):
     logging.info("    Done.")


-def classify_anomalies(data, window):
-    """Evaluates if the sample value is an outlier, regression, normal or
-    progression compared to the previous data within the window.
-    We use the intervals defined as:
-    regress: less than trimmed moving median - 3 * stdev
-    normal: between trimmed moving median - 3 * stdev and median + 3 * stdev
-    progress: more than trimmed moving median + 3 * stdev
-    where stdev is trimmed moving standard deviation.
+def classify_anomalies(data):
+    """Process the data and return anomalies and trending values.
+
+    Gather data into groups with average as trend value.
+    Decorate values within groups to be normal,
+    the first value of changed average as a regression, or a progression.

-    :param data: Full data set with the outliers replaced by nan.
-    :param window: Window size used to calculate moving average and moving
-        stdev.
+    :param data: Full data set with unavailable samples replaced by nan.
     :type data: pandas.Series
-    :type window: int
-    :returns: Evaluated results.
-    :rtype: list
+    :returns: Classification and trend values
+    :rtype: 2-tuple, list of strings and list of floats
     """
-
-    if data.size < 3:
-        return None
-
-    win_size = data.size if data.size < window else window
-    tmm = data.rolling(window=win_size, min_periods=2).median()
-    tmstd = data.rolling(window=win_size, min_periods=2).std()
-
-    classification = ["normal", ]
-    first = True
-    for build, value in data.iteritems():
-        if first:
-            first = False
-            continue
-        if np.isnan(value) or np.isnan(tmm[build]) or np.isnan(tmstd[build]):
+    # Nan mean something went wrong.
+    # Use 0.0 to cause that being reported as a severe regression.
+    bare_data = [0.0 if np.isnan(sample) else sample
+                 for _, sample in data.iteritems()]
+    # TODO: Put analogous iterator into jumpavg library.
+    groups = BitCountingClassifier().classify(bare_data)
+    groups.reverse()  # Just to use .pop() for FIFO.
+    classification = []
+    avgs = []
+    active_group = None
+    values_left = 0
+    avg = 0.0
+    for _, sample in data.iteritems():
+        if np.isnan(sample):
             classification.append("outlier")
-        elif value < (tmm[build] - 3 * tmstd[build]):
-            classification.append("regression")
-        elif value > (tmm[build] + 3 * tmstd[build]):
-            classification.append("progression")
-        else:
-            classification.append("normal")
-    return classification
+            avgs.append(sample)
+            continue
+        if values_left < 1 or active_group is None:
+            values_left = 0
+            while values_left < 1:  # Ignore empty groups (should not happen).
+                active_group = groups.pop()
+                values_left = len(active_group.values)
+            avg = active_group.metadata.avg
+            classification.append(active_group.metadata.classification)
+            avgs.append(avg)
+            values_left -= 1
+            continue
+        classification.append("normal")
+        avgs.append(avg)
+        values_left -= 1
+    return classification, avgs


 def convert_csv_to_pretty_txt(csv_file, txt_file):
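For orientation, the sketch below shows how the rewritten classify_anomalies() might be called after this change. It is illustrative only: the sample values and build numbers are made up, and it assumes the patched utils.py and the jumpavg package are importable in the presentation tool's environment.

    import numpy as np
    import pandas as pd

    from utils import classify_anomalies

    # Hypothetical trending samples indexed by build number;
    # np.nan marks a build whose result is unavailable.
    samples = pd.Series(
        [14.2, 14.3, 14.1, np.nan, 17.8, 17.9, 17.7],
        index=[1801, 1802, 1803, 1804, 1805, 1806, 1807])

    # classification holds one label per sample ("normal", "outlier",
    # or the first value of a new group marked as regression/progression);
    # avgs holds the per-group average used as the trend value
    # (nan for unavailable samples).
    classification, avgs = classify_anomalies(samples)
    for build, label, avg in zip(samples.index, classification, avgs):
        print(build, label, avg)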