doc/TRexDataAnalysis.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236

#!/scratch/Anaconda2.4.0/bin/python
import pandas as pd
import numpy as np
import matplotlib

matplotlib.use('Agg')
from matplotlib import pyplot as plt
from matplotlib import dates as matdates
from matplotlib import lines as matlines
import os
import time
from datetime import datetime

"""
This module is structured to work with raw data in the following JSON format:

 {'setup_name': {'test1_name':[QUERY1,QUERY2,QUERY3],
                'test2_name':[QUERY1,QUERY2,QUERY3]
                }
  'setup_name2': {'test1_name':[QUERY1,QUERY2,QUERY3],
                'test2_name':[QUERY1,QUERY2,QUERY3]
                }
 }

 The Query structure is set (currently) to this:

 (test_name,state, date,hour,minute,mpps_result,mpps_min,mpps_max,build_id) example:

 ["syn attack - 64 bytes, single CPU", "stl", "20161226", "01", "39", "9.631898", "9.5", "11.5", "54289"]

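 It can be changed to support other query formats: simply change the TestQuery class to match your desired structure.
 The TestQuery class specifies the indexes of the data within the query tuple.

 NOTE: TestQuery currently reads a YYYY-MM-DD HH:MM:SS timestamp from index 1, the MPPS result from index 2 and
 the build id from index 3, so the nine-field layout shown above may be out of date.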

"""


class TestQuery(object):
    """Layout of a single raw query record.

    Holds the positional indexes used to pick fields out of one query
    sequence, together with the ``strptime`` pattern of its timestamp field.
    Adjust these constants to support a differently shaped query.
    """
    # positions of the timestamp, the MPPS result and the build id in a query
    QUERY_TIMESTAMP, QUERY_MPPS_RESULT, QUERY_BUILD_ID = 1, 2, 3
    # pattern used to parse the timestamp field of a query
    QUERY_TIMEFORMAT = "%Y-%m-%d %H:%M:%S"


class Test:
    """Results of one test on one setup, with summary statistics over them.

    Attributes filled in by ``analyze_all_test_data``:
        stats: tuple ``(avg, min, max, std, error, count)`` where
            ``error = ((max - min) / avg) * 100``.
        results_df: DataFrame with the columns ``'Build Id'``, ``'Setup'``,
            ``<name>`` (float results) and ``'<name> Date'``, in that order.
        latest_result: result of the last query, as a float.
        latest_result_date: formatted timestamp of the last query.
    """

    def __init__(self, name, setup_name, end_date):
        """Store the identifying fields; the analysis attributes start empty."""
        self.name = name
        self.setup_name = setup_name
        self.end_date = end_date
        self.stats = []  # tuple
        self.results_df = []  # dataFrame
        self.latest_result = []  # float
        self.latest_result_date = ''  # string

    def analyze_all_test_data(self, raw_test_data):
        """Build the results table and statistics from raw query records.

        :param raw_test_data: sequence of query records. Each record is indexed
            with the ``TestQuery.QUERY_*`` positions, and its timestamp must
            match ``TestQuery.QUERY_TIMEFORMAT``.
        :raises IndexError: if ``raw_test_data`` is empty (there is no latest
            result to report).
        :raises ValueError: if a timestamp or an MPPS value cannot be parsed.
        """
        date_column = self.name + ' Date'
        test_results = []
        test_dates = []
        test_build_ids = []
        for query in raw_test_data:
            time_of_query = time.strptime(query[TestQuery.QUERY_TIMESTAMP], TestQuery.QUERY_TIMEFORMAT)
            test_dates.append(time.strftime("%d-%m-%Y-%H:%M", time_of_query))
            test_results.append(float(query[TestQuery.QUERY_MPPS_RESULT]))
            test_build_ids.append(query[TestQuery.QUERY_BUILD_ID])

        # The column order is pinned explicitly instead of relying on how pandas
        # orders dict keys (alphabetical in old releases, insertion order in
        # newer ones). No dtype='str' is requested: the results column has to
        # stay numeric for the statistics below.
        test_results_df = pd.DataFrame(
            {"Build Id": test_build_ids,
             "Setup": [self.setup_name] * len(test_results),
             self.name: test_results,
             date_column: test_dates},
            columns=["Build Id", "Setup", self.name, date_column])

        results = test_results_df[self.name]
        stats_avg = float(results.mean())
        stats_min = float(results.min())
        stats_max = float(results.max())
        # A zero average makes the relative error undefined; report NaN rather
        # than raising ZeroDivisionError.
        if stats_avg == 0:
            stats_error = float('nan')
        else:
            stats_error = float(((stats_max - stats_min) / stats_avg) * 100)
        # stats = (avg_mpps, min, max, std, error, number of test results)
        stats = (stats_avg, stats_min, stats_max, float(results.std()), stats_error, len(test_results))

        self.latest_result = float(results.iloc[-1])
        # Look the date column up by name so the build id can never be
        # reported as the date.
        self.latest_result_date = str(test_results_df[date_column].iloc[-1])
        self.results_df = test_results_df
        self.stats = stats


class Setup:
    """Aggregate, summarise and plot the results of every test run on one setup.

    Attributes:
        name: setup name; used in plot titles and as output file-name prefix.
        end_date: date string closing the plotted period. ``'-23:59'`` is
            appended to it and the result is parsed as ``%d-%m-%Y-%H:%M`` or,
            failing that, ``%Y-%m-%d-%H:%M``.
        tests: ``Test`` objects created by ``analyze_all_tests``.
        all_tests_data_table: outer merge of every test's results table.
        setup_trend_stats: one row of summary statistics per test.
        latest_test_results: latest result of every test, with the columns
            test name | MPPS | date.
        raw_setup_data: mapping of test name to its raw query records.
        test_names: list of the test names found in ``raw_setup_data``.
    """

    # Label of the MPPS column in the latest-results table. The backslash is
    # escaped so the text is unchanged but no invalid escape sequence is used.
    _MPPS_COLUMN = 'MPPS\\Core (Norm)'

    def __init__(self, name, end_date, raw_setup_data):
        self.name = name
        self.end_date = end_date  # string of date
        self.tests = []  # list of test objects
        self.all_tests_data_table = pd.DataFrame()  # dataframe
        self.setup_trend_stats = pd.DataFrame()  # dataframe
        self.latest_test_results = pd.DataFrame()  # dataframe
        self.raw_setup_data = raw_setup_data  # dictionary
        self.test_names = list(raw_setup_data.keys())  # list of names (a real list on Python 3 too)

    def analyze_all_tests(self):
        """Create and analyze one ``Test`` per entry of ``raw_setup_data``."""
        for test_name in self.test_names:
            t = Test(test_name, self.name, self.end_date)
            t.analyze_all_test_data(self.raw_setup_data[test_name])
            self.tests.append(t)

    def analyze_latest_test_results(self):
        """Collect the latest result of every test into ``latest_test_results``.

        The table is indexed from 1 and its columns are ordered
        test name | MPPS | date.
        """
        test_names = [test.name for test in self.tests]
        test_dates = [test.latest_result_date for test in self.tests]
        test_latest_results = [test.latest_result for test in self.tests]
        # The column order is given by name; selecting string-labelled columns
        # with integer positions raises KeyError on current pandas.
        self.latest_test_results = pd.DataFrame(
            {'Date': test_dates, 'Test Name': test_names, self._MPPS_COLUMN: test_latest_results},
            columns=['Test Name', self._MPPS_COLUMN, 'Date'],
            index=range(1, len(test_latest_results) + 1))

    def analyze_all_tests_stats(self):
        """Tabulate each test's ``stats`` tuple into ``setup_trend_stats``."""
        test_names = [test.name for test in self.tests]
        all_test_stats = [test.stats for test in self.tests]
        self.setup_trend_stats = pd.DataFrame(all_test_stats, index=test_names,
                                              columns=['Avg MPPS/Core (Norm)', 'Min', 'Max', 'Std', 'Error (%)',
                                                       'Total Results'])
        self.setup_trend_stats.index.name = 'Test Name'

    def analyze_all_tests_trend(self):
        """Outer-merge every test's results table into ``all_tests_data_table``.

        With no tests the table is left as an empty DataFrame.
        """
        from functools import reduce  # a builtin on Python 2 only

        all_tests_trend_data = [test.results_df for test in self.tests]
        if not all_tests_trend_data:
            self.all_tests_data_table = pd.DataFrame()
            return
        self.all_tests_data_table = reduce(lambda left, right: pd.merge(left, right, how='outer'),
                                           all_tests_trend_data)

    @staticmethod
    def _timestamp_to_num(timestamp):
        """Convert a day-first or year-first timestamp string to a matplotlib date number."""
        try:
            parsed = datetime.strptime(timestamp, '%d-%m-%Y-%H:%M')
        except ValueError:
            # Not day-first: fall back to the year-first layout.
            parsed = datetime.strptime(timestamp, '%Y-%m-%d-%H:%M')
        return matdates.date2num(parsed)

    def plot_trend_graph_all_tests(self, save_path='', file_name='_trend_graph.png'):
        """Plot the results of every test over time on one graph.

        :param save_path: directory for the outputs. When given, the graph is
            saved as ``<name><file_name>``, the trend statistics (if any) as
            ``<name>_trend_stats.csv``, and all figures are then closed.
        :param file_name: suffix of the saved image file.

        Does nothing when the setup has no tests.
        """
        if not self.tests:
            return
        start_date = ''
        for test in self.tests:
            # Columns are looked up by name so the result does not depend on
            # how the results table happens to order them.
            test_data = test.results_df[test.name].tolist()
            test_time_stamps = test.results_df[test.name + ' Date'].tolist()
            start_date = test_time_stamps[0]
            # Extend the line to the end of the period by repeating the last value.
            test_time_stamps.append(self.end_date + '-23:59')
            test_data.append(test_data[-1])
            float_test_time_stamps = [self._timestamp_to_num(ts) for ts in test_time_stamps]
            plt.plot_date(x=float_test_time_stamps, y=test_data, label=test.name, fmt='.-', xdate=True)
        plt.legend(fontsize='small', loc='best')
        plt.ylabel('MPPS/Core (Norm)')
        plt.title('Setup: ' + self.name)
        # Booleans instead of the string 'off', which is truthy and therefore
        # does not hide the ticks wherever it is not special-cased.
        plt.tick_params(
            axis='x',
            which='both',
            bottom=False,
            top=False,
            labelbottom=False)
        # start_date[:-6] drops the trailing '-HH:MM' part of the timestamp.
        plt.xlabel('Time Period: ' + start_date[:-6] + ' - ' + self.end_date)
        if save_path:
            plt.savefig(os.path.join(save_path, self.name + file_name))
            if not self.setup_trend_stats.empty:
                (self.setup_trend_stats.round(2)).to_csv(os.path.join(save_path, self.name +
                                                                      '_trend_stats.csv'))
            plt.close('all')

    def plot_latest_test_results_bar_chart(self, save_path='', img_file_name='_latest_test_runs.png',
                                           stats_file_name='_latest_test_runs_stats.csv'):
        """Draw a bar chart of the latest result of every test.

        :param save_path: directory for the outputs. When given, the chart is
            saved as ``<name><img_file_name>`` and the table as
            ``<name><stats_file_name>``.
        :param img_file_name: suffix of the saved image file.
        :param stats_file_name: suffix of the saved CSV file.

        All figures are closed before returning.
        """
        plt.figure()
        colors_for_bars = ['b', 'g', 'r', 'c', 'm', 'y']
        # plot only mpps data, which is in column 1 (selected by position)
        self.latest_test_results.iloc[:, [1]].plot(kind='bar', legend=False,
                                                   color=colors_for_bars)
        plt.xticks(rotation='horizontal')
        plt.xlabel('Index of Tests')
        plt.ylabel('MPPS/Core (Norm)')
        plt.title("Test Runs for Setup: " + self.name)
        if save_path:
            plt.savefig(os.path.join(save_path, self.name + img_file_name))
            (self.latest_test_results.round(2)).to_csv(
                os.path.join(save_path, self.name + stats_file_name))
        plt.close('all')

    def analyze_all_setup_data(self):
        """Run every analysis step: tests, latest results, statistics, trend."""
        self.analyze_all_tests()
        self.analyze_latest_test_results()
        self.analyze_all_tests_stats()
        self.analyze_all_tests_trend()

    def plot_all(self, save_path=''):
        """Produce the latest-results bar chart and then the trend graph."""
        self.plot_latest_test_results_bar_chart(save_path)
        self.plot_trend_graph_all_tests(save_path)


def latest_runs_comparison_bar_chart(setup_name1, setup_name2, setup1_latest_result, setup2_latest_result,
                                     save_path=''
                                     ):
    """Plot the latest results of two setups side by side as a bar chart.

    :param setup_name1: label for the first setup's results column.
    :param setup_name2: label for the second setup's results column.
    :param setup1_latest_result: DataFrame whose first two columns are the
        test name and the MPPS value.
    :param setup2_latest_result: DataFrame whose first three columns are the
        test name, the MPPS value and the date.
    :param save_path: directory for the outputs. When given, the chart is
        saved as ``_comparison.png``, the merged table as
        ``_comparison_stats_table.csv``, and all figures are then closed.

    Only tests present in both tables are compared (inner merge on the name).
    """
    # .iloc selects by position explicitly; indexing string-labelled columns
    # with a list of integers raises KeyError on current pandas. The copies
    # keep the renaming below from touching the callers' tables.
    s1_res = setup1_latest_result.iloc[:, [0, 1]].copy()  # column0 is test name, column1 is MPPS\Core
    s2_res = setup2_latest_result.iloc[:, [0, 1, 2]].copy()  # column0 is test name, column1 is MPPS\Core, column2 is Date
    s1_res.columns = ['Test Name', setup_name1]
    s2_res.columns = ['Test Name', setup_name2, 'Date']
    compare_dframe = pd.merge(s1_res, s2_res, on='Test Name')
    compare_dframe.plot(kind='bar')
    plt.legend(fontsize='small', loc='best')
    plt.xticks(rotation='horizontal')
    plt.xlabel('Index of Tests')
    plt.ylabel('MPPS/Core (Norm)')
    plt.title("Comparison between " + setup_name1 + " and " + setup_name2)
    if save_path:
        plt.savefig(os.path.join(save_path, "_comparison.png"))
        compare_dframe = compare_dframe.round(2)
        compare_dframe.to_csv(os.path.join(save_path, '_comparison_stats_table.csv'))
        # The figure has been written to disk; release it.
        plt.close('all')

        # WARNING: if the file _detailed_table.csv already exists in save_path, create_all_data() deletes it
        # before writing a new one, so that data from earlier runs does not pile up


def create_all_data(ga_data, end_date, save_path='', detailed_test_stats='',
                    comparison_setups=('trex07', 'trex08'),
                    comparison_labels=('Mellanox ConnectX-4', 'Intel XL710')):
    """Analyze and plot every setup, then compare the latest runs of two of them.

    :param ga_data: mapping of setup name to that setup's raw data (itself a
        mapping of test name to query records).
    :param end_date: date string passed to every ``Setup``.
    :param save_path: directory that receives the plots and CSV files.
    :param detailed_test_stats: when truthy, the results tables of all setups
        are also written to ``_detailed_table.csv`` in ``save_path``; an
        existing file of that name is deleted first.
    :param comparison_setups: names of the two setups to compare.
    :param comparison_labels: labels used for those two setups in the
        comparison chart, in the same order.
    :raises KeyError: if either name in ``comparison_setups`` is not a key of
        ``ga_data``.
    """
    all_setups = {}
    all_setups_data = []
    for setup_name in ga_data.keys():
        s = Setup(setup_name, end_date, ga_data[setup_name])
        s.analyze_all_setup_data()
        s.plot_all(save_path)
        all_setups_data.append(s.all_tests_data_table)
        all_setups[setup_name] = s

    if detailed_test_stats:
        detailed_table_path = os.path.join(save_path, '_detailed_table.csv')
        if os.path.exists(detailed_table_path):
            os.remove(detailed_table_path)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        # concat rejects an empty list, so fall back to an empty table.
        if all_setups_data:
            all_setups_data_dframe = pd.concat(all_setups_data)
        else:
            all_setups_data_dframe = pd.DataFrame()
        all_setups_data_dframe.to_csv(detailed_table_path)

    first_setup_name, second_setup_name = comparison_setups
    first_label, second_label = comparison_labels
    first_setup = all_setups[first_setup_name]
    second_setup = all_setups[second_setup_name]
    latest_runs_comparison_bar_chart(first_label,
                                     second_label, first_setup.latest_test_results,
                                     second_setup.latest_test_results,
                                     save_path=save_path)