# dataset_maker.py - committed by Jean-Didier Totow
import os, json, time
import pandas as pd 
from datetime import datetime

# Root directory walked by get_all_files().
dataset_folder = "."

# CSV inputs that readFiles() merges into a single dataset.
files = [
    './EstimatedRemainingTimeContext.csv',
    './SimulationLeftNumber.csv',
    './SimulationElapsedTime.csv',
    './NotFinishedOnTime.csv',
    './MinimumCoresContext.csv',
    './NotFinished.csv',
    './WillFinishTooSoonContext.csv',
    './NotFinishedOnTimeContext.csv',
    './MinimumCores.csv',
    './ETPercentile.csv',
    './RemainingSimulationTimeMetric.csv',
    './TotalCores.csv',
]

# Largest gap, in whole seconds, between a sample and an existing row for the
# sample to be merged into that row (see Row.canBeAdded).
tolerance = 1

# Columns kept from every input CSV. readFiles() reads them by position, so
# the order (name, time, value) matters.
field_to_keep = ["name", "time", "value"]

# Feature names in first-seen order; filled by readFiles() and used as the
# column order of the generated CSV.
features_list = []

class Row:
    """One output line: a whole-second timestamp plus named feature values.

    The class reads two module-level names: ``tolerance`` (merge window used
    by ``canBeAdded``) and ``features_list`` (column order used by ``toRow``).

    NOTE(review): the trailing ``Z`` of the timestamp is matched as a literal
    character, so the parsed datetime is naive and ``timestamp()`` interprets
    it in the machine's local timezone rather than UTC. ``Dataset.addRow``
    parses the same way, so the two stay consistent with each other.
    """

    def __init__(self, _time, features):
        """Parse ``_time`` and keep a reference to ``features``.

        Args:
            _time: Text shaped like ``2021-03-04T05:06:07.250Z``.
            features: Dict mapping feature name to value. It is stored as-is
                (not copied), so later ``addFeatures`` calls mutate it.

        Raises:
            ValueError: If ``_time`` does not match the expected format.
            TypeError: If ``_time`` is not a string.
        """
        parsed = datetime.strptime(_time, '%Y-%m-%dT%H:%M:%S.%fZ')
        self._obj = parsed
        # int() drops the fractional second, giving whole-second resolution.
        self.time = int(parsed.timestamp())
        self.features = features

    def getTime(self):
        """Return the row's timestamp as integer seconds."""
        return self.time

    def addFeatures(self, field_name, filed_value):
        """Store ``filed_value`` under ``field_name``, replacing any earlier value."""
        self.features[field_name] = filed_value

    def getFeatures(self):
        """Return the underlying feature dict (the live object, not a copy)."""
        return self.features

    def canBeAdded(self, _time):
        """Tell whether ``_time`` lies within ``tolerance`` seconds of this row."""
        distance = abs(self.time - _time)
        return distance <= tolerance

    def toRow(self):
        """Render the row as one CSV line without a trailing newline.

        The line starts with the timestamp, followed by one value per entry
        of ``features_list``; features this row lacks are written as ``null``.
        """
        pieces = ["{0},".format(self.time)]
        for key in features_list:
            if key not in self.features:
                pieces.append("null,")
                continue
            pieces.append("{0}".format(self.features[key]) + ",")
        # Every piece ends with a comma; drop the final one.
        return "".join(pieces)[:-1]

class Dataset():
    """Collection of ``Row`` objects keyed by their whole-second timestamp.

    Samples whose timestamps fall within the merge window of an existing row
    (as decided by ``Row.canBeAdded``) are folded into that row; otherwise a
    new row is created. The class relies on the module-level ``Row`` class
    and on ``features_list`` for the CSV column order.
    """

    def __init__(self):
        # Timestamp (int seconds) -> Row. Iteration order is insertion order
        # until sortRows() is called.
        self.rows = {}
        # Number of rows created through addRow().
        self.size = 0

    def getRow(self, _time):
        """Return the first stored row that accepts ``_time``, or ``None``.

        Rows are examined in the dict's current iteration order, so when
        several rows are within the merge window the earliest one wins.
        """
        for row in self.rows.values():
            if row.canBeAdded(_time):
                return row
        return None

    def sortRows(self):
        """Order the rows by timestamp, newest first, and return them as a list.

        The stored mapping is reordered as well, so a following ``build()``
        or ``debug()`` emits the rows in sorted order. Previously only a
        sorted copy was returned, which had no effect when the caller
        ignored the return value.
        """
        ordered = sorted(
            self.rows.items(),
            key=lambda item: item[1].getTime(),
            reverse=True,
        )
        # Dicts keep insertion order, so rebuilding fixes the iteration order.
        self.rows = dict(ordered)
        return [row for _key, row in ordered]

    def addRow(self, _time, name, value):
        """Record ``value`` for feature ``name`` at timestamp ``_time``.

        Args:
            _time: Timestamp text in the format accepted by ``Row``.
            name: Feature name.
            value: Feature value.

        Raises:
            TypeError: If ``_time`` is not a string.
            ValueError: If ``_time`` does not match the expected format.
        """
        if not isinstance(_time, str):
            # Fail early with the offending value instead of printing a debug
            # message and crashing inside strptime.
            raise TypeError(
                "timestamp must be a string, got {0!r}".format(_time)
            )
        # Let Row do the parsing so the key and the row's time always agree.
        candidate = Row(_time, {})
        _timestamp = candidate.getTime()
        row = self.getRow(_timestamp)
        if row is None:
            row = candidate
            self.rows[_timestamp] = row
            self.size += 1
        row.addFeatures(name, value)

    def debug(self):
        """Print every key followed by its rendered row."""
        for k, row in self.rows.items():
            print(k)
            print(row.toRow())

    def build(self, path="dataset.csv"):
        """Write the dataset as CSV to ``path``.

        The header line (``time`` followed by every name in ``features_list``)
        is always written, even when the dataset has no rows. Rows follow in
        the mapping's current iteration order.

        Args:
            path: Destination file; defaults to ``dataset.csv`` in the
                current working directory.
        """
        lines = [",".join(["time"] + list(features_list))]
        lines.extend(row.toRow() for row in self.rows.values())
        # The context manager closes the file even if writing fails.
        with open(path, "w") as out:
            out.write("\n".join(lines) + "\n")

    def getSize(self):
        """Return the number of rows created so far."""
        return self.size

def get_all_files():
    """Return the path of every file found under ``dataset_folder``.

    The tree is walked recursively with ``os.walk``. Each path is the
    containing directory joined to the file name with a ``/``.
    """
    collected = []
    for current_dir, _subdirs, names in os.walk(dataset_folder):
        collected.extend(current_dir + '/' + name for name in names)
    return collected

def _read_csv_tolerant(path):
    """Load ``path`` with pandas, warning about and skipping malformed lines.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
    ``on_bad_lines="warn"`` is its replacement with the same skip-and-warn
    behaviour. Older pandas rejects the new keyword with ``TypeError``, in
    which case the legacy keyword is used instead.
    """
    try:
        return pd.read_csv(path, on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines. On newer pandas this retry
        # raises again and the chained traceback still shows the first error.
        return pd.read_csv(path, error_bad_lines=False)


def readFiles():
    """Merge every CSV listed in ``files`` and write the combined dataset.

    For each input file the columns named in ``field_to_keep`` are selected
    and read by position: name, time, value. Unseen feature names are
    appended to the module-level ``features_list`` in first-seen order, and
    each sample is added to a ``Dataset``, which is finally written out with
    ``Dataset.build()``.
    """
    dataset = Dataset()
    for _file in files:
        df = _read_csv_tolerant(_file)
        df = df[field_to_keep]
        print(df)
        for row in df.values:
            name, _time, value = row[0], row[1], row[2]
            if name not in features_list:
                features_list.append(name)
            dataset.addRow(_time, name, value)
    # NOTE(review): the list returned by sortRows() is not used here, so this
    # call only matters if sortRows() reorders the dataset in place.
    dataset.sortRows()
    dataset.build()



# Script entry point. This call sits at module level, so the whole pipeline
# runs whenever the file is executed and also whenever it is imported.
readFiles()