create_dataset.py
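"""Build one CSV dataset from UniSim-exported Excel workbooks.

Summary inferred from the code in this module: every ``*.xltm`` / ``*.xlsx``
file in the input directory is read, the rows listed in ``rows.csv`` are
looked up in each tested configuration column, percentages flagged in the
spec are rescaled to fractions, and the combined table is written to the
output CSV (one row per configuration).
"""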
import numpy as np
import pandas as pd
import sys
import glob
import os
import re
import xmltodict


def main(input_dir, output_file):
    # Collect every exported workbook (.xltm / .xlsx) in the input directory.
    files = glob.glob("%s/*.xltm" % input_dir) + glob.glob("%s/*.xlsx" % input_dir)
    final_df = create_df(files)
    final_df.to_csv(output_file, index=False)
    print("Dataset size: %d" % final_df.shape[0])
def create_df(files):
    rows_spec_df = read_rows_spec()
    sdfs = []
    for file in files:
        print("Processing %s" % file)
        # Skip Excel lock/temporary files (names starting with '~').
        if os.path.basename(file).startswith('~'):
            continue
        df = pd.read_excel(file)
        # Key each row by the tuple parsed from its XML descriptor.
        df.index = read_tags(df['Unnamed: 0'])
        # Keep data columns (index >= 4) that actually hold a value in row 4.
        col_ix = ~pd.isnull(df.iloc[4]) & (np.arange(df.shape[1]) >= 4)
        print("Number of configs tested: %d" % np.sum(col_ix))
        sdf = df.loc[rows_spec_df.index, df.columns[col_ix]].copy()
        # Drop configuration columns containing the '<empty>' placeholder.
        empty_ix = np.sum(sdf == '<empty>', axis=0) > 0
        if np.sum(empty_ix) == 0:
            sdf = sdf.astype(float)
        else:
            sdf = sdf[sdf.columns[~empty_ix]].astype(float)
        assert np.all(sdf.index == rows_spec_df.index)
        sdf.columns = np.arange(len(sdf.columns))
        sdf.index = rows_spec_df['alias']
        print(" Done")
        sdfs.append(sdf.T)
    final_df = pd.concat(sdfs, axis=0)
    # Rescale percentage rows flagged in the spec to fractions.
    perc_cols = list(rows_spec_df[rows_spec_df['perc_to_frac'] == 1]['alias'])
    final_df[perc_cols] = final_df[perc_cols] / 100
    return final_df
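

# Illustrative example (hypothetical names, not taken from a real workbook) of
# the XML stored in the descriptor column; read_tags() parses cells of this
# shape with xmltodict and keys on the uop* attributes:
#
#   <UnisimElement uopUnisimObjectName="E-100"
#                  uopUnisimObjectPath="/Flowsheet/E-100" />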
def read_tags(rows):
    new_rows = []
    for i, row in enumerate(rows):
        # Non-string cells (e.g. NaN) get a unique placeholder key.
        if not isinstance(row, str):
            new_rows.append((None, None, i))
            continue
        o = xmltodict.parse(row)
        tag_name = ''
        attr_name = ''
        attr_value = ''
        obj_path = ''
        if 'UnisimTag' in o:
            tag_name = 'UnisimTag'
        elif 'UnisimElement' in o:
            tag_name = 'UnisimElement'
        # Both tag types are assumed to carry the uop* name and path attributes.
        attr_name = '@uopUnisimObjectName'
        attr_value = o[tag_name][attr_name]
        obj_path_attr = '@uopUnisimObjectPath'
        obj_path = o[tag_name][obj_path_attr]
        new_rows.append((tag_name, attr_name, attr_value, obj_path))
    # Report duplicate keys before asserting that all keys are unique.
    s = pd.Series(new_rows)
    for r in s[s.duplicated()]:
        print(r)
    assert len(new_rows) == len(set(new_rows))
    return new_rows
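

# rows.csv is expected (inferred from the code that consumes it) to provide at
# least these columns: tag_name, attr_name, attr_value and obj_path (used to
# build the lookup key), alias (the output column name), and perc_to_frac
# (1 when the value is a percentage that should be rescaled to a fraction).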
def read_rows_spec():
    df = pd.read_csv('rows.csv')
    df.index = list(zip(df['tag_name'], df['attr_name'], df['attr_value'], df['obj_path']))
    return df


def is_float(element):
    # Helper (currently unused in this module): can the value parse as a float?
    try:
        float(element)
        return True
    except ValueError:
        return False
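

# Example invocation (hypothetical paths):
#   python create_dataset.py datasets/ dataset.csv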
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])