-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexample.py
35 lines (27 loc) · 1.2 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from dsbox.datapreprocessing.featurizer.multiTable import MultiTableFeaturization
from d3m.container import List
from d3m.metadata import hyperparams
from typing import Union
class Hyperparams(hyperparams.Hyperparams):
    """Empty hyper-parameter set: this example declares no hyper-parameters of its own."""
from dsbox.datapreprocessing.featurizer.multiTable import MultiTableFeaturization
from os import listdir
import pandas as pd
# step 1: prepare inputs
from os.path import isfile, join  # local import: only this script section needs it

# Directory that holds the table files of the dataset to featurize.
# The commented-out values below select the alternative "financial" dataset.
# data_path = "/Users/luofanghao/work/USC_lab/isi-II/work/DSBox_project/multiple_table/test_data/financial/"
data_path = "/Users/luofanghao/work/USC_lab/isi-II/work/DSBox_project/multiple_table/test_data/mutagenesis/"
# Master table and its primary key.
# NOTE(review): the value looks like "<table file name>_<column name>" - confirm
# against MultiTableFeaturization before changing it.
master_col_name = "molecule.csv_molecule_id"
# master_col_name = "loan.csv_account_id" # master table and its primary key

# os.listdir() returns every directory entry in arbitrary order, so stray
# entries (e.g. ".DS_Store" on macOS, sub-directories) would otherwise be fed
# to pd.read_csv. Keep only regular ".csv" files and sort them so each run
# loads the same tables in the same order.
tables_names = sorted(
    entry
    for entry in listdir(data_path)
    if entry.lower().endswith(".csv") and isfile(join(data_path, entry))
)
if not tables_names:
    # Fail here with a clear message instead of handing the featurizer no tables.
    raise FileNotFoundError("no .csv tables found in {}".format(data_path))

data = List()   # one DataFrame per table
names = List()  # table file names, in the same order as `data`
for table_name in tables_names:
    # join() is correct whether or not data_path ends with a path separator.
    data.append(pd.read_csv(join(data_path, table_name)))
    names.append(table_name)
print("tables are: {}".format(names))
# The master table_column name is appended after the table names, as the last
# element of `names`.
names.append(master_col_name)

# step 2: use featurizer
featurizer = MultiTableFeaturization(hyperparams=Hyperparams())
result = featurizer.produce(inputs=(data, names))
# Write the produced value to a CSV file in the current working directory.
result.value.to_csv("featureized_example.csv", index=False)