-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathgenerate_data.py
238 lines (196 loc) · 8.33 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# Copyright (C) 2021 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""Generate_data
Provides the following base functionality:
1. Generating synthetic data
2. Training a logistic regression model based on Efficient logistic regression training by
Bergamaschi et al. (https://eprint.iacr.org/2019/425)
3. Saving the trained model and generated data for the LRHE example
"""
from enum import Enum
import csv
import argparse
import numpy as np
import lr_base as lrb
import sklearn.datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
def doTrain(Xtrain, ytrain, Xtest, ytest, epochs=10, verbose=False):
    """Efficient logistic regression training

    Efficient logistic regression training by Bergamaschi et al.
    (https://eprint.iacr.org/2019/425).
    Provides a fast/efficient logistic regression training with cleartext data.

    Each epoch takes one gradient step from the current point ``v`` with a
    decaying learning rate, then blends the new and previous gradient-step
    results using a "smoothing" coefficient derived from the ``lmbda``
    recurrence (this looks like Nesterov-style acceleration - confirm against
    the paper before relying on that name).

    Args:
        Xtrain (numpy.array): Training data samples in numpy 2d array
        ytrain (numpy.array): Target of training set
        Xtest (numpy.array): Test data samples for validation in numpy 2d array
        ytest (numpy.array): Target of test set for validation
        epochs (int): Number of training epochs. Default = 10
        verbose (bool): Set to True for printing training progress. Default = False

    Returns:
        bias (float): bias of logistic regression trained model
        weights (numpy.array): weights of logistic regression trained model
    """
    # Initial parameter vector comes from lr_base; element 0 is later
    # returned as the bias and the remainder as the weights.
    v = lrb.get_initweight(Xtrain, ytrain)
    w = v
    lmbda = 0

    # Progress is printed about 5 times for short runs and 10 times otherwise.
    # The interval is clamped to 1: without the clamp, epochs < 5 gives
    # epochs // 5 == 0 and the modulo below raises ZeroDivisionError.
    denom = 5 if epochs < 10 else 10
    report_every = max(1, epochs // denom)

    if verbose:
        print("== Logistic Regression Training ==")

    for i in range(epochs):
        # Learning rate decays as 10 / (epoch + 2).
        learning_rate = 10.0 / ((i + 1) + 1)

        # The smoothing factor uses the previous lmbda in the numerator and
        # the updated lmbda in the denominator, so compute it before rebinding.
        new_lmbda = (1.0 + np.sqrt(1 + 4 * lmbda**2)) / 2.0
        smoothing = (1 - lmbda) / new_lmbda
        lmbda = new_lmbda

        loss, dw = lrb.get_lgd(Xtrain, ytrain, v)
        new_w = v - learning_rate * dw
        new_v = (1 - smoothing) * new_w + smoothing * w

        if verbose and i % report_every == 0:
            # Accuracy is evaluated with the pre-update parameters, i.e. the
            # same point the reported loss was computed at.
            acc = lrb.test(Xtest, ytest, v)[1]
            print(f"Epoch: {i}, - loss: {loss} - acc: {acc}")

        v, w = new_v, new_w

    bias, weights = v[0], v[1:]
    return bias, weights
# Dataset split selector (train / test / eval)
class DataMode(Enum):
    """Identify which split of the dataset a csv file holds.

    The member *name* is what ``saveData`` appends to the file name prefix,
    so renaming a member changes the generated file names.
    """

    train = 0
    test = 1
    eval = 2
# Save data to csv file (Default suffix = _eval)
def saveData(dataName, X, y, datamode: DataMode = DataMode.eval):
    """Save data samples to csv file

    Stores the data samples to be used for the LRHE example. The file is
    named ``{dataName}_{suffix}.csv``. Its first row is a header
    (``feature_0`` ... ``feature_{n-1}``, ``target``) and every following row
    holds one sample with its target as the last column.

    Args:
        dataName (str): data name prefix
        X (numpy.array): data samples to be stored in 2d numpy array
        y (numpy.array): targets to be stored in a 1d numpy array
        datamode (DataMode): Determines the suffix [train, test, eval]. Default = DataMode.eval
    """
    n_features = X.shape[1]
    header = [f"feature_{i}" for i in range(n_features)] + ["target"]

    # Turn y into a column and append it to the right of X so each row is
    # [feature values..., target].
    rows = np.concatenate((X, np.transpose([y])), axis=1).tolist()

    # newline="" is required by the csv module: it writes its own "\r\n"
    # terminators, and without it text-mode translation on Windows would
    # produce "\r\r\n" (a blank line after every row).
    with open(f"{dataName}_{datamode.name}.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(header)
        writer.writerows(rows)
# Save model (combine [bias, weights...]) to csv file with suffix _lrmodel
def saveModel(dataName, b, w):
    """Save logistic regression model to csv file

    Stores the model to be used for the LRHE example. The file
    ``{dataName}_lrmodel.csv`` contains a single row: the bias followed by
    the weights.

    Args:
        dataName (str): data name prefix
        b (float): bias of LR model
        w (numpy.array): weights of LR model
    """
    lr_model = np.concatenate(([b], w), axis=0).tolist()

    # newline="" is required by the csv module: it writes its own "\r\n"
    # terminator, and without it text-mode translation on Windows would
    # produce "\r\r\n".
    with open(f"{dataName}_lrmodel.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(lr_model)
def generateSynData(nSamples, nFeatures, random_state=None):
    """Generate synthetic dataset

    Generates synthetic datasets with the use of sklearn.datasets.make_classification.
    All features are min-max scaled to [-1, 1] (the scaler is fitted on the
    whole dataset before splitting), then the dataset is split into
    train, test and eval with the ratio of 2:1:1.
    Note that this will generate purposely well-fitted data samples for LR training

    Args:
        nSamples (int): total number of data samples to generate
        nFeatures (int): number of features. Two of them are informative and
            none are redundant or repeated.
        random_state (int | None): seed forwarded to make_classification.
            Default = None, i.e. a different dataset on every call. The two
            stratified splits always use a fixed seed of 50.

    Returns:
        Xtrain (numpy.array): train dataset. 1/2 of nSamples
        ytrain (numpy.array): train target
        Xtest (numpy.array): test dataset. 1/4 of nSamples
        ytest (numpy.array): test target
        Xeval (numpy.array): eval dataset. 1/4 of nSamples
        yeval (numpy.array): eval target
    """
    x, y = sklearn.datasets.make_classification(
        n_samples=nSamples,
        n_features=nFeatures,
        n_classes=2,
        n_clusters_per_class=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        random_state=random_state,
    )

    scaler = MinMaxScaler(feature_range=(-1.0, 1.0))
    X_scaled = scaler.fit_transform(x)

    # First split: half for training, half held out.
    Xtrain, Xheld, ytrain, yheld = train_test_split(
        X_scaled, y, test_size=0.5, random_state=50, shuffle=True, stratify=y
    )
    # Second split: divide the held-out half evenly into test and eval.
    Xtest, Xeval, ytest, yeval = train_test_split(
        Xheld, yheld, test_size=0.5, random_state=50, shuffle=True, stratify=yheld
    )
    return Xtrain, ytrain, Xtest, ytest, Xeval, yeval
def parse_cmdline_args():
    """Parse the command line options of the data generation script.

    Returns:
        argparse.Namespace: with attributes ``samples`` (int, default 0),
        ``features`` (int, default 0), ``name`` (str or None) and
        ``verbose`` (bool, default False).
    """
    cli = argparse.ArgumentParser(
        description="Synthetic data generation and LR model training"
    )

    # (flags, add_argument keyword options), registered in --help order.
    option_table = (
        (
            ("--samples", "-s"),
            {"default": 0, "type": int, "help": "# of samples to generate."},
        ),
        (
            ("--features", "-f"),
            {"default": 0, "type": int, "help": "# of features"},
        ),
        (
            ("--name", "-n"),
            {"default": None, "help": "Data prefix"},
        ),
        (
            ("--verbose", "-v"),
            {"action": "store_true", "help": "Set to see training progress"},
        ),
    )
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
def _generate_train_store(dataname, n_samples, n_features, verbose):
    """Generate one dataset, train an LR model on it and store model + eval data."""
    # generates 4x samples - ratio (train:test:eval = 2:1:1), so the stored
    # eval split ends up with n_samples rows
    X_train, y_train, X_test, y_test, X_eval, y_eval = generateSynData(
        n_samples * 4, n_features
    )
    print(" - Training LR model...")
    bias, weights = doTrain(X_train, y_train, X_test, y_test, verbose=verbose)
    print(" - Storing LR model and eval data")
    saveModel(dataname, bias, weights)
    saveData(dataname, X_eval, y_eval)


def main(args):
    """Base script to generate samples for LRHE example

    This script generates a set of synthetic dataset with various sizes, train
    a logistic regression model with each data, then stores them in csv files.
    Generation and training happens during he-samples build time.
    This script can also be used to generate user-defined synthetic dataset.
    If --samples, --features and --name flags are all set via command line,
    it will instead generate a single synthetic data set and train LR model
    accordingly.

    Args:
        args: parsed command line options providing ``samples``, ``features``,
            ``name`` and ``verbose`` attributes.

    Raises:
        ValueError: if only some of --samples/--features/--name are given, or
            if --samples or --features is not positive.
    """
    use_presets = args.samples == 0 and args.features == 0 and args.name is None
    use_custom = args.samples > 0 and args.features > 0 and args.name is not None

    # Reject partial or invalid flag combinations before doing any work.
    if not (use_presets or use_custom):
        raise ValueError(
            "--samples, --features and --name must be given together: set all "
            "three (with positive --samples and --features) or none of them."
        )

    print("=== Synthetic data generation for logistic regression HE example ===")
    if use_presets:
        # (name prefix, number of features, number of eval samples)
        presets = (
            ("lrtest_small", 40, 500),
            ("lrtest_mid", 80, 2000),
            ("lrtest_large", 120, 10000),
            ("lrtest_xlarge", 200, 50000),
        )
        for dataname, n_features, n_samples in presets:
            print(" - Generating", dataname, "dataset")
            _generate_train_store(dataname, n_samples, n_features, args.verbose)
    else:
        print(
            " - Generating custom dataset :",
            args.name,
            " n_samples:",
            args.samples,
            " n_features:",
            args.features,
        )
        _generate_train_store(args.name, args.samples, args.features, args.verbose)
    print("=== Data generation complete ===")
if __name__ == "__main__":
    # Script entry point: parse the command line and run the generator.
    main(parse_cmdline_args())