-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathdataset.py
63 lines (50 loc) · 2.47 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import numpy as np
import scipy
from scipy.linalg import expm
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
def breast_cancer(training_size, test_size, n, PLOT_DATA=True):
class_labels = [r'Benign', r'Malignant']
# First the dataset must be imported.
cancer = datasets.load_breast_cancer()
# To find if the classifier is accurate, a common strategy is
# to divide the dataset into a training set and a test set.
# Here the data is divided into 70% training, 30% testing.
X_train, X_test, Y_train, Y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=109)
# Now the dataset's features will be standardized
# to fit a normal distribution.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# To be able to use this data with the given
# number of qubits, the data must be broken down from
# 30 dimensions to `n` dimensions.
# This is done with Principal Component Analysis (PCA),
# which finds patterns while keeping variation.
pca = PCA(n_components=n).fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
# The last step in the data processing is
# to scale the data to be between -1 and 1
samples = np.append(X_train, X_test, axis=0)
minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
X_train = minmax_scale.transform(X_train)
X_test = minmax_scale.transform(X_test)
# Now some sample should be picked to train the model from
training_input = {key: (X_train[Y_train == k, :])[:training_size] for k, key in enumerate(class_labels)}
test_input = {key: (X_train[Y_train == k, :])[training_size:(
training_size+test_size)] for k, key in enumerate(class_labels)}
if PLOT_DATA:
for k in range(0, 2):
x_axis_data = X_train[Y_train == k, 0][:training_size]
y_axis_data = X_train[Y_train == k, 1][:training_size]
label = 'Malignant' if k is 1 else 'Benign'
plt.scatter(x_axis_data, y_axis_data, label=label)
plt.title("Breast Cancer Dataset (Dimensionality Reduced With PCA)")
plt.legend()
plt.show()
return X_train, training_input, test_input, class_labels