# https://youtu.be/dqb0oYZipCI

"""
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis              : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean            : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean           : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean         : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean              : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean        : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean       : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean         : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean    : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean          : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se              : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se             : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se           : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se                : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se          : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se         : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se           : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se      : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se            : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se   : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst           : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst          : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst        : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst             : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst       : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst      : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst        : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst   : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst         : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv("data/wisconsin_breast_cancer_dataset.csv")

print(df.describe().T)  #Values need to be normalized before fitting.


print(df.isnull().sum())
#df = df.dropna()

#Rename the Diagnosis column to Label to make it easier to understand
df = df.rename(columns={'Diagnosis':'Label'})
print(df.dtypes)

#Understand the data
sns.countplot(x="Label", data=df)  #M - malignant, B - benign

#distplot is deprecated in recent seaborn versions; histplot is its replacement
sns.histplot(df['radius_mean'], kde=False)

#Label is still a string at this point, so restrict the correlation to numeric columns
print(df.corr(numeric_only=True))

corrMatrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10,10))  # Sample figsize in inches
#sns.heatmap(df.iloc[:, 1:6:], annot=True, linewidths=.5, ax=ax)
sns.heatmap(corrMatrix, annot=False, linewidths=.5, ax=ax)
plt.show()

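# Optional aside (not in the original video): the heatmap suggests several features are
# nearly redundant (e.g. the *_mean and *_worst radius/perimeter/area columns). The few
# lines below list the most strongly correlated feature pairs; the 0.9 cutoff is an
# arbitrary illustrative choice.
upper = corrMatrix.where(np.triu(np.ones(corrMatrix.shape, dtype=bool), k=1))
high_corr_pairs = upper.stack().sort_values(ascending=False)
print(high_corr_pairs[high_corr_pairs.abs() > 0.9])
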
#Replace categorical values with numbers
print(df['Label'].value_counts())

categories = {"B":1, "M":2}
df['Label'] = df['Label'].replace(categories)


#Define the dependent variable that needs to be predicted (labels)
Y = df["Label"].values

#Define the independent variables. Also drop the ID column, which carries no predictive information.
X = df.drop(labels = ["Label", "ID"], axis=1)

#from sklearn.preprocessing import normalize
#X = normalize(X, axis=1)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

#Split data into train and test to verify accuracy after fitting the model.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

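# Optional variation (not in the original video): fitting the scaler on the full dataset
# lets information from the test rows influence the scaling. A common refinement is to
# split first and fit the scaler on the training portion only. The *_ns names below are
# just illustrative and are not used further in this script.
X_raw = df.drop(labels=["Label", "ID"], axis=1)
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(X_raw, Y, test_size=0.2, random_state=42)
scaler_ns = MinMaxScaler().fit(X_train_ns)
X_train_ns = scaler_ns.transform(X_train_ns)
X_test_ns = scaler_ns.transform(X_test_ns)
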
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC
#https://scikit-learn.org/stable/modules/svm.html
#from sklearn.svm import SVC
from sklearn import svm
model = svm.LinearSVC(max_iter=10000)
#model = SVC(kernel='linear', C=10, gamma=1000, max_iter=10000)
model.fit(X_train, y_train)

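# Optional aside (not part of the original tutorial): the commented-out SVC line above
# hard-codes C and gamma. A small grid search is one way to choose them from the training
# data instead; the parameter grid here is only an illustrative guess.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 0.01, 0.1, 1]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best SVC parameters:", grid.best_params_, " CV accuracy:", grid.best_score_)
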
prediction = model.predict(X_test)

from sklearn import metrics
print("Accuracy = ", metrics.accuracy_score(y_test, prediction))


#Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction)
print(cm)

#Print individual accuracy (recall) for each class; rows of the confusion matrix are the true classes
print("Benign = ", cm[0,0] / (cm[0,0]+cm[0,1]))
print("Malignant = ", cm[1,1] / (cm[1,0]+cm[1,1]))