Skip to content

Commit e85b9cb

Browse files
authored
Add files via upload
1 parent 2674287 commit e85b9cb

2 files changed

+244
-0
lines changed
+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# https://youtu.be/D8repXHkKdk
2+
3+
"""
4+
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
5+
## 'data.frame': 569 obs. of 31 variables:
6+
## $ diagnosis : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
7+
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
8+
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
9+
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
10+
## $ area_mean : num 1001 1326 1203 386 1297 ...
11+
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
12+
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
13+
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
14+
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
15+
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
16+
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
17+
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
18+
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
19+
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
20+
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
21+
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
22+
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
23+
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
24+
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
25+
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
26+
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
27+
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
28+
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
29+
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
30+
## $ area_worst : num 2019 1956 1709 568 1575 ...
31+
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
32+
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
33+
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
34+
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
35+
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
36+
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
37+
38+
39+
"""
40+
41+
import numpy as np
42+
import cv2
43+
import pandas as pd
44+
from matplotlib import pyplot as plt
45+
import seaborn as sns
46+
47+
# Load the Wisconsin breast cancer dataset and take a first look at it.
df = pd.read_csv("data/wisconsin_breast_cancer_dataset.csv")

# Summary statistics, one row per column.
# Values need to be normalized before fitting.
print(df.describe().T)

# Count of missing values per column.
print(df.isnull().sum())
#df = df.dropna()

# Rename the 'Diagnosis' column to 'Label' to make it easy to understand
df = df.rename(columns={'Diagnosis': 'Label'})
print(df.dtypes)

# Understand the data
# Each plot gets its own figure; otherwise the count plot and the histogram
# are drawn on top of each other on the same axes.
plt.figure()
sns.countplot(x="Label", data=df)  # M - malignant, B - benign

plt.figure()
# histplot is the replacement for the deprecated distplot(kde=False).
sns.histplot(df['radius_mean'], kde=False)

# 'Label' still holds strings at this point, and recent pandas versions raise
# when corr() meets a non-numeric column, so restrict to numeric columns.
# Compute the matrix once and reuse it for both the printout and the heatmap.
corrMatrix = df.select_dtypes(include="number").corr()
print(corrMatrix)

fig, ax = plt.subplots(figsize=(10, 10))  # Sample figsize in inches
#sns.heatmap(df.iloc[:, 1:6:], annot=True, linewidths=.5, ax=ax)
sns.heatmap(corrMatrix, annot=False, linewidths=.5, ax=ax)
70+
71+
72+
# Encode the labels, build the feature matrix, split, then scale.

# Replace categorical values with numbers
print(df['Label'].value_counts())

categories = {"B": 1, "M": 2}
# map() produces a numeric column directly; replace() on a string column
# depends on implicit downcasting that newer pandas versions deprecate.
df['Label'] = df['Label'].map(categories)
if df['Label'].isnull().any():
    # map() turns anything outside the dictionary into NaN; fail loudly here
    # rather than deep inside the classifier.
    raise ValueError("Unexpected value in 'Label' column; expected only 'B' or 'M'.")

# Define the dependent variable that needs to be predicted (labels)
Y = df["Label"].values

# Define the independent variables: every column except the label and the
# 'ID' column (presumably a record identifier rather than a measurement).
X = df.drop(labels=["Label", "ID"], axis=1)
features_list = list(X.columns)  # List features so we can rank them later.

# Split data into train and test to verify accuracy after fitting the model.
# The split happens BEFORE scaling so that the scaler never sees test rows;
# the same random_state selects the same rows as splitting scaled data would.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Scale every feature using the min/max learned from the training set only,
# then apply that same transformation to the test set.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
96+
97+
# RANDOM FOREST: fit on the training split, evaluate on the test split,
# then rank the features by importance.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=25, random_state=42)

# Train the model on training data
model.fit(X_train, y_train)

prediction = model.predict(X_test)

from sklearn import metrics
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction))

# Confusion Matrix
# scikit-learn convention: rows are TRUE labels, columns are PREDICTED labels.
# The label order is pinned (1 = benign, 2 = malignant) so the matrix is
# always 2x2 and the indexing below stays valid.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction, labels=[1, 2])
print(cm)

# Print individual accuracy values for each class, based on the confusion matrix.
# Per-class accuracy is "correctly predicted / all samples truly in the class",
# i.e. the diagonal entry divided by its ROW sum. Dividing by the column sum
# would give precision instead.
print("Benign = ", cm[0,0] / (cm[0,0]+cm[0,1]))
print("Malignant = ", cm[1,1] / (cm[1,0]+cm[1,1]))

# Rank features from most to least important according to the fitted forest.
feature_imp = pd.Series(model.feature_importances_, index=features_list).sort_values(ascending=False)
print(feature_imp)

tutorial77-SVM_breast_cancer.py

+121
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# https://youtu.be/dqb0oYZipCI
2+
3+
"""
4+
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
5+
## 'data.frame': 569 obs. of 31 variables:
6+
## $ diagnosis : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
7+
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
8+
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
9+
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
10+
## $ area_mean : num 1001 1326 1203 386 1297 ...
11+
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
12+
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
13+
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
14+
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
15+
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
16+
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
17+
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
18+
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
19+
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
20+
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
21+
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
22+
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
23+
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
24+
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
25+
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
26+
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
27+
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
28+
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
29+
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
30+
## $ area_worst : num 2019 1956 1709 568 1575 ...
31+
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
32+
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
33+
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
34+
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
35+
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
36+
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
37+
38+
39+
"""
40+
41+
import numpy as np
42+
import cv2
43+
import pandas as pd
44+
from matplotlib import pyplot as plt
45+
import seaborn as sns
46+
47+
# Load the Wisconsin breast cancer dataset and take a first look at it.
df = pd.read_csv("data/wisconsin_breast_cancer_dataset.csv")

# Summary statistics, one row per column.
# Values need to be normalized before fitting.
print(df.describe().T)

# Count of missing values per column.
print(df.isnull().sum())
#df = df.dropna()

# Rename the 'Diagnosis' column to 'Label' to make it easy to understand
df = df.rename(columns={'Diagnosis': 'Label'})
print(df.dtypes)

# Understand the data
# Each plot gets its own figure; otherwise the count plot and the histogram
# are drawn on top of each other on the same axes.
plt.figure()
sns.countplot(x="Label", data=df)  # M - malignant, B - benign

plt.figure()
# histplot is the replacement for the deprecated distplot(kde=False).
# The histogram is drawn once; the original repeated the identical call.
sns.histplot(df['radius_mean'], kde=False)

# 'Label' still holds strings at this point, and recent pandas versions raise
# when corr() meets a non-numeric column, so restrict to numeric columns.
# Compute the matrix once and reuse it for both the printout and the heatmap.
corrMatrix = df.select_dtypes(include="number").corr()
print(corrMatrix)

fig, ax = plt.subplots(figsize=(10, 10))  # Sample figsize in inches
#sns.heatmap(df.iloc[:, 1:6:], annot=True, linewidths=.5, ax=ax)
sns.heatmap(corrMatrix, annot=False, linewidths=.5, ax=ax)
71+
72+
73+
# Encode the labels, build the feature matrix, split, then scale.

# Replace categorical values with numbers
print(df['Label'].value_counts())

categories = {"B": 1, "M": 2}
# map() produces a numeric column directly; replace() on a string column
# depends on implicit downcasting that newer pandas versions deprecate.
df['Label'] = df['Label'].map(categories)
if df['Label'].isnull().any():
    # map() turns anything outside the dictionary into NaN; fail loudly here
    # rather than deep inside the classifier.
    raise ValueError("Unexpected value in 'Label' column; expected only 'B' or 'M'.")

# Define the dependent variable that needs to be predicted (labels)
Y = df["Label"].values

# Define the independent variables: every column except the label and the
# 'ID' column (presumably a record identifier rather than a measurement).
X = df.drop(labels=["Label", "ID"], axis=1)

# Split data into train and test to verify accuracy after fitting the model.
# The split happens BEFORE scaling so that the scaler never sees test rows;
# the same random_state selects the same rows as splitting scaled data would.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Scale every feature using the min/max learned from the training set only,
# then apply that same transformation to the test set.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
96+
97+
# SUPPORT VECTOR MACHINE: fit a linear SVM on the training split and
# evaluate it on the test split.
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC
#https://scikit-learn.org/stable/modules/svm.html
#from sklearn.svm import SVC
from sklearn import svm
# max_iter is set explicitly to 10000 for the LinearSVC solver.
model = svm.LinearSVC(max_iter=10000)
#model = SVC(kernel='linear', C=10, gamma=1000, max_iter=10000)
model.fit(X_train, y_train)

prediction = model.predict(X_test)

from sklearn import metrics
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction))

# Confusion Matrix
# scikit-learn convention: rows are TRUE labels, columns are PREDICTED labels.
# The label order is pinned (1 = benign, 2 = malignant) so the matrix is
# always 2x2 and the indexing below stays valid.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction, labels=[1, 2])
print(cm)

# Print individual accuracy values for each class, based on the confusion matrix.
# Per-class accuracy is "correctly predicted / all samples truly in the class",
# i.e. the diagonal entry divided by its ROW sum. Dividing by the column sum
# would give precision instead.
print("Benign = ", cm[0,0] / (cm[0,0]+cm[0,1]))
print("Malignant = ", cm[1,1] / (cm[1,0]+cm[1,1]))
119+
120+
121+

0 commit comments

Comments
 (0)