# https://youtu.be/dqb0oYZipCI

"""
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis              : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean            : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean           : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean         : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean              : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean        : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean       : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean         : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean    : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean          : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se              : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se             : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se           : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se                : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se          : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se         : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se           : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se      : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se            : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se   : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst           : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst          : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst        : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst             : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst       : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst      : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst        : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst   : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst         : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv("data/wisconsin_breast_cancer_dataset.csv")

print(df.describe().T)  #Values need to be normalized before fitting.


print(df.isnull().sum())
#df = df.dropna()

#Rename the Diagnosis column to Label to make it easier to understand
df = df.rename(columns={'Diagnosis':'Label'})
print(df.dtypes)

#Understand the data
sns.countplot(x="Label", data=df)  #M - malignant, B - benign

#distplot is deprecated in recent seaborn versions; histplot is its replacement
sns.histplot(df['radius_mean'], kde=False)

#Label is still a string at this point, so restrict the correlation to numeric columns
print(df.corr(numeric_only=True))

corrMatrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10,10))  # Sample figsize in inches
#sns.heatmap(df.iloc[:, 1:6:], annot=True, linewidths=.5, ax=ax)
sns.heatmap(corrMatrix, annot=False, linewidths=.5, ax=ax)
plt.show()

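# Optional aside (not in the original video): the heatmap suggests several features are
# nearly redundant (e.g. the *_mean and *_worst radius/perimeter/area columns). The few
# lines below list the most strongly correlated feature pairs; the 0.9 cutoff is an
# arbitrary illustrative choice.
upper = corrMatrix.where(np.triu(np.ones(corrMatrix.shape, dtype=bool), k=1))
high_corr_pairs = upper.stack().sort_values(ascending=False)
print(high_corr_pairs[high_corr_pairs.abs() > 0.9])
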
#Replace categorical values with numbers
print(df['Label'].value_counts())

categories = {"B":1, "M":2}
df['Label'] = df['Label'].replace(categories)


#Define the dependent variable that needs to be predicted (labels)
Y = df["Label"].values

#Define the independent variables. Also drop the ID column, which carries no predictive information.
X = df.drop(labels = ["Label", "ID"], axis=1)

#from sklearn.preprocessing import normalize
#X = normalize(X, axis=1)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

#Split data into train and test to verify accuracy after fitting the model.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

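# Optional variation (not in the original video): fitting the scaler on the full dataset
# lets information from the test rows influence the scaling. A common refinement is to
# split first and fit the scaler on the training portion only. The *_ns names below are
# just illustrative and are not used further in this script.
X_raw = df.drop(labels=["Label", "ID"], axis=1)
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(X_raw, Y, test_size=0.2, random_state=42)
scaler_ns = MinMaxScaler().fit(X_train_ns)
X_train_ns = scaler_ns.transform(X_train_ns)
X_test_ns = scaler_ns.transform(X_test_ns)
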
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC
#https://scikit-learn.org/stable/modules/svm.html
#from sklearn.svm import SVC
from sklearn import svm
model = svm.LinearSVC(max_iter=10000)
#model = SVC(kernel='linear', C=10, gamma=1000, max_iter=10000)
model.fit(X_train, y_train)

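# Optional aside (not part of the original tutorial): the commented-out SVC line above
# hard-codes C and gamma. A small grid search is one way to choose them from the training
# data instead; the parameter grid here is only an illustrative guess.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 0.01, 0.1, 1]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best SVC parameters:", grid.best_params_, " CV accuracy:", grid.best_score_)
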
prediction = model.predict(X_test)

from sklearn import metrics
print("Accuracy = ", metrics.accuracy_score(y_test, prediction))


#Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction)
print(cm)

#Print individual accuracy (recall) for each class; rows of the confusion matrix are the true classes
print("Benign = ", cm[0,0] / (cm[0,0]+cm[0,1]))
print("Malignant = ", cm[1,1] / (cm[1,0]+cm[1,1]))