diff --git a/sravansmart7733@gmail.com/Classification-Imbalance.ipynb b/sravansmart7733@gmail.com/Classification-Imbalance.ipynb
new file mode 100644
index 000000000..558160b98
--- /dev/null
+++ b/sravansmart7733@gmail.com/Classification-Imbalance.ipynb
@@ -0,0 +1,944 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using TensorFlow backend.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " V1 | \n",
+ " V2 | \n",
+ " V3 | \n",
+ " V4 | \n",
+ " V5 | \n",
+ " V6 | \n",
+ " V7 | \n",
+ " V8 | \n",
+ " V9 | \n",
+ " ... | \n",
+ " V21 | \n",
+ " V22 | \n",
+ " V23 | \n",
+ " V24 | \n",
+ " V25 | \n",
+ " V26 | \n",
+ " V27 | \n",
+ " V28 | \n",
+ " Amount | \n",
+ " Class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " -1.359807 | \n",
+ " -0.072781 | \n",
+ " 2.536347 | \n",
+ " 1.378155 | \n",
+ " -0.338321 | \n",
+ " 0.462388 | \n",
+ " 0.239599 | \n",
+ " 0.098698 | \n",
+ " 0.363787 | \n",
+ " ... | \n",
+ " -0.018307 | \n",
+ " 0.277838 | \n",
+ " -0.110474 | \n",
+ " 0.066928 | \n",
+ " 0.128539 | \n",
+ " -0.189115 | \n",
+ " 0.133558 | \n",
+ " -0.021053 | \n",
+ " 149.62 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 1.191857 | \n",
+ " 0.266151 | \n",
+ " 0.166480 | \n",
+ " 0.448154 | \n",
+ " 0.060018 | \n",
+ " -0.082361 | \n",
+ " -0.078803 | \n",
+ " 0.085102 | \n",
+ " -0.255425 | \n",
+ " ... | \n",
+ " -0.225775 | \n",
+ " -0.638672 | \n",
+ " 0.101288 | \n",
+ " -0.339846 | \n",
+ " 0.167170 | \n",
+ " 0.125895 | \n",
+ " -0.008983 | \n",
+ " 0.014724 | \n",
+ " 2.69 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " -1.358354 | \n",
+ " -1.340163 | \n",
+ " 1.773209 | \n",
+ " 0.379780 | \n",
+ " -0.503198 | \n",
+ " 1.800499 | \n",
+ " 0.791461 | \n",
+ " 0.247676 | \n",
+ " -1.514654 | \n",
+ " ... | \n",
+ " 0.247998 | \n",
+ " 0.771679 | \n",
+ " 0.909412 | \n",
+ " -0.689281 | \n",
+ " -0.327642 | \n",
+ " -0.139097 | \n",
+ " -0.055353 | \n",
+ " -0.059752 | \n",
+ " 378.66 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ " -0.966272 | \n",
+ " -0.185226 | \n",
+ " 1.792993 | \n",
+ " -0.863291 | \n",
+ " -0.010309 | \n",
+ " 1.247203 | \n",
+ " 0.237609 | \n",
+ " 0.377436 | \n",
+ " -1.387024 | \n",
+ " ... | \n",
+ " -0.108300 | \n",
+ " 0.005274 | \n",
+ " -0.190321 | \n",
+ " -1.175575 | \n",
+ " 0.647376 | \n",
+ " -0.221929 | \n",
+ " 0.062723 | \n",
+ " 0.061458 | \n",
+ " 123.50 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2.0 | \n",
+ " -1.158233 | \n",
+ " 0.877737 | \n",
+ " 1.548718 | \n",
+ " 0.403034 | \n",
+ " -0.407193 | \n",
+ " 0.095921 | \n",
+ " 0.592941 | \n",
+ " -0.270533 | \n",
+ " 0.817739 | \n",
+ " ... | \n",
+ " -0.009431 | \n",
+ " 0.798278 | \n",
+ " -0.137458 | \n",
+ " 0.141267 | \n",
+ " -0.206010 | \n",
+ " 0.502292 | \n",
+ " 0.219422 | \n",
+ " 0.215153 | \n",
+ " 69.99 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 31 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time V1 V2 V3 V4 V5 V6 V7 \\\n",
+ "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
+ "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
+ "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
+ "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
+ "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
+ "\n",
+ " V8 V9 ... V21 V22 V23 V24 V25 \\\n",
+ "0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n",
+ "1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n",
+ "2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n",
+ "3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n",
+ "4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n",
+ "\n",
+ " V26 V27 V28 Amount Class \n",
+ "0 -0.189115 0.133558 -0.021053 149.62 0 \n",
+ "1 0.125895 -0.008983 0.014724 2.69 0 \n",
+ "2 -0.139097 -0.055353 -0.059752 378.66 0 \n",
+ "3 -0.221929 0.062723 0.061458 123.50 0 \n",
+ "4 0.502292 0.219422 0.215153 69.99 0 \n",
+ "\n",
+ "[5 rows x 31 columns]"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numpy as np # linear algebra\n",
+ "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
+ "import tensorflow as tf\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from sklearn.manifold import TSNE\n",
+ "from sklearn.decomposition import PCA, TruncatedSVD\n",
+ "import matplotlib.patches as mpatches\n",
+ "import time\n",
+ "\n",
+ "# Classifier Libraries\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "import collections\n",
+ "\n",
+ "\n",
+ "# Other Libraries\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline\n",
+ "from imblearn.over_sampling import SMOTE\n",
+ "from imblearn.under_sampling import NearMiss\n",
+ "from imblearn.metrics import classification_report_imbalanced\n",
+ "from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report\n",
+ "from collections import Counter\n",
+ "from sklearn.model_selection import KFold, StratifiedKFold\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "\n",
+ "# Location of the credit-card transactions CSV. Defaults to the original\n",
+ "# hard-coded path; set the CREDITCARD_CSV environment variable to override it\n",
+ "# so the notebook can run on machines without a D: drive.\n",
+ "import os\n",
+ "\n",
+ "DATA_PATH = os.environ.get('CREDITCARD_CSV', 'D:/creditcard.csv')\n",
+ "\n",
+ "df = pd.read_csv(DATA_PATH)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No Frauds 99.83 % of the dataset\n",
+ "Frauds 0.17 % of the dataset\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Share of each class in the full dataset, as a percentage rounded to 2 decimals.\n",
+ "class_counts = df['Class'].value_counts()\n",
+ "total_rows = len(df)\n",
+ "\n",
+ "print('No Frauds', round(class_counts[0]/total_rows * 100,2), '% of the dataset')\n",
+ "print('Frauds', round(class_counts[1]/total_rows * 100,2), '% of the dataset')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Class Distributions \\n (0: No Fraud || 1: Fraud)')"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar colours: blue for class 0 (no fraud), red for class 1 (fraud).\n",
+ "colors = [\"#0101DF\", \"#DF0101\"]\n",
+ "\n",
+ "# Pass the column as the keyword x=...: newer seaborn releases no longer accept\n",
+ "# the plotted variable as a positional argument.\n",
+ "sns.countplot(x='Class', data=df, palette=colors)\n",
+ "plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)\n",
+ "# Render the figure explicitly so the cell does not echo the Text(...) repr of the title.\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Features are every column except Class; Class is the target.\n",
+ "X = df.drop(columns='Class')\n",
+ "y = df['Class']\n",
+ "\n",
+ "# Hold out 25% of the rows for testing, with a fixed seed for a repeatable split.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X, y, test_size=0.25, random_state=27\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy Score : 99.92135052386169 %\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0 71108\n",
+ "1 94\n",
+ "Name: 0, dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Baseline: logistic regression fitted on the raw (imbalanced) training split.\n",
+ "lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)\n",
+ "\n",
+ "# Predict on the held-out test set\n",
+ "lr_pred = lr.predict(X_test)\n",
+ "\n",
+ "# Accuracy of the test-set predictions, printed as a percentage\n",
+ "a = accuracy_score(y_test, lr_pred)\n",
+ "print(\"Accuracy Score : \", a*100,\"%\")\n",
+ "\n",
+ "\n",
+ "# Count how many test rows were predicted as each class\n",
+ "predictions = pd.DataFrame(lr_pred)\n",
+ "predictions[0].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "F1 Score : 0.7522123893805309\n",
+ "Recall Score : 0.6439393939393939\n"
+ ]
+ }
+ ],
+ "source": [
+ "# F1 and recall of the logistic-regression predictions on the test split\n",
+ "for label, metric in ((\"F1 Score : \", f1_score), (\"Recall Score : \", recall_score)):\n",
+ "    print(label, metric(y_test, lr_pred))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "\n",
+ "# Random forest with 10 trees, fitted on the training split\n",
+ "rfc = RandomForestClassifier(n_estimators=10)\n",
+ "rfc.fit(X_train, y_train)\n",
+ "\n",
+ "# Class predictions for the held-out test rows\n",
+ "rfc_pred = rfc.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9995365298727564"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Accuracy of the random-forest predictions on the test split (shown as the cell output)\n",
+ "rfc_accuracy = accuracy_score(y_test, rfc_pred)\n",
+ "rfc_accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "F1 score : 0.8607594936708862\n",
+ "Recall score : 0.7727272727272727\n"
+ ]
+ }
+ ],
+ "source": [
+ "# F1 and recall of the random-forest predictions on the test split\n",
+ "for label, metric in ((\"F1 score : \", f1_score), (\"Recall score : \", recall_score)):\n",
+ "    print(label, metric(y_test, rfc_pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy Score : 99.7415802926884\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.ensemble import GradientBoostingClassifier\n",
+ "\n",
+ "# Small gradient-boosting model: 20 trees of depth 2, 2 candidate features per split\n",
+ "gb_clf = GradientBoostingClassifier(\n",
+ "    n_estimators=20,\n",
+ "    max_features=2,\n",
+ "    max_depth=2,\n",
+ "    random_state=0,\n",
+ ")\n",
+ "gb = gb_clf.fit(X_train, y_train)\n",
+ "gb_pred = gb.predict(X_test)\n",
+ "\n",
+ "# Accuracy on the test split, printed as a percentage\n",
+ "gb_accuracy = accuracy_score(y_test, gb_pred)\n",
+ "print(\"Accuracy Score : \", gb_accuracy*100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "F1 score : 0.17117117117117117\n",
+ "Recall score : 0.14393939393939395\n"
+ ]
+ }
+ ],
+ "source": [
+ "# F1 and recall of the gradient-boosting predictions on the test split\n",
+ "for label, metric in ((\"F1 score : \", f1_score), (\"Recall score : \", recall_score)):\n",
+ "    print(label, metric(y_test, gb_pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " V1 | \n",
+ " V2 | \n",
+ " V3 | \n",
+ " V4 | \n",
+ " V5 | \n",
+ " V6 | \n",
+ " V7 | \n",
+ " V8 | \n",
+ " V9 | \n",
+ " ... | \n",
+ " V21 | \n",
+ " V22 | \n",
+ " V23 | \n",
+ " V24 | \n",
+ " V25 | \n",
+ " V26 | \n",
+ " V27 | \n",
+ " V28 | \n",
+ " Amount | \n",
+ " Class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 59318 | \n",
+ " 48789.0 | \n",
+ " -0.892344 | \n",
+ " 1.725185 | \n",
+ " 2.014072 | \n",
+ " 2.969493 | \n",
+ " -0.358103 | \n",
+ " 0.208247 | \n",
+ " 0.173195 | \n",
+ " 0.508835 | \n",
+ " -1.421750 | \n",
+ " ... | \n",
+ " -0.219772 | \n",
+ " -0.467348 | \n",
+ " -0.120825 | \n",
+ " 0.396310 | \n",
+ " 0.197382 | \n",
+ " 0.121402 | \n",
+ " 0.220645 | \n",
+ " 0.091821 | \n",
+ " 8.27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 74794 | \n",
+ " 55760.0 | \n",
+ " -6.003422 | \n",
+ " -3.930731 | \n",
+ " -0.007045 | \n",
+ " 1.714669 | \n",
+ " 3.414667 | \n",
+ " -2.329583 | \n",
+ " -1.901512 | \n",
+ " -2.746111 | \n",
+ " 0.887673 | \n",
+ " ... | \n",
+ " 1.101671 | \n",
+ " -0.992494 | \n",
+ " -0.698259 | \n",
+ " 0.139898 | \n",
+ " -0.205151 | \n",
+ " -0.472412 | \n",
+ " 1.775378 | \n",
+ " -0.104285 | \n",
+ " 311.91 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 36734 | \n",
+ " 38667.0 | \n",
+ " 1.179743 | \n",
+ " -1.164141 | \n",
+ " 1.015352 | \n",
+ " -0.405885 | \n",
+ " -1.850985 | \n",
+ " -0.503236 | \n",
+ " -1.136412 | \n",
+ " 0.074776 | \n",
+ " 0.133414 | \n",
+ " ... | \n",
+ " 0.344607 | \n",
+ " 0.785058 | \n",
+ " -0.164374 | \n",
+ " 0.411598 | \n",
+ " 0.369617 | \n",
+ " -0.051243 | \n",
+ " 0.022254 | \n",
+ " 0.038167 | \n",
+ " 101.37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 120837 | \n",
+ " 75978.0 | \n",
+ " -5.140723 | \n",
+ " 3.568751 | \n",
+ " -5.896245 | \n",
+ " 4.164720 | \n",
+ " -4.091193 | \n",
+ " -1.989960 | \n",
+ " -5.472436 | \n",
+ " 2.422821 | \n",
+ " -2.909735 | \n",
+ " ... | \n",
+ " 1.131130 | \n",
+ " 0.118022 | \n",
+ " -0.332704 | \n",
+ " 0.139941 | \n",
+ " 0.324758 | \n",
+ " -0.180769 | \n",
+ " 0.177810 | \n",
+ " 0.661555 | \n",
+ " 99.90 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 6427 | \n",
+ " 7610.0 | \n",
+ " 0.725646 | \n",
+ " 2.300894 | \n",
+ " -5.329976 | \n",
+ " 4.007683 | \n",
+ " -1.730411 | \n",
+ " -1.732193 | \n",
+ " -3.968593 | \n",
+ " 1.063728 | \n",
+ " -0.486097 | \n",
+ " ... | \n",
+ " 0.589669 | \n",
+ " 0.109541 | \n",
+ " 0.601045 | \n",
+ " -0.364700 | \n",
+ " -1.843078 | \n",
+ " 0.351909 | \n",
+ " 0.594550 | \n",
+ " 0.099372 | \n",
+ " 1.00 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 31 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time V1 V2 V3 V4 V5 V6 \\\n",
+ "59318 48789.0 -0.892344 1.725185 2.014072 2.969493 -0.358103 0.208247 \n",
+ "74794 55760.0 -6.003422 -3.930731 -0.007045 1.714669 3.414667 -2.329583 \n",
+ "36734 38667.0 1.179743 -1.164141 1.015352 -0.405885 -1.850985 -0.503236 \n",
+ "120837 75978.0 -5.140723 3.568751 -5.896245 4.164720 -4.091193 -1.989960 \n",
+ "6427 7610.0 0.725646 2.300894 -5.329976 4.007683 -1.730411 -1.732193 \n",
+ "\n",
+ " V7 V8 V9 ... V21 V22 V23 \\\n",
+ "59318 0.173195 0.508835 -1.421750 ... -0.219772 -0.467348 -0.120825 \n",
+ "74794 -1.901512 -2.746111 0.887673 ... 1.101671 -0.992494 -0.698259 \n",
+ "36734 -1.136412 0.074776 0.133414 ... 0.344607 0.785058 -0.164374 \n",
+ "120837 -5.472436 2.422821 -2.909735 ... 1.131130 0.118022 -0.332704 \n",
+ "6427 -3.968593 1.063728 -0.486097 ... 0.589669 0.109541 0.601045 \n",
+ "\n",
+ " V24 V25 V26 V27 V28 Amount Class \n",
+ "59318 0.396310 0.197382 0.121402 0.220645 0.091821 8.27 0 \n",
+ "74794 0.139898 -0.205151 -0.472412 1.775378 -0.104285 311.91 1 \n",
+ "36734 0.411598 0.369617 -0.051243 0.022254 0.038167 101.37 0 \n",
+ "120837 0.139941 0.324758 -0.180769 0.177810 0.661555 99.90 1 \n",
+ "6427 -0.364700 -1.843078 0.351909 0.594550 0.099372 1.00 1 \n",
+ "\n",
+ "[5 rows x 31 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Random under-sampling: keep every fraud row plus an equally sized subset of\n",
+ "# non-fraud rows, so both classes have the same number of rows.\n",
+ "\n",
+ "# Shuffle first so the non-fraud slice below is a random sample. The seed makes\n",
+ "# the shuffle (and everything later derived from df) repeatable across runs.\n",
+ "df = df.sample(frac=1, random_state=42)\n",
+ "\n",
+ "fraud_df = df.loc[df['Class'] == 1]\n",
+ "# Take as many non-fraud rows as there are fraud rows rather than hard-coding 492.\n",
+ "non_fraud_df = df.loc[df['Class'] == 0][:len(fraud_df)]\n",
+ "\n",
+ "normal_distributed_df = pd.concat([fraud_df, non_fraud_df])\n",
+ "\n",
+ "# Shuffle dataframe rows so the two classes are interleaved\n",
+ "new_df = normal_distributed_df.sample(frac=1, random_state=42)\n",
+ "\n",
+ "new_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Distribution of the Classes in the subsample dataset\n",
+ "1 0.5\n",
+ "0 0.5\n",
+ "Name: Class, dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Fraction of rows per class in the under-sampled frame\n",
+ "print('Distribution of the Classes in the subsample dataset')\n",
+ "print(new_df['Class'].value_counts()/len(new_df))\n",
+ "\n",
+ "# Pass the column as the keyword x=...: newer seaborn releases no longer accept\n",
+ "# the plotted variable as a positional argument.\n",
+ "sns.countplot(x='Class', data=new_df, palette=colors)\n",
+ "plt.title('Equally Distributed Classes', fontsize=14)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Classifiers: LogisticRegression\n",
+ "Accuracy Score : 99.89993328885924\n",
+ "F1 score : 0.6459627329192548\n",
+ "Recall score : 0.5306122448979592\n",
+ "--------------------------------------------\n",
+ "Classifiers: RandomForestClassifier\n",
+ "Accuracy Score : 99.95611109160492\n",
+ "F1 score : 0.8587570621468926\n",
+ "Recall score : 0.7755102040816326\n",
+ "--------------------------------------------\n",
+ "Classifiers: GradientBoostingClassifier\n",
+ "Accuracy Score : 99.89466661985183\n",
+ "F1 score : 0.6629213483146067\n",
+ "Recall score : 0.6020408163265306\n",
+ "--------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Build features/target from the balanced, under-sampled frame (new_df).\n",
+ "# Splitting the full X, y here would ignore the under-sampling entirely.\n",
+ "# NOTE: Time and Amount are not rescaled anywhere in this notebook.\n",
+ "X_under = new_df.drop('Class', axis=1)\n",
+ "y_under = new_df['Class']\n",
+ "\n",
+ "# This is explicitly used for undersampling.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_under, y_under, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Turn the values into an array for feeding the classification algorithms.\n",
+ "X_train = X_train.values\n",
+ "X_test = X_test.values\n",
+ "y_train = y_train.values\n",
+ "y_test = y_test.values\n",
+ "\n",
+ "# Simple classifiers to compare. Keys match the class names so they can be used as labels.\n",
+ "# solver='liblinear' is pinned to match the baseline logistic regression fitted earlier\n",
+ "# and to keep results independent of scikit-learn's default solver.\n",
+ "classifiers = {\n",
+ "    \"LogisticRegression\": LogisticRegression(solver='liblinear'),\n",
+ "    \"RandomForestClassifier\": RandomForestClassifier(),\n",
+ "    \"GradientBoostingClassifier\": GradientBoostingClassifier()\n",
+ "}\n",
+ "\n",
+ "# NOTE: cross_val_score is imported but not used; the scores below come from a\n",
+ "# single hold-out split, not from cross-validation.\n",
+ "from sklearn.model_selection import cross_val_score\n",
+ "\n",
+ "\n",
+ "# Fit each classifier on the training split and report test-split metrics.\n",
+ "for key, classifier in classifiers.items():\n",
+ "    train = classifier.fit(X_train, y_train)\n",
+ "    pred = train.predict(X_test)\n",
+ "    print(\"Classifiers: \", key)\n",
+ "    print(\"Accuracy Score : \", accuracy_score(y_test, pred)*100)\n",
+ "    print(\"F1 score : \",f1_score(y_test, pred))\n",
+ "    print(\"Recall score : \", recall_score(y_test, pred))\n",
+ "    print(\"--------------------------------------------\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rebuild features/target from the full frame: Class is the target, the rest are features.\n",
+ "X = df.drop(columns='Class')\n",
+ "y = df['Class']\n",
+ "\n",
+ "# 75/25 train/test split with a fixed seed\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X, y, test_size=0.25, random_state=27\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import imblearn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Features/target from the full frame (Class is the target column).\n",
+ "target_column = 'Class'\n",
+ "y = df[target_column]\n",
+ "X = df.drop(columns=target_column)\n",
+ "\n",
+ "# 75/25 train/test split with a fixed seed; SMOTE is later fitted on the training part.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X, y, test_size=0.25, random_state=27\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from imblearn.over_sampling import SMOTE"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# SMOTE over-sampling, fitted on the training split only so the test split\n",
+ "# contains no synthetic rows.\n",
+ "# sampling_strategy='minority' resamples only the minority class; it replaces the\n",
+ "# deprecated `ratio` argument, and fit_resample replaces the deprecated fit_sample.\n",
+ "sm = SMOTE(sampling_strategy='minority', random_state=42)\n",
+ "\n",
+ "# Over-sampled training data that the classifiers in the next cell are fitted on\n",
+ "Xsm_train, ysm_train = sm.fit_resample(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Classifiers: LogisticRegression\n",
+ "Accuracy Score : 98.71492373809726\n",
+ "F1 score : 0.16893732970027248\n",
+ "Recall score : 0.8773584905660378\n",
+ "--------------------------------------------\n",
+ "Classifiers: RandomForestClassifier\n",
+ "Accuracy Score : 99.96067526193085\n",
+ "F1 score : 0.8600000000000001\n",
+ "Recall score : 0.8113207547169812\n",
+ "--------------------------------------------\n",
+ "Classifiers: GradientBoostingClassifier\n",
+ "Accuracy Score : 99.47332940085953\n",
+ "F1 score : 0.3267504488330341\n",
+ "Recall score : 0.8584905660377359\n",
+ "--------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Classifiers fitted on the SMOTE over-sampled training data and evaluated on the\n",
+ "# untouched test split. Keys match the class names so they can be used as labels.\n",
+ "# solver='liblinear' is pinned to match the baseline logistic regression fitted earlier\n",
+ "# and to keep results independent of scikit-learn's default solver.\n",
+ "classifiers = {\n",
+ "    \"LogisticRegression\": LogisticRegression(solver='liblinear'),\n",
+ "    \"RandomForestClassifier\": RandomForestClassifier(),\n",
+ "    \"GradientBoostingClassifier\": GradientBoostingClassifier()\n",
+ "}\n",
+ "\n",
+ "# NOTE: cross_val_score is imported but not used; the scores below come from a\n",
+ "# single hold-out split, not from cross-validation.\n",
+ "from sklearn.model_selection import cross_val_score\n",
+ "\n",
+ "\n",
+ "# Fit each classifier on the resampled training data and report test-split metrics.\n",
+ "for key, classifier in classifiers.items():\n",
+ "    train = classifier.fit(Xsm_train, ysm_train)\n",
+ "    pred = train.predict(X_test)\n",
+ "    print(\"Classifiers: \", key)\n",
+ "    print(\"Accuracy Score : \", accuracy_score(y_test, pred)*100)\n",
+ "    print(\"F1 score : \",f1_score(y_test, pred))\n",
+ "    print(\"Recall score : \", recall_score(y_test, pred))\n",
+ "    print(\"--------------------------------------------\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sravansmart7733@gmail.com/Classification-ImbalanceData.pdf b/sravansmart7733@gmail.com/Classification-ImbalanceData.pdf
new file mode 100644
index 000000000..41a69d9ce
Binary files /dev/null and b/sravansmart7733@gmail.com/Classification-ImbalanceData.pdf differ