
Commit 80b3001

Miguel Angel committed
🚧 Adding Gini and Entropy Impurity
1 parent 56bb166 commit 80b3001

File tree

7 files changed, +255 -0 lines changed


.vscode/settings.json

+3
@@ -0,0 +1,3 @@
{
    "python.pythonPath": "D:\\Anaconda3\\envs\\darlin\\python.exe"
}

ImpurityMeasures/entropy.py

+75
@@ -0,0 +1,75 @@
import numpy as np
import pandas as pd

'''
Entropy helps us build an appropriate decision tree
by selecting the best splitter. Entropy can be defined
as a measure of the impurity of a sub-split: 0 means a
completely pure split, and for a binary class label the
value always lies between 0 and 1 (in general, between
0 and log2 of the number of classes).
'''

# Read the example from csv
example = "ImpurityMeasures/example3.csv"
data = pd.read_csv(example)

# The fastest way to count occurrences within an array.
# Check it out in this Stack Overflow thread:
# https://stackoverflow.com/questions/10741346/numpy-most-efficient-frequency-counts-for-unique-values-in-an-array
def unique_count(A):
    unique, inverse = np.unique(A, return_inverse=True)
    count = np.zeros(len(unique), dtype=int)
    np.add.at(count, inverse, 1)
    return np.vstack((unique, count)).T

# Get the entropy of one node (a column of `data`)
def entropy(Node):
    uniques = unique_count(data[Node].values)
    total = len(data[Node].values)

    # Probability Pi of every value of the node
    Pi = np.zeros(uniques.shape[0])
    for i in range(uniques.shape[0]):
        # vstack promotes the counts to the column's dtype
        # (strings for categorical columns), so cast back
        pi = float(uniques[i, 1]) / total
        Pi[i] = pi

    # Applying the entropy formula: -Sum[Pi(t)*log2(Pi(t))]
    entropy = -np.sum(Pi * np.log2(Pi, out=np.zeros_like(Pi), where=(Pi != 0)))

    print(uniques)
    print("Probabilities of Pi: ", Pi)
    print("Entropy impurity: {}\n".format(entropy))
    return entropy


father = "Etiqueta_Clase"
child = "a1"

# Get the entropy of the father node
father_E = entropy(father)

def informationGain():
    # Contingency table of the child attribute against the class label
    crosstab = pd.crosstab(data[child], data[father], margins=True, margins_name="Total")
    print(crosstab, '\n')
    index = crosstab.index
    crosstab = crosstab.values
    # Accumulate the weighted entropies
    entropyW = 0

    for i in range(len(index) - 1):
        print("--------------- {} ---------------".format(index[i]))
        pi = crosstab[i, :-1] / crosstab[i, -1]
        entropy_ = -np.sum(pi * np.log2(pi, out=np.zeros_like(pi), where=(pi != 0)))

        # Calculate the weighted entropy and accumulate it
        entropyW += (crosstab[i, -1] / crosstab[-1, -1]) * entropy_

        print("Probabilities of Pi: {}\nEntropy impurity: {}\n"
              .format(pi, entropy_))

    print("The information gain is: ", father_E - entropyW)
    return father_E - entropyW

infoGain = informationGain()
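A quick sanity check for the script above (my own worked example, not part of the commit): in example3.csv the class column Etiqueta_Clase holds 4 CP rows and 5 CN rows, so entropy('Etiqueta_Clase') should print -(4/9)*log2(4/9) - (5/9)*log2(5/9) ≈ 0.9911. A minimal standalone sketch of that calculation:

import numpy as np

counts = np.array([4, 5])        # 4 CP rows and 5 CN rows in Etiqueta_Clase
Pi = counts / counts.sum()       # class probabilities [4/9, 5/9]
H = -np.sum(Pi * np.log2(Pi))    # same formula as entropy.py
print(H)                         # ~0.9911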

ImpurityMeasures/example.csv

+11
@@ -0,0 +1,11 @@
Home_Owner,Marital_Status,Annual_Income,Defaulted
Yes,Single,125000,No
No,Married,100000,No
No,Single,70000,No
Yes,Married,120000,No
No,Divorced,95000,Yes
No,Single,60000,No
Yes,Divorced,220000,No
No,Single,85000,Yes
No,Married,75000,No
No,Single,90000,Yes

ImpurityMeasures/example2.csv

+21
@@ -0,0 +1,21 @@
ID_Client,Genero,Tipo_Vehiculo,Talla_Camisa,Clase
1,M,Familiar,Pequeña,C0
2,M,Deportivo,Mediana,C0
3,M,Deportivo,Mediana,C0
4,M,Deportivo,Larga,C0
5,M,Deportivo,Extra Larga,C0
6,M,Deportivo,Extra Larga,C0
7,F,Deportivo,Pequeña,C0
8,F,Deportivo,Pequeña,C0
9,F,Deportivo,Mediana,C0
10,F,Lujo,Larga,C0
11,M,Familiar,Larga,C1
12,M,Familiar,Extra Larga,C1
13,M,Familiar,Mediana,C1
14,M,Lujo,Extra Larga,C1
15,F,Lujo,Pequeña,C1
16,F,Lujo,Pequeña,C1
17,F,Lujo,Mediana,C1
18,F,Lujo,Mediana,C1
19,F,Lujo,Mediana,C1
20,F,Lujo,Larga,C1

ImpurityMeasures/example3.csv

+10
@@ -0,0 +1,10 @@
a1,a2,a3,Etiqueta_Clase
V,V,1.0,CP
V,V,6.0,CP
V,F,5.0,CN
F,F,4.0,CP
F,V,7.0,CN
F,V,3.0,CN
F,F,8.0,CN
V,F,7.0,CP
F,V,5.0,CN

ImpurityMeasures/gini_index.py

+91
@@ -0,0 +1,91 @@
import numpy as np
import pandas as pd

'''
Gini impurity is a measure of how often a randomly
chosen element from the set would be incorrectly labeled
if it were randomly labeled according to the distribution
of labels in the subset.
'''

# Read the example from csv
#example = "ImpurityMeasures/example.csv"
example = "ImpurityMeasures/example2.csv"
data = pd.read_csv(example)

# The fastest way to count occurrences within an array.
# Check it out in this Stack Overflow thread:
# https://stackoverflow.com/questions/10741346/numpy-most-efficient-frequency-counts-for-unique-values-in-an-array
def unique_count(A):
    unique, inverse = np.unique(A, return_inverse=True)
    count = np.zeros(len(unique), dtype=int)
    np.add.at(count, inverse, 1)
    return np.vstack((unique, count)).T

# Get the Gini impurity of one node
def gini(N):
    uniques = unique_count(N)
    total = len(N)
    # Probability Pi of every value of the node
    Pi = np.zeros(uniques.shape[0])
    for i in range(uniques.shape[0]):
        # vstack promotes the counts to the column's dtype
        # (strings for categorical columns), so cast back
        pi = float(uniques[i, 1]) / total
        Pi[i] = pi

    # Applying the Gini formula: 1 - Sum[Pi(t)^2]
    gini = 1 - np.sum(Pi**2)

    print(uniques)
    print("Probabilities of Pi: ", Pi)
    print("Gini impurity: {}\n".format(gini))
    return gini

# Finding the purest node within data
def findPurest():
    purest = {
        "column": "",
        "gini": 1
    }

    for node in data:
        print("------------ {} ------------".format(node))
        gini_ = gini(data[node].values)

        if gini_ < purest['gini']:
            purest['column'] = node
            purest['gini'] = gini_

    print("The purest node is: {} \nWith a Gini index: {}"
          .format(purest['column'], purest['gini']))

    # Return the column name so it can be assigned to `father` below
    return purest['column']

# You can select the father node by hand,
#father = findPurest()  # or take the purest node
father = "Clase"
child = "Talla_Camisa"

def giniWeighted():
    # Contingency table of the child attribute against the class label
    crosstab = pd.crosstab(data[child], data[father], margins=True, margins_name="Total")
    print(crosstab, '\n')
    index = crosstab.index
    crosstab = crosstab.values
    # Accumulate the weighted Gini values
    giniW = 0

    for i in range(len(index) - 1):
        print("--------------- {} ---------------".format(index[i]))
        pi = crosstab[i, :-1] / crosstab[i, -1]
        gini_ = 1 - np.sum(pi**2)

        # Calculate the weighted Gini and accumulate it
        giniW += (crosstab[i, -1] / crosstab[-1, -1]) * gini_

        print("Probabilities of Pi: {}\nGini impurity: {}\n"
              .format(pi, gini_))

    print("The weighted Gini is: ", giniW)
    return giniW

giniW = giniWeighted()
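Again as a sanity check rather than part of the commit: the Clase column in example2.csv is perfectly balanced, with 10 C0 rows and 10 C1 rows, so gini(data['Clase'].values) should report 1 - (0.5^2 + 0.5^2) = 0.5, the maximum Gini impurity for two classes. A minimal sketch:

import numpy as np

counts = np.array([10, 10])      # 10 C0 rows and 10 C1 rows in Clase
Pi = counts / counts.sum()       # class probabilities [0.5, 0.5]
G = 1 - np.sum(Pi**2)            # same formula as gini_index.py
print(G)                         # 0.5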

ImpurityMeasures/test2.py

+44
@@ -0,0 +1,44 @@
import numpy as np
import pandas as pd

data = {
    "a3": [1.0, 6.0, 5.0, 4.0, 7.0, 3.0, 8.0, 7.0, 5.0],
    "class": ["CP", "CP", "CN", "CP", "CN", "CN", "CN", "CP", "CN"]
}

# Candidate split points for the continuous attribute a3
division = np.array([2.0, 3.5, 4.5, 5.5, 6.5, 7.5])

df = pd.DataFrame(data)

df.sort_values(by=["a3"], inplace=True)

print(df)

# Entropy of the class column: -(4/9)*log2(4/9) - (5/9)*log2(5/9) ~= 0.9911
E_father = 0.9911

for i in division:
    print("------------------------------------------------------")
    print("Split in ", str(i), "\n")
    dfi = df.copy()
    # Binarize a3 at the candidate threshold i
    dfi["a3"] = dfi["a3"].apply(lambda x: "C0" if x <= i else "C1")
    confusion = pd.crosstab(dfi["a3"], dfi["class"], margins=True, margins_name="Total")
    print(confusion)
    index = confusion.index
    confusion = confusion.values

    # Entropy of the left partition (a3 <= i)
    a = confusion[0, 0] / confusion[0, -1]
    b = confusion[0, 1] / confusion[0, -1]
    E0 = -(a * np.log2(a, out=np.zeros_like(a), where=(a != 0))) - (b * np.log2(b, out=np.zeros_like(b), where=(b != 0)))
    print("\nEntropy of {}:\t\t{}".format(index[0], E0))

    # Entropy of the right partition (a3 > i)
    c = confusion[1, 0] / confusion[1, -1]
    d = confusion[1, 1] / confusion[1, -1]
    E1 = -(c * np.log2(c, out=np.zeros_like(c), where=(c != 0))) - (d * np.log2(d, out=np.zeros_like(d), where=(d != 0)))
    print("Entropy of {}:\t\t{}".format(index[1], E1))

    # Weight of each partition and the resulting information gain
    C0 = confusion[0, -1] / confusion[-1, -1]
    C1 = confusion[1, -1] / confusion[-1, -1]
    InfGain = E_father - ((C0 * E0) + (C1 * E1))
    print("Information Gain:\t{}".format(InfGain))
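One hedged suggestion, not something this commit does: E_father is hardcoded as 0.9911, but it can be derived from the data with the same entropy formula, which keeps the script consistent if the toy dataset changes. A sketch, assuming it is appended after the code above so that np and df are already in scope:

counts = df["class"].value_counts().values   # [5, 4]: 5 CN rows and 4 CP rows
Pi = counts / counts.sum()
E_father = -np.sum(Pi * np.log2(Pi))         # ~0.9911, matching the hardcoded constant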
