
Commit 80b3001

Miguel Angel committed
🚧 Adding Gini and Entropy Impurity
1 parent 56bb166 commit 80b3001

File tree

7 files changed, +255 -0 lines changed


.vscode/settings.json

+3
@@ -0,0 +1,3 @@
{
    "python.pythonPath": "D:\\Anaconda3\\envs\\darlin\\python.exe"
}

ImpurityMeasures/entropy.py

+75
@@ -0,0 +1,75 @@
import numpy as np
import pandas as pd

'''
Entropy helps us build an appropriate decision tree
by selecting the best splitter. Entropy can be defined
as a measure of the impurity of a sub-split: 0 means a
completely pure split, and for a binary class label the
value always lies between 0 and 1 (in general, between
0 and log2 of the number of classes).
'''

# Read the example from csv
example = "ImpurityMeasures/example3.csv"
data = pd.read_csv(example)

# The fastest way to count occurrences within an array.
# Check it out in this Stack Overflow thread:
# https://stackoverflow.com/questions/10741346/numpy-most-efficient-frequency-counts-for-unique-values-in-an-array
def unique_count(A):
    unique, inverse = np.unique(A, return_inverse=True)
    count = np.zeros(len(unique), dtype=int)
    np.add.at(count, inverse, 1)
    return np.vstack((unique, count)).T

# Get the entropy of one node (a column of `data`)
def entropy(Node):
    uniques = unique_count(data[Node].values)
    total = len(data[Node].values)

    # Probability Pi of every value of the node
    Pi = np.zeros(uniques.shape[0])
    for i in range(uniques.shape[0]):
        # vstack promotes the counts to the column's dtype
        # (strings for categorical columns), so cast back
        pi = float(uniques[i, 1]) / total
        Pi[i] = pi

    # Applying the entropy formula: -Sum[Pi(t)*log2(Pi(t))]
    entropy = -np.sum(Pi * np.log2(Pi, out=np.zeros_like(Pi), where=(Pi != 0)))

    print(uniques)
    print("Probabilities of Pi: ", Pi)
    print("Entropy impurity: {}\n".format(entropy))
    return entropy


father = "Etiqueta_Clase"
child = "a1"

# Get the entropy of the father node
father_E = entropy(father)

def informationGain():
    # Contingency table of the child attribute against the class label
    crosstab = pd.crosstab(data[child], data[father], margins=True, margins_name="Total")
    print(crosstab, '\n')
    index = crosstab.index
    crosstab = crosstab.values
    # Accumulate the weighted entropies
    entropyW = 0

    for i in range(len(index) - 1):
        print("--------------- {} ---------------".format(index[i]))
        pi = crosstab[i, :-1] / crosstab[i, -1]
        entropy_ = -np.sum(pi * np.log2(pi, out=np.zeros_like(pi), where=(pi != 0)))

        # Calculate the weighted entropy and accumulate it
        entropyW += (crosstab[i, -1] / crosstab[-1, -1]) * entropy_

        print("Probabilities of Pi: {}\nEntropy impurity: {}\n"
              .format(pi, entropy_))

    print("The information gain is: ", father_E - entropyW)
    return father_E - entropyW

infoGain = informationGain()
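A quick sanity check for the script above (my own worked example, not part of the commit): in example3.csv the class column Etiqueta_Clase holds 4 CP rows and 5 CN rows, so entropy('Etiqueta_Clase') should print -(4/9)*log2(4/9) - (5/9)*log2(5/9) ≈ 0.9911. A minimal standalone sketch of that calculation:

import numpy as np

counts = np.array([4, 5])        # 4 CP rows and 5 CN rows in Etiqueta_Clase
Pi = counts / counts.sum()       # class probabilities [4/9, 5/9]
H = -np.sum(Pi * np.log2(Pi))    # same formula as entropy.py
print(H)                         # ~0.9911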

ImpurityMeasures/example.csv

+11
@@ -0,0 +1,11 @@
Home_Owner,Marital_Status,Annual_Income,Defaulted
Yes,Single,125000,No
No,Married,100000,No
No,Single,70000,No
Yes,Married,120000,No
No,Divorced,95000,Yes
No,Single,60000,No
Yes,Divorced,220000,No
No,Single,85000,Yes
No,Married,75000,No
No,Single,90000,Yes

ImpurityMeasures/example2.csv

+21
@@ -0,0 +1,21 @@
ID_Client,Genero,Tipo_Vehiculo,Talla_Camisa,Clase
1,M,Familiar,Pequeña,C0
2,M,Deportivo,Mediana,C0
3,M,Deportivo,Mediana,C0
4,M,Deportivo,Larga,C0
5,M,Deportivo,Extra Larga,C0
6,M,Deportivo,Extra Larga,C0
7,F,Deportivo,Pequeña,C0
8,F,Deportivo,Pequeña,C0
9,F,Deportivo,Mediana,C0
10,F,Lujo,Larga,C0
11,M,Familiar,Larga,C1
12,M,Familiar,Extra Larga,C1
13,M,Familiar,Mediana,C1
14,M,Lujo,Extra Larga,C1
15,F,Lujo,Pequeña,C1
16,F,Lujo,Pequeña,C1
17,F,Lujo,Mediana,C1
18,F,Lujo,Mediana,C1
19,F,Lujo,Mediana,C1
20,F,Lujo,Larga,C1

ImpurityMeasures/example3.csv

+10
@@ -0,0 +1,10 @@
a1,a2,a3,Etiqueta_Clase
V,V,1.0,CP
V,V,6.0,CP
V,F,5.0,CN
F,F,4.0,CP
F,V,7.0,CN
F,V,3.0,CN
F,F,8.0,CN
V,F,7.0,CP
F,V,5.0,CN

ImpurityMeasures/gini_index.py

+91
@@ -0,0 +1,91 @@
import numpy as np
import pandas as pd

'''
Gini impurity is a measure of how often a randomly
chosen element from the set would be incorrectly labeled
if it were randomly labeled according to the distribution
of labels in the subset.
'''

# Read the example from csv
#example = "ImpurityMeasures/example.csv"
example = "ImpurityMeasures/example2.csv"
data = pd.read_csv(example)

# The fastest way to count occurrences within an array.
# Check it out in this Stack Overflow thread:
# https://stackoverflow.com/questions/10741346/numpy-most-efficient-frequency-counts-for-unique-values-in-an-array
def unique_count(A):
    unique, inverse = np.unique(A, return_inverse=True)
    count = np.zeros(len(unique), dtype=int)
    np.add.at(count, inverse, 1)
    return np.vstack((unique, count)).T

# Get the Gini impurity of one node
def gini(N):
    uniques = unique_count(N)
    total = len(N)
    # Probability Pi of every value of the node
    Pi = np.zeros(uniques.shape[0])
    for i in range(uniques.shape[0]):
        # vstack promotes the counts to the column's dtype
        # (strings for categorical columns), so cast back
        pi = float(uniques[i, 1]) / total
        Pi[i] = pi

    # Applying the Gini formula: 1 - Sum[Pi(t)^2]
    gini = 1 - np.sum(Pi**2)

    print(uniques)
    print("Probabilities of Pi: ", Pi)
    print("Gini impurity: {}\n".format(gini))
    return gini

# Finding the purest node within data
def findPurest():
    purest = {
        "column": "",
        "gini": 1
    }

    for node in data:
        print("------------ {} ------------".format(node))
        gini_ = gini(data[node].values)

        if gini_ < purest['gini']:
            purest['column'] = node
            purest['gini'] = gini_

    print("The purest node is: {} \nWith a Gini index: {}"
          .format(purest['column'], purest['gini']))

    # Return the column name so it can be assigned to `father` below
    return purest['column']

# You can select the father node by hand,
#father = findPurest()  # or take the purest node
father = "Clase"
child = "Talla_Camisa"

def giniWeighted():
    # Contingency table of the child attribute against the class label
    crosstab = pd.crosstab(data[child], data[father], margins=True, margins_name="Total")
    print(crosstab, '\n')
    index = crosstab.index
    crosstab = crosstab.values
    # Accumulate the weighted Gini values
    giniW = 0

    for i in range(len(index) - 1):
        print("--------------- {} ---------------".format(index[i]))
        pi = crosstab[i, :-1] / crosstab[i, -1]
        gini_ = 1 - np.sum(pi**2)

        # Calculate the weighted Gini and accumulate it
        giniW += (crosstab[i, -1] / crosstab[-1, -1]) * gini_

        print("Probabilities of Pi: {}\nGini impurity: {}\n"
              .format(pi, gini_))

    print("The weighted Gini is: ", giniW)
    return giniW

giniW = giniWeighted()
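Again as a sanity check rather than part of the commit: the Clase column in example2.csv is perfectly balanced, with 10 C0 rows and 10 C1 rows, so gini(data['Clase'].values) should report 1 - (0.5^2 + 0.5^2) = 0.5, the maximum Gini impurity for two classes. A minimal sketch:

import numpy as np

counts = np.array([10, 10])      # 10 C0 rows and 10 C1 rows in Clase
Pi = counts / counts.sum()       # class probabilities [0.5, 0.5]
G = 1 - np.sum(Pi**2)            # same formula as gini_index.py
print(G)                         # 0.5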

ImpurityMeasures/test2.py

+44
@@ -0,0 +1,44 @@
import numpy as np
import pandas as pd

data = {
    "a3": [1.0, 6.0, 5.0, 4.0, 7.0, 3.0, 8.0, 7.0, 5.0],
    "class": ["CP", "CP", "CN", "CP", "CN", "CN", "CN", "CP", "CN"]
}

# Candidate split points for the continuous attribute a3
division = np.array([2.0, 3.5, 4.5, 5.5, 6.5, 7.5])

df = pd.DataFrame(data)

df.sort_values(by=["a3"], inplace=True)

print(df)

# Entropy of the class column: -(4/9)*log2(4/9) - (5/9)*log2(5/9) ~= 0.9911
E_father = 0.9911

for i in division:
    print("------------------------------------------------------")
    print("Split in ", str(i), "\n")
    dfi = df.copy()
    # Binarize a3 at the candidate threshold i
    dfi["a3"] = dfi["a3"].apply(lambda x: "C0" if x <= i else "C1")
    confusion = pd.crosstab(dfi["a3"], dfi["class"], margins=True, margins_name="Total")
    print(confusion)
    index = confusion.index
    confusion = confusion.values

    # Entropy of the left partition (a3 <= i)
    a = confusion[0, 0] / confusion[0, -1]
    b = confusion[0, 1] / confusion[0, -1]
    E0 = -(a * np.log2(a, out=np.zeros_like(a), where=(a != 0))) - (b * np.log2(b, out=np.zeros_like(b), where=(b != 0)))
    print("\nEntropy of {}:\t\t{}".format(index[0], E0))

    # Entropy of the right partition (a3 > i)
    c = confusion[1, 0] / confusion[1, -1]
    d = confusion[1, 1] / confusion[1, -1]
    E1 = -(c * np.log2(c, out=np.zeros_like(c), where=(c != 0))) - (d * np.log2(d, out=np.zeros_like(d), where=(d != 0)))
    print("Entropy of {}:\t\t{}".format(index[1], E1))

    # Weight of each partition and the resulting information gain
    C0 = confusion[0, -1] / confusion[-1, -1]
    C1 = confusion[1, -1] / confusion[-1, -1]
    InfGain = E_father - ((C0 * E0) + (C1 * E1))
    print("Information Gain:\t{}".format(InfGain))
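One hedged suggestion, not something this commit does: E_father is hardcoded as 0.9911, but it can be derived from the data with the same entropy formula, which keeps the script consistent if the toy dataset changes. A sketch, assuming it is appended after the code above so that np and df are already in scope:

counts = df["class"].value_counts().values   # [5, 4]: 5 CN rows and 4 CP rows
Pi = counts / counts.sum()
E_father = -np.sum(Pi * np.log2(Pi))         # ~0.9911, matching the hardcoded constant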
