-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgini_index.py
91 lines (73 loc) · 2.51 KB
/
gini_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import numpy as np
import pandas as pd
'''
Gini impurity is a measure of how often a randomly
chosen element from the set would be incorrectly labeled
if it was randomly labeled according to the distribution
of labels in the subset.
'''
# Load the example dataset from CSV. `data` is the module-level DataFrame
# that findPurest() and giniWeighted() read. The path is relative, so the
# script must be run from a directory that contains "ImpurityMeasures/".
# Alternative dataset (swap with the line below to use it):
#example = "ImpurityMeasures/example.csv"
example = "ImpurityMeasures/example2.csv"
data = pd.read_csv(example)
# Count the occurrences of every distinct value in an array.
# Background on the counting approaches:
# https://stackoverflow.com/questions/10741346/numpy-most-efficient-frequency-counts-for-unique-values-in-an-array
def unique_count(A):
    """Return a two-column table of the distinct values in ``A`` and their counts.

    Parameters:
        A: array-like of values to tally (``np.unique`` flattens
           multi-dimensional input).

    Returns:
        numpy.ndarray of shape ``(n_unique, 2)``. Column 0 holds the sorted
        distinct values, column 1 how often each one occurs. Numeric input
        yields a numeric array; any other dtype yields an object array so
        that the counts stay integers instead of being coerced to strings.
    """
    unique, count = np.unique(A, return_counts=True)
    if unique.dtype.kind in "biufc":
        # Numeric labels and integer counts share a common numeric dtype,
        # so stacking them keeps the counts usable for arithmetic.
        return np.vstack((unique, count)).T
    # Stacking e.g. string labels with integer counts would turn the counts
    # into strings; an object array keeps each column's own type.
    table = np.empty((len(unique), 2), dtype=object)
    table[:, 0] = unique
    table[:, 1] = count
    return table
# Get the gini impurity of one node
def gini(N):
    """Compute the Gini impurity ``1 - sum(p_i ** 2)`` of the labels in ``N``.

    Parameters:
        N: array-like of labels (numbers, strings, ...). Values are counted
           over the flattened input.

    Returns:
        The impurity as a float: 0.0 when every label is identical, and
        1.0 for empty input (the sum of squared probabilities is empty).

    Side effects:
        Prints the value/count table, the class probabilities and the
        resulting impurity.
    """
    # Count directly instead of reading counts back out of a stacked
    # label/count table: stacking string labels with integer counts turns
    # the counts into strings, which cannot be divided.
    labels, counts = np.unique(N, return_counts=True)
    total = counts.sum()
    # Probability of every class: Pi = count_i / total
    Pi = counts / total
    # Applying the Gini formula: 1 - Sum[Pi(t)^2]
    impurity = 1 - np.sum(Pi**2)
    # The stacked table is built for display only.
    print(np.vstack((labels, counts)).T)
    print("Probabilities of Pi: ", Pi)
    print("Gini impurity: {}\n".format(impurity))
    return impurity
# Finding the purest node within data
def findPurest():
    """Report which column of the module-level ``data`` has the lowest gini impurity.

    For every column a banner is printed and ``gini`` is evaluated on the
    column's values; a summary naming the best column is printed at the end.

    Returns:
        The lowest gini value found. Note that this is the impurity itself,
        not the column name. If no column scores below 1, the initial
        value 1 is returned and the reported column name is empty.
    """
    best_column = ""
    best_gini = 1
    for column in data:
        print("------------ {} ------------".format(column))
        candidate = gini(data[column].values)
        # Strict comparison: on a tie the earliest column is kept.
        if candidate < best_gini:
            best_column, best_gini = column, candidate
    summary = "The purest node is: {} \nWith an gini index: {}"
    print(summary.format(best_column, best_gini))
    return best_gini
# Columns compared by giniWeighted(): the values of `child` define the groups
# (crosstab rows) and the values of `father` are the classes counted inside
# each group (crosstab columns).
# NOTE: findPurest() returns the lowest gini value, not a column name, so its
# result cannot be used as `father` as-is.
#father = findPurest() # is the purest node
father = "Clase"
child = "Talla_Camisa"
def giniWeighted(frame=None, child_col=None, father_col=None):
    """Compute the weighted gini impurity of splitting one column by another.

    A contingency table of ``child_col`` (rows) against ``father_col``
    (columns) is built. For every child value the gini impurity of its
    father-class distribution is computed, and the impurities are averaged,
    weighted by each child value's share of the rows.

    Parameters:
        frame: DataFrame to analyse. Defaults to the module-level ``data``.
        child_col: column whose values define the groups. Defaults to the
            module-level ``child``.
        father_col: column whose class distribution is measured inside each
            group. Defaults to the module-level ``father``.

    Returns:
        The weighted gini impurity (0 if the table has no group rows).

    Side effects:
        Prints the contingency table, the per-group probabilities and
        impurities, and the final weighted value.
    """
    # Fall back to the module-level configuration for omitted arguments,
    # so a bare giniWeighted() call uses the module-level settings.
    if frame is None:
        frame = data
    if child_col is None:
        child_col = child
    if father_col is None:
        father_col = father

    # margins=True appends a "Total" row and column holding the sums.
    crosstab = pd.crosstab(frame[child_col], frame[father_col], margins=True, margins_name="Total")
    print(crosstab, '\n')
    group_labels = crosstab.index[:-1]  # skip the trailing "Total" row
    table = crosstab.values

    # Sum the ginis
    giniW = 0
    for label, row in zip(group_labels, table[:-1]):
        print("--------------- {} ---------------".format(label))
        row_total = row[-1]
        # Class probabilities within this group (last cell is the row total)
        pi = row[:-1] / row_total
        gini_ = 1 - np.sum(pi**2)
        # Weight by the group's share of all rows (bottom-right cell)
        giniW += (row_total / table[-1, -1]) * gini_
        print("Probabilities of Pi: {}\nGini impurity: {}\n"
              .format(pi, gini_))
    print("The weighted gini is: ", giniW)
    return giniW
# Run the weighted-gini computation when the module is executed or imported
# and keep the result in the module-level name `giniW`.
giniW = giniWeighted()