-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathelbow_curve_no_of_cluster.py
57 lines (42 loc) · 1.66 KB
/
elbow_curve_no_of_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
###############
# Reference
# http://www.awesomestats.in/python-cluster-validation/
# http://blog.mpacula.com/2011/04/27/k-means-clustering-example-python/
# http://stamfordresearch.com/k-means-clustering-in-python/
###############
import pandas as pd
#fetching R datasets to use in python
import rpy2.robjects as ro
import rpy2.robjects.conversion as conversion
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
# Handle to the embedded R session exposed by rpy2.
R = ro.r
# Look up R's built-in `mtcars` dataset and convert it to a Python object
# (the pandas conversion was activated above via pandas2ri.activate()).
# NOTE(review): `ri2py` appears to have been renamed to `rpy2py` in rpy2 3.x —
# confirm against the installed rpy2 version.
mtcars_r = R['mtcars']
df = conversion.ri2py(mtcars_r)
# Quick sanity check: show the first rows of the converted data.
print(df.head())
from sklearn.cluster import KMeans
# Notebook-style inspection of the available column names.
df.columns
# Columns used as clustering features.
feature_columns = [
    'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt',
    'qsec', 'vs', 'am', 'gear', 'carb',
]
X = df[feature_columns]
# Standardise the features so each column is centred and scaled before
# being handed to K-Means.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
# Candidate numbers of clusters to evaluate for the elbow curve (1 through 19).
cluster_range = range(1, 20)
# Inertia (within-cluster sum of squared distances) for each candidate count,
# in the same order as cluster_range.
cluster_errors = []
# Fit one K-Means model per candidate cluster count and record its inertia.
for num_clusters in cluster_range:
    # random_state pins the centroid initialisation so the curve is
    # reproducible between runs; n_init is given explicitly so the number of
    # restarts does not depend on the installed scikit-learn's default.
    clusters = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    clusters.fit(X_scaled)
    cluster_errors.append(clusters.inertia_)
# Tabulate each candidate cluster count next to the error it produced.
elbow_table = {
    "num_clusters": cluster_range,
    "cluster_errors": cluster_errors,
}
clusters_df = pd.DataFrame(elbow_table)
# Notebook-style peek at the first ten rows.
clusters_df.head(10)
# Plot the elbow curve to visualize the results: clustering error against the
# number of clusters.
plt.figure(figsize=(12, 6))
x_values = clusters_df["num_clusters"]
y_values = clusters_df["cluster_errors"]
plt.plot(x_values, y_values, marker="o")
# Choose the number of clusters at the elbow: the last point where you can see
# a drastic change in error from the previous point.