using Pkg # hideall
Pkg.activate("_literate/ISL-lab-10/Project.toml")
Pkg.instantiate()
macro OUTPUT()
return isdefined(Main, :Franklin) ? Franklin.OUT_PATH[] : "/tmp/"
end;
# @@dropdown
# ## Getting started
# @@
# @@dropdown-content
using MLJ
import RDatasets: dataset
import DataFrames: DataFrame, select, Not, describe
using Random
MLJ.color_off() # hide
data = dataset("datasets", "USArrests")
names(data)
# Let's have a look at the mean and standard deviation of each feature:
describe(data, :mean, :std)
# Let's extract the numerical features and coerce the integer-valued columns (`UrbanPop`, `Assault`) to `Continuous`:
X = select(data, Not(:State))
X = coerce(X, :UrbanPop=>Continuous, :Assault=>Continuous);
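# As a quick sanity check, `schema` confirms the scientific types after coercion:
schema(X)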
#
# @@
# @@dropdown
# ## PCA pipeline
# @@
# @@dropdown-content
#
# PCA is usually best done after standardization, but we won't do that here (we will, via a pipeline, in the next section):
PCA = @load PCA pkg=MultivariateStats
pca_mdl = PCA(variance_ratio=1)
pca = machine(pca_mdl, X)
fit!(pca)
W = MLJ.transform(pca, X);
# W is the PCA-transformed data; with `variance_ratio=1` the model keeps as many components as are needed to explain all of the variance:
schema(W).names
# Let's inspect the fit:
r = report(pca)
cumsum(r.principalvars ./ r.tvar)
# The cumulative sum gives the proportion of variance explained by the first 1, 2, ... principal components; with just 2 components we recover almost all of the variance.
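# We can also look at the learned projection matrix (one column per retained
# component), assuming the MultivariateStats interface exposes it as
# `projection` in `fitted_params`:
fitted_params(pca).projection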
#
# @@
# @@dropdown
# ## More interesting data...
# @@
# @@dropdown-content
# Instead of just playing with toy data, let's load the orange juice data and extract only the columns corresponding to price data:
data = dataset("ISLR", "OJ")
feature_names = [
:PriceCH, :PriceMM, :DiscCH, :DiscMM, :SalePriceMM, :SalePriceCH,
:PriceDiff, :PctDiscMM, :PctDiscCH
]
X = select(data, feature_names);
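# Before building a pipeline, it is worth glancing at the feature scales, since
# standardizing before PCA matters when they differ:
describe(X, :mean, :std)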
# @@dropdown
# ### PCA pipeline
# @@
# @@dropdown-content
Random.seed!(1515)
SPCA = Pipeline(
Standardizer(),
PCA(variance_ratio=1-1e-4)
)
spca = machine(SPCA, X)
fit!(spca)
W = MLJ.transform(spca, X)
names(W)
# How much of the variance can we explain with these components?
rpca = report(spca).pca
cs = cumsum(rpca.principalvars ./ rpca.tvar)
# Let's visualise this
using Plots
Plots.scalefontsizes() #hide
Plots.scalefontsizes(1.3) #hide
Plots.bar(1:length(cs), cs, legend=false, size=(800, 600), ylim=(0, 1.1))
xlabel!("Number of PCA features")
ylabel!("Ratio of explained variance")
plot!(1:length(cs), cs, color=:red, marker=:circle, linewidth=3)
savefig(joinpath(@OUTPUT, "ISL-lab-10-g1.svg")); # hide
# \figalt{PCA explained variance}{ISL-lab-10-g1.svg}
# So 4 PCA features are enough to recover most of the variance.
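# We can read that off programmatically too; for instance, the number of
# components needed to reach 90% of the variance (the threshold is arbitrary,
# chosen only for illustration):
findfirst(c -> c >= 0.9, cs)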
#
# @@
# @@dropdown
# ### Clustering
# @@
# @@dropdown-content
Random.seed!(1515)
KMeans = @load KMeans pkg=Clustering
SPCA2 = Pipeline(
Standardizer(),
PCA(),
KMeans(k=3)
)
spca2 = machine(SPCA2, X)
fit!(spca2)
assignments = report(spca2).k_means.assignments
mask1 = assignments .== 1
mask2 = assignments .== 2
mask3 = assignments .== 3;
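# A quick check of how many observations fall in each cluster:
map(sum, (mask1, mask2, mask3))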
# Now we can visualise the clusters in the plane of the first two principal components:
p = plot(size=(800,600))
legend_items = ["Group 1", "Group 2", "Group 3"]
for (i, (m, c)) in enumerate(zip((mask1, mask2, mask3), ("red", "green", "blue")))
scatter!(p, W[m, 1], W[m, 2], color=c, label=legend_items[i])
end
plot(p)
xlabel!("PCA-1")
ylabel!("PCA-2")
savefig(joinpath(@OUTPUT, "ISL-lab-10-cluster.svg")); # hide
# \fig{ISL-lab-10-cluster.svg}
#
# @@
#
# @@