-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathtutorial.jl
340 lines (234 loc) · 8.76 KB
/
tutorial.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
using Pkg # hideall
Pkg.activate("_literate/ISL-lab-4/Project.toml") # pin package versions to this tutorial's own environment
Pkg.instantiate()
macro OUTPUT()
return isdefined(Main, :Franklin) ? Franklin.OUT_PATH[] : "/tmp/" # figures go to the Franklin site build dir when available, else /tmp
end;
# @@dropdown
# ## Stock market data
# @@
# @@dropdown-content
#
# Let's load the usual packages and the data
using MLJ
import RDatasets: dataset
import DataFrames: DataFrame, describe, select, Not
import StatsBase: countmap, cor, var
MLJ.color_off() # hide
using PrettyPrinting
smarket = dataset("ISLR", "Smarket") # stock-market data from the ISLR book's companion package
@show size(smarket)
@show names(smarket)
# Since we often want to only show a few significant digits for the metrics etc, let's introduce a very simple function that does that:
function r3(x)
    # Round to three significant digits for compact display.
    return round(x; sigdigits=3)
end
r3(pi)
# Let's get a description too
describe(smarket, :mean, :std, :eltype)
# The target variable is `:Direction`:
y = smarket.Direction
X = select(smarket, Not(:Direction));
# We can compute all the pairwise correlations; we use `Matrix` so that the dataframe entries are considered as one matrix of numbers with the same type (otherwise `cor` won't work):
cm = X |> Matrix |> cor
round.(cm, sigdigits=1) # one significant digit is enough to eyeball the correlation structure
# Let's see what the `:Volume` feature looks like:
using Plots
Plots.scalefontsizes() #hide
Plots.scalefontsizes(1.2) #hide
plot(X.Volume, size=(800,600), linewidth=2, legend=false)
xlabel!("Tick number")
ylabel!("Volume")
savefig(joinpath(@OUTPUT, "ISL-lab-4-volume.svg")); # hide
# \figalt{volume}{ISL-lab-4-volume.svg}
# @@dropdown
# ### Logistic Regression
# @@
# @@dropdown-content
#
# We will now try to train models; the target `:Direction` has two classes: `Up` and `Down`; it needs to be interpreted as a categorical object, and we will mark it as an _ordered factor_ to specify that 'Up' is positive and 'Down' negative (for the confusion matrix later):
y = coerce(y, OrderedFactor)
classes(y[1])
# Note that in this case the default order comes from the lexicographic order which happens to map to our intuition since `D` comes before `U`.
cm = countmap(y)
categories, vals = collect(keys(cm)), collect(values(cm))
Plots.bar(categories, vals, title="Bar Chart Example", legend=false)
ylabel!("Number of occurrences")
savefig(joinpath(@OUTPUT, "ISL-lab-4-bal.svg")); # hide
# \fig{ISL-lab-4-bal.svg}
#
# Seems pretty balanced.
# Let's now try fitting a simple logistic classifier (aka logistic regression) not using `:Year` and `:Today`:
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels
X2 = select(X, Not([:Year, :Today]))
classif = machine(LogisticClassifier(), X2, y)
# Let's fit it to the data and try to reproduce the output:
fit!(classif)
ŷ = MLJ.predict(classif, X2)
ŷ[1:3]
# Note that here the `ŷ` are _scores_, i.e. probabilistic predictions with one score per class rather than hard labels.
# We can recover the average cross-entropy loss:
cross_entropy(ŷ, y) |> mean |> r3
# In order to recover the class, we could use the mode and compare the misclassification rate:
ŷ = predict_mode(classif, X2)
misclassification_rate(ŷ, y) |> r3
# Well that's not fantastic...
#
# Let's visualise how we're doing building a confusion matrix,
# first is predicted, second is truth:
cm = confusion_matrix(ŷ, y)
# We can then compute the accuracy or precision, etc. easily for instance:
@show false_positive(cm)
@show accuracy(ŷ, y) |> r3
@show accuracy(cm) |> r3 # same thing
@show positive_predictive_value(ŷ, y) |> r3 # a.k.a. precision
@show recall(ŷ, y) |> r3
@show f1score(ŷ, y) |> r3
# Let's now train on the data before 2005 and use it to predict on the rest.
# Let's find the row indices for which the condition holds
# (note: this relies on the rows being ordered by `:Year`, so that `findlast` marks the end of the pre-2005 span)
train = 1:findlast(X.Year .< 2005)
test = last(train)+1:length(y);
# We can now just re-fit the machine that we've already defined just on those rows and predict on the test:
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
# Well, that's not very good...
# Let's retrain a machine using only `:Lag1` and `:Lag2`:
X3 = select(X2, [:Lag1, :Lag2])
classif = machine(LogisticClassifier(), X3, y)
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
# Interesting... it has higher accuracy than the model with more features! This could be investigated further by increasing the regularisation parameter but we'll leave that aside for now.
#
# We can use a trained machine to predict on new data:
Xnew = (Lag1 = [1.2, 1.5], Lag2 = [1.1, -0.8])
ŷ = MLJ.predict(classif, Xnew)
ŷ |> pprint
# **Note**: when specifying data, we used a simple `NamedTuple`; we could also have defined a dataframe or any other compatible tabular container.
# Note also that we retrieved the raw predictions here i.e.: a score for each class; we could have used `predict_mode` or indeed
mode.(ŷ)
#
# @@
# @@dropdown
# ### LDA
# @@
# @@dropdown-content
#
# Let's do a similar thing but with an LDA model this time:
BayesianLDA = @load BayesianLDA pkg=MultivariateStats
classif = machine(BayesianLDA(), X3, y)
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
# Note: `BayesianLDA` is LDA using a multivariate normal model for each class with a default prior inferred from the proportions for each class in the training data.
# You can also use the bare `LDA` model which does not make these assumptions and allows using a different metric in the transformed space, see the docs for details.
LDA = @load LDA pkg=MultivariateStats
using Distances
classif = machine(LDA(dist=CosineDist()), X3, y)
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
#
# @@
# @@dropdown
# ### QDA
# @@
# @@dropdown-content
#
# Bayesian QDA is available via ScikitLearn:
BayesianQDA = @load BayesianQDA pkg=MLJScikitLearnInterface
# Using it is done in much the same way as before:
classif = machine(BayesianQDA(), X3, y)
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
#
# @@
# @@dropdown
# ### KNN
# @@
# @@dropdown-content
#
# We can use K-Nearest Neighbors models via the [`NearestNeighbors`](https://github.com/KristofferC/NearestNeighbors.jl) package:
KNNClassifier = @load KNNClassifier
knnc = KNNClassifier(K=1) # start with a single neighbour
classif = machine(knnc, X3, y)
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
# Pretty bad... let's try with three neighbors
knnc.K = 3 # mutate the hyperparameter in place; the `fit!` below retrains the machine with it
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
# A bit better but not hugely so.
# @@dropdown
# ## Caravan insurance data
# @@
# @@dropdown-content
#
# The caravan dataset is part of ISLR as well:
caravan = dataset("ISLR", "Caravan")
size(caravan)
# The target variable is `Purchase`, effectively a categorical variable:
purchase = caravan.Purchase
vals = unique(purchase)
# Let's see how many of each we have
nl1 = sum(purchase .== vals[1])
nl2 = sum(purchase .== vals[2])
println("#$(vals[1]) ", nl1)
println("#$(vals[2]) ", nl2)
# we can also visualise this as was done before:
cm = countmap(purchase)
categories, vals = collect(keys(cm)), collect(values(cm))
bar(categories, vals, title="Bar Chart Example", legend=false)
ylabel!("Number of occurrences")
savefig(joinpath(@OUTPUT, "ISL-lab-4-bal2.svg")); # hide
# \fig{ISL-lab-4-bal2.svg}
# that's quite unbalanced.
#
# Apart from the target, all other variables are numbers; we can standardize the data:
y, X = unpack(caravan, ==(:Purchase))
mstd = machine(Standardizer(), X)
fit!(mstd)
Xs = MLJ.transform(mstd, X)
var(Xs[:,1]) |> r3
# **Note**: in MLJ, it is recommended to work with pipelines / networks when possible and not do "step-by-step" transformation and fitting of the data as this is more error prone. We do it here to stick to the ISL tutorial.
#
# We split the data in the first 1000 rows for testing and the rest for training:
test = 1:1000
train = last(test)+1:nrows(Xs);
# Let's now fit a KNN model and check the misclassification rate
classif = machine(KNNClassifier(K=3), Xs, y)
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
# that looks good but recall the problem is very unbalanced
mean(y[test] .!= "No") |> r3
# (this is the proportion of non-"No" labels in the test set: a trivial classifier always predicting "No" would already be wrong only this often)
# Let's fit a logistic classifier to this problem
classif = machine(LogisticClassifier(), Xs, y)
fit!(classif, rows=train)
ŷ = predict_mode(classif, rows=test)
accuracy(ŷ, y[test]) |> r3
# @@dropdown
# ### ROC and AUC
# @@
# @@dropdown-content
#
# Since we have a probabilistic classifier, we can also check metrics that take _scores_ into account such as the area under the ROC curve (AUC):
ŷ = MLJ.predict(classif, rows=test)
auc(ŷ, y[test])
# We can also display the curve itself
fprs, tprs, thresholds = roc_curve(ŷ, y[test]) # false-positive rates, true-positive rates and the thresholds that produced them
plot(fprs, tprs, linewidth=2, size=(800,600))
xlabel!("False Positive Rate")
ylabel!("True Positive Rate")
savefig(joinpath(@OUTPUT, "ISL-lab-4-roc.svg")); # hide
# \figalt{ROC}{ISL-lab-4-roc.svg}
#
# @@
#
# @@