using Pkg # hideall
Pkg.activate("_literate/ISL-lab-6b/Project.toml")
Pkg.instantiate()
macro OUTPUT()
return isdefined(Main, :Franklin) ? Franklin.OUT_PATH[] : "/tmp/"
end;
# In this tutorial, we explore the application of Ridge and Lasso regression
# to the `Hitters` dataset from R's ISLR package.
#
# @@dropdown
# ## Getting started
# @@
# @@dropdown-content
using MLJ
import RDatasets: dataset
using PrettyPrinting
MLJ.color_off() # hide
import Distributions as D
LinearRegressor = @load LinearRegressor pkg = MLJLinearModels
RidgeRegressor = @load RidgeRegressor pkg = MLJLinearModels
LassoRegressor = @load LassoRegressor pkg = MLJLinearModels
# We load the dataset using the `dataset` function, which takes the package and
# dataset names as arguments.
hitters = dataset("ISLR", "Hitters")
@show size(hitters)
names(hitters) |> pprint
# Let's unpack the dataset with the `unpack` function. Here the target is
# `Salary` (selected with the predicate `==(:Salary)`), and all other columns
# become features, collected in a table `X`.
y, X = unpack(hitters, ==(:Salary));
# The target has missing values, which we will simply drop.
# We build a boolean mask marking the non-missing values of the target.
# Note the use of the element-wise (broadcast) operator `.`.
no_miss = .!ismissing.(y);
# We collect the non-missing values of the target in an array
# and keep only the corresponding rows of the features.
y = collect(skipmissing(y))
X = X[no_miss, :];
# Let's now split our dataset into train and test sets.
train, test = partition(eachindex(y), 0.5, shuffle = true, rng = 424);
# Let's have a look at the target.
using Plots
Plots.scalefontsizes() #hide
Plots.scalefontsizes(1.2) #hide
plot(
y,
seriestype = :scatter,
markershape = :circle,
legend = false,
size = (800, 600),
)
xlabel!("Index")
ylabel!("Salary")
savefig(joinpath(@OUTPUT, "ISL-lab-6-g1.svg")); # hide
# \figalt{Salary}{ISL-lab-6-g1.svg}
# That looks quite skewed; let's have a look at a histogram:
histogram(y, bins = 50, normalize = true, label = false, size = (800, 600))
xlabel!("Salary")
ylabel!("Density")
edfit = D.fit_mle(D.Exponential, y)
xx = range(minimum(y), 2500, length = 100)
yy = pdf.(edfit, xx)
plot!(
xx,
yy,
label = "Exponential distribution fit",
linecolor = :orange,
linewidth = 4,
)
savefig(joinpath(@OUTPUT, "ISL-lab-6-g2.svg")); # hide
# \figalt{Distribution of salary}{ISL-lab-6-g2.svg}
#
# @@dropdown
# ### Data preparation
# @@
# @@dropdown-content
# Most features are currently encoded as integers but we will consider them as
# continuous. To coerce these `Count` features to `Continuous`, we nest the
# `autotype` function inside the `coerce` function. The `autotype` function
# returns a dictionary of suggested scientific types, which is then passed to
# `coerce`. For more details on the use of `autotype`, see the [Scientific
# Types](https://alan-turing-institute.github.io/DataScienceTutorials.jl/data/scitype/index.html#autotype) tutorial.
Xc = coerce(X, autotype(X, rules = (:discrete_to_continuous,)))
schema(Xc)
# There are a few categorical features, which we will one-hot encode as part of
# the pipelines below.
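# As a quick check (a sketch: `schema` and the `Multiclass` scientific type are
# both exported by MLJ), we can list those categorical features:
sch = schema(Xc)
[n for (n, st) in zip(sch.names, sch.scitypes) if st <: Multiclass] |> pprint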
#
# @@
#
# @@
# @@dropdown
# ## Ridge pipeline
# @@
# @@dropdown-content
# @@dropdown
# ### Baseline
# @@
# @@dropdown-content
# Let's first fit a simple pipeline with a standardizer, a one-hot-encoder and a basic
# linear regression:
model = Standardizer() |> OneHotEncoder() |> LinearRegressor()
pipe = machine(model, Xc, y)
fit!(pipe, rows = train)
ŷ = MLJ.predict(pipe, rows = test)
round(rms(ŷ, y[test])^2, sigdigits = 4)
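# As a quick sanity check, note that `rms(ŷ, y[test])^2` is just the mean
# squared error, so computing it directly should give the same number:
round(sum((ŷ .- y[test]) .^ 2) / length(test), sigdigits = 4)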
# Let's get a feel for how we're doing:
res = ŷ .- y[test]
plot(
res,
line = :stem,
ylims = (-1300, 1000),
linewidth = 3,
marker = :circle,
legend = false,
size = (800, 600),
)
hline!([0], linewidth = 2, color = :red)
xlabel!("Index")
ylabel!("Residual (ŷ - y)")
savefig(joinpath(@OUTPUT, "ISL-lab-6-g3.svg")); # hide
# \figalt{Residuals}{ISL-lab-6-g3.svg}
histogram(
res,
bins = 30,
normalize = true,
color = :green,
label = false,
size = (800, 600),
xlims = (-1100, 1100),
)
xx = range(-1100, 1100, length = 100)
ndfit = D.fit_mle(D.Normal, res)
lfit = D.fit_mle(D.Laplace, res)
plot!(xx, pdf.(ndfit, xx), linecolor = :orange, label = "Normal fit", linewidth = 3)
plot!(xx, pdf.(lfit, xx), linecolor = :magenta, label = "Laplace fit", linewidth = 3)
xlabel!("Residual (ŷ - y)")
ylabel!("Density")
savefig(joinpath(@OUTPUT, "ISL-lab-6-g4.svg")); # hide
# \figalt{Distribution of residuals}{ISL-lab-6-g4.svg}
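# Which of the two fits is better? As a quick, informal comparison we can look
# at the log-likelihood of the residuals under each fitted distribution, using
# `loglikelihood` from Distributions:
(normal = D.loglikelihood(ndfit, res), laplace = D.loglikelihood(lfit, res))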
#
# @@
# @@dropdown
# ### Basic Ridge
# @@
# @@dropdown-content
# Let's now swap the linear regressor for a ridge regressor, keeping the default
# penalty (`lambda = 1`). We can modify the supervised model in the pipeline
# directly:
pipe.model.linear_regressor = RidgeRegressor()
fit!(pipe, rows = train)
ŷ = MLJ.predict(pipe, rows = test)
round(rms(ŷ, y[test])^2, sigdigits = 4)
# OK, that's a bit better, but perhaps we can do better still with an
# appropriate choice of the regularization hyperparameter.
#
# @@
# @@dropdown
# ### Cross validating
# @@
# @@dropdown-content
# What penalty should we use? Let's run a simple cross-validation to find out:
r = range(
model,
:(linear_regressor.lambda),
lower = 1e-2,
upper = 100,
scale = :log10,
)
tm = TunedModel(
model,
ranges = r,
tuning = Grid(resolution = 50),
resampling = CV(nfolds = 3, rng = 4141),
measure = rms,
)
mtm = machine(tm, Xc, y)
fit!(mtm, rows = train)
best_mdl = fitted_params(mtm).best_model
round(best_mdl.linear_regressor.lambda, sigdigits = 4)
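# Before looking at the test error, it is worth inspecting the tuning itself.
# Here is a sketch using the `plotting` entry of the tuning report (as
# documented for MLJ's `TunedModel`) to show the CV error across the grid:
rep = report(mtm)
plot(
    rep.plotting.parameter_values[:, 1],
    rep.plotting.measurements,
    xscale = :log10,
    seriestype = :scatter,
    legend = false,
    size = (800, 600),
)
xlabel!("lambda")
ylabel!("CV estimate of RMS error")
savefig(joinpath(@OUTPUT, "ISL-lab-6-tuning.svg")); # hide
# \figalt{CV error vs lambda}{ISL-lab-6-tuning.svg}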
# Right, and with that we get:
ŷ = MLJ.predict(mtm, rows = test)
round(rms(ŷ, y[test])^2, sigdigits = 4)
# This is a poorer result, but that's not a complete surprise. To optimize
# `lambda` we've relied on cross-validation within the training data, and with
# only 263 observations in total those estimates are noisy.
# Again visualizing the residuals:
res = ŷ .- y[test]
plot(
res,
line = :stem,
xlims = (1, length(res)),
ylims = (-1400, 1000),
linewidth = 3,
marker = :circle,
legend = false,
size = (800, 600),
)
hline!([0], linewidth = 2, color = :red)
xlabel!("Index")
ylabel!("Residual (ŷ - y)")
savefig(joinpath(@OUTPUT, "ISL-lab-6-g5.svg")); # hide
# \figalt{Ridge residuals}{ISL-lab-6-g5.svg}
#
# You can compare that with the residuals obtained earlier.
#
# @@
#
# @@
# @@dropdown
# ## Lasso pipeline
# @@
# @@dropdown-content
#
# Let's do the same as above, this time with a Lasso model (which also has a
# regularization parameter named `lambda`), adjusting the range a bit:
mtm.model.model.linear_regressor = LassoRegressor()
mtm.model.range = range(
model,
:(linear_regressor.lambda),
lower = 5,
upper = 1000,
scale = :log10,
)
fit!(mtm, rows = train)
best_mdl = fitted_params(mtm).best_model
round(best_mdl.linear_regressor.lambda, sigdigits = 4)
# OK, let's see how that does:
ŷ = MLJ.predict(mtm, rows = test)
round(rms(ŷ, y[test])^2, sigdigits = 4)
# A pretty similar result to the previous one. Notice that the coefficients are
# reasonably sparse, as expected with an L1 regularizer:
coefs, intercept = fitted_params(mtm.fitresult).linear_regressor
@show coefs
@show intercept
# with around 50% sparsity:
coef_vals = [c[2] for c in coefs]
sum(coef_vals .≈ 0) / length(coefs)
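# As a quick check (filtering the `name => value` pairs in `coefs`), here are
# the features the Lasso retained:
[c for c in coefs if !(last(c) ≈ 0)] |> pprint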
# Let's visualise this:
## names of the features, including the one-hot-encoded ones
all_names = [:AtBat, :Hits, :HmRun, :Runs, :RBI, :Walks, :Years,
:CAtBat, :CHits, :CHmRun, :CRuns, :CRBI, :CWalks,
:League__A, :League__N, :Div_E, :Div_W,
:PutOuts, :Assists, :Errors, :NewLeague_A, :NewLeague_N]
idxshow = findall(abs.(coef_vals) .> 0)
plot(
coef_vals,
xticks = (idxshow, all_names[idxshow]),
legend = false,
xrotation = 90,
line = :stem,
marker = :circle,
size = (800, 700),
)
hline!([0], linewidth = 2, color = :red)
ylabel!("Amplitude")
xlabel!("Coefficient")
savefig(joinpath(@OUTPUT, "ISL-lab-6-g6.svg")); # hide
# \figalt{Lasso coefficients}{ISL-lab-6-g6.svg}
# For this baby dataset, simple ridge regression with default hyperparameters
# appears to work best. Of course, without a deeper analysis we cannot say that
# the observed differences are statistically significant, and with so few
# observations that is doubtful.
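# One informal way to probe this (a sketch, not a rigorous test): compare the
# per-fold cross-validation errors of ridge and lasso pipelines, with default
# hyperparameters, using `evaluate`, which reports the spread across folds
# alongside the aggregate measure.
ridge_pipe = Standardizer() |> OneHotEncoder() |> RidgeRegressor()
lasso_pipe = Standardizer() |> OneHotEncoder() |> LassoRegressor()
for (name, p) in [("ridge", ridge_pipe), ("lasso", lasso_pipe)]
    e = evaluate(p, Xc, y, resampling = CV(nfolds = 5, rng = 1234), measure = rms)
    println(name, ": ", round.(e.per_fold[1], sigdigits = 4))
end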
#
# @@