using Pkg # hideall
Pkg.activate("_literate/ISL-lab-5/Project.toml")
Pkg.instantiate()
macro OUTPUT()
return isdefined(Main, :Franklin) ? Franklin.OUT_PATH[] : "/tmp/"
end;
# @@dropdown
# ## Getting started
# @@
# @@dropdown-content
using MLJ
import RDatasets: dataset
import DataFrames: DataFrame, select
MLJ.color_off() # hide
auto = dataset("ISLR", "Auto")
y, X = unpack(auto, ==(:MPG))
train, test = partition(eachindex(y), 0.5, shuffle=true, rng=444);
# Note the use of `rng=` to seed the shuffling of indices so that the results are reproducible.
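# As a quick check (not part of the original lab), partitioning twice with the same `rng` yields identical splits:
t1, _ = partition(eachindex(y), 0.5, shuffle=true, rng=444)
t2, _ = partition(eachindex(y), 0.5, shuffle=true, rng=444)
@assert t1 == t2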
# @@dropdown
# ### Polynomial regression
# @@
# @@dropdown-content
#
LR = @load LinearRegressor pkg=MLJLinearModels
# In this part we build models using only the `Horsepower` feature.
using Plots
Plots.scalefontsizes() #hide
Plots.scalefontsizes(1.3) #hide
plot(X.Horsepower, y, seriestype=:scatter, legend=false, size=(800,600))
xlabel!("Horsepower")
ylabel!("MPG")
savefig(joinpath(@OUTPUT, "ISL-lab-5-g1.svg")); # hide
# \figalt{MPG v Horsepower}{ISL-lab-5-g1.svg}
# Let's get a baseline:
lm = LR()
mlm = machine(lm, select(X, :Horsepower), y)
fit!(mlm, rows=train)
rms(MLJ.predict(mlm, rows=test), y[test])^2
# Note that we square the measure to match the results obtained in the ISL labs, which report the mean squared error (MSE); the `rms` we use here is its square root.
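# As a sanity check (not in the original lab), the squared `rms` agrees with the MSE computed by hand:
ŷ = MLJ.predict(mlm, rows=test)
sum((ŷ .- y[test]).^2) / length(test) ≈ rms(ŷ, y[test])^2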
xx = (Horsepower=range(50, 225, length=100) |> collect, )
yy = MLJ.predict(mlm, xx)
plot(X.Horsepower, y, seriestype=:scatter, legend=false, size=(800,600))
plot!(xx.Horsepower, yy, legend=false, linewidth=3, color=:orange)
xlabel!("Horsepower")
ylabel!("MPG")
savefig(joinpath(@OUTPUT, "ISL-lab-5-g2.svg")); # hide
# \figalt{1st order baseline}{ISL-lab-5-g2.svg}
# We now want to build three polynomial models of degree 1, 2 and 3 respectively; we start by forming the corresponding feature matrix:
hp = X.Horsepower
Xhp = DataFrame(hp1=hp, hp2=hp.^2, hp3=hp.^3);
# Now we can write a simple pipeline where the first step selects the features we want (and thereby the degree of the polynomial) and the second is the linear regressor:
LinMod = Pipeline(
FeatureSelector(features=[:hp1]),
LR()
);
# Then we can instantiate and fit three models, specifying the features each time:
lr1 = machine(LinMod, Xhp, y) # poly of degree 1 (line)
fit!(lr1, rows=train)
LinMod.feature_selector.features = [:hp1, :hp2] # poly of degree 2
lr2 = machine(LinMod, Xhp, y)
fit!(lr2, rows=train)
LinMod.feature_selector.features = [:hp1, :hp2, :hp3] # poly of degree 3
lr3 = machine(LinMod, Xhp, y)
fit!(lr3, rows=train)
# Let's check the performance on the test set:
get_mse(lr) = rms(MLJ.predict(lr, rows=test), y[test])^2
@show get_mse(lr1)
@show get_mse(lr2)
@show get_mse(lr3)
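# Equivalently, MLJ's `evaluate!` can compute the same holdout error; a sketch for the degree-3 machine, squaring the measure as before:
e = evaluate!(lr3, resampling=[(train, test)], measure=rms, verbosity=0)
e.measurement[1]^2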
# Let's visualise the models:
hpn = xx.Horsepower
Xnew = DataFrame(hp1=hpn, hp2=hpn.^2, hp3=hpn.^3)
yy1 = MLJ.predict(lr1, Xnew)
yy2 = MLJ.predict(lr2, Xnew)
yy3 = MLJ.predict(lr3, Xnew)
plot(X.Horsepower, y, seriestype=:scatter, label=false, size=(800,600))
plot!(xx.Horsepower, yy1, label="Order 1", linewidth=3, color=:orange,)
plot!(xx.Horsepower, yy2, label="Order 2", linewidth=3, color=:green,)
plot!(xx.Horsepower, yy3, label="Order 3", linewidth=3, color=:red,)
xlabel!("Horsepower")
ylabel!("MPG")
savefig(joinpath(@OUTPUT, "ISL-lab-5-g3.svg")); # hide
# \figalt{1st, 2nd and 3rd order fits}{ISL-lab-5-g3.svg}
#
# @@
#
# @@
# @@dropdown
# ## K-Folds Cross Validation
# @@
# @@dropdown-content
#
# Let's cross-validate over the degree of the polynomial.
#
# **Note**: there's a bit of gymnastics here because MLJ doesn't directly support a polynomial regression; see our tutorial on [tuning models](/getting-started/model-tuning/) for a gentler introduction to model tuning.
# The gist of the following code is to create a dataframe whose columns are the powers of the `Horsepower` feature from 1 to 10, and to build a series of regression models using incrementally more of those features (i.e. a higher polynomial degree):
Xhp = DataFrame([hp.^i for i in 1:10], :auto)
cases = [[Symbol("x$j") for j in 1:i] for i in 1:10]
r = range(LinMod, :(feature_selector.features), values=cases)
tm = TunedModel(model=LinMod, ranges=r, resampling=CV(nfolds=10), measure=rms)
# Now we're left with fitting the tuned model:
mtm = machine(tm, Xhp, y)
fit!(mtm)
rep = report(mtm)
res = rep.plotting
@show round.(res.measurements.^2, digits=2)
@show argmin(res.measurements)
#
# So the conclusion here is that the 5th order polynomial does quite well.
#
# In ISL they use a different seed, so the results are slightly different but comparable.
#
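# We can also ask the tuned machine directly which model won the search; an illustrative query via `fitted_params`, which exposes the best model found:
fitted_params(mtm).best_model.feature_selector.features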
Xnew = DataFrame([hpn.^i for i in 1:10], :auto)
yy5 = MLJ.predict(mtm, Xnew)
plot(X.Horsepower, y, seriestype=:scatter, legend=false, size=(800,600))
plot!(xx.Horsepower, yy5, color=:orange, linewidth=4, legend=false)
xlabel!("Horsepower")
ylabel!("MPG")
savefig(joinpath(@OUTPUT, "ISL-lab-5-g4.svg")); # hide
# \figalt{5th order fit}{ISL-lab-5-g4.svg}
#
# @@
# @@dropdown
# ## The Bootstrap
# @@
# @@dropdown-content
#
# _Bootstrapping is not currently supported in MLJ._
#
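# It is nevertheless easy to hand-roll; the sketch below (an illustration, not part of the lab) bootstraps the test MSE of the degree-1 model by refitting on training indices sampled with replacement:
using Random
boot_rng = MersenneTwister(444)
LinMod.feature_selector.features = [:x1]       # back to the degree-1 model on the power features
boot_mses = map(1:100) do _
    idx = rand(boot_rng, train, length(train)) # resample train rows with replacement
    m = machine(LinMod, Xhp, y)
    fit!(m, rows=idx, verbosity=0)
    rms(MLJ.predict(m, rows=test), y[test])^2
end
extrema(boot_mses)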
# @@