Skip to content

Commit 97a51d3

Browse files
authored
Merge pull request #1060 from alan-turing-institute/dev
For a 0.20 release
2 parents 1313d4c + a421a6f commit 97a51d3

26 files changed

+485
-664
lines changed

Diff for: ORGANIZATION.md

+3
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ its conventional use, are marked with a ⟂ symbol:
4040
readme](https://github.com/JuliaAI/MLJBase.jl) for a
4141
detailed description of MLJBase's contents.
4242

43+
* [StatisticalMeasures.jl](https://github.com/JuliaAI/StatisticalMeasures.jl) provides
44+
performance measures (metrics) such as losses and scores.
45+
4346
* [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl)
4447
hosts the *MLJ model registry*, which contains metadata on all the
4548
models the MLJ user can search and load from MLJ. Moreover, it

Diff for: Project.toml

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "MLJ"
22
uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
33
authors = ["Anthony D. Blaom <[email protected]>"]
4-
version = "0.19.5"
4+
version = "0.20.0"
55

66
[deps]
77
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -21,6 +21,7 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
2121
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2222
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
2323
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
24+
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
2425
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
2526
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
2627
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
@@ -29,17 +30,18 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
2930
CategoricalArrays = "0.8,0.9, 0.10"
3031
ComputationalResources = "0.3"
3132
Distributions = "0.21,0.22,0.23, 0.24, 0.25"
32-
MLJBase = "0.21.14"
33-
MLJEnsembles = "0.3"
34-
MLJFlow = "0.1"
35-
MLJIteration = "0.5"
33+
MLJBase = "1"
34+
MLJEnsembles = "0.4"
35+
MLJFlow = "0.2"
36+
MLJIteration = "0.6"
3637
MLJModels = "0.16"
37-
MLJTuning = "0.7"
38+
MLJTuning = "0.8"
3839
OpenML = "0.2,0.3"
3940
ProgressMeter = "1.1"
4041
Reexport = "1.2"
4142
ScientificTypes = "3"
4243
StatsBase = "0.32,0.33, 0.34"
44+
StatisticalMeasures = "0.1"
4345
Tables = "0.2,1.0"
4446
julia = "1.6"
4547

Diff for: docs/Project.toml

+4-9
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ EarlyStopping = "792122b4-ca99-40de-a6bc-6742525f08b6"
88
EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
99
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
1010
IterationControl = "b3c1a2ee-3fec-4384-bf48-272ea71de57c"
11-
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
1211
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
1312
MLJClusteringInterface = "d354fa79-ed1c-40d4-88ef-b8c7bd1568af"
1413
MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
1514
MLJEnsembles = "50ed68f4-41fd-4504-931a-ed422449fee0"
15+
MLJFlow = "7b7b8358-b45c-48ea-a8ef-7ca328ad328f"
1616
MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
1717
MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55"
1818
MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692"
@@ -25,16 +25,11 @@ NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36"
2525
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2626
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
2727
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
28+
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
29+
StatisticalMeasuresBase = "c062fc1d-0d66-479b-b6ac-8b44719de4cc"
2830
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
2931
TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"
3032

3133
[compat]
32-
CategoricalDistributions = "0.1"
33-
Documenter = "0.27"
34-
MLJEnsembles = "0.3"
35-
MLJIteration = "0.5"
36-
MLJModels = "0.16"
37-
MLJTuning = "0.7"
38-
ScientificTypes = "3"
39-
ScientificTypesBase = "3"
34+
Documenter = "1"
4035
julia = "1.6"

Diff for: docs/make.jl

+7-5
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@ import MLJModels
1414
import MLJEnsembles
1515
import ScientificTypes
1616
import MLJModelInterface
17+
import ScientificTypes
1718
import ScientificTypesBase
1819
import Distributions
1920
using CategoricalArrays
20-
using LossFunctions
2121
import CategoricalDistributions
22+
import StatisticalMeasures
23+
import StatisticalMeasuresBase
2224

2325
const MMI = MLJModelInterface
2426

@@ -87,9 +89,7 @@ pages = [
8789
"Third Party Packages" => "third_party_packages.md",
8890
"Glossary" => "glossary.md",
8991
"MLJ Cheatsheet" => "mlj_cheatsheet.md",
90-
"Known Issues" => "known_issues.md",
9192
"FAQ" => "frequently_asked_questions.md",
92-
"Julia BlogPost" => "julia_blogpost.md",
9393
"Index of Methods" => "api.md",
9494
]
9595

@@ -109,12 +109,14 @@ makedocs(
109109
ScientificTypes,
110110
MLJModelInterface,
111111
ScientificTypesBase,
112+
StatisticalMeasures,
112113
MLJIteration,
113114
EarlyStopping,
114115
IterationControl,
115-
CategoricalDistributions],
116+
CategoricalDistributions,
117+
StatisticalMeasures],
116118
pages = pages,
117-
strict = Documenter.except(:cross_references, :missing_docs),
119+
warnonly = [:cross_references, :missing_docs],
118120
)
119121

120122
@info "`makedocs` has finished running. "

Diff for: docs/model_docstring_tools.jl

+18-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,22 @@
22

33
const PATH_TO_MODEL_DOCS = joinpath(@__DIR__, "src", "models")
44

5+
"""
6+
remove_doc_refs(str::AbstractString)
7+
8+
Removes `@ref` references from `str`. For example, a substring of the form
9+
"[`some.thing_like_this123!`](@ref)" is replaced with "`some.thing_like_this123!`".
10+
11+
"""
12+
function remove_doc_refs(page)
13+
regex = r"\[([\?'\.\d`\!\_a-zA-Z]*)\]\(\@ref\)"
14+
while contains(page, regex)
15+
# replace the first matched regex with the captured string
16+
page = replace(page, regex => s"\1")
17+
end
18+
page
19+
end
20+
521
demote_headings(str) = replace(str, "# "=>"## ")
622
handle(model) = model.name*"_"*model.package_name
723

@@ -25,7 +41,7 @@ function write_page(model; path=PATH_TO_MODEL_DOCS)
2541
open(pagepath, "w") do stream
2642
header = "# [$(model.name)](@id $id)\n\n"
2743
md_page = doc(model.name, pkg=model.package_name)
28-
page = header*demote_headings(string(md_page))
44+
page = header*demote_headings(string(md_page)) |> remove_doc_refs
2945
write(stream, page)
3046
nothing
3147
end
@@ -54,7 +70,7 @@ function models_missing_descriptors()
5470
handles = handle.(models())
5571
filter(handles) do h
5672
!(h in HANDLES)
57-
end
73+
end
5874
end
5975

6076
"""

Diff for: docs/src/about_mlj.md

-2
Original file line numberDiff line numberDiff line change
@@ -221,8 +221,6 @@ Bugs, suggestions, and feature requests can be posted
221221
Users are also welcome to join the `#mlj` Julia slack channel to ask
222222
questions and make suggestions.
223223

224-
See also, [Known Issues](@ref)
225-
226224

227225
## Installation
228226

Diff for: docs/src/common_mlj_workflows.md

+6-6
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,10 @@ KNN = @load KNNRegressor
176176
knn = KNN()
177177
evaluate(knn, X, y,
178178
resampling=CV(nfolds=5),
179-
measure=[RootMeanSquaredError(), MeanAbsoluteError()])
179+
measure=[RootMeanSquaredError(), LPLoss(1)])
180180
```
181181

182-
Note `RootMeanSquaredError()` has alias `rms` and `MeanAbsoluteError()` has alias `mae`.
182+
Note `RootMeanSquaredError()` has alias `rms` and `LPLoss(1)` has aliases `l1`, `mae`.
183183

184184
Do `measures()` to list all losses and scores and their aliases.
185185

@@ -220,7 +220,7 @@ Fit on the train data set and evaluate on the test data set:
220220
```@example workflows
221221
fit!(mach, rows=train)
222222
yhat = predict(mach, X[test,:])
223-
mean(LogLoss(tol=1e-4)(yhat, y[test]))
223+
LogLoss(tol=1e-4)(yhat, y[test])
224224
```
225225

226226
Note `LogLoss()` has aliases `log_loss` and `cross_entropy`.
@@ -451,14 +451,14 @@ transformation/inverse transformation:
451451
```@example workflows
452452
X, y = @load_reduced_ames
453453
KNN = @load KNNRegressor
454-
knn_with_target = TransformedTargetModel(model=KNN(K=3), target=Standardizer())
454+
knn_with_target = TransformedTargetModel(model=KNN(K=3), transformer=Standardizer())
455455
pipe = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> knn_with_target
456456
```
457457

458458
Evaluating the pipeline (just as you would any other model):
459459

460460
```@example workflows
461-
pipe.one_hot_encoder.drop_last = true
461+
pipe.one_hot_encoder.drop_last = true # mutate a nested hyper-parameter
462462
evaluate(pipe, X, y, resampling=Holdout(), measure=RootMeanSquaredError(), verbosity=2)
463463
```
464464

@@ -476,7 +476,7 @@ target transformation/inverse transformation:
476476
```@example workflows
477477
Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
478478
tree_with_target = TransformedTargetModel(model=Tree(),
479-
target=y -> log.(y),
479+
transformer=y -> log.(y),
480480
inverse = z -> exp.(z))
481481
pipe2 = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> tree_with_target;
482482
nothing # hide

Diff for: docs/src/evaluating_model_performance.md

+35-30
Original file line numberDiff line numberDiff line change
@@ -45,31 +45,41 @@ machine potentially change. )
4545

4646
## Multiple measures
4747

48+
Multiple measures are specified as a vector:
49+
4850
```@repl evaluation_of_supervised_models
49-
evaluate!(mach,
50-
resampling=cv,
51-
measure=[l1, rms, rmslp1], verbosity=0)
51+
evaluate!(
52+
mach,
53+
resampling=cv,
54+
measures=[l1, rms, rmslp1],
55+
verbosity=0,
56+
)
5257
```
5358

54-
## Custom measures and weighted measures
55-
56-
```@repl evaluation_of_supervised_models
57-
my_loss(yhat, y) = maximum((yhat - y).^2);
59+
[Custom measures](@ref) can also be provided.
5860

59-
my_per_observation_loss(yhat, y) = abs.(yhat - y);
60-
MLJ.reports_each_observation(::typeof(my_per_observation_loss)) = true;
61+
## Specifying weights
6162

62-
my_weighted_score(yhat, y) = 1/mean(abs.(yhat - y));
63-
my_weighted_score(yhat, y, w) = 1/mean(abs.((yhat - y).^w));
64-
MLJ.supports_weights(::typeof(my_weighted_score)) = true;
65-
MLJ.orientation(::typeof(my_weighted_score)) = :score;
63+
Per-observation weights can be passed to measures. If a measure does not support weights,
64+
the weights are ignored:
6665

66+
```@repl evaluation_of_supervised_models
6767
holdout = Holdout(fraction_train=0.8)
6868
weights = [1, 1, 2, 1, 1, 2, 3, 1, 1, 2, 3, 1];
69-
evaluate!(mach,
70-
resampling=CV(nfolds=3),
71-
measure=[my_loss, my_per_observation_loss, my_weighted_score, l1],
72-
weights=weights, verbosity=0)
69+
evaluate!(
70+
mach,
71+
resampling=CV(nfolds=3),
72+
measure=[l2, rsquared],
73+
weights=weights,
74+
)
75+
```
76+
77+
In classification problems, use `class_weights=...` to specify a class weight dictionary.
78+
79+
```@docs
80+
MLJBase.evaluate!
81+
MLJBase.evaluate
82+
MLJBase.PerformanceEvaluation
7383
```
7484

7585
## User-specified train/test sets
@@ -78,18 +88,20 @@ Users can either provide an explicit list of train/test pairs of row indices for
7888

7989
```@repl evaluation_of_supervised_models
8090
fold1 = 1:6; fold2 = 7:12;
81-
evaluate!(mach,
82-
resampling = [(fold1, fold2), (fold2, fold1)],
83-
measure=[l1, l2], verbosity=0)
91+
evaluate!(
92+
mach,
93+
resampling = [(fold1, fold2), (fold2, fold1)],
94+
measures=[l1, l2],
95+
verbosity=0,
96+
)
8497
```
8598

86-
Or define their own re-usable `ResamplingStrategy` objects, - see
87-
[Custom resampling strategies](@ref) below.
99+
Or the user can define their own re-usable `ResamplingStrategy` objects, - see [Custom
100+
resampling strategies](@ref) below.
88101

89102

90103
## Built-in resampling strategies
91104

92-
93105
```@docs
94106
MLJBase.Holdout
95107
```
@@ -159,10 +171,3 @@ function train_test_pairs(holdout::Holdout, rows)
159171
end
160172
```
161173

162-
## API
163-
164-
```@docs
165-
MLJBase.evaluate!
166-
MLJBase.evaluate
167-
MLJBase.PerformanceEvaluation
168-
```

Diff for: docs/src/generating_synthetic_data.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Generating Synthetic Data
22

3+
Here *synthetic data* means artificially generated data, with no reference to a "real
4+
world" data set. Not to be confused with "fake data" obtained by resampling from a distribution
5+
fit to some actual real data.
6+
37
MLJ has a set of functions - `make_blobs`, `make_circles`,
48
`make_moons` and `make_regression` (closely resembling functions in
59
[scikit-learn](https://scikit-learn.org/stable/datasets/index.html#generated-datasets)

0 commit comments

Comments
 (0)