1
- //! Strategies to build [`Bins`]s and [`Grid`]s (using [`GridBuilder`]) inferring
2
- //! optimal parameters directly from data .
1
+ //! Strategies used by [`GridBuilder`] to infer optimal parameters from data for building [`Bins`]
2
+ //! and [`Grid`] instances .
3
3
//!
4
4
//! The docs for each strategy have been taken almost verbatim from [`NumPy`].
5
5
//!
6
- //! Each strategy specifies how to compute the optimal number of [`Bins`] or
7
- //! the optimal bin width.
8
- //! For those strategies that prescribe the optimal number
9
- //! of [`Bins`] we then compute the optimal bin width with
6
+ //! Each strategy specifies how to compute the optimal number of [`Bins`] or the optimal bin width.
7
+ //! For those strategies that prescribe the optimal number of [`Bins`], the optimal bin width is
8
+ //! computed by `bin_width = (max - min)/n`.
10
9
//!
11
- //! `bin_width = (max - min)/n`
10
+ //! Since all bins are left-closed and right-open, an extra bin is added when necessary to include
11
+ //! the maximum value of the given data, so that no data is discarded.
12
12
//!
13
- //! All our bins are left-inclusive and right-exclusive: we make sure to add an extra bin
14
- //! if it is necessary to include the maximum value of the array that has been passed as argument
15
- //! to the `from_array` method.
13
+ //! # Strategies
16
14
//!
15
+ //! Currently, the following strategies are implemented:
16
+ //!
17
+ //! - [`Auto`]: Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all
18
+ //! around performance.
19
+ //! - [`FreedmanDiaconis`]: Robust (resilient to outliers) strategy that takes into account data
20
+ //! variability and data size.
21
+ //! - [`Rice`]: A strategy that does not take variability into account, only data size. Commonly
22
+ //! overestimates number of bins required.
23
+ //! - [`Sqrt`]: Square root (of data size) strategy, used by Excel and other programs
24
+ //! for its speed and simplicity.
25
+ //! - [`Sturges`]: R’s default strategy, only accounts for data size. Only optimal for gaussian data
26
+ //! and underestimates number of bins for large non-gaussian datasets.
27
+ //!
28
+ //! # Notes
29
+ //!
30
+ //! In general, successful inference of optimal bin width and number of bins relies on
31
+ //! **variability** of data. In other words, the provided observations should not be empty or
32
+ //! constant.
33
+ //!
34
+ //! In addition, [`Auto`] and [`FreedmanDiaconis`] require the [`interquartile range (IQR)`][iqr],
35
+ //! i.e. the difference between upper and lower quartiles, to be positive.
36
+ //!
37
+ //! [`GridBuilder`]: ../struct.GridBuilder.html
17
38
//! [`Bins`]: ../struct.Bins.html
18
39
//! [`Grid`]: ../struct.Grid.html
19
- //! [`GridBuilder`]: ../struct.GridBuilder.html
20
40
//! [`NumPy`]: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges
21
- use super :: super :: interpolate:: Nearest ;
22
- use super :: super :: { Quantile1dExt , QuantileExt } ;
23
- use super :: errors:: BinsBuildError ;
24
- use super :: { Bins , Edges } ;
25
- use ndarray:: prelude:: * ;
26
- use ndarray:: Data ;
41
+ //! [`Auto`]: struct.Auto.html
42
+ //! [`Sturges`]: struct.Sturges.html
43
+ //! [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
44
+ //! [`Rice`]: struct.Rice.html
45
+ //! [`Sqrt`]: struct.Sqrt.html
46
+ //! [iqr]: https://www.wikiwand.com/en/Interquartile_range
47
+ use crate :: {
48
+ histogram:: { errors:: BinsBuildError , Bins , Edges } ,
49
+ quantile:: { interpolate:: Nearest , Quantile1dExt , QuantileExt } ,
50
+ } ;
51
+ use ndarray:: { prelude:: * , Data } ;
27
52
use noisy_float:: types:: n64;
28
53
use num_traits:: { FromPrimitive , NumOps , Zero } ;
29
54
30
- /// A trait implemented by all strategies to build [`Bins`]
31
- /// with parameters inferred from observations.
55
+ /// A trait implemented by all strategies to build [`Bins`] with parameters inferred from
56
+ /// observations.
32
57
///
33
- /// A `BinsBuildingStrategy` is required by [`GridBuilder`]
34
- /// to know how to build a [`Grid`]'s projections on the
58
+ /// This is required by [`GridBuilder`] to know how to build a [`Grid`]'s projections on the
35
59
/// coordinate axes.
36
60
///
37
61
/// [`Bins`]: ../struct.Bins.html
38
- /// [`Grid`]: ../struct.Grid.html
39
62
/// [`GridBuilder`]: ../struct.GridBuilder.html
63
+ /// [`Grid`]: ../struct.Grid.html
40
64
pub trait BinsBuildingStrategy {
41
65
type Elem : Ord ;
42
- /// Given some observations in a 1-dimensional array it returns a `BinsBuildingStrategy`
43
- /// that has learned the required parameter to build a collection of [`Bins`].
66
+ /// Returns a strategy that has learnt the required parameter for building [`Bins`] for given
67
+ /// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter
68
+ /// with the given data and specified strategy.
44
69
///
45
- /// It returns `Err` if it is not possible to build a collection of
46
- /// [`Bins`] given the observed data according to the chosen strategy.
70
+ /// # Errors
71
+ ///
72
+ /// See each implementor's struct-level documentation for details on errors it may return.
47
73
///
48
74
/// [`Bins`]: ../struct.Bins.html
49
75
fn from_array < S > ( array : & ArrayBase < S , Ix1 > ) -> Result < Self , BinsBuildError >
50
76
where
51
77
S : Data < Elem = Self :: Elem > ,
52
78
Self : std:: marker:: Sized ;
53
79
54
- /// Returns a [`Bins`] instance, built accordingly to the parameters
55
- /// inferred from observations in [`from_array`].
80
+ /// Returns a [`Bins`] instance, according to parameters inferred from observations.
56
81
///
57
82
/// [`Bins`]: ../struct.Bins.html
58
- /// [`from_array`]: #method.from_array.html
59
83
fn build ( & self ) -> Bins < Self :: Elem > ;
60
84
61
- /// Returns the optimal number of bins, according to the parameters
62
- /// inferred from observations in [`from_array`].
63
- ///
64
- /// [`from_array`]: #method.from_array.html
85
+ /// Returns the optimal number of bins, according to parameters inferred from observations.
65
86
fn n_bins ( & self ) -> usize ;
66
87
}
67
88
@@ -72,12 +93,19 @@ struct EquiSpaced<T> {
72
93
max : T ,
73
94
}
74
95
75
- /// Square root (of data size) strategy, used by Excel and other programs
76
- /// for its speed and simplicity.
96
+ /// Square root (of data size) strategy, used by Excel and other programs for its speed and
97
+ /// simplicity.
77
98
///
78
99
/// Let `n` be the number of observations. Then
79
100
///
80
101
/// `n_bins` = `sqrt(n)`
102
+ ///
103
+ /// # Notes
104
+ ///
105
+ /// This strategy requires the data
106
+ ///
107
+ /// - not being empty
108
+ /// - not being constant
81
109
#[ derive( Debug ) ]
82
110
pub struct Sqrt < T > {
83
111
builder : EquiSpaced < T > ,
@@ -86,12 +114,19 @@ pub struct Sqrt<T> {
86
114
/// A strategy that does not take variability into account, only data size. Commonly
87
115
/// overestimates number of bins required.
88
116
///
89
- /// Let `n` be the number of observations and `n_bins` the number of bins.
117
+ /// Let `n` be the number of observations and `n_bins` be the number of bins.
90
118
///
91
119
/// `n_bins` = 2`n`<sup>1/3</sup>
92
120
///
93
121
/// `n_bins` is only proportional to cube root of `n`. It tends to overestimate
94
122
/// the `n_bins` and it does not take into account data variability.
123
+ ///
124
+ /// # Notes
125
+ ///
126
+ /// This strategy requires the data
127
+ ///
128
+ /// - not being empty
129
+ /// - not being constant
95
130
#[ derive( Debug ) ]
96
131
pub struct Rice < T > {
97
132
builder : EquiSpaced < T > ,
@@ -105,24 +140,38 @@ pub struct Rice<T> {
105
140
/// is too conservative for larger, non-normal datasets.
106
141
///
107
142
/// This is the default method in R’s hist method.
143
+ ///
144
+ /// # Notes
145
+ ///
146
+ /// This strategy requires the data
147
+ ///
148
+ /// - not being empty
149
+ /// - not being constant
108
150
#[ derive( Debug ) ]
109
151
pub struct Sturges < T > {
110
152
builder : EquiSpaced < T > ,
111
153
}
112
154
113
- /// Robust (resilient to outliers) strategy that takes into
114
- /// account data variability and data size.
155
+ /// Robust (resilient to outliers) strategy that takes into account data variability and data size.
115
156
///
116
157
/// Let `n` be the number of observations.
117
158
///
118
- /// `bin_width` = 2× `IQR`× `n`<sup>−1/3</sup>
159
+ /// `bin_width` = 2 × `IQR` × `n`<sup>−1/3</sup>
119
160
///
120
161
/// The bin width is proportional to the interquartile range ([`IQR`]) and inversely proportional to
121
- /// cube root of `n`. It can be too conservative for small datasets, but it is quite good for
122
- /// large datasets.
162
+ /// cube root of `n`. It can be too conservative for small datasets, but it is quite good for large
163
+ /// datasets.
123
164
///
124
165
/// The [`IQR`] is very robust to outliers.
125
166
///
167
+ /// # Notes
168
+ ///
169
+ /// This strategy requires the data
170
+ ///
171
+ /// - not being empty
172
+ /// - not being constant
173
+ /// - having positive [`IQR`]
174
+ ///
126
175
/// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
127
176
#[ derive( Debug ) ]
128
177
pub struct FreedmanDiaconis < T > {
@@ -135,16 +184,25 @@ enum SturgesOrFD<T> {
135
184
FreedmanDiaconis ( FreedmanDiaconis < T > ) ,
136
185
}
137
186
138
- /// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies.
139
- /// Provides good all around performance.
187
+ /// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all around
188
+ /// performance.
189
+ ///
190
+ /// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be
191
+ /// chosen, while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly
192
+ /// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for small and large datasets
193
+ /// respectively.
140
194
///
141
- /// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be chosen,
142
- /// while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly
143
- /// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for
144
- /// small and large datasets respectively.
195
+ /// # Notes
196
+ ///
197
+ /// This strategy requires the data
198
+ ///
199
+ /// - not being empty
200
+ /// - not being constant
201
+ /// - having positive [`IQR`]
145
202
///
146
203
/// [`Sturges`]: struct.Sturges.html
147
204
/// [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
205
+ /// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
148
206
#[ derive( Debug ) ]
149
207
pub struct Auto < T > {
150
208
builder : SturgesOrFD < T > ,
0 commit comments