histogram/strategies.rs: Improved documentation

lebensterben · lebensterben · commit a479cbacac12 · 2020-04-16T02:05:35.000-04:00
- Added more information in mod-level documentation, including
  - A section for strategies currently implemented
  - A section for requirement of data to infer optimal parameters
- Reformatting
diff --git a/src/histogram/strategies.rs b/src/histogram/strategies.rs
@@ -1,67 +1,88 @@
-//! Strategies to build [`Bins`]s and [`Grid`]s (using [`GridBuilder`]) inferring
-//! optimal parameters directly from data.
+//! Strategies used by [`GridBuilder`] to infer optimal parameters from data for building [`Bins`]
+//! and [`Grid`] instances.
 //!
 //! The docs for each strategy have been taken almost verbatim from [`NumPy`].
 //!
-//! Each strategy specifies how to compute the optimal number of [`Bins`] or
-//! the optimal bin width.
-//! For those strategies that prescribe the optimal number
-//! of [`Bins`] we then compute the optimal bin width with
+//! Each strategy specifies how to compute the optimal number of [`Bins`] or the optimal bin width.
+//! For those strategies that prescribe the optimal number of [`Bins`], the optimal bin width is
+//! computed by `bin_width = (max - min)/n`.
 //!
-//! `bin_width = (max - min)/n`
+//! Since all bins are left-closed and right-open, an extra bin is added when necessary to include
+//! the maximum value of the given data, so that no data is discarded.
 //!
-//! All our bins are left-inclusive and right-exclusive: we make sure to add an extra bin
-//! if it is necessary to include the maximum value of the array that has been passed as argument
-//! to the `from_array` method.
+//! # Strategies
 //!
+//! Currently, the following strategies are implemented:
+//!
+//! - [`Auto`]: Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all
+//!   around performance.
+//! - [`FreedmanDiaconis`]: Robust (resilient to outliers) strategy that takes into account data
+//!   variability and data size.
+//! - [`Rice`]: A strategy that does not take variability into account, only data size. Commonly
+//!   overestimates number of bins required.
+//! - [`Sqrt`]: Square root (of data size) strategy, used by Excel and other programs
+//!   for its speed and simplicity.
+//! - [`Sturges`]: R’s default strategy, only accounts for data size. Only optimal for gaussian data
+//!   and underestimates number of bins for large non-gaussian datasets.
+//!
+//! # Notes
+//!
+//! In general, successful inference of the optimal bin width and number of bins relies on
+//! **variability** of data. In other words, the provided observations should not be empty or
+//! constant.
+//!
+//! In addition, [`Auto`] and [`FreedmanDiaconis`] require the [`interquartile range (IQR)`][iqr],
+//! i.e. the difference between upper and lower quartiles, to be positive.
+//!
+//! [`GridBuilder`]: ../struct.GridBuilder.html
 //! [`Bins`]: ../struct.Bins.html
 //! [`Grid`]: ../struct.Grid.html
-//! [`GridBuilder`]: ../struct.GridBuilder.html
 //! [`NumPy`]: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges
-use super::super::interpolate::Nearest;
-use super::super::{Quantile1dExt, QuantileExt};
-use super::errors::BinsBuildError;
-use super::{Bins, Edges};
-use ndarray::prelude::*;
-use ndarray::Data;
+//! [`Auto`]: struct.Auto.html
+//! [`Sturges`]: struct.Sturges.html
+//! [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
+//! [`Rice`]: struct.Rice.html
+//! [`Sqrt`]: struct.Sqrt.html
+//! [iqr]: https://en.wikipedia.org/wiki/Interquartile_range
+use crate::{
+    histogram::{errors::BinsBuildError, Bins, Edges},
+    quantile::{interpolate::Nearest, Quantile1dExt, QuantileExt},
+};
+use ndarray::{prelude::*, Data};
 use noisy_float::types::n64;
 use num_traits::{FromPrimitive, NumOps, Zero};
 
-/// A trait implemented by all strategies to build [`Bins`]
-/// with parameters inferred from observations.
+/// A trait implemented by all strategies to build [`Bins`] with parameters inferred from
+/// observations.
 ///
-/// A `BinsBuildingStrategy` is required by [`GridBuilder`]
-/// to know how to build a [`Grid`]'s projections on the
+/// This is required by [`GridBuilder`] to know how to build a [`Grid`]'s projections on the
 /// coordinate axes.
 ///
 /// [`Bins`]: ../struct.Bins.html
-/// [`Grid`]: ../struct.Grid.html
 /// [`GridBuilder`]: ../struct.GridBuilder.html
+/// [`Grid`]: ../struct.Grid.html
 pub trait BinsBuildingStrategy {
     type Elem: Ord;
-    /// Given some observations in a 1-dimensional array it returns a `BinsBuildingStrategy`
-    /// that has learned the required parameter to build a collection of [`Bins`].
+    /// Returns a strategy that has learnt the required parameter for building [`Bins`] for a given
+    /// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter
+    /// with the given data and specified strategy.
     ///
-    /// It returns `Err` if it is not possible to build a collection of
-    /// [`Bins`] given the observed data according to the chosen strategy.
+    /// # Errors
+    ///
+    /// See the struct-level documentation of each implementor for the errors it may return.
     ///
     /// [`Bins`]: ../struct.Bins.html
     fn from_array<S>(array: &ArrayBase<S, Ix1>) -> Result<Self, BinsBuildError>
     where
         S: Data<Elem = Self::Elem>,
         Self: std::marker::Sized;
 
-    /// Returns a [`Bins`] instance, built accordingly to the parameters
-    /// inferred from observations in [`from_array`].
+    /// Returns a [`Bins`] instance, according to parameters inferred from observations.
     ///
     /// [`Bins`]: ../struct.Bins.html
-    /// [`from_array`]: #method.from_array.html
     fn build(&self) -> Bins<Self::Elem>;
 
-    /// Returns the optimal number of bins, according to the parameters
-    /// inferred from observations in [`from_array`].
-    ///
-    /// [`from_array`]: #method.from_array.html
+    /// Returns the optimal number of bins, according to parameters inferred from observations.
     fn n_bins(&self) -> usize;
 }
 
@@ -72,12 +93,19 @@ struct EquiSpaced<T> {
     max: T,
 }
 
-/// Square root (of data size) strategy, used by Excel and other programs
-/// for its speed and simplicity.
+/// Square root (of data size) strategy, used by Excel and other programs for its speed and
+/// simplicity.
 ///
 /// Let `n` be the number of observations. Then
 ///
 /// `n_bins` = `sqrt(n)`
+///
+/// # Notes
+///
+/// This strategy requires the data
+///
+/// - not being empty
+/// - not being constant
 #[derive(Debug)]
 pub struct Sqrt<T> {
     builder: EquiSpaced<T>,
@@ -86,12 +114,19 @@ pub struct Sqrt<T> {
 /// A strategy that does not take variability into account, only data size. Commonly
 /// overestimates number of bins required.
 ///
-/// Let `n` be the number of observations and `n_bins` the number of bins.
+/// Let `n` be the number of observations and `n_bins` be the number of bins.
 ///
 /// `n_bins` = 2`n`<sup>1/3</sup>
 ///
 /// `n_bins` is only proportional to cube root of `n`. It tends to overestimate
 /// the `n_bins` and it does not take into account data variability.
+///
+/// # Notes
+///
+/// This strategy requires the data
+///
+/// - not being empty
+/// - not being constant
 #[derive(Debug)]
 pub struct Rice<T> {
     builder: EquiSpaced<T>,
@@ -105,24 +140,38 @@ pub struct Rice<T> {
 /// is too conservative for larger, non-normal datasets.
 ///
 /// This is the default method in R’s hist method.
+///
+/// # Notes
+///
+/// This strategy requires the data
+///
+/// - not being empty
+/// - not being constant
 #[derive(Debug)]
 pub struct Sturges<T> {
     builder: EquiSpaced<T>,
 }
 
-/// Robust (resilient to outliers) strategy that takes into
-/// account data variability and data size.
+/// Robust (resilient to outliers) strategy that takes into account data variability and data size.
 ///
 /// Let `n` be the number of observations.
 ///
-/// `bin_width` = 2×`IQR`×`n`<sup>−1/3</sup>
+/// `bin_width` = 2 × `IQR` × `n`<sup>−1/3</sup>
 ///
 /// The bin width is proportional to the interquartile range ([`IQR`]) and inversely proportional to
-/// cube root of `n`. It can be too conservative for small datasets, but it is quite good for
-/// large datasets.
+/// cube root of `n`. It can be too conservative for small datasets, but it is quite good for large
+/// datasets.
 ///
 /// The [`IQR`] is very robust to outliers.
 ///
+/// # Notes
+///
+/// This strategy requires the data
+///
+/// - not being empty
+/// - not being constant
+/// - having positive [`IQR`]
+///
 /// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
 #[derive(Debug)]
 pub struct FreedmanDiaconis<T> {
@@ -135,16 +184,25 @@ enum SturgesOrFD<T> {
     FreedmanDiaconis(FreedmanDiaconis<T>),
 }
 
-/// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies.
-/// Provides good all around performance.
+/// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all around
+/// performance.
+///
+/// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be
+/// chosen, while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly
+/// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for small and large datasets
+/// respectively.
 ///
-/// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be chosen,
-/// while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly
-/// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for
-/// small and large datasets respectively.
+/// # Notes
+///
+/// This strategy requires the data
+///
+/// - not being empty
+/// - not being constant
+/// - having positive [`IQR`]
 ///
 /// [`Sturges`]: struct.Sturges.html
 /// [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
+/// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
 #[derive(Debug)]
 pub struct Auto<T> {
     builder: SturgesOrFD<T>,