From 2d1baf30612ea7e13cb5edfe5e83e7b6c57acdfa Mon Sep 17 00:00:00 2001 From: relf Date: Sun, 31 Dec 2023 13:17:41 +0100 Subject: [PATCH] WIP --- gp/src/lib.rs | 5 + gp/src/parameters.rs | 10 +- gp/src/sgp_algorithm.rs | 634 +++++++++++++++++++++++++++++++++++++++ gp/src/sgp_parameters.rs | 185 ++++++++++++ 4 files changed, 829 insertions(+), 5 deletions(-) create mode 100644 gp/src/sgp_algorithm.rs create mode 100644 gp/src/sgp_parameters.rs diff --git a/gp/src/lib.rs b/gp/src/lib.rs index c536a069..4368146e 100644 --- a/gp/src/lib.rs +++ b/gp/src/lib.rs @@ -88,10 +88,15 @@ mod algorithm; pub mod correlation_models; mod errors; pub mod mean_models; +mod sgp_algorithm; + mod parameters; +mod sgp_parameters; mod utils; pub use algorithm::*; pub use errors::*; pub use parameters::*; +pub use sgp_algorithm::*; +pub use sgp_parameters::*; pub use utils::NormalizedMatrix; diff --git a/gp/src/parameters.rs b/gp/src/parameters.rs index b1068bef..64e6578d 100644 --- a/gp/src/parameters.rs +++ b/gp/src/parameters.rs @@ -7,15 +7,15 @@ use linfa::{Float, ParamGuard}; #[derive(Clone, Debug, PartialEq, Eq)] pub struct GpValidParams, Corr: CorrelationModel> { /// Parameter of the autocorrelation model - theta: Option>, + pub(crate) theta: Option>, /// Regression model representing the mean(x) - mean: Mean, + pub(crate) mean: Mean, /// Correlation model representing the spatial correlation between errors at e(x) and e(x') - corr: Corr, + pub(crate) corr: Corr, /// Optionally apply dimension reduction (KPLS) or not - kpls_dim: Option, + pub(crate) kpls_dim: Option, /// Parameter to improve numerical stability - nugget: F, + pub(crate) nugget: F, } impl Default for GpValidParams { diff --git a/gp/src/sgp_algorithm.rs b/gp/src/sgp_algorithm.rs new file mode 100644 index 00000000..4dc36390 --- /dev/null +++ b/gp/src/sgp_algorithm.rs @@ -0,0 +1,634 @@ +use crate::correlation_models::*; +use crate::errors::{GpError, Result}; +use crate::mean_models::*; +use crate::sgp_parameters::{SgpParams, SgpValidParams, SparseMethod}; +use crate::utils::{pairwise_differences, NormalizedMatrix}; +use egobox_doe::{Lhs, SamplingMethod}; +use linfa::prelude::{Dataset, DatasetBase, Fit, Float, PredictInplace}; +use linfa_linalg::{cholesky::*, eigh::*, qr::*, svd::*, triangular::*}; +use linfa_pls::PlsRegression; +use ndarray::{ + arr1, concatenate, s, stack, Array, Array1, Array2, ArrayBase, Axis, Data, Ix2, Zip, +}; +use ndarray_einsum_beta::*; +use ndarray_stats::QuantileExt; + +use rand_xoshiro::rand_core::SeedableRng; +use rand_xoshiro::Xoshiro256Plus; +#[cfg(feature = "serializable")] +use serde::{Deserialize, Serialize}; +use std::fmt; + +// use ndarray_rand::rand_distr::Normal; +// use ndarray_rand::RandomExt; + +// const LOG10_20: f64 = 1.301_029_995_663_981_3; //f64::log10(20.); +const N_START: usize = 10; // number of optimization restart (aka multistart) + +/// Woodbury data computed during training and used for prediction +#[derive(Debug, Serialize, Deserialize)] +struct WoodburyData { + vec: Array1, + inv: Array2, +} +impl Clone for WoodburyData { + fn clone(&self) -> Self { + WoodburyData { + vec: self.vec.to_owned(), + inv: self.inv.clone(), + } + } +} + +/// Structure for trained Gaussian Process model +#[derive(Debug)] +#[cfg_attr( + feature = "serializable", + derive(Serialize, Deserialize), + serde(bound(serialize = "F: Serialize", deserialize = "F: Deserialize<'de>")) +)] +pub struct SparseGaussianProcess, Corr: CorrelationModel> { + /// Regression model + #[cfg_attr( + feature = "serializable", + 
serde(bound(serialize = "Mean: Serialize", deserialize = "Mean: Deserialize<'de>")) + )] + mean: Mean, + /// Correlation kernel + #[cfg_attr( + feature = "serializable", + serde(bound(serialize = "Corr: Serialize", deserialize = "Corr: Deserialize<'de>")) + )] + corr: Corr, + /// Sparse method used + method: SparseMethod, + /// Parameter of the autocorrelation model + theta: Array1, + /// Gaussian process variance + sigma2: F, + /// Gaussian noise variance + noise: F, + /// Weights in case of KPLS dimension reduction coming from PLS regression (orig_dim, kpls_dim) + w_star: Array2, + /// Training inputs + xtrain: Array2, + /// Training outputs + ytrain: Array2, + /// Data used for prediction + w_data: WoodburyData, +} + +/// Kriging as GP special case when using constant mean and squared exponential correlation +pub type SparseKriging = SgpParams; + +impl SparseKriging { + pub fn params() -> SgpParams { + SgpParams::new(ConstantMean(), SquaredExponentialCorr()) + } +} + +impl, Corr: CorrelationModel> Clone + for SparseGaussianProcess +{ + fn clone(&self) -> Self { + Self { + mean: self.mean, + corr: self.corr, + method: self.method.clone(), + theta: self.theta.to_owned(), + sigma2: self.sigma2, + noise: self.noise, + w_star: self.w_star.to_owned(), + xtrain: self.xtrain.clone(), + ytrain: self.xtrain.clone(), + w_data: self.w_data.clone(), + } + } +} + +impl, Corr: CorrelationModel> fmt::Display + for SparseGaussianProcess +{ + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "SGP({}, {})", self.mean, self.corr) + } +} + +impl, Corr: CorrelationModel> + SparseGaussianProcess +{ + /// Gp parameters contructor + pub fn params, NewCorr: CorrelationModel>( + mean: NewMean, + corr: NewCorr, + ) -> SgpParams { + SgpParams::new(mean, corr) + } + + /// Predict output values at n given `x` points of nx components specified as a (n, nx) matrix. + /// Returns n scalar output values as (n, 1) column vector. + pub fn predict_values(&self, x: &ArrayBase, Ix2>) -> Result> { + todo!() + } + + /// Predict variance values at n given `x` points of nx components specified as a (n, nx) matrix. + /// Returns n variance values as (n, 1) column vector. + pub fn predict_variances(&self, x: &ArrayBase, Ix2>) -> Result> { + todo!() + } + + /// Retrieve number of PLS components 1 <= n <= x dimension + pub fn kpls_dim(&self) -> Option { + if self.w_star.ncols() < self.xtrain.ncols() { + Some(self.w_star.ncols()) + } else { + None + } + } + + /// Retrieve input dimension before kpls dimension reduction if any + pub fn input_dim(&self) -> usize { + self.ytrain.ncols() + } + + /// Retrieve output dimension + pub fn output_dim(&self) -> usize { + self.ytrain.ncols() + } +} + +impl PredictInplace, Array2> + for SparseGaussianProcess +where + F: Float, + D: Data, + Mean: RegressionModel, + Corr: CorrelationModel, +{ + fn predict_inplace(&self, x: &ArrayBase, y: &mut Array2) { + assert_eq!( + x.nrows(), + y.nrows(), + "The number of data points must match the number of output targets." + ); + + let values = self.predict_values(x).expect("GP Prediction"); + *y = values; + } + + fn default_target(&self, x: &ArrayBase) -> Array2 { + Array2::zeros((x.nrows(), self.output_dim())) + } +} + +/// Gausssian Process adaptator to implement `linfa::Predict` trait for variance prediction. 
+struct GpVariancePredictor<'a, F, Mean, Corr>(&'a SparseGaussianProcess) +where + F: Float, + Mean: RegressionModel, + Corr: CorrelationModel; + +impl<'a, F, D, Mean, Corr> PredictInplace, Array2> + for GpVariancePredictor<'a, F, Mean, Corr> +where + F: Float, + D: Data, + Mean: RegressionModel, + Corr: CorrelationModel, +{ + fn predict_inplace(&self, x: &ArrayBase, y: &mut Array2) { + assert_eq!( + x.nrows(), + y.nrows(), + "The number of data points must match the number of output targets." + ); + + let values = self.0.predict_variances(x).expect("GP Prediction"); + *y = values; + } + + fn default_target(&self, x: &ArrayBase) -> Array2 { + Array2::zeros((x.nrows(), self.0.output_dim())) + } +} + +impl, Corr: CorrelationModel, D: Data> + Fit, ArrayBase, GpError> for SgpValidParams +{ + type Object = SparseGaussianProcess; + + /// Fit GP parameters using maximum likelihood + fn fit( + &self, + dataset: &DatasetBase, ArrayBase>, + ) -> Result { + let x = dataset.records(); + let y = dataset.targets(); + if let Some(d) = self.kpls_dim() { + if *d > x.ncols() { + return Err(GpError::InvalidValue(format!( + "Dimension reduction {} should be smaller than actual \ + training input dimensions {}", + d, + x.ncols() + ))); + }; + } + + let xtrain = x.to_owned(); + let ytrain = y.to_owned(); + + let mut w_star = Array2::eye(x.ncols()); + if let Some(n_components) = self.kpls_dim() { + let ds = Dataset::new(x.to_owned(), y.to_owned()); + w_star = PlsRegression::params(*n_components).fit(&ds).map_or_else( + |e| match e { + linfa_pls::PlsError::PowerMethodConstantResidualError() => { + Ok(Array2::zeros((x.ncols(), *n_components))) + } + err => Err(err), + }, + |v| Ok(v.rotations().0.to_owned()), + )?; + }; + let theta0: Array1<_> = self + .initial_theta() + .clone() + .map_or(Array1::from_elem(w_star.ncols(), F::cast(1e-2)), |v| { + Array::from_vec(v) + }); + let sigma2_0 = F::cast(1e-2); + let noise_0 = F::cast(1e-2); + + let n = theta0.len() + 2; + let mut params_0 = Array1::zeros(n); + params_0.slice_mut(s![..n - 2]).assign(&theta0); + params_0[n - 2] = sigma2_0; + params_0[n - 1] = noise_0; + + let base: f64 = 10.; + let objfn = |x: &[f64], _gradient: Option<&mut [f64]>, _params: &mut ()| -> f64 { + for v in x.iter() { + // check theta as optimizer may give nan values + if v.is_nan() { + // shortcut return worst value wrt to rlf minimization + return f64::INFINITY; + } + } + let input = Array1::from_shape_vec( + (x.len(),), + x.iter().map(|v| F::cast(base.powf(*v))).collect(), + ) + .unwrap(); + + let theta = input.slice(s![..input.len() - 2]); + let sigma2 = input[input.len() - 2]; + let noise = input[input.len() - 1]; + + let theta = theta.mapv(F::cast); + match self.reduced_likelihood( + &theta, + sigma2, + noise, + &w_star, + &xtrain, + &ytrain, + self.nugget(), + ) { + Ok(r) => unsafe { -(*(&r.0 as *const F as *const f64)) }, + Err(_) => f64::INFINITY, + } + }; + + // Multistart: user theta0 + 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1., 10. + let mut params = Array2::zeros((N_START + 1, params_0.len())); + params.row_mut(0).assign(¶ms_0.mapv(|v| F::log10(v))); + let mut xlimits: Array2 = Array2::zeros((params_0.len(), 2)); + for mut row in xlimits.rows_mut() { + row.assign(&arr1(&[F::cast(-6), F::cast(2)])); + } + // Use a seed here for reproducibility. Do we need to make it truly random + // Probably no, as it is just to get init values spread over + // [1e-6, 20] for multistart thanks to LHS method. 
+ let seeds = Lhs::new(&xlimits) + .kind(egobox_doe::LhsKind::Maximin) + .with_rng(Xoshiro256Plus::seed_from_u64(42)) + .sample(N_START); + Zip::from(params.slice_mut(s![1.., ..]).rows_mut()) + .and(seeds.rows()) + .par_for_each(|mut theta, row| theta.assign(&row)); + + let opt_params = params.map_axis(Axis(1), |p| optimize_theta(objfn, &p.to_owned())); + let opt_index = opt_params.map(|(_, opt_f)| opt_f).argmin().unwrap(); + let opt_params = &(opt_params[opt_index]).0.mapv(F::cast); + // println!("opt_theta={}", opt_theta); + let opt_theta = opt_params.slice(s![..n - 2]).to_owned(); + let opt_sigma2 = opt_params[n - 2]; + let opt_noise = opt_params[n - 2]; + let (_, w_data) = self.reduced_likelihood( + &opt_theta, + opt_sigma2, + opt_noise, + &w_star, + &xtrain, + &ytrain, + self.nugget(), + )?; + Ok(SparseGaussianProcess { + mean: *self.mean(), + corr: *self.corr(), + method: self.method(), + theta: opt_theta, + sigma2: opt_sigma2, + noise: opt_noise, + w_data, + w_star, + xtrain: xtrain.to_owned(), + ytrain: ytrain.to_owned(), + }) + } +} + +impl, Corr: CorrelationModel> SgpValidParams { + /// Constructor of valid params from values + #[doc(hidden)] + pub fn from( + mean: Mean, + corr: Corr, + method: SparseMethod, + theta: Array1, + sigma2: F, + noise: F, + w_star: Array2, + xtrain: Array2, + ytrain: Array2, + w_data: WoodburyData, + ) -> Result> { + // TODO: add some consistency checks + Ok(SparseGaussianProcess { + mean, + corr, + method, + theta, + sigma2, + noise, + w_star, + xtrain, + ytrain, + w_data, + }) + } + + /// Compute reduced likelihood function + /// nugget: factor to improve numerical stability + pub fn reduced_likelihood( + &self, + theta: &Array1, + sigma2: F, + noise: F, + w_star: &Array2, + xtrain: &Array2, + ytrain: &Array2, + nugget: F, + ) -> Result<(F, WoodburyData)> { + let (likelihood, w_data) = match self.method() { + SparseMethod::Fitc => self.fitc(theta, sigma2, noise, w_star, xtrain, ytrain, nugget), + SparseMethod::Vfe => self.vfe(theta, sigma2, noise, nugget), + }; + + Ok((likelihood, w_data)) + } + + /// Compute covariance matrix between a and b matrices + fn compute_k( + &self, + a: &ArrayBase, Ix2>, + b: &ArrayBase, Ix2>, + w_star: &Array2, + theta: &Array1, + sigma2: F, + ) -> Array2 { + // Get pairwise componentwise L1-distances to the input training set + let dx = pairwise_differences(a, b); + // Compute the correlation function + let r = self.corr().value(&dx, &theta, w_star); + r.into_shape((a.nrows(), b.nrows())) + .unwrap() + .mapv(|v| v * sigma2) + } + + /// FITC method + fn fitc( + &self, + theta: &Array1, + sigma2: F, + noise: F, + w_star: &Array2, + xtrain: &Array2, + ytrain: &Array2, + nugget: F, + ) -> (F, WoodburyData) { + let z = self.inducings(); + let nz = z.nrows(); + // Compute: diag(Knn), Kmm and Kmn + let knn = Array1::from_elem(xtrain.nrows(), sigma2); + let kmm = self.compute_k(&z, &z, w_star, theta, sigma2) + Array::eye(nz) * nugget; + let kmn = self.compute_k(&z, xtrain, w_star, theta, sigma2); + + // Compute (lower) Cholesky decomposition: Kmm = U U^T + let u = kmm.cholesky().unwrap(); + //U = linalg.cholesky(Kmm, lower=True) + + // Compute (upper) Cholesky decomposition: Qnn = V^T V + let ui = u + .solve_triangular(&Array::eye(u.nrows()), UPLO::Lower) + .unwrap(); + // Ui = linalg.inv(U) + let v = ui.dot(&kmm); + //V = Ui @ Kmn + + // Assumption on the gaussian noise on training outputs + let eta2 = noise; + + // Compute diagonal correction: nu = Knn_diag - Qnn_diag + \eta^2 + let nu = knn - (v * v).sum_axis(Axis(0)) + eta2; 
+ //let nu = knn - np.sum(np.square(V), 0) + eta2; + // Compute beta, the effective noise precision + let beta = nu.mapv(|v| F::one() / v); + //beta = 1.0 / nu + + // Compute (lower) Cholesky decomposition: A = I + V diag(beta) V^T = L L^T + let a = Array::eye(nz) + &(v * beta.insert_axis(Axis(0))).dot(&v.t()); + // A = np.eye(self.nz) + V * beta @ V.T + + let l = a.cholesky().unwrap(); + // L = linalg.cholesky(A, lower=True) + let li = l + .solve_triangular(&Array::eye(l.nrows()), UPLO::Lower) + .unwrap(); + // Li = linalg.inv(L) + + // Compute a and b + let a = einsum("ij,j->ij", &[ytrain, &beta]) + .unwrap() + .into_dimensionality::() + .unwrap(); + // a = np.einsum("ij,i->ij", Y, beta) # avoid reshape for mat-vec multiplication + let tmp = li.dot(&v); + let b = tmp.dot(&a); + // b = Li @ V @ a + + // Compute marginal log-likelihood + // constant term ignored in reduced likelihood + //let term0 = self.ytrain.nrows() * F::cast(2. * std::f64::consts::PI); + let term1 = nu.mapv(|v| v.ln()).sum(); + let term2 = l.slice(s![.., ..; l.nrows()]).mapv(|v| v.ln()).sum(); + let term3 = (a.t().to_owned()).dot(ytrain)[[0, 0]]; + let term4 = einsum("ij,ij->", &[&b, &b]).unwrap()[[0, 0]]; + let likelihood = term1 + term2 + term3 + term4; + // likelihood = -0.5 * ( + // # num_data * np.log(2.0 * np.pi) # constant term ignored in reduced likelihood + // +np.sum(np.log(nu)) + // + 2.0 * np.sum(np.log(np.diag(L))) + // + a.T @ Y + // - np.einsum("ij,ij->", b, b) + // ) + + // Store Woodbury vectors for prediction step + // LiUi = Li @ Ui + // LiUiT = LiUi.T + // woodbury_vec = LiUiT @ b + // woodbury_inv = Ui.T @ Ui - LiUiT @ LiUi + let li_ui = li.dot(&ui); + let li_ui_t = li_ui.t(); + + let w_data = WoodburyData { + vec: li_ui.dot(&b), + inv: (ui.t()).dot(&ui) - li_ui_t.dot(li_ui), + }; + + (likelihood, w_data) + } + + fn vfe(&self, theta: &Array1, variance: F, noise: F, nugget: F) -> (F, WoodburyData) { + todo!() + } +} + +/// Optimize gp hyper parameter theta given an initial guess `theta0` +#[cfg(not(feature = "nlopt"))] +fn optimize_theta(objfn: ObjF, theta0: &Array1) -> (Array1, f64) +where + ObjF: Fn(&[f64], Option<&mut [f64]>, &mut ()) -> f64, + F: Float, +{ + use cobyla::{minimize, Func, StopTols}; + + let base: f64 = 10.; + let cons: Vec<&dyn Func<()>> = vec![]; + let theta_init = theta0 + .map(|v| unsafe { *(v as *const F as *const f64) }) + .into_raw_vec(); + + let initial_step = 0.5; + let ftol_rel = 1e-4; + let maxeval = 15 * theta0.len(); + let bounds = vec![(-6., 2.); theta0.len()]; + + match minimize( + |x, u| objfn(x, None, u), + &theta_init, + &bounds, + &cons, + (), + maxeval, + cobyla::RhoBeg::All(initial_step), + Some(StopTols { + ftol_rel, + ..StopTols::default() + }), + ) { + Ok((_, x_opt, fval)) => { + let thetas_opt = arr1(&x_opt).mapv(|v| base.powf(v)); + let fval = if f64::is_nan(fval) { + f64::INFINITY + } else { + fval + }; + (thetas_opt, fval) + } + Err((status, x_opt, _)) => { + println!("ERROR Cobyla optimizer in GP status={:?}", status); + (arr1(&x_opt).mapv(|v| base.powf(v)), f64::INFINITY) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_abs_diff_eq; + use ndarray::{arr2, Array}; + use ndarray_npy::write_npy; + use ndarray_rand::rand::SeedableRng; + use ndarray_rand::rand_distr::{Normal, StandardNormal}; + use ndarray_rand::RandomExt; + use rand_xoshiro::Xoshiro256Plus; + + const PI: f64 = std::f64::consts::PI; + + fn f_obj(x: &ArrayBase, Ix2>) -> Array2 { + x.mapv(|v| (3. * PI * v).sin() + 0.3 * (9. * PI * v).cos() + 0.5 * (7. 
* PI * v).sin()) + } + + #[test] + fn test_sgp() { + let mut rng = Xoshiro256Plus::seed_from_u64(42); + // Generate training data + let nt = 200; + // Variance of the gaussian noise on our trainingg data + let eta2: f64 = 0.01; + let normal = Normal::new(0., eta2.sqrt()).unwrap(); + let gaussian_noise = Array::::random_using((nt, 1), normal, &mut rng); + let xt = 2. * Array::::random_using((nt, 1), StandardNormal, &mut rng) - 1.; + let yt = f_obj(&xt) + gaussian_noise; + + let xplot = Array::linspace(0., 4., 100).insert_axis(Axis(1)); + + let gp = SparseGaussianProcess::::params( + ConstantMean::default(), + SquaredExponentialCorr::default(), + ) + .initial_theta(Some(vec![0.1])) + .fit(&Dataset::new(xt, yt)) + .expect("GP fit error"); + + let yvals = gp + .predict_values(&arr2(&[[1.0], [3.5]])) + .expect("prediction error"); + let expected_y = arr2(&[[1.0], [0.9]]); + assert_abs_diff_eq!(expected_y, yvals, epsilon = 0.5); + + let sgpr_vals = gp.predict_values(&xplot).unwrap(); + + let yvars = gp + .predict_variances(&arr2(&[[1.0], [3.5]])) + .expect("prediction error"); + let expected_vars = arr2(&[[0.], [0.1]]); + assert_abs_diff_eq!(expected_vars, yvars, epsilon = 0.5); + + let sgpr_vars = gp.predict_variances(&xplot).unwrap(); + + let test_dir = "target/tests"; + std::fs::create_dir_all(test_dir).ok(); + + let file_path = format!("{}/sgp_x.npy", test_dir); + write_npy(file_path, &xplot).expect("x saved"); + + let file_path = format!("{}/sgp_vals.npy", test_dir); + write_npy(file_path, &sgpr_vals).expect("sgp vals saved"); + + let file_path = format!("{}/sgp_vars.npy", test_dir); + write_npy(file_path, &sgpr_vars).expect("sgp vars saved"); + } +} diff --git a/gp/src/sgp_parameters.rs b/gp/src/sgp_parameters.rs new file mode 100644 index 00000000..6e26fe22 --- /dev/null +++ b/gp/src/sgp_parameters.rs @@ -0,0 +1,185 @@ +use crate::correlation_models::{CorrelationModel, SquaredExponentialCorr}; +use crate::errors::{GpError, Result}; +use crate::mean_models::{ConstantMean, RegressionModel}; +use crate::parameters::GpValidParams; +use linfa::{Float, ParamGuard}; +use ndarray::Array2; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum NoiseVarianceConfig { + Constant(F), + Estimated { + initial_guess: F, + lower_bound: F, + upper_bound: F, + }, +} +impl Default for NoiseVarianceConfig { + fn default() -> NoiseVarianceConfig { + Self::Constant(F::one()) + } +} + +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] +pub enum SparseMethod { + Fitc, + Vfe, +} +impl Default for SparseMethod { + fn default() -> Self { + SparseMethod::Fitc + } +} + +/// A set of validated SGP parameters. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct SgpValidParams, Corr: CorrelationModel> { + /// gp + gp_params: GpValidParams, + /// Gaussian homeoscedastic noise variance + noise: NoiseVarianceConfig, + /// Inducing points + z: Option>, + /// Method + method: SparseMethod, +} + +impl Default for SgpValidParams { + fn default() -> SgpValidParams { + SgpValidParams { + gp_params: GpValidParams::default(), + noise: NoiseVarianceConfig::default(), + z: None, + method: SparseMethod::default(), + } + } +} + +impl, Corr: CorrelationModel> SgpValidParams { + /// Get starting theta value for optimization + pub fn initial_theta(&self) -> &Option> { + &self.gp_params.theta + } + + /// Get mean model + pub fn mean(&self) -> &Mean { + &self.gp_params.mean + } + + /// Get correlation corr k(x, x') + pub fn corr(&self) -> &Corr { + &self.gp_params.corr + } + + /// Get number of components used by PLS + pub fn kpls_dim(&self) -> &Option { + &self.gp_params.kpls_dim + } + + /// Get number of components used by PLS + pub fn nugget(&self) -> F { + self.gp_params.nugget + } + + /// Get used sparse method + pub fn method(&self) -> SparseMethod { + self.method + } + + /// Get inducing points + pub fn inducings(&self) -> Array2 { + self.z + } +} + +#[derive(Clone, Debug)] +/// The set of hyperparameters that can be specified for the execution of +/// the [GP algorithm](struct.GaussianProcess.html). +pub struct SgpParams, Corr: CorrelationModel>( + SgpValidParams, +); + +impl, Corr: CorrelationModel> SgpParams { + /// A constructor for GP parameters given mean and correlation models + pub fn new(mean: Mean, corr: Corr) -> SgpParams { + Self(SgpValidParams { + gp_params: GpValidParams { + theta: None, + mean, + corr, + kpls_dim: None, + nugget: F::cast(100.0) * F::epsilon(), + }, + noise: NoiseVarianceConfig::default(), + z: None, + method: SparseMethod::default(), + }) + } + + /// Set initial value for theta hyper parameter. + /// + /// During training process, the internal optimization is started from `initial_theta`. + pub fn initial_theta(mut self, theta: Option>) -> Self { + self.0.gp_params.theta = theta; + self + } + + /// Set mean model. + pub fn mean(mut self, mean: Mean) -> Self { + self.0.gp_params.mean = mean; + self + } + + /// Set correlation model. + pub fn corr(mut self, corr: Corr) -> Self { + self.0.gp_params.corr = corr; + self + } + + /// Set number of PLS components. + /// Should be 0 < n < pb size (i.e. x dimension) + pub fn kpls_dim(mut self, kpls_dim: Option) -> Self { + self.0.gp_params.kpls_dim = kpls_dim; + self + } + + /// Set nugget. + /// + /// Nugget is used to improve numerical stability + pub fn nugget(mut self, nugget: F) -> Self { + self.0.gp_params.nugget = nugget; + self + } +} + +impl, Corr: CorrelationModel> ParamGuard + for SgpParams +{ + type Checked = SgpValidParams; + type Error = GpError; + + fn check_ref(&self) -> Result<&Self::Checked> { + if let Some(d) = self.0.gp_params.kpls_dim { + if d == 0 { + return Err(GpError::InvalidValue("`kpls_dim` canot be 0!".to_string())); + } + if let Some(theta) = self.0.initial_theta() { + if theta.len() > 1 && d > theta.len() { + return Err(GpError::InvalidValue(format!( + "Dimension reduction ({}) should be smaller than expected + training input size infered from given initial theta length ({})", + d, + theta.len() + ))); + }; + } + } + Ok(&self.0) + } + + fn check(self) -> Result { + self.check_ref()?; + Ok(self.0) + } +}