From ad3ac49dde8b9dc2181e31200d0a2d41b441a0f8 Mon Sep 17 00:00:00 2001 From: morenol Date: Thu, 19 Nov 2020 14:19:22 -0400 Subject: [PATCH 01/78] Implement GaussianNB (#27) * feat: Add GaussianNB --- src/naive_bayes/gaussian.rs | 257 ++++++++++++++++++++++++++++++++++++ src/naive_bayes/mod.rs | 2 + 2 files changed, 259 insertions(+) create mode 100644 src/naive_bayes/gaussian.rs diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs new file mode 100644 index 0000000..8e7e37c --- /dev/null +++ b/src/naive_bayes/gaussian.rs @@ -0,0 +1,257 @@ +use crate::error::Failed; +use crate::linalg::row_iter; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use crate::math::vector::RealNumberVector; +use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; +use serde::{Deserialize, Serialize}; + +/// Naive Bayes classifier for categorical features +#[derive(Serialize, Deserialize, Debug, PartialEq)] +struct GaussianNBDistribution { + /// class labels known to the classifier + class_labels: Vec, + /// probability of each class. + class_priors: Vec, + /// variance of each feature per class + sigma: Vec>, + /// mean of each feature per class + theta: Vec>, +} + +impl> NBDistribution for GaussianNBDistribution { + fn prior(&self, class_index: usize) -> T { + if class_index >= self.class_labels.len() { + T::zero() + } else { + self.class_priors[class_index] + } + } + + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { + if class_index < self.class_labels.len() { + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + let mean = self.theta[class_index][feature]; + let variance = self.sigma[class_index][feature]; + likelihood += self.calculate_log_probability(value, mean, variance); + } + likelihood + } else { + T::zero() + } + } + + fn classes(&self) -> &Vec { + &self.class_labels + } +} + +/// `GaussianNB` parameters. 
Use `Default::default()` for default values. +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct GaussianNBParameters { + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub priors: Option>, +} + +impl GaussianNBParameters { + /// Create GaussianNBParameters with specific paramaters. + pub fn new(priors: Option>) -> Self { + Self { priors } + } +} + +impl GaussianNBDistribution { + /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data. + /// * `y` - vector with target values (classes) of length N. + /// * `priors` - Optional vector with prior probabilities of the classes. If not defined, + /// priors are adjusted according to the data. + pub fn fit>( + x: &M, + y: &M::RowVector, + priors: Option>, + ) -> Result { + let (n_samples, n_features) = x.shape(); + let y_samples = y.len(); + if y_samples != n_samples { + return Err(Failed::fit(&format!( + "Size of x should equal size of y; |x|=[{}], |y|=[{}]", + n_samples, y_samples + ))); + } + + if n_samples == 0 { + return Err(Failed::fit(&format!( + "Size of x and y should greater than 0; |x|=[{}]", + n_samples + ))); + } + let y = y.to_vec(); + let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); + + let mut class_count = vec![T::zero(); class_labels.len()]; + + let mut subdataset: Vec>> = vec![vec![]; class_labels.len()]; + + for (row, class_index) in row_iter(x).zip(indices.iter()) { + class_count[*class_index] += T::one(); + subdataset[*class_index].push(row); + } + + let class_priors = if let Some(class_priors) = priors { + if class_priors.len() != class_labels.len() { + return Err(Failed::fit( + "Size of priors provided does not match the number of classes of the data.", + )); + } + class_priors + } else { + class_count + .into_iter() + .map(|c| c / T::from(n_samples).unwrap()) + .collect() + }; + + let subdataset: Vec = subdataset + .into_iter() + 
.map(|v| { + let mut m = M::zeros(v.len(), n_features); + for row in 0..v.len() { + for col in 0..n_features { + m.set(row, col, v[row][col]); + } + } + m + }) + .collect(); + + let (sigma, theta): (Vec>, Vec>) = subdataset + .iter() + .map(|data| (data.var(0), data.mean(0))) + .unzip(); + + Ok(Self { + class_labels, + class_priors, + sigma, + theta, + }) + } + + /// Calculate probability of x equals to a value of a Gaussian distribution given its mean and its + /// variance. + fn calculate_log_probability(&self, value: T, mean: T, variance: T) -> T { + let pi = T::from(std::f64::consts::PI).unwrap(); + -((value - mean).powf(T::two()) / (T::two() * variance)) + - (T::two() * pi).ln() / T::two() + - (variance).ln() / T::two() + } +} + +/// GaussianNB implements the categorical naive Bayes algorithm for categorically distributed data. +#[derive(Serialize, Deserialize, Debug, PartialEq)] +pub struct GaussianNB> { + inner: BaseNaiveBayes>, +} + +impl> GaussianNB { + /// Fits GaussianNB with given data + /// * `x` - training data of size NxM where N is the number of samples and M is the number of + /// features. + /// * `y` - vector with target values (classes) of length N. + /// * `parameters` - additional parameters like class priors. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: GaussianNBParameters, + ) -> Result { + let distribution = GaussianNBDistribution::fit(x, y, parameters.priors)?; + let inner = BaseNaiveBayes::fit(distribution)?; + Ok(Self { inner }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. 
+ pub fn predict(&self, x: &M) -> Result { + self.inner.predict(x) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn run_gaussian_naive_bayes() { + let x = DenseMatrix::from_2d_array(&[ + &[-1., -1.], + &[-2., -1.], + &[-3., -2.], + &[1., 1.], + &[2., 1.], + &[3., 2.], + ]); + let y = vec![1., 1., 1., 2., 2., 2.]; + + let gnb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); + let y_hat = gnb.predict(&x).unwrap(); + assert_eq!(y_hat, y); + assert_eq!( + gnb.inner.distribution.sigma, + &[ + &[0.666666666666667, 0.22222222222222232], + &[0.666666666666667, 0.22222222222222232] + ] + ); + + assert_eq!(gnb.inner.distribution.class_priors, &[0.5, 0.5]); + + assert_eq!( + gnb.inner.distribution.theta, + &[&[-2., -1.3333333333333333], &[2., 1.3333333333333333]] + ); + } + + #[test] + fn run_gaussian_naive_bayes_with_priors() { + let x = DenseMatrix::from_2d_array(&[ + &[-1., -1.], + &[-2., -1.], + &[-3., -2.], + &[1., 1.], + &[2., 1.], + &[3., 2.], + ]); + let y = vec![1., 1., 1., 2., 2., 2.]; + + let priors = vec![0.3, 0.7]; + let parameters = GaussianNBParameters::new(Some(priors.clone())); + let gnb = GaussianNB::fit(&x, &y, parameters).unwrap(); + + assert_eq!(gnb.inner.distribution.class_priors, priors); + } + + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[-1., -1.], + &[-2., -1.], + &[-3., -2.], + &[1., 1.], + &[2., 1.], + &[3., 2.], + ]); + let y = vec![1., 1., 1., 2., 2., 2.]; + + let gnb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); + let deserialized_gnb: GaussianNB> = + serde_json::from_str(&serde_json::to_string(&gnb).unwrap()).unwrap(); + + assert_eq!(gnb, deserialized_gnb); + } +} diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 8a9920e..0268da6 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -65,4 +65,6 @@ impl, D: NBDistribution> BaseNaiveBayes Date: Thu, 19 Nov 2020 16:07:10 -0400 Subject: [PATCH 02/78] 
Add serde to CategoricalNB (#30) * Add serde to CategoricalNB * Implement PartialEq for CategoricalNBDistribution --- src/naive_bayes/categorical.rs | 60 ++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index ae6eb0c..d32c34d 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -6,13 +6,41 @@ use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] struct CategoricalNBDistribution { class_labels: Vec, class_priors: Vec, coefficients: Vec>>, } +impl PartialEq for CategoricalNBDistribution { + fn eq(&self, other: &Self) -> bool { + if self.class_labels == other.class_labels && self.class_priors == other.class_priors { + if self.coefficients.len() != other.coefficients.len() { + return false; + } + for (a, b) in self.coefficients.iter().zip(other.coefficients.iter()) { + if a.len() != b.len() { + return false; + } + for (a_i, b_i) in a.iter().zip(b.iter()) { + if a_i.len() != b_i.len() { + return false; + } + for (a_i_j, b_i_j) in a_i.iter().zip(b_i.iter()) { + if (*a_i_j - *b_i_j).abs() > T::epsilon() { + return false; + } + } + } + } + true + } else { + false + } + } +} + impl> NBDistribution for CategoricalNBDistribution { fn prior(&self, class_index: usize) -> T { if class_index >= self.class_labels.len() { @@ -181,7 +209,7 @@ impl Default for CategoricalNBParameters { } /// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data. -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug, PartialEq)] pub struct CategoricalNB> { inner: BaseNaiveBayes>, } @@ -269,4 +297,32 @@ mod tests { vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1.] 
); } + + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[3., 4., 0., 1.], + &[3., 0., 0., 1.], + &[4., 4., 1., 2.], + &[4., 2., 4., 3.], + &[4., 2., 4., 2.], + &[4., 1., 1., 0.], + &[1., 1., 1., 1.], + &[0., 4., 1., 0.], + &[0., 3., 2., 1.], + &[0., 3., 1., 1.], + &[3., 4., 0., 1.], + &[3., 4., 2., 4.], + &[0., 3., 1., 2.], + &[0., 4., 1., 2.], + ]); + + let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; + let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); + + let deserialized_cnb: CategoricalNB> = + serde_json::from_str(&serde_json::to_string(&cnb).unwrap()).unwrap(); + + assert_eq!(cnb, deserialized_cnb); + } } From 583284e66f3e27c03a393f630bee0b677f05b706 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 24 Nov 2020 19:12:53 -0800 Subject: [PATCH 03/78] feat: adds LASSO --- src/linalg/high_order.rs | 28 ++ src/linalg/mod.rs | 63 ++++ src/linalg/naive/dense_matrix.rs | 60 +++- src/linalg/nalgebra_bindings.rs | 6 + src/linalg/ndarray_bindings.rs | 6 + src/linear/bg_solver.rs | 146 +++++++++ src/linear/lasso.rs | 509 ++++++++++++++++++++++++++++++ src/linear/logistic_regression.rs | 2 +- src/linear/mod.rs | 2 + 9 files changed, 819 insertions(+), 3 deletions(-) create mode 100644 src/linalg/high_order.rs create mode 100644 src/linear/bg_solver.rs create mode 100644 src/linear/lasso.rs diff --git a/src/linalg/high_order.rs b/src/linalg/high_order.rs new file mode 100644 index 0000000..359c4a1 --- /dev/null +++ b/src/linalg/high_order.rs @@ -0,0 +1,28 @@ +//! In this module you will find composite of matrix operations that are used elsewhere +//! for improved efficiency. + +use crate::linalg::BaseMatrix; +use crate::math::num::RealNumber; + +/// High order matrix operations. 
+pub trait HighOrderOperations: BaseMatrix { + /// Y = AB + /// ``` + /// use smartcore::linalg::naive::dense_matrix::*; + /// use smartcore::linalg::high_order::HighOrderOperations; + /// + /// let a = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.]]); + /// let b = DenseMatrix::from_2d_array(&[&[5., 6.], &[7., 8.], &[9., 10.]]); + /// let expected = DenseMatrix::from_2d_array(&[&[71., 80.], &[92., 104.]]); + /// + /// assert_eq!(a.ab(true, &b, false), expected); + /// ``` + fn ab(&self, a_transpose: bool, b: &Self, b_transpose: bool) -> Self { + match (a_transpose, b_transpose) { + (true, true) => self.transpose().matmul(&b.transpose()), + (false, true) => self.matmul(&b.transpose()), + (true, false) => self.transpose().matmul(b), + (false, false) => self.matmul(b), + } + } +} diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index c560b78..1be2e75 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -36,6 +36,7 @@ pub mod cholesky; /// The matrix is represented in terms of its eigenvalues and eigenvectors. pub mod evd; +pub mod high_order; /// Factors a matrix as the product of a lower triangular matrix and an upper triangular matrix. pub mod lu; /// Dense matrix with column-major order that wraps [Vec](https://doc.rust-lang.org/std/vec/struct.Vec.html). @@ -59,6 +60,7 @@ use std::ops::Range; use crate::math::num::RealNumber; use cholesky::CholeskyDecomposableMatrix; use evd::EVDDecomposableMatrix; +use high_order::HighOrderOperations; use lu::LUDecomposableMatrix; use qr::QRDecomposableMatrix; use stats::MatrixStats; @@ -134,6 +136,66 @@ pub trait BaseVector: Clone + Debug { /// Subtract `x` from single element of the vector, write result to original vector. 
fn sub_element_mut(&mut self, pos: usize, x: T); + /// Subtract scalar + fn sub_scalar_mut(&mut self, x: T) -> &Self { + for i in 0..self.len() { + self.set(i, self.get(i) - x); + } + self + } + + /// Subtract scalar + fn add_scalar_mut(&mut self, x: T) -> &Self { + for i in 0..self.len() { + self.set(i, self.get(i) + x); + } + self + } + + /// Subtract scalar + fn mul_scalar_mut(&mut self, x: T) -> &Self { + for i in 0..self.len() { + self.set(i, self.get(i) * x); + } + self + } + + /// Subtract scalar + fn div_scalar_mut(&mut self, x: T) -> &Self { + for i in 0..self.len() { + self.set(i, self.get(i) / x); + } + self + } + + /// Add vectors, element-wise + fn add_scalar(&self, x: T) -> Self { + let mut r = self.clone(); + r.add_scalar_mut(x); + r + } + + /// Subtract vectors, element-wise + fn sub_scalar(&self, x: T) -> Self { + let mut r = self.clone(); + r.sub_scalar_mut(x); + r + } + + /// Multiply vectors, element-wise + fn mul_scalar(&self, x: T) -> Self { + let mut r = self.clone(); + r.mul_scalar_mut(x); + r + } + + /// Divide vectors, element-wise + fn div_scalar(&self, x: T) -> Self { + let mut r = self.clone(); + r.div_scalar_mut(x); + r + } + /// Add vectors, element-wise, overriding original vector with result. 
fn add_mut(&mut self, other: &Self) -> &Self; @@ -557,6 +619,7 @@ pub trait Matrix: + LUDecomposableMatrix + CholeskyDecomposableMatrix + MatrixStats + + HighOrderOperations + PartialEq + Display { diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 7486329..f4c8a97 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize}; use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; +use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::stats::MatrixStats; @@ -444,6 +445,38 @@ impl LUDecomposableMatrix for DenseMatrix {} impl CholeskyDecomposableMatrix for DenseMatrix {} +impl HighOrderOperations for DenseMatrix { + fn ab(&self, a_transpose: bool, b: &Self, b_transpose: bool) -> Self { + if !a_transpose && !b_transpose { + self.matmul(b) + } else { + let (d1, d2, d3, d4) = match (a_transpose, b_transpose) { + (true, false) => (self.nrows, self.ncols, b.ncols, b.nrows), + (false, true) => (self.ncols, self.nrows, b.nrows, b.ncols), + _ => (self.nrows, self.ncols, b.nrows, b.ncols), + }; + if d1 != d4 { + panic!("Can not multiply {}x{} by {}x{} matrices", d2, d1, d4, d3); + } + let mut result = Self::zeros(d2, d3); + for r in 0..d2 { + for c in 0..d3 { + let mut s = T::zero(); + for i in 0..d1 { + match (a_transpose, b_transpose) { + (true, false) => s += self.get(i, r) * b.get(i, c), + (false, true) => s += self.get(r, i) * b.get(c, i), + _ => s += self.get(i, r) * b.get(c, i), + } + } + result.set(r, c, s); + } + } + result + } + } +} + impl MatrixStats for DenseMatrix {} impl Matrix for DenseMatrix {} @@ -625,8 +658,8 @@ impl BaseMatrix for DenseMatrix { } fn dot(&self, other: &Self) -> T { - if self.nrows != 1 && other.nrows != 1 { - panic!("A and B should both be 1-dimentional vectors."); + if 
(self.nrows != 1 && other.nrows != 1) && (self.ncols != 1 && other.ncols != 1) { + panic!("A and B should both be either a row or a column vector."); } if self.nrows * self.ncols != other.nrows * other.ncols { panic!("A and B should have the same size"); @@ -1114,6 +1147,29 @@ mod tests { assert_eq!(result, expected); } + #[test] + fn ab() { + let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); + let b = DenseMatrix::from_2d_array(&[&[5., 6.], &[7., 8.], &[9., 10.]]); + let c = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.]]); + assert_eq!( + a.ab(false, &b, false), + DenseMatrix::from_2d_array(&[&[46., 52.], &[109., 124.]]) + ); + assert_eq!( + c.ab(true, &b, false), + DenseMatrix::from_2d_array(&[&[71., 80.], &[92., 104.]]) + ); + assert_eq!( + b.ab(false, &c, true), + DenseMatrix::from_2d_array(&[&[17., 39., 61.], &[23., 53., 83.,], &[29., 67., 105.]]) + ); + assert_eq!( + a.ab(true, &b, true), + DenseMatrix::from_2d_array(&[&[29., 39., 49.], &[40., 54., 68.,], &[51., 69., 87.]]) + ); + } + #[test] fn dot() { let a = DenseMatrix::from_array(1, 3, &[1., 2., 3.]); diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index 8ddfdb6..8f504c6 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -44,6 +44,7 @@ use nalgebra::{DMatrix, Dynamic, Matrix, MatrixMN, RowDVector, Scalar, VecStorag use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; +use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::stats::MatrixStats; @@ -552,6 +553,11 @@ impl + HighOrderOperations for Matrix> +{ +} + impl SmartCoreMatrix for Matrix> { diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index b5058ab..9945c5f 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -51,6 +51,7 @@ use 
ndarray::{s, stack, Array, ArrayBase, Axis, Ix1, Ix2, OwnedRepr}; use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; +use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::stats::MatrixStats; @@ -502,6 +503,11 @@ impl + HighOrderOperations for ArrayBase, Ix2> +{ +} + impl Matrix for ArrayBase, Ix2> { diff --git a/src/linear/bg_solver.rs b/src/linear/bg_solver.rs new file mode 100644 index 0000000..b299623 --- /dev/null +++ b/src/linear/bg_solver.rs @@ -0,0 +1,146 @@ +//! This is a generic solver for Ax = b type of equation +//! +//! for more information take a look at [this Wikipedia article](https://en.wikipedia.org/wiki/Biconjugate_gradient_method) +//! and [this paper](https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf) +use crate::error::Failed; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +pub trait BiconjugateGradientSolver> { + fn solve_mut(&self, a: &M, b: &M, x: &mut M, tol: T, max_iter: usize) -> Result { + if tol <= T::zero() { + return Err(Failed::fit("tolerance shoud be > 0")); + } + + if max_iter == 0 { + return Err(Failed::fit("maximum number of iterations should be > 0")); + } + + let (n, _) = b.shape(); + + let mut r = M::zeros(n, 1); + let mut rr = M::zeros(n, 1); + let mut z = M::zeros(n, 1); + let mut zz = M::zeros(n, 1); + + self.mat_vec_mul(a, x, &mut r); + + for j in 0..n { + r.set(j, 0, b.get(j, 0) - r.get(j, 0)); + rr.set(j, 0, r.get(j, 0)); + } + + let bnrm = b.norm(T::two()); + self.solve_preconditioner(a, &r, &mut z); + + let mut p = M::zeros(n, 1); + let mut pp = M::zeros(n, 1); + let mut bkden = T::zero(); + let mut err = T::zero(); + + for iter in 1..max_iter { + let mut bknum = T::zero(); + + self.solve_preconditioner(a, &rr, &mut zz); + for j in 0..n { + bknum += z.get(j, 0) * rr.get(j, 0); + } + if iter == 1 { + for j in 0..n { + 
p.set(j, 0, z.get(j, 0)); + pp.set(j, 0, zz.get(j, 0)); + } + } else { + let bk = bknum / bkden; + for j in 0..n { + p.set(j, 0, bk * p.get(j, 0) + z.get(j, 0)); + pp.set(j, 0, bk * pp.get(j, 0) + zz.get(j, 0)); + } + } + bkden = bknum; + self.mat_vec_mul(a, &p, &mut z); + let mut akden = T::zero(); + for j in 0..n { + akden += z.get(j, 0) * pp.get(j, 0); + } + let ak = bknum / akden; + self.mat_t_vec_mul(a, &pp, &mut zz); + for j in 0..n { + x.set(j, 0, x.get(j, 0) + ak * p.get(j, 0)); + r.set(j, 0, r.get(j, 0) - ak * z.get(j, 0)); + rr.set(j, 0, rr.get(j, 0) - ak * zz.get(j, 0)); + } + self.solve_preconditioner(a, &r, &mut z); + err = r.norm(T::two()) / bnrm; + + if err <= tol { + break; + } + } + + Ok(err) + } + + fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { + let diag = Self::diag(a); + let n = diag.len(); + + for i in 0..n { + if diag[i] != T::zero() { + x.set(i, 0, b.get(i, 0) / diag[i]); + } else { + x.set(i, 0, b.get(i, 0)); + } + } + } + + // y = Ax + fn mat_vec_mul(&self, a: &M, x: &M, y: &mut M) { + y.copy_from(&a.matmul(x)); + } + + // y = Atx + fn mat_t_vec_mul(&self, a: &M, x: &M, y: &mut M) { + y.copy_from(&a.ab(true, x, false)); + } + + fn diag(a: &M) -> Vec { + let (nrows, ncols) = a.shape(); + let n = nrows.min(ncols); + + let mut d = Vec::with_capacity(n); + for i in 0..n { + d.push(a.get(i, i)); + } + + d + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + + pub struct BGSolver {} + + impl> BiconjugateGradientSolver for BGSolver {} + + #[test] + fn bg_solver() { + let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); + let b = DenseMatrix::from_2d_array(&[&[40., 51., 28.]]); + let expected = DenseMatrix::from_2d_array(&[&[1.0, 2.0, 3.0]]); + + let mut x = DenseMatrix::zeros(3, 1); + + let solver = BGSolver {}; + + let err: f64 = solver + .solve_mut(&a, &b.transpose(), &mut x, 1e-6, 6) + .unwrap(); + + assert!(x.transpose().approximate_eq(&expected, 
1e-4)); + assert!((err - 0.0).abs() < 1e-4); + } +} diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs new file mode 100644 index 0000000..306b1aa --- /dev/null +++ b/src/linear/lasso.rs @@ -0,0 +1,509 @@ +//! # Lasso +//! +//! [Linear regression](../linear_regression/index.html) is the standard algorithm for predicting a quantitative response \\(y\\) on the basis of a linear combination of explanatory variables \\(X\\) +//! that assumes that there is approximately a linear relationship between \\(X\\) and \\(y\\). +//! Lasso is an extension to linear regression that adds L1 regularization term to the loss function during training. +//! +//! Similar to [ridge regression](../ridge_regression/index.html), the lasso shrinks the coefficient estimates towards zero when. However, in the case of the lasso, the l1 penalty has the effect of +//! forcing some of the coefficient estimates to be exactly equal to zero when the tuning parameter \\(\alpha\\) is sufficiently large. +//! +//! Lasso coefficient estimates solve the problem: +//! +//! \\[\underset{\beta}{minimize} \space \space \sum_{i=1}^n \left( y_i - \beta_0 - \sum_{j=1}^p \beta_jx_{ij} \right)^2 + \alpha \sum_{j=1}^p \lVert \beta_j \rVert_1\\] +//! +//! This problem is solved with an interior-point method that is comparable to coordinate descent in solving large problems with modest accuracy, +//! but is able to solve them with high accuracy with relatively small additional computational cost. +//! +//! ## References: +//! +//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 6.2. Shrinkage Methods](http://faculty.marshall.usc.edu/gareth-james/ISL/) +//! * ["An Interior-Point Method for Large-Scale l1-Regularized Least Squares", K. Koh, M. Lustig, S. Boyd, D. Gorinevsky](https://web.stanford.edu/~boyd/papers/pdf/l1_ls.pdf) +//! * [Simple Matlab Solver for l1-regularized Least Squares Problems](https://web.stanford.edu/~boyd/l1_ls/) +//! +//! +//! 
+use std::fmt::Debug; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::linear::bg_solver::BiconjugateGradientSolver; +use crate::math::num::RealNumber; + +/// Lasso regression parameters +#[derive(Serialize, Deserialize, Debug)] +pub struct LassoParameters { + /// Controls the strength of the penalty to the loss function. + pub alpha: T, + /// If true the regressors X will be normalized before regression + /// by subtracting the mean and dividing by the standard deviation. + pub normalize: bool, + /// The tolerance for the optimization + pub tol: T, + /// The maximum number of iterations + pub max_iter: usize, +} + +#[derive(Serialize, Deserialize, Debug)] +/// Lasso regressor +pub struct Lasso> { + coefficients: M, + intercept: T, +} + +struct InteriorPointOptimizer> { + ata: M, + d1: Vec, + d2: Vec, + prb: Vec, + prs: Vec, +} + +impl Default for LassoParameters { + fn default() -> Self { + LassoParameters { + alpha: T::one(), + normalize: true, + tol: T::from_f64(1e-4).unwrap(), + max_iter: 1000, + } + } +} + +impl> PartialEq for Lasso { + fn eq(&self, other: &Self) -> bool { + self.coefficients == other.coefficients + && (self.intercept - other.intercept).abs() <= T::epsilon() + } +} + +impl> Lasso { + /// Fits Lasso regression to your data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target values + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit( + x: &M, + y: &M::RowVector, + parameters: LassoParameters, + ) -> Result, Failed> { + let (n, p) = x.shape(); + + if n <= p { + return Err(Failed::fit( + "Number of rows in X should be >= number of columns in X", + )); + } + + if parameters.alpha < T::zero() { + return Err(Failed::fit("alpha should be >= 0")); + } + + if parameters.tol <= T::zero() { + return Err(Failed::fit("tol should be > 0")); + } + + if parameters.max_iter == 0 { + return Err(Failed::fit("max_iter should be > 0")); + } + + if y.len() != n { + return Err(Failed::fit("Number of rows in X should = len(y)")); + } + + let (w, b) = if parameters.normalize { + let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; + + let mut optimizer = InteriorPointOptimizer::new(&scaled_x, p); + + let mut w = optimizer.optimize(&scaled_x, y, ¶meters)?; + + for j in 0..p { + w.set(j, 0, w.get(j, 0) / col_std[j]); + } + + let mut b = T::zero(); + + for i in 0..p { + b += w.get(i, 0) * col_mean[i]; + } + + b = y.mean() - b; + (w, b) + } else { + let mut optimizer = InteriorPointOptimizer::new(x, p); + + let w = optimizer.optimize(x, y, ¶meters)?; + + (w, y.mean()) + }; + + Ok(Lasso { + intercept: b, + coefficients: w, + }) + } + + /// Predict target values from `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. 
+ pub fn predict(&self, x: &M) -> Result { + let (nrows, _) = x.shape(); + let mut y_hat = x.matmul(&self.coefficients); + y_hat.add_mut(&M::fill(nrows, 1, self.intercept)); + Ok(y_hat.transpose().to_row_vector()) + } + + /// Get estimates regression coefficients + pub fn coefficients(&self) -> &M { + &self.coefficients + } + + /// Get estimate of intercept + pub fn intercept(&self) -> T { + self.intercept + } + + fn rescale_x(x: &M) -> Result<(M, Vec, Vec), Failed> { + let col_mean = x.mean(0); + let col_std = x.std(0); + + for i in 0..col_std.len() { + if (col_std[i] - T::zero()).abs() < T::epsilon() { + return Err(Failed::fit(&format!( + "Cannot rescale constant column {}", + i + ))); + } + } + + let mut scaled_x = x.clone(); + scaled_x.scale_mut(&col_mean, &col_std, 0); + Ok((scaled_x, col_mean, col_std)) + } +} + +impl> InteriorPointOptimizer { + fn new(a: &M, n: usize) -> InteriorPointOptimizer { + InteriorPointOptimizer { + ata: a.ab(true, a, false), + d1: vec![T::zero(); n], + d2: vec![T::zero(); n], + prb: vec![T::zero(); n], + prs: vec![T::zero(); n], + } + } + + fn optimize( + &mut self, + x: &M, + y: &M::RowVector, + parameters: &LassoParameters, + ) -> Result { + let (n, p) = x.shape(); + let p_f64 = T::from_usize(p).unwrap(); + + //parameters + let pcgmaxi = 5000; + let min_pcgtol = T::from_f64(0.1).unwrap(); + let eta = T::from_f64(1E-3).unwrap(); + let alpha = T::from_f64(0.01).unwrap(); + let beta = T::from_f64(0.5).unwrap(); + let gamma = T::from_f64(-0.25).unwrap(); + let mu = T::two(); + + let y = M::from_row_vector(y.sub_scalar(y.mean())).transpose(); + + let mut max_ls_iter = 100; + let mut pitr = 0; + let mut w = M::zeros(p, 1); + let mut neww = w.clone(); + let mut u = M::ones(p, 1); + let mut newu = u.clone(); + + let mut f = M::fill(p, 2, -T::one()); + let mut newf = f.clone(); + + let mut q1 = vec![T::zero(); p]; + let mut q2 = vec![T::zero(); p]; + + let mut dx = M::zeros(p, 1); + let mut du = M::zeros(p, 1); + let mut dxu = M::zeros(2 * 
p, 1); + let mut grad = M::zeros(2 * p, 1); + + let mut nu = M::zeros(n, 1); + let mut dobj = T::zero(); + let mut s = T::infinity(); + let mut t = T::one() + .max(T::one() / parameters.alpha) + .min(T::two() * p_f64 / T::from(1e-3).unwrap()); + + for ntiter in 0..parameters.max_iter { + let mut z = x.matmul(&w); + + for i in 0..n { + z.set(i, 0, z.get(i, 0) - y.get(i, 0)); + nu.set(i, 0, T::two() * z.get(i, 0)); + } + + // CALCULATE DUALITY GAP + let xnu = x.ab(true, &nu, false); + let max_xnu = xnu.norm(T::infinity()); + if max_xnu > parameters.alpha { + let lnu = parameters.alpha / max_xnu; + nu.mul_scalar_mut(lnu); + } + + let pobj = z.dot(&z) + parameters.alpha * w.norm(T::one()); + dobj = dobj.max(gamma * nu.dot(&nu) - nu.dot(&y)); + + let gap = pobj - dobj; + + // STOPPING CRITERION + if gap / dobj < parameters.tol { + break; + } + + // UPDATE t + if s >= T::half() { + t = t.max((T::two() * p_f64 * mu / gap).min(mu * t)); + } + + // CALCULATE NEWTON STEP + for i in 0..p { + let q1i = T::one() / (u.get(i, 0) + w.get(i, 0)); + let q2i = T::one() / (u.get(i, 0) - w.get(i, 0)); + q1[i] = q1i; + q2[i] = q2i; + self.d1[i] = (q1i * q1i + q2i * q2i) / t; + self.d2[i] = (q1i * q1i - q2i * q2i) / t; + } + + let mut gradphi = x.ab(true, &z, false); + + for i in 0..p { + let g1 = T::two() * gradphi.get(i, 0) - (q1[i] - q2[i]) / t; + let g2 = parameters.alpha - (q1[i] + q2[i]) / t; + gradphi.set(i, 0, g1); + grad.set(i, 0, -g1); + grad.set(i + p, 0, -g2); + } + + for i in 0..p { + self.prb[i] = T::two() + self.d1[i]; + self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i] * self.d2[i]; + } + + let normg = grad.norm2(); + let mut pcgtol = min_pcgtol.min(eta * gap / T::one().min(normg)); + if ntiter != 0 && pitr == 0 { + pcgtol *= min_pcgtol; + } + + let error = self.solve_mut(x, &grad, &mut dxu, pcgtol, pcgmaxi)?; + if error > pcgtol { + pitr = pcgmaxi; + } + + for i in 0..p { + dx.set(i, 0, dxu.get(i, 0)); + du.set(i, 0, dxu.get(i + p, 0)); + } + + // BACKTRACKING LINE 
SEARCH + let phi = z.dot(&z) + parameters.alpha * u.sum() - Self::sumlogneg(&f) / t; + s = T::one(); + let gdx = grad.dot(&dxu); + + let lsiter = 0; + while lsiter < max_ls_iter { + for i in 0..p { + neww.set(i, 0, w.get(i, 0) + s * dx.get(i, 0)); + newu.set(i, 0, u.get(i, 0) + s * du.get(i, 0)); + newf.set(i, 0, neww.get(i, 0) - newu.get(i, 0)); + newf.set(i, 1, -neww.get(i, 0) - newu.get(i, 0)); + } + + if newf.max() < T::zero() { + let mut newz = x.matmul(&neww); + for i in 0..n { + newz.set(i, 0, newz.get(i, 0) - y.get(i, 0)); + } + + let newphi = newz.dot(&newz) + parameters.alpha * newu.sum() + - Self::sumlogneg(&newf) / t; + if newphi - phi <= alpha * s * gdx { + break; + } + } + s = beta * s; + max_ls_iter += 1; + } + + if lsiter == max_ls_iter { + return Err(Failed::fit( + "Exceeded maximum number of iteration for interior point optimizer", + )); + } + + w.copy_from(&neww); + u.copy_from(&newu); + f.copy_from(&newf); + } + + Ok(w) + } + + fn sumlogneg(f: &M) -> T { + let (n, _) = f.shape(); + let mut sum = T::zero(); + for i in 0..n { + sum += (-f.get(i, 0)).ln(); + sum += (-f.get(i, 1)).ln(); + } + sum + } +} + +impl<'a, T: RealNumber, M: Matrix> BiconjugateGradientSolver + for InteriorPointOptimizer +{ + fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { + let (_, p) = a.shape(); + + for i in 0..p { + x.set( + i, + 0, + (self.d1[i] * b.get(i, 0) - self.d2[i] * b.get(i + p, 0)) / self.prs[i], + ); + x.set( + i + p, + 0, + (-self.d2[i] * b.get(i, 0) + self.prb[i] * b.get(i + p, 0)) / self.prs[i], + ); + } + } + + fn mat_vec_mul(&self, _: &M, x: &M, y: &mut M) { + let (_, p) = self.ata.shape(); + let atax = self.ata.matmul(&x.slice(0..p, 0..1)); + + for i in 0..p { + y.set( + i, + 0, + T::two() * atax.get(i, 0) + self.d1[i] * x.get(i, 0) + self.d2[i] * x.get(i + p, 0), + ); + y.set( + i + p, + 0, + self.d2[i] * x.get(i, 0) + self.d1[i] * x.get(i + p, 0), + ); + } + } + + fn mat_t_vec_mul(&self, a: &M, x: &M, y: &mut M) { + self.mat_vec_mul(a, x, y); 
+ } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + use crate::metrics::mean_absolute_error; + + #[test] + fn lasso_fit_predict() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y: Vec = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let y_hat = Lasso::fit( + &x, + &y, + LassoParameters { + alpha: 0.1, + normalize: false, + tol: 1e-4, + max_iter: 1000, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat, &y) < 2.0); + + let y_hat = Lasso::fit( + &x, + &y, + LassoParameters { + alpha: 0.1, + normalize: false, + tol: 1e-4, + max_iter: 1000, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat, &y) < 2.0); + } + + #[test] + fn serde() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + 
&[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let lr = Lasso::fit(&x, &y, Default::default()).unwrap(); + + let deserialized_lr: Lasso> = + serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); + + assert_eq!(lr, deserialized_lr); + } +} diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 4b52529..a3674b3 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -289,7 +289,7 @@ impl> LogisticRegression { let n = x.shape().0; let mut result = M::zeros(1, n); if self.num_classes == 2 { - let y_hat: Vec = x.matmul(&self.coefficients.transpose()).get_col_as_vec(0); + let y_hat: Vec = x.ab(false, &self.coefficients, true).get_col_as_vec(0); let intercept = self.intercept.get(0, 0); for i in 0..n { result.set( diff --git a/src/linear/mod.rs b/src/linear/mod.rs index fef7070..edaea4f 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -20,6 +20,8 @@ //! //! 
+pub(crate) mod bg_solver; +pub mod lasso; pub mod linear_regression; pub mod logistic_regression; pub mod ridge_regression; From f9056f716ad1296c335d816b5c3e15c1823dd174 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 24 Nov 2020 19:21:27 -0800 Subject: [PATCH 04/78] lasso: minor change in unit test --- src/linear/lasso.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 306b1aa..965c1c4 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -447,7 +447,7 @@ mod tests { &y, LassoParameters { alpha: 0.1, - normalize: false, + normalize: true, tol: 1e-4, max_iter: 1000, }, From 89a5136191522a2882ffa3f8a10bda92161024b5 Mon Sep 17 00:00:00 2001 From: morenol Date: Wed, 25 Nov 2020 14:39:02 -0400 Subject: [PATCH 05/78] Change implementation of to_row_vector for nalgebra (#34) * Add failing test * Change implementation of to_row_vector for nalgebra --- Cargo.toml | 4 ++-- src/linalg/naive/dense_matrix.rs | 6 ++++++ src/linalg/nalgebra_bindings.rs | 11 +++++++++-- src/linalg/ndarray_bindings.rs | 6 ++++++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 20eebf5..6e15f88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ datasets = [] [dependencies] ndarray = { version = "0.13", optional = true } -nalgebra = { version = "0.22.0", optional = true } +nalgebra = { version = "0.23.0", optional = true } num-traits = "0.2.12" num = "0.3.0" rand = "0.7.3" @@ -35,4 +35,4 @@ bincode = "1.3.1" [[bench]] name = "distance" -harness = false \ No newline at end of file +harness = false diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 7486329..9279c3c 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1064,6 +1064,12 @@ mod tests { ); } + #[test] + fn col_matrix_to_row_vector() { + let m: DenseMatrix = BaseMatrix::zeros(10, 1); + assert_eq!(m.to_row_vector().len(), 10) + } + 
#[test] fn iter() { let vec = vec![1., 2., 3., 4., 5., 6.]; diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index 8ddfdb6..da2ec05 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -185,14 +185,15 @@ impl BaseVector for MatrixMN { impl BaseMatrix for Matrix> { - type RowVector = MatrixMN; + type RowVector = RowDVector; fn from_row_vector(vec: Self::RowVector) -> Self { Matrix::from_rows(&[vec]) } fn to_row_vector(self) -> Self::RowVector { - self.row(0).into_owned() + let (nrows, ncols) = self.shape(); + self.reshape_generic(U1, Dynamic::new(nrows * ncols)) } fn get(&self, row: usize, col: usize) -> T { @@ -697,6 +698,12 @@ mod tests { assert_eq!(m.to_row_vector(), expected); } + #[test] + fn col_matrix_to_row_vector() { + let m: DMatrix = BaseMatrix::zeros(10, 1); + assert_eq!(m.to_row_vector().len(), 10) + } + #[test] fn get_row_col_as_vec() { let m = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]); diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index b5058ab..308e355 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -563,6 +563,12 @@ mod tests { ); } + #[test] + fn col_matrix_to_row_vector() { + let m: Array2 = BaseMatrix::zeros(10, 1); + assert_eq!(m.to_row_vector().len(), 10) + } + #[test] fn add_mut() { let mut a1 = arr2(&[[1., 2., 3.], [4., 5., 6.]]); From 67e582987792166ca15a8e0303968e78b5160626 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 25 Nov 2020 12:23:04 -0800 Subject: [PATCH 06/78] simplifies generic matrix.ab implementation --- src/linalg/high_order.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linalg/high_order.rs b/src/linalg/high_order.rs index 359c4a1..493c737 100644 --- a/src/linalg/high_order.rs +++ b/src/linalg/high_order.rs @@ -19,7 +19,7 @@ pub trait HighOrderOperations: BaseMatrix { /// ``` fn ab(&self, a_transpose: bool, b: &Self, 
b_transpose: bool) -> Self { match (a_transpose, b_transpose) { - (true, true) => self.transpose().matmul(&b.transpose()), + (true, true) => b.matmul(self).transpose(), (false, true) => self.matmul(&b.transpose()), (true, false) => self.transpose().matmul(b), (false, false) => self.matmul(b), From 4720a3a4ebf0137f68d87e0b34e83192d539e8bf Mon Sep 17 00:00:00 2001 From: morenol Date: Thu, 3 Dec 2020 09:51:33 -0400 Subject: [PATCH 07/78] MultinomialNB (#32) feat: add MultinomialNB --- src/naive_bayes/mod.rs | 3 + src/naive_bayes/multinomial.rs | 278 +++++++++++++++++++++++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 src/naive_bayes/multinomial.rs diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 0268da6..8b63aaa 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -66,5 +66,8 @@ impl, D: NBDistribution> BaseNaiveBayes { + /// class labels known to the classifier + class_labels: Vec, + class_priors: Vec, + feature_prob: Vec>, +} + +impl> NBDistribution for MultinomialNBDistribution { + fn prior(&self, class_index: usize) -> T { + self.class_priors[class_index] + } + + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + likelihood += value * self.feature_prob[class_index][feature].ln(); + } + likelihood + } + + fn classes(&self) -> &Vec { + &self.class_labels + } +} + +/// `MultinomialNB` parameters. Use `Default::default()` for default values. +#[derive(Serialize, Deserialize, Debug)] +pub struct MultinomialNBParameters { + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub alpha: T, + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub priors: Option>, +} + +impl MultinomialNBParameters { + /// Create MultinomialNBParameters with specific paramaters. 
+ pub fn new(alpha: T, priors: Option>) -> Self { + Self { alpha, priors } + } +} + +impl Default for MultinomialNBParameters { + fn default() -> Self { + Self { + alpha: T::one(), + priors: None, + } + } +} + +impl MultinomialNBDistribution { + /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data. + /// * `y` - vector with target values (classes) of length N. + /// * `priors` - Optional vector with prior probabilities of the classes. If not defined, + /// priors are adjusted according to the data. + /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter. + pub fn fit>( + x: &M, + y: &M::RowVector, + alpha: T, + priors: Option>, + ) -> Result { + let (n_samples, n_features) = x.shape(); + let y_samples = y.len(); + if y_samples != n_samples { + return Err(Failed::fit(&format!( + "Size of x should equal size of y; |x|=[{}], |y|=[{}]", + n_samples, y_samples + ))); + } + + if n_samples == 0 { + return Err(Failed::fit(&format!( + "Size of x and y should greater than 0; |x|=[{}]", + n_samples + ))); + } + if alpha < T::zero() { + return Err(Failed::fit(&format!( + "Alpha should be greater than 0; |alpha|=[{}]", + alpha + ))); + } + + let y = y.to_vec(); + + let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); + let mut class_count = vec![T::zero(); class_labels.len()]; + + for class_index in indices.iter() { + class_count[*class_index] += T::one(); + } + + let class_priors = if let Some(class_priors) = priors { + if class_priors.len() != class_labels.len() { + return Err(Failed::fit( + "Size of priors provided does not match the number of classes of the data.", + )); + } + class_priors + } else { + class_count + .iter() + .map(|&c| c / T::from(n_samples).unwrap()) + .collect() + }; + + let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + + for (row, class_index) in row_iter(x).zip(indices) { + for idx in 0..n_features { + 
feature_in_class_counter[class_index][idx] += row[idx]; + } + } + + let feature_prob = feature_in_class_counter + .iter() + .map(|feature_count| { + let n_c = feature_count.sum(); + feature_count + .iter() + .map(|&count| (count + alpha) / (n_c + alpha * T::from(n_features).unwrap())) + .collect() + }) + .collect(); + + Ok(Self { + class_labels, + class_priors, + feature_prob, + }) + } +} + +/// MultinomialNB implements the categorical naive Bayes algorithm for categorically distributed data. +#[derive(Serialize, Deserialize, Debug, PartialEq)] +pub struct MultinomialNB> { + inner: BaseNaiveBayes>, +} + +impl> MultinomialNB { + /// Fits MultinomialNB with given data + /// * `x` - training data of size NxM where N is the number of samples and M is the number of + /// features. + /// * `y` - vector with target values (classes) of length N. + /// * `parameters` - additional parameters like class priors, alpha for smoothing and + /// binarizing threshold. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: MultinomialNBParameters, + ) -> Result { + let distribution = + MultinomialNBDistribution::fit(x, y, parameters.alpha, parameters.priors)?; + let inner = BaseNaiveBayes::fit(distribution)?; + Ok(Self { inner }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. 
+ pub fn predict(&self, x: &M) -> Result { + self.inner.predict(x) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn run_multinomial_naive_bayes() { + // Tests that MultinomialNB when alpha=1.0 gives the same values as + // those given for the toy example in Manning, Raghavan, and + // Schuetze's "Introduction to Information Retrieval" book: + // https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html + + // Training data points are: + // Chinese Beijing Chinese (class: China) + // Chinese Chinese Shanghai (class: China) + // Chinese Macao (class: China) + // Tokyo Japan Chinese (class: Japan) + let x = DenseMatrix::::from_2d_array(&[ + &[1., 2., 0., 0., 0., 0.], + &[0., 2., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + + assert_eq!(mnb.inner.distribution.class_priors, &[0.75, 0.25]); + assert_eq!( + mnb.inner.distribution.feature_prob, + &[ + &[1. / 7., 3. / 7., 1. / 14., 1. / 7., 1. / 7., 1. / 14.], + &[1. / 9., 2. / 9.0, 2. / 9.0, 1. / 9.0, 1. / 9.0, 2. 
/ 9.0] + ] + ); + + // Testing data point is: + // Chinese Chinese Chinese Tokyo Japan + let x_test = DenseMatrix::::from_2d_array(&[&[0., 3., 1., 0., 0., 1.]]); + let y_hat = mnb.predict(&x_test).unwrap(); + + assert_eq!(y_hat, &[0.]); + } + + #[test] + fn multinomial_nb_scikit_parity() { + let x = DenseMatrix::::from_2d_array(&[ + &[2., 4., 0., 0., 2., 1., 2., 4., 2., 0.], + &[3., 4., 0., 2., 1., 0., 1., 4., 0., 3.], + &[1., 4., 2., 4., 1., 0., 1., 2., 3., 2.], + &[0., 3., 3., 4., 1., 0., 3., 1., 1., 1.], + &[0., 2., 1., 4., 3., 4., 1., 2., 3., 1.], + &[3., 2., 4., 1., 3., 0., 2., 4., 0., 2.], + &[3., 1., 3., 0., 2., 0., 4., 4., 3., 4.], + &[2., 2., 2., 0., 1., 1., 2., 1., 0., 1.], + &[3., 3., 2., 2., 0., 2., 3., 2., 2., 3.], + &[4., 3., 4., 4., 4., 2., 2., 0., 1., 4.], + &[3., 4., 2., 2., 1., 4., 4., 4., 1., 3.], + &[3., 0., 1., 4., 4., 0., 0., 3., 2., 4.], + &[2., 0., 3., 3., 1., 2., 0., 2., 4., 1.], + &[2., 4., 0., 4., 2., 4., 1., 3., 1., 4.], + &[0., 2., 2., 3., 4., 0., 4., 4., 4., 4.], + ]); + let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.]; + let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + + let y_hat = nb.predict(&x).unwrap(); + + assert!(nb + .inner + .distribution + .class_priors + .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); + assert!(nb.inner.distribution.feature_prob[1].approximate_eq( + &vec!(0.07, 0.12, 0.07, 0.15, 0.07, 0.09, 0.08, 0.10, 0.08, 0.11), + 1e-1 + )); + assert!(y_hat.approximate_eq( + &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 2.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 2.0), + 1e-5 + )); + } + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[1., 1., 0., 0., 0., 0.], + &[0., 1., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + + let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + let deserialized_mnb: MultinomialNB> = + serde_json::from_str(&serde_json::to_string(&mnb).unwrap()).unwrap(); + + 
assert_eq!(mnb, deserialized_mnb); + } +} From f0b348dd6ee45ecbcba26ec783b5e092862844af Mon Sep 17 00:00:00 2001 From: morenol Date: Fri, 4 Dec 2020 20:45:40 -0400 Subject: [PATCH 08/78] feat: BernoulliNB (#31) * feat: BernoulliNB * Move preprocessing to a trait in linalg/stats.rs --- src/linalg/mod.rs | 3 +- src/linalg/naive/dense_matrix.rs | 3 +- src/linalg/nalgebra_bindings.rs | 7 +- src/linalg/ndarray_bindings.rs | 7 +- src/linalg/stats.rs | 41 ++++ src/naive_bayes/bernoulli.rs | 308 +++++++++++++++++++++++++++++++ src/naive_bayes/mod.rs | 2 + 7 files changed, 367 insertions(+), 4 deletions(-) create mode 100644 src/naive_bayes/bernoulli.rs diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 1be2e75..d3fb635 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -63,7 +63,7 @@ use evd::EVDDecomposableMatrix; use high_order::HighOrderOperations; use lu::LUDecomposableMatrix; use qr::QRDecomposableMatrix; -use stats::MatrixStats; +use stats::{MatrixPreprocessing, MatrixStats}; use svd::SVDDecomposableMatrix; /// Column or row vector @@ -619,6 +619,7 @@ pub trait Matrix: + LUDecomposableMatrix + CholeskyDecomposableMatrix + MatrixStats + + MatrixPreprocessing + HighOrderOperations + PartialEq + Display diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 89abe20..14e5e62 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -12,7 +12,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; -use crate::linalg::stats::MatrixStats; +use crate::linalg::stats::{MatrixPreprocessing, MatrixStats}; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix; pub use crate::linalg::{BaseMatrix, BaseVector}; @@ -478,6 +478,7 @@ impl HighOrderOperations for DenseMatrix { } impl MatrixStats for DenseMatrix {} +impl MatrixPreprocessing for DenseMatrix {} 
impl Matrix for DenseMatrix {} diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index e108831..ad2d4a2 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -47,7 +47,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; -use crate::linalg::stats::MatrixStats; +use crate::linalg::stats::{MatrixPreprocessing, MatrixStats}; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix as SmartCoreMatrix; use crate::linalg::{BaseMatrix, BaseVector}; @@ -554,6 +554,11 @@ impl + MatrixPreprocessing for Matrix> +{ +} + impl HighOrderOperations for Matrix> { diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index e50bdcd..3f0478f 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -54,7 +54,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; -use crate::linalg::stats::MatrixStats; +use crate::linalg::stats::{MatrixPreprocessing, MatrixStats}; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix; use crate::linalg::{BaseMatrix, BaseVector}; @@ -503,6 +503,11 @@ impl + MatrixPreprocessing for ArrayBase, Ix2> +{ +} + impl HighOrderOperations for ArrayBase, Ix2> { diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index ac7a1bc..fff87c3 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -104,6 +104,47 @@ pub trait MatrixStats: BaseMatrix { } } +/// Defines baseline implementations for various matrix processing functions +pub trait MatrixPreprocessing: BaseMatrix { + /// Each element of the matrix greater than the threshold becomes 1, while values less than or equal to the threshold become 0 + /// ``` + /// use 
smartcore::linalg::naive::dense_matrix::*; + /// use crate::smartcore::linalg::stats::MatrixPreprocessing; + /// let mut a = DenseMatrix::from_array(2, 3, &[0., 2., 3., -5., -6., -7.]); + /// let expected = DenseMatrix::from_array(2, 3, &[0., 1., 1., 0., 0., 0.]); + /// a.binarize_mut(0.); + /// + /// assert_eq!(a, expected); + /// ``` + + fn binarize_mut(&mut self, threshold: T) { + let (nrows, ncols) = self.shape(); + for row in 0..nrows { + for col in 0..ncols { + if self.get(row, col) > threshold { + self.set(row, col, T::one()); + } else { + self.set(row, col, T::zero()); + } + } + } + } + /// Returns new matrix where elements are binarized according to a given threshold. + /// ``` + /// use smartcore::linalg::naive::dense_matrix::*; + /// use crate::smartcore::linalg::stats::MatrixPreprocessing; + /// let a = DenseMatrix::from_array(2, 3, &[0., 2., 3., -5., -6., -7.]); + /// let expected = DenseMatrix::from_array(2, 3, &[0., 1., 1., 0., 0., 0.]); + /// + /// assert_eq!(a.binarize(0.), expected); + /// ``` + fn binarize(&self, threshold: T) -> Self { + let mut m = self.clone(); + m.binarize_mut(threshold); + m + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs new file mode 100644 index 0000000..535b5ee --- /dev/null +++ b/src/naive_bayes/bernoulli.rs @@ -0,0 +1,308 @@ +use crate::error::Failed; +use crate::linalg::row_iter; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use crate::math::vector::RealNumberVector; +use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; + +use serde::{Deserialize, Serialize}; + +/// Naive Bayes classifier for Bearnoulli features +#[derive(Serialize, Deserialize, Debug, PartialEq)] +struct BernoulliNBDistribution { + /// class labels known to the classifier + class_labels: Vec, + class_priors: Vec, + feature_prob: Vec>, +} + +impl> NBDistribution for BernoulliNBDistribution { + fn prior(&self, class_index: 
usize) -> T { + self.class_priors[class_index] + } + + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + if value == T::one() { + likelihood += self.feature_prob[class_index][feature].ln(); + } else { + likelihood += (T::one() - self.feature_prob[class_index][feature]).ln(); + } + } + likelihood + } + + fn classes(&self) -> &Vec { + &self.class_labels + } +} + +/// `BernoulliNB` parameters. Use `Default::default()` for default values. +#[derive(Serialize, Deserialize, Debug)] +pub struct BernoulliNBParameters { + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub alpha: T, + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub priors: Option>, + /// Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors. + pub binarize: Option, +} + +impl BernoulliNBParameters { + /// Create BernoulliNBParameters with specific paramaters. + pub fn new(alpha: T, priors: Option>, binarize: Option) -> Self { + Self { + alpha, + priors, + binarize, + } + } +} + +impl Default for BernoulliNBParameters { + fn default() -> Self { + Self { + alpha: T::one(), + priors: None, + binarize: Some(T::zero()), + } + } +} + +impl BernoulliNBDistribution { + /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data. + /// * `y` - vector with target values (classes) of length N. + /// * `priors` - Optional vector with prior probabilities of the classes. If not defined, + /// priors are adjusted according to the data. + /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter. + /// * `binarize` - Threshold for binarizing. 
+ pub fn fit>( + x: &M, + y: &M::RowVector, + alpha: T, + priors: Option>, + ) -> Result { + let (n_samples, n_features) = x.shape(); + let y_samples = y.len(); + if y_samples != n_samples { + return Err(Failed::fit(&format!( + "Size of x should equal size of y; |x|=[{}], |y|=[{}]", + n_samples, y_samples + ))); + } + + if n_samples == 0 { + return Err(Failed::fit(&format!( + "Size of x and y should greater than 0; |x|=[{}]", + n_samples + ))); + } + if alpha < T::zero() { + return Err(Failed::fit(&format!( + "Alpha should be greater than 0; |alpha|=[{}]", + alpha + ))); + } + + let y = y.to_vec(); + + let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); + let mut class_count = vec![T::zero(); class_labels.len()]; + + for class_index in indices.iter() { + class_count[*class_index] += T::one(); + } + + let class_priors = if let Some(class_priors) = priors { + if class_priors.len() != class_labels.len() { + return Err(Failed::fit( + "Size of priors provided does not match the number of classes of the data.", + )); + } + class_priors + } else { + class_count + .iter() + .map(|&c| c / T::from(n_samples).unwrap()) + .collect() + }; + + let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + + for (row, class_index) in row_iter(x).zip(indices) { + for idx in 0..n_features { + feature_in_class_counter[class_index][idx] += row[idx]; + } + } + + let feature_prob = feature_in_class_counter + .iter() + .enumerate() + .map(|(class_index, feature_count)| { + feature_count + .iter() + .map(|&count| (count + alpha) / (class_count[class_index] + alpha * T::two())) + .collect() + }) + .collect(); + + Ok(Self { + class_labels, + class_priors, + feature_prob, + }) + } +} + +/// BernoulliNB implements the categorical naive Bayes algorithm for categorically distributed data. 
+#[derive(Serialize, Deserialize, Debug, PartialEq)] +pub struct BernoulliNB> { + inner: BaseNaiveBayes>, + binarize: Option, +} + +impl> BernoulliNB { + /// Fits BernoulliNB with given data + /// * `x` - training data of size NxM where N is the number of samples and M is the number of + /// features. + /// * `y` - vector with target values (classes) of length N. + /// * `parameters` - additional parameters like class priors, alpha for smoothing and + /// binarizing threshold. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: BernoulliNBParameters, + ) -> Result { + let distribution = if let Some(threshold) = parameters.binarize { + BernoulliNBDistribution::fit( + &(x.binarize(threshold)), + y, + parameters.alpha, + parameters.priors, + )? + } else { + BernoulliNBDistribution::fit(x, y, parameters.alpha, parameters.priors)? + }; + + let inner = BaseNaiveBayes::fit(distribution)?; + Ok(Self { + inner, + binarize: parameters.binarize, + }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. 
+ pub fn predict(&self, x: &M) -> Result { + if let Some(threshold) = self.binarize { + self.inner.predict(&(x.binarize(threshold))) + } else { + self.inner.predict(x) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn run_bernoulli_naive_bayes() { + // Tests that BernoulliNB when alpha=1.0 gives the same values as + // those given for the toy example in Manning, Raghavan, and + // Schuetze's "Introduction to Information Retrieval" book: + // https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html + + // Training data points are: + // Chinese Beijing Chinese (class: China) + // Chinese Chinese Shanghai (class: China) + // Chinese Macao (class: China) + // Tokyo Japan Chinese (class: Japan) + let x = DenseMatrix::::from_2d_array(&[ + &[1., 1., 0., 0., 0., 0.], + &[0., 1., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + let bnb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); + + assert_eq!(bnb.inner.distribution.class_priors, &[0.75, 0.25]); + assert_eq!( + bnb.inner.distribution.feature_prob, + &[ + &[0.4, 0.8, 0.2, 0.4, 0.4, 0.2], + &[1. / 3.0, 2. / 3.0, 2. / 3.0, 1. / 3.0, 1. / 3.0, 2. 
/ 3.0] + ] + ); + + // Testing data point is: + // Chinese Chinese Chinese Tokyo Japan + let x_test = DenseMatrix::::from_2d_array(&[&[0., 1., 1., 0., 0., 1.]]); + let y_hat = bnb.predict(&x_test).unwrap(); + + assert_eq!(y_hat, &[1.]); + } + + #[test] + fn bernoulli_nb_scikit_parity() { + let x = DenseMatrix::::from_2d_array(&[ + &[2., 4., 0., 0., 2., 1., 2., 4., 2., 0.], + &[3., 4., 0., 2., 1., 0., 1., 4., 0., 3.], + &[1., 4., 2., 4., 1., 0., 1., 2., 3., 2.], + &[0., 3., 3., 4., 1., 0., 3., 1., 1., 1.], + &[0., 2., 1., 4., 3., 4., 1., 2., 3., 1.], + &[3., 2., 4., 1., 3., 0., 2., 4., 0., 2.], + &[3., 1., 3., 0., 2., 0., 4., 4., 3., 4.], + &[2., 2., 2., 0., 1., 1., 2., 1., 0., 1.], + &[3., 3., 2., 2., 0., 2., 3., 2., 2., 3.], + &[4., 3., 4., 4., 4., 2., 2., 0., 1., 4.], + &[3., 4., 2., 2., 1., 4., 4., 4., 1., 3.], + &[3., 0., 1., 4., 4., 0., 0., 3., 2., 4.], + &[2., 0., 3., 3., 1., 2., 0., 2., 4., 1.], + &[2., 4., 0., 4., 2., 4., 1., 3., 1., 4.], + &[0., 2., 2., 3., 4., 0., 4., 4., 4., 4.], + ]); + let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.]; + let bnb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); + + let y_hat = bnb.predict(&x).unwrap(); + + assert!(bnb + .inner + .distribution + .class_priors + .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); + assert!(bnb.inner.distribution.feature_prob[1].approximate_eq( + &vec!(0.8, 0.8, 0.8, 0.4, 0.8, 0.6, 0.8, 0.6, 0.6, 0.8), + 1e-1 + )); + assert!(y_hat.approximate_eq( + &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), + 1e-5 + )); + } + + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[1., 1., 0., 0., 0., 0.], + &[0., 1., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + + let bnb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); + let deserialized_bnb: BernoulliNB> = + serde_json::from_str(&serde_json::to_string(&bnb).unwrap()).unwrap(); + + assert_eq!(bnb, 
deserialized_bnb); + } +} diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 8b63aaa..508b976 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -64,10 +64,12 @@ impl, D: NBDistribution> BaseNaiveBayes Date: Fri, 4 Dec 2020 20:46:36 -0400 Subject: [PATCH 09/78] Add benches for GNB (#33) * Add benches for GNB * use [black_box](https://github.com/bheisler/criterion.rs/blob/master/book/src/faq.md#when-should-i-use-criterionblack_box) --- Cargo.toml | 5 +++ benches/naive_bayes.rs | 73 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 benches/naive_bayes.rs diff --git a/Cargo.toml b/Cargo.toml index 6e15f88..1503957 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,3 +36,8 @@ bincode = "1.3.1" [[bench]] name = "distance" harness = false + +[[bench]] +name = "naive_bayes" +harness = false +required-features = ["ndarray-bindings", "nalgebra-bindings"] diff --git a/benches/naive_bayes.rs b/benches/naive_bayes.rs new file mode 100644 index 0000000..2a4595b --- /dev/null +++ b/benches/naive_bayes.rs @@ -0,0 +1,73 @@ +use criterion::BenchmarkId; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +use nalgebra::DMatrix; +use ndarray::Array2; +use smartcore::linalg::naive::dense_matrix::DenseMatrix; +use smartcore::linalg::BaseMatrix; +use smartcore::linalg::BaseVector; +use smartcore::naive_bayes::GaussianNB; + +pub fn gaussian_naive_bayes_fit_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("GaussianNB::fit"); + + for n_samples in [100_usize, 1000_usize, 10000_usize].iter() { + for n_features in [10_usize, 100_usize, 1000_usize].iter() { + let x = DenseMatrix::::rand(*n_samples, *n_features); + let y: Vec = (0..*n_samples) + .map(|i| (i % *n_samples / 5_usize) as f64) + .collect::>(); + group.bench_with_input( + BenchmarkId::from_parameter(format!( + "n_samples: {}, n_features: {}", + n_samples, n_features + )), + n_samples, + |b, _| { + b.iter(|| { + 
GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap(); + }) + }, + ); + } + } + group.finish(); +} + +pub fn gaussian_naive_matrix_datastructure(c: &mut Criterion) { + let mut group = c.benchmark_group("GaussianNB"); + let classes = (0..10000).map(|i| (i % 25) as f64).collect::>(); + + group.bench_function("DenseMatrix", |b| { + let x = DenseMatrix::::rand(10000, 500); + let y = as BaseMatrix>::RowVector::from_array(&classes); + + b.iter(|| { + GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap(); + }) + }); + + group.bench_function("ndarray", |b| { + let x = Array2::::rand(10000, 500); + let y = as BaseMatrix>::RowVector::from_array(&classes); + + b.iter(|| { + GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap(); + }) + }); + + group.bench_function("ndalgebra", |b| { + let x = DMatrix::::rand(10000, 500); + let y = as BaseMatrix>::RowVector::from_array(&classes); + + b.iter(|| { + GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap(); + }) + }); +} +criterion_group!( + benches, + gaussian_naive_bayes_fit_benchmark, + gaussian_naive_matrix_datastructure +); +criterion_main!(benches); From 53351b2eceff2e5f256ad7f2352ee0a9a2ef7c6f Mon Sep 17 00:00:00 2001 From: morenol Date: Fri, 11 Dec 2020 16:52:39 -0400 Subject: [PATCH 10/78] fix needless-range and clippy::ptr_arg warnings. 
(#36) * Fix needless for loop range * Do not ignore clippy::ptr_arg --- src/algorithm/neighbour/bbd_tree.rs | 33 ++++++------ src/algorithm/neighbour/cover_tree.rs | 2 +- src/algorithm/neighbour/mod.rs | 1 + src/decomposition/pca.rs | 22 ++++---- src/ensemble/random_forest_classifier.rs | 14 ++--- src/lib.rs | 2 - src/linalg/evd.rs | 68 ++++++++++++------------ src/linalg/lu.rs | 13 ++--- src/linalg/naive/dense_matrix.rs | 45 ++++++++-------- src/linalg/qr.rs | 8 +-- src/linalg/stats.rs | 16 +++--- src/linalg/svd.rs | 32 +++++------ src/linear/bg_solver.rs | 6 +-- src/linear/lasso.rs | 12 ++--- src/linear/logistic_regression.rs | 12 ++--- src/linear/ridge_regression.rs | 12 ++--- src/math/distance/euclidian.rs | 2 +- src/metrics/auc.rs | 4 +- src/metrics/cluster_helpers.rs | 9 ++-- src/model_selection/mod.rs | 12 ++--- src/naive_bayes/bernoulli.rs | 4 +- src/naive_bayes/gaussian.rs | 6 +-- src/naive_bayes/multinomial.rs | 4 +- src/neighbors/knn_classifier.rs | 4 +- src/optimization/line_search.rs | 2 +- src/tree/decision_tree_classifier.rs | 57 ++++++++++---------- src/tree/decision_tree_regressor.rs | 25 ++++----- 27 files changed, 208 insertions(+), 219 deletions(-) diff --git a/src/algorithm/neighbour/bbd_tree.rs b/src/algorithm/neighbour/bbd_tree.rs index 85e6628..0d11fc6 100644 --- a/src/algorithm/neighbour/bbd_tree.rs +++ b/src/algorithm/neighbour/bbd_tree.rs @@ -44,10 +44,7 @@ impl BBDTree { let (n, _) = data.shape(); - let mut index = vec![0; n]; - for i in 0..n { - index[i] = i; - } + let index = (0..n).collect::>(); let mut tree = BBDTree { nodes, @@ -64,7 +61,7 @@ impl BBDTree { pub(in crate) fn clustering( &self, - centroids: &Vec>, + centroids: &[Vec], sums: &mut Vec>, counts: &mut Vec, membership: &mut Vec, @@ -92,8 +89,8 @@ impl BBDTree { fn filter( &self, node: usize, - centroids: &Vec>, - candidates: &Vec, + centroids: &[Vec], + candidates: &[usize], k: usize, sums: &mut Vec>, counts: &mut Vec, @@ -117,15 +114,15 @@ impl BBDTree { let mut 
new_candidates = vec![0; k]; let mut newk = 0; - for i in 0..k { + for candidate in candidates.iter().take(k) { if !BBDTree::prune( &self.nodes[node].center, &self.nodes[node].radius, centroids, closest, - candidates[i], + *candidate, ) { - new_candidates[newk] = candidates[i]; + new_candidates[newk] = *candidate; newk += 1; } } @@ -166,9 +163,9 @@ impl BBDTree { } fn prune( - center: &Vec, - radius: &Vec, - centroids: &Vec>, + center: &[T], + radius: &[T], + centroids: &[Vec], best_index: usize, test_index: usize, ) -> bool { @@ -285,8 +282,8 @@ impl BBDTree { } let mut mean = vec![T::zero(); d]; - for i in 0..d { - mean[i] = node.sum[i] / T::from(node.count).unwrap(); + for (i, mean_i) in mean.iter_mut().enumerate().take(d) { + *mean_i = node.sum[i] / T::from(node.count).unwrap(); } node.cost = BBDTree::node_cost(&self.nodes[node.lower.unwrap()], &mean) @@ -295,11 +292,11 @@ impl BBDTree { self.add_node(node) } - fn node_cost(node: &BBDTreeNode, center: &Vec) -> T { + fn node_cost(node: &BBDTreeNode, center: &[T]) -> T { let d = center.len(); let mut scatter = T::zero(); - for i in 0..d { - let x = (node.sum[i] / T::from(node.count).unwrap()) - center[i]; + for (i, center_i) in center.iter().enumerate().take(d) { + let x = (node.sum[i] / T::from(node.count).unwrap()) - *center_i; scatter += x * x; } node.cost + T::from(node.count).unwrap() * scatter diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index e7dbac0..2fe7792 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -436,7 +436,7 @@ impl> CoverTree } } - fn max(&self, distance_set: &Vec>) -> F { + fn max(&self, distance_set: &[DistanceSet]) -> F { let mut max = F::zero(); for n in distance_set { if max < n.dist[n.dist.len() - 1] { diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 7ef1c5c..bf9e669 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -1,3 +1,4 
@@ +#![allow(clippy::ptr_arg)] //! # Nearest Neighbors Search Algorithms and Data Structures //! //! Nearest neighbor search is a basic computational tool that is particularly relevant to machine learning, diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index f25aaad..9f5bd39 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -112,9 +112,9 @@ impl> PCA { let mut x = data.clone(); - for c in 0..n { + for (c, mu_c) in mu.iter().enumerate().take(n) { for r in 0..m { - x.sub_element_mut(r, c, mu[c]); + x.sub_element_mut(r, c, *mu_c); } } @@ -124,8 +124,8 @@ impl> PCA { if m > n && !parameters.use_correlation_matrix { let svd = x.svd()?; eigenvalues = svd.s; - for i in 0..eigenvalues.len() { - eigenvalues[i] = eigenvalues[i] * eigenvalues[i]; + for eigenvalue in &mut eigenvalues { + *eigenvalue = *eigenvalue * (*eigenvalue); } eigenvectors = svd.V; @@ -149,8 +149,8 @@ impl> PCA { if parameters.use_correlation_matrix { let mut sd = vec![T::zero(); n]; - for i in 0..n { - sd[i] = cov.get(i, i).sqrt(); + for (i, sd_i) in sd.iter_mut().enumerate().take(n) { + *sd_i = cov.get(i, i).sqrt(); } for i in 0..n { @@ -166,9 +166,9 @@ impl> PCA { eigenvectors = evd.V; - for i in 0..n { + for (i, sd_i) in sd.iter().enumerate().take(n) { for j in 0..n { - eigenvectors.div_element_mut(i, j, sd[i]); + eigenvectors.div_element_mut(i, j, *sd_i); } } } else { @@ -188,9 +188,9 @@ impl> PCA { } let mut pmu = vec![T::zero(); n_components]; - for k in 0..n { - for i in 0..n_components { - pmu[i] += projection.get(i, k) * mu[k]; + for (k, mu_k) in mu.iter().enumerate().take(n) { + for (i, pmu_i) in pmu.iter_mut().enumerate().take(n_components) { + *pmu_i += projection.get(i, k) * (*mu_k); } } diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 011b0ba..7229d92 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -132,9 +132,9 @@ impl RandomForestClassifier 
{ let mut yi: Vec = vec![0; y_ncols]; let classes = y_m.unique(); - for i in 0..y_ncols { + for (i, yi_i) in yi.iter_mut().enumerate().take(y_ncols) { let yc = y_m.get(0, i); - yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } let mtry = parameters.m.unwrap_or_else(|| { @@ -192,22 +192,22 @@ impl RandomForestClassifier { which_max(&result) } - fn sample_with_replacement(y: &Vec, num_classes: usize) -> Vec { + fn sample_with_replacement(y: &[usize], num_classes: usize) -> Vec { let mut rng = rand::thread_rng(); let class_weight = vec![1.; num_classes]; let nrows = y.len(); let mut samples = vec![0; nrows]; - for l in 0..num_classes { + for (l, class_weight_l) in class_weight.iter().enumerate().take(num_classes) { let mut n_samples = 0; let mut index: Vec = Vec::new(); - for i in 0..nrows { - if y[i] == l { + for (i, y_i) in y.iter().enumerate().take(nrows) { + if *y_i == l { index.push(i); n_samples += 1; } } - let size = ((n_samples as f64) / class_weight[l]) as usize; + let size = ((n_samples as f64) / *class_weight_l) as usize; for _ in 0..size { let xi: usize = rng.gen_range(0, n_samples); samples[index[xi]] += 1; diff --git a/src/lib.rs b/src/lib.rs index ada7925..9290c86 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,4 @@ #![allow( - clippy::needless_range_loop, - clippy::ptr_arg, clippy::type_complexity, clippy::too_many_arguments, clippy::many_single_char_names diff --git a/src/linalg/evd.rs b/src/linalg/evd.rs index c216696..4c1b6c3 100644 --- a/src/linalg/evd.rs +++ b/src/linalg/evd.rs @@ -99,27 +99,27 @@ pub trait EVDDecomposableMatrix: BaseMatrix { fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec) { let (n, _) = V.shape(); - for i in 0..n { - d[i] = V.get(n - 1, i); + for (i, d_i) in d.iter_mut().enumerate().take(n) { + *d_i = V.get(n - 1, i); } for i in (1..n).rev() { let mut scale = T::zero(); let mut h = T::zero(); - for k in 0..i { - scale += d[k].abs(); + for d_k in d.iter().take(i) 
{ + scale += d_k.abs(); } if scale == T::zero() { e[i] = d[i - 1]; - for j in 0..i { - d[j] = V.get(i - 1, j); + for (j, d_j) in d.iter_mut().enumerate().take(i) { + *d_j = V.get(i - 1, j); V.set(i, j, T::zero()); V.set(j, i, T::zero()); } } else { - for k in 0..i { - d[k] /= scale; - h += d[k] * d[k]; + for d_k in d.iter_mut().take(i) { + *d_k /= scale; + h += (*d_k) * (*d_k); } let mut f = d[i - 1]; let mut g = h.sqrt(); @@ -129,8 +129,8 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec e[i] = scale * g; h -= f * g; d[i - 1] = f - g; - for j in 0..i { - e[j] = T::zero(); + for e_j in e.iter_mut().take(i) { + *e_j = T::zero(); } for j in 0..i { @@ -170,16 +170,16 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec V.set(i, i, T::one()); let h = d[i + 1]; if h != T::zero() { - for k in 0..=i { - d[k] = V.get(k, i + 1) / h; + for (k, d_k) in d.iter_mut().enumerate().take(i + 1) { + *d_k = V.get(k, i + 1) / h; } for j in 0..=i { let mut g = T::zero(); for k in 0..=i { g += V.get(k, i + 1) * V.get(k, j); } - for k in 0..=i { - V.sub_element_mut(k, j, g * d[k]); + for (k, d_k) in d.iter().enumerate().take(i + 1) { + V.sub_element_mut(k, j, g * (*d_k)); } } } @@ -187,8 +187,8 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec V.set(k, i + 1, T::zero()); } } - for j in 0..n { - d[j] = V.get(n - 1, j); + for (j, d_j) in d.iter_mut().enumerate().take(n) { + *d_j = V.get(n - 1, j); V.set(n - 1, j, T::zero()); } V.set(n - 1, n - 1, T::one()); @@ -238,8 +238,8 @@ fn tql2>(V: &mut M, d: &mut Vec, e: &mut Vec< d[l + 1] = e[l] * (p + r); let dl1 = d[l + 1]; let mut h = g - d[l]; - for i in l + 2..n { - d[i] -= h; + for d_i in d.iter_mut().take(n).skip(l + 2) { + *d_i -= h; } f += h; @@ -285,10 +285,10 @@ fn tql2>(V: &mut M, d: &mut Vec, e: &mut Vec< for i in 0..n - 1 { let mut k = i; let mut p = d[i]; - for j in i + 1..n { - if d[j] > p { + for (j, d_j) in d.iter().enumerate().take(n).skip(i + 1) { + if *d_j > p { k = j; - p = d[j]; + p = *d_j; } } if k != i { @@ -316,7 +316,7 @@ fn 
balance>(A: &mut M) -> Vec { let mut done = false; while !done { done = true; - for i in 0..n { + for (i, scale_i) in scale.iter_mut().enumerate().take(n) { let mut r = T::zero(); let mut c = T::zero(); for j in 0..n { @@ -341,7 +341,7 @@ fn balance>(A: &mut M) -> Vec { if (c + r) / f < t * s { done = false; g = T::one() / f; - scale[i] *= f; + *scale_i *= f; for j in 0..n { A.mul_element_mut(i, j, g); } @@ -360,7 +360,7 @@ fn elmhes>(A: &mut M) -> Vec { let (n, _) = A.shape(); let mut perm = vec![0; n]; - for m in 1..n - 1 { + for (m, perm_m) in perm.iter_mut().enumerate().take(n - 1).skip(1) { let mut x = T::zero(); let mut i = m; for j in m..n { @@ -369,7 +369,7 @@ fn elmhes>(A: &mut M) -> Vec { i = j; } } - perm[m] = i; + *perm_m = i; if i != m { for j in (m - 1)..n { let swap = A.get(i, j); @@ -402,7 +402,7 @@ fn elmhes>(A: &mut M) -> Vec { perm } -fn eltran>(A: &M, V: &mut M, perm: &Vec) { +fn eltran>(A: &M, V: &mut M, perm: &[usize]) { let (n, _) = A.shape(); for mp in (1..n - 1).rev() { for k in mp + 1..n { @@ -774,11 +774,11 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e } } -fn balbak>(V: &mut M, scale: &Vec) { +fn balbak>(V: &mut M, scale: &[T]) { let (n, _) = V.shape(); - for i in 0..n { + for (i, scale_i) in scale.iter().enumerate().take(n) { for j in 0..n { - V.mul_element_mut(i, j, scale[i]); + V.mul_element_mut(i, j, *scale_i); } } } @@ -789,8 +789,8 @@ fn sort>(d: &mut Vec, e: &mut Vec, V: &mut for j in 1..n { let real = d[j]; let img = e[j]; - for k in 0..n { - temp[k] = V.get(k, j); + for (k, temp_k) in temp.iter_mut().enumerate().take(n) { + *temp_k = V.get(k, j); } let mut i = j as i32 - 1; while i >= 0 { @@ -806,8 +806,8 @@ fn sort>(d: &mut Vec, e: &mut Vec, V: &mut } d[i as usize + 1] = real; e[i as usize + 1] = img; - for k in 0..n { - V.set(k, i as usize + 1, temp[k]); + for (k, temp_k) in temp.iter().enumerate().take(n) { + V.set(k, i as usize + 1, *temp_k); } } } diff --git a/src/linalg/lu.rs b/src/linalg/lu.rs index bfc7fff..6daed69 
100644 --- a/src/linalg/lu.rs +++ b/src/linalg/lu.rs @@ -202,24 +202,21 @@ pub trait LUDecomposableMatrix: BaseMatrix { fn lu_mut(mut self) -> Result, Failed> { let (m, n) = self.shape(); - let mut piv = vec![0; m]; - for i in 0..m { - piv[i] = i; - } + let mut piv = (0..m).collect::>(); let mut pivsign = 1; let mut LUcolj = vec![T::zero(); m]; for j in 0..n { - for i in 0..m { - LUcolj[i] = self.get(i, j); + for (i, LUcolj_i) in LUcolj.iter_mut().enumerate().take(m) { + *LUcolj_i = self.get(i, j); } for i in 0..m { let kmax = usize::min(i, j); let mut s = T::zero(); - for k in 0..kmax { - s += self.get(i, k) * LUcolj[k]; + for (k, LUcolj_k) in LUcolj.iter().enumerate().take(kmax) { + s += self.get(i, k) * (*LUcolj_k); } LUcolj[i] -= s; diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 14e5e62..8c822d2 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1,3 +1,4 @@ +#![allow(clippy::ptr_arg)] use std::fmt; use std::fmt::Debug; use std::marker::PhantomData; @@ -164,8 +165,8 @@ impl BaseVector for Vec { fn sum(&self) -> T { let mut sum = T::zero(); - for i in 0..self.len() { - sum += self[i]; + for self_i in self.iter() { + sum += *self_i; } sum } @@ -239,9 +240,9 @@ impl DenseMatrix { nrows, values: vec![T::zero(); ncols * nrows], }; - for row in 0..nrows { - for col in 0..ncols { - m.set(row, col, values[row][col]); + for (row_index, row) in values.iter().enumerate().take(nrows) { + for (col_index, value) in row.iter().enumerate().take(ncols) { + m.set(row_index, col_index, *value); } } m @@ -259,7 +260,7 @@ impl DenseMatrix { /// * `nrows` - number of rows in new matrix. /// * `ncols` - number of columns in new matrix. /// * `values` - values to initialize the matrix. 
- pub fn from_vec(nrows: usize, ncols: usize, values: &Vec) -> DenseMatrix { + pub fn from_vec(nrows: usize, ncols: usize, values: &[T]) -> DenseMatrix { let mut m = DenseMatrix { ncols, nrows, @@ -543,8 +544,8 @@ impl BaseMatrix for DenseMatrix { fn get_row(&self, row: usize) -> Self::RowVector { let mut v = vec![T::zero(); self.ncols]; - for c in 0..self.ncols { - v[c] = self.get(row, c); + for (c, v_c) in v.iter_mut().enumerate().take(self.ncols) { + *v_c = self.get(row, c); } v @@ -552,29 +553,29 @@ impl BaseMatrix for DenseMatrix { fn get_row_as_vec(&self, row: usize) -> Vec { let mut result = vec![T::zero(); self.ncols]; - for c in 0..self.ncols { - result[c] = self.get(row, c); + for (c, result_c) in result.iter_mut().enumerate().take(self.ncols) { + *result_c = self.get(row, c); } result } fn copy_row_as_vec(&self, row: usize, result: &mut Vec) { - for c in 0..self.ncols { - result[c] = self.get(row, c); + for (c, result_c) in result.iter_mut().enumerate().take(self.ncols) { + *result_c = self.get(row, c); } } fn get_col_as_vec(&self, col: usize) -> Vec { let mut result = vec![T::zero(); self.nrows]; - for r in 0..self.nrows { - result[r] = self.get(r, col); + for (r, result_r) in result.iter_mut().enumerate().take(self.nrows) { + *result_r = self.get(r, col); } result } fn copy_col_as_vec(&self, col: usize, result: &mut Vec) { - for r in 0..self.nrows { - result[r] = self.get(r, col); + for (r, result_r) in result.iter_mut().enumerate().take(self.nrows) { + *result_r = self.get(r, col); } } @@ -836,13 +837,13 @@ impl BaseMatrix for DenseMatrix { let mut mean = vec![T::zero(); self.ncols]; for r in 0..self.nrows { - for c in 0..self.ncols { - mean[c] += self.get(r, c); + for (c, mean_c) in mean.iter_mut().enumerate().take(self.ncols) { + *mean_c += self.get(r, c); } } - for i in 0..mean.len() { - mean[i] /= T::from(self.nrows).unwrap(); + for mean_i in mean.iter_mut() { + *mean_i /= T::from(self.nrows).unwrap(); } mean @@ -989,7 +990,7 @@ impl BaseMatrix 
for DenseMatrix { fn argmax(&self) -> Vec { let mut res = vec![0usize; self.nrows]; - for r in 0..self.nrows { + for (r, res_r) in res.iter_mut().enumerate().take(self.nrows) { let mut max = T::neg_infinity(); let mut max_pos = 0usize; for c in 0..self.ncols { @@ -999,7 +1000,7 @@ impl BaseMatrix for DenseMatrix { max_pos = c; } } - res[r] = max_pos; + *res_r = max_pos; } res diff --git a/src/linalg/qr.rs b/src/linalg/qr.rs index c3a7978..a06a01f 100644 --- a/src/linalg/qr.rs +++ b/src/linalg/qr.rs @@ -44,8 +44,8 @@ pub struct QR> { impl> QR { pub(crate) fn new(QR: M, tau: Vec) -> QR { let mut singular = false; - for j in 0..tau.len() { - if tau[j] == T::zero() { + for tau_elem in tau.iter() { + if *tau_elem == T::zero() { singular = true; break; } @@ -153,7 +153,7 @@ pub trait QRDecomposableMatrix: BaseMatrix { let mut r_diagonal: Vec = vec![T::zero(); n]; - for k in 0..n { + for (k, r_diagonal_k) in r_diagonal.iter_mut().enumerate().take(n) { let mut nrm = T::zero(); for i in k..m { nrm = nrm.hypot(self.get(i, k)); @@ -179,7 +179,7 @@ pub trait QRDecomposableMatrix: BaseMatrix { } } } - r_diagonal[k] = -nrm; + *r_diagonal_k = -nrm; } Ok(QR::new(self, r_diagonal)) diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index fff87c3..45a17af 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -22,14 +22,14 @@ pub trait MatrixStats: BaseMatrix { let div = T::from_usize(m).unwrap(); - for i in 0..n { + for (i, x_i) in x.iter_mut().enumerate().take(n) { for j in 0..m { - x[i] += match axis { + *x_i += match axis { 0 => self.get(j, i), _ => self.get(i, j), }; } - x[i] /= div; + *x_i /= div; } x @@ -49,7 +49,7 @@ pub trait MatrixStats: BaseMatrix { let div = T::from_usize(m).unwrap(); - for i in 0..n { + for (i, x_i) in x.iter_mut().enumerate().take(n) { let mut mu = T::zero(); let mut sum = T::zero(); for j in 0..m { @@ -61,7 +61,7 @@ pub trait MatrixStats: BaseMatrix { sum += a * a; } mu /= div; - x[i] = sum / div - mu * mu; + *x_i = sum / div - mu * mu; } x 
@@ -76,15 +76,15 @@ pub trait MatrixStats: BaseMatrix { _ => self.shape().0, }; - for i in 0..n { - x[i] = x[i].sqrt(); + for x_i in x.iter_mut().take(n) { + *x_i = x_i.sqrt(); } x } /// standardize values by removing the mean and scaling to unit variance - fn scale_mut(&mut self, mean: &Vec, std: &Vec, axis: u8) { + fn scale_mut(&mut self, mean: &[T], std: &[T], axis: u8) { let (n, m) = match axis { 0 => { let (n, m) = self.shape(); diff --git a/src/linalg/svd.rs b/src/linalg/svd.rs index 9271f5b..e370453 100644 --- a/src/linalg/svd.rs +++ b/src/linalg/svd.rs @@ -156,8 +156,8 @@ pub trait SVDDecomposableMatrix: BaseMatrix { let h = f * g - s; U.set(i, l - 1, f - g); - for k in l - 1..n { - rv1[k] = U.get(i, k) / h; + for (k, rv1_k) in rv1.iter_mut().enumerate().take(n).skip(l - 1) { + *rv1_k = U.get(i, k) / h; } for j in l - 1..m { @@ -166,8 +166,8 @@ pub trait SVDDecomposableMatrix: BaseMatrix { s += U.get(j, k) * U.get(i, k); } - for k in l - 1..n { - U.add_element_mut(j, k, s * rv1[k]); + for (k, rv1_k) in rv1.iter().enumerate().take(n).skip(l - 1) { + U.add_element_mut(j, k, s * (*rv1_k)); } } @@ -365,11 +365,11 @@ pub trait SVDDecomposableMatrix: BaseMatrix { inc /= 3; for i in inc..n { let sw = w[i]; - for k in 0..m { - su[k] = U.get(k, i); + for (k, su_k) in su.iter_mut().enumerate().take(m) { + *su_k = U.get(k, i); } - for k in 0..n { - sv[k] = v.get(k, i); + for (k, sv_k) in sv.iter_mut().enumerate().take(n) { + *sv_k = v.get(k, i); } let mut j = i; while w[j - inc] < sw { @@ -386,11 +386,11 @@ pub trait SVDDecomposableMatrix: BaseMatrix { } } w[j] = sw; - for k in 0..m { - U.set(k, j, su[k]); + for (k, su_k) in su.iter().enumerate().take(m) { + U.set(k, j, *su_k); } - for k in 0..n { - v.set(k, j, sv[k]); + for (k, sv_k) in sv.iter().enumerate().take(n) { + v.set(k, j, *sv_k); } } if inc <= 1 { @@ -454,7 +454,7 @@ impl> SVD { for k in 0..p { let mut tmp = vec![T::zero(); self.n]; - for j in 0..self.n { + for (j, tmp_j) in 
tmp.iter_mut().enumerate().take(self.n) { let mut r = T::zero(); if self.s[j] > self.tol { for i in 0..self.m { @@ -462,13 +462,13 @@ impl> SVD { } r /= self.s[j]; } - tmp[j] = r; + *tmp_j = r; } for j in 0..self.n { let mut r = T::zero(); - for jj in 0..self.n { - r += self.V.get(j, jj) * tmp[jj]; + for (jj, tmp_jj) in tmp.iter().enumerate().take(self.n) { + r += self.V.get(j, jj) * (*tmp_jj); } b.set(j, k, r); } diff --git a/src/linear/bg_solver.rs b/src/linear/bg_solver.rs index b299623..46ef13d 100644 --- a/src/linear/bg_solver.rs +++ b/src/linear/bg_solver.rs @@ -85,9 +85,9 @@ pub trait BiconjugateGradientSolver> { let diag = Self::diag(a); let n = diag.len(); - for i in 0..n { - if diag[i] != T::zero() { - x.set(i, 0, b.get(i, 0) / diag[i]); + for (i, diag_i) in diag.iter().enumerate().take(n) { + if *diag_i != T::zero() { + x.set(i, 0, b.get(i, 0) / *diag_i); } else { x.set(i, 0, b.get(i, 0)); } diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 965c1c4..490694c 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -120,14 +120,14 @@ impl> Lasso { let mut w = optimizer.optimize(&scaled_x, y, ¶meters)?; - for j in 0..p { - w.set(j, 0, w.get(j, 0) / col_std[j]); + for (j, col_std_j) in col_std.iter().enumerate().take(p) { + w.set(j, 0, w.get(j, 0) / *col_std_j); } let mut b = T::zero(); - for i in 0..p { - b += w.get(i, 0) * col_mean[i]; + for (i, col_mean_i) in col_mean.iter().enumerate().take(p) { + b += w.get(i, 0) * *col_mean_i; } b = y.mean() - b; @@ -169,8 +169,8 @@ impl> Lasso { let col_mean = x.mean(0); let col_std = x.std(0); - for i in 0..col_std.len() { - if (col_std[i] - T::zero()).abs() < T::epsilon() { + for (i, col_std_i) in col_std.iter().enumerate() { + if (*col_std_i - T::zero()).abs() < T::epsilon() { return Err(Failed::fit(&format!( "Cannot rescale constant column {}", i diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index a3674b3..7b7cab6 100644 --- a/src/linear/logistic_regression.rs 
+++ b/src/linear/logistic_regression.rs @@ -228,9 +228,9 @@ impl> LogisticRegression { let mut yi: Vec = vec![0; y_nrows]; - for i in 0..y_nrows { + for (i, yi_i) in yi.iter_mut().enumerate().take(y_nrows) { let yc = y_m.get(0, i); - yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } match k.cmp(&2) { @@ -291,11 +291,11 @@ impl> LogisticRegression { if self.num_classes == 2 { let y_hat: Vec = x.ab(false, &self.coefficients, true).get_col_as_vec(0); let intercept = self.intercept.get(0, 0); - for i in 0..n { + for (i, y_hat_i) in y_hat.iter().enumerate().take(n) { result.set( 0, i, - self.classes[if (y_hat[i] + intercept).sigmoid() > T::half() { + self.classes[if (*y_hat_i + intercept).sigmoid() > T::half() { 1 } else { 0 @@ -310,8 +310,8 @@ impl> LogisticRegression { } } let class_idxs = y_hat.argmax(); - for i in 0..n { - result.set(0, i, self.classes[class_idxs[i]]); + for (i, class_i) in class_idxs.iter().enumerate().take(n) { + result.set(0, i, self.classes[*class_i]); } } Ok(result.to_row_vector()) diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index bb03c54..98bc639 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -155,14 +155,14 @@ impl> RidgeRegression { RidgeRegressionSolverName::SVD => x_t_x.svd_solve_mut(x_t_y)?, }; - for i in 0..p { - w.set(i, 0, w.get(i, 0) / col_std[i]); + for (i, col_std_i) in col_std.iter().enumerate().take(p) { + w.set(i, 0, w.get(i, 0) / *col_std_i); } let mut b = T::zero(); - for i in 0..p { - b += w.get(i, 0) * col_mean[i]; + for (i, col_mean_i) in col_mean.iter().enumerate().take(p) { + b += w.get(i, 0) * *col_mean_i; } let b = y.mean() - b; @@ -196,8 +196,8 @@ impl> RidgeRegression { let col_mean = x.mean(0); let col_std = x.std(0); - for i in 0..col_std.len() { - if (col_std[i] - T::zero()).abs() < T::epsilon() { + for (i, col_std_i) in col_std.iter().enumerate() { + if (*col_std_i - 
T::zero()).abs() < T::epsilon() { return Err(Failed::fit(&format!( "Cannot rescale constant column {}", i diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 31503bd..e292f9c 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -30,7 +30,7 @@ pub struct Euclidian {} impl Euclidian { #[inline] - pub(crate) fn squared_distance(x: &Vec, y: &Vec) -> T { + pub(crate) fn squared_distance(x: &[T], y: &[T]) -> T { if x.len() != y.len() { panic!("Input vector sizes are different."); } diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index 571dd49..0f8d56a 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -68,8 +68,8 @@ impl AUC { j += 1; } let r = T::from_usize(i + 1 + j).unwrap() / T::two(); - for k in i..j { - rank[k] = r; + for rank_k in rank.iter_mut().take(j).skip(i) { + *rank_k = r; } i = j - 1; } diff --git a/src/metrics/cluster_helpers.rs b/src/metrics/cluster_helpers.rs index 8d1e17e..a8fa7e5 100644 --- a/src/metrics/cluster_helpers.rs +++ b/src/metrics/cluster_helpers.rs @@ -1,3 +1,4 @@ +#![allow(clippy::ptr_arg)] use std::collections::HashMap; use crate::math::num::RealNumber; @@ -23,7 +24,7 @@ pub fn contingency_matrix( contingency_matrix } -pub fn entropy(data: &Vec) -> Option { +pub fn entropy(data: &[T]) -> Option { let mut bincounts = HashMap::with_capacity(data.len()); for e in data.iter() { @@ -44,17 +45,17 @@ pub fn entropy(data: &Vec) -> Option { Some(entropy) } -pub fn mutual_info_score(contingency: &Vec>) -> T { +pub fn mutual_info_score(contingency: &[Vec]) -> T { let mut contingency_sum = 0; let mut pi = vec![0; contingency.len()]; let mut pj = vec![0; contingency[0].len()]; let (mut nzx, mut nzy, mut nz_val) = (Vec::new(), Vec::new(), Vec::new()); for r in 0..contingency.len() { - for c in 0..contingency[0].len() { + for (c, pj_c) in pj.iter_mut().enumerate().take(contingency[0].len()) { contingency_sum += contingency[r][c]; pi[r] += contingency[r][c]; - pj[c] += 
contingency[r][c]; + *pj_c += contingency[r][c]; if contingency[r][c] > 0 { nzx.push(r); nzy.push(c); diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index b066b30..bc0f9b8 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -44,10 +44,10 @@ pub fn train_test_split>( let mut n_test = 0; let mut index = vec![false; n]; - for i in 0..n { + for index_i in index.iter_mut().take(n) { let p_test: f32 = rng.gen(); if p_test <= test_size { - index[i] = true; + *index_i = true; n_test += 1; } } @@ -62,8 +62,8 @@ pub fn train_test_split>( let mut r_train = 0; let mut r_test = 0; - for r in 0..n { - if index[r] { + for (r, index_r) in index.iter().enumerate().take(n) { + if *index_r { //sample belongs to test for c in 0..m { x_test.set(r_test, c, x.get(r, c)); @@ -133,8 +133,8 @@ impl BaseKFold for KFold { let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; // increment by one if odd - for i in 0..(n_samples % self.n_splits) { - fold_sizes[i] += 1; + for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { + *fold_size += 1; } // generate the right array of arrays for test indices diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 535b5ee..057b447 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -134,8 +134,8 @@ impl BernoulliNBDistribution { let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices) { - for idx in 0..n_features { - feature_in_class_counter[class_index][idx] += row[idx]; + for (idx, row_i) in row.iter().enumerate().take(n_features) { + feature_in_class_counter[class_index][idx] += *row_i; } } diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 8e7e37c..af5732d 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -119,9 +119,9 @@ impl GaussianNBDistribution { .into_iter() .map(|v| { let mut m = 
M::zeros(v.len(), n_features); - for row in 0..v.len() { - for col in 0..n_features { - m.set(row, col, v[row][col]); + for (row_i, v_i) in v.iter().enumerate() { + for (col_j, v_i_j) in v_i.iter().enumerate().take(n_features) { + m.set(row_i, col_j, *v_i_j); } } m diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index a70fd2d..be8a7da 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -122,8 +122,8 @@ impl MultinomialNBDistribution { let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices) { - for idx in 0..n_features { - feature_in_class_counter[class_index][idx] += row[idx]; + for (idx, row_i) in row.iter().enumerate().take(n_features) { + feature_in_class_counter[class_index][idx] += *row_i; } } diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 135594a..f940211 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -119,9 +119,9 @@ impl, T>> KNNClassifier { let mut yi: Vec = vec![0; y_n]; let classes = y_m.unique(); - for i in 0..y_n { + for (i, yi_i) in yi.iter_mut().enumerate().take(y_n) { let yc = y_m.get(0, i); - yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } if x_n != y_n { diff --git a/src/optimization/line_search.rs b/src/optimization/line_search.rs index e6a3b80..99457c9 100644 --- a/src/optimization/line_search.rs +++ b/src/optimization/line_search.rs @@ -41,7 +41,7 @@ impl Default for Backtracking { } impl LineSearchMethod for Backtracking { - fn search<'a>( + fn search( &self, f: &(dyn Fn(T) -> T), _: &(dyn Fn(T) -> T), diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 9fe1b1a..371bc4e 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -187,42 +187,42 @@ impl Node { struct 
NodeVisitor<'a, T: RealNumber, M: Matrix> { x: &'a M, - y: &'a Vec, + y: &'a [usize], node: usize, samples: Vec, - order: &'a Vec>, + order: &'a [Vec], true_child_output: usize, false_child_output: usize, level: u16, phantom: PhantomData<&'a T>, } -fn impurity(criterion: &SplitCriterion, count: &Vec, n: usize) -> T { +fn impurity(criterion: &SplitCriterion, count: &[usize], n: usize) -> T { let mut impurity = T::zero(); match criterion { SplitCriterion::Gini => { impurity = T::one(); - for i in 0..count.len() { - if count[i] > 0 { - let p = T::from(count[i]).unwrap() / T::from(n).unwrap(); + for count_i in count.iter() { + if *count_i > 0 { + let p = T::from(*count_i).unwrap() / T::from(n).unwrap(); impurity -= p * p; } } } SplitCriterion::Entropy => { - for i in 0..count.len() { - if count[i] > 0 { - let p = T::from(count[i]).unwrap() / T::from(n).unwrap(); + for count_i in count.iter() { + if *count_i > 0 { + let p = T::from(*count_i).unwrap() / T::from(n).unwrap(); impurity -= p * p.log2(); } } } SplitCriterion::ClassificationError => { - for i in 0..count.len() { - if count[i] > 0 { - impurity = impurity.max(T::from(count[i]).unwrap() / T::from(n).unwrap()); + for count_i in count.iter() { + if *count_i > 0 { + impurity = impurity.max(T::from(*count_i).unwrap() / T::from(n).unwrap()); } } impurity = (T::one() - impurity).abs(); @@ -236,9 +236,9 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { fn new( node_id: usize, samples: Vec, - order: &'a Vec>, + order: &'a [Vec], x: &'a M, - y: &'a Vec, + y: &'a [usize], level: u16, ) -> Self { NodeVisitor { @@ -255,13 +255,13 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } -pub(in crate) fn which_max(x: &Vec) -> usize { +pub(in crate) fn which_max(x: &[usize]) -> usize { let mut m = x[0]; let mut which = 0; - for i in 1..x.len() { - if x[i] > m { - m = x[i]; + for (i, x_i) in x.iter().enumerate().skip(1) { + if *x_i > m { + m = *x_i; which = i; } } @@ -304,9 +304,9 @@ impl 
DecisionTreeClassifier { let mut yi: Vec = vec![0; y_ncols]; - for i in 0..y_ncols { + for (i, yi_i) in yi.iter_mut().enumerate().take(y_ncols) { let yc = y_m.get(0, i); - yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } let mut nodes: Vec> = Vec::new(); @@ -431,23 +431,20 @@ impl DecisionTreeClassifier { let parent_impurity = impurity(&self.parameters.criterion, &count, n); - let mut variables = vec![0; n_attr]; - for i in 0..n_attr { - variables[i] = i; - } + let mut variables = (0..n_attr).collect::>(); if mtry < n_attr { variables.shuffle(&mut rand::thread_rng()); } - for j in 0..mtry { + for variable in variables.iter().take(mtry) { self.find_best_split( visitor, n, &count, &mut false_count, parent_impurity, - variables[j], + *variable, ); } @@ -458,7 +455,7 @@ impl DecisionTreeClassifier { &mut self, visitor: &mut NodeVisitor<'_, T, M>, n: usize, - count: &Vec, + count: &[usize], false_count: &mut Vec, parent_impurity: T, j: usize, @@ -527,13 +524,13 @@ impl DecisionTreeClassifier { let mut fc = 0; let mut true_samples: Vec = vec![0; n]; - for i in 0..n { + for (i, true_sample) in true_samples.iter_mut().enumerate().take(n) { if visitor.samples[i] > 0 { if visitor.x.get(i, self.nodes[visitor.node].split_feature) <= self.nodes[visitor.node].split_value.unwrap_or_else(T::nan) { - true_samples[i] = visitor.samples[i]; - tc += true_samples[i]; + *true_sample = visitor.samples[i]; + tc += *true_sample; visitor.samples[i] = 0; } else { fc += visitor.samples[i]; diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index c30c9e2..5e80b4c 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -161,7 +161,7 @@ struct NodeVisitor<'a, T: RealNumber, M: Matrix> { y: &'a M, node: usize, samples: Vec, - order: &'a Vec>, + order: &'a [Vec], true_child_output: T, false_child_output: T, level: u16, @@ -171,7 +171,7 @@ impl<'a, T: 
RealNumber, M: Matrix> NodeVisitor<'a, T, M> { fn new( node_id: usize, samples: Vec, - order: &'a Vec>, + order: &'a [Vec], x: &'a M, y: &'a M, level: u16, @@ -219,9 +219,9 @@ impl DecisionTreeRegressor { let mut n = 0; let mut sum = T::zero(); - for i in 0..y_ncols { - n += samples[i]; - sum += T::from(samples[i]).unwrap() * y_m.get(0, i); + for (i, sample_i) in samples.iter().enumerate().take(y_ncols) { + n += *sample_i; + sum += T::from(*sample_i).unwrap() * y_m.get(0, i); } let root = Node::new(0, sum / T::from(n).unwrap()); @@ -312,10 +312,7 @@ impl DecisionTreeRegressor { let sum = self.nodes[visitor.node].output * T::from(n).unwrap(); - let mut variables = vec![0; n_attr]; - for i in 0..n_attr { - variables[i] = i; - } + let mut variables = (0..n_attr).collect::>(); if mtry < n_attr { variables.shuffle(&mut rand::thread_rng()); @@ -324,8 +321,8 @@ impl DecisionTreeRegressor { let parent_gain = T::from(n).unwrap() * self.nodes[visitor.node].output * self.nodes[visitor.node].output; - for j in 0..mtry { - self.find_best_split(visitor, n, sum, parent_gain, variables[j]); + for variable in variables.iter().take(mtry) { + self.find_best_split(visitor, n, sum, parent_gain, *variable); } self.nodes[visitor.node].split_score != Option::None @@ -399,13 +396,13 @@ impl DecisionTreeRegressor { let mut fc = 0; let mut true_samples: Vec = vec![0; n]; - for i in 0..n { + for (i, true_sample) in true_samples.iter_mut().enumerate().take(n) { if visitor.samples[i] > 0 { if visitor.x.get(i, self.nodes[visitor.node].split_feature) <= self.nodes[visitor.node].split_value.unwrap_or_else(T::nan) { - true_samples[i] = visitor.samples[i]; - tc += true_samples[i]; + *true_sample = visitor.samples[i]; + tc += *true_sample; visitor.samples[i] = 0; } else { fc += visitor.samples[i]; From 78673b597fb02d825b882a650eac881c4c7dc916 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 11 Dec 2020 18:55:07 -0800 Subject: [PATCH 11/78] feat: adds elastic net --- src/linalg/mod.rs | 3 + 
src/linalg/naive/dense_matrix.rs | 14 ++ src/linalg/nalgebra_bindings.rs | 14 ++ src/linalg/ndarray_bindings.rs | 14 ++ src/linear/elasticnet.rs | 335 +++++++++++++++++++++++++++++++ src/linear/lasso.rs | 247 +---------------------- src/linear/lasso_optimizer.rs | 255 +++++++++++++++++++++++ src/linear/mod.rs | 2 + 8 files changed, 647 insertions(+), 237 deletions(-) create mode 100644 src/linear/elasticnet.rs create mode 100644 src/linear/lasso_optimizer.rs diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index d3fb635..c768cbf 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -271,6 +271,9 @@ pub trait BaseVector: Clone + Debug { fn std(&self) -> T { self.var().sqrt() } + + /// Copies content of `other` vector. + fn copy_from(&mut self, other: &Self); } /// Generic matrix type. diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 14e5e62..fd049ed 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -176,6 +176,20 @@ impl BaseVector for Vec { result.dedup(); result } + + fn copy_from(&mut self, other: &Self) { + if self.len() != other.len() { + panic!( + "Can't copy vector of length {} into a vector of length {}.", + self.len(), + other.len() + ); + } + + for i in 0..self.len() { + self[i] = other[i]; + } + } } /// Column-major, dense matrix. See [Simple Dense Matrix](../index.html). 
diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index ad2d4a2..b976fbd 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -181,6 +181,10 @@ impl BaseVector for MatrixMN { result.dedup(); result } + + fn copy_from(&mut self, other: &Self) { + Matrix::copy_from(self, other); + } } impl @@ -575,6 +579,16 @@ mod tests { use crate::linear::linear_regression::*; use nalgebra::{DMatrix, Matrix2x3, RowDVector}; + #[test] + fn vec_copy_from() { + let mut v1 = RowDVector::from_vec(vec![1., 2., 3.]); + let mut v2 = RowDVector::from_vec(vec![4., 5., 6.]); + v1.copy_from(&v2); + assert_eq!(v2, v1); + v2[0] = 10.0; + assert_ne!(v2, v1); + } + #[test] fn vec_len() { let v = RowDVector::from_vec(vec![1., 2., 3.]); diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 3f0478f..eb50f01 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -176,6 +176,10 @@ impl BaseVector for ArrayBase, Ix result.dedup(); result } + + fn copy_from(&mut self, other: &Self) { + self.assign(&other); + } } impl @@ -537,6 +541,16 @@ mod tests { assert_eq!(5., BaseVector::get(&result, 1)); } + #[test] + fn vec_copy_from() { + let mut v1 = arr1(&[1., 2., 3.]); + let mut v2 = arr1(&[4., 5., 6.]); + v1.copy_from(&v2); + assert_eq!(v1, v2); + v2[0] = 10.0; + assert_ne!(v1, v2); + } + #[test] fn vec_len() { let v = arr1(&[1., 2., 3.]); diff --git a/src/linear/elasticnet.rs b/src/linear/elasticnet.rs new file mode 100644 index 0000000..7b6acb1 --- /dev/null +++ b/src/linear/elasticnet.rs @@ -0,0 +1,335 @@ +//! # Elastic Net +//! +//! +//! ## References: +//! +//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 6.2. Shrinkage Methods](http://faculty.marshall.usc.edu/gareth-james/ISL/) +//! 
* ["Regularization and variable selection via the elastic net", Hui Zou and Trevor Hastie](https://web.stanford.edu/~hastie/Papers/B67.2%20(2005)%20301-320%20Zou%20&%20Hastie.pdf) +//! +//! +//! +use std::fmt::Debug; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +use crate::linear::lasso_optimizer::InteriorPointOptimizer; + +/// Ridge Regression parameters +#[derive(Serialize, Deserialize, Debug)] +pub struct ElasticNetParameters { + pub alpha: T, + pub l1_ratio: T, + pub normalize: bool, + pub tol: T, + pub max_iter: usize, +} + +/// Ridge regression +#[derive(Serialize, Deserialize, Debug)] +pub struct ElasticNet> { + coefficients: M, + intercept: T, +} + +impl Default for ElasticNetParameters { + fn default() -> Self { + ElasticNetParameters { + alpha: T::one(), + l1_ratio: T::half(), + normalize: true, + tol: T::from_f64(1e-4).unwrap(), + max_iter: 1000, + } + } +} + +impl> PartialEq for ElasticNet { + fn eq(&self, other: &Self) -> bool { + self.coefficients == other.coefficients + && (self.intercept - other.intercept).abs() <= T::epsilon() + } +} + +impl> ElasticNet { + /// Fits ridge regression to your data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target values + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit( + x: &M, + y: &M::RowVector, + parameters: ElasticNetParameters, + ) -> Result, Failed> { + let (n, p) = x.shape(); + + if y.len() != n { + return Err(Failed::fit("Number of rows in X should = len(y)")); + } + + let n_float = T::from_usize(n).unwrap(); + + let l1_reg = parameters.alpha * parameters.l1_ratio * n_float; + let l2_reg = parameters.alpha * (T::one() - parameters.l1_ratio) * n_float; + + let y_mean = y.mean(); + + let (w, b) = if parameters.normalize { + let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; + + let (x, y, gamma) = Self::augment_X_and_y(&scaled_x, y, l2_reg); + + let mut optimizer = InteriorPointOptimizer::new(&x, p); + + let mut w = + optimizer.optimize(&x, &y, l1_reg * gamma, parameters.max_iter, parameters.tol)?; + + for i in 0..p { + w.set(i, 0, gamma * w.get(i, 0) / col_std[i]); + } + + let mut b = T::zero(); + + for i in 0..p { + b += w.get(i, 0) * col_mean[i]; + } + + b = y_mean - b; + + (w, b) + } else { + let (x, y, gamma) = Self::augment_X_and_y(x, y, l2_reg); + + let mut optimizer = InteriorPointOptimizer::new(&x, p); + + let mut w = + optimizer.optimize(&x, &y, l1_reg * gamma, parameters.max_iter, parameters.tol)?; + + for i in 0..p { + w.set(i, 0, gamma * w.get(i, 0)); + } + + (w, y_mean) + }; + + Ok(ElasticNet { + intercept: b, + coefficients: w, + }) + } + + /// Predict target values from `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. 
+ pub fn predict(&self, x: &M) -> Result { + let (nrows, _) = x.shape(); + let mut y_hat = x.matmul(&self.coefficients); + y_hat.add_mut(&M::fill(nrows, 1, self.intercept)); + Ok(y_hat.transpose().to_row_vector()) + } + + /// Get estimates regression coefficients + pub fn coefficients(&self) -> &M { + &self.coefficients + } + + /// Get estimate of intercept + pub fn intercept(&self) -> T { + self.intercept + } + + fn rescale_x(x: &M) -> Result<(M, Vec, Vec), Failed> { + let col_mean = x.mean(0); + let col_std = x.std(0); + + for i in 0..col_std.len() { + if (col_std[i] - T::zero()).abs() < T::epsilon() { + return Err(Failed::fit(&format!( + "Cannot rescale constant column {}", + i + ))); + } + } + + let mut scaled_x = x.clone(); + scaled_x.scale_mut(&col_mean, &col_std, 0); + Ok((scaled_x, col_mean, col_std)) + } + + fn augment_X_and_y(x: &M, y: &M::RowVector, l2_reg: T) -> (M, M::RowVector, T) { + let (n, p) = x.shape(); + + let gamma = T::one() / (T::one() + l2_reg).sqrt(); + let padding = gamma * l2_reg.sqrt(); + + let mut y2 = M::RowVector::zeros(n + p); + for i in 0..y.len() { + y2.set(i, y.get(i)); + } + + let mut x2 = M::zeros(n + p, p); + + for j in 0..p { + for i in 0..n { + x2.set(i, j, gamma * x.get(i, j)); + } + + x2.set(j + n, j, padding); + } + + (x2, y2, gamma) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + use crate::metrics::mean_absolute_error; + + #[test] + fn elasticnet_longley() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 
282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y: Vec = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let y_hat = ElasticNet::fit( + &x, + &y, + ElasticNetParameters { + alpha: 1.0, + l1_ratio: 0.5, + normalize: false, + tol: 1e-4, + max_iter: 1000, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat, &y) < 30.0); + } + + #[test] + fn elasticnet_fit_predict1() { + let x = DenseMatrix::from_2d_array(&[ + &[0.0, 1931.0, 1.2232755825400514], + &[1.0, 1933.0, 1.1379726120972395], + &[2.0, 1920.0, 1.4366265120543429], + &[3.0, 1918.0, 1.206005737827858], + &[4.0, 1934.0, 1.436613542400669], + &[5.0, 1918.0, 1.1594588621640636], + &[6.0, 1933.0, 1.19809994745985], + &[7.0, 1918.0, 1.3396363871645678], + &[8.0, 1931.0, 1.2535342096493207], + &[9.0, 1933.0, 1.3101281563456293], + &[10.0, 1922.0, 1.3585833349920762], + &[11.0, 1930.0, 1.4830786699709897], + &[12.0, 1916.0, 1.4919891143094546], + &[13.0, 1915.0, 1.259655137451551], + &[14.0, 1932.0, 1.3979191428724789], + &[15.0, 1917.0, 1.3686634746782371], + &[16.0, 1932.0, 1.381658454569724], + &[17.0, 1918.0, 1.4054969025700674], + &[18.0, 1929.0, 1.3271699396384906], + &[19.0, 1915.0, 1.1373332337674806], + ]); + + let y: Vec = vec![ + 1.48, 2.72, 4.52, 5.72, 5.25, 4.07, 3.75, 4.75, 6.77, 4.72, 6.78, 6.79, 8.3, 7.42, + 10.2, 7.92, 7.62, 8.06, 9.06, 9.29, + ]; + + let l1_model = ElasticNet::fit( + &x, + &y, + ElasticNetParameters { + alpha: 1.0, + l1_ratio: 1.0, + normalize: true, + tol: 1e-4, + max_iter: 1000, + }, + ) + .unwrap(); + + let l2_model = ElasticNet::fit( + &x, + &y, + 
ElasticNetParameters { + alpha: 1.0, + l1_ratio: 0.0, + normalize: true, + tol: 1e-4, + max_iter: 1000, + }, + ) + .unwrap(); + + let mae_l1 = mean_absolute_error(&l1_model.predict(&x).unwrap(), &y); + let mae_l2 = mean_absolute_error(&l2_model.predict(&x).unwrap(), &y); + + assert!(mae_l1 < 2.0); + assert!(mae_l2 < 2.0); + + assert!(l1_model.coefficients().get(0, 0) > l1_model.coefficients().get(1, 0)); + assert!(l1_model.coefficients().get(0, 0) > l1_model.coefficients().get(2, 0)); + } + + #[test] + fn serde() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let lr = ElasticNet::fit(&x, &y, Default::default()).unwrap(); + + let deserialized_lr: ElasticNet> = + serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); + + assert_eq!(lr, deserialized_lr); + } +} diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 965c1c4..b2c81d1 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -29,7 +29,7 @@ use serde::{Deserialize, Serialize}; use crate::error::Failed; use 
crate::linalg::BaseVector; use crate::linalg::Matrix; -use crate::linear::bg_solver::BiconjugateGradientSolver; +use crate::linear::lasso_optimizer::InteriorPointOptimizer; use crate::math::num::RealNumber; /// Lasso regression parameters @@ -53,14 +53,6 @@ pub struct Lasso> { intercept: T, } -struct InteriorPointOptimizer> { - ata: M, - d1: Vec, - d2: Vec, - prb: Vec, - prs: Vec, -} - impl Default for LassoParameters { fn default() -> Self { LassoParameters { @@ -118,7 +110,13 @@ impl> Lasso { let mut optimizer = InteriorPointOptimizer::new(&scaled_x, p); - let mut w = optimizer.optimize(&scaled_x, y, ¶meters)?; + let mut w = optimizer.optimize( + &scaled_x, + y, + parameters.alpha, + parameters.max_iter, + parameters.tol, + )?; for j in 0..p { w.set(j, 0, w.get(j, 0) / col_std[j]); @@ -135,7 +133,8 @@ impl> Lasso { } else { let mut optimizer = InteriorPointOptimizer::new(x, p); - let w = optimizer.optimize(x, y, ¶meters)?; + let w = + optimizer.optimize(x, y, parameters.alpha, parameters.max_iter, parameters.tol)?; (w, y.mean()) }; @@ -184,232 +183,6 @@ impl> Lasso { } } -impl> InteriorPointOptimizer { - fn new(a: &M, n: usize) -> InteriorPointOptimizer { - InteriorPointOptimizer { - ata: a.ab(true, a, false), - d1: vec![T::zero(); n], - d2: vec![T::zero(); n], - prb: vec![T::zero(); n], - prs: vec![T::zero(); n], - } - } - - fn optimize( - &mut self, - x: &M, - y: &M::RowVector, - parameters: &LassoParameters, - ) -> Result { - let (n, p) = x.shape(); - let p_f64 = T::from_usize(p).unwrap(); - - //parameters - let pcgmaxi = 5000; - let min_pcgtol = T::from_f64(0.1).unwrap(); - let eta = T::from_f64(1E-3).unwrap(); - let alpha = T::from_f64(0.01).unwrap(); - let beta = T::from_f64(0.5).unwrap(); - let gamma = T::from_f64(-0.25).unwrap(); - let mu = T::two(); - - let y = M::from_row_vector(y.sub_scalar(y.mean())).transpose(); - - let mut max_ls_iter = 100; - let mut pitr = 0; - let mut w = M::zeros(p, 1); - let mut neww = w.clone(); - let mut u = M::ones(p, 1); - 
let mut newu = u.clone(); - - let mut f = M::fill(p, 2, -T::one()); - let mut newf = f.clone(); - - let mut q1 = vec![T::zero(); p]; - let mut q2 = vec![T::zero(); p]; - - let mut dx = M::zeros(p, 1); - let mut du = M::zeros(p, 1); - let mut dxu = M::zeros(2 * p, 1); - let mut grad = M::zeros(2 * p, 1); - - let mut nu = M::zeros(n, 1); - let mut dobj = T::zero(); - let mut s = T::infinity(); - let mut t = T::one() - .max(T::one() / parameters.alpha) - .min(T::two() * p_f64 / T::from(1e-3).unwrap()); - - for ntiter in 0..parameters.max_iter { - let mut z = x.matmul(&w); - - for i in 0..n { - z.set(i, 0, z.get(i, 0) - y.get(i, 0)); - nu.set(i, 0, T::two() * z.get(i, 0)); - } - - // CALCULATE DUALITY GAP - let xnu = x.ab(true, &nu, false); - let max_xnu = xnu.norm(T::infinity()); - if max_xnu > parameters.alpha { - let lnu = parameters.alpha / max_xnu; - nu.mul_scalar_mut(lnu); - } - - let pobj = z.dot(&z) + parameters.alpha * w.norm(T::one()); - dobj = dobj.max(gamma * nu.dot(&nu) - nu.dot(&y)); - - let gap = pobj - dobj; - - // STOPPING CRITERION - if gap / dobj < parameters.tol { - break; - } - - // UPDATE t - if s >= T::half() { - t = t.max((T::two() * p_f64 * mu / gap).min(mu * t)); - } - - // CALCULATE NEWTON STEP - for i in 0..p { - let q1i = T::one() / (u.get(i, 0) + w.get(i, 0)); - let q2i = T::one() / (u.get(i, 0) - w.get(i, 0)); - q1[i] = q1i; - q2[i] = q2i; - self.d1[i] = (q1i * q1i + q2i * q2i) / t; - self.d2[i] = (q1i * q1i - q2i * q2i) / t; - } - - let mut gradphi = x.ab(true, &z, false); - - for i in 0..p { - let g1 = T::two() * gradphi.get(i, 0) - (q1[i] - q2[i]) / t; - let g2 = parameters.alpha - (q1[i] + q2[i]) / t; - gradphi.set(i, 0, g1); - grad.set(i, 0, -g1); - grad.set(i + p, 0, -g2); - } - - for i in 0..p { - self.prb[i] = T::two() + self.d1[i]; - self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i] * self.d2[i]; - } - - let normg = grad.norm2(); - let mut pcgtol = min_pcgtol.min(eta * gap / T::one().min(normg)); - if ntiter != 0 && pitr == 0 { 
- pcgtol *= min_pcgtol; - } - - let error = self.solve_mut(x, &grad, &mut dxu, pcgtol, pcgmaxi)?; - if error > pcgtol { - pitr = pcgmaxi; - } - - for i in 0..p { - dx.set(i, 0, dxu.get(i, 0)); - du.set(i, 0, dxu.get(i + p, 0)); - } - - // BACKTRACKING LINE SEARCH - let phi = z.dot(&z) + parameters.alpha * u.sum() - Self::sumlogneg(&f) / t; - s = T::one(); - let gdx = grad.dot(&dxu); - - let lsiter = 0; - while lsiter < max_ls_iter { - for i in 0..p { - neww.set(i, 0, w.get(i, 0) + s * dx.get(i, 0)); - newu.set(i, 0, u.get(i, 0) + s * du.get(i, 0)); - newf.set(i, 0, neww.get(i, 0) - newu.get(i, 0)); - newf.set(i, 1, -neww.get(i, 0) - newu.get(i, 0)); - } - - if newf.max() < T::zero() { - let mut newz = x.matmul(&neww); - for i in 0..n { - newz.set(i, 0, newz.get(i, 0) - y.get(i, 0)); - } - - let newphi = newz.dot(&newz) + parameters.alpha * newu.sum() - - Self::sumlogneg(&newf) / t; - if newphi - phi <= alpha * s * gdx { - break; - } - } - s = beta * s; - max_ls_iter += 1; - } - - if lsiter == max_ls_iter { - return Err(Failed::fit( - "Exceeded maximum number of iteration for interior point optimizer", - )); - } - - w.copy_from(&neww); - u.copy_from(&newu); - f.copy_from(&newf); - } - - Ok(w) - } - - fn sumlogneg(f: &M) -> T { - let (n, _) = f.shape(); - let mut sum = T::zero(); - for i in 0..n { - sum += (-f.get(i, 0)).ln(); - sum += (-f.get(i, 1)).ln(); - } - sum - } -} - -impl<'a, T: RealNumber, M: Matrix> BiconjugateGradientSolver - for InteriorPointOptimizer -{ - fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { - let (_, p) = a.shape(); - - for i in 0..p { - x.set( - i, - 0, - (self.d1[i] * b.get(i, 0) - self.d2[i] * b.get(i + p, 0)) / self.prs[i], - ); - x.set( - i + p, - 0, - (-self.d2[i] * b.get(i, 0) + self.prb[i] * b.get(i + p, 0)) / self.prs[i], - ); - } - } - - fn mat_vec_mul(&self, _: &M, x: &M, y: &mut M) { - let (_, p) = self.ata.shape(); - let atax = self.ata.matmul(&x.slice(0..p, 0..1)); - - for i in 0..p { - y.set( - i, - 0, - T::two() * 
atax.get(i, 0) + self.d1[i] * x.get(i, 0) + self.d2[i] * x.get(i + p, 0), - ); - y.set( - i + p, - 0, - self.d2[i] * x.get(i, 0) + self.d1[i] * x.get(i + p, 0), - ); - } - } - - fn mat_t_vec_mul(&self, a: &M, x: &M, y: &mut M) { - self.mat_vec_mul(a, x, y); - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/src/linear/lasso_optimizer.rs b/src/linear/lasso_optimizer.rs new file mode 100644 index 0000000..4f5011f --- /dev/null +++ b/src/linear/lasso_optimizer.rs @@ -0,0 +1,255 @@ +//! An Interior-Point Method for Large-Scale l1-Regularized Least Squares +//! +//! This is a specialized interior-point method for solving large-scale 1-regularized LSPs that uses the +//! preconditioned conjugate gradients algorithm to compute the search direction. +//! +//! The interior-point method can solve large sparse problems, with a million variables and observations, in a few tens of minutes on a PC. +//! It can efficiently solve large dense problems, that arise in sparse signal recovery with orthogonal transforms, by exploiting fast algorithms for these transforms. +//! +//! ## References: +//! * ["An Interior-Point Method for Large-Scale l1-Regularized Least Squares", K. Koh, M. Lustig, S. Boyd, D. Gorinevsky](https://web.stanford.edu/~boyd/papers/pdf/l1_ls.pdf) +//! * [Simple Matlab Solver for l1-regularized Least Squares Problems](https://web.stanford.edu/~boyd/l1_ls/) +//! 
+ +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::linear::bg_solver::BiconjugateGradientSolver; +use crate::math::num::RealNumber; + +pub struct InteriorPointOptimizer> { + ata: M, + d1: Vec, + d2: Vec, + prb: Vec, + prs: Vec, +} + +impl> InteriorPointOptimizer { + pub fn new(a: &M, n: usize) -> InteriorPointOptimizer { + InteriorPointOptimizer { + ata: a.ab(true, a, false), + d1: vec![T::zero(); n], + d2: vec![T::zero(); n], + prb: vec![T::zero(); n], + prs: vec![T::zero(); n], + } + } + + pub fn optimize( + &mut self, + x: &M, + y: &M::RowVector, + lambda: T, + max_iter: usize, + tol: T, + ) -> Result { + let (n, p) = x.shape(); + let p_f64 = T::from_usize(p).unwrap(); + + let lambda = lambda.max(T::epsilon()); + + //parameters + let pcgmaxi = 5000; + let min_pcgtol = T::from_f64(0.1).unwrap(); + let eta = T::from_f64(1E-3).unwrap(); + let alpha = T::from_f64(0.01).unwrap(); + let beta = T::from_f64(0.5).unwrap(); + let gamma = T::from_f64(-0.25).unwrap(); + let mu = T::two(); + + let y = M::from_row_vector(y.sub_scalar(y.mean())).transpose(); + + let mut max_ls_iter = 100; + let mut pitr = 0; + let mut w = M::zeros(p, 1); + let mut neww = w.clone(); + let mut u = M::ones(p, 1); + let mut newu = u.clone(); + + let mut f = M::fill(p, 2, -T::one()); + let mut newf = f.clone(); + + let mut q1 = vec![T::zero(); p]; + let mut q2 = vec![T::zero(); p]; + + let mut dx = M::zeros(p, 1); + let mut du = M::zeros(p, 1); + let mut dxu = M::zeros(2 * p, 1); + let mut grad = M::zeros(2 * p, 1); + + let mut nu = M::zeros(n, 1); + let mut dobj = T::zero(); + let mut s = T::infinity(); + let mut t = T::one() + .max(T::one() / lambda) + .min(T::two() * p_f64 / T::from(1e-3).unwrap()); + + for ntiter in 0..max_iter { + let mut z = x.matmul(&w); + + for i in 0..n { + z.set(i, 0, z.get(i, 0) - y.get(i, 0)); + nu.set(i, 0, T::two() * z.get(i, 0)); + } + + // CALCULATE DUALITY GAP + let xnu = x.ab(true, &nu, false); + let max_xnu = 
xnu.norm(T::infinity()); + if max_xnu > lambda { + let lnu = lambda / max_xnu; + nu.mul_scalar_mut(lnu); + } + + let pobj = z.dot(&z) + lambda * w.norm(T::one()); + dobj = dobj.max(gamma * nu.dot(&nu) - nu.dot(&y)); + + let gap = pobj - dobj; + + // STOPPING CRITERION + if gap / dobj < tol { + break; + } + + // UPDATE t + if s >= T::half() { + t = t.max((T::two() * p_f64 * mu / gap).min(mu * t)); + } + + // CALCULATE NEWTON STEP + for i in 0..p { + let q1i = T::one() / (u.get(i, 0) + w.get(i, 0)); + let q2i = T::one() / (u.get(i, 0) - w.get(i, 0)); + q1[i] = q1i; + q2[i] = q2i; + self.d1[i] = (q1i * q1i + q2i * q2i) / t; + self.d2[i] = (q1i * q1i - q2i * q2i) / t; + } + + let mut gradphi = x.ab(true, &z, false); + + for i in 0..p { + let g1 = T::two() * gradphi.get(i, 0) - (q1[i] - q2[i]) / t; + let g2 = lambda - (q1[i] + q2[i]) / t; + gradphi.set(i, 0, g1); + grad.set(i, 0, -g1); + grad.set(i + p, 0, -g2); + } + + for i in 0..p { + self.prb[i] = T::two() + self.d1[i]; + self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i] * self.d2[i]; + } + + let normg = grad.norm2(); + let mut pcgtol = min_pcgtol.min(eta * gap / T::one().min(normg)); + if ntiter != 0 && pitr == 0 { + pcgtol *= min_pcgtol; + } + + let error = self.solve_mut(x, &grad, &mut dxu, pcgtol, pcgmaxi)?; + if error > pcgtol { + pitr = pcgmaxi; + } + + for i in 0..p { + dx.set(i, 0, dxu.get(i, 0)); + du.set(i, 0, dxu.get(i + p, 0)); + } + + // BACKTRACKING LINE SEARCH + let phi = z.dot(&z) + lambda * u.sum() - Self::sumlogneg(&f) / t; + s = T::one(); + let gdx = grad.dot(&dxu); + + let lsiter = 0; + while lsiter < max_ls_iter { + for i in 0..p { + neww.set(i, 0, w.get(i, 0) + s * dx.get(i, 0)); + newu.set(i, 0, u.get(i, 0) + s * du.get(i, 0)); + newf.set(i, 0, neww.get(i, 0) - newu.get(i, 0)); + newf.set(i, 1, -neww.get(i, 0) - newu.get(i, 0)); + } + + if newf.max() < T::zero() { + let mut newz = x.matmul(&neww); + for i in 0..n { + newz.set(i, 0, newz.get(i, 0) - y.get(i, 0)); + } + + let newphi = 
newz.dot(&newz) + lambda * newu.sum() - Self::sumlogneg(&newf) / t; + if newphi - phi <= alpha * s * gdx { + break; + } + } + s = beta * s; + max_ls_iter += 1; + } + + if lsiter == max_ls_iter { + return Err(Failed::fit( + "Exceeded maximum number of iteration for interior point optimizer", + )); + } + + w.copy_from(&neww); + u.copy_from(&newu); + f.copy_from(&newf); + } + + Ok(w) + } + + fn sumlogneg(f: &M) -> T { + let (n, _) = f.shape(); + let mut sum = T::zero(); + for i in 0..n { + sum += (-f.get(i, 0)).ln(); + sum += (-f.get(i, 1)).ln(); + } + sum + } +} + +impl<'a, T: RealNumber, M: Matrix> BiconjugateGradientSolver + for InteriorPointOptimizer +{ + fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { + let (_, p) = a.shape(); + + for i in 0..p { + x.set( + i, + 0, + (self.d1[i] * b.get(i, 0) - self.d2[i] * b.get(i + p, 0)) / self.prs[i], + ); + x.set( + i + p, + 0, + (-self.d2[i] * b.get(i, 0) + self.prb[i] * b.get(i + p, 0)) / self.prs[i], + ); + } + } + + fn mat_vec_mul(&self, _: &M, x: &M, y: &mut M) { + let (_, p) = self.ata.shape(); + let atax = self.ata.matmul(&x.slice(0..p, 0..1)); + + for i in 0..p { + y.set( + i, + 0, + T::two() * atax.get(i, 0) + self.d1[i] * x.get(i, 0) + self.d2[i] * x.get(i + p, 0), + ); + y.set( + i + p, + 0, + self.d2[i] * x.get(i, 0) + self.d1[i] * x.get(i + p, 0), + ); + } + } + + fn mat_t_vec_mul(&self, a: &M, x: &M, y: &mut M) { + self.mat_vec_mul(a, x, y); + } +} diff --git a/src/linear/mod.rs b/src/linear/mod.rs index edaea4f..8c056e8 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -21,7 +21,9 @@ //! 
pub(crate) mod bg_solver; +pub mod elasticnet; pub mod lasso; +pub(crate) mod lasso_optimizer; pub mod linear_regression; pub mod logistic_regression; pub mod ridge_regression; From cceb2f046d112094dd149985e1d482da40b1b194 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sun, 13 Dec 2020 13:35:14 -0800 Subject: [PATCH 12/78] feat: lasso documentation --- src/linalg/naive/dense_matrix.rs | 24 ++++++-- src/linear/{elasticnet.rs => elastic_net.rs} | 65 ++++++++++++++++++-- src/linear/lasso.rs | 29 +++------ src/linear/mod.rs | 2 +- 4 files changed, 86 insertions(+), 34 deletions(-) rename src/linear/{elasticnet.rs => elastic_net.rs} (74%) diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 400366d..a0b7bdb 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -187,9 +187,7 @@ impl BaseVector for Vec { ); } - for i in 0..self.len() { - self[i] = other[i]; - } + self[..].clone_from_slice(&other[..]); } } @@ -929,9 +927,7 @@ impl BaseMatrix for DenseMatrix { ); } - for i in 0..self.values.len() { - self.values[i] = other.values[i]; - } + self.values[..].clone_from_slice(&other.values[..]); } fn abs_mut(&mut self) -> &Self { @@ -1066,6 +1062,14 @@ mod tests { assert_eq!(32.0, BaseVector::dot(&v1, &v2)); } + #[test] + fn vec_copy_from() { + let mut v1 = vec![1., 2., 3.]; + let v2 = vec![4., 5., 6.]; + v1.copy_from(&v2); + assert_eq!(v1, v2); + } + #[test] fn vec_approximate_eq() { let a = vec![1., 2., 3.]; @@ -1199,6 +1203,14 @@ mod tests { assert_eq!(a.dot(&b), 32.); } + #[test] + fn copy_from() { + let mut a = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.]]); + let b = DenseMatrix::from_2d_array(&[&[7., 8.], &[9., 10.], &[11., 12.]]); + a.copy_from(&b); + assert_eq!(a, b); + } + #[test] fn slice() { let m = DenseMatrix::from_2d_array(&[ diff --git a/src/linear/elasticnet.rs b/src/linear/elastic_net.rs similarity index 74% rename from src/linear/elasticnet.rs rename to 
src/linear/elastic_net.rs index 7b6acb1..c01f3c7 100644 --- a/src/linear/elasticnet.rs +++ b/src/linear/elastic_net.rs @@ -1,5 +1,51 @@ +#![allow(clippy::needless_range_loop)] //! # Elastic Net //! +//! Elastic net is an extension of [linear regression](../linear_regression/index.html) that adds regularization penalties to the loss function during training. +//! Just like in ordinary linear regression you assume a linear relationship between input variables and the target variable. +//! Unlike linear regression elastic net adds regularization penalties to the loss function during training. +//! In particular, the elastic net coefficient estimates \\(\beta\\) are the values that minimize +//! +//! \\[L(\alpha, \beta) = \vert \boldsymbol{y} - \boldsymbol{X}\beta\vert^2 + \lambda_1 \vert \beta \vert^2 + \lambda_2 \vert \beta \vert_1\\] +//! +//! where \\(\lambda_1 = \\alpha l_{1r}\\), \\(\lambda_2 = \\alpha (1 - l_{1r})\\) and \\(l_{1r}\\) is the l1 ratio, elastic net mixing parameter. +//! +//! In essense, elastic net combines both the [L1](../lasso/index.html) and [L2](../ridge_regression/index.html) penalties during training, +//! which can result in better performance than a model with either one or the other penalty on some problems. +//! The elastic net is particularly useful when the number of predictors (p) is much bigger than the number of observations (n). +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::linear::elastic_net::*; +//! +//! // Longley dataset (https://www.statsmodels.org/stable/datasets/generated/longley.html) +//! let x = DenseMatrix::from_2d_array(&[ +//! &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], +//! &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], +//! &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], +//! &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], +//! &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], +//! &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], +//! 
&[365.385, 187.0, 354.7, 115.094, 1953., 64.989], +//! &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], +//! &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], +//! &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], +//! &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], +//! &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], +//! &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], +//! &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], +//! &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], +//! &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], +//! ]); +//! +//! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, +//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; +//! +//! let y_hat = ElasticNet::fit(&x, &y, Default::default()). +//! and_then(|lr| lr.predict(&x)).unwrap(); +//! ``` //! //! ## References: //! @@ -19,17 +65,24 @@ use crate::math::num::RealNumber; use crate::linear::lasso_optimizer::InteriorPointOptimizer; -/// Ridge Regression parameters +/// Elastic net parameters #[derive(Serialize, Deserialize, Debug)] pub struct ElasticNetParameters { + /// Regularization parameter. pub alpha: T, + /// The elastic net mixing parameter, with 0 <= l1_ratio <= 1. + /// For l1_ratio = 0 the penalty is an L2 penalty. + /// For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. pub l1_ratio: T, + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. pub normalize: bool, + /// The tolerance for the optimization pub tol: T, + /// The maximum number of iterations pub max_iter: usize, } -/// Ridge regression +/// Elastic net #[derive(Serialize, Deserialize, Debug)] pub struct ElasticNet> { coefficients: M, @@ -56,7 +109,7 @@ impl> PartialEq for ElasticNet { } impl> ElasticNet { - /// Fits ridge regression to your data. + /// Fits elastic net regression to your data. 
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target values /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. @@ -81,7 +134,7 @@ impl> ElasticNet { let (w, b) = if parameters.normalize { let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; - let (x, y, gamma) = Self::augment_X_and_y(&scaled_x, y, l2_reg); + let (x, y, gamma) = Self::augment_x_and_y(&scaled_x, y, l2_reg); let mut optimizer = InteriorPointOptimizer::new(&x, p); @@ -102,7 +155,7 @@ impl> ElasticNet { (w, b) } else { - let (x, y, gamma) = Self::augment_X_and_y(x, y, l2_reg); + let (x, y, gamma) = Self::augment_x_and_y(x, y, l2_reg); let mut optimizer = InteriorPointOptimizer::new(&x, p); @@ -159,7 +212,7 @@ impl> ElasticNet { Ok((scaled_x, col_mean, col_std)) } - fn augment_X_and_y(x: &M, y: &M::RowVector, l2_reg: T) -> (M, M::RowVector, T) { + fn augment_x_and_y(x: &M, y: &M::RowVector, l2_reg: T) -> (M, M::RowVector, T) { let (n, p) = x.shape(); let gamma = T::one() / (T::one() + l2_reg).sqrt(); diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index bb9e69c..7395bdc 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -105,18 +105,15 @@ impl> Lasso { return Err(Failed::fit("Number of rows in X should = len(y)")); } + let l1_reg = parameters.alpha * T::from_usize(n).unwrap(); + let (w, b) = if parameters.normalize { let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; let mut optimizer = InteriorPointOptimizer::new(&scaled_x, p); - let mut w = optimizer.optimize( - &scaled_x, - y, - parameters.alpha, - parameters.max_iter, - parameters.tol, - )?; + let mut w = + optimizer.optimize(&scaled_x, y, l1_reg, parameters.max_iter, parameters.tol)?; for (j, col_std_j) in col_std.iter().enumerate().take(p) { w.set(j, 0, w.get(j, 0) / *col_std_j); @@ -133,8 +130,7 @@ impl> Lasso { } else { let mut optimizer = InteriorPointOptimizer::new(x, p); - let w = - optimizer.optimize(x, y, 
parameters.alpha, parameters.max_iter, parameters.tol)?; + let w = optimizer.optimize(x, y, l1_reg, parameters.max_iter, parameters.tol)?; (w, y.mean()) }; @@ -215,18 +211,9 @@ mod tests { 114.2, 115.7, 116.9, ]; - let y_hat = Lasso::fit( - &x, - &y, - LassoParameters { - alpha: 0.1, - normalize: true, - tol: 1e-4, - max_iter: 1000, - }, - ) - .and_then(|lr| lr.predict(&x)) - .unwrap(); + let y_hat = Lasso::fit(&x, &y, Default::default()) + .and_then(|lr| lr.predict(&x)) + .unwrap(); assert!(mean_absolute_error(&y_hat, &y) < 2.0); diff --git a/src/linear/mod.rs b/src/linear/mod.rs index 8c056e8..3824d36 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -21,7 +21,7 @@ //! pub(crate) mod bg_solver; -pub mod elasticnet; +pub mod elastic_net; pub mod lasso; pub(crate) mod lasso_optimizer; pub mod linear_regression; From 74a7c45c75313045c0547ef5271e604c5995fb89 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 14 Dec 2020 14:59:02 -0800 Subject: [PATCH 13/78] feat: adds SVD --- src/decomposition/mod.rs | 1 + src/decomposition/pca.rs | 28 +++++ src/decomposition/svd.rs | 235 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+) create mode 100644 src/decomposition/svd.rs diff --git a/src/decomposition/mod.rs b/src/decomposition/mod.rs index a01c114..1460bd6 100644 --- a/src/decomposition/mod.rs +++ b/src/decomposition/mod.rs @@ -13,3 +13,4 @@ /// PCA is a popular approach for deriving a low-dimensional set of features from a large set of variables. 
pub mod pca; +pub mod svd; diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 9f5bd39..7d80f88 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -108,6 +108,13 @@ impl> PCA { ) -> Result, Failed> { let (m, n) = data.shape(); + if n_components > n { + return Err(Failed::fit(&format!( + "Number of components, n_components should be <= number of attributes ({})", + n + ))); + } + let mu = data.column_mean(); let mut x = data.clone(); @@ -224,6 +231,11 @@ impl> PCA { } Ok(x_transformed) } + + /// Get a projection matrix + pub fn components(&self) -> &M { + &self.projection + } } #[cfg(test)] @@ -286,6 +298,22 @@ mod tests { ]) } + #[test] + fn pca_components() { + let us_arrests = us_arrests_data(); + + let expected = DenseMatrix::from_2d_array(&[ + &[0.0417, 0.0448], + &[0.9952, 0.0588], + &[0.0463, 0.9769], + &[0.0752, 0.2007], + ]); + + let pca = PCA::fit(&us_arrests, 2, Default::default()).unwrap(); + + assert!(expected.approximate_eq(&pca.components().abs(), 0.4)); + } + #[test] fn decompose_covariance() { let us_arrests = us_arrests_data(); diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs new file mode 100644 index 0000000..fbaf042 --- /dev/null +++ b/src/decomposition/svd.rs @@ -0,0 +1,235 @@ +//! # Dimensionality reduction using SVD +//! +//! Similar to [`PCA`](../pca/index.html), SVD is a technique that can be used to reduce the number of input variables _p_ to a smaller number _k_, while preserving +//! the most important structure or relationships between the variables observed in the data. +//! +//! Contrary to PCA, SVD does not center the data before computing the singular value decomposition. +//! +//! Example: +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::decomposition::svd::*; +//! +//! // Iris data +//! let iris = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! 
&[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! +//! let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); // Reduce number of features to 2 +//! +//! let iris_reduced = svd.transform(&iris).unwrap(); +//! +//! ``` +//! +//! +//! +use std::fmt::Debug; +use std::marker::PhantomData; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +/// SVD +#[derive(Serialize, Deserialize, Debug)] +pub struct SVD> { + components: M, + phantom: PhantomData, +} + +impl> PartialEq for SVD { + fn eq(&self, other: &Self) -> bool { + self.components + .approximate_eq(&other.components, T::from_f64(1e-8).unwrap()) + } +} + +#[derive(Debug, Clone)] +/// SVD parameters +pub struct SVDParameters {} + +impl Default for SVDParameters { + fn default() -> Self { + SVDParameters {} + } +} + +impl> SVD { + /// Fits SVD to your data. + /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `n_components` - number of components to keep. + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit(x: &M, n_components: usize, _: SVDParameters) -> Result, Failed> { + let (_, p) = x.shape(); + + if n_components >= p { + return Err(Failed::fit(&format!( + "Number of components, n_components should be < number of attributes ({})", + p + ))); + } + + let svd = x.svd()?; + + let components = svd.V.slice(0..p, 0..n_components); + + Ok(SVD { + components, + phantom: PhantomData, + }) + } + + /// Run dimensionality reduction for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn transform(&self, x: &M) -> Result { + let (n, p) = x.shape(); + let (p_c, k) = self.components.shape(); + if p_c != p { + return Err(Failed::transform(&format!( + "Can not transform a {}x{} matrix into {}x{} matrix, incorrect input dimentions", + n, p, n, k + ))); + } + + Ok(x.matmul(&self.components)) + } + + /// Get a projection matrix + pub fn components(&self) -> &M { + &self.components + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + + #[test] + fn svd_decompose() { + // https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/USArrests.html + let x = DenseMatrix::from_2d_array(&[ + &[13.2, 236.0, 58.0, 21.2], + &[10.0, 263.0, 48.0, 44.5], + &[8.1, 294.0, 80.0, 31.0], + &[8.8, 190.0, 50.0, 19.5], + &[9.0, 276.0, 91.0, 40.6], + &[7.9, 204.0, 78.0, 38.7], + &[3.3, 110.0, 77.0, 11.1], + &[5.9, 238.0, 72.0, 15.8], + &[15.4, 335.0, 80.0, 31.9], + &[17.4, 211.0, 60.0, 25.8], + &[5.3, 46.0, 83.0, 20.2], + &[2.6, 120.0, 54.0, 14.2], + &[10.4, 249.0, 83.0, 24.0], + &[7.2, 113.0, 65.0, 21.0], + &[2.2, 56.0, 57.0, 11.3], + &[6.0, 115.0, 66.0, 18.0], + &[9.7, 109.0, 52.0, 16.3], + &[15.4, 249.0, 66.0, 22.2], + &[2.1, 83.0, 51.0, 7.8], + &[11.3, 300.0, 67.0, 27.8], + &[4.4, 149.0, 85.0, 16.3], + &[12.1, 255.0, 74.0, 35.1], + &[2.7, 72.0, 66.0, 14.9], + &[16.1, 259.0, 44.0, 17.1], + &[9.0, 178.0, 70.0, 28.2], + &[6.0, 109.0, 53.0, 16.4], + &[4.3, 102.0, 62.0, 16.5], + &[12.2, 252.0, 
81.0, 46.0], + &[2.1, 57.0, 56.0, 9.5], + &[7.4, 159.0, 89.0, 18.8], + &[11.4, 285.0, 70.0, 32.1], + &[11.1, 254.0, 86.0, 26.1], + &[13.0, 337.0, 45.0, 16.1], + &[0.8, 45.0, 44.0, 7.3], + &[7.3, 120.0, 75.0, 21.4], + &[6.6, 151.0, 68.0, 20.0], + &[4.9, 159.0, 67.0, 29.3], + &[6.3, 106.0, 72.0, 14.9], + &[3.4, 174.0, 87.0, 8.3], + &[14.4, 279.0, 48.0, 22.5], + &[3.8, 86.0, 45.0, 12.8], + &[13.2, 188.0, 59.0, 26.9], + &[12.7, 201.0, 80.0, 25.5], + &[3.2, 120.0, 80.0, 22.9], + &[2.2, 48.0, 32.0, 11.2], + &[8.5, 156.0, 63.0, 20.7], + &[4.0, 145.0, 73.0, 26.2], + &[5.7, 81.0, 39.0, 9.3], + &[2.6, 53.0, 66.0, 10.8], + &[6.8, 161.0, 60.0, 15.6], + ]); + + let expected = DenseMatrix::from_2d_array(&[ + &[243.54655757, -18.76673788], + &[268.36802004, -33.79304302], + &[305.93972467, -15.39087376], + &[197.28420365, -11.66808306], + &[293.43187394, 1.91163633], + ]); + let svd = SVD::fit(&x, 2, Default::default()).unwrap(); + + let x_transformed = svd.transform(&x).unwrap(); + + assert_eq!(svd.components.shape(), (x.shape().1, 2)); + + assert!(x_transformed + .slice(0..5, 0..2) + .approximate_eq(&expected, 1e-4)); + } + + #[test] + fn serde() { + let iris = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + + let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); + + let deserialized_svd: SVD> = + serde_json::from_str(&serde_json::to_string(&svd).unwrap()).unwrap(); + + assert_eq!(svd, deserialized_svd); + } +} From d39b04e549fdf077825ee77205773bc7d044373b Mon Sep 17 00:00:00 2001 From: 
Volodymyr Orlov Date: Mon, 14 Dec 2020 15:03:10 -0800 Subject: [PATCH 14/78] fix: fmt --- src/decomposition/svd.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index fbaf042..eea1969 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -2,7 +2,7 @@ //! //! Similar to [`PCA`](../pca/index.html), SVD is a technique that can be used to reduce the number of input variables _p_ to a smaller number _k_, while preserving //! the most important structure or relationships between the variables observed in the data. -//! +//! //! Contrary to PCA, SVD does not center the data before computing the singular value decomposition. //! //! Example: From 505f495445e1c51fc66ea670cc59744ba57fc49d Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 16 Dec 2020 00:20:07 -0400 Subject: [PATCH 15/78] fix: Update ndarray version --- Cargo.toml | 2 +- src/linalg/ndarray_bindings.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1503957..32d8695 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ nalgebra-bindings = ["nalgebra"] datasets = [] [dependencies] -ndarray = { version = "0.13", optional = true } +ndarray = { version = "0.14", optional = true } nalgebra = { version = "0.23.0", optional = true } num-traits = "0.2.12" num = "0.3.0" diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 3f0478f..b80fac8 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -47,7 +47,7 @@ use std::ops::Range; use std::ops::SubAssign; use ndarray::ScalarOperand; -use ndarray::{s, stack, Array, ArrayBase, Axis, Ix1, Ix2, OwnedRepr}; +use ndarray::{concatenate, s, Array, ArrayBase, Axis, Ix1, Ix2, OwnedRepr}; use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; @@ -246,11 +246,11 @@ impl Self { - stack(Axis(1), &[self.view(), 
other.view()]).unwrap() + concatenate(Axis(1), &[self.view(), other.view()]).unwrap() } fn v_stack(&self, other: &Self) -> Self { - stack(Axis(0), &[self.view(), other.view()]).unwrap() + concatenate(Axis(0), &[self.view(), other.view()]).unwrap() } fn matmul(&self, other: &Self) -> Self { From f76a1d142007a15bcc0e272acba4b29976dcda4c Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 17 Dec 2020 13:01:45 -0800 Subject: [PATCH 16/78] feat: makes smartcore::error:FailedError non-exhaustive --- src/error/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/error/mod.rs b/src/error/mod.rs index 1615290..2409889 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -12,6 +12,7 @@ pub struct Failed { } /// Type of error +#[non_exhaustive] #[derive(Copy, Clone, Debug, Serialize, Deserialize)] pub enum FailedError { /// Can't fit algorithm to data From 5a185479a7edd93574f835ca23e9c2acf9420747 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 17 Dec 2020 19:00:11 -0800 Subject: [PATCH 17/78] feat: NB documentation --- src/naive_bayes/bernoulli.rs | 35 ++++++++++++++++++++++++ src/naive_bayes/categorical.rs | 32 ++++++++++++++++++++++ src/naive_bayes/gaussian.rs | 24 ++++++++++++++++ src/naive_bayes/mod.rs | 50 ++++++++++++++++++++++++++++------ src/naive_bayes/multinomial.rs | 35 ++++++++++++++++++++++++ src/svm/svc.rs | 1 - 6 files changed, 167 insertions(+), 10 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 057b447..c478d58 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -1,3 +1,38 @@ +//! # Bernoulli Naive Bayes +//! +//! Bernoulli Naive Bayes classifier is a variant of [Naive Bayes](../index.html) for the data that is distributed according to multivariate Bernoulli distribution. +//! It is used for discrete data with binary features. One example of a binary feature is a word that occurs in the text or not. +//! +//! Example: +//! +//! ``` +//! 
use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::naive_bayes::bernoulli::BernoulliNB; +//! +//! // Training data points are: +//! // Chinese Beijing Chinese (class: China) +//! // Chinese Chinese Shanghai (class: China) +//! // Chinese Macao (class: China) +//! // Tokyo Japan Chinese (class: Japan) +//! let x = DenseMatrix::::from_2d_array(&[ +//! &[1., 1., 0., 0., 0., 0.], +//! &[0., 1., 0., 0., 1., 0.], +//! &[0., 1., 0., 1., 0., 0.], +//! &[0., 1., 1., 0., 0., 1.], +//! ]); +//! let y = vec![0., 0., 0., 1.]; +//! +//! let nb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); +//! +//! // Testing data point is: +//! // Chinese Chinese Chinese Tokyo Japan +//! let x_test = DenseMatrix::::from_2d_array(&[&[0., 1., 1., 0., 0., 1.]]); +//! let y_hat = nb.predict(&x_test).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index d32c34d..d6b24a2 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -1,3 +1,35 @@ +//! # Categorical Naive Bayes +//! +//! Categorical Naive Bayes is a variant of [Naive Bayes](../index.html) for the categorically distributed data. +//! It assumes that each feature has its own categorical distribution. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::naive_bayes::categorical::CategoricalNB; +//! +//! let x = DenseMatrix::from_2d_array(&[ +//! &[3., 4., 0., 1.], +//! &[3., 0., 0., 1.], +//! &[4., 4., 1., 2.], +//! &[4., 2., 4., 3.], +//! &[4., 2., 4., 2.], +//! &[4., 1., 1., 0.], +//! &[1., 1., 1., 1.], +//! &[0., 4., 1., 0.], +//! &[0., 3., 2., 1.], +//! &[0., 3., 1., 1.], +//! &[3., 4., 0., 1.], +//! 
&[3., 4., 2., 4.], +//! &[0., 3., 1., 2.], +//! &[0., 4., 1., 2.], +//! ]); +//! let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; +//! +//! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); +//! let y_hat = nb.predict(&x).unwrap(); +//! ``` use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index af5732d..fc11b49 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -1,3 +1,27 @@ +//! # Gaussian Naive Bayes +//! +//! Gaussian Naive Bayes is a variant of [Naive Bayes](../index.html) for the data that follows Gaussian distribution and +//! it supports continuous valued features conforming to a normal distribution. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::naive_bayes::gaussian::GaussianNB; +//! +//! let x = DenseMatrix::from_2d_array(&[ +//! &[-1., -1.], +//! &[-2., -1.], +//! &[-3., -2.], +//! &[ 1., 1.], +//! &[ 2., 1.], +//! &[ 3., 2.], +//! ]); +//! let y = vec![1., 1., 1., 2., 2., 2.]; +//! +//! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); +//! let y_hat = nb.predict(&x).unwrap(); +//! ``` use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 508b976..7ab8b85 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -1,3 +1,40 @@ +//! # Naive Bayes +//! +//! Naive Bayes (NB) is a simple but powerful machine learning algorithm. +//! Naive Bayes classifier is based on Bayes’ Theorem with an ssumption of conditional independence +//! between every pair of features given the value of the class variable. +//! +//! Bayes’ theorem can be written as +//! +//! \\[ P(y | X) = \frac{P(y)P(X| y)}{P(X)} \\] +//! +//! where +//! +//! * \\(X = (x_1,...x_n)\\) represents the predictors. +//! 
* \\(P(y | X)\\) is the probability of class _y_ given the data X +//! * \\(P(X| y)\\) is the probability of data X given the class _y_. +//! * \\(P(y)\\) is the probability of class y. This is called the prior probability of y. +//! * \\(P(y | X)\\) is the probability of the data (regardless of the class value). +//! +//! The naive conditional independence assumption let us rewrite this equation as +//! +//! \\[ P(y | x_1,...x_n) = \frac{P(y)\prod_{i=1}^nP(x_i|y)}{P(x_1,...x_n)} \\] +//! +//! +//! The denominator can be removed since \\(P(x_1,...x_n)\\) is constrant for all the entries in the dataset. +//! +//! \\[ P(y | x_1,...x_n) \propto P(y)\prod_{i=1}^nP(x_i|y) \\] +//! +//! To find class y from predictors X we use this equation +//! +//! \\[ y = \underset{y}{argmax} P(y)\prod_{i=1}^nP(x_i|y) \\] +//! +//! ## References: +//! +//! * ["Machine Learning: A Probabilistic Perspective", Kevin P. Murphy, 2012, Chapter 3 ](https://mitpress.mit.edu/books/machine-learning-1) +//! +//! +//! use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -64,12 +101,7 @@ impl, D: NBDistribution> BaseNaiveBayes::from_2d_array(&[ +//! &[1., 2., 0., 0., 0., 0.], +//! &[0., 2., 0., 0., 1., 0.], +//! &[0., 1., 0., 1., 0., 0.], +//! &[0., 1., 1., 0., 0., 1.], +//! ]); +//! let y = vec![0., 0., 0., 1.]; +//! let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); +//! +//! // Testing data point is: +//! // Chinese Chinese Chinese Tokyo Japan +//! let x_test = DenseMatrix::::from_2d_array(&[&[0., 3., 1., 0., 0., 1.]]); +//! let y_hat = nb.predict(&x_test).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["Introduction to Information Retrieval", Manning C. 
D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 4fd70df..9e166d5 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -28,7 +28,6 @@ //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; -//! use smartcore::linear::linear_regression::*; //! use smartcore::svm::Kernels; //! use smartcore::svm::svc::{SVC, SVCParameters}; //! From 8ca13a76d699f577b9746376f878cf8d23ec59e1 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 17 Dec 2020 19:11:47 -0800 Subject: [PATCH 18/78] fix: criterion --- benches/naive_bayes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/naive_bayes.rs b/benches/naive_bayes.rs index 2a4595b..ba8cb6f 100644 --- a/benches/naive_bayes.rs +++ b/benches/naive_bayes.rs @@ -6,7 +6,7 @@ use ndarray::Array2; use smartcore::linalg::naive::dense_matrix::DenseMatrix; use smartcore::linalg::BaseMatrix; use smartcore::linalg::BaseVector; -use smartcore::naive_bayes::GaussianNB; +use smartcore::naive_bayes::gaussian::GaussianNB; pub fn gaussian_naive_bayes_fit_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("GaussianNB::fit"); From c9eb94ba939cbf9a9987a8bd1332568c9b49a0b5 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 16 Dec 2020 20:11:09 -0400 Subject: [PATCH 19/78] Derive clone for NB Parameters --- src/naive_bayes/bernoulli.rs | 2 +- src/naive_bayes/categorical.rs | 2 +- src/naive_bayes/gaussian.rs | 2 +- src/naive_bayes/multinomial.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index c478d58..dd34ae9 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -76,7 +76,7 @@ impl> NBDistribution for BernoulliNBDistributi } /// `BernoulliNB` parameters. 
Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct BernoulliNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index d6b24a2..c4626ef 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -215,7 +215,7 @@ impl CategoricalNBDistribution { } /// `CategoricalNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct CategoricalNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index fc11b49..c5c1fb2 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -74,7 +74,7 @@ impl> NBDistribution for GaussianNBDistributio } /// `GaussianNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Default)] +#[derive(Serialize, Deserialize, Debug, Default, Clone)] pub struct GaussianNBParameters { /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data pub priors: Option>, diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 0fb7aa4..c9ac86b 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -72,7 +72,7 @@ impl> NBDistribution for MultinomialNBDistribu } /// `MultinomialNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct MultinomialNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). 
pub alpha: T, From a2be9e117f96e173c9aaaa0be0e6f79cdac719ac Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 15:41:53 -0800 Subject: [PATCH 20/78] feat: + cross_validate, trait Predictor, refactoring --- src/algorithm/neighbour/cover_tree.rs | 3 +- src/algorithm/neighbour/linear_search.rs | 2 + src/base.rs | 10 + src/ensemble/random_forest_classifier.rs | 9 +- src/ensemble/random_forest_regressor.rs | 7 + src/lib.rs | 3 +- src/linalg/mod.rs | 72 ++++ src/linalg/ndarray_bindings.rs | 4 +- src/linear/elastic_net.rs | 9 +- src/linear/lasso.rs | 9 +- src/linear/linear_regression.rs | 11 +- src/linear/logistic_regression.rs | 34 +- src/linear/ridge_regression.rs | 11 +- src/math/distance/euclidian.rs | 2 +- src/math/distance/hamming.rs | 2 +- src/math/distance/mahalanobis.rs | 2 +- src/math/distance/manhattan.rs | 2 +- src/math/distance/minkowski.rs | 2 +- src/math/distance/mod.rs | 2 +- src/metrics/mod.rs | 2 +- src/model_selection/kfold.rs | 286 ++++++++++++++ src/model_selection/mod.rs | 473 ++++++++++++----------- src/naive_bayes/bernoulli.rs | 7 + src/naive_bayes/categorical.rs | 7 + src/naive_bayes/gaussian.rs | 7 + src/naive_bayes/multinomial.rs | 7 + src/neighbors/knn_classifier.rs | 79 +++- src/neighbors/knn_regressor.rs | 80 +++- src/neighbors/mod.rs | 2 +- src/svm/mod.rs | 5 +- src/svm/svc.rs | 94 +++-- src/svm/svr.rs | 83 ++-- src/tree/decision_tree_classifier.rs | 9 +- src/tree/decision_tree_regressor.rs | 9 +- 34 files changed, 977 insertions(+), 369 deletions(-) create mode 100644 src/base.rs create mode 100644 src/model_selection/kfold.rs diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 2fe7792..d271ed6 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -6,6 +6,7 @@ //! use smartcore::algorithm::neighbour::cover_tree::*; //! use smartcore::math::distance::Distance; //! +//! #[derive(Clone)] //! 
struct SimpleDistance {} // Our distance function //! //! impl Distance for SimpleDistance { @@ -453,7 +454,7 @@ mod tests { use super::*; use crate::math::distance::Distances; - #[derive(Debug, Serialize, Deserialize)] + #[derive(Debug, Serialize, Deserialize, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index d09f2ed..45fbd6f 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -5,6 +5,7 @@ //! use smartcore::algorithm::neighbour::linear_search::*; //! use smartcore::math::distance::Distance; //! +//! #[derive(Clone)] //! struct SimpleDistance {} // Our distance function //! //! impl Distance for SimpleDistance { @@ -137,6 +138,7 @@ mod tests { use super::*; use crate::math::distance::Distances; + #[derive(Debug, Serialize, Deserialize, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/base.rs b/src/base.rs new file mode 100644 index 0000000..a2d4468 --- /dev/null +++ b/src/base.rs @@ -0,0 +1,10 @@ +//! # Common Interfaces and methods +//! +//! This module consolidates interfaces and uniform basic API that is used elsewhere in the code. + +use crate::error::Failed; + +/// Implements method predict that offers a way to estimate target value from new data +pub trait Predictor { + fn predict(&self, x: &X) -> Result; +} diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 7229d92..a742d90 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -9,7 +9,7 @@ //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; -//! use smartcore::ensemble::random_forest_classifier::*; +//! use smartcore::ensemble::random_forest_classifier::RandomForestClassifier; //! //! // Iris dataset //! 
let x = DenseMatrix::from_2d_array(&[ @@ -51,6 +51,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -117,6 +118,12 @@ impl Default for RandomForestClassifierParameters { } } +impl> Predictor for RandomForestClassifier { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl RandomForestClassifier { /// Build a forest of trees from the training set. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 36fa096..52b39f9 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -49,6 +49,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -106,6 +107,12 @@ impl PartialEq for RandomForestRegressor { } } +impl> Predictor for RandomForestRegressor { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl RandomForestRegressor { /// Build a forest of trees from the training set. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/lib.rs b/src/lib.rs index 9290c86..a1608c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,7 +63,7 @@ //! let y = vec![2., 2., 2., 3., 3.]; //! //! // Train classifier -//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); //! //! // Predict classes //! 
let y_hat = knn.predict(&x).unwrap(); @@ -71,6 +71,7 @@ /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; +pub(crate) mod base; /// Algorithms for clustering of unlabeled data pub mod cluster; /// Various datasets diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index c768cbf..5b49942 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -274,6 +274,19 @@ pub trait BaseVector: Clone + Debug { /// Copies content of `other` vector. fn copy_from(&mut self, other: &Self); + + /// Take elements from an array. + fn take(&self, index: &[usize]) -> Self { + let n = index.len(); + + let mut result = Self::zeros(n); + + for i in 0..n { + result.set(i, self.get(index[i])); + } + + result + } } /// Generic matrix type. @@ -611,6 +624,32 @@ pub trait BaseMatrix: Clone + Debug { /// Calculates the covariance matrix fn cov(&self) -> Self; + + /// Take elements from an array along an axis. + fn take(&self, index: &[usize], axis: u8) -> Self { + let (n, p) = self.shape(); + + let k = match axis { + 0 => p, + _ => n, + }; + + let mut result = match axis { + 0 => Self::zeros(index.len(), p), + _ => Self::zeros(n, index.len()), + }; + + for i in 0..index.len() { + for j in 0..k { + match axis { + 0 => result.set(i, j, self.get(index[i], j)), + _ => result.set(j, i, self.get(j, index[i])), + }; + } + } + + result + } } /// Generic matrix with additional mixins like various factorization methods. 
@@ -662,6 +701,8 @@ impl<'a, T: RealNumber, M: BaseMatrix> Iterator for RowIter<'a, T, M> { #[cfg(test)] mod tests { + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::linalg::BaseMatrix; use crate::linalg::BaseVector; #[test] @@ -684,4 +725,35 @@ mod tests { assert!((m.var() - 1.25f64).abs() < std::f64::EPSILON); } + + #[test] + fn vec_take() { + let m = vec![1., 2., 3., 4., 5.]; + + assert_eq!(m.take(&vec!(0, 0, 4, 4)), vec![1., 1., 5., 5.]); + } + + #[test] + fn take() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 2.0], + &[3.0, 4.0], + &[5.0, 6.0], + &[7.0, 8.0], + &[9.0, 10.0], + ]); + + let expected_0 = DenseMatrix::from_2d_array(&[&[3.0, 4.0], &[3.0, 4.0], &[7.0, 8.0]]); + + let expected_1 = DenseMatrix::from_2d_array(&[ + &[2.0, 1.0], + &[4.0, 3.0], + &[6.0, 5.0], + &[8.0, 7.0], + &[10.0, 9.0], + ]); + + assert_eq!(m.take(&vec!(1, 1, 3), 0), expected_0); + assert_eq!(m.take(&vec!(1, 0), 1), expected_1); + } } diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 085fd5d..6ed40c8 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -36,7 +36,7 @@ //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. //! ]); //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = lr.predict(&x).unwrap(); //! 
``` use std::iter::Sum; @@ -917,7 +917,7 @@ mod tests { 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]); - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index c01f3c7..b386290 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -58,6 +58,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -66,7 +67,7 @@ use crate::math::num::RealNumber; use crate::linear::lasso_optimizer::InteriorPointOptimizer; /// Elastic net parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct ElasticNetParameters { /// Regularization parameter. pub alpha: T, @@ -108,6 +109,12 @@ impl> PartialEq for ElasticNet { } } +impl> Predictor for ElasticNet { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> ElasticNet { /// Fits elastic net regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 7395bdc..0dab3e5 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -26,6 +26,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -33,7 +34,7 @@ use crate::linear::lasso_optimizer::InteriorPointOptimizer; use crate::math::num::RealNumber; /// Lasso regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LassoParameters { /// Controls the strength of the penalty to the loss function. 
pub alpha: T, @@ -71,6 +72,12 @@ impl> PartialEq for Lasso { } } +impl> Predictor for Lasso { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> Lasso { /// Fits Lasso regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index d01b817..c7bd872 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -64,11 +64,12 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Approach to use for estimation of regression coefficients. QR is more efficient but SVD is more stable. pub enum LinearRegressionSolverName { /// QR decomposition, see [QR](../../linalg/qr/index.html) @@ -78,7 +79,7 @@ pub enum LinearRegressionSolverName { } /// Linear Regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LinearRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: LinearRegressionSolverName, @@ -107,6 +108,12 @@ impl> PartialEq for LinearRegression { } } +impl> Predictor for LinearRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> LinearRegression { /// Fits Linear Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 7b7cab6..b85bbe8 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -40,7 +40,7 @@ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! 
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! ``` @@ -58,6 +58,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -66,6 +67,11 @@ use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult}; use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; +/// Logistic Regression parameters +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct LogisticRegressionParameters { +} + /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] pub struct LogisticRegression> { @@ -97,6 +103,13 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { phantom: PhantomData<&'a T>, } +impl Default for LogisticRegressionParameters { + fn default() -> Self { + LogisticRegressionParameters { + } + } +} + impl> PartialEq for LogisticRegression { fn eq(&self, other: &Self) -> bool { if self.num_classes != other.num_classes @@ -207,11 +220,18 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction } } +impl> Predictor for LogisticRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> LogisticRegression { /// Fits Logistic Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values - pub fn fit(x: &M, y: &M::RowVector) -> Result, Failed> { + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); @@ -461,7 +481,7 @@ mod tests { ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); assert_eq!(lr.coefficients().shape(), (3, 2)); assert_eq!(lr.intercept().shape(), (3, 1)); @@ -484,7 +504,7 @@ mod tests { let x = DenseMatrix::from_vec(15, 4, &blobs.data); let y = blobs.target; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -498,7 +518,7 @@ mod tests { let x = DenseMatrix::from_vec(20, 4, &blobs.data); let y = blobs.target; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -526,7 +546,7 @@ mod tests { ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let deserialized_lr: LogisticRegression> = serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); @@ -562,7 +582,7 @@ mod tests { 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 98bc639..2b5a898 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -63,12 +63,13 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use 
crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable. pub enum RidgeRegressionSolverName { /// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html) @@ -78,7 +79,7 @@ pub enum RidgeRegressionSolverName { } /// Ridge Regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RidgeRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: RidgeRegressionSolverName, @@ -114,6 +115,12 @@ impl> PartialEq for RidgeRegression { } } +impl> Predictor for RidgeRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> RidgeRegression { /// Fits ridge regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index e292f9c..9034727 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -25,7 +25,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space. 
-#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Euclidian {} impl Euclidian { diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index 4028259..129fe16 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -26,7 +26,7 @@ use crate::math::num::RealNumber; use super::Distance; /// While comparing two integer-valued vectors of equal length, Hamming distance is the number of bit positions in which the two bits are different -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Hamming {} impl Distance, F> for Hamming { diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index fd320c3..84aa947 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -52,7 +52,7 @@ use super::Distance; use crate::linalg::Matrix; /// Mahalanobis distance. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Mahalanobis> { /// covariance matrix of the dataset pub sigma: M, diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 66125a5..9a69184 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -24,7 +24,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Manhattan distance -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Manhattan {} impl Distance, T> for Manhattan { diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index b7c5691..c5dd85d 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -28,7 +28,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Defines the Minkowski distance of order `p` -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Minkowski { /// order, integer pub p: u16, diff 
--git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 696b5ff..9bfbd6b 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -28,7 +28,7 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; /// Distance metric, a function that calculates distance between two points -pub trait Distance { +pub trait Distance: Clone { /// Calculates distance between _a_ and _b_ fn distance(&self, a: &T, b: &T) -> F; } diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index f49300d..42b3994 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -42,7 +42,7 @@ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs new file mode 100644 index 0000000..0fbe224 --- /dev/null +++ b/src/model_selection/kfold.rs @@ -0,0 +1,286 @@ +//! # KFold +//! +//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), +//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. +//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. +//! Underfitted is bad because the model is undetrained and does not fit the training data well. +//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for +//! your data. +//! +//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. 
+ +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use rand::seq::SliceRandom; +use rand::thread_rng; + +/// An interface for the K-Folds cross-validator +pub trait BaseKFold { + /// An iterator over indices that split data into training and test set. + type Output: Iterator, Vec)>; + /// Return a tuple containing the the training set indices for that split and + /// the testing set indices for that split. + fn split>(&self, x: &M) -> Self::Output; + /// Returns the number of splits + fn n_splits(&self) -> usize; +} + +/// K-Folds cross-validator +pub struct KFold { + /// Number of folds. Must be at least 2. + pub n_splits: usize, // cannot exceed std::usize::MAX + /// Whether to shuffle the data before splitting into batches + pub shuffle: bool, +} + +impl KFold { + fn test_indices>(&self, x: &M) -> Vec> { + // number of samples (rows) in the matrix + let n_samples: usize = x.shape().0; + + // initialise indices + let mut indices: Vec = (0..n_samples).collect(); + if self.shuffle { + indices.shuffle(&mut thread_rng()); + } + // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. + let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + + // increment by one if odd + for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { + *fold_size += 1; + } + + // generate the right array of arrays for test indices + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + let mut current: usize = 0; + for fold_size in fold_sizes.drain(..) { + let stop = current + fold_size; + return_values.push(indices[current..stop].to_vec()); + current = stop + } + + return_values + } + + fn test_masks>(&self, x: &M) -> Vec> { + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + for test_index in self.test_indices(x).drain(..) 
{ + // init mask + let mut test_mask = vec![false; x.shape().0]; + // set mask's indices to true according to test indices + for i in test_index { + test_mask[i] = true; // can be implemented with map() + } + return_values.push(test_mask); + } + return_values + } +} + +impl Default for KFold { + fn default() -> KFold { + KFold { + n_splits: 3, + shuffle: true, + } + } +} + +impl KFold { + /// Number of folds. Must be at least 2. + pub fn with_n_splits(mut self, n_splits: usize) -> Self { + self.n_splits = n_splits; + self + } + /// Whether to shuffle the data before splitting into batches + pub fn with_shuffle(mut self, shuffle: bool) -> Self { + self.shuffle = shuffle; + self + } +} + +/// An iterator over indices that split data into training and test set. +pub struct BaseKFoldIter { + indices: Vec, + test_indices: Vec>, +} + +impl Iterator for BaseKFoldIter { + type Item = (Vec, Vec); + + fn next(&mut self) -> Option<(Vec, Vec)> { + self.test_indices.pop().map(|test_index| { + let train_index = self + .indices + .iter() + .enumerate() + .filter(|&(idx, _)| !test_index[idx]) + .map(|(idx, _)| idx) + .collect::>(); // filter train indices out according to mask + let test_index = self + .indices + .iter() + .enumerate() + .filter(|&(idx, _)| test_index[idx]) + .map(|(idx, _)| idx) + .collect::>(); // filter tests indices out according to mask + + (train_index, test_index) + }) + } +} + +/// Abstract class for all KFold functionalities +impl BaseKFold for KFold { + type Output = BaseKFoldIter; + + fn n_splits(&self) -> usize { + self.n_splits + } + + fn split>(&self, x: &M) -> Self::Output { + if self.n_splits < 2 { + panic!("Number of splits is too small: {}", self.n_splits); + } + let n_samples: usize = x.shape().0; + let indices: Vec = (0..n_samples).collect(); + let mut test_indices = self.test_masks(x); + test_indices.reverse(); + + BaseKFoldIter { + indices, + test_indices, + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use 
crate::linalg::naive::dense_matrix::*; + + #[test] + fn run_kfold_return_test_indices_simple() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(33, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..11).collect::>()); + assert_eq!(test_indices[1], (11..22).collect::>()); + assert_eq!(test_indices[2], (22..33).collect::>()); + } + + #[test] + fn run_kfold_return_test_indices_odd() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(34, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..12).collect::>()); + assert_eq!(test_indices[1], (12..23).collect::>()); + assert_eq!(test_indices[2], (23..34).collect::>()); + } + + #[test] + fn run_kfold_return_test_mask_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let test_masks = k.test_masks(&x); + + for t in &test_masks[0][0..11] { + // TODO: this can be prob done better + assert_eq!(*t, true) + } + for t in &test_masks[0][11..22] { + assert_eq!(*t, false) + } + + for t in &test_masks[1][0..11] { + assert_eq!(*t, false) + } + for t in &test_masks[1][11..22] { + assert_eq!(*t, true) + } + } + + #[test] + fn run_kfold_return_split_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let train_test_splits: Vec<(Vec, Vec)> = k.split(&x).collect(); + + assert_eq!(train_test_splits[0].1, (0..11).collect::>()); + assert_eq!(train_test_splits[0].0, (11..22).collect::>()); + assert_eq!(train_test_splits[1].0, (0..11).collect::>()); + assert_eq!(train_test_splits[1].1, (11..22).collect::>()); + } + + #[test] + fn run_kfold_return_split_simple_shuffle() { + let k = KFold { + n_splits: 2, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(23, 100); + let train_test_splits: Vec<(Vec, Vec)> = k.split(&x).collect(); + + 
assert_eq!(train_test_splits[0].1.len(), 12_usize); + assert_eq!(train_test_splits[0].0.len(), 11_usize); + assert_eq!(train_test_splits[1].0.len(), 12_usize); + assert_eq!(train_test_splits[1].1.len(), 11_usize); + } + + #[test] + fn numpy_parity_test() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test, expected_test); + assert_eq!(train, expected_train); + } + } + + #[test] + fn numpy_parity_test_shuffle() { + let k = KFold { + n_splits: 3, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test.len(), expected_test.len()); + assert_eq!(train.len(), expected_train.len()); + } + } +} diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index bc0f9b8..64527b3 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -9,21 +9,27 @@ //! //! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +use crate::base::Predictor; +use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; +use crate::model_selection::kfold::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -use rand::Rng; + +pub mod kfold; /// Splits data into 2 disjoint datasets. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. 
/// * `y` - target values, should be of size _M_ /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. +/// * `shuffle`, - whether or not to shuffle the data before splitting pub fn train_test_split>( x: &M, y: &M::RowVector, test_size: f32, + shuffle: bool, ) -> (M, M, M::RowVector, M::RowVector) { if x.shape().0 != y.len() { panic!( @@ -38,155 +44,80 @@ pub fn train_test_split>( } let n = y.len(); - let m = x.shape().1; - let mut rng = rand::thread_rng(); - let mut n_test = 0; - let mut index = vec![false; n]; + let n_test = ((n as f32) * test_size) as usize; - for index_i in index.iter_mut().take(n) { - let p_test: f32 = rng.gen(); - if p_test <= test_size { - *index_i = true; - n_test += 1; - } + if n_test < 1 { + panic!("number of sample is too small {}", n); } - let n_train = n - n_test; + let mut indices: Vec = (0..n).collect(); - let mut x_train = M::zeros(n_train, m); - let mut x_test = M::zeros(n_test, m); - let mut y_train = M::RowVector::zeros(n_train); - let mut y_test = M::RowVector::zeros(n_test); - - let mut r_train = 0; - let mut r_test = 0; - - for (r, index_r) in index.iter().enumerate().take(n) { - if *index_r { - //sample belongs to test - for c in 0..m { - x_test.set(r_test, c, x.get(r, c)); - y_test.set(r_test, y.get(r)); - } - r_test += 1; - } else { - for c in 0..m { - x_train.set(r_train, c, x.get(r, c)); - y_train.set(r_train, y.get(r)); - } - r_train += 1; - } + if shuffle { + indices.shuffle(&mut thread_rng()); } + let x_train = x.take(&indices[n_test..n], 0); + let x_test = x.take(&indices[0..n_test], 0); + let y_train = y.take(&indices[n_test..n]); + let y_test = y.take(&indices[0..n_test]); + (x_train, x_test, y_train, y_test) } -/// -/// KFold Cross-Validation -/// -pub trait BaseKFold { - /// Returns integer indices corresponding to test sets - fn test_indices>(&self, x: &M) -> Vec>; - - /// Returns masksk corresponding to test sets - fn test_masks>(&self, x: &M) -> Vec>; - - /// Return a tuple 
containing the the training set indices for that split and - /// the testing set indices for that split. - fn split>(&self, x: &M) -> Vec<(Vec, Vec)>; +#[derive(Clone, Debug)] +pub struct CrossValidationResult { + pub test_score: Vec, + pub train_score: Vec, } -/// -/// An implementation of KFold -/// -pub struct KFold { - n_splits: usize, // cannot exceed std::usize::MAX - shuffle: bool, - // TODO: to be implemented later - // random_state: i32, -} +impl CrossValidationResult { + pub fn mean_test_score(&self) -> T { + self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() + } -impl Default for KFold { - fn default() -> KFold { - KFold { - n_splits: 3_usize, - shuffle: true, - } + pub fn mean_train_score(&self) -> T { + self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() } } -/// -/// Abstract class for all KFold functionalities -/// -impl BaseKFold for KFold { - fn test_indices>(&self, x: &M) -> Vec> { - // number of samples (rows) in the matrix - let n_samples: usize = x.shape().0; +pub fn cross_validate( + fit_estimator: F, + x: &M, + y: &M::RowVector, + parameters: H, + cv: K, + score: S, +) -> Result, Failed> +where + T: RealNumber, + M: Matrix, + H: Clone, + E: Predictor, + K: BaseKFold, + F: Fn(&M, &M::RowVector, H) -> Result, + S: Fn(&M::RowVector, &M::RowVector) -> T, +{ + let k = cv.n_splits(); + let mut test_score = Vec::with_capacity(k); + let mut train_score = Vec::with_capacity(k); - // initialise indices - let mut indices: Vec = (0..n_samples).collect(); - if self.shuffle { - indices.shuffle(&mut thread_rng()); - } - // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. 
- let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + for (test_idx, train_idx) in cv.split(x) { + let train_x = x.take(&train_idx, 0); + let train_y = y.take(&train_idx); + let test_x = x.take(&test_idx, 0); + let test_y = y.take(&test_idx); - // increment by one if odd - for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { - *fold_size += 1; - } + let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; - // generate the right array of arrays for test indices - let mut return_values: Vec> = Vec::with_capacity(self.n_splits); - let mut current: usize = 0; - for fold_size in fold_sizes.drain(..) { - let stop = current + fold_size; - return_values.push(indices[current..stop].to_vec()); - current = stop - } - - return_values + train_score.push(score(&train_y, &estimator.predict(&train_x)?)); + test_score.push(score(&test_y, &estimator.predict(&test_x)?)); } - fn test_masks>(&self, x: &M) -> Vec> { - let mut return_values: Vec> = Vec::with_capacity(self.n_splits); - for test_index in self.test_indices(x).drain(..) { - // init mask - let mut test_mask = vec![false; x.shape().0]; - // set mask's indices to true according to test indices - for i in test_index { - test_mask[i] = true; // can be implemented with map() - } - return_values.push(test_mask); - } - return_values - } - - fn split>(&self, x: &M) -> Vec<(Vec, Vec)> { - let n_samples: usize = x.shape().0; - let indices: Vec = (0..n_samples).collect(); - - let mut return_values: Vec<(Vec, Vec)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs - - for test_index in self.test_masks(x).drain(..) 
{ - let train_index = indices - .clone() - .iter() - .enumerate() - .filter(|&(idx, _)| !test_index[idx]) - .map(|(idx, _)| idx) - .collect::>(); // filter train indices out according to mask - let test_index = indices - .iter() - .enumerate() - .filter(|&(idx, _)| test_index[idx]) - .map(|(idx, _)| idx) - .collect::>(); // filter tests indices out according to mask - return_values.push((train_index, test_index)) - } - return_values - } + Ok(CrossValidationResult { + test_score, + train_score, + }) } #[cfg(test)] @@ -194,14 +125,17 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + use crate::metrics::{accuracy, mean_absolute_error}; + use crate::model_selection::kfold::KFold; + use crate::neighbors::knn_regressor::KNNRegressor; #[test] fn run_train_test_split() { - let n = 100; - let x: DenseMatrix = DenseMatrix::rand(100, 3); - let y = vec![0f64; 100]; + let n = 123; + let x: DenseMatrix = DenseMatrix::rand(n, 3); + let y = vec![0f64; n]; - let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2); + let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true); assert!( x_train.shape().0 > (n as f64 * 0.65) as usize @@ -215,126 +149,195 @@ mod tests { assert_eq!(x_test.shape().0, y_test.len()); } - #[test] - fn run_kfold_return_test_indices_simple() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(33, 100); - let test_indices = k.test_indices(&x); + #[derive(Clone)] + struct NoParameters {} - assert_eq!(test_indices[0], (0..11).collect::>()); - assert_eq!(test_indices[1], (11..22).collect::>()); - assert_eq!(test_indices[2], (22..33).collect::>()); + #[test] + fn test_cross_validate_biased() { + struct BiasedEstimator {} + + impl BiasedEstimator { + fn fit>( + _: &M, + _: &M::RowVector, + _: NoParameters, + ) -> Result { + Ok(BiasedEstimator {}) + } + } + + impl> Predictor for BiasedEstimator { + fn predict(&self, x: &M) -> Result { + let (n, _) = x.shape(); + 
Ok(M::RowVector::zeros(n)) + } + } + + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let cv = KFold { + n_splits: 5, + ..KFold::default() + }; + + let results = + cross_validate(BiasedEstimator::fit, &x, &y, NoParameters {}, cv, &accuracy).unwrap(); + + assert_eq!(0.4, results.mean_test_score()); + assert_eq!(0.4, results.mean_train_score()); } #[test] - fn run_kfold_return_test_indices_odd() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(34, 100); - let test_indices = k.test_indices(&x); + fn test_cross_validate_knn() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + 
&[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; - assert_eq!(test_indices[0], (0..12).collect::>()); - assert_eq!(test_indices[1], (12..23).collect::>()); - assert_eq!(test_indices[2], (23..34).collect::>()); + let cv = KFold { + n_splits: 5, + ..KFold::default() + }; + + let results = cross_validate( + KNNRegressor::fit, + &x, + &y, + Default::default(), + cv, + &mean_absolute_error, + ) + .unwrap(); + + assert!(results.mean_test_score() < 15.0); + assert!(results.mean_train_score() < results.mean_test_score()); } + use crate::tree::decision_tree_regressor::*; + #[test] - fn run_kfold_return_test_mask_simple() { - let k = KFold { - n_splits: 2, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(22, 100); - let test_masks = k.test_masks(&x); + fn test_some_regressor() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; - for t in &test_masks[0][0..11] { - // TODO: this can be prob 
done better - assert_eq!(*t, true) - } - for t in &test_masks[0][11..22] { - assert_eq!(*t, false) - } + let cv = KFold::default().with_n_splits(2); - for t in &test_masks[1][0..11] { - assert_eq!(*t, false) - } - for t in &test_masks[1][11..22] { - assert_eq!(*t, true) - } + let results = cross_validate( + DecisionTreeRegressor::fit, + &x, + &y, + Default::default(), + cv, + &mean_absolute_error, + ) + .unwrap(); + + println!("{}", results.mean_test_score()); + println!("{}", results.mean_train_score()); } - #[test] - fn run_kfold_return_split_simple() { - let k = KFold { - n_splits: 2, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(22, 100); - let train_test_splits = k.split(&x); - - assert_eq!(train_test_splits[0].1, (0..11).collect::>()); - assert_eq!(train_test_splits[0].0, (11..22).collect::>()); - assert_eq!(train_test_splits[1].0, (0..11).collect::>()); - assert_eq!(train_test_splits[1].1, (11..22).collect::>()); - } + use crate::tree::decision_tree_classifier::*; #[test] - fn run_kfold_return_split_simple_shuffle() { - let k = KFold { + fn test_some_classifier() { + + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let cv = KFold { n_splits: 2, ..KFold::default() }; - let x: DenseMatrix = DenseMatrix::rand(23, 100); - let train_test_splits = k.split(&x); - assert_eq!(train_test_splits[0].1.len(), 12_usize); - 
assert_eq!(train_test_splits[0].0.len(), 11_usize); - assert_eq!(train_test_splits[1].0.len(), 12_usize); - assert_eq!(train_test_splits[1].1.len(), 11_usize); - } + let results = + cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); - #[test] - fn numpy_parity_test() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(10, 4); - let expected: Vec<(Vec, Vec)> = vec![ - (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), - (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), - (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), - ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { - assert_eq!(test, expected_test); - assert_eq!(train, expected_train); - } - } - - #[test] - fn numpy_parity_test_shuffle() { - let k = KFold { - n_splits: 3, - ..KFold::default() - }; - let x: DenseMatrix = DenseMatrix::rand(10, 4); - let expected: Vec<(Vec, Vec)> = vec![ - (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), - (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), - (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), - ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { - assert_eq!(test.len(), expected_test.len()); - assert_eq!(train.len(), expected_train.len()); - } + println!("{}", results.mean_test_score()); + println!("{}", results.mean_train_score()); } } diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index dd34ae9..fe299f3 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -33,6 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. 
D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -200,6 +201,12 @@ pub struct BernoulliNB> { binarize: Option, } +impl> Predictor for BernoulliNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> BernoulliNB { /// Fits BernoulliNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index c4626ef..ce526ce 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -30,6 +30,7 @@ //! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! ``` +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -246,6 +247,12 @@ pub struct CategoricalNB> { inner: BaseNaiveBayes>, } +impl> Predictor for CategoricalNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> CategoricalNB { /// Fits CategoricalNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index c5c1fb2..01dacd7 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -22,6 +22,7 @@ //! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! 
``` +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -181,6 +182,12 @@ pub struct GaussianNB> { inner: BaseNaiveBayes>, } +impl> Predictor for GaussianNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> GaussianNB { /// Fits GaussianNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index c9ac86b..84d3fd1 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -33,6 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -187,6 +188,12 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } +impl> Predictor for MultinomialNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> MultinomialNB { /// Fits MultinomialNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index f940211..8b4db1b 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -25,31 +25,40 @@ //! &[9., 10.]]); //! let y = vec![2., 2., 2., 3., 3.]; //your class labels //! -//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); //! ``` //! //! variable `y_hat` will hold a vector with estimates of class labels //! 
+use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNClassifier` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] -pub struct KNNClassifierParameters { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct KNNClassifierParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub distance: D, /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, + /// this parameter is not used + t: PhantomData, } /// K Nearest Neighbors Classifier @@ -62,12 +71,39 @@ pub struct KNNClassifier, T>> { k: usize, } -impl Default for KNNClassifierParameters { +impl, T>> KNNClassifierParameters { + /// number of training samples to consider when estimating class for new point. Default value is 3. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } + /// a function that defines a distance between each pair of point in training data. 
+ /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance(mut self, distance: D) -> Self { + self.distance = distance; + self + } + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self { + self.weight = weight; + self + } +} + +impl Default for KNNClassifierParameters { fn default() -> Self { KNNClassifierParameters { + distance: Distances::euclidian(), algorithm: KNNAlgorithmName::CoverTree, weight: KNNWeightFunction::Uniform, k: 3, + t: PhantomData, } } } @@ -95,19 +131,23 @@ impl, T>> PartialEq for KNNClassifier { } } +impl, D: Distance, T>> Predictor + for KNNClassifier +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> KNNClassifier { /// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features. /// * `x` - training data - /// * `y` - vector with target values (classes) of length N - /// * `distance` - a function that defines a distance between each pair of point in training data. - /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. - /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. 
+ /// * `y` - vector with target values (classes) of length N /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, - distance: D, - parameters: KNNClassifierParameters, + parameters: KNNClassifierParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -142,7 +182,7 @@ impl, T>> KNNClassifier { classes, y: yi, k: parameters.k, - knn_algorithm: parameters.algorithm.fit(data, distance)?, + knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?, weight: parameters.weight, }) } @@ -187,14 +227,13 @@ impl, T>> KNNClassifier { mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::math::distance::Distances; #[test] fn knn_fit_predict() { let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); assert_eq!(y.to_vec(), y_hat); @@ -207,12 +246,10 @@ mod tests { let knn = KNNClassifier::fit( &x, &y, - Distances::euclidian(), - KNNClassifierParameters { - k: 5, - algorithm: KNNAlgorithmName::LinearSearch, - weight: KNNWeightFunction::Distance, - }, + KNNClassifierParameters::default() + .with_k(5) + .with_algorithm(KNNAlgorithmName::LinearSearch) + .with_weight(KNNWeightFunction::Distance), ) .unwrap(); let y_hat = knn.predict(&DenseMatrix::from_2d_array(&[&[4.1]])).unwrap(); @@ -225,7 +262,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); let deserialized_knn = 
bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index b7c0f2d..a97fdea 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -27,31 +27,41 @@ //! &[5., 5.]]); //! let y = vec![1., 2., 3., 4., 5.]; //your target values //! -//! let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); //! ``` //! //! variable `y_hat` will hold predicted value //! //! +use std::marker::PhantomData; + use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::{row_iter, BaseVector, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNRegressor` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] -pub struct KNNRegressorParameters { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct KNNRegressorParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + distance: D, /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. 
pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, + /// this parameter is not used + t: PhantomData, } /// K Nearest Neighbors Regressor @@ -63,12 +73,39 @@ pub struct KNNRegressor, T>> { k: usize, } -impl Default for KNNRegressorParameters { +impl, T>> KNNRegressorParameters { + /// number of training samples to consider when estimating class for new point. Default value is 3. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance(mut self, distance: D) -> Self { + self.distance = distance; + self + } + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self { + self.weight = weight; + self + } +} + +impl Default for KNNRegressorParameters { fn default() -> Self { KNNRegressorParameters { + distance: Distances::euclidian(), algorithm: KNNAlgorithmName::CoverTree, weight: KNNWeightFunction::Uniform, k: 3, + t: PhantomData, } } } @@ -88,19 +125,23 @@ impl, T>> PartialEq for KNNRegressor { } } +impl, D: Distance, T>> Predictor + for KNNRegressor +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> KNNRegressor { /// Fits KNN regressor to a NxM matrix where N is number of samples and M is number of features. 
/// * `x` - training data - /// * `y` - vector with real values - /// * `distance` - a function that defines a distance between each pair of point in training data. - /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. - /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + /// * `y` - vector with real values /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, - distance: D, - parameters: KNNRegressorParameters, + parameters: KNNRegressorParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -126,7 +167,7 @@ impl, T>> KNNRegressor { Ok(KNNRegressor { y: y.to_vec(), k: parameters.k, - knn_algorithm: parameters.algorithm.fit(data, distance)?, + knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?, weight: parameters.weight, }) } @@ -176,12 +217,11 @@ mod tests { let knn = KNNRegressor::fit( &x, &y, - Distances::euclidian(), - KNNRegressorParameters { - k: 3, - algorithm: KNNAlgorithmName::LinearSearch, - weight: KNNWeightFunction::Distance, - }, + KNNRegressorParameters::default() + .with_k(3) + .with_distance(Distances::euclidian()) + .with_algorithm(KNNAlgorithmName::LinearSearch) + .with_weight(KNNWeightFunction::Distance), ) .unwrap(); let y_hat = knn.predict(&x).unwrap(); @@ -197,7 +237,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y: Vec = vec![1., 2., 3., 4., 5.]; let y_exp = vec![2., 2., 3., 4., 4.]; - let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); for i in 0..y_hat.len() { @@ -211,7 +251,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![1., 2., 3., 4., 5.]; - let knn = 
KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index be1ad4d..85ea6b8 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -48,7 +48,7 @@ pub mod knn_regressor; pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName; /// Weight function that is used to determine estimated value. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub enum KNNWeightFunction { /// All k nearest points are weighted equally Uniform, diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 1f563c1..1e013d2 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -93,16 +93,18 @@ impl Kernels { } /// Linear Kernel -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LinearKernel {} /// Radial basis function (Gaussian) kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RBFKernel { /// kernel coefficient pub gamma: T, } /// Polynomial kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct PolynomialKernel { /// degree of the polynomial pub degree: T, @@ -113,6 +115,7 @@ pub struct PolynomialKernel { } /// Sigmoid (hyperbolic tangent) kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct SigmoidKernel { /// kernel coefficient pub gamma: T, diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9e166d5..cbe97f7 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -57,13 +57,7 @@ //! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0., //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; //! -//! let svr = SVC::fit(&x, &y, -//! Kernels::linear(), -//! SVCParameters { -//! epoch: 2, -//! c: 200.0, -//! tol: 1e-3, -//! }).unwrap(); +//! let svr = SVC::fit(&x, &y, SVCParameters::default().with_c(200.0)).unwrap(); //! 
//! let y_hat = svr.predict(&x).unwrap(); //! ``` @@ -84,22 +78,26 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::svm::Kernel; - -#[derive(Serialize, Deserialize, Debug)] +use crate::svm::{Kernel, Kernels, LinearKernel}; +#[derive(Serialize, Deserialize, Debug, Clone)] /// SVC Parameters -pub struct SVCParameters { - /// Number of epochs +pub struct SVCParameters, K: Kernel> { + /// Number of epochs. pub epoch: usize, /// Regularization parameter. pub c: T, - /// Tolerance for stopping criterion + /// Tolerance for stopping criterion. pub tol: T, + /// The kernel function. + pub kernel: K, + /// Unused parameter. + m: PhantomData, } #[derive(Serialize, Deserialize, Debug)] @@ -136,7 +134,7 @@ struct Cache<'a, T: RealNumber, M: Matrix, K: Kernel> { struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { x: &'a M, y: &'a M::RowVector, - parameters: &'a SVCParameters, + parameters: &'a SVCParameters, svmin: usize, svmax: usize, gmin: T, @@ -147,27 +145,61 @@ struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { recalculate_minmax_grad: bool, } -impl Default for SVCParameters { +impl, K: Kernel> SVCParameters { + /// Number of epochs. + pub fn with_epoch(mut self, epoch: usize) -> Self { + self.epoch = epoch; + self + } + /// Regularization parameter. + pub fn with_c(mut self, c: T) -> Self { + self.c = c; + self + } + /// Tolerance for stopping criterion. + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The kernel function. 
+ pub fn with_kernel>(&self, kernel: KK) -> SVCParameters { + SVCParameters { + epoch: self.epoch, + c: self.c, + tol: self.tol, + kernel: kernel, + m: PhantomData + } + } +} + +impl> Default for SVCParameters { fn default() -> Self { SVCParameters { epoch: 2, c: T::one(), tol: T::from_f64(1e-3).unwrap(), + kernel: Kernels::linear(), + m: PhantomData } } } +impl, K: Kernel> Predictor for SVC { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, K: Kernel> SVC { /// Fits SVC to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - class labels - /// * `kernel` - the kernel function /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. pub fn fit( x: &M, y: &M::RowVector, - kernel: K, - parameters: SVCParameters, + parameters: SVCParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -198,13 +230,13 @@ impl, K: Kernel> SVC { } } - let optimizer = Optimizer::new(x, &y, &kernel, ¶meters); + let optimizer = Optimizer::new(x, &y, ¶meters.kernel, ¶meters); let (support_vectors, weight, b) = optimizer.optimize(); Ok(SVC { classes, - kernel, + kernel: parameters.kernel, instances: support_vectors, w: weight, b, @@ -321,7 +353,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, x: &'a M, y: &'a M::RowVector, kernel: &'a K, - parameters: &'a SVCParameters, + parameters: &'a SVCParameters, ) -> Optimizer<'a, T, M, K> { let (n, _) = x.shape(); @@ -711,17 +743,10 @@ mod tests { let y_hat = SVC::fit( &x, &y, - Kernels::linear(), - SVCParameters { - epoch: 2, - c: 200.0, - tol: 1e-3, - }, + SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), ) .and_then(|lr| lr.predict(&x)) - .unwrap(); - - println!("{:?}", y_hat); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -759,12 +784,7 @@ mod tests { let y_hat = SVC::fit( &x, &y, - Kernels::rbf(0.7), - SVCParameters { - epoch: 2, - c: 1.0, - tol: 1e-3, - }, + 
SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), ) .and_then(|lr| lr.predict(&x)) .unwrap(); @@ -801,7 +821,7 @@ mod tests { -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let svr = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); + let svr = SVC::fit(&x, &y, Default::default()).unwrap(); let deserialized_svr: SVC, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 5d007d7..25c7ff6 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -49,13 +49,7 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let svr = SVR::fit(&x, &y, -//! LinearKernel {}, -//! SVRParameters { -//! eps: 2.0, -//! c: 10.0, -//! tol: 1e-3, -//! }).unwrap(); +//! let svr = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)).unwrap(); //! //! let y_hat = svr.predict(&x).unwrap(); //! ``` @@ -72,25 +66,30 @@ use std::cell::{Ref, RefCell}; use std::fmt::Debug; +use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::svm::Kernel; - -#[derive(Serialize, Deserialize, Debug)] +use crate::svm::{Kernel, Kernels, LinearKernel}; +#[derive(Serialize, Deserialize, Debug, Clone)] /// SVR Parameters -pub struct SVRParameters { - /// Epsilon in the epsilon-SVR model +pub struct SVRParameters, K: Kernel> { + /// Epsilon in the epsilon-SVR model. pub eps: T, /// Regularization parameter. pub c: T, - /// Tolerance for stopping criterion + /// Tolerance for stopping criterion. pub tol: T, + /// The kernel function. + pub kernel: K, + /// Unused parameter. 
+ m: PhantomData, } #[derive(Serialize, Deserialize, Debug)] @@ -135,16 +134,52 @@ struct Cache { data: Vec>>>, } -impl Default for SVRParameters { +impl, K: Kernel> SVRParameters { + /// Epsilon in the epsilon-SVR model. + pub fn with_eps(mut self, eps: T) -> Self { + self.eps = eps; + self + } + /// Regularization parameter. + pub fn with_c(mut self, c: T) -> Self { + self.c = c; + self + } + /// Tolerance for stopping criterion. + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The kernel function. + pub fn with_kernel>(&self, kernel: KK) -> SVRParameters { + SVRParameters { + eps: self.eps, + c: self.c, + tol: self.tol, + kernel: kernel, + m: PhantomData + } + } +} + +impl> Default for SVRParameters { fn default() -> Self { SVRParameters { eps: T::from_f64(0.1).unwrap(), c: T::one(), tol: T::from_f64(1e-3).unwrap(), + kernel: Kernels::linear(), + m: PhantomData } } } +impl, K: Kernel> Predictor for SVR { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, K: Kernel> SVR { /// Fits SVR to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. @@ -153,9 +188,8 @@ impl, K: Kernel> SVR { /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. 
pub fn fit( x: &M, - y: &M::RowVector, - kernel: K, - parameters: SVRParameters, + y: &M::RowVector, + parameters: SVRParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -165,12 +199,12 @@ impl, K: Kernel> SVR { )); } - let optimizer = Optimizer::new(x, y, &kernel, ¶meters); + let optimizer = Optimizer::new(x, y, ¶meters.kernel, ¶meters); let (support_vectors, weight, b) = optimizer.smo(); Ok(SVR { - kernel, + kernel: parameters.kernel, instances: support_vectors, w: weight, b, @@ -243,7 +277,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, x: &M, y: &M::RowVector, kernel: &'a K, - parameters: &SVRParameters, + parameters: &SVRParameters, ) -> Optimizer<'a, T, M, K> { let (n, _) = x.shape(); @@ -513,12 +547,7 @@ mod tests { let y_hat = SVR::fit( &x, &y, - LinearKernel {}, - SVRParameters { - eps: 2.0, - c: 10.0, - tol: 1e-3, - }, + SVRParameters::default().with_eps(2.0).with_c(10.0), ) .and_then(|lr| lr.predict(&x)) .unwrap(); @@ -552,7 +581,7 @@ mod tests { 114.2, 115.7, 116.9, ]; - let svr = SVR::fit(&x, &y, LinearKernel {}, Default::default()).unwrap(); + let svr = SVR::fit(&x, &y, Default::default()).unwrap(); let deserialized_svr: SVR, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 371bc4e..1845d5e 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -71,11 +71,12 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Parameters of Decision Tree pub struct DecisionTreeClassifierParameters { /// Split criteria to use when building a tree. 
@@ -269,6 +270,12 @@ pub(in crate) fn which_max(x: &[usize]) -> usize { which } +impl> Predictor for DecisionTreeClassifier { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl DecisionTreeClassifier { /// Build a decision tree classifier from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 5e80b4c..492f0a1 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -66,11 +66,12 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Parameters of Regression Tree pub struct DecisionTreeRegressorParameters { /// The maximum depth of the tree. @@ -189,6 +190,12 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } +impl> Predictor for DecisionTreeRegressor { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl DecisionTreeRegressor { /// Build a decision tree regressor from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. 
From 9b221979da51f9a26c693f5f5300599939416df6 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 16:35:28 -0800 Subject: [PATCH 21/78] fix: clippy, documentation and formatting --- src/linalg/mod.rs | 10 +++---- src/linear/logistic_regression.rs | 12 +++++---- src/model_selection/kfold.rs | 29 +++++---------------- src/model_selection/mod.rs | 43 ++++++++++++++++++++++++++----- src/naive_bayes/multinomial.rs | 2 +- src/svm/svc.rs | 20 +++++++++----- src/svm/svr.rs | 26 +++++++++---------- 7 files changed, 80 insertions(+), 62 deletions(-) diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 5b49942..264815b 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -281,8 +281,8 @@ pub trait BaseVector: Clone + Debug { let mut result = Self::zeros(n); - for i in 0..n { - result.set(i, self.get(index[i])); + for (i, idx) in index.iter().enumerate() { + result.set(i, self.get(*idx)); } result @@ -639,11 +639,11 @@ pub trait BaseMatrix: Clone + Debug { _ => Self::zeros(n, index.len()), }; - for i in 0..index.len() { + for (i, idx) in index.iter().enumerate() { for j in 0..k { match axis { - 0 => result.set(i, j, self.get(index[i], j)), - _ => result.set(j, i, self.get(j, index[i])), + 0 => result.set(i, j, self.get(*idx, j)), + _ => result.set(j, i, self.get(j, *idx)), }; } } diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index b85bbe8..ffb845c 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -69,8 +69,7 @@ use crate::optimization::FunctionOrder; /// Logistic Regression parameters #[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LogisticRegressionParameters { -} +pub struct LogisticRegressionParameters {} /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] @@ -105,8 +104,7 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { impl Default for LogisticRegressionParameters { fn default() -> Self { - LogisticRegressionParameters { - } + 
LogisticRegressionParameters {} } } @@ -231,7 +229,11 @@ impl> LogisticRegression { /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. - pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result, Failed> { + pub fn fit( + x: &M, + y: &M::RowVector, + _parameters: LogisticRegressionParameters, + ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs index 0fbe224..63827c4 100644 --- a/src/model_selection/kfold.rs +++ b/src/model_selection/kfold.rs @@ -1,30 +1,13 @@ //! # KFold //! -//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), -//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. -//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. -//! Underfitted is bad because the model is undetrained and does not fit the training data well. -//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for -//! your data. -//! -//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +//! Defines k-fold cross validator. use crate::linalg::Matrix; use crate::math::num::RealNumber; +use crate::model_selection::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -/// An interface for the K-Folds cross-validator -pub trait BaseKFold { - /// An iterator over indices that split data into training and test set. 
- type Output: Iterator, Vec)>; - /// Return a tuple containing the the training set indices for that split and - /// the testing set indices for that split. - fn split>(&self, x: &M) -> Self::Output; - /// Returns the number of splits - fn n_splits(&self) -> usize; -} - /// K-Folds cross-validator pub struct KFold { /// Number of folds. Must be at least 2. @@ -101,12 +84,12 @@ impl KFold { } /// An iterator over indices that split data into training and test set. -pub struct BaseKFoldIter { +pub struct KFoldIter { indices: Vec, test_indices: Vec>, } -impl Iterator for BaseKFoldIter { +impl Iterator for KFoldIter { type Item = (Vec, Vec); fn next(&mut self) -> Option<(Vec, Vec)> { @@ -133,7 +116,7 @@ impl Iterator for BaseKFoldIter { /// Abstract class for all KFold functionalities impl BaseKFold for KFold { - type Output = BaseKFoldIter; + type Output = KFoldIter; fn n_splits(&self) -> usize { self.n_splits @@ -148,7 +131,7 @@ impl BaseKFold for KFold { let mut test_indices = self.test_masks(x); test_indices.reverse(); - BaseKFoldIter { + KFoldIter { indices, test_indices, } diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 64527b3..0aabb97 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -14,15 +14,27 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::model_selection::kfold::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -pub mod kfold; +pub(crate) mod kfold; + +pub use kfold::{KFold, KFoldIter}; + +/// An interface for the K-Folds cross-validator +pub trait BaseKFold { + /// An iterator over indices that split data into training and test set. + type Output: Iterator, Vec)>; + /// Return a tuple containing the the training set indices for that split and + /// the testing set indices for that split. 
+ fn split>(&self, x: &M) -> Self::Output; + /// Returns the number of splits + fn n_splits(&self) -> usize; +} /// Splits data into 2 disjoint datasets. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. -/// * `y` - target values, should be of size _M_ +/// * `y` - target values, should be of size _N_ /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. /// * `shuffle`, - whether or not to shuffle the data before splitting pub fn train_test_split>( @@ -65,22 +77,33 @@ pub fn train_test_split>( (x_train, x_test, y_train, y_test) } +/// Cross validation results. #[derive(Clone, Debug)] pub struct CrossValidationResult { + /// Vector with test scores on each cv split pub test_score: Vec, + /// Vector with training scores on each cv split pub train_score: Vec, } impl CrossValidationResult { + /// Average test score pub fn mean_test_score(&self) -> T { self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() } - + /// Average training score pub fn mean_train_score(&self) -> T { self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() } } +/// Evaluate an estimator by cross-validation using given metric. +/// * `fit_estimator` - a `fit` function of an estimator +/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. +/// * `y` - target values, should be of size _N_ +/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters. 
+/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html) +/// * `score` - a metric to use for evaluation, see [metrics](../metrics/index.html) pub fn cross_validate( fit_estimator: F, x: &M, @@ -302,7 +325,6 @@ mod tests { #[test] fn test_some_classifier() { - let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2], @@ -334,8 +356,15 @@ mod tests { ..KFold::default() }; - let results = - cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); + let results = cross_validate( + DecisionTreeClassifier::fit, + &x, + &y, + Default::default(), + cv, + &accuracy, + ) + .unwrap(); println!("{}", results.mean_test_score()); println!("{}", results.mean_train_score()); diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 84d3fd1..849b8db 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -188,7 +188,7 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } -impl> Predictor for MultinomialNB { +impl> Predictor for MultinomialNB { fn predict(&self, x: &M) -> Result { self.predict(x) } diff --git a/src/svm/svc.rs b/src/svm/svc.rs index cbe97f7..aee4d3f 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -167,8 +167,8 @@ impl, K: Kernel> SVCParameters> Default for SVCParameters c: T::one(), tol: T::from_f64(1e-3).unwrap(), kernel: Kernels::linear(), - m: PhantomData + m: PhantomData, } } } -impl, K: Kernel> Predictor for SVC { +impl, K: Kernel> Predictor + for SVC +{ fn predict(&self, x: &M) -> Result { self.predict(x) } @@ -743,10 +745,12 @@ mod tests { let y_hat = SVC::fit( &x, &y, - SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), + SVCParameters::default() + .with_c(200.0) + .with_kernel(Kernels::linear()), ) .and_then(|lr| lr.predict(&x)) - .unwrap(); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -784,7 +788,9 @@ mod tests { let y_hat = SVC::fit( &x, &y, 
- SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), + SVCParameters::default() + .with_c(1.0) + .with_kernel(Kernels::rbf(0.7)), ) .and_then(|lr| lr.predict(&x)) .unwrap(); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 25c7ff6..295ad78 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -134,7 +134,7 @@ struct Cache { data: Vec>>>, } -impl, K: Kernel> SVRParameters { +impl, K: Kernel> SVRParameters { /// Epsilon in the epsilon-SVR model. pub fn with_eps(mut self, eps: T) -> Self { self.eps = eps; @@ -153,11 +153,11 @@ impl, K: Kernel> SVRParameters>(&self, kernel: KK) -> SVRParameters { SVRParameters { - eps: self.eps, + eps: self.eps, c: self.c, tol: self.tol, - kernel: kernel, - m: PhantomData + kernel, + m: PhantomData, } } } @@ -169,12 +169,14 @@ impl> Default for SVRParameters c: T::one(), tol: T::from_f64(1e-3).unwrap(), kernel: Kernels::linear(), - m: PhantomData + m: PhantomData, } } } -impl, K: Kernel> Predictor for SVR { +impl, K: Kernel> Predictor + for SVR +{ fn predict(&self, x: &M) -> Result { self.predict(x) } @@ -188,7 +190,7 @@ impl, K: Kernel> SVR { /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. 
pub fn fit( x: &M, - y: &M::RowVector, + y: &M::RowVector, parameters: SVRParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -544,13 +546,9 @@ mod tests { 114.2, 115.7, 116.9, ]; - let y_hat = SVR::fit( - &x, - &y, - SVRParameters::default().with_eps(2.0).with_c(10.0), - ) - .and_then(|lr| lr.predict(&x)) - .unwrap(); + let y_hat = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)) + .and_then(|lr| lr.predict(&x)) + .unwrap(); assert!(mean_squared_error(&y_hat, &y) < 2.5); } From f685f575e068080b64d660ebe34261f3556ffee7 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 17:42:18 -0800 Subject: [PATCH 22/78] feat: + cross_val_predict --- src/model_selection/mod.rs | 105 +++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 0aabb97..7178da8 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -125,7 +125,7 @@ where let mut test_score = Vec::with_capacity(k); let mut train_score = Vec::with_capacity(k); - for (test_idx, train_idx) in cv.split(x) { + for (train_idx, test_idx) in cv.split(x) { let train_x = x.take(&train_idx, 0); let train_y = y.take(&train_idx); let test_x = x.take(&test_idx, 0); @@ -143,6 +143,46 @@ where }) } +/// Generate cross-validated estimates for each input data point. +/// The data is split according to the cv parameter. Each sample belongs to exactly one test set, and its prediction is computed with an estimator fitted on the corresponding training set. +/// * `fit_estimator` - a `fit` function of an estimator +/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. +/// * `y` - target values, should be of size _N_ +/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters. 
+/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html) +pub fn cross_val_predict( + fit_estimator: F, + x: &M, + y: &M::RowVector, + parameters: H, + cv: K +) -> Result +where + T: RealNumber, + M: Matrix, + H: Clone, + E: Predictor, + K: BaseKFold, + F: Fn(&M, &M::RowVector, H) -> Result +{ + let mut y_hat = M::RowVector::zeros(y.len()); + + for (train_idx, test_idx) in cv.split(x) { + let train_x = x.take(&train_idx, 0); + let train_y = y.take(&train_idx); + let test_x = x.take(&test_idx, 0); + + let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; + + let y_test_hat = estimator.predict(&test_x)?; + for (i, &idx) in test_idx.iter().enumerate() { + y_hat.set(idx, y_test_hat.get(i)); + } + } + + Ok(y_hat) +} + #[cfg(test)] mod tests { @@ -278,10 +318,8 @@ mod tests { assert!(results.mean_train_score() < results.mean_test_score()); } - use crate::tree::decision_tree_regressor::*; - #[test] - fn test_some_regressor() { + fn test_cross_val_predict_knn() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159., 107.608, 1947., 60.323], &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], @@ -305,68 +343,21 @@ mod tests { 114.2, 115.7, 116.9, ]; - let cv = KFold::default().with_n_splits(2); - - let results = cross_validate( - DecisionTreeRegressor::fit, - &x, - &y, - Default::default(), - cv, - &mean_absolute_error, - ) - .unwrap(); - - println!("{}", results.mean_test_score()); - println!("{}", results.mean_train_score()); - } - - use crate::tree::decision_tree_classifier::*; - - #[test] - fn test_some_classifier() { - let x = DenseMatrix::from_2d_array(&[ - &[5.1, 3.5, 1.4, 0.2], - &[4.9, 3.0, 1.4, 0.2], - &[4.7, 3.2, 1.3, 0.2], - &[4.6, 3.1, 1.5, 0.2], - &[5.0, 3.6, 1.4, 0.2], - &[5.4, 3.9, 1.7, 0.4], - &[4.6, 3.4, 1.4, 0.3], - &[5.0, 3.4, 1.5, 0.2], - &[4.4, 2.9, 1.4, 0.2], - &[4.9, 3.1, 1.5, 0.1], - &[7.0, 3.2, 4.7, 1.4], - &[6.4, 3.2, 4.5, 1.5], - &[6.9, 3.1, 4.9, 1.5], - &[5.5, 
2.3, 4.0, 1.3], - &[6.5, 2.8, 4.6, 1.5], - &[5.7, 2.8, 4.5, 1.3], - &[6.3, 3.3, 4.7, 1.6], - &[4.9, 2.4, 3.3, 1.0], - &[6.6, 2.9, 4.6, 1.3], - &[5.2, 2.7, 3.9, 1.4], - ]); - let y = vec![ - 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - ]; - let cv = KFold { n_splits: 2, ..KFold::default() }; - let results = cross_validate( - DecisionTreeClassifier::fit, + let y_hat = cross_val_predict( + KNNRegressor::fit, &x, &y, Default::default(), - cv, - &accuracy, + cv ) - .unwrap(); + .unwrap(); - println!("{}", results.mean_test_score()); - println!("{}", results.mean_train_score()); + assert!(mean_absolute_error(&y, &y_hat) < 10.0); } + } From 74f0d9e6fb574196cd84bc7d82169ad8a96cb910 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 17:44:44 -0800 Subject: [PATCH 23/78] fix: formatting --- src/model_selection/mod.rs | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 7178da8..7776354 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -155,7 +155,7 @@ pub fn cross_val_predict( x: &M, y: &M::RowVector, parameters: H, - cv: K + cv: K, ) -> Result where T: RealNumber, @@ -163,14 +163,14 @@ where H: Clone, E: Predictor, K: BaseKFold, - F: Fn(&M, &M::RowVector, H) -> Result -{ - let mut y_hat = M::RowVector::zeros(y.len()); - + F: Fn(&M, &M::RowVector, H) -> Result, +{ + let mut y_hat = M::RowVector::zeros(y.len()); + for (train_idx, test_idx) in cv.split(x) { let train_x = x.take(&train_idx, 0); let train_y = y.take(&train_idx); - let test_x = x.take(&test_idx, 0); + let test_x = x.take(&test_idx, 0); let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; @@ -348,16 +348,8 @@ mod tests { ..KFold::default() }; - let y_hat = cross_val_predict( - KNNRegressor::fit, - &x, - &y, - Default::default(), - cv - ) - .unwrap(); + let y_hat = cross_val_predict(KNNRegressor::fit, &x, &y, 
Default::default(), cv).unwrap(); assert!(mean_absolute_error(&y, &y_hat) < 10.0); } - } From dd341f4a12a8638f2f5538bc2fa68b5d2ca779de Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 23 Dec 2020 12:29:39 -0800 Subject: [PATCH 24/78] feat: + builders for algorithm parameters --- src/cluster/dbscan.rs | 22 ++++++++++++++-- src/cluster/kmeans.rs | 8 ++++++ src/decomposition/pca.rs | 9 +++++++ src/ensemble/random_forest_classifier.rs | 33 ++++++++++++++++++++++++ src/ensemble/random_forest_regressor.rs | 28 ++++++++++++++++++++ src/linear/elastic_net.rs | 30 +++++++++++++++++++++ src/linear/lasso.rs | 23 +++++++++++++++++ src/linear/linear_regression.rs | 8 ++++++ src/linear/ridge_regression.rs | 18 +++++++++++++ src/naive_bayes/bernoulli.rs | 15 +++++++++++ src/naive_bayes/categorical.rs | 6 +++++ src/naive_bayes/gaussian.rs | 5 ++++ src/naive_bayes/multinomial.rs | 10 +++++++ src/neighbors/knn_classifier.rs | 14 +++++++--- src/neighbors/knn_regressor.rs | 14 +++++++--- src/tree/decision_tree_classifier.rs | 23 +++++++++++++++++ src/tree/decision_tree_regressor.rs | 18 +++++++++++++ 17 files changed, 276 insertions(+), 8 deletions(-) diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index e595028..ac095f6 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -53,14 +53,32 @@ pub struct DBSCAN, T>> { #[derive(Debug, Clone)] /// DBSCAN clustering algorithm parameters pub struct DBSCANParameters { - /// Maximum number of iterations of the k-means algorithm for a single run. + /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. pub min_samples: usize, - /// The number of samples in a neighborhood for a point to be considered as a core point. + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. pub eps: T, /// KNN algorithm to use. 
pub algorithm: KNNAlgorithmName, } +impl DBSCANParameters { + /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. + pub fn with_min_samples(mut self, min_samples: usize) -> Self { + self.min_samples = min_samples; + self + } + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. + pub fn with_eps(mut self, eps: T) -> Self { + self.eps = eps; + self + } + /// KNN algorithm to use. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } +} + impl, T>> PartialEq for DBSCAN { fn eq(&self, other: &Self) -> bool { self.cluster_labels.len() == other.cluster_labels.len() diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 26a4038..bc5d673 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -105,6 +105,14 @@ pub struct KMeansParameters { pub max_iter: usize, } +impl KMeansParameters { + /// Maximum number of iterations of the k-means algorithm for a single run. + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for KMeansParameters { fn default() -> Self { KMeansParameters { max_iter: 100 } diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 7d80f88..68220e3 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -88,6 +88,15 @@ pub struct PCAParameters { pub use_correlation_matrix: bool, } +impl PCAParameters { + /// By default, covariance matrix is used to compute principal components. + /// Enable this flag if you want to use correlation matrix instead. 
+ pub fn with_use_correlation_matrix(mut self, use_correlation_matrix: bool) -> Self { + self.use_correlation_matrix = use_correlation_matrix; + self + } +} + impl Default for PCAParameters { fn default() -> Self { PCAParameters { diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index a742d90..9f1ba72 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -85,6 +85,39 @@ pub struct RandomForestClassifier { classes: Vec, } +impl RandomForestClassifierParameters { + /// Split criteria to use when building a tree. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self { + self.criterion = criterion; + self + } + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } + /// The number of trees in the forest. + pub fn with_n_trees(mut self, n_trees: u16) -> Self { + self.n_trees = n_trees; + self + } + /// Number of random sample of predictors to use as split candidates. 
+ pub fn with_m(mut self, m: usize) -> Self { + self.m = Some(m); + self + } +} + impl PartialEq for RandomForestClassifier { fn eq(&self, other: &Self) -> bool { if self.classes.len() != other.classes.len() || self.trees.len() != other.trees.len() { diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 52b39f9..6aa89d0 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -80,6 +80,34 @@ pub struct RandomForestRegressor { trees: Vec>, } +impl RandomForestRegressorParameters { + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } + /// The number of trees in the forest. + pub fn with_n_trees(mut self, n_trees: usize) -> Self { + self.n_trees = n_trees; + self + } + /// Number of random sample of predictors to use as split candidates. 
+ pub fn with_m(mut self, m: usize) -> Self { + self.m = Some(m); + self + } +} + impl Default for RandomForestRegressorParameters { fn default() -> Self { RandomForestRegressorParameters { diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index b386290..1ab933a 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -90,6 +90,36 @@ pub struct ElasticNet> { intercept: T, } +impl ElasticNetParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// The elastic net mixing parameter, with 0 <= l1_ratio <= 1. + /// For l1_ratio = 0 the penalty is an L2 penalty. + /// For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + pub fn with_l1_ratio(mut self, l1_ratio: T) -> Self { + self.l1_ratio = l1_ratio; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. + pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } + /// The tolerance for the optimization + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The maximum number of iterations + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for ElasticNetParameters { fn default() -> Self { ElasticNetParameters { diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 0dab3e5..e16a316 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -54,6 +54,29 @@ pub struct Lasso> { intercept: T, } +impl LassoParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. 
+ pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } + /// The tolerance for the optimization + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The maximum number of iterations + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for LassoParameters { fn default() -> Self { LassoParameters { diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index c7bd872..0ebad34 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -93,6 +93,14 @@ pub struct LinearRegression> { solver: LinearRegressionSolverName, } +impl LinearRegressionParameters { + /// Solver to use for estimation of regression coefficients. + pub fn with_solver(mut self, solver: LinearRegressionSolverName) -> Self { + self.solver = solver; + self + } +} + impl Default for LinearRegressionParameters { fn default() -> Self { LinearRegressionParameters { diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 2b5a898..5c14313 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -98,6 +98,24 @@ pub struct RidgeRegression> { solver: RidgeRegressionSolverName, } +impl RidgeRegressionParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Solver to use for estimation of regression coefficients. + pub fn with_solver(mut self, solver: RidgeRegressionSolverName) -> Self { + self.solver = solver; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. 
+ pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } +} + impl Default for RidgeRegressionParameters { fn default() -> Self { RidgeRegressionParameters { diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index fe299f3..db98efc 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -96,6 +96,21 @@ impl BernoulliNBParameters { binarize, } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } + /// Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors. + pub fn with_binarize(mut self, binarize: T) -> Self { + self.binarize = Some(binarize); + self + } } impl Default for BernoulliNBParameters { diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index ce526ce..ea81eb5 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -234,7 +234,13 @@ impl CategoricalNBParameters { ))) } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } } + impl Default for CategoricalNBParameters { fn default() -> Self { Self { alpha: T::one() } diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 01dacd7..f1fc812 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -86,6 +86,11 @@ impl GaussianNBParameters { pub fn new(priors: Option>) -> Self { Self { priors } } + /// Prior probabilities of the classes. 
If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } } impl GaussianNBDistribution { diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 849b8db..50d2ee2 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -86,6 +86,16 @@ impl MultinomialNBParameters { pub fn new(alpha: T, priors: Option>) -> Self { Self { alpha, priors } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } } impl Default for MultinomialNBParameters { diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 8b4db1b..6668539 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -80,9 +80,17 @@ impl, T>> KNNClassifierParameters { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. - pub fn with_distance(mut self, distance: D) -> Self { - self.distance = distance; - self + pub fn with_distance, T>>( + self, + distance: DD, + ) -> KNNClassifierParameters { + KNNClassifierParameters { + distance, + algorithm: self.algorithm, + weight: self.weight, + k: self.k, + t: PhantomData, + } } /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. 
pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index a97fdea..80971e5 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -82,9 +82,17 @@ impl, T>> KNNRegressorParameters { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. - pub fn with_distance(mut self, distance: D) -> Self { - self.distance = distance; - self + pub fn with_distance, T>>( + self, + distance: DD, + ) -> KNNRegressorParameters { + KNNRegressorParameters { + distance, + algorithm: self.algorithm, + weight: self.weight, + k: self.k, + t: PhantomData, + } } /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 1845d5e..50a855b 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -161,6 +161,29 @@ impl PartialEq for Node { } } +impl DecisionTreeClassifierParameters { + /// Split criteria to use when building a tree. + pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self { + self.criterion = criterion; + self + } + /// The maximum depth of the tree. + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. 
+ pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } +} + impl Default for DecisionTreeClassifierParameters { fn default() -> Self { DecisionTreeClassifierParameters { diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 492f0a1..806e680 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -101,6 +101,24 @@ struct Node { false_child: Option, } +impl DecisionTreeRegressorParameters { + /// The maximum depth of the tree. + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } +} + impl Default for DecisionTreeRegressorParameters { fn default() -> Self { DecisionTreeRegressorParameters { From 32ae63a577b3a84bcca2dc7472f830b00290f085 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 23 Dec 2020 12:38:10 -0800 Subject: [PATCH 25/78] feat: documentation adjusted to new builder --- src/cluster/dbscan.rs | 8 +++----- src/linear/linear_regression.rs | 6 +++--- src/linear/ridge_regression.rs | 7 ++----- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index ac095f6..c572ccc 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -15,11 +15,9 @@ //! let blobs = generator::make_blobs(100, 2, 3); //! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data); //! // Fit the algorithm and predict cluster labels -//! 
let labels = DBSCAN::fit(&x, Distances::euclidian(), DBSCANParameters{ -//! min_samples: 5, -//! eps: 3.0, -//! algorithm: KNNAlgorithmName::CoverTree -//! }).and_then(|dbscan| dbscan.predict(&x)); +//! let labels = DBSCAN::fit(&x, Distances::euclidian(), +//! DBSCANParameters::default().with_eps(3.0)). +//! and_then(|dbscan| dbscan.predict(&x)); //! //! println!("{:?}", labels); //! ``` diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 0ebad34..1855673 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -45,9 +45,9 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let lr = LinearRegression::fit(&x, &y, LinearRegressionParameters { -//! solver: LinearRegressionSolverName::QR, // or SVD -//! }).unwrap(); +//! let lr = LinearRegression::fit(&x, &y, +//! LinearRegressionParameters::default(). +//! with_solver(LinearRegressionSolverName::QR)).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! ``` diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 5c14313..f29898d 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -45,11 +45,8 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters { -//! solver: RidgeRegressionSolverName::Cholesky, -//! alpha: 0.1, -//! normalize: true -//! }).and_then(|lr| lr.predict(&x)).unwrap(); +//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters::default().with_alpha(0.1)). +//! and_then(|lr| lr.predict(&x)).unwrap(); //! ``` //! //! 
## References: From d22be7d6ae44c1fddc412fde9ca434070ae890b5 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 24 Dec 2020 13:47:09 -0800 Subject: [PATCH 26/78] fix: post-review changes --- src/naive_bayes/bernoulli.rs | 8 -------- src/naive_bayes/categorical.rs | 11 ----------- src/naive_bayes/gaussian.rs | 6 +----- src/naive_bayes/multinomial.rs | 4 ---- 4 files changed, 1 insertion(+), 28 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index db98efc..c6cbfa8 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -88,14 +88,6 @@ pub struct BernoulliNBParameters { } impl BernoulliNBParameters { - /// Create BernoulliNBParameters with specific paramaters. - pub fn new(alpha: T, priors: Option>, binarize: Option) -> Self { - Self { - alpha, - priors, - binarize, - } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha; diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index ea81eb5..667a270 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -223,17 +223,6 @@ pub struct CategoricalNBParameters { } impl CategoricalNBParameters { - /// Create CategoricalNBParameters with specific paramaters. - pub fn new(alpha: T) -> Result { - if alpha > T::zero() { - Ok(Self { alpha }) - } else { - Err(Failed::fit(&format!( - "alpha should be >= 0, alpha=[{}]", - alpha - ))) - } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha; diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index f1fc812..bc96420 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -82,10 +82,6 @@ pub struct GaussianNBParameters { } impl GaussianNBParameters { - /// Create GaussianNBParameters with specific paramaters. 
- pub fn new(priors: Option>) -> Self { - Self { priors } - } /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data pub fn with_priors(mut self, priors: Vec) -> Self { self.priors = Some(priors); @@ -266,7 +262,7 @@ mod tests { let y = vec![1., 1., 1., 2., 2., 2.]; let priors = vec![0.3, 0.7]; - let parameters = GaussianNBParameters::new(Some(priors.clone())); + let parameters = GaussianNBParameters::default().with_priors(priors.clone()); let gnb = GaussianNB::fit(&x, &y, parameters).unwrap(); assert_eq!(gnb.inner.distribution.class_priors, priors); diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 50d2ee2..237b606 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -82,10 +82,6 @@ pub struct MultinomialNBParameters { } impl MultinomialNBParameters { - /// Create MultinomialNBParameters with specific paramaters. - pub fn new(alpha: T, priors: Option>) -> Self { - Self { alpha, priors } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). 
pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha; From 810a5c429b9df1aa383e1eaf607f7c4c1e0b7a3f Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 24 Dec 2020 18:36:23 -0800 Subject: [PATCH 27/78] feat: consolidates API --- src/api.rs | 43 +++++++++++++++ src/base.rs | 10 ---- src/cluster/dbscan.rs | 66 ++++++++++++++++-------- src/cluster/kmeans.rs | 65 +++++++++++++++-------- src/decomposition/pca.rs | 52 ++++++++++++------- src/decomposition/svd.rs | 40 +++++++++++--- src/ensemble/random_forest_classifier.rs | 15 +++++- src/ensemble/random_forest_regressor.rs | 15 +++++- src/lib.rs | 2 +- src/linear/elastic_net.rs | 10 +++- src/linear/lasso.rs | 10 +++- src/linear/linear_regression.rs | 14 ++++- src/linear/logistic_regression.rs | 14 ++++- src/linear/ridge_regression.rs | 14 ++++- src/model_selection/mod.rs | 2 +- src/naive_bayes/bernoulli.rs | 10 +++- src/naive_bayes/categorical.rs | 14 ++++- src/naive_bayes/gaussian.rs | 10 +++- src/naive_bayes/multinomial.rs | 14 ++++- src/neighbors/knn_classifier.rs | 14 ++++- src/neighbors/knn_regressor.rs | 14 ++++- src/svm/svc.rs | 10 +++- src/svm/svr.rs | 10 +++- src/tree/decision_tree_classifier.rs | 15 +++++- src/tree/decision_tree_regressor.rs | 15 +++++- 25 files changed, 400 insertions(+), 98 deletions(-) create mode 100644 src/api.rs delete mode 100644 src/base.rs diff --git a/src/api.rs b/src/api.rs new file mode 100644 index 0000000..c598e12 --- /dev/null +++ b/src/api.rs @@ -0,0 +1,43 @@ +//! # Common Interfaces and API +//! +//! This module provides interfaces and uniform API with simple conventions +//! that are used in other modules for supervised and unsupervised learning. + +use crate::error::Failed; + +/// An estimator for unsupervised learning, that provides method `fit` to learn from data +pub trait UnsupervisedEstimator { + /// Fit a model to a training dataset, estimate model's parameters. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. 
+ /// * `parameters` - hyperparameters of an algorithm + fn fit(x: &X, parameters: P) -> Result + where + Self: Sized, + P: Clone; +} + +/// An estimator for supervised learning, , that provides method `fit` to learn from data and training values +pub trait SupervisedEstimator { + /// Fit a model to a training dataset, estimate model's parameters. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target training values of size _N_. + /// * `parameters` - hyperparameters of an algorithm + fn fit(x: &X, y: &Y, parameters: P) -> Result + where + Self: Sized, + P: Clone; +} + +/// Implements method predict that estimates target value from new data +pub trait Predictor { + /// Estimate target values from new data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + fn predict(&self, x: &X) -> Result; +} + +/// Implements method transform that filters or modifies input data +pub trait Transformer { + /// Transform data by modifying or filtering it + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + fn transform(&self, x: &X) -> Result; +} diff --git a/src/base.rs b/src/base.rs deleted file mode 100644 index a2d4468..0000000 --- a/src/base.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! # Common Interfaces and methods -//! -//! This module consolidates interfaces and uniform basic API that is used elsewhere in the code. - -use crate::error::Failed; - -/// Implements method predict that offers a way to estimate target value from new data -pub trait Predictor { - fn predict(&self, x: &X) -> Result; -} diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index c572ccc..9aed2f0 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -15,8 +15,7 @@ //! let blobs = generator::make_blobs(100, 2, 3); //! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data); //! // Fit the algorithm and predict cluster labels -//! 
let labels = DBSCAN::fit(&x, Distances::euclidian(), -//! DBSCANParameters::default().with_eps(3.0)). +//! let labels = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0)). //! and_then(|dbscan| dbscan.predict(&x)); //! //! println!("{:?}", labels); @@ -33,9 +32,11 @@ use std::iter::Sum; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::api::{Predictor, UnsupervisedEstimator}; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::tree::decision_tree_classifier::which_max; @@ -50,7 +51,11 @@ pub struct DBSCAN, T>> { #[derive(Debug, Clone)] /// DBSCAN clustering algorithm parameters -pub struct DBSCANParameters { +pub struct DBSCANParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub distance: D, /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. pub min_samples: usize, /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. @@ -59,7 +64,18 @@ pub struct DBSCANParameters { pub algorithm: KNNAlgorithmName, } -impl DBSCANParameters { +impl, T>> DBSCANParameters { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. 
+ pub fn with_distance, T>>(self, distance: DD) -> DBSCANParameters { + DBSCANParameters { + distance, + min_samples: self.min_samples, + eps: self.eps, + algorithm: self.algorithm, + } + } /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. pub fn with_min_samples(mut self, min_samples: usize) -> Self { self.min_samples = min_samples; @@ -86,9 +102,10 @@ impl, T>> PartialEq for DBSCAN { } } -impl Default for DBSCANParameters { +impl Default for DBSCANParameters { fn default() -> Self { DBSCANParameters { + distance: Distances::euclidian(), min_samples: 5, eps: T::half(), algorithm: KNNAlgorithmName::CoverTree, @@ -96,6 +113,22 @@ impl Default for DBSCANParameters { } } +impl, D: Distance, T>> + UnsupervisedEstimator> for DBSCAN +{ + fn fit(x: &M, parameters: DBSCANParameters) -> Result { + DBSCAN::fit(x, parameters) + } +} + +impl, D: Distance, T>> Predictor + for DBSCAN +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> DBSCAN { /// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features. 
/// * `data` - training instances to cluster @@ -103,8 +136,7 @@ impl, T>> DBSCAN { /// * `parameters` - cluster parameters pub fn fit>( x: &M, - distance: D, - parameters: DBSCANParameters, + parameters: DBSCANParameters, ) -> Result, Failed> { if parameters.min_samples < 1 { return Err(Failed::fit(&"Invalid minPts".to_string())); @@ -121,7 +153,9 @@ impl, T>> DBSCAN { let n = x.shape().0; let mut y = vec![unassigned; n]; - let algo = parameters.algorithm.fit(row_iter(x).collect(), distance)?; + let algo = parameters + .algorithm + .fit(row_iter(x).collect(), parameters.distance)?; for (i, e) in row_iter(x).enumerate() { if y[i] == unassigned { @@ -195,7 +229,6 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::math::distance::euclidian::Euclidian; - use crate::math::distance::Distances; #[test] fn fit_predict_dbscan() { @@ -215,16 +248,7 @@ mod tests { let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0]; - let dbscan = DBSCAN::fit( - &x, - Distances::euclidian(), - DBSCANParameters { - min_samples: 5, - eps: 1.0, - algorithm: KNNAlgorithmName::CoverTree, - }, - ) - .unwrap(); + let dbscan = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(1.0)).unwrap(); let predicted_labels = dbscan.predict(&x).unwrap(); @@ -256,7 +280,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let dbscan = DBSCAN::fit(&x, Distances::euclidian(), Default::default()).unwrap(); + let dbscan = DBSCAN::fit(&x, Default::default()).unwrap(); let deserialized_dbscan: DBSCAN = serde_json::from_str(&serde_json::to_string(&dbscan).unwrap()).unwrap(); diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index bc5d673..44ce1e6 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -43,7 +43,7 @@ //! &[5.2, 2.7, 3.9, 1.4], //! ]); //! -//! let kmeans = KMeans::fit(&x, 2, Default::default()).unwrap(); // Fit to data, 2 clusters +//! 
let kmeans = KMeans::fit(&x, KMeansParameters::default().with_k(2)).unwrap(); // Fit to data, 2 clusters //! let y_hat = kmeans.predict(&x).unwrap(); // use the same points for prediction //! ``` //! @@ -59,6 +59,7 @@ use std::iter::Sum; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::bbd_tree::BBDTree; +use crate::api::{Predictor, UnsupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::distance::euclidian::*; @@ -101,11 +102,18 @@ impl PartialEq for KMeans { #[derive(Debug, Clone)] /// K-Means clustering algorithm parameters pub struct KMeansParameters { + /// Number of clusters. + pub k: usize, /// Maximum number of iterations of the k-means algorithm for a single run. pub max_iter: usize, } impl KMeansParameters { + /// Number of clusters. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } /// Maximum number of iterations of the k-means algorithm for a single run. pub fn with_max_iter(mut self, max_iter: usize) -> Self { self.max_iter = max_iter; @@ -115,24 +123,37 @@ impl KMeansParameters { impl Default for KMeansParameters { fn default() -> Self { - KMeansParameters { max_iter: 100 } + KMeansParameters { + k: 2, + max_iter: 100, + } + } +} + +impl> UnsupervisedEstimator for KMeans { + fn fit(x: &M, parameters: KMeansParameters) -> Result { + KMeans::fit(x, parameters) + } +} + +impl> Predictor for KMeans { + fn predict(&self, x: &M) -> Result { + self.predict(x) } } impl KMeans { /// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features. 
- /// * `data` - training instances to cluster - /// * `k` - number of clusters + /// * `data` - training instances to cluster /// * `parameters` - cluster parameters - pub fn fit>( - data: &M, - k: usize, - parameters: KMeansParameters, - ) -> Result, Failed> { + pub fn fit>(data: &M, parameters: KMeansParameters) -> Result, Failed> { let bbd = BBDTree::new(data); - if k < 2 { - return Err(Failed::fit(&format!("invalid number of clusters: {}", k))); + if parameters.k < 2 { + return Err(Failed::fit(&format!( + "invalid number of clusters: {}", + parameters.k + ))); } if parameters.max_iter == 0 { @@ -145,9 +166,9 @@ impl KMeans { let (n, d) = data.shape(); let mut distortion = T::max_value(); - let mut y = KMeans::kmeans_plus_plus(data, k); - let mut size = vec![0; k]; - let mut centroids = vec![vec![T::zero(); d]; k]; + let mut y = KMeans::kmeans_plus_plus(data, parameters.k); + let mut size = vec![0; parameters.k]; + let mut centroids = vec![vec![T::zero(); d]; parameters.k]; for i in 0..n { size[y[i]] += 1; @@ -159,16 +180,16 @@ impl KMeans { } } - for i in 0..k { + for i in 0..parameters.k { for j in 0..d { centroids[i][j] /= T::from(size[i]).unwrap(); } } - let mut sums = vec![vec![T::zero(); d]; k]; + let mut sums = vec![vec![T::zero(); d]; parameters.k]; for _ in 1..=parameters.max_iter { let dist = bbd.clustering(¢roids, &mut sums, &mut size, &mut y); - for i in 0..k { + for i in 0..parameters.k { if size[i] > 0 { for j in 0..d { centroids[i][j] = T::from(sums[i][j]).unwrap() / T::from(size[i]).unwrap(); @@ -184,7 +205,7 @@ impl KMeans { } Ok(KMeans { - k, + k: parameters.k, y, size, distortion, @@ -280,10 +301,10 @@ mod tests { fn invalid_k() { let x = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); - assert!(KMeans::fit(&x, 0, Default::default()).is_err()); + assert!(KMeans::fit(&x, KMeansParameters::default().with_k(0)).is_err()); assert_eq!( "Fit failed: invalid number of clusters: 1", - KMeans::fit(&x, 1, Default::default()) + 
KMeans::fit(&x, KMeansParameters::default().with_k(1)) .unwrap_err() .to_string() ); @@ -314,7 +335,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let kmeans = KMeans::fit(&x, 2, Default::default()).unwrap(); + let kmeans = KMeans::fit(&x, Default::default()).unwrap(); let y = kmeans.predict(&x).unwrap(); @@ -348,7 +369,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let kmeans = KMeans::fit(&x, 2, Default::default()).unwrap(); + let kmeans = KMeans::fit(&x, Default::default()).unwrap(); let deserialized_kmeans: KMeans = serde_json::from_str(&serde_json::to_string(&kmeans).unwrap()).unwrap(); diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 68220e3..189e6de 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -37,7 +37,7 @@ //! &[5.2, 2.7, 3.9, 1.4], //! ]); //! -//! let pca = PCA::fit(&iris, 2, Default::default()).unwrap(); // Reduce number of features to 2 +//! let pca = PCA::fit(&iris, PCAParameters::default().with_n_components(2)).unwrap(); // Reduce number of features to 2 //! //! let iris_reduced = pca.transform(&iris).unwrap(); //! @@ -49,6 +49,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -83,12 +84,19 @@ impl> PartialEq for PCA { #[derive(Debug, Clone)] /// PCA parameters pub struct PCAParameters { + /// Number of components to keep. + pub n_components: usize, /// By default, covariance matrix is used to compute principal components. /// Enable this flag if you want to use correlation matrix instead. pub use_correlation_matrix: bool, } impl PCAParameters { + /// Number of components to keep. + pub fn with_n_components(mut self, n_components: usize) -> Self { + self.n_components = n_components; + self + } /// By default, covariance matrix is used to compute principal components. /// Enable this flag if you want to use correlation matrix instead. 
pub fn with_use_correlation_matrix(mut self, use_correlation_matrix: bool) -> Self { @@ -100,24 +108,33 @@ impl PCAParameters { impl Default for PCAParameters { fn default() -> Self { PCAParameters { + n_components: 2, use_correlation_matrix: false, } } } +impl> UnsupervisedEstimator for PCA { + fn fit(x: &M, parameters: PCAParameters) -> Result { + PCA::fit(x, parameters) + } +} + +impl> Transformer for PCA { + fn transform(&self, x: &M) -> Result { + self.transform(x) + } +} + impl> PCA { /// Fits PCA to your data. /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `n_components` - number of components to keep. /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. - pub fn fit( - data: &M, - n_components: usize, - parameters: PCAParameters, - ) -> Result, Failed> { + pub fn fit(data: &M, parameters: PCAParameters) -> Result, Failed> { let (m, n) = data.shape(); - if n_components > n { + if parameters.n_components > n { return Err(Failed::fit(&format!( "Number of components, n_components should be <= number of attributes ({})", n @@ -196,16 +213,16 @@ impl> PCA { } } - let mut projection = M::zeros(n_components, n); + let mut projection = M::zeros(parameters.n_components, n); for i in 0..n { - for j in 0..n_components { + for j in 0..parameters.n_components { projection.set(j, i, eigenvectors.get(i, j)); } } - let mut pmu = vec![T::zero(); n_components]; + let mut pmu = vec![T::zero(); parameters.n_components]; for (k, mu_k) in mu.iter().enumerate().take(n) { - for (i, pmu_i) in pmu.iter_mut().enumerate().take(n_components) { + for (i, pmu_i) in pmu.iter_mut().enumerate().take(parameters.n_components) { *pmu_i += projection.get(i, k) * (*mu_k); } } @@ -318,7 +335,7 @@ mod tests { &[0.0752, 0.2007], ]); - let pca = PCA::fit(&us_arrests, 2, Default::default()).unwrap(); + let pca = PCA::fit(&us_arrests, Default::default()).unwrap(); 
assert!(expected.approximate_eq(&pca.components().abs(), 0.4)); } @@ -414,7 +431,7 @@ mod tests { 302.04806302399646, ]; - let pca = PCA::fit(&us_arrests, 4, Default::default()).unwrap(); + let pca = PCA::fit(&us_arrests, PCAParameters::default().with_n_components(4)).unwrap(); assert!(pca .eigenvectors @@ -525,10 +542,9 @@ mod tests { let pca = PCA::fit( &us_arrests, - 4, - PCAParameters { - use_correlation_matrix: true, - }, + PCAParameters::default() + .with_n_components(4) + .with_use_correlation_matrix(true), ) .unwrap(); @@ -573,7 +589,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let pca = PCA::fit(&iris, 4, Default::default()).unwrap(); + let pca = PCA::fit(&iris, Default::default()).unwrap(); let deserialized_pca: PCA> = serde_json::from_str(&serde_json::to_string(&pca).unwrap()).unwrap(); diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index eea1969..d404ca7 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -34,7 +34,7 @@ //! &[5.2, 2.7, 3.9, 1.4], //! ]); //! -//! let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); // Reduce number of features to 2 +//! let svd = SVD::fit(&iris, SVDParameters::default().with_n_components(2)).unwrap(); // Reduce number of features to 2 //! //! let iris_reduced = svd.transform(&iris).unwrap(); //! @@ -47,6 +47,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -67,11 +68,34 @@ impl> PartialEq for SVD { #[derive(Debug, Clone)] /// SVD parameters -pub struct SVDParameters {} +pub struct SVDParameters { + /// Number of components to keep. + pub n_components: usize, +} impl Default for SVDParameters { fn default() -> Self { - SVDParameters {} + SVDParameters { n_components: 2 } + } +} + +impl SVDParameters { + /// Number of components to keep. 
+ pub fn with_n_components(mut self, n_components: usize) -> Self { + self.n_components = n_components; + self + } +} + +impl> UnsupervisedEstimator for SVD { + fn fit(x: &M, parameters: SVDParameters) -> Result { + SVD::fit(x, parameters) + } +} + +impl> Transformer for SVD { + fn transform(&self, x: &M) -> Result { + self.transform(x) } } @@ -80,10 +104,10 @@ impl> SVD { /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `n_components` - number of components to keep. /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. - pub fn fit(x: &M, n_components: usize, _: SVDParameters) -> Result, Failed> { + pub fn fit(x: &M, parameters: SVDParameters) -> Result, Failed> { let (_, p) = x.shape(); - if n_components >= p { + if parameters.n_components >= p { return Err(Failed::fit(&format!( "Number of components, n_components should be < number of attributes ({})", p @@ -92,7 +116,7 @@ impl> SVD { let svd = x.svd()?; - let components = svd.V.slice(0..p, 0..n_components); + let components = svd.V.slice(0..p, 0..parameters.n_components); Ok(SVD { components, @@ -189,7 +213,7 @@ mod tests { &[197.28420365, -11.66808306], &[293.43187394, 1.91163633], ]); - let svd = SVD::fit(&x, 2, Default::default()).unwrap(); + let svd = SVD::fit(&x, Default::default()).unwrap(); let x_transformed = svd.transform(&x).unwrap(); @@ -225,7 +249,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); + let svd = SVD::fit(&iris, Default::default()).unwrap(); let deserialized_svd: SVD> = serde_json::from_str(&serde_json::to_string(&svd).unwrap()).unwrap(); diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 9f1ba72..49c4239 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -51,7 +51,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, 
Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -151,6 +151,19 @@ impl Default for RandomForestClassifierParameters { } } +impl> + SupervisedEstimator + for RandomForestClassifier +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: RandomForestClassifierParameters, + ) -> Result { + RandomForestClassifier::fit(x, y, parameters) + } +} + impl> Predictor for RandomForestClassifier { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 6aa89d0..fdeb9fc 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -49,7 +49,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -135,6 +135,19 @@ impl PartialEq for RandomForestRegressor { } } +impl> + SupervisedEstimator + for RandomForestRegressor +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: RandomForestRegressorParameters, + ) -> Result { + RandomForestRegressor::fit(x, y, parameters) + } +} + impl> Predictor for RandomForestRegressor { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/lib.rs b/src/lib.rs index a1608c3..297fcc4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,7 +71,7 @@ /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; -pub(crate) mod base; +pub mod api; /// Algorithms for clustering of unlabeled data pub mod cluster; /// Various datasets diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index 1ab933a..2833ff1 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -58,7 +58,7 @@ use std::fmt::Debug; use serde::{Deserialize, 
Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -139,6 +139,14 @@ impl> PartialEq for ElasticNet { } } +impl> SupervisedEstimator> + for ElasticNet +{ + fn fit(x: &M, y: &M::RowVector, parameters: ElasticNetParameters) -> Result { + ElasticNet::fit(x, y, parameters) + } +} + impl> Predictor for ElasticNet { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index e16a316..b99ecff 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -26,7 +26,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -95,6 +95,14 @@ impl> PartialEq for Lasso { } } +impl> SupervisedEstimator> + for Lasso +{ + fn fit(x: &M, y: &M::RowVector, parameters: LassoParameters) -> Result { + Lasso::fit(x, y, parameters) + } +} + impl> Predictor for Lasso { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 1855673..2ef03c1 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -64,7 +64,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -116,6 +116,18 @@ impl> PartialEq for LinearRegression { } } +impl> SupervisedEstimator + for LinearRegression +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: LinearRegressionParameters, + ) -> Result { + LinearRegression::fit(x, y, parameters) + } +} + impl> Predictor for LinearRegression { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/linear/logistic_regression.rs 
b/src/linear/logistic_regression.rs index ffb845c..a71ac45 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -58,7 +58,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -218,6 +218,18 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction } } +impl> SupervisedEstimator + for LogisticRegression +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: LogisticRegressionParameters, + ) -> Result { + LogisticRegression::fit(x, y, parameters) + } +} + impl> Predictor for LogisticRegression { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index f29898d..e9ed1ff 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -60,7 +60,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -130,6 +130,18 @@ impl> PartialEq for RidgeRegression { } } +impl> SupervisedEstimator> + for RidgeRegression +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: RidgeRegressionParameters, + ) -> Result { + RidgeRegression::fit(x, y, parameters) + } +} + impl> Predictor for RidgeRegression { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 7776354..18dfa35 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -9,7 +9,7 @@ //! //! In SmartCore you can split your data into training and test datasets using `train_test_split` function. 
-use crate::base::Predictor; +use crate::api::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index c6cbfa8..388646f 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -33,7 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -208,6 +208,14 @@ pub struct BernoulliNB> { binarize: Option, } +impl> SupervisedEstimator> + for BernoulliNB +{ + fn fit(x: &M, y: &M::RowVector, parameters: BernoulliNBParameters) -> Result { + BernoulliNB::fit(x, y, parameters) + } +} + impl> Predictor for BernoulliNB { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index 667a270..c6f28bd 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -30,7 +30,7 @@ //! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! 
``` -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -242,6 +242,18 @@ pub struct CategoricalNB> { inner: BaseNaiveBayes>, } +impl> SupervisedEstimator> + for CategoricalNB +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: CategoricalNBParameters, + ) -> Result { + CategoricalNB::fit(x, y, parameters) + } +} + impl> Predictor for CategoricalNB { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index bc96420..2ac9892 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -22,7 +22,7 @@ //! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! ``` -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -183,6 +183,14 @@ pub struct GaussianNB> { inner: BaseNaiveBayes>, } +impl> SupervisedEstimator> + for GaussianNB +{ + fn fit(x: &M, y: &M::RowVector, parameters: GaussianNBParameters) -> Result { + GaussianNB::fit(x, y, parameters) + } +} + impl> Predictor for GaussianNB { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 237b606..4cae1f3 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -33,7 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. 
D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -194,6 +194,18 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } +impl> SupervisedEstimator> + for MultinomialNB +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: MultinomialNBParameters, + ) -> Result { + MultinomialNB::fit(x, y, parameters) + } +} + impl> Predictor for MultinomialNB { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 6668539..97dd748 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -36,7 +36,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; use crate::math::distance::euclidian::Euclidian; @@ -139,6 +139,18 @@ impl, T>> PartialEq for KNNClassifier { } } +impl, D: Distance, T>> + SupervisedEstimator> for KNNClassifier +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: KNNClassifierParameters, + ) -> Result { + KNNClassifier::fit(x, y, parameters) + } +} + impl, D: Distance, T>> Predictor for KNNClassifier { diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 80971e5..4e73103 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -39,7 +39,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::{row_iter, BaseVector, Matrix}; use 
crate::math::distance::euclidian::Euclidian; @@ -133,6 +133,18 @@ impl, T>> PartialEq for KNNRegressor { } } +impl, D: Distance, T>> + SupervisedEstimator> for KNNRegressor +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: KNNRegressorParameters, + ) -> Result { + KNNRegressor::fit(x, y, parameters) + } +} + impl, D: Distance, T>> Predictor for KNNRegressor { diff --git a/src/svm/svc.rs b/src/svm/svc.rs index aee4d3f..095d555 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -78,7 +78,7 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -185,6 +185,14 @@ impl> Default for SVCParameters } } +impl, K: Kernel> + SupervisedEstimator> for SVC +{ + fn fit(x: &M, y: &M::RowVector, parameters: SVCParameters) -> Result { + SVC::fit(x, y, parameters) + } +} + impl, K: Kernel> Predictor for SVC { diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 295ad78..9eb6046 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -70,7 +70,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -174,6 +174,14 @@ impl> Default for SVRParameters } } +impl, K: Kernel> + SupervisedEstimator> for SVR +{ + fn fit(x: &M, y: &M::RowVector, parameters: SVRParameters) -> Result { + SVR::fit(x, y, parameters) + } +} + impl, K: Kernel> Predictor for SVR { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 50a855b..3a92c54 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -71,7 +71,7 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; -use crate::base::Predictor; +use 
crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -293,6 +293,19 @@ pub(in crate) fn which_max(x: &[usize]) -> usize { which } +impl> + SupervisedEstimator + for DecisionTreeClassifier +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: DecisionTreeClassifierParameters, + ) -> Result { + DecisionTreeClassifier::fit(x, y, parameters) + } +} + impl> Predictor for DecisionTreeClassifier { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 806e680..06ee507 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -66,7 +66,7 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -208,6 +208,19 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } +impl> + SupervisedEstimator + for DecisionTreeRegressor +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: DecisionTreeRegressorParameters, + ) -> Result { + DecisionTreeRegressor::fit(x, y, parameters) + } +} + impl> Predictor for DecisionTreeRegressor { fn predict(&self, x: &M) -> Result { self.predict(x) From 9475d500dbe08d6b7c98ba68ac2bf4ce47c2fe31 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sun, 27 Dec 2020 18:39:37 -0800 Subject: [PATCH 28/78] feat: version change + api documentation updated --- Cargo.toml | 2 +- src/cluster/dbscan.rs | 16 +++++- src/lib.rs | 23 ++++----- src/model_selection/mod.rs | 103 +++++++++++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 21 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 32d8695..5e21aef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "smartcore" description = "The 
most advanced machine learning library in rust." homepage = "https://smartcorelib.org" -version = "0.1.0" +version = "0.2.0" authors = ["SmartCore Developers"] edition = "2018" license = "Apache-2.0" diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 9aed2f0..7d641cd 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -1,6 +1,20 @@ //! # DBSCAN Clustering //! -//! DBSCAN - Density-Based Spatial Clustering of Applications with Noise. +//! DBSCAN stands for density-based spatial clustering of applications with noise. This algorithm is good for arbitrarily shaped clusters and clusters with noise. +//! The main idea behind DBSCAN is that a point belongs to a cluster if it is close to many points from that cluster. There are two key parameters of DBSCAN: +//! +//! * `eps`, the maximum distance that specifies a neighborhood. Two points are considered to be neighbors if the distance between them is less than or equal to `eps`. +//! * `min_samples`, minimum number of data points that defines a cluster. +//! +//! Based on these two parameters, points are classified as core point, border point, or outlier: +//! +//! * A point is a core point if there are at least `min_samples` number of points, including the point itself in its vicinity. +//! * A point is a border point if it is reachable from a core point and there are less than `min_samples` number of points within its surrounding area. +//! * All points not reachable from any other point are outliers or noise points. +//! +//! The algorithm starts by picking an arbitrary point in the dataset. +//! If there are at least `min_samples` points within a radius of `eps` to the point then we consider all these points to be part of the same cluster. +//! The clusters are then expanded by recursively repeating the neighborhood calculation for each neighboring point. //! //! Example: //! 
diff --git a/src/lib.rs b/src/lib.rs index 297fcc4..d962894 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,16 +10,11 @@ //! //! Welcome to SmartCore, the most advanced machine learning library in Rust! //! -//! In SmartCore you will find implementation of these ML algorithms: -//! * __Regression__: Linear Regression (OLS), Decision Tree Regressor, Random Forest Regressor, K Nearest Neighbors -//! * __Classification__: Logistic Regressor, Decision Tree Classifier, Random Forest Classifier, Supervised Nearest Neighbors (KNN) -//! * __Clustering__: K-Means -//! * __Matrix Decomposition__: PCA, LU, QR, SVD, EVD -//! * __Distance Metrics__: Euclidian, Minkowski, Manhattan, Hamming, Mahalanobis -//! * __Evaluation Metrics__: Accuracy, AUC, Recall, Precision, F1, Mean Absolute Error, Mean Squared Error, R2 +//! SmartCore features various classification, regression and clustering algorithms including support vector machines, random forests, k-means and DBSCAN, +//! as well as tools for model selection and model evaluation. //! -//! Most of algorithms implemented in SmartCore operate on n-dimentional arrays. While you can use Rust vectors with all functions defined in this library -//! we do recommend to go with one of the popular linear algebra libraries available in Rust. At this moment we support these packages: +//! SmartCore is well integrated with a wide variety of libraries that provide support for large, multi-dimensional arrays and matrices. At this moment, +//! all SmartCore's algorithms work with ordinary Rust vectors, as well as matrices and vectors defined in these packages: //! * [ndarray](https://docs.rs/ndarray) //! * [nalgebra](https://docs.rs/nalgebra/) //! @@ -28,21 +23,21 @@ //! To start using SmartCore simply add the following to your Cargo.toml file: //! ```ignore //! [dependencies] -//! smartcore = "0.1.0" +//! smartcore = "0.2.0" //! ``` //! -//! All ML algorithms in SmartCore are grouped into these generic categories: +//! 
All machine learning algorithms in SmartCore are grouped into these broad categories: //! * [Clustering](cluster/index.html), unsupervised clustering of unlabeled data. //! * [Martix Decomposition](decomposition/index.html), various methods for matrix decomposition. //! * [Linear Models](linear/index.html), regression and classification methods where output is assumed to have linear relation to explanatory variables //! * [Ensemble Models](ensemble/index.html), variety of regression and classification ensemble models //! * [Tree-based Models](tree/index.html), classification and regression trees //! * [Nearest Neighbors](neighbors/index.html), K Nearest Neighbors for classification and regression +//! * [Naive Bayes](naive_bayes/index.html), statistical classification technique based on Bayes Theorem +//! * [SVM](svm/index.html), support vector machines //! -//! Each category is assigned to a separate module. //! -//! For example, KNN classifier is defined in [smartcore::neighbors::knn_classifier](neighbors/knn_classifier/index.html). To train and run it using standard Rust vectors you will -//! run this code: +//! For example, you can use this code to fit a [K Nearest Neighbors classifier](neighbors/knn_classifier/index.html) to a dataset that is defined as standard Rust vector: //! //! ``` //! // DenseMatrix defenition diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 18dfa35..0058367 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -1,13 +1,106 @@ //! # Model Selection methods //! -//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), -//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. +//! In statistics and machine learning we usually split our data into two sets: one for training and the other one for testing. +//! 
We fit our model to the training data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. //! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. //! Underfitted is bad because the model is undetrained and does not fit the training data well. -//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for -//! your data. +//! Splitting data into multiple subsets helps us to find the right combination of hyperparameters, estimate model performance and choose the right model for +//! the data. //! -//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +//! In SmartCore a random split into training and test sets can be quickly computed with the [train_test_split](./fn.train_test_split.html) helper function. +//! +//! ``` +//! use crate::smartcore::linalg::BaseMatrix; +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::model_selection::train_test_split; +//! +//! //Iris data +//! let x = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! let y: Vec = vec![ +//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., +//! ]; +//! +//! 
let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true); +//! +//! println!("X train: {:?}, y train: {}, X test: {:?}, y test: {}", +//! x_train.shape(), y_train.len(), x_test.shape(), y_test.len()); +//! ``` +//! +//! When we partition the available data into two disjoint sets, we drastically reduce the number of samples that can be used for training. +//! +//! One way to solve this problem is to use k-fold cross-validation. With k-fold validation, the dataset is split into k disjoint sets. +//! A model is trained using k - 1 of the folds, and the resulting model is validated on the remaining portion of the data. +//! +//! The simplest way to run cross-validation is to use the [cross_val_score](./fn.cross_validate.html) helper function on your estimator and the dataset. +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::model_selection::{KFold, cross_validate}; +//! use smartcore::metrics::accuracy; +//! use smartcore::linear::logistic_regression::LogisticRegression; +//! +//! //Iris data +//! let x = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! let y: Vec = vec![ +//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., +//! ]; +//! +//! let cv = KFold::default().with_n_splits(3); +//! +//! let results = cross_validate(LogisticRegression::fit, //estimator +//! &x, &y, //data +//! Default::default(), //hyperparameters +//! 
cv, //cross validation split +//! &accuracy).unwrap(); //metric +//! +//! println!("Training accuracy: {}, test accuracy: {}", +//! results.mean_train_score(), results.mean_test_score()); +//! ``` +//! +//! The function [cross_val_predict](./fn.cross_val_predict.html) has a similar interface to `cross_val_score`, +//! but instead of test error it calculates predictions for all samples in the test set. use crate::api::Predictor; use crate::error::Failed; From bb9a05b9930e686c7a6691ab27696eae35f5ee34 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sat, 2 Jan 2021 18:08:40 -0800 Subject: [PATCH 29/78] fix: fixes a bug in DBSCAN, removes println's --- src/cluster/dbscan.rs | 65 ++++++++++++++++++++++++++++------------ src/dataset/generator.rs | 3 -- src/decomposition/svd.rs | 3 +- 3 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 7d641cd..c793039 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -161,39 +161,60 @@ impl, T>> DBSCAN { } let mut k = 0; - let unassigned = -2; + let queued = -2; let outlier = -1; + let undefined = -3; let n = x.shape().0; - let mut y = vec![unassigned; n]; + let mut y = vec![undefined; n]; let algo = parameters .algorithm .fit(row_iter(x).collect(), parameters.distance)?; for (i, e) in row_iter(x).enumerate() { - if y[i] == unassigned { + if y[i] == undefined { let mut neighbors = algo.find_radius(&e, parameters.eps)?; if neighbors.len() < parameters.min_samples { y[i] = outlier; } else { y[i] = k; + for j in 0..neighbors.len() { - if y[neighbors[j].0] == unassigned { - y[neighbors[j].0] = k; - - let mut secondary_neighbors = - algo.find_radius(neighbors[j].2, parameters.eps)?; - - if secondary_neighbors.len() >= parameters.min_samples { - neighbors.append(&mut secondary_neighbors); - } - } - - if y[neighbors[j].0] == outlier { - y[neighbors[j].0] = k; + if y[neighbors[j].0] == undefined { + y[neighbors[j].0] = queued; } } + + while !neighbors.is_empty() 
{ + let neighbor = neighbors.pop().unwrap(); + let index = neighbor.0; + + if y[index] == outlier { + y[index] = k; + } + + if y[index] == undefined || y[index] == queued { + y[index] = k; + + let secondary_neighbors = + algo.find_radius(neighbor.2, parameters.eps)?; + + if secondary_neighbors.len() >= parameters.min_samples { + for j in 0..secondary_neighbors.len() { + let label = y[secondary_neighbors[j].0]; + if label == undefined { + y[secondary_neighbors[j].0] = queued; + } + + if label == undefined || label == outlier { + neighbors.push(secondary_neighbors[j]); + } + } + } + } + } + k += 1; } } @@ -250,19 +271,25 @@ mod tests { &[1.0, 2.0], &[1.1, 2.1], &[0.9, 1.9], - &[1.2, 1.2], + &[1.2, 2.2], &[0.8, 1.8], &[2.0, 1.0], &[2.1, 1.1], - &[2.2, 1.2], &[1.9, 0.9], + &[2.2, 1.2], &[1.8, 0.8], &[3.0, 5.0], ]); let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0]; - let dbscan = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(1.0)).unwrap(); + let dbscan = DBSCAN::fit( + &x, + DBSCANParameters::default() + .with_eps(0.5) + .with_min_samples(2), + ) + .unwrap(); let predicted_labels = dbscan.predict(&x).unwrap(); diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index e0b2939..28a2224 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -59,8 +59,6 @@ pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset Date: Tue, 5 Jan 2021 16:57:14 +0000 Subject: [PATCH 30/78] Fix Matrix typo in documentation --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index d962894..7d2b089 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,7 +28,7 @@ //! //! All machine learning algorithms in SmartCore are grouped into these broad categories: //! * [Clustering](cluster/index.html), unsupervised clustering of unlabeled data. -//! * [Martix Decomposition](decomposition/index.html), various methods for matrix decomposition. +//! 
* [Matrix Decomposition](decomposition/index.html), various methods for matrix decomposition. //! * [Linear Models](linear/index.html), regression and classification methods where output is assumed to have linear relation to explanatory variables //! * [Ensemble Models](ensemble/index.html), variety of regression and classification ensemble models //! * [Tree-based Models](tree/index.html), classification and regression trees From eb769493e78702aaf5d3b6a1210fde447440525e Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 5 Jan 2021 16:13:39 -0400 Subject: [PATCH 31/78] Add coverage check (#57) * Add coverage check --- .circleci/config.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 17da167..a931ff5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,6 +6,8 @@ workflows: jobs: - build - clippy + - coverage + jobs: build: docker: @@ -41,3 +43,17 @@ jobs: - run: name: Run cargo clippy command: cargo clippy --all-features -- -Drust-2018-idioms -Dwarnings + + coverage: + machine: true + steps: + - checkout + - run: + name: Generate report + command: > + docker run --security-opt seccomp=unconfined -v $PWD:/volume + xd009642/tarpaulin:latest-nightly cargo tarpaulin -v --ciserver circle-ci + --out Lcov --all-features -- --test-threads 1 + - run: + name: Upload + command: bash <(curl -s https://codecov.io/bash) -Z -f From e0d46f430be0f7016a4816665fabaa6b9318a6fd Mon Sep 17 00:00:00 2001 From: Ben Cross Date: Sun, 17 Jan 2021 21:35:03 +0000 Subject: [PATCH 32/78] feat: Make SerDe optional --- Cargo.toml | 3 +-- src/algorithm/neighbour/cover_tree.rs | 10 ++++++---- src/algorithm/neighbour/linear_search.rs | 5 +++-- src/algorithm/neighbour/mod.rs | 8 +++++--- src/cluster/dbscan.rs | 5 +++-- src/cluster/kmeans.rs | 5 +++-- src/decomposition/pca.rs | 5 +++-- src/decomposition/svd.rs | 5 +++-- src/ensemble/random_forest_classifier.rs | 8 +++++--- src/ensemble/random_forest_regressor.rs | 
8 +++++--- src/error/mod.rs | 8 +++++--- src/linalg/naive/dense_matrix.rs | 7 ++++++- src/linear/elastic_net.rs | 8 +++++--- src/linear/lasso.rs | 8 +++++--- src/linear/linear_regression.rs | 11 +++++++---- src/linear/logistic_regression.rs | 8 +++++--- src/linear/ridge_regression.rs | 11 +++++++---- src/math/distance/euclidian.rs | 5 +++-- src/math/distance/hamming.rs | 5 +++-- src/math/distance/mahalanobis.rs | 5 +++-- src/math/distance/manhattan.rs | 5 +++-- src/math/distance/minkowski.rs | 5 +++-- src/metrics/accuracy.rs | 5 +++-- src/metrics/auc.rs | 5 +++-- src/metrics/cluster_hcv.rs | 5 +++-- src/metrics/f1.rs | 5 +++-- src/metrics/mean_absolute_error.rs | 5 +++-- src/metrics/mean_squared_error.rs | 5 +++-- src/metrics/precision.rs | 5 +++-- src/metrics/r2.rs | 5 +++-- src/metrics/recall.rs | 5 +++-- src/naive_bayes/bernoulli.rs | 11 +++++++---- src/naive_bayes/categorical.rs | 11 +++++++---- src/naive_bayes/gaussian.rs | 11 +++++++---- src/naive_bayes/mod.rs | 5 +++-- src/naive_bayes/multinomial.rs | 11 +++++++---- src/neighbors/knn_classifier.rs | 8 +++++--- src/neighbors/knn_regressor.rs | 8 +++++--- src/neighbors/mod.rs | 5 +++-- src/svm/mod.rs | 14 +++++++++----- src/svm/svc.rs | 15 +++++++++------ src/svm/svr.rs | 15 +++++++++------ src/tree/decision_tree_classifier.rs | 14 +++++++++----- src/tree/decision_tree_regressor.rs | 11 +++++++---- 44 files changed, 206 insertions(+), 126 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5e21aef..d941735 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,8 +25,7 @@ num-traits = "0.2.12" num = "0.3.0" rand = "0.7.3" rand_distr = "0.3.0" -serde = { version = "1.0.115", features = ["derive"] } -serde_derive = "1.0.115" +serde = { version = "1.0.115", features = ["derive"], optional = true } [dev-dependencies] criterion = "0.3" diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index d271ed6..553dc99 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ 
b/src/algorithm/neighbour/cover_tree.rs @@ -24,7 +24,7 @@ //! ``` use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::sort::heap_select::HeapSelection; use crate::error::{Failed, FailedError}; @@ -32,7 +32,8 @@ use crate::math::distance::Distance; use crate::math::num::RealNumber; /// Implements Cover Tree algorithm -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct CoverTree> { base: F, inv_log_base: F, @@ -56,7 +57,8 @@ impl> PartialEq for CoverTree { } } -#[derive(Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct Node { idx: usize, max_dist: F, @@ -65,7 +67,7 @@ struct Node { scale: i64, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug)] struct DistanceSet { idx: usize, dist: Vec, diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index 45fbd6f..d82e575 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -22,7 +22,7 @@ //! //! 
``` -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::marker::PhantomData; @@ -32,7 +32,8 @@ use crate::math::distance::Distance; use crate::math::num::RealNumber; /// Implements Linear Search algorithm, see [KNN algorithms](../index.html) -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct LinearKNNSearch> { distance: D, data: Vec, diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index bf9e669..9e432bd 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -35,7 +35,7 @@ use crate::algorithm::neighbour::linear_search::LinearKNNSearch; use crate::error::Failed; use crate::math::distance::Distance; use crate::math::num::RealNumber; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; pub(crate) mod bbd_tree; /// tree data structure for fast nearest neighbor search @@ -45,7 +45,8 @@ pub mod linear_search; /// Both, KNN classifier and regressor benefits from underlying search algorithms that helps to speed up queries. 
/// `KNNAlgorithmName` maintains a list of supported search algorithms, see [KNN algorithms](../algorithm/neighbour/index.html) -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub enum KNNAlgorithmName { /// Heap Search algorithm, see [`LinearSearch`](../algorithm/neighbour/linear_search/index.html) LinearSearch, @@ -53,7 +54,8 @@ pub enum KNNAlgorithmName { CoverTree, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub(crate) enum KNNAlgorithm, T>> { LinearSearch(LinearKNNSearch, T, D>), CoverTree(CoverTree, T, D>), diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index c793039..a117982 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -43,7 +43,7 @@ use std::fmt::Debug; use std::iter::Sum; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, UnsupervisedEstimator}; @@ -55,7 +55,8 @@ use crate::math::num::RealNumber; use crate::tree::decision_tree_classifier::which_max; /// DBSCAN clustering algorithm -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct DBSCAN, T>> { cluster_labels: Vec, num_classes: usize, diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 44ce1e6..78c9105 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -56,7 +56,7 @@ use rand::Rng; use std::fmt::Debug; use std::iter::Sum; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::bbd_tree::BBDTree; use crate::api::{Predictor, UnsupervisedEstimator}; @@ -66,7 +66,8 @@ use crate::math::distance::euclidian::*; use crate::math::num::RealNumber; /// K-Means clustering algorithm 
-#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct KMeans { k: usize, y: Vec, diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 189e6de..626f268 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -47,7 +47,7 @@ //! use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; @@ -55,7 +55,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; /// Principal components analysis algorithm -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct PCA> { eigenvectors: M, eigenvalues: Vec, diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index 595e93c..7dc48dc 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -46,7 +46,7 @@ use std::fmt::Debug; use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; @@ -54,7 +54,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; /// SVD -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct SVD> { components: M, phantom: PhantomData, diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 49c4239..74f210c 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -49,7 +49,7 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; 
@@ -61,7 +61,8 @@ use crate::tree::decision_tree_classifier::{ /// Parameters of the Random Forest algorithm. /// Some parameters here are passed directly into base estimator. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct RandomForestClassifierParameters { /// Split criteria to use when building a tree. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) pub criterion: SplitCriterion, @@ -78,7 +79,8 @@ pub struct RandomForestClassifierParameters { } /// Random Forest Classifier -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct RandomForestClassifier { parameters: RandomForestClassifierParameters, trees: Vec>, diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index fdeb9fc..74a1b59 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -47,7 +47,7 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -57,7 +57,8 @@ use crate::tree::decision_tree_regressor::{ DecisionTreeRegressor, DecisionTreeRegressorParameters, }; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Parameters of the Random Forest Regressor /// Some parameters here are passed directly into base estimator. 
pub struct RandomForestRegressorParameters { @@ -74,7 +75,8 @@ pub struct RandomForestRegressorParameters { } /// Random Forest Regressor -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct RandomForestRegressor { parameters: RandomForestRegressorParameters, trees: Vec>, diff --git a/src/error/mod.rs b/src/error/mod.rs index 2409889..9a9bb8b 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -2,10 +2,11 @@ use std::error::Error; use std::fmt; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Generic error to be raised when something goes wrong. -#[derive(Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct Failed { err: FailedError, msg: String, @@ -13,7 +14,8 @@ pub struct Failed { /// Type of error #[non_exhaustive] -#[derive(Copy, Clone, Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Copy, Clone, Debug)] pub enum FailedError { /// Can't fit algorithm to data FitFailed = 1, diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index a0b7bdb..9816a28 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1,11 +1,14 @@ #![allow(clippy::ptr_arg)] use std::fmt; use std::fmt::Debug; -use std::marker::PhantomData; +#[cfg(feature = "serde")] use std::marker::PhantomData; use std::ops::Range; +#[cfg(feature = "serde")] use serde::de::{Deserializer, MapAccess, SeqAccess, Visitor}; +#[cfg(feature = "serde")] use serde::ser::{SerializeStruct, Serializer}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::cholesky::CholeskyDecomposableMatrix; @@ -349,6 +352,7 @@ impl<'a, T: RealNumber> Iterator for DenseMatrixIterator<'a, T> { } } +#[cfg(feature = "serde")] impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> 
Deserialize<'de> for DenseMatrix { fn deserialize(deserializer: D) -> Result where @@ -434,6 +438,7 @@ impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for De } } +#[cfg(feature = "serde")] impl Serialize for DenseMatrix { fn serialize(&self, serializer: S) -> Result where diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index 2833ff1..7e7a29a 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -56,7 +56,7 @@ //! use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -67,7 +67,8 @@ use crate::math::num::RealNumber; use crate::linear::lasso_optimizer::InteriorPointOptimizer; /// Elastic net parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct ElasticNetParameters { /// Regularization parameter. pub alpha: T, @@ -84,7 +85,8 @@ pub struct ElasticNetParameters { } /// Elastic net -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct ElasticNet> { coefficients: M, intercept: T, diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index b99ecff..8f46bbc 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -24,7 +24,7 @@ //! 
use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -34,7 +34,8 @@ use crate::linear::lasso_optimizer::InteriorPointOptimizer; use crate::math::num::RealNumber; /// Lasso regression parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct LassoParameters { /// Controls the strength of the penalty to the loss function. pub alpha: T, @@ -47,7 +48,8 @@ pub struct LassoParameters { pub max_iter: usize, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] /// Lasso regressor pub struct Lasso> { coefficients: M, diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 2ef03c1..6d24312 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -62,14 +62,15 @@ //! use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Approach to use for estimation of regression coefficients. QR is more efficient but SVD is more stable. pub enum LinearRegressionSolverName { /// QR decomposition, see [QR](../../linalg/qr/index.html) @@ -79,14 +80,16 @@ pub enum LinearRegressionSolverName { } /// Linear Regression parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct LinearRegressionParameters { /// Solver to use for estimation of regression coefficients. 
pub solver: LinearRegressionSolverName, } /// Linear Regression -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct LinearRegression> { coefficients: M, intercept: T, diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index a71ac45..cdf78d1 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -56,7 +56,7 @@ use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -68,11 +68,13 @@ use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; /// Logistic Regression parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct LogisticRegressionParameters {} /// Logistic Regression -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct LogisticRegression> { coefficients: M, intercept: M, diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index e9ed1ff..5afa2f9 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -58,7 +58,7 @@ //! use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -66,7 +66,8 @@ use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Approach to use for estimation of regression coefficients. 
Cholesky is more efficient but SVD is more stable. pub enum RidgeRegressionSolverName { /// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html) @@ -76,7 +77,8 @@ pub enum RidgeRegressionSolverName { } /// Ridge Regression parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct RidgeRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: RidgeRegressionSolverName, @@ -88,7 +90,8 @@ pub struct RidgeRegressionParameters { } /// Ridge regression -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct RidgeRegression> { coefficients: M, intercept: T, diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 9034727..6385f6e 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -18,14 +18,15 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; /// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Euclidian {} impl Euclidian { diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index 129fe16..bdd8e14 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -19,14 +19,15 @@ //! //! 
-use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; /// While comparing two integer-valued vectors of equal length, Hamming distance is the number of bit positions in which the two bits are different -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Hamming {} impl Distance, F> for Hamming { diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index 84aa947..9f47894 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -44,7 +44,7 @@ use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; @@ -52,7 +52,8 @@ use super::Distance; use crate::linalg::Matrix; /// Mahalanobis distance. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Mahalanobis> { /// covariance matrix of the dataset pub sigma: M, diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 9a69184..758763b 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -17,14 +17,15 @@ //! ``` //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; /// Manhattan distance -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Manhattan {} impl Distance, T> for Manhattan { diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index c5dd85d..e953571 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -21,14 +21,15 @@ //! //! 
-use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; /// Defines the Minkowski distance of order `p` -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Minkowski { /// order, integer pub p: u16, diff --git a/src/metrics/accuracy.rs b/src/metrics/accuracy.rs index ef7028f..c5a129b 100644 --- a/src/metrics/accuracy.rs +++ b/src/metrics/accuracy.rs @@ -16,13 +16,14 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Accuracy metric. -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct Accuracy {} impl Accuracy { diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index 0f8d56a..f352ca7 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -20,14 +20,15 @@ //! 
* ["The ROC-AUC and the Mann-Whitney U-test", Haupt, J.](https://johaupt.github.io/roc-auc/model%20evaluation/Area_under_ROC_curve.html) #![allow(non_snake_case)] -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Area Under the Receiver Operating Characteristic Curve (ROC AUC) -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct AUC {} impl AUC { diff --git a/src/metrics/cluster_hcv.rs b/src/metrics/cluster_hcv.rs index 29a9db2..40e5173 100644 --- a/src/metrics/cluster_hcv.rs +++ b/src/metrics/cluster_hcv.rs @@ -1,10 +1,11 @@ -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; use crate::metrics::cluster_helpers::*; -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] /// Homogeneity, completeness and V-Measure scores. pub struct HCVScore {} diff --git a/src/metrics/f1.rs b/src/metrics/f1.rs index 5c8537c..29f989e 100644 --- a/src/metrics/f1.rs +++ b/src/metrics/f1.rs @@ -18,7 +18,7 @@ //! //! //! 
-use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; @@ -26,7 +26,8 @@ use crate::metrics::precision::Precision; use crate::metrics::recall::Recall; /// F-measure -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct F1 { /// a positive real factor pub beta: T, diff --git a/src/metrics/mean_absolute_error.rs b/src/metrics/mean_absolute_error.rs index a069335..1049589 100644 --- a/src/metrics/mean_absolute_error.rs +++ b/src/metrics/mean_absolute_error.rs @@ -18,12 +18,13 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] /// Mean Absolute Error pub struct MeanAbsoluteError {} diff --git a/src/metrics/mean_squared_error.rs b/src/metrics/mean_squared_error.rs index 137c8e6..3bcb7e1 100644 --- a/src/metrics/mean_squared_error.rs +++ b/src/metrics/mean_squared_error.rs @@ -18,12 +18,13 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] /// Mean Squared Error pub struct MeanSquareError {} diff --git a/src/metrics/precision.rs b/src/metrics/precision.rs index 3524e7f..806c119 100644 --- a/src/metrics/precision.rs +++ b/src/metrics/precision.rs @@ -18,13 +18,14 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Precision metric. 
-#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct Precision {} impl Precision { diff --git a/src/metrics/r2.rs b/src/metrics/r2.rs index cbcf7e4..0d661b7 100644 --- a/src/metrics/r2.rs +++ b/src/metrics/r2.rs @@ -18,13 +18,14 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Coefficient of Determination (R2) -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct R2 {} impl R2 { diff --git a/src/metrics/recall.rs b/src/metrics/recall.rs index 4d2be95..22f5402 100644 --- a/src/metrics/recall.rs +++ b/src/metrics/recall.rs @@ -18,13 +18,14 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Recall metric. 
-#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct Recall {} impl Recall { diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 388646f..7233b83 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -42,10 +42,11 @@ use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Bearnoulli features -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] struct BernoulliNBDistribution { /// class labels known to the classifier class_labels: Vec, @@ -77,7 +78,8 @@ impl> NBDistribution for BernoulliNBDistributi } /// `BernoulliNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct BernoulliNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, @@ -202,7 +204,8 @@ impl BernoulliNBDistribution { } /// BernoulliNB implements the categorical naive Bayes algorithm for categorically distributed data. 
-#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub struct BernoulliNB> { inner: BaseNaiveBayes>, binarize: Option, diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index c6f28bd..c6b66c6 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -36,10 +36,11 @@ use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct CategoricalNBDistribution { class_labels: Vec, class_priors: Vec, @@ -216,7 +217,8 @@ impl CategoricalNBDistribution { } /// `CategoricalNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct CategoricalNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, @@ -237,7 +239,8 @@ impl Default for CategoricalNBParameters { } /// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data. 
-#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub struct CategoricalNB> { inner: BaseNaiveBayes>, } diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 2ac9892..6ba78bb 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -30,10 +30,11 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] struct GaussianNBDistribution { /// class labels known to the classifier class_labels: Vec, @@ -75,7 +76,8 @@ impl> NBDistribution for GaussianNBDistributio } /// `GaussianNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Default, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Default, Clone)] pub struct GaussianNBParameters { /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data pub priors: Option>, @@ -178,7 +180,8 @@ impl GaussianNBDistribution { } /// GaussianNB implements the categorical naive Bayes algorithm for categorically distributed data. 
-#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub struct GaussianNB> { inner: BaseNaiveBayes>, } diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 7ab8b85..9a24466 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -39,7 +39,7 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use std::marker::PhantomData; /// Distribution used in the Naive Bayes classifier. @@ -55,7 +55,8 @@ pub(crate) trait NBDistribution> { } /// Base struct for the Naive Bayes classifier. -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub(crate) struct BaseNaiveBayes, D: NBDistribution> { distribution: D, _phantom_t: PhantomData, diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 4cae1f3..23382a1 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -42,10 +42,11 @@ use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Multinomial features -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] struct MultinomialNBDistribution { /// class labels known to the classifier class_labels: Vec, @@ -73,7 +74,8 @@ impl> NBDistribution for MultinomialNBDistribu } /// `MultinomialNB` parameters. Use `Default::default()` for default values. 
-#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct MultinomialNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, @@ -189,7 +191,8 @@ impl MultinomialNBDistribution { } /// MultinomialNB implements the categorical naive Bayes algorithm for categorically distributed data. -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub struct MultinomialNB> { inner: BaseNaiveBayes>, } diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 97dd748..0f75220 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -33,7 +33,7 @@ //! use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, SupervisedEstimator}; @@ -45,7 +45,8 @@ use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNClassifier` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct KNNClassifierParameters, T>> { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. 
@@ -62,7 +63,8 @@ pub struct KNNClassifierParameters, T>> { } /// K Nearest Neighbors Classifier -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct KNNClassifier, T>> { classes: Vec, y: Vec, diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 4e73103..86bfd85 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -36,7 +36,7 @@ //! use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, SupervisedEstimator}; @@ -48,7 +48,8 @@ use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNRegressor` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct KNNRegressorParameters, T>> { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. @@ -65,7 +66,8 @@ pub struct KNNRegressorParameters, T>> { } /// K Nearest Neighbors Regressor -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct KNNRegressor, T>> { y: Vec, knn_algorithm: KNNAlgorithm, diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index 85ea6b8..6beb75e 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -33,7 +33,7 @@ //! 
use crate::math::num::RealNumber; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// K Nearest Neighbors Classifier pub mod knn_classifier; @@ -48,7 +48,8 @@ pub mod knn_regressor; pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName; /// Weight function that is used to determine estimated value. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub enum KNNWeightFunction { /// All k nearest points are weighted equally Uniform, diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 1e013d2..abe8071 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -26,7 +26,7 @@ pub mod svc; pub mod svr; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; @@ -93,18 +93,21 @@ impl Kernels { } /// Linear Kernel -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct LinearKernel {} /// Radial basis function (Gaussian) kernel -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct RBFKernel { /// kernel coefficient pub gamma: T, } /// Polynomial kernel -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct PolynomialKernel { /// degree of the polynomial pub degree: T, @@ -115,7 +118,8 @@ pub struct PolynomialKernel { } /// Sigmoid (hyperbolic tangent) kernel -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct SigmoidKernel { /// kernel coefficient pub gamma: T, diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 095d555..0582cdc 100644 --- 
a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -76,7 +76,7 @@ use std::marker::PhantomData; use rand::seq::SliceRandom; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -85,7 +85,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::svm::{Kernel, Kernels, LinearKernel}; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// SVC Parameters pub struct SVCParameters, K: Kernel> { /// Number of epochs. @@ -100,11 +101,12 @@ pub struct SVCParameters, K: Kernel m: PhantomData, } -#[derive(Serialize, Deserialize, Debug)] -#[serde(bound( +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +#[cfg_attr(feature = "serde", serde(bound( serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", -))] +)))] /// Support Vector Classifier pub struct SVC, K: Kernel> { classes: Vec, @@ -114,7 +116,8 @@ pub struct SVC, K: Kernel> { b: T, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct SupportVector> { index: usize, x: V, diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 9eb6046..4d61b97 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -68,7 +68,7 @@ use std::cell::{Ref, RefCell}; use std::fmt::Debug; use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -77,7 +77,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::svm::{Kernel, Kernels, LinearKernel}; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 
+#[derive(Debug, Clone)] /// SVR Parameters pub struct SVRParameters, K: Kernel> { /// Epsilon in the epsilon-SVR model. @@ -92,11 +93,12 @@ pub struct SVRParameters, K: Kernel m: PhantomData, } -#[derive(Serialize, Deserialize, Debug)] -#[serde(bound( +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +#[cfg_attr(feature = "serde", serde(bound( serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", -))] +)))] /// Epsilon-Support Vector Regression pub struct SVR, K: Kernel> { @@ -106,7 +108,8 @@ pub struct SVR, K: Kernel> { b: T, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct SupportVector> { index: usize, x: V, diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 3a92c54..b014152 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -68,7 +68,7 @@ use std::fmt::Debug; use std::marker::PhantomData; use rand::seq::SliceRandom; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::api::{Predictor, SupervisedEstimator}; @@ -76,7 +76,8 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Parameters of Decision Tree pub struct DecisionTreeClassifierParameters { /// Split criteria to use when building a tree. 
@@ -90,7 +91,8 @@ pub struct DecisionTreeClassifierParameters { } /// Decision Tree -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct DecisionTreeClassifier { nodes: Vec>, parameters: DecisionTreeClassifierParameters, @@ -100,7 +102,8 @@ pub struct DecisionTreeClassifier { } /// The function to measure the quality of a split. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub enum SplitCriterion { /// [Gini index](../decision_tree_classifier/index.html) Gini, @@ -110,7 +113,8 @@ pub enum SplitCriterion { ClassificationError, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct Node { index: usize, output: usize, diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 06ee507..ef8c52c 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -63,7 +63,7 @@ use std::default::Default; use std::fmt::Debug; use rand::seq::SliceRandom; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::api::{Predictor, SupervisedEstimator}; @@ -71,7 +71,8 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Parameters of Regression Tree pub struct DecisionTreeRegressorParameters { /// The maximum depth of the tree. 
@@ -83,14 +84,16 @@ pub struct DecisionTreeRegressorParameters { } /// Regression Tree -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct DecisionTreeRegressor { nodes: Vec>, parameters: DecisionTreeRegressorParameters, depth: u16, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct Node { index: usize, output: T, From 762986b271c141b112ecb5d855834de76923ae3c Mon Sep 17 00:00:00 2001 From: Ben Cross Date: Sun, 17 Jan 2021 21:37:30 +0000 Subject: [PATCH 33/78] Cargo format --- src/algorithm/neighbour/cover_tree.rs | 3 ++- src/algorithm/neighbour/linear_search.rs | 3 ++- src/algorithm/neighbour/mod.rs | 3 ++- src/cluster/dbscan.rs | 3 ++- src/cluster/kmeans.rs | 3 ++- src/decomposition/pca.rs | 3 ++- src/decomposition/svd.rs | 3 ++- src/ensemble/random_forest_classifier.rs | 3 ++- src/ensemble/random_forest_regressor.rs | 3 ++- src/error/mod.rs | 3 ++- src/linalg/naive/dense_matrix.rs | 3 ++- src/linear/elastic_net.rs | 3 ++- src/linear/lasso.rs | 3 ++- src/linear/linear_regression.rs | 3 ++- src/linear/logistic_regression.rs | 3 ++- src/linear/ridge_regression.rs | 3 ++- src/math/distance/euclidian.rs | 3 ++- src/math/distance/hamming.rs | 3 ++- src/math/distance/mahalanobis.rs | 3 ++- src/math/distance/manhattan.rs | 3 ++- src/math/distance/minkowski.rs | 3 ++- src/metrics/accuracy.rs | 3 ++- src/metrics/auc.rs | 3 ++- src/metrics/cluster_hcv.rs | 3 ++- src/metrics/f1.rs | 3 ++- src/metrics/mean_absolute_error.rs | 3 ++- src/metrics/mean_squared_error.rs | 3 ++- src/metrics/precision.rs | 3 ++- src/metrics/r2.rs | 3 ++- src/metrics/recall.rs | 3 ++- src/naive_bayes/bernoulli.rs | 3 ++- src/naive_bayes/categorical.rs | 3 ++- src/naive_bayes/gaussian.rs | 3 ++- src/naive_bayes/mod.rs | 3 ++- src/naive_bayes/multinomial.rs | 3 ++- src/neighbors/knn_classifier.rs | 3 ++- src/neighbors/knn_regressor.rs | 3 
++- src/neighbors/mod.rs | 3 ++- src/svm/mod.rs | 3 ++- src/svm/svc.rs | 14 +++++++++----- src/svm/svr.rs | 14 +++++++++----- src/tree/decision_tree_classifier.rs | 3 ++- src/tree/decision_tree_regressor.rs | 3 ++- 43 files changed, 100 insertions(+), 51 deletions(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 553dc99..96a3389 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -24,7 +24,8 @@ //! ``` use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::sort::heap_select::HeapSelection; use crate::error::{Failed, FailedError}; diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index d82e575..f89e751 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -22,7 +22,8 @@ //! //! ``` -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::marker::PhantomData; diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 9e432bd..321ec01 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -35,7 +35,8 @@ use crate::algorithm::neighbour::linear_search::LinearKNNSearch; use crate::error::Failed; use crate::math::distance::Distance; use crate::math::num::RealNumber; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; pub(crate) mod bbd_tree; /// tree data structure for fast nearest neighbor search diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index a117982..73d686d 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -43,7 +43,8 @@ use std::fmt::Debug; use std::iter::Sum; -#[cfg(feature = 
"serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, UnsupervisedEstimator}; diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 78c9105..a454b1f 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -56,7 +56,8 @@ use rand::Rng; use std::fmt::Debug; use std::iter::Sum; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::bbd_tree::BBDTree; use crate::api::{Predictor, UnsupervisedEstimator}; diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 626f268..e3212e3 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -47,7 +47,8 @@ //! use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index 7dc48dc..5524e29 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -46,7 +46,8 @@ use std::fmt::Debug; use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 74f210c..62e83b5 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -49,7 +49,8 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, 
SupervisedEstimator}; use crate::error::Failed; diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 74a1b59..18c2f69 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -47,7 +47,8 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/error/mod.rs b/src/error/mod.rs index 9a9bb8b..4e84f6e 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -2,7 +2,8 @@ use std::error::Error; use std::fmt; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Generic error to be raised when something goes wrong. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 9816a28..1a9b3a6 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1,7 +1,8 @@ #![allow(clippy::ptr_arg)] use std::fmt; use std::fmt::Debug; -#[cfg(feature = "serde")] use std::marker::PhantomData; +#[cfg(feature = "serde")] +use std::marker::PhantomData; use std::ops::Range; #[cfg(feature = "serde")] diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index 7e7a29a..f4a4326 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -56,7 +56,8 @@ //! use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 8f46bbc..17712b1 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -24,7 +24,8 @@ //! 
use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 6d24312..290a2db 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -62,7 +62,8 @@ //! use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index cdf78d1..45777be 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -56,7 +56,8 @@ use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 5afa2f9..4e1ebad 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -58,7 +58,8 @@ //! use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 6385f6e..b06d7d1 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -18,7 +18,8 @@ //! //! //! 
-#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index bdd8e14..d23b57f 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -19,7 +19,8 @@ //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index 9f47894..7ff86e9 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -44,7 +44,8 @@ use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 758763b..3162178 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -17,7 +17,8 @@ //! ``` //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index e953571..1e97ea8 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -21,7 +21,8 @@ //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/metrics/accuracy.rs b/src/metrics/accuracy.rs index c5a129b..6912a4c 100644 --- a/src/metrics/accuracy.rs +++ b/src/metrics/accuracy.rs @@ -16,7 +16,8 @@ //! //! //! 
-#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index f352ca7..508295b 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -20,7 +20,8 @@ //! * ["The ROC-AUC and the Mann-Whitney U-test", Haupt, J.](https://johaupt.github.io/roc-auc/model%20evaluation/Area_under_ROC_curve.html) #![allow(non_snake_case)] -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::linalg::BaseVector; diff --git a/src/metrics/cluster_hcv.rs b/src/metrics/cluster_hcv.rs index 40e5173..d881bdc 100644 --- a/src/metrics/cluster_hcv.rs +++ b/src/metrics/cluster_hcv.rs @@ -1,4 +1,5 @@ -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/f1.rs b/src/metrics/f1.rs index 29f989e..d957d9b 100644 --- a/src/metrics/f1.rs +++ b/src/metrics/f1.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/mean_absolute_error.rs b/src/metrics/mean_absolute_error.rs index 1049589..db3039f 100644 --- a/src/metrics/mean_absolute_error.rs +++ b/src/metrics/mean_absolute_error.rs @@ -18,7 +18,8 @@ //! //! //! 
-#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/mean_squared_error.rs b/src/metrics/mean_squared_error.rs index 3bcb7e1..3003e5d 100644 --- a/src/metrics/mean_squared_error.rs +++ b/src/metrics/mean_squared_error.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/precision.rs b/src/metrics/precision.rs index 806c119..2bd0dcf 100644 --- a/src/metrics/precision.rs +++ b/src/metrics/precision.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/r2.rs b/src/metrics/r2.rs index 0d661b7..c710ef5 100644 --- a/src/metrics/r2.rs +++ b/src/metrics/r2.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/recall.rs b/src/metrics/recall.rs index 22f5402..d1fad56 100644 --- a/src/metrics/recall.rs +++ b/src/metrics/recall.rs @@ -18,7 +18,8 @@ //! //! //! 
-#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 7233b83..cdbfa80 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -42,7 +42,8 @@ use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Bearnoulli features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index c6b66c6..dc8587a 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -36,7 +36,8 @@ use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 6ba78bb..c27c396 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -30,7 +30,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/naive_bayes/mod.rs 
b/src/naive_bayes/mod.rs index 9a24466..f7c8da6 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -39,7 +39,8 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use std::marker::PhantomData; /// Distribution used in the Naive Bayes classifier. diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 23382a1..fa91020 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -42,7 +42,8 @@ use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Multinomial features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 0f75220..839eea3 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -33,7 +33,8 @@ //! use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, SupervisedEstimator}; diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 86bfd85..1edf86a 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -36,7 +36,8 @@ //! 
use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, SupervisedEstimator}; diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index 6beb75e..86b1e46 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -33,7 +33,8 @@ //! use crate::math::num::RealNumber; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// K Nearest Neighbors Classifier pub mod knn_classifier; diff --git a/src/svm/mod.rs b/src/svm/mod.rs index abe8071..068f773 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -26,7 +26,8 @@ pub mod svc; pub mod svr; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 0582cdc..9d77812 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -76,7 +76,8 @@ use std::marker::PhantomData; use rand::seq::SliceRandom; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -103,10 +104,13 @@ pub struct SVCParameters, K: Kernel #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] -#[cfg_attr(feature = "serde", serde(bound( - serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", - deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", -)))] +#[cfg_attr( + feature = "serde", + serde(bound( + serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", + deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", + )) +)] /// Support 
Vector Classifier pub struct SVC, K: Kernel> { classes: Vec, diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 4d61b97..cbb1ea5 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -68,7 +68,8 @@ use std::cell::{Ref, RefCell}; use std::fmt::Debug; use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -95,10 +96,13 @@ pub struct SVRParameters, K: Kernel #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] -#[cfg_attr(feature = "serde", serde(bound( - serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", - deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", -)))] +#[cfg_attr( + feature = "serde", + serde(bound( + serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", + deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", + )) +)] /// Epsilon-Support Vector Regression pub struct SVR, K: Kernel> { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index b014152..7575a5a 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -68,7 +68,8 @@ use std::fmt::Debug; use std::marker::PhantomData; use rand::seq::SliceRandom; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::api::{Predictor, SupervisedEstimator}; diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index ef8c52c..d1292db 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -63,7 +63,8 @@ use std::default::Default; use std::fmt::Debug; use rand::seq::SliceRandom; -#[cfg(feature = "serde")] use serde::{Deserialize, 
Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::api::{Predictor, SupervisedEstimator}; From f1cf8a6f0845f48e16f342f3e56b1bdb93ae2d2a Mon Sep 17 00:00:00 2001 From: Ben Cross Date: Mon, 18 Jan 2021 10:32:35 +0000 Subject: [PATCH 34/78] Added serde feature flags to tests --- src/algorithm/neighbour/cover_tree.rs | 4 +++- src/algorithm/neighbour/linear_search.rs | 3 ++- src/cluster/dbscan.rs | 2 ++ src/cluster/kmeans.rs | 1 + src/decomposition/pca.rs | 1 + src/decomposition/svd.rs | 1 + src/ensemble/random_forest_classifier.rs | 1 + src/ensemble/random_forest_regressor.rs | 1 + src/linalg/naive/dense_matrix.rs | 2 ++ src/linear/elastic_net.rs | 1 + src/linear/lasso.rs | 1 + src/linear/linear_regression.rs | 1 + src/linear/logistic_regression.rs | 1 + src/linear/ridge_regression.rs | 1 + src/naive_bayes/bernoulli.rs | 1 + src/naive_bayes/categorical.rs | 1 + src/naive_bayes/gaussian.rs | 1 + src/naive_bayes/multinomial.rs | 1 + src/neighbors/knn_classifier.rs | 1 + src/neighbors/knn_regressor.rs | 1 + src/svm/svc.rs | 2 ++ src/svm/svr.rs | 2 ++ src/tree/decision_tree_classifier.rs | 1 + src/tree/decision_tree_regressor.rs | 1 + 24 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 96a3389..9c5c806 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -457,7 +457,8 @@ mod tests { use super::*; use crate::math::distance::Distances; - #[derive(Debug, Serialize, Deserialize, Clone)] + #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] + #[derive(Debug, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { @@ -503,6 +504,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; diff --git a/src/algorithm/neighbour/linear_search.rs 
b/src/algorithm/neighbour/linear_search.rs index f89e751..b4a3c89 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -140,7 +140,8 @@ mod tests { use super::*; use crate::math::distance::Distances; - #[derive(Debug, Serialize, Deserialize, Clone)] + #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] + #[derive(Debug, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 73d686d..d7a706a 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -265,6 +265,7 @@ impl, T>> DBSCAN { mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg(feature = "serde")] use crate::math::distance::euclidian::Euclidian; #[test] @@ -299,6 +300,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index a454b1f..6be52a5 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -347,6 +347,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index e3212e3..de258dc 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -567,6 +567,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let iris = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index 5524e29..6f5a1bd 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -228,6 +228,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let iris = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 62e83b5..4127627 100644 --- 
a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -325,6 +325,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 18c2f69..02eef99 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -274,6 +274,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159., 107.608, 1947., 60.323], diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 1a9b3a6..4faa77d 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1312,6 +1312,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn to_from_json() { let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3], &[0.7, 0.3, 0.8]]); let deserialized_a: DenseMatrix = @@ -1320,6 +1321,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn to_from_bincode() { let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3], &[0.7, 0.3, 0.8]]); let deserialized_a: DenseMatrix = diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index f4a4326..479ae2a 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -401,6 +401,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 17712b1..8c59a4f 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -275,6 +275,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 290a2db..2734a78 100644 
--- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -251,6 +251,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 45777be..cbdef77 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -543,6 +543,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[1., -5.], diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 4e1ebad..787c338 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -330,6 +330,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index cdbfa80..6a7d0b4 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -351,6 +351,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::::from_2d_array(&[ &[1., 1., 0., 0., 0., 0.], diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index dc8587a..2161528 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -349,6 +349,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::::from_2d_array(&[ &[3., 4., 0., 1.], diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index c27c396..28c4785 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -281,6 +281,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::::from_2d_array(&[ &[-1., -1.], diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index fa91020..06ee071 100644 --- 
a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -324,6 +324,7 @@ mod tests { )); } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::::from_2d_array(&[ &[1., 1., 0., 0., 0., 0.], diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 839eea3..ba6693e 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -280,6 +280,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 1edf86a..ed52496 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -269,6 +269,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9d77812..3101425 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -726,6 +726,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; use crate::metrics::accuracy; + #[cfg(feature = "serde")] use crate::svm::*; #[test] @@ -814,6 +815,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn svc_serde() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/svm/svr.rs b/src/svm/svr.rs index cbb1ea5..b160cca 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -533,6 +533,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; use crate::metrics::mean_squared_error; + #[cfg(feature = "serde")] use crate::svm::*; #[test] @@ -569,6 +570,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn svr_serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 7575a5a..ba79d52 100644 --- 
a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -745,6 +745,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[1., 1., 1., 0.], diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index d1292db..307d357 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -581,6 +581,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159., 107.608, 1947., 60.323], From fd00bc3780a5d4e289d6689179ebb80798d74e77 Mon Sep 17 00:00:00 2001 From: Ben Cross Date: Mon, 18 Jan 2021 20:50:49 +0000 Subject: [PATCH 35/78] Run the pipeline with --all-features enabled --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a931ff5..6cdd0e4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,10 +23,10 @@ jobs: command: cargo fmt -- --check - run: name: Stable Build - command: cargo build --features "nalgebra-bindings ndarray-bindings" + command: cargo build --all-features - run: name: Test - command: cargo test --features "nalgebra-bindings ndarray-bindings" + command: cargo test --all-features - save_cache: key: project-cache paths: From bd5fbb63b155af9400e690d625a703fee9ff08f6 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 20 Jan 2021 16:55:58 -0800 Subject: [PATCH 36/78] feat: adds a new parameter to the logistic regression: solver --- src/linear/logistic_regression.rs | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index cbdef77..a23c15a 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -68,10 +68,21 @@ use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult}; use 
crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] +/// Solver options for Logistic regression. Right now only LBFGS solver is supported. +pub enum LogisticRegressionSolverName { + /// Limited-memory Broyden–Fletcher–Goldfarb–Shanno method, see [LBFGS paper](http://users.iems.northwestern.edu/~nocedal/lbfgsb.html) + LBFGS, +} + /// Logistic Regression parameters #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] -pub struct LogisticRegressionParameters {} +pub struct LogisticRegressionParameters { + /// Solver to use for estimation of regression coefficients. + pub solver: LogisticRegressionSolverName, +} /// Logistic Regression #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -105,9 +116,19 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { phantom: PhantomData<&'a T>, } +impl LogisticRegressionParameters { + /// Solver to use for estimation of regression coefficients. + pub fn with_solver(mut self, solver: LogisticRegressionSolverName) -> Self { + self.solver = solver; + self + } +} + impl Default for LogisticRegressionParameters { fn default() -> Self { - LogisticRegressionParameters {} + LogisticRegressionParameters { + solver: LogisticRegressionSolverName::LBFGS, + } } } From 40a92ee4dbaeb6a485555e67d2c864a2a42e3b5c Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 21 Jan 2021 14:37:34 -0800 Subject: [PATCH 37/78] feat: adds l2 regularization penalty to the Logistic Regression --- src/linear/logistic_regression.rs | 134 ++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 16 deletions(-) diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index a23c15a..2a12c19 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -54,7 +54,6 @@ //! 
use std::cmp::Ordering; use std::fmt::Debug; -use std::marker::PhantomData; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -79,9 +78,11 @@ pub enum LogisticRegressionSolverName { /// Logistic Regression parameters #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] -pub struct LogisticRegressionParameters { +pub struct LogisticRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: LogisticRegressionSolverName, + /// Regularization parameter. + pub alpha: T, } /// Logistic Regression @@ -113,21 +114,27 @@ trait ObjectiveFunction> { struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { x: &'a M, y: Vec, - phantom: PhantomData<&'a T>, + alpha: T, } -impl LogisticRegressionParameters { +impl LogisticRegressionParameters { /// Solver to use for estimation of regression coefficients. pub fn with_solver(mut self, solver: LogisticRegressionSolverName) -> Self { self.solver = solver; self } + /// Regularization parameter. 
+ pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } } -impl Default for LogisticRegressionParameters { +impl Default for LogisticRegressionParameters { fn default() -> Self { LogisticRegressionParameters { solver: LogisticRegressionSolverName::LBFGS, + alpha: T::zero(), } } } @@ -156,13 +163,22 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction { fn f(&self, w_bias: &M) -> T { let mut f = T::zero(); - let (n, _) = self.x.shape(); + let (n, p) = self.x.shape(); for i in 0..n { let wx = BinaryObjectiveFunction::partial_dot(w_bias, self.x, 0, i); f += wx.ln_1pe() - (T::from(self.y[i]).unwrap()) * wx; } + if self.alpha > T::zero() { + let mut w_squared = T::zero(); + for i in 0..p { + let w = w_bias.get(0, i); + w_squared += w * w; + } + f += T::half() * self.alpha * w_squared; + } + f } @@ -180,6 +196,13 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction } g.set(0, p, g.get(0, p) - dyi); } + + if self.alpha > T::zero() { + for i in 0..p { + let w = w_bias.get(0, i); + g.set(0, i, g.get(0, i) + self.alpha * w); + } + } } } @@ -187,7 +210,7 @@ struct MultiClassObjectiveFunction<'a, T: RealNumber, M: Matrix> { x: &'a M, y: Vec, k: usize, - phantom: PhantomData<&'a T>, + alpha: T, } impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction @@ -209,6 +232,17 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction f -= prob.get(0, self.y[i]).ln(); } + if self.alpha > T::zero() { + let mut w_squared = T::zero(); + for i in 0..self.k { + for j in 0..p { + let wi = w_bias.get(0, i * (p + 1) + j); + w_squared += wi * wi; + } + } + f += T::half() * self.alpha * w_squared; + } + f } @@ -239,16 +273,27 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction g.set(0, j * (p + 1) + p, g.get(0, j * (p + 1) + p) - yi); } } + + if self.alpha > T::zero() { + for i in 0..self.k { + for j in 0..p { + let pos = i * (p + 1); + let wi = w.get(0, pos + j); + g.set(0, pos + j, g.get(0, pos + j) + self.alpha * wi); + } + } + } } } -impl> SupervisedEstimator 
+impl> + SupervisedEstimator> for LogisticRegression { fn fit( x: &M, y: &M::RowVector, - parameters: LogisticRegressionParameters, + parameters: LogisticRegressionParameters, ) -> Result { LogisticRegression::fit(x, y, parameters) } @@ -268,7 +313,7 @@ impl> LogisticRegression { pub fn fit( x: &M, y: &M::RowVector, - _parameters: LogisticRegressionParameters, + parameters: LogisticRegressionParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); @@ -302,7 +347,7 @@ impl> LogisticRegression { let objective = BinaryObjectiveFunction { x, y: yi, - phantom: PhantomData, + alpha: parameters.alpha, }; let result = LogisticRegression::minimize(x0, objective); @@ -324,7 +369,7 @@ impl> LogisticRegression { x, y: yi, k, - phantom: PhantomData, + alpha: parameters.alpha, }; let result = LogisticRegression::minimize(x0, objective); @@ -431,9 +476,9 @@ mod tests { let objective = MultiClassObjectiveFunction { x: &x, - y, + y: y.clone(), k: 3, - phantom: PhantomData, + alpha: 0.0, }; let mut g: DenseMatrix = DenseMatrix::zeros(1, 9); @@ -454,6 +499,24 @@ mod tests { ])); assert!((f - 408.0052230582765).abs() < std::f64::EPSILON); + + let objective_reg = MultiClassObjectiveFunction { + x: &x, + y: y.clone(), + k: 3, + alpha: 1.0, + }; + + let f = objective_reg.f(&DenseMatrix::row_vector_from_array(&[ + 1., 2., 3., 4., 5., 6., 7., 8., 9., + ])); + assert!((f - 487.5052).abs() < 1e-4); + + objective_reg.df( + &mut g, + &DenseMatrix::row_vector_from_array(&[1., 2., 3., 4., 5., 6., 7., 8., 9.]), + ); + assert!((g.get(0, 0).abs() - 32.0).abs() < 1e-4); } #[test] @@ -480,8 +543,8 @@ mod tests { let objective = BinaryObjectiveFunction { x: &x, - y, - phantom: PhantomData, + y: y.clone(), + alpha: 0.0, }; let mut g: DenseMatrix = DenseMatrix::zeros(1, 3); @@ -496,6 +559,20 @@ mod tests { let f = objective.f(&DenseMatrix::row_vector_from_array(&[1., 2., 3.])); assert!((f - 59.76994756647412).abs() < std::f64::EPSILON); + + 
let objective_reg = BinaryObjectiveFunction { + x: &x, + y: y.clone(), + alpha: 1.0, + }; + + let f = objective_reg.f(&DenseMatrix::row_vector_from_array(&[1., 2., 3.])); + assert!((f - 62.2699).abs() < 1e-4); + + objective_reg.df(&mut g, &DenseMatrix::row_vector_from_array(&[1., 2., 3.])); + assert!((g.get(0, 0) - 27.0511).abs() < 1e-4); + assert!((g.get(0, 1) - 12.239).abs() < 1e-4); + assert!((g.get(0, 2) - 3.8693).abs() < 1e-4); } #[test] @@ -547,6 +624,15 @@ mod tests { let y_hat = lr.predict(&x).unwrap(); assert!(accuracy(&y_hat, &y) > 0.9); + + let lr_reg = LogisticRegression::fit( + &x, + &y, + LogisticRegressionParameters::default().with_alpha(10.0), + ) + .unwrap(); + + assert!(lr_reg.coefficients().abs().sum() < lr.coefficients().abs().sum()); } #[test] @@ -561,6 +647,15 @@ mod tests { let y_hat = lr.predict(&x).unwrap(); assert!(accuracy(&y_hat, &y) > 0.9); + + let lr_reg = LogisticRegression::fit( + &x, + &y, + LogisticRegressionParameters::default().with_alpha(10.0), + ) + .unwrap(); + + assert!(lr_reg.coefficients().abs().sum() < lr.coefficients().abs().sum()); } #[test] @@ -622,6 +717,12 @@ mod tests { ]; let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); + let lr_reg = LogisticRegression::fit( + &x, + &y, + LogisticRegressionParameters::default().with_alpha(1.0), + ) + .unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -632,5 +733,6 @@ mod tests { .sum(); assert!(error <= 1.0); + assert!(lr_reg.coefficients().abs().sum() < lr.coefficients().abs().sum()); } } From 991631876eb0bd55b6acf4fdecd85181b985de63 Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 25 Jan 2021 23:33:48 -0800 Subject: [PATCH 38/78] build one-hot encoder --- src/lib.rs | 2 + src/preprocessing/mod.rs | 1 + src/preprocessing/target_encoders.rs | 209 +++++++++++++++++++++++++++ 3 files changed, 212 insertions(+) create mode 100644 src/preprocessing/mod.rs create mode 100644 src/preprocessing/target_encoders.rs diff --git a/src/lib.rs b/src/lib.rs index 
7d2b089..c5802d2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -95,3 +95,5 @@ pub(crate) mod optimization; pub mod svm; /// Supervised tree-based learning methods pub mod tree; +/// Preprocessing utilities +pub mod preprocessing; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs new file mode 100644 index 0000000..e4b5190 --- /dev/null +++ b/src/preprocessing/mod.rs @@ -0,0 +1 @@ +pub mod target_encoders; \ No newline at end of file diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs new file mode 100644 index 0000000..1894361 --- /dev/null +++ b/src/preprocessing/target_encoders.rs @@ -0,0 +1,209 @@ + +#![allow(clippy::ptr_arg)] +//! # Encode categorical features as a one-hot or multi-class numeric array. +//! + +use std::hash::Hash; +use std::collections::HashMap; + +use crate::math::num::RealNumber; +use crate::error::Failed; + + +/// Turn a collection of label types into a one-hot vectors. +/// This struct encodes single class per exmample +pub struct OneHotEncoder { + label_to_idx: HashMap, + labels: Vec, + num_classes: usize + +} + +enum LabelDefinition { + LabelToClsNumMap(HashMap), + PositionalLabel(Vec), +} + +/// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) +pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { + let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); + (0..num_labels).map(|idx| if idx == label_idx {pos.clone()} else {neg.clone()}).collect() + +} + +impl<'a, T: Hash + Eq + Clone> OneHotEncoder +{ + + /// Fit an encoder to a lable list + /// + /// Label numbers will be assigned in the order they are encountered + /// Example: + /// ``` + /// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; + /// let enc = OneHotEncoder::::fit(&fake_labels[0..]); + /// let oh_vec = enc.transform_one(&1); // notice that 1 is actually a zero-th positional label + /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]); + /// ``` 
+ pub fn fit(labels: &[T]) -> Self { + + let mut label_map: HashMap = HashMap::new(); + let mut class_num = 0usize; + let mut unique_lables: Vec = Vec::new(); + + for l in labels + { + if !label_map.contains_key(&l) { + label_map.insert(l.clone(), class_num); + unique_lables.push(l.clone()); + class_num += 1; + } + } + Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} + } + + + /// Build an encoder from a predefined (label -> class number) map + /// + /// Definition example: + /// ``` + /// let fake_label_map: HashMap<&str, u32> = vec![("background",0), ("dog", 1), ("cat", 2)] + /// .into_iter() + /// .collect(); + /// let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); + /// ``` + pub fn from_label_map(labels: HashMap) -> Self { + Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels)) + } + /// Build an encoder from a predefined positional label-class num vector + /// + /// Definition example: + /// ``` + /// let fake_label_pos = vec!["background","dog", "cat"]; + /// let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + /// ``` + pub fn from_positional_label_vec(labels: Vec) -> Self { + Self::from_label_def(LabelDefinition::PositionalLabel(labels)) + } + + /// Transform a slice of label types into one-hot vectors + /// None is returned if unknown label is encountered + pub fn transform(&self, labels: &[T]) -> Vec>> { + labels + .into_iter() + .map(|l| self.transform_one(l)) + .collect() + } + + /// Transform a single label type into a one-hot vector + pub fn transform_one(&self, label: &T) -> Option> { + match self.label_to_idx.get(label) { + None => None, + Some(&idx) => Some(make_one_hot(idx, self.num_classes)) + } + } + + /// Invert one-hot vector, back to the label + ///``` + /// let lab = enc.invert_one(res)?; // e.g. res = [0,1,0,0...] 
"dog" == class 1 + /// assert_eq!(lab, "dog") + /// ``` + pub fn invert_one(&self, one_hot: Vec) -> Result { + let pos = U::from_f64(1f64).unwrap(); + + let s: Vec = one_hot + .into_iter() + .enumerate() + .filter_map(|(idx, v)| if v == pos {Some(idx)} else {None}) + .collect(); + + if s.len() == 1 { + let idx = s[0]; + return Ok(self.labels[idx].clone()) + } + let pos_entries = format!("Expected a single positive entry, {} entires found", s.len()); + Err(Failed::transform(&pos_entries[..])) + } + + + fn from_label_def(labels: LabelDefinition) -> Self { + + let (label_map, class_num, unique_lables) = match labels { + LabelDefinition::LabelToClsNumMap(h) => { + let mut _unique_lab: Vec<(T, usize)> = h.iter().map(|(k,v)| (k.clone(), v.clone())).collect(); + _unique_lab.sort_by(|a,b| a.1.cmp(&b.1)); + let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); + (h, unique_lab.len(), unique_lab) + }, + LabelDefinition::PositionalLabel(unique_lab) => { + let h: HashMap = unique_lab.iter().enumerate().map(|(v, k)| (k.clone(),v)).collect(); + (h, unique_lab.len(), unique_lab) + } + }; + Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} + + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_labels() { + let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; + let enc = OneHotEncoder::::fit(&fake_labels[0..]); + let oh_vec = match enc.transform_one(&1) { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![1f64,0f64,0f64,0f64,0f64]; + assert_eq!(oh_vec, res); + } + + + fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str>{ + let fake_label_pos = vec!["background","dog", "cat"]; + let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + enc + } + + #[test] + fn label_map_and_vec() { + let fake_label_map: HashMap<&str, usize> = vec![("background",0), ("dog", 1), ("cat", 2)].into_iter().collect(); + let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); + let oh_vec 
= match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![0f64, 1f64,0f64]; + assert_eq!(oh_vec, res); + } + + #[test] + fn positional_labels_vec() { + let enc = build_fake_str_enc(); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![0f64, 1f64,0f64]; + assert_eq!(oh_vec, res); + } + + #[test] + fn invert_label_test() { + let enc = build_fake_str_enc(); + let res: Vec = vec![0f64, 1f64,0f64]; + let lab = enc.invert_one(res).unwrap(); + assert_eq!(lab, "dog"); + + if let Err(e) = enc.invert_one(vec![0.0, 0.0,0.0]) { + let pos_entries = format!("Expected a single positive entry, 0 entires found"); + assert_eq!(e, Failed::transform(&pos_entries[..])); + }; + } + + + +} \ No newline at end of file From dbca6d43cede008cd6be5cb8c60e210c6f25994f Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 25 Jan 2021 23:55:43 -0800 Subject: [PATCH 39/78] fmt fix --- src/lib.rs | 4 +- src/preprocessing/mod.rs | 2 +- src/preprocessing/target_encoders.rs | 148 ++++++++++++++------------- 3 files changed, 79 insertions(+), 75 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c5802d2..6e6205f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -91,9 +91,9 @@ pub mod naive_bayes; /// Supervised neighbors-based learning methods pub mod neighbors; pub(crate) mod optimization; +/// Preprocessing utilities +pub mod preprocessing; /// Support Vector Machines pub mod svm; /// Supervised tree-based learning methods pub mod tree; -/// Preprocessing utilities -pub mod preprocessing; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index e4b5190..c70f7dc 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1 +1 @@ -pub mod target_encoders; \ No newline at end of file +pub mod target_encoders; diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 1894361..81cbdbd 100644 --- 
a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -1,22 +1,18 @@ - #![allow(clippy::ptr_arg)] //! # Encode categorical features as a one-hot or multi-class numeric array. -//! +//! -use std::hash::Hash; -use std::collections::HashMap; - -use crate::math::num::RealNumber; use crate::error::Failed; - +use crate::math::num::RealNumber; +use std::collections::HashMap; +use std::hash::Hash; /// Turn a collection of label types into a one-hot vectors. /// This struct encodes single class per exmample pub struct OneHotEncoder { label_to_idx: HashMap, labels: Vec, - num_classes: usize - + num_classes: usize, } enum LabelDefinition { @@ -27,13 +23,18 @@ enum LabelDefinition { /// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); - (0..num_labels).map(|idx| if idx == label_idx {pos.clone()} else {neg.clone()}).collect() - + (0..num_labels) + .map(|idx| { + if idx == label_idx { + pos.clone() + } else { + neg.clone() + } + }) + .collect() } -impl<'a, T: Hash + Eq + Clone> OneHotEncoder -{ - +impl<'a, T: Hash + Eq + Clone> OneHotEncoder { /// Fit an encoder to a lable list /// /// Label numbers will be assigned in the order they are encountered @@ -45,23 +46,24 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]); /// ``` pub fn fit(labels: &[T]) -> Self { - let mut label_map: HashMap = HashMap::new(); let mut class_num = 0usize; let mut unique_lables: Vec = Vec::new(); - for l in labels - { + for l in labels { if !label_map.contains_key(&l) { label_map.insert(l.clone(), class_num); unique_lables.push(l.clone()); class_num += 1; } } - Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} + Self { + label_to_idx: label_map, + num_classes: class_num, + labels: unique_lables, + } } - /// 
Build an encoder from a predefined (label -> class number) map /// /// Definition example: @@ -84,21 +86,18 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder pub fn from_positional_label_vec(labels: Vec) -> Self { Self::from_label_def(LabelDefinition::PositionalLabel(labels)) } - + /// Transform a slice of label types into one-hot vectors - /// None is returned if unknown label is encountered + /// None is returned if unknown label is encountered pub fn transform(&self, labels: &[T]) -> Vec>> { - labels - .into_iter() - .map(|l| self.transform_one(l)) - .collect() + labels.into_iter().map(|l| self.transform_one(l)).collect() } /// Transform a single label type into a one-hot vector pub fn transform_one(&self, label: &T) -> Option> { match self.label_to_idx.get(label) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_classes)) + Some(&idx) => Some(make_one_hot(idx, self.num_classes)), } } @@ -111,99 +110,104 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot - .into_iter() - .enumerate() - .filter_map(|(idx, v)| if v == pos {Some(idx)} else {None}) - .collect(); - + .into_iter() + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + if s.len() == 1 { let idx = s[0]; - return Ok(self.labels[idx].clone()) + return Ok(self.labels[idx].clone()); } - let pos_entries = format!("Expected a single positive entry, {} entires found", s.len()); + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); Err(Failed::transform(&pos_entries[..])) } - fn from_label_def(labels: LabelDefinition) -> Self { - let (label_map, class_num, unique_lables) = match labels { LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(T, usize)> = h.iter().map(|(k,v)| (k.clone(), v.clone())).collect(); - _unique_lab.sort_by(|a,b| a.1.cmp(&b.1)); + let mut _unique_lab: Vec<(T, usize)> = + h.iter().map(|(k, v)| (k.clone(), 
v.clone())).collect(); + _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); (h, unique_lab.len(), unique_lab) - }, + } LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab.iter().enumerate().map(|(v, k)| (k.clone(),v)).collect(); + let h: HashMap = unique_lab + .iter() + .enumerate() + .map(|(v, k)| (k.clone(), v)) + .collect(); (h, unique_lab.len(), unique_lab) } }; - Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} - + Self { + label_to_idx: label_map, + num_classes: class_num, + labels: unique_lables, + } } } - #[cfg(test)] mod tests { use super::*; #[test] fn from_labels() { - let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; + let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let enc = OneHotEncoder::::fit(&fake_labels[0..]); let oh_vec = match enc.transform_one(&1) { None => panic!("Wrong labels"), - Some(v) => v + Some(v) => v, }; - let res: Vec = vec![1f64,0f64,0f64,0f64,0f64]; + let res: Vec = vec![1f64, 0f64, 0f64, 0f64, 0f64]; assert_eq!(oh_vec, res); } - - fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str>{ - let fake_label_pos = vec!["background","dog", "cat"]; + fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { + let fake_label_pos = vec!["background", "dog", "cat"]; let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); enc } #[test] fn label_map_and_vec() { - let fake_label_map: HashMap<&str, usize> = vec![("background",0), ("dog", 1), ("cat", 2)].into_iter().collect(); - let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - let oh_vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), - Some(v) => v - }; - let res: Vec = vec![0f64, 1f64,0f64]; - assert_eq!(oh_vec, res); - } - + let fake_label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + .into_iter() + .collect(); + let enc = 
OneHotEncoder::<&str>::from_label_map(fake_label_map); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v, + }; + let res: Vec = vec![0f64, 1f64, 0f64]; + assert_eq!(oh_vec, res); + } + #[test] fn positional_labels_vec() { - let enc = build_fake_str_enc(); - let oh_vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), - Some(v) => v - }; - let res: Vec = vec![0f64, 1f64,0f64]; - assert_eq!(oh_vec, res); + let enc = build_fake_str_enc(); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v, + }; + let res: Vec = vec![0.0, 1.0, 0.0]; + assert_eq!(oh_vec, res); } #[test] fn invert_label_test() { let enc = build_fake_str_enc(); - let res: Vec = vec![0f64, 1f64,0f64]; + let res: Vec = vec![0.0, 1.0, 0.0]; let lab = enc.invert_one(res).unwrap(); assert_eq!(lab, "dog"); - - if let Err(e) = enc.invert_one(vec![0.0, 0.0,0.0]) { + if let Err(e) = enc.invert_one(vec![0.0, 0.0, 0.0]) { let pos_entries = format!("Expected a single positive entry, 0 entires found"); assert_eq!(e, Failed::transform(&pos_entries[..])); }; } - - - -} \ No newline at end of file +} From 139bbae4564347cc8b44403c89baad14647ff37f Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 00:01:20 -0800 Subject: [PATCH 40/78] cliipy fixes --- src/preprocessing/target_encoders.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 81cbdbd..c282a4d 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -26,9 +26,9 @@ pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec OneHotEncoder { /// Transform a slice of label types into one-hot vectors /// None is returned if unknown label is encountered pub fn transform(&self, labels: &[T]) -> Vec>> { - labels.into_iter().map(|l| self.transform_one(l)).collect() + labels.iter().map(|l| 
self.transform_one(l)).collect() } /// Transform a single label type into a one-hot vector @@ -130,7 +130,7 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder { let (label_map, class_num, unique_lables) = match labels { LabelDefinition::LabelToClsNumMap(h) => { let mut _unique_lab: Vec<(T, usize)> = - h.iter().map(|(k, v)| (k.clone(), v.clone())).collect(); + h.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); (h, unique_lab.len(), unique_lab) From 0df797cbae484e50c751910c9c726956ae1a2848 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 00:04:15 -0800 Subject: [PATCH 41/78] fmt fix --- src/preprocessing/target_encoders.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index c282a4d..44a5c05 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -24,13 +24,7 @@ enum LabelDefinition { pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); (0..num_labels) - .map(|idx| { - if idx == label_idx { - pos - } else { - neg - } - }) + .map(|idx| if idx == label_idx { pos } else { neg }) .collect() } From 7daf536aebff1c1d73118bd7d9dfc3bf70cc6b41 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 09:15:24 -0800 Subject: [PATCH 42/78] fixed docs --- src/preprocessing/target_encoders.rs | 125 ++++++++++++++++----------- 1 file changed, 76 insertions(+), 49 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 44a5c05..76f4c92 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -7,11 +7,47 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; -/// Turn a collection of label types into a one-hot 
vectors. +/// Turn a collection of `LabelType`s into a one-hot vectors. /// This struct encodes single class per exmample -pub struct OneHotEncoder { - label_to_idx: HashMap, - labels: Vec, +/// +/// You can fit a label enumeration by passing a collection of labels. +/// Label numbers will be assigned in the order they are encountered +/// +/// Example: +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// +/// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; +/// let enc = OneHotEncoder::::fit(&fake_labels[..]); +/// let oh_vec: Vec = enc.transform_one(&1).unwrap(); +/// // notice that 1 is actually a zero-th positional label +/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); +/// ``` +/// +/// You can also pass a predefined label enumeration such as a hashmap `HashMap` or a vector `Vec` +/// +/// +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// +/// let label_map: HashMap<&str, usize> = +/// vec![("cat", 2), ("background",0), ("dog", 1)] +/// .into_iter() +/// .collect(); +/// let label_vec = vec!["background", "dog", "cat"]; +/// +/// let enc_lv = OneHotEncoder::<&str>::from_positional_label_vec(label_vec); +/// let enc_lm = OneHotEncoder::<&str>::from_label_map(label_map); +/// +/// // ["background", "dog", "cat"] +/// println!("{:?}", enc_lv.get_labels()); +/// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) +/// ``` +pub struct OneHotEncoder { + label_to_idx: HashMap, + labels: Vec, num_classes: usize, } @@ -28,21 +64,12 @@ pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec OneHotEncoder { +impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { /// Fit an encoder to a lable list - /// - /// Label numbers will be assigned in the order they are encountered - /// Example: - /// ``` - /// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; - /// let enc = 
OneHotEncoder::::fit(&fake_labels[0..]); - /// let oh_vec = enc.transform_one(&1); // notice that 1 is actually a zero-th positional label - /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]); - /// ``` - pub fn fit(labels: &[T]) -> Self { - let mut label_map: HashMap = HashMap::new(); + pub fn fit(labels: &[LabelType]) -> Self { + let mut label_map: HashMap = HashMap::new(); let mut class_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + let mut unique_lables: Vec = Vec::new(); for l in labels { if !label_map.contains_key(&l) { @@ -59,48 +86,35 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder { } /// Build an encoder from a predefined (label -> class number) map - /// - /// Definition example: - /// ``` - /// let fake_label_map: HashMap<&str, u32> = vec![("background",0), ("dog", 1), ("cat", 2)] - /// .into_iter() - /// .collect(); - /// let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - /// ``` - pub fn from_label_map(labels: HashMap) -> Self { + pub fn from_label_map(labels: HashMap) -> Self { Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels)) } /// Build an encoder from a predefined positional label-class num vector - /// - /// Definition example: - /// ``` - /// let fake_label_pos = vec!["background","dog", "cat"]; - /// let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); - /// ``` - pub fn from_positional_label_vec(labels: Vec) -> Self { + pub fn from_positional_label_vec(labels: Vec) -> Self { Self::from_label_def(LabelDefinition::PositionalLabel(labels)) } /// Transform a slice of label types into one-hot vectors /// None is returned if unknown label is encountered - pub fn transform(&self, labels: &[T]) -> Vec>> { + pub fn transform(&self, labels: &[LabelType]) -> Vec>> { labels.iter().map(|l| self.transform_one(l)).collect() } /// Transform a single label type into a one-hot vector - pub fn transform_one(&self, label: &T) -> Option> { + pub fn transform_one(&self, label: &LabelType) -> 
Option> { match self.label_to_idx.get(label) { None => None, Some(&idx) => Some(make_one_hot(idx, self.num_classes)), } } + /// Get labels ordered by encoder's label enumeration + pub fn get_labels(&self) -> &Vec { + &self.labels + } + /// Invert one-hot vector, back to the label - ///``` - /// let lab = enc.invert_one(res)?; // e.g. res = [0,1,0,0...] "dog" == class 1 - /// assert_eq!(lab, "dog") - /// ``` - pub fn invert_one(&self, one_hot: Vec) -> Result { + pub fn invert_one(&self, one_hot: Vec) -> Result { let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot @@ -120,17 +134,17 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder { Err(Failed::transform(&pos_entries[..])) } - fn from_label_def(labels: LabelDefinition) -> Self { + fn from_label_def(labels: LabelDefinition) -> Self { let (label_map, class_num, unique_lables) = match labels { LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(T, usize)> = + let mut _unique_lab: Vec<(LabelType, usize)> = h.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); - let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); + let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); (h, unique_lab.len(), unique_lab) } LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab + let h: HashMap = unique_lab .iter() .enumerate() .map(|(v, k)| (k.clone(), v)) @@ -154,7 +168,7 @@ mod tests { fn from_labels() { let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let enc = OneHotEncoder::::fit(&fake_labels[0..]); - let oh_vec = match enc.transform_one(&1) { + let oh_vec: Vec = match enc.transform_one(&1) { None => panic!("Wrong labels"), Some(v) => v, }; @@ -170,11 +184,11 @@ mod tests { #[test] fn label_map_and_vec() { - let fake_label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + let label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() 
.collect(); - let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - let oh_vec = match enc.transform_one(&"dog") { + let enc = OneHotEncoder::<&str>::from_label_map(label_map); + let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong labels"), Some(v) => v, }; @@ -185,7 +199,7 @@ mod tests { #[test] fn positional_labels_vec() { let enc = build_fake_str_enc(); - let oh_vec = match enc.transform_one(&"dog") { + let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong labels"), Some(v) => v, }; @@ -204,4 +218,17 @@ mod tests { assert_eq!(e, Failed::transform(&pos_entries[..])); }; } + + #[test] + fn test_many_labels() { + let enc = build_fake_str_enc(); + let res: Vec>> = enc.transform(&["dog", "cat", "fish", "background"]); + let v = vec![ + Some(vec![0.0, 1.0, 0.0]), + Some(vec![0.0, 0.0, 1.0]), + None, + Some(vec![1.0, 0.0, 0.0]), + ]; + assert_eq!(res, v) + } } From 9833a2f8514bea27e3913bdf144d00637751ec61 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 10:03:33 -0800 Subject: [PATCH 43/78] codecov-fix --- src/preprocessing/target_encoders.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 76f4c92..56a97ed 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -1,6 +1,5 @@ #![allow(clippy::ptr_arg)] //! # Encode categorical features as a one-hot or multi-class numeric array. -//! use crate::error::Failed; use crate::math::num::RealNumber; From 244a72444520cc6ac832779a44538fc93f6b68e3 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:03:13 -0800 Subject: [PATCH 44/78] Genertic make_one_hot. 
Current implementation returns BaseVector of RealNumber --- src/preprocessing/target_encoders.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 56a97ed..3f2592b 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -6,7 +6,13 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; -/// Turn a collection of `LabelType`s into a one-hot vectors. +/// Make a one-hot encoded vector from a categorical variable +pub fn make_one_hot>(label_idx: usize, num_labels: usize) -> V { + let pos = T::from_f64(1f64).unwrap(); + let mut z = V::zeros(num_labels); + z.set(label_idx, pos); + z +} /// This struct encodes single class per exmample /// /// You can fit a label enumeration by passing a collection of labels. From 19088b682a52b81ec8709fc8ec12e25624062a3c Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:06:43 -0800 Subject: [PATCH 45/78] remoe LabelDefinition, looks like unnecesery abstraction for now --- src/preprocessing/target_encoders.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 3f2592b..ff9fa6e 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -91,12 +91,31 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { } /// Build an encoder from a predefined (label -> class number) map - pub fn from_label_map(labels: HashMap) -> Self { - Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels)) + pub fn from_label_map(category_map: HashMap) -> Self { + let mut _unique_cat: Vec<(CategoryType, usize)> = + category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); + _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); + let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); + Self { + 
num_categories: categories.len(), + categories, + category_map, } + } + /// Build an encoder from a predefined positional label-class num vector - pub fn from_positional_label_vec(labels: Vec) -> Self { - Self::from_label_def(LabelDefinition::PositionalLabel(labels)) + pub fn from_positional_label_vec(categories: Vec) -> Self { + // Self::from_label_def(LabelDefinition::PositionalLabel(categories)) + let category_map: HashMap = categories + .iter() + .enumerate() + .map(|(v, k)| (k.clone(), v)) + .collect(); + Self { + num_categories: categories.len(), + category_map, + categories, + } } /// Transform a slice of label types into one-hot vectors From 6109fc5211d0ebba410e66ec8b824992e775c1d5 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:13:45 -0800 Subject: [PATCH 46/78] Renaming fit/transform for API compatibility. Also rename label to category. --- src/preprocessing/target_encoders.rs | 172 +++++++++++---------------- 1 file changed, 70 insertions(+), 102 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index ff9fa6e..a929ab6 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -2,96 +2,86 @@ //! # Encode categorical features as a one-hot or multi-class numeric array. use crate::error::Failed; +use crate::linalg::BaseVector; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable -pub fn make_one_hot>(label_idx: usize, num_labels: usize) -> V { +pub fn make_one_hot>(category_idx: usize, num_categories: usize) -> V { let pos = T::from_f64(1f64).unwrap(); - let mut z = V::zeros(num_labels); - z.set(label_idx, pos); + let mut z = V::zeros(num_categories); + z.set(category_idx, pos); z } + +/// Turn a collection of `CategoryType`s into a one-hot vectors. 
/// This struct encodes single class per exmample /// -/// You can fit a label enumeration by passing a collection of labels. -/// Label numbers will be assigned in the order they are encountered +/// You can fit_to_series a category enumeration by passing a collection of categories. +/// category numbers will be assigned in the order they are encountered /// /// Example: /// ``` /// use std::collections::HashMap; /// use smartcore::preprocessing::target_encoders::OneHotEncoder; /// -/// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; -/// let enc = OneHotEncoder::::fit(&fake_labels[..]); +/// let fake_categories: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; +/// let enc = OneHotEncoder::::fit_to_series(&fake_categories[..]); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); -/// // notice that 1 is actually a zero-th positional label +/// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); /// ``` /// -/// You can also pass a predefined label enumeration such as a hashmap `HashMap` or a vector `Vec` +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` /// /// /// ``` /// use std::collections::HashMap; /// use smartcore::preprocessing::target_encoders::OneHotEncoder; /// -/// let label_map: HashMap<&str, usize> = +/// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] /// .into_iter() /// .collect(); -/// let label_vec = vec!["background", "dog", "cat"]; +/// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = OneHotEncoder::<&str>::from_positional_label_vec(label_vec); -/// let enc_lm = OneHotEncoder::<&str>::from_label_map(label_map); +/// let enc_lv = OneHotEncoder::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = OneHotEncoder::<&str>::from_category_map(category_map); /// /// // ["background", "dog", "cat"] -/// println!("{:?}", enc_lv.get_labels()); +/// 
println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` -pub struct OneHotEncoder { - label_to_idx: HashMap, - labels: Vec, - num_classes: usize, +pub struct OneHotEncoder { + category_map: HashMap, + categories: Vec, + num_categories: usize, } -enum LabelDefinition { - LabelToClsNumMap(HashMap), - PositionalLabel(Vec), -} - -/// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) -pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { - let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); - (0..num_labels) - .map(|idx| if idx == label_idx { pos } else { neg }) - .collect() -} - -impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { +impl OneHotEncoder { /// Fit an encoder to a lable list - pub fn fit(labels: &[LabelType]) -> Self { - let mut label_map: HashMap = HashMap::new(); - let mut class_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + pub fn fit_to_series(categories: &[CategoryType]) -> Self { + let mut category_map: HashMap = HashMap::new(); + let mut category_num = 0usize; + let mut unique_lables: Vec = Vec::new(); - for l in labels { - if !label_map.contains_key(&l) { - label_map.insert(l.clone(), class_num); + for l in categories { + if !category_map.contains_key(&l) { + category_map.insert(l.clone(), category_num); unique_lables.push(l.clone()); - class_num += 1; + category_num += 1; } } Self { - label_to_idx: label_map, - num_classes: class_num, - labels: unique_lables, + category_map: category_map, + num_categories: category_num, + categories: unique_lables, } } - /// Build an encoder from a predefined (label -> class number) map - pub fn from_label_map(category_map: HashMap) -> Self { + /// Build an encoder from a predefined (category -> class number) map + pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(CategoryType, usize)> = category_map.iter().map(|(k, 
v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); @@ -100,12 +90,11 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { num_categories: categories.len(), categories, category_map, - } + } } - /// Build an encoder from a predefined positional label-class num vector - pub fn from_positional_label_vec(categories: Vec) -> Self { - // Self::from_label_def(LabelDefinition::PositionalLabel(categories)) + /// Build an encoder from a predefined positional category-class num vector + pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories .iter() .enumerate() @@ -118,27 +107,30 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { } } - /// Transform a slice of label types into one-hot vectors - /// None is returned if unknown label is encountered - pub fn transform(&self, labels: &[LabelType]) -> Vec>> { - labels.iter().map(|l| self.transform_one(l)).collect() + /// Transform a slice of category types into one-hot vectors + /// None is returned if unknown category is encountered + pub fn transfrom_series( + &self, + categories: &[CategoryType], + ) -> Vec>> { + categories.iter().map(|l| self.transform_one(l)).collect() } - /// Transform a single label type into a one-hot vector - pub fn transform_one(&self, label: &LabelType) -> Option> { - match self.label_to_idx.get(label) { + /// Transform a single category type into a one-hot vector + pub fn transform_one(&self, category: &CategoryType) -> Option> { + match self.category_map.get(category) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_classes)), + Some(&idx) => Some(make_one_hot(idx, self.num_categories)), } } - /// Get labels ordered by encoder's label enumeration - pub fn get_labels(&self) -> &Vec { - &self.labels + /// Get categories ordered by encoder's category enumeration + pub fn get_categories(&self) -> &Vec { + &self.categories } - /// Invert one-hot vector, back to the label - pub fn invert_one(&self, one_hot: 
Vec) -> Result { + /// Invert one-hot vector, back to the category + pub fn invert_one(&self, one_hot: Vec) -> Result { let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot @@ -149,7 +141,7 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { if s.len() == 1 { let idx = s[0]; - return Ok(self.labels[idx].clone()); + return Ok(self.categories[idx].clone()); } let pos_entries = format!( "Expected a single positive entry, {} entires found", @@ -157,31 +149,6 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { ); Err(Failed::transform(&pos_entries[..])) } - - fn from_label_def(labels: LabelDefinition) -> Self { - let (label_map, class_num, unique_lables) = match labels { - LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(LabelType, usize)> = - h.iter().map(|(k, v)| (k.clone(), *v)).collect(); - _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); - let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); - (h, unique_lab.len(), unique_lab) - } - LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab - .iter() - .enumerate() - .map(|(v, k)| (k.clone(), v)) - .collect(); - (h, unique_lab.len(), unique_lab) - } - }; - Self { - label_to_idx: label_map, - num_classes: class_num, - labels: unique_lables, - } - } } #[cfg(test)] @@ -189,11 +156,11 @@ mod tests { use super::*; #[test] - fn from_labels() { - let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; - let enc = OneHotEncoder::::fit(&fake_labels[0..]); + fn from_categories() { + let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; + let enc = OneHotEncoder::::fit_to_series(&fake_categories[0..]); let oh_vec: Vec = match enc.transform_one(&1) { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![1f64, 0f64, 0f64, 0f64, 0f64]; @@ -201,19 +168,19 @@ mod tests { } fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { - let fake_label_pos = vec!["background", "dog", "cat"]; - let 
enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + let fake_category_pos = vec!["background", "dog", "cat"]; + let enc = OneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); enc } #[test] - fn label_map_and_vec() { - let label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + fn category_map_and_vec() { + let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = OneHotEncoder::<&str>::from_label_map(label_map); + let enc = OneHotEncoder::<&str>::from_category_map(category_map); let oh_vec: Vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![0f64, 1f64, 0f64]; @@ -221,10 +188,10 @@ mod tests { } #[test] - fn positional_labels_vec() { + fn positional_categories_vec() { let enc = build_fake_str_enc(); let oh_vec: Vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![0.0, 1.0, 0.0]; @@ -244,9 +211,10 @@ mod tests { } #[test] - fn test_many_labels() { + fn test_many_categorys() { let enc = build_fake_str_enc(); - let res: Vec>> = enc.transform(&["dog", "cat", "fish", "background"]); + let res: Vec>> = + enc.transfrom_series(&["dog", "cat", "fish", "background"]); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 408b97d8aaa56ce72375f934f8cc56721962ee5b Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 19:31:14 -0800 Subject: [PATCH 47/78] Rename series encoder and move to separate module file --- src/preprocessing/mod.rs | 3 +- .../{target_encoders.rs => series_encoder.rs} | 50 +++++++++++-------- 2 files changed, 32 insertions(+), 21 deletions(-) rename src/preprocessing/{target_encoders.rs => series_encoder.rs} (80%) diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index c70f7dc..4534c6d 100644 
--- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1 +1,2 @@ -pub mod target_encoders; +pub mod categorical_encoders; +pub mod series_encoder; \ No newline at end of file diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/series_encoder.rs similarity index 80% rename from src/preprocessing/target_encoders.rs rename to src/preprocessing/series_encoder.rs index a929ab6..132d160 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/series_encoder.rs @@ -1,14 +1,17 @@ #![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot or multi-class numeric array. +//! # Encode categorical features as a one-hot numeric array. use crate::error::Failed; -use crate::linalg::BaseVector; +use crate::linalg::{BaseVector, Matrix}; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable -pub fn make_one_hot>(category_idx: usize, num_categories: usize) -> V { +pub fn make_one_hot>( + category_idx: usize, + num_categories: usize, +) -> V { let pos = T::from_f64(1f64).unwrap(); let mut z = V::zeros(num_categories); z.set(category_idx, pos); @@ -18,16 +21,17 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// Turn a collection of `CategoryType`s into a one-hot vectors. /// This struct encodes single class per exmample /// -/// You can fit_to_series a category enumeration by passing a collection of categories. +/// You can fit_to_iter a category enumeration by passing an iterator of categories. 
/// category numbers will be assigned in the order they are encountered /// /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; /// -/// let fake_categories: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; -/// let enc = OneHotEncoder::::fit_to_series(&fake_categories[..]); +/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; +/// let it = fake_categories.iter().map(|&a| a); +/// let enc = SeriesOneHotEncoder::::fit_to_iter(it); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); /// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); @@ -38,7 +42,7 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -46,22 +50,22 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// .collect(); /// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = OneHotEncoder::<&str>::from_positional_category_vec(category_vec); -/// let enc_lm = OneHotEncoder::<&str>::from_category_map(category_map); +/// let enc_lv = SeriesOneHotEncoder::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = SeriesOneHotEncoder::<&str>::from_category_map(category_map); /// /// // ["background", "dog", "cat"] /// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` -pub struct OneHotEncoder { +pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, - num_categories: usize, + pub num_categories: usize, } -impl OneHotEncoder { +impl SeriesOneHotEncoder { /// Fit an encoder to a 
lable list - pub fn fit_to_series(categories: &[CategoryType]) -> Self { + pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); let mut category_num = 0usize; let mut unique_lables: Vec = Vec::new(); @@ -74,7 +78,7 @@ impl OneHotEncoder { } } Self { - category_map: category_map, + category_map, num_categories: category_num, categories: unique_lables, } @@ -107,15 +111,20 @@ impl OneHotEncoder { } } + + pub fn transform_iter(&self, cat_it: impl Iterator)-> Vec>> { + cat_it.map(|l| self.transform_one(l)).collect() + } /// Transform a slice of category types into one-hot vectors /// None is returned if unknown category is encountered pub fn transfrom_series( &self, categories: &[CategoryType], ) -> Vec>> { - categories.iter().map(|l| self.transform_one(l)).collect() + self.transform_iter(categories.iter()) } + /// Transform a single category type into a one-hot vector pub fn transform_one(&self, category: &CategoryType) -> Option> { match self.category_map.get(category) { @@ -158,7 +167,8 @@ mod tests { #[test] fn from_categories() { let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; - let enc = OneHotEncoder::::fit_to_series(&fake_categories[0..]); + let it = fake_categories.iter().map(|&a| a); + let enc = SeriesOneHotEncoder::::fit_to_iter(it); let oh_vec: Vec = match enc.transform_one(&1) { None => panic!("Wrong categories"), Some(v) => v, @@ -167,9 +177,9 @@ mod tests { assert_eq!(oh_vec, res); } - fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { + fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> { let fake_category_pos = vec!["background", "dog", "cat"]; - let enc = OneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); + let enc = SeriesOneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); enc } @@ -178,7 +188,7 @@ mod tests { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let 
enc = OneHotEncoder::<&str>::from_category_map(category_map); + let enc = SeriesOneHotEncoder::<&str>::from_category_map(category_map); let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong categories"), Some(v) => v, From 5c400f40d258c989659daefab030efcb24cec823 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 19:36:38 -0800 Subject: [PATCH 48/78] Scaffold for turniing floats to hashable and fittinng to columns --- src/preprocessing/categorical_encoders.rs | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 src/preprocessing/categorical_encoders.rs diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs new file mode 100644 index 0000000..828eeef --- /dev/null +++ b/src/preprocessing/categorical_encoders.rs @@ -0,0 +1,27 @@ +#![allow(clippy::ptr_arg)] +//! # Encode categorical features as a one-hot numeric array. + +use crate::error::Failed; +use crate::linalg::{BaseVector, Matrix}; +use crate::math::num::RealNumber; + +use crate::preprocessing::series_encoder::SeriesOneHotEncoder; + +pub type HashableReal = u32; + +fn hashable_num(v: &T) -> HashableReal { + // gaxler: If first 32 bits are the same, assume numbers are the same for the categorical coercion + v.to_f32_bits() +} + +#[derive(Debug, Clone)] +pub struct OneHotEncoderParams { + pub categorical_param_idxs: Option>, + pub infer_categorical: bool, +} +/// Encode Categorical variavbles of data matrix to one-hot +pub struct OneHotEncoder { + series_encoders: Vec>, + categorical_param_idxs: Vec, +} + From f91b1f99425789b6d11c10941b079b4cd7150f5c Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 19:37:54 -0800 Subject: [PATCH 49/78] fit SeriesOneHotEncoders to predefined columns --- src/preprocessing/categorical_encoders.rs | 42 +++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 
828eeef..012f364 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -25,3 +25,45 @@ pub struct OneHotEncoder { categorical_param_idxs: Vec, } +impl> OneHotEncoder { + /// PlaceHolder + + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result { + match (params.categorical_param_idxs, params.infer_categorical) { + (None, false) => Err(Failed::fit( + "Must pass categorical series ids or infer flag", + )), + + (Some(idxs), true) => Err(Failed::fit( + "Ambigous parameters, got both infer and categroy ids", + )), + + (Some(idxs), false) => Ok(Self { + series_encoders: Self::build_series_encoders::(data, &idxs[..]), + categorical_param_idxs: idxs, + }), + + (None, true) => { + todo!("implement categorical auto-inference") + } + } + } + + fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { + let (nrows, _) = data.shape(); + // let mut res: Vec> = Vec::with_capacity(idxs.len()); + let mut tmp_col: Vec = Vec::with_capacity(nrows); + + let res: Vec> = idxs + .iter() + .map(|&idx| { + data.copy_col_as_vec(idx, &mut tmp_col); + let hashable_col = tmp_col.iter().map(|v| hashable_num::(v)); + SeriesOneHotEncoder::fit_to_iter(hashable_col) + }) + .collect(); + res + } + + +} \ No newline at end of file From 3480e728af5ec16edadc8ec63946e76970eaf2d2 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 16:04:41 -0800 Subject: [PATCH 50/78] Documentation updates --- src/preprocessing/categorical_encoders.rs | 26 ++++++++++++++-- src/preprocessing/mod.rs | 5 ++- src/preprocessing/series_encoder.rs | 37 +++++++++++++++-------- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 012f364..0436787 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -1,5 +1,27 @@ -#![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot numeric array. +//! 
# One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies +//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents +//! +//! ### Usage Example +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; +//! let data = DenseMatrix::from_2d_array(&[ +//! &[1.5, 1.0, 1.5, 3.0], +//! &[1.5, 2.0, 1.5, 4.0], +//! &[1.5, 1.0, 1.5, 5.0], +//! &[1.5, 2.0, 1.5, 6.0], +//! ]); +//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); +//! // Infer number of categories from data and return a reusable encoder +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! // Transform categorical to one-hot encoded (can transform similar) +//! let oh_data = encoder.transform(&data).unwrap(); +//! // Produces the following: +//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] +//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] +//! ``` use crate::error::Failed; use crate::linalg::{BaseVector, Matrix}; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 4534c6d..c07b982 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,2 +1,5 @@ +/// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents pub mod categorical_encoders; -pub mod series_encoder; \ No newline at end of file +mod data_traits; +/// Encode a series (column, array) of categorical variables as one-hot vectors +pub mod series_encoder; diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 132d160..321f049 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -1,13 +1,21 @@ #![allow(clippy::ptr_arg)] -//! 
# Encode categorical features as a one-hot numeric array. +//! # Series Encoder +//! Encode a series of categorical features as a one-hot numeric array. use crate::error::Failed; -use crate::linalg::{BaseVector, Matrix}; +use crate::linalg::BaseVector; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable +/// +/// Example: +/// ``` +/// use smartcore::preprocessing::series_encoder::make_one_hot; +/// let one_hot: Vec = make_one_hot(2, 3); +/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); +/// ``` pub fn make_one_hot>( category_idx: usize, num_categories: usize, @@ -18,7 +26,7 @@ pub fn make_one_hot>( z } -/// Turn a collection of `CategoryType`s into a one-hot vectors. +/// Turn a collection of Hashable objects into a one-hot vectors. /// This struct encodes single class per exmample /// /// You can fit_to_iter a category enumeration by passing an iterator of categories. @@ -27,7 +35,7 @@ pub fn make_one_hot>( /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; /// /// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; /// let it = fake_categories.iter().map(|&a| a); @@ -42,7 +50,7 @@ pub fn make_one_hot>( /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -60,10 +68,11 @@ pub fn make_one_hot>( pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, + /// Number of categories for categorical variable pub num_categories: usize, } -impl SeriesOneHotEncoder { +impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder { /// Fit an encoder to a 
lable list pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); @@ -111,20 +120,24 @@ impl SeriesOneHotEncoder { } } - - pub fn transform_iter(&self, cat_it: impl Iterator)-> Vec>> { - cat_it.map(|l| self.transform_one(l)).collect() + /// Take an iterator as a series to transform + pub fn transform_iter( + &self, + cat_it: impl Iterator, + ) -> Vec>> { + cat_it.map(|l| self.transform_one(&l)).collect() } + /// Transform a slice of category types into one-hot vectors /// None is returned if unknown category is encountered pub fn transfrom_series( &self, - categories: &[CategoryType], + categories: &'a [CategoryType], ) -> Vec>> { - self.transform_iter(categories.iter()) + let v = categories.iter().map(|a| a.clone()); + self.transform_iter(v) } - /// Transform a single category type into a one-hot vector pub fn transform_one(&self, category: &CategoryType) -> Option> { match self.category_map.get(category) { From 3dc8a4283298d6622a6a0c74cd008339d6b8e9c4 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 16:05:45 -0800 Subject: [PATCH 51/78] Adapt column numbers to the new columns introduced by categorical variables. --- src/preprocessing/categorical_encoders.rs | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 0436787..31d3500 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -41,6 +41,40 @@ pub struct OneHotEncoderParams { pub categorical_param_idxs: Option>, pub infer_categorical: bool, } +/// Calculate the offset to parameters to due introduction of one-hot encoding +fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { + // This functions uses iterators and returns a vector. 
+ // In case we get a huge amount of paramenters this might be a problem + // todo: Change this such that it will return an iterator + + let cat_idx = encoded_idxs.iter().copied().chain((num_params..).take(1)); + + // Offset is constant between two categorical values, here we calculate the number of steps + // that remain constant + let repeats = cat_idx.scan(0, |a, v| { + let im = v + 1 - *a; + *a = v; + Some(im) + }); + + // Calculate the offset to parameter idx due to newly intorduced one-hot vectors + let offset_ = cat_sizes.iter().scan(0, |a, &v| { + *a = *a + v - 1; + Some(*a) + }); + let offset = (0..1).chain(offset_); + + let new_param_idxs: Vec = (0..num_params) + .zip( + repeats + .zip(offset) + .map(|(r, o)| iter::repeat(o).take(r)) + .flatten(), + ) + .map(|(idx, ofst)| idx + ofst) + .collect(); + new_param_idxs +} /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { series_encoders: Vec>, From dd39433ff8ddea5445e3b1ca27db2474c002885d Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 18:48:23 -0800 Subject: [PATCH 52/78] Categorizable trait defines logic of turning floats into hashable categorical variables. Since we only support RealNumbers for now, the idea is to treat round numbers as ordinal (or nominal if user chooses to ignore order) categories. --- src/preprocessing/data_traits.rs | 43 ++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 src/preprocessing/data_traits.rs diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs new file mode 100644 index 0000000..04b534e --- /dev/null +++ b/src/preprocessing/data_traits.rs @@ -0,0 +1,43 @@ +//! Traits to indicate that float variables can be viewed as categorical +//! 
This module assumes + +pub type CategoricalFloat = u16; + +// pub struct CategoricalFloat(u16); + +pub trait Categorizable { + type A; + + fn to_category(self) -> CategoricalFloat; + + fn is_valid(self) -> bool; + +} + +impl Categorizable for f32 { + + type A = CategoricalFloat; + + fn to_category(self) -> CategoricalFloat { + self as CategoricalFloat + } + + fn is_valid(self) -> bool { + let a = self.to_category(); + a as f32 == self + } +} + +impl Categorizable for f64 { + + type A = CategoricalFloat; + + fn to_category(self) ->CategoricalFloat { + self as CategoricalFloat + } + + fn is_valid(self) -> bool { + let a = self.to_category(); + a as f64 == self + } +} \ No newline at end of file From cd5611079caae782f148397a0ebad465aea6faef Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:29:33 -0800 Subject: [PATCH 53/78] Fit OneHotEncoder --- src/preprocessing/categorical_encoders.rs | 56 ++++++++++++++++++----- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 31d3500..794c1d6 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -75,32 +75,66 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) .collect(); new_param_idxs } +fn validate_col_is_categorical(data: &Vec) -> bool { + for v in data { + if !v.is_valid() { return false} + } + true +} /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { - series_encoders: Vec>, - categorical_param_idxs: Vec, + series_encoders: Vec>, + col_idx_categorical: Vec, } -impl> OneHotEncoder { +impl OneHotEncoder { /// PlaceHolder - pub fn fit(data: &M, params: OneHotEncoderParams) -> Result { - match (params.categorical_param_idxs, params.infer_categorical) { + pub fn fit>( + data: &M, + params: OneHotEncoderParams, + ) -> Result { + match (params.col_idx_categorical, params.infer_categorical) { 
(None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", )), - (Some(idxs), true) => Err(Failed::fit( + (Some(_idxs), true) => Err(Failed::fit( "Ambigous parameters, got both infer and categroy ids", )), - (Some(idxs), false) => Ok(Self { - series_encoders: Self::build_series_encoders::(data, &idxs[..]), - categorical_param_idxs: idxs, - }), + (Some(mut idxs), false) => { + // make sure categories have same order as data columns + idxs.sort(); + + let (nrows, _) = data.shape(); + + // col buffer to avoid allocations + let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); + + let mut res: Vec> = Vec::with_capacity(idxs.len()); + + for &idx in &idxs { + data.copy_col_as_vec(idx, &mut col_buf); + if !validate_col_is_categorical(&col_buf) { + let msg = format!("Column {} of data matrix containts non categorizable (integer) values", idx); + return Err(Failed::fit(&msg[..])) + } + let hashable_col = col_buf.iter().map(|v| v.to_category()); + res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); + } + + Ok(Self { + series_encoders: res, //Self::build_series_encoders::(data, &idxs[..]), + col_idx_categorical: idxs, + }) + } (None, true) => { - todo!("implement categorical auto-inference") + todo!("Auto-Inference for Categorical Variables not yet implemented") + } + } + } } } } From fd6b2e801479f709870921f192153c6abeeab53d Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:29:58 -0800 Subject: [PATCH 54/78] Transform matrix --- src/preprocessing/categorical_encoders.rs | 42 +++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 794c1d6..585f13a 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -135,9 +135,51 @@ impl OneHotEncoder { } } } + + /// Transform categorical variables to one-hot encoded and return a new matrix + pub fn transform>(&self, x: &M) -> 
Option { + let (nrows, p) = x.shape(); + let additional_params: Vec = self + .series_encoders + .iter() + .map(|enc| enc.num_categories) + .collect(); + + let new_param_num: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); + let mut res = M::zeros(nrows, new_param_num); + // copy old data in x to their new location + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); } } + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { + let cidx = new_col_idx[old_cidx]; + let col_iter = (0..nrows).map(|r| res.get(r, cidx).to_category()); + let sencoder = &self.series_encoders[pidx]; + let oh_series: Vec>> = sencoder.transform_iter(col_iter); + + for (row, oh_vec) in oh_series.iter().enumerate() { + match oh_vec { + None => { + // Bad value in a series causes in to be invalid + // todo: proper error handling, so user can know where the bad value is + return None; + } + Some(v) => { + // copy one hot vectors to their place in the data matrix; + for (col_ofst, &val) in v.iter().enumerate() { + res.set(row, cidx + col_ofst, val); + } } + } + } + } + Some(res) + } +} fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { let (nrows, _) = data.shape(); From c987d39d439462e5abc12cf34276d8735afb1145 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:31:09 -0800 Subject: [PATCH 55/78] tests + force Categorizable be RealNumber --- src/preprocessing/categorical_encoders.rs | 138 +++++++++++++++++----- src/preprocessing/data_traits.rs | 4 +- 2 files changed, 114 insertions(+), 28 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 585f13a..063aa5c 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -1,6 +1,8 @@ //! 
# One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies //! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents //! +//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [SeriesOneHotEncoder](../series_encoder/struct.SeriesOneHotEncoder.html) +//! //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; @@ -22,25 +24,33 @@ //! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] //! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] //! ``` +use std::iter; use crate::error::Failed; -use crate::linalg::{BaseVector, Matrix}; -use crate::math::num::RealNumber; +use crate::linalg::Matrix; +use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; use crate::preprocessing::series_encoder::SeriesOneHotEncoder; -pub type HashableReal = u32; - -fn hashable_num(v: &T) -> HashableReal { - // gaxler: If first 32 bits are the same, assume numbers are the same for the categorical coercion - v.to_f32_bits() -} - +/// OneHotEncoder Parameters #[derive(Debug, Clone)] pub struct OneHotEncoderParams { - pub categorical_param_idxs: Option>, + /// Column number that contain categorical variable + pub col_idx_categorical: Option>, + /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables pub infer_categorical: bool, } + +impl OneHotEncoderParams { + /// Generate parameters from categorical variable column numbers + pub fn from_cat_idx(categorical_params: &[usize]) -> Self { + Self { + col_idx_categorical: Some(categorical_params.to_vec()), + infer_categorical: false, + } + } +} + /// Calculate the offset to parameters to due introduction of one-hot encoding fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { // This functions uses iterators and returns a vector. 
@@ -75,12 +85,14 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) .collect(); new_param_idxs } + fn validate_col_is_categorical(data: &Vec) -> bool { for v in data { if !v.is_valid() { return false} } true } + /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { series_encoders: Vec>, @@ -167,13 +179,13 @@ impl OneHotEncoder { // Bad value in a series causes in to be invalid // todo: proper error handling, so user can know where the bad value is return None; - } + } Some(v) => { // copy one hot vectors to their place in the data matrix; for (col_ofst, &val) in v.iter().enumerate() { res.set(row, cidx + col_ofst, val); - } - } + } + } } } } @@ -181,21 +193,93 @@ impl OneHotEncoder { } } - fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { - let (nrows, _) = data.shape(); - // let mut res: Vec> = Vec::with_capacity(idxs.len()); - let mut tmp_col: Vec = Vec::with_capacity(nrows); +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::preprocessing::series_encoder::SeriesOneHotEncoder; - let res: Vec> = idxs - .iter() - .map(|&idx| { - data.copy_col_as_vec(idx, &mut tmp_col); - let hashable_col = tmp_col.iter().map(|v| hashable_num::(v)); - SeriesOneHotEncoder::fit_to_iter(hashable_col) - }) - .collect(); - res + #[test] + fn adjust_idxs() { + assert_eq!(find_new_idxs(0, &[], &[]), Vec::new()); + // [0,1,2] -> [0, 1, 1, 1, 2] + assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); } + fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { + let orig = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); -} \ No newline at end of file + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + 
+ fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { + // Categorical first and last + let orig = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 1.5, 3.0], + &[1.5, 2.0, 1.5, 4.0], + &[1.5, 1.0, 1.5, 5.0], + &[1.5, 2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + #[test] + fn hash_encode_f64_series() { + let series = vec![3.0, 1.0, 2.0, 1.0]; + let hashable_series: Vec = + series.iter().map(|v| v.to_category()).collect(); + let enc = SeriesOneHotEncoder::from_positional_category_vec(hashable_series); + let inv = enc.invert_one(vec![0.0, 0.0, 1.0]); + let orig_val: f64 = inv.unwrap().into(); + assert_eq!(orig_val, 2.0); + } + #[test] + fn test_fit() { + let (X, _) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + assert_eq!(oh_enc.series_encoders.len(), 2); + + let num_cat: Vec = oh_enc + .series_encoders + .iter() + .map(|a| a.num_categories) + .collect(); + assert_eq!(num_cat, vec![2, 4]); + } + + #[test] + fn matrix_transform_test() { + let (X, expectedX) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let nm = oh_enc.transform(&X).unwrap(); + assert_eq!(nm, expectedX); + + let (X, expectedX) = build_cat_first_and_last(); + let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let nm = oh_enc.transform(&X).unwrap(); + assert_eq!(nm, expectedX); + } +} diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs index 04b534e..16924bb 100644 --- a/src/preprocessing/data_traits.rs +++ b/src/preprocessing/data_traits.rs @@ -1,11 +1,13 @@ //! 
Traits to indicate that float variables can be viewed as categorical //! This module assumes +use crate::math::num::RealNumber; + pub type CategoricalFloat = u16; // pub struct CategoricalFloat(u16); -pub trait Categorizable { +pub trait Categorizable: RealNumber { type A; fn to_category(self) -> CategoricalFloat; From 2f03c1d6d74834d5bad990a5fd9c7cd7962fa351 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:54:42 -0800 Subject: [PATCH 56/78] module name change --- ...cal_encoders.rs => categorical_encoder.rs} | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) rename src/preprocessing/{categorical_encoders.rs => categorical_encoder.rs} (89%) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoder.rs similarity index 89% rename from src/preprocessing/categorical_encoders.rs rename to src/preprocessing/categorical_encoder.rs index 063aa5c..22cd052 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -38,7 +38,7 @@ pub struct OneHotEncoderParams { /// Column number that contain categorical variable pub col_idx_categorical: Option>, /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables - pub infer_categorical: bool, + infer_categorical: bool, } impl OneHotEncoderParams { @@ -86,14 +86,17 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) new_param_idxs } -fn validate_col_is_categorical(data: &Vec) -> bool { +fn validate_col_is_categorical(data: &[T]) -> bool { for v in data { - if !v.is_valid() { return false} + if !v.is_valid() { + return false; + } } true } /// Encode Categorical variavbles of data matrix to one-hot +#[derive(Debug, Clone)] pub struct OneHotEncoder { series_encoders: Vec>, col_idx_categorical: Vec, @@ -102,7 +105,7 @@ pub struct OneHotEncoder { impl OneHotEncoder { /// PlaceHolder - pub fn fit>( + pub fn fit>( data: &M, params: 
OneHotEncoderParams, ) -> Result { @@ -117,20 +120,24 @@ impl OneHotEncoder { (Some(mut idxs), false) => { // make sure categories have same order as data columns - idxs.sort(); + idxs.sort_unstable(); let (nrows, _) = data.shape(); // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - - let mut res: Vec> = Vec::with_capacity(idxs.len()); - + + let mut res: Vec> = + Vec::with_capacity(idxs.len()); + for &idx in &idxs { data.copy_col_as_vec(idx, &mut col_buf); if !validate_col_is_categorical(&col_buf) { - let msg = format!("Column {} of data matrix containts non categorizable (integer) values", idx); - return Err(Failed::fit(&msg[..])) + let msg = format!( + "Column {} of data matrix containts non categorizable (integer) values", + idx + ); + return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| v.to_category()); res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); @@ -149,7 +156,7 @@ impl OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Option { + pub fn transform>(&self, x: &M) -> Option { let (nrows, p) = x.shape(); let additional_params: Vec = self .series_encoders @@ -201,7 +208,7 @@ mod tests { #[test] fn adjust_idxs() { - assert_eq!(find_new_idxs(0, &[], &[]), Vec::new()); + assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); // [0,1,2] -> [0, 1, 1, 1, 2] assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); } @@ -282,4 +289,22 @@ mod tests { let nm = oh_enc.transform(&X).unwrap(); assert_eq!(nm, expectedX); } + + #[test] + fn fail_on_bad_category() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let params = OneHotEncoderParams::from_cat_idx(&[1]); + match OneHotEncoder::fit(&m, params) { + Err(_) => { + assert!(true); + } + _ => assert!(false), + } + } } From ca0816db97d7fa1426b98c5b97b548a8a89d2b12 Mon 
Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:55:04 -0800 Subject: [PATCH 57/78] Clippy fixes --- src/preprocessing/data_traits.rs | 14 ++++++-------- src/preprocessing/mod.rs | 2 +- src/preprocessing/series_encoder.rs | 3 ++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs index 16924bb..38d9e3e 100644 --- a/src/preprocessing/data_traits.rs +++ b/src/preprocessing/data_traits.rs @@ -1,11 +1,12 @@ //! Traits to indicate that float variables can be viewed as categorical -//! This module assumes +//! This module assumes use crate::math::num::RealNumber; pub type CategoricalFloat = u16; // pub struct CategoricalFloat(u16); +const ERROR_MARGIN: f64 = 0.001; pub trait Categorizable: RealNumber { type A; @@ -13,11 +14,9 @@ pub trait Categorizable: RealNumber { fn to_category(self) -> CategoricalFloat; fn is_valid(self) -> bool; - } impl Categorizable for f32 { - type A = CategoricalFloat; fn to_category(self) -> CategoricalFloat { @@ -26,20 +25,19 @@ impl Categorizable for f32 { fn is_valid(self) -> bool { let a = self.to_category(); - a as f32 == self + (a as f32 - self).abs() < (ERROR_MARGIN as f32) } } impl Categorizable for f64 { - type A = CategoricalFloat; - fn to_category(self) ->CategoricalFloat { + fn to_category(self) -> CategoricalFloat { self as CategoricalFloat } fn is_valid(self) -> bool { let a = self.to_category(); - a as f64 == self + (a as f64 - self).abs() < ERROR_MARGIN } -} \ No newline at end of file +} diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index c07b982..4a1abf3 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,5 +1,5 @@ /// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents -pub mod categorical_encoders; +pub mod categorical_encoder; mod data_traits; /// Encode a series (column, array) of categorical variables as one-hot vectors pub mod 
series_encoder; diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 321f049..438d678 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -65,6 +65,7 @@ pub fn make_one_hot>( /// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` +#[derive(Debug, Clone)] pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, @@ -134,7 +135,7 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder &self, categories: &'a [CategoryType], ) -> Vec>> { - let v = categories.iter().map(|a| a.clone()); + let v = categories.iter().cloned(); self.transform_iter(v) } From 863be5ef756518f8d213266f195a4c06b403d5fd Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 20:09:52 -0800 Subject: [PATCH 58/78] style fixes --- src/preprocessing/categorical_encoder.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 22cd052..b05a344 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -262,9 +262,9 @@ mod tests { } #[test] fn test_fit() { - let (X, _) = build_fake_matrix(); + let (x, _) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); assert_eq!(oh_enc.series_encoders.len(), 2); let num_cat: Vec = oh_enc @@ -277,17 +277,17 @@ mod tests { #[test] fn matrix_transform_test() { - let (X, expectedX) = build_fake_matrix(); + let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); - let nm = oh_enc.transform(&X).unwrap(); - assert_eq!(nm, expectedX); + let oh_enc = OneHotEncoder::fit(&x, 
params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); - let (X, expectedX) = build_cat_first_and_last(); + let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); - let nm = oh_enc.transform(&X).unwrap(); - assert_eq!(nm, expectedX); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); } #[test] From f4b5936dcfde9c3e82c4098016c2555a4e6210e2 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 20:18:52 -0800 Subject: [PATCH 59/78] fmt --- src/preprocessing/categorical_encoder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index b05a344..706670b 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -277,17 +277,17 @@ mod tests { #[test] fn matrix_transform_test() { - let (x, expected_x) = build_fake_matrix(); + let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); + assert_eq!(nm, expected_x); - let (x, expected_x) = build_cat_first_and_last(); + let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); + assert_eq!(nm, expected_x); } #[test] From a882741e1273e7e0d2742f48f84920ae759aadaf Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 1 Feb 2021 11:20:03 -0800 Subject: [PATCH 60/78] If transform fails - fail before copying the whole matrix (changed the order of coping, first do the categorical, than copy ther rest) --- 
src/preprocessing/categorical_encoder.rs | 46 ++++++++++++++++-------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 706670b..7e71119 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -156,7 +156,7 @@ impl OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Option { + pub fn transform>(&self, x: &M) -> Result { let (nrows, p) = x.shape(); let additional_params: Vec = self .series_encoders @@ -164,28 +164,24 @@ impl OneHotEncoder { .map(|enc| enc.num_categories) .collect(); - let new_param_num: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + // Eac category of size v adds v-1 params + let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); - let mut res = M::zeros(nrows, new_param_num); - // copy old data in x to their new location - for (old_p, &new_p) in new_col_idx.iter().enumerate() { - for r in 0..nrows { - let val = x.get(r, old_p); - res.set(r, new_p, val); - } - } + let mut res = M::zeros(nrows, expandws_p); + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { let cidx = new_col_idx[old_cidx]; - let col_iter = (0..nrows).map(|r| res.get(r, cidx).to_category()); + let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); let sencoder = &self.series_encoders[pidx]; let oh_series: Vec>> = sencoder.transform_iter(col_iter); for (row, oh_vec) in oh_series.iter().enumerate() { match oh_vec { None => { - // Bad value in a series causes in to be invalid - // todo: proper error handling, so user can know where the bad value is - return None; + // Since we support T types, bad value in a series causes in to be invalid + let msg = format!("At least one value in 
column {} doesn't conform to category definition", old_cidx); + return Err(Failed::transform(&msg[..])); } Some(v) => { // copy one hot vectors to their place in the data matrix; @@ -196,7 +192,27 @@ impl OneHotEncoder { } } } - Some(res) + + // copy old data in x to their new location while skipping catergorical vars (already treated) + let mut skip_idx_iter = self.col_idx_categorical.iter(); + let mut cur_skip = skip_idx_iter.next(); + + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + // if found treated varible, skip it + if let Some(&v) = cur_skip { + if v == old_p { + cur_skip = skip_idx_iter.next(); + continue; + } + } + + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); + } + } + + Ok(res) } } From 03b9f76e9f9a18910cd59c5859b21571e05bb559 Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 1 Feb 2021 11:24:20 -0800 Subject: [PATCH 61/78] Doc+Naming Improvement --- src/preprocessing/categorical_encoder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 7e71119..7a0f5d9 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -52,12 +52,12 @@ impl OneHotEncoderParams { } /// Calculate the offset to parameters to due introduction of one-hot encoding -fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { +fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec { // This functions uses iterators and returns a vector. 
// In case we get a huge amount of paramenters this might be a problem // todo: Change this such that it will return an iterator - let cat_idx = encoded_idxs.iter().copied().chain((num_params..).take(1)); + let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); // Offset is constant between two categorical values, here we calculate the number of steps // that remain constant @@ -103,8 +103,8 @@ pub struct OneHotEncoder { } impl OneHotEncoder { - /// PlaceHolder - + + /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, From 228b54baf7d04715c1e170af2be506a99caf044e Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 1 Feb 2021 11:24:50 -0800 Subject: [PATCH 62/78] fmt --- src/preprocessing/categorical_encoder.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 7a0f5d9..e3e8ce9 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -103,8 +103,7 @@ pub struct OneHotEncoder { } impl OneHotEncoder { - - /// Create an encoder instance with categories infered from data matrix + /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, From 19ff6df84cd3d55f7accd44b2986289691059fa8 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 17:40:58 -0800 Subject: [PATCH 63/78] Separate mapper object --- src/preprocessing/series_encoder.rs | 67 +++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 438d678..4e9625e 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,6 +8,73 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; +#[derive(Debug, Clone)] +pub struct CategoryMapper { + 
category_map: HashMap, + categories: Vec, + num_categories: usize, +} + +impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { + fn fit_to_iter(categories: impl Iterator) -> Self { + let mut category_map: HashMap = HashMap::new(); + let mut category_num = 0usize; + let mut unique_lables: Vec = Vec::new(); + + for l in categories { + if !category_map.contains_key(&l) { + category_map.insert(l.clone(), category_num); + unique_lables.push(l.clone()); + category_num += 1; + } + } + Self { + category_map, + num_categories: category_num, + categories: unique_lables, + } + } + + fn from_category_map(category_map: HashMap) -> Self { + let mut _unique_cat: Vec<(CategoryType, usize)> = + category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); + _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); + let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); + Self { + num_categories: categories.len(), + categories, + category_map, + } + } + + fn from_positional_category_vec(categories: Vec) -> Self { + let category_map: HashMap = categories + .iter() + .enumerate() + .map(|(v, k)| (k.clone(), v)) + .collect(); + Self { + num_categories: categories.len(), + category_map, + categories, + } + } + + /// Get label num of a category + fn get_num(&self, category: &CategoryType) -> Option<&usize> { + self.category_map.get(category) + } + + /// Return category corresponding to label num + fn get_cat(&self, num: usize) -> &CategoryType { + &self.categories[num] + } + + fn get_categories(&self) -> &[CategoryType] { + &self.categories[..] 
+ } +} + /// Make a one-hot encoded vector from a categorical variable /// /// Example: From d31145b4fe24e0718aef3b0b9371e9e2834b31ce Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:19:36 -0800 Subject: [PATCH 64/78] Define common series encoder behavior --- src/preprocessing/series_encoder.rs | 146 +++++++++++++--------------- 1 file changed, 70 insertions(+), 76 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 4e9625e..4e9ddf9 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -75,6 +75,50 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } } +/// Defines common behavior for series encoders(e.g. OneHot, Ordinal) +pub trait SeriesEncoder: + where + CategoryType:Hash + Eq + Clone +{ + /// Fit an encoder to a lable list + fn fit_to_iter(categories: impl Iterator) -> Self; + + /// Number of categories for categorical variable + fn num_categories(&self) -> usize; + + /// Build an encoder from a predefined (category -> class number) map + fn from_category_map(category_map: HashMap) -> Self; + + /// Build an encoder from a predefined positional category-class num vector + fn from_positional_category_vec(categories: Vec) -> Self; + + /// Transform a single category type into a one-hot vector + fn transform_one>(&self, category: &CategoryType) -> Option; + + /// Invert one-hot vector, back to the category + fn invert_one>(&self, one_hot: V) -> Result; + + /// Get categories ordered by encoder's category enumeration + fn get_categories(&self) -> &[CategoryType]; + + /// Take an iterator as a series to transform + fn transform_iter>( + &self, + cat_it: impl Iterator, + ) -> Vec> { + cat_it.map(|l| self.transform_one(&l)).collect() + } + + /// Transform a slice of category types into one-hot vectors + /// None is returned if unknown category is encountered + fn transfrom_series>( + &self, + categories: &[CategoryType], + ) -> Vec> { + let v = 
categories.iter().cloned(); + self.transform_iter(v) + } +} /// Make a one-hot encoded vector from a categorical variable /// /// Example: @@ -134,104 +178,47 @@ pub fn make_one_hot>( /// ``` #[derive(Debug, Clone)] pub struct SeriesOneHotEncoder { - category_map: HashMap, - categories: Vec, - /// Number of categories for categorical variable - pub num_categories: usize, + mapper: CategoryMapper, } -impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder { - /// Fit an encoder to a lable list - pub fn fit_to_iter(categories: impl Iterator) -> Self { - let mut category_map: HashMap = HashMap::new(); - let mut category_num = 0usize; - let mut unique_lables: Vec = Vec::new(); +impl SeriesEncoder for SeriesOneHotEncoder { - for l in categories { - if !category_map.contains_key(&l) { - category_map.insert(l.clone(), category_num); - unique_lables.push(l.clone()); - category_num += 1; + fn fit_to_iter(categories: impl Iterator) -> Self { + Self {mapper:CategoryMapper::fit_to_iter(categories)} } - } - Self { - category_map, - num_categories: category_num, - categories: unique_lables, - } - } /// Build an encoder from a predefined (category -> class number) map - pub fn from_category_map(category_map: HashMap) -> Self { - let mut _unique_cat: Vec<(CategoryType, usize)> = - category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); - _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); - let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); - Self { - num_categories: categories.len(), - categories, - category_map, + fn from_category_map(category_map: HashMap) -> Self { + Self {mapper: CategoryMapper::from_category_map(category_map)} } - } /// Build an encoder from a predefined positional category-class num vector - pub fn from_positional_category_vec(categories: Vec) -> Self { - let category_map: HashMap = categories - .iter() - .enumerate() - .map(|(v, k)| (k.clone(), v)) - .collect(); - Self { - num_categories: categories.len(), - category_map, - 
categories, + fn from_positional_category_vec(categories: Vec) -> Self { + Self {mapper:CategoryMapper::from_positional_category_vec(categories)} } + + fn num_categories(&self) -> usize { + self.mapper.num_categories } - /// Take an iterator as a series to transform - pub fn transform_iter( - &self, - cat_it: impl Iterator, - ) -> Vec>> { - cat_it.map(|l| self.transform_one(&l)).collect() + fn get_categories(&self) -> &[CategoryType] { + self.mapper.get_categories() } - /// Transform a slice of category types into one-hot vectors - /// None is returned if unknown category is encountered - pub fn transfrom_series( - &self, - categories: &'a [CategoryType], - ) -> Vec>> { - let v = categories.iter().cloned(); - self.transform_iter(v) - } - - /// Transform a single category type into a one-hot vector - pub fn transform_one(&self, category: &CategoryType) -> Option> { - match self.category_map.get(category) { - None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_categories)), - } - } - - /// Get categories ordered by encoder's category enumeration - pub fn get_categories(&self) -> &Vec { - &self.categories - } - - /// Invert one-hot vector, back to the category - pub fn invert_one(&self, one_hot: Vec) -> Result { + fn invert_one>(&self, one_hot: V) -> Result + { let pos = U::from_f64(1f64).unwrap(); + + let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); - let s: Vec = one_hot - .into_iter() + let s: Vec = oh_it .enumerate() .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) .collect(); if s.len() == 1 { let idx = s[0]; - return Ok(self.categories[idx].clone()); + return Ok(self.mapper.get_cat(idx).clone()); } let pos_entries = format!( "Expected a single positive entry, {} entires found", @@ -239,6 +226,13 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder ); Err(Failed::transform(&pos_entries[..])) } + + fn transform_one>(&self, category: &CategoryType) -> Option { + match self.mapper.get_num(category) { + None => 
None, + Some(&idx) => Some(make_one_hot(idx, self.num_categories())), + } + } } #[cfg(test)] From 237b1160b17308252b6040d4c5ca07880079051c Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:20:27 -0800 Subject: [PATCH 65/78] doc update --- src/preprocessing/series_encoder.rs | 64 ++++++++++++++++------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 4e9ddf9..9d7e259 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,6 +8,7 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; +/// Bi-directional map category <-> label num. #[derive(Debug, Clone)] pub struct CategoryMapper { category_map: HashMap, @@ -16,7 +17,9 @@ pub struct CategoryMapper { } impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { - fn fit_to_iter(categories: impl Iterator) -> Self { + + /// Fit an encoder to a lable iterator + pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); let mut category_num = 0usize; let mut unique_lables: Vec = Vec::new(); @@ -34,8 +37,9 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { categories: unique_lables, } } - - fn from_category_map(category_map: HashMap) -> Self { + + /// Build an encoder from a predefined (category -> class number) map + pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(CategoryType, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); @@ -46,8 +50,9 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { category_map, } } - - fn from_positional_category_vec(categories: Vec) -> Self { + + /// Build an encoder from a predefined positional category-class num vector + pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories .iter() 
.enumerate() @@ -61,16 +66,17 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Get label num of a category - fn get_num(&self, category: &CategoryType) -> Option<&usize> { + pub fn get_num(&self, category: &CategoryType) -> Option<&usize> { self.category_map.get(category) } /// Return category corresponding to label num - fn get_cat(&self, num: usize) -> &CategoryType { + pub fn get_cat(&self, num: usize) -> &CategoryType { &self.categories[num] } - fn get_categories(&self) -> &[CategoryType] { + /// List all categories (position = category number) + pub fn get_categories(&self) -> &[CategoryType] { &self.categories[..] } } @@ -80,14 +86,14 @@ pub trait SeriesEncoder: where CategoryType:Hash + Eq + Clone { - /// Fit an encoder to a lable list + /// Fit an encoder to a lable iterator fn fit_to_iter(categories: impl Iterator) -> Self; /// Number of categories for categorical variable fn num_categories(&self) -> usize; /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self; + fn from_category_map(category_map: HashMap) -> Self; /// Build an encoder from a predefined positional category-class num vector fn from_positional_category_vec(categories: Vec) -> Self; @@ -119,6 +125,7 @@ pub trait SeriesEncoder: self.transform_iter(v) } } + /// Make a one-hot encoded vector from a categorical variable /// /// Example: @@ -182,20 +189,20 @@ pub struct SeriesOneHotEncoder { } impl SeriesEncoder for SeriesOneHotEncoder { - + fn fit_to_iter(categories: impl Iterator) -> Self { Self {mapper:CategoryMapper::fit_to_iter(categories)} - } + } /// Build an encoder from a predefined (category -> class number) map fn from_category_map(category_map: HashMap) -> Self { Self {mapper: CategoryMapper::from_category_map(category_map)} - } + } /// Build an encoder from a predefined positional category-class num vector fn from_positional_category_vec(categories: Vec) -> Self { Self 
{mapper:CategoryMapper::from_positional_category_vec(categories)} - } + } fn num_categories(&self) -> usize { self.mapper.num_categories @@ -207,25 +214,25 @@ impl SeriesEncoder for SeriesOneH fn invert_one>(&self, one_hot: V) -> Result { - let pos = U::from_f64(1f64).unwrap(); + let pos = U::from_f64(1f64).unwrap(); let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); - + let s: Vec = oh_it - .enumerate() - .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) - .collect(); - - if s.len() == 1 { - let idx = s[0]; + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + + if s.len() == 1 { + let idx = s[0]; return Ok(self.mapper.get_cat(idx).clone()); + } + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); + Err(Failed::transform(&pos_entries[..])) } - let pos_entries = format!( - "Expected a single positive entry, {} entires found", - s.len() - ); - Err(Failed::transform(&pos_entries[..])) - } fn transform_one>(&self, category: &CategoryType) -> Option { match self.mapper.get_num(category) { @@ -233,6 +240,7 @@ impl SeriesEncoder for SeriesOneH Some(&idx) => Some(make_one_hot(idx, self.num_categories())), } } + } #[cfg(test)] From ef06f45638ec42540d74f41ffd2171f2d97e793f Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:21:06 -0800 Subject: [PATCH 66/78] Switch to use SeriesEncoder trait --- src/preprocessing/categorical_encoder.rs | 35 ++++++++++++++---------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index e3e8ce9..75cbf2b 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -6,7 +6,7 @@ //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; +//! 
use smartcore::preprocessing::categorical_encoder::{OneHotEnc, OneHotEncoderParams}; //! let data = DenseMatrix::from_2d_array(&[ //! &[1.5, 1.0, 1.5, 3.0], //! &[1.5, 2.0, 1.5, 4.0], @@ -15,7 +15,7 @@ //! ]); //! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); //! // Infer number of categories from data and return a reusable encoder -//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! let encoder = OneHotEnc::fit(&data, encoder_params).unwrap(); //! // Transform categorical to one-hot encoded (can transform similar) //! let oh_data = encoder.transform(&data).unwrap(); //! // Produces the following: @@ -30,7 +30,7 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::SeriesOneHotEncoder; +use crate::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; /// OneHotEncoder Parameters #[derive(Debug, Clone)] @@ -97,17 +97,17 @@ fn validate_col_is_categorical(data: &[T]) -> bool { /// Encode Categorical variavbles of data matrix to one-hot #[derive(Debug, Clone)] -pub struct OneHotEncoder { - series_encoders: Vec>, +pub struct OneHotEncoder { + series_encoders: Vec, col_idx_categorical: Vec, } -impl OneHotEncoder { +impl> OneHotEncoder { /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, - ) -> Result { + ) -> Result, Failed> { match (params.col_idx_categorical, params.infer_categorical) { (None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", @@ -126,7 +126,7 @@ impl OneHotEncoder { // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - let mut res: Vec> = + let mut res: Vec = Vec::with_capacity(idxs.len()); for &idx in &idxs { @@ -139,7 +139,7 @@ impl OneHotEncoder { return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| 
v.to_category()); - res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); + res.push(E::fit_to_iter(hashable_col)); } Ok(Self { @@ -160,7 +160,7 @@ impl OneHotEncoder { let additional_params: Vec = self .series_encoders .iter() - .map(|enc| enc.num_categories) + .map(|enc| enc.num_categories()) .collect(); // Eac category of size v adds v-1 params @@ -215,12 +215,17 @@ impl OneHotEncoder { } } +/// Convinince type for common use +pub type OneHotEnc = OneHotEncoder>; + + #[cfg(test)] mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::preprocessing::series_encoder::SeriesOneHotEncoder; + #[test] fn adjust_idxs() { assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); @@ -279,13 +284,13 @@ mod tests { fn test_fit() { let (x, _) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); assert_eq!(oh_enc.series_encoders.len(), 2); let num_cat: Vec = oh_enc .series_encoders .iter() - .map(|a| a.num_categories) + .map(|a| a.num_categories()) .collect(); assert_eq!(num_cat, vec![2, 4]); } @@ -294,13 +299,13 @@ mod tests { fn matrix_transform_test() { let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); } @@ -315,7 +320,7 @@ mod tests { ]); let params = OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEncoder::fit(&m, params) { + match OneHotEnc::fit(&m, params) { Err(_) => { assert!(true); } From 
700d320724c8dad09cdd31e3d73e5cc4d91c33ce Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 10:45:25 -0800 Subject: [PATCH 67/78] simplify SeriesEncoder trait --- src/preprocessing/series_encoder.rs | 134 ++++++++++++++-------------- 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 9d7e259..6975c0d 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -10,19 +10,22 @@ use std::hash::Hash; /// Bi-directional map category <-> label num. #[derive(Debug, Clone)] -pub struct CategoryMapper { - category_map: HashMap, - categories: Vec, +pub struct CategoryMapper { + category_map: HashMap, + categories: Vec, num_categories: usize, } -impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { +impl<'a, C> CategoryMapper +where + C: 'a + Hash + Eq + Clone +{ /// Fit an encoder to a lable iterator - pub fn fit_to_iter(categories: impl Iterator) -> Self { - let mut category_map: HashMap = HashMap::new(); + pub fn fit_to_iter(categories: impl Iterator) -> Self { + let mut category_map: HashMap = HashMap::new(); let mut category_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + let mut unique_lables: Vec = Vec::new(); for l in categories { if !category_map.contains_key(&l) { @@ -39,11 +42,11 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Build an encoder from a predefined (category -> class number) map - pub fn from_category_map(category_map: HashMap) -> Self { - let mut _unique_cat: Vec<(CategoryType, usize)> = + pub fn from_category_map(category_map: HashMap) -> Self { + let mut _unique_cat: Vec<(C, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); - let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); + let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); Self { num_categories: categories.len(), 
categories, @@ -52,8 +55,8 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Build an encoder from a predefined positional category-class num vector - pub fn from_positional_category_vec(categories: Vec) -> Self { - let category_map: HashMap = categories + pub fn from_positional_category_vec(categories: Vec) -> Self { + let category_map: HashMap = categories .iter() .enumerate() .map(|(v, k)| (k.clone(), v)) @@ -66,64 +69,49 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Get label num of a category - pub fn get_num(&self, category: &CategoryType) -> Option<&usize> { + pub fn get_num(&self, category: &C) -> Option<&usize> { self.category_map.get(category) } /// Return category corresponding to label num - pub fn get_cat(&self, num: usize) -> &CategoryType { + pub fn get_cat(&self, num: usize) -> &C { &self.categories[num] } /// List all categories (position = category number) - pub fn get_categories(&self) -> &[CategoryType] { + pub fn get_categories(&self) -> &[C] { &self.categories[..] } } /// Defines common behavior for series encoders(e.g. 
OneHot, Ordinal) -pub trait SeriesEncoder: +pub trait SeriesEncoder: where - CategoryType:Hash + Eq + Clone + C: Hash + Eq + Clone { /// Fit an encoder to a lable iterator - fn fit_to_iter(categories: impl Iterator) -> Self; + fn fit_to_iter(categories: impl Iterator) -> Self; /// Number of categories for categorical variable fn num_categories(&self) -> usize; - /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self; - - /// Build an encoder from a predefined positional category-class num vector - fn from_positional_category_vec(categories: Vec) -> Self; - /// Transform a single category type into a one-hot vector - fn transform_one>(&self, category: &CategoryType) -> Option; + fn transform_one>(&self, category: &C) -> Option; /// Invert one-hot vector, back to the category - fn invert_one>(&self, one_hot: V) -> Result; + fn invert_one>(&self, one_hot: V) -> Result; /// Get categories ordered by encoder's category enumeration - fn get_categories(&self) -> &[CategoryType]; + fn get_categories(&self) -> &[C]; /// Take an iterator as a series to transform + /// None is returned if unknown category is encountered fn transform_iter>( &self, - cat_it: impl Iterator, + cat_it: impl Iterator, ) -> Vec> { cat_it.map(|l| self.transform_one(&l)).collect() } - - /// Transform a slice of category types into one-hot vectors - /// None is returned if unknown category is encountered - fn transfrom_series>( - &self, - categories: &[CategoryType], - ) -> Vec> { - let v = categories.iter().cloned(); - self.transform_iter(v) - } } /// Make a one-hot encoded vector from a categorical variable @@ -153,22 +141,22 @@ pub fn make_one_hot>( /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; /// /// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 
4]; /// let it = fake_categories.iter().map(|&a| a); -/// let enc = SeriesOneHotEncoder::::fit_to_iter(it); +/// let enc: SeriesOneHotEncoder:: = SeriesEncoder::fit_to_iter(it); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); /// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); /// ``` /// -/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` /// /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder, CategoryMapper}; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -176,43 +164,53 @@ pub fn make_one_hot>( /// .collect(); /// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = SeriesOneHotEncoder::<&str>::from_positional_category_vec(category_vec); -/// let enc_lm = SeriesOneHotEncoder::<&str>::from_category_map(category_map); +/// let enc_lv = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_positional_category_vec(category_vec)); +/// let enc_lm = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_category_map(category_map)); /// /// // ["background", "dog", "cat"] /// println!("{:?}", enc_lv.get_categories()); -/// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) +/// let lv: Vec = enc_lv.transform_one(&"dog").unwrap(); +/// let lm: Vec = enc_lm.transform_one(&"dog").unwrap(); +/// assert_eq!(lv, lm); /// ``` #[derive(Debug, Clone)] -pub struct SeriesOneHotEncoder { - mapper: CategoryMapper, +pub struct SeriesOneHotEncoder { + mapper: CategoryMapper, } -impl SeriesEncoder for SeriesOneHotEncoder { +impl SeriesOneHotEncoder +where + C: Hash + Eq + Clone +{ + /// Create SeriesEncoder form 
existing mapper + pub fn new(mapper: CategoryMapper) -> Self { + Self {mapper} + } +} + +impl SeriesEncoder for SeriesOneHotEncoder +where + C: Hash + Eq + Clone +{ - fn fit_to_iter(categories: impl Iterator) -> Self { + + fn fit_to_iter(categories: impl Iterator) -> Self { Self {mapper:CategoryMapper::fit_to_iter(categories)} } - /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self { - Self {mapper: CategoryMapper::from_category_map(category_map)} - } - - /// Build an encoder from a predefined positional category-class num vector - fn from_positional_category_vec(categories: Vec) -> Self { - Self {mapper:CategoryMapper::from_positional_category_vec(categories)} - } - fn num_categories(&self) -> usize { self.mapper.num_categories } - fn get_categories(&self) -> &[CategoryType] { + fn get_categories(&self) -> &[C] { self.mapper.get_categories() } - fn invert_one>(&self, one_hot: V) -> Result + fn invert_one(&self, one_hot: V) -> Result + where + U: RealNumber, + V: BaseVector + { let pos = U::from_f64(1f64).unwrap(); @@ -234,7 +232,11 @@ impl SeriesEncoder for SeriesOneH Err(Failed::transform(&pos_entries[..])) } - fn transform_one>(&self, category: &CategoryType) -> Option { + fn transform_one(&self, category: &C) -> Option + where + U: RealNumber, + V: BaseVector + { match self.mapper.get_num(category) { None => None, Some(&idx) => Some(make_one_hot(idx, self.num_categories())), @@ -262,7 +264,7 @@ mod tests { fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> { let fake_category_pos = vec!["background", "dog", "cat"]; - let enc = SeriesOneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); + let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_positional_category_vec(fake_category_pos)); enc } @@ -271,7 +273,7 @@ mod tests { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = 
SeriesOneHotEncoder::<&str>::from_category_map(category_map); + let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_category_map(category_map)); let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong categories"), Some(v) => v, @@ -306,8 +308,8 @@ mod tests { #[test] fn test_many_categorys() { let enc = build_fake_str_enc(); - let res: Vec>> = - enc.transfrom_series(&["dog", "cat", "fish", "background"]); + let cat_it = ["dog", "cat", "fish", "background"].iter().cloned(); + let res: Vec>> = enc.transform_iter(cat_it); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 3cc20fd400682356ac0dfe1dfeb1206172983123 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:39:26 -0800 Subject: [PATCH 68/78] Move all functionality to CategoryMapper (one-hot and ordinal). --- src/preprocessing/series_encoder.rs | 181 +++++++++------------------- 1 file changed, 58 insertions(+), 123 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 6975c0d..cdbae16 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,7 +8,48 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; -/// Bi-directional map category <-> label num. +/// ## Bi-directional map category <-> label num. +/// Turn Hashable objects into a one-hot vectors or ordinal values. +/// This struct encodes single class per exmample +/// +/// You can fit_to_iter a category enumeration by passing an iterator of categories. 
+/// category numbers will be assigned in the order they are encountered +/// +/// Example: +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::series_encoder::CategoryMapper; +/// +/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; +/// let it = fake_categories.iter().map(|&a| a); +/// let enc = CategoryMapper::::fit_to_iter(it); +/// let oh_vec: Vec = enc.get_one_hot(&1).unwrap(); +/// // notice that 1 is actually a zero-th positional category +/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); +/// ``` +/// +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` +/// +/// +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::series_encoder::CategoryMapper; +/// +/// let category_map: HashMap<&str, usize> = +/// vec![("cat", 2), ("background",0), ("dog", 1)] +/// .into_iter() +/// .collect(); +/// let category_vec = vec!["background", "dog", "cat"]; +/// +/// let enc_lv = CategoryMapper::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = CategoryMapper::<&str>::from_category_map(category_map); +/// +/// // ["background", "dog", "cat"] +/// println!("{:?}", enc_lv.get_categories()); +/// let lv: Vec = enc_lv.get_one_hot(&"dog").unwrap(); +/// let lm: Vec = enc_lm.get_one_hot(&"dog").unwrap(); +/// assert_eq!(lv, lm); +/// ``` #[derive(Debug, Clone)] pub struct CategoryMapper { category_map: HashMap, @@ -16,10 +57,14 @@ pub struct CategoryMapper { num_categories: usize, } -impl<'a, C> CategoryMapper +impl CategoryMapper where - C: 'a + Hash + Eq + Clone + C: Hash + Eq + Clone, { + /// Get the number of categories in the mapper + pub fn num_categories(&self) -> usize { + self.num_categories + } /// Fit an encoder to a lable iterator pub fn fit_to_iter(categories: impl Iterator) -> Self { @@ -82,131 +127,21 @@ where pub fn get_categories(&self) -> &[C] { &self.categories[..] 
} -} -/// Defines common behavior for series encoders(e.g. OneHot, Ordinal) -pub trait SeriesEncoder: + /// Get one-hot encoding of the category + pub fn get_one_hot(&self, category: &C) -> Option where - C: Hash + Eq + Clone + U: RealNumber, + V: BaseVector, { - /// Fit an encoder to a lable iterator - fn fit_to_iter(categories: impl Iterator) -> Self; - - /// Number of categories for categorical variable - fn num_categories(&self) -> usize; - - /// Transform a single category type into a one-hot vector - fn transform_one>(&self, category: &C) -> Option; + match self.get_num(category) { + None => None, + Some(&idx) => Some(make_one_hot::(idx, self.num_categories)), + } +} /// Invert one-hot vector, back to the category - fn invert_one>(&self, one_hot: V) -> Result; - - /// Get categories ordered by encoder's category enumeration - fn get_categories(&self) -> &[C]; - - /// Take an iterator as a series to transform - /// None is returned if unknown category is encountered - fn transform_iter>( - &self, - cat_it: impl Iterator, - ) -> Vec> { - cat_it.map(|l| self.transform_one(&l)).collect() - } -} - -/// Make a one-hot encoded vector from a categorical variable -/// -/// Example: -/// ``` -/// use smartcore::preprocessing::series_encoder::make_one_hot; -/// let one_hot: Vec = make_one_hot(2, 3); -/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); -/// ``` -pub fn make_one_hot>( - category_idx: usize, - num_categories: usize, -) -> V { - let pos = T::from_f64(1f64).unwrap(); - let mut z = V::zeros(num_categories); - z.set(category_idx, pos); - z -} - -/// Turn a collection of Hashable objects into a one-hot vectors. -/// This struct encodes single class per exmample -/// -/// You can fit_to_iter a category enumeration by passing an iterator of categories. 
-/// category numbers will be assigned in the order they are encountered -/// -/// Example: -/// ``` -/// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; -/// -/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; -/// let it = fake_categories.iter().map(|&a| a); -/// let enc: SeriesOneHotEncoder:: = SeriesEncoder::fit_to_iter(it); -/// let oh_vec: Vec = enc.transform_one(&1).unwrap(); -/// // notice that 1 is actually a zero-th positional category -/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); -/// ``` -/// -/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` -/// -/// -/// ``` -/// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder, CategoryMapper}; -/// -/// let category_map: HashMap<&str, usize> = -/// vec![("cat", 2), ("background",0), ("dog", 1)] -/// .into_iter() -/// .collect(); -/// let category_vec = vec!["background", "dog", "cat"]; -/// -/// let enc_lv = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_positional_category_vec(category_vec)); -/// let enc_lm = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_category_map(category_map)); -/// -/// // ["background", "dog", "cat"] -/// println!("{:?}", enc_lv.get_categories()); -/// let lv: Vec = enc_lv.transform_one(&"dog").unwrap(); -/// let lm: Vec = enc_lm.transform_one(&"dog").unwrap(); -/// assert_eq!(lv, lm); -/// ``` -#[derive(Debug, Clone)] -pub struct SeriesOneHotEncoder { - mapper: CategoryMapper, -} - -impl SeriesOneHotEncoder -where - C: Hash + Eq + Clone -{ - /// Create SeriesEncoder form existing mapper - pub fn new(mapper: CategoryMapper) -> Self { - Self {mapper} - } -} - -impl SeriesEncoder for SeriesOneHotEncoder -where - C: Hash + Eq + Clone -{ - - - fn fit_to_iter(categories: impl Iterator) -> Self { - Self {mapper:CategoryMapper::fit_to_iter(categories)} - } - - fn 
num_categories(&self) -> usize { - self.mapper.num_categories - } - - fn get_categories(&self) -> &[C] { - self.mapper.get_categories() - } - - fn invert_one(&self, one_hot: V) -> Result + pub fn invert_one_hot(&self, one_hot: V) -> Result where U: RealNumber, V: BaseVector From 374dfeceb906262a2797967cfa02514b5ca2d48d Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:41:25 -0800 Subject: [PATCH 69/78] No more SeriesEncoders. --- src/preprocessing/series_encoder.rs | 104 +++++++++++++++++----------- 1 file changed, 63 insertions(+), 41 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index cdbae16..e24eca1 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -65,7 +65,7 @@ where pub fn num_categories(&self) -> usize { self.num_categories } - + /// Fit an encoder to a lable iterator pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); @@ -85,7 +85,7 @@ where categories: unique_lables, } } - + /// Build an encoder from a predefined (category -> class number) map pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(C, usize)> = @@ -98,7 +98,7 @@ where category_map, } } - + /// Build an encoder from a predefined positional category-class num vector pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories @@ -130,54 +130,71 @@ where /// Get one-hot encoding of the category pub fn get_one_hot(&self, category: &C) -> Option - where + where U: RealNumber, V: BaseVector, -{ + { match self.get_num(category) { None => None, Some(&idx) => Some(make_one_hot::(idx, self.num_categories)), + } } -} /// Invert one-hot vector, back to the category pub fn invert_one_hot(&self, one_hot: V) -> Result where U: RealNumber, - V: BaseVector + V: BaseVector, + { + let pos = U::one(); - { - let pos = U::from_f64(1f64).unwrap(); - - let oh_it = 
(0..one_hot.len()).map(|idx| one_hot.get(idx)); - - let s: Vec = oh_it - .enumerate() - .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) - .collect(); - - if s.len() == 1 { - let idx = s[0]; - return Ok(self.mapper.get_cat(idx).clone()); - } - let pos_entries = format!( - "Expected a single positive entry, {} entires found", - s.len() - ); - Err(Failed::transform(&pos_entries[..])) + let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); + + let s: Vec = oh_it + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + + if s.len() == 1 { + let idx = s[0]; + return Ok(self.get_cat(idx).clone()); } + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); + Err(Failed::transform(&pos_entries[..])) + } - fn transform_one(&self, category: &C) -> Option + /// Get ordinal encoding of the catergory + pub fn get_ordinal(&self, category: &C) -> Option where U: RealNumber, - V: BaseVector { - match self.mapper.get_num(category) { + match self.get_num(category) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_categories())), + Some(&idx) => U::from_usize(idx), } } - +} + +/// Make a one-hot encoded vector from a categorical variable +/// +/// Example: +/// ``` +/// use smartcore::preprocessing::series_encoder::make_one_hot; +/// let one_hot: Vec = make_one_hot(2, 3); +/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); +/// ``` +pub fn make_one_hot(category_idx: usize, num_categories: usize) -> V +where + T: RealNumber, + V: BaseVector, +{ + let pos = T::one(); + let mut z = V::zeros(num_categories); + z.set(category_idx, pos); + z } #[cfg(test)] @@ -188,8 +205,8 @@ mod tests { fn from_categories() { let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let it = fake_categories.iter().map(|&a| a); - let enc = SeriesOneHotEncoder::::fit_to_iter(it); - let oh_vec: Vec = match enc.transform_one(&1) { + let enc = CategoryMapper::::fit_to_iter(it); + let oh_vec: 
Vec = match enc.get_one_hot(&1) { None => panic!("Wrong categories"), Some(v) => v, }; @@ -197,19 +214,24 @@ mod tests { assert_eq!(oh_vec, res); } - fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> { + fn build_fake_str_enc<'a>() -> CategoryMapper<&'a str> { let fake_category_pos = vec!["background", "dog", "cat"]; - let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_positional_category_vec(fake_category_pos)); + let enc = CategoryMapper::<&str>::from_positional_category_vec(fake_category_pos); enc } + #[test] + fn ordinal_encoding() { + let enc = build_fake_str_enc(); + assert_eq!(1f64, enc.get_ordinal::(&"dog").unwrap()) + } #[test] fn category_map_and_vec() { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_category_map(category_map)); - let oh_vec: Vec = match enc.transform_one(&"dog") { + let enc = CategoryMapper::<&str>::from_category_map(category_map); + let oh_vec: Vec = match enc.get_one_hot(&"dog") { None => panic!("Wrong categories"), Some(v) => v, }; @@ -220,7 +242,7 @@ mod tests { #[test] fn positional_categories_vec() { let enc = build_fake_str_enc(); - let oh_vec: Vec = match enc.transform_one(&"dog") { + let oh_vec: Vec = match enc.get_one_hot(&"dog") { None => panic!("Wrong categories"), Some(v) => v, }; @@ -232,9 +254,9 @@ mod tests { fn invert_label_test() { let enc = build_fake_str_enc(); let res: Vec = vec![0.0, 1.0, 0.0]; - let lab = enc.invert_one(res).unwrap(); + let lab = enc.invert_one_hot(res).unwrap(); assert_eq!(lab, "dog"); - if let Err(e) = enc.invert_one(vec![0.0, 0.0, 0.0]) { + if let Err(e) = enc.invert_one_hot(vec![0.0, 0.0, 0.0]) { let pos_entries = format!("Expected a single positive entry, 0 entires found"); assert_eq!(e, Failed::transform(&pos_entries[..])); }; @@ -244,7 +266,7 @@ mod tests { fn test_many_categorys() { let enc = build_fake_str_enc(); let cat_it = ["dog", 
"cat", "fish", "background"].iter().cloned(); - let res: Vec>> = enc.transform_iter(cat_it); + let res: Vec>> = cat_it.map(|v| enc.get_one_hot(&v)).collect(); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 828df4e338c0a44a38ad2004f3bae349322d1c94 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:42:27 -0800 Subject: [PATCH 70/78] Use CategoryMapper to transform an iterator. No more passing iterator to SeriesEncoders --- src/preprocessing/categorical_encoder.rs | 67 ++++++++++++------------ 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 75cbf2b..18e569a 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -1,12 +1,12 @@ //! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies //! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents //! -//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [SeriesOneHotEncoder](../series_encoder/struct.SeriesOneHotEncoder.html) +//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html) //! //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEnc, OneHotEncoderParams}; +//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; //! let data = DenseMatrix::from_2d_array(&[ //! &[1.5, 1.0, 1.5, 3.0], //! &[1.5, 2.0, 1.5, 4.0], @@ -15,7 +15,7 @@ //! ]); //! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); //! // Infer number of categories from data and return a reusable encoder -//! 
let encoder = OneHotEnc::fit(&data, encoder_params).unwrap(); +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); //! // Transform categorical to one-hot encoded (can transform similar) //! let oh_data = encoder.transform(&data).unwrap(); //! // Produces the following: @@ -30,7 +30,7 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; +use crate::preprocessing::series_encoder::CategoryMapper; /// OneHotEncoder Parameters #[derive(Debug, Clone)] @@ -97,17 +97,18 @@ fn validate_col_is_categorical(data: &[T]) -> bool { /// Encode Categorical variavbles of data matrix to one-hot #[derive(Debug, Clone)] -pub struct OneHotEncoder { - series_encoders: Vec, +pub struct OneHotEncoder { + category_mappers: Vec>, col_idx_categorical: Vec, } -impl> OneHotEncoder { +impl OneHotEncoder { /// Create an encoder instance with categories infered from data matrix - pub fn fit>( - data: &M, - params: OneHotEncoderParams, - ) -> Result, Failed> { + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result + where + T: Categorizable, + M: Matrix, + { match (params.col_idx_categorical, params.infer_categorical) { (None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", @@ -126,8 +127,7 @@ impl> OneHotEncoder { // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - let mut res: Vec = - Vec::with_capacity(idxs.len()); + let mut res: Vec> = Vec::with_capacity(idxs.len()); for &idx in &idxs { data.copy_col_as_vec(idx, &mut col_buf); @@ -139,11 +139,11 @@ impl> OneHotEncoder { return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| v.to_category()); - res.push(E::fit_to_iter(hashable_col)); + res.push(CategoryMapper::fit_to_iter(hashable_col)); } Ok(Self { - series_encoders: res, 
//Self::build_series_encoders::(data, &idxs[..]), + category_mappers: res, col_idx_categorical: idxs, }) } @@ -155,10 +155,14 @@ impl> OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Result { + pub fn transform(&self, x: &M) -> Result + where + T: Categorizable, + M: Matrix, + { let (nrows, p) = x.shape(); let additional_params: Vec = self - .series_encoders + .category_mappers .iter() .map(|enc| enc.num_categories()) .collect(); @@ -172,10 +176,10 @@ impl> OneHotEncoder { for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { let cidx = new_col_idx[old_cidx]; let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); - let sencoder = &self.series_encoders[pidx]; - let oh_series: Vec>> = sencoder.transform_iter(col_iter); + let sencoder = &self.category_mappers[pidx]; + let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); - for (row, oh_vec) in oh_series.iter().enumerate() { + for (row, oh_vec) in oh_series.enumerate() { match oh_vec { None => { // Since we support T types, bad value in a series causes in to be invalid @@ -215,16 +219,11 @@ impl> OneHotEncoder { } } -/// Convinince type for common use -pub type OneHotEnc = OneHotEncoder>; - - #[cfg(test)] mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::preprocessing::series_encoder::SeriesOneHotEncoder; - + use crate::preprocessing::series_encoder::CategoryMapper; #[test] fn adjust_idxs() { @@ -275,8 +274,8 @@ mod tests { let series = vec![3.0, 1.0, 2.0, 1.0]; let hashable_series: Vec = series.iter().map(|v| v.to_category()).collect(); - let enc = SeriesOneHotEncoder::from_positional_category_vec(hashable_series); - let inv = enc.invert_one(vec![0.0, 0.0, 1.0]); + let enc = CategoryMapper::from_positional_category_vec(hashable_series); + let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); let orig_val: f64 = inv.unwrap().into(); assert_eq!(orig_val, 
2.0); } @@ -284,11 +283,11 @@ mod tests { fn test_fit() { let (x, _) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); - assert_eq!(oh_enc.series_encoders.len(), 2); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + assert_eq!(oh_enc.category_mappers.len(), 2); let num_cat: Vec = oh_enc - .series_encoders + .category_mappers .iter() .map(|a| a.num_categories()) .collect(); @@ -299,13 +298,13 @@ mod tests { fn matrix_transform_test() { let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); } @@ -320,7 +319,7 @@ mod tests { ]); let params = OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEnc::fit(&m, params) { + match OneHotEncoder::fit(&m, params) { Err(_) => { assert!(true); } From af6ec2d402c1d3d6aca1881f7c80301487a94cab Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 9 Feb 2021 22:01:34 -0800 Subject: [PATCH 71/78] rename categorical --- src/preprocessing/categorical.rs | 329 +++++++++++++++++++++++++++++++ src/preprocessing/mod.rs | 2 +- 2 files changed, 330 insertions(+), 1 deletion(-) create mode 100644 src/preprocessing/categorical.rs diff --git a/src/preprocessing/categorical.rs b/src/preprocessing/categorical.rs new file mode 100644 index 0000000..8571e74 --- /dev/null +++ b/src/preprocessing/categorical.rs @@ -0,0 +1,329 @@ +//! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies +//! 
Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents +//! +//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html) +//! +//! ### Usage Example +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::preprocessing::categorical::{OneHotEncoder, OneHotEncoderParams}; +//! let data = DenseMatrix::from_2d_array(&[ +//! &[1.5, 1.0, 1.5, 3.0], +//! &[1.5, 2.0, 1.5, 4.0], +//! &[1.5, 1.0, 1.5, 5.0], +//! &[1.5, 2.0, 1.5, 6.0], +//! ]); +//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); +//! // Infer number of categories from data and return a reusable encoder +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! // Transform categorical to one-hot encoded (can transform similar) +//! let oh_data = encoder.transform(&data).unwrap(); +//! // Produces the following: +//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] +//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] +//! 
``` +use std::iter; + +use crate::error::Failed; +use crate::linalg::Matrix; + +use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; +use crate::preprocessing::series_encoder::CategoryMapper; + +/// OneHotEncoder Parameters +#[derive(Debug, Clone)] +pub struct OneHotEncoderParams { + /// Column number that contain categorical variable + pub col_idx_categorical: Option>, + /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables + infer_categorical: bool, +} + +impl OneHotEncoderParams { + /// Generate parameters from categorical variable column numbers + pub fn from_cat_idx(categorical_params: &[usize]) -> Self { + Self { + col_idx_categorical: Some(categorical_params.to_vec()), + infer_categorical: false, + } + } +} + +/// Calculate the offset to parameters to due introduction of one-hot encoding +fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec { + // This functions uses iterators and returns a vector. 
+ // In case we get a huge amount of paramenters this might be a problem + // todo: Change this such that it will return an iterator + + let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); + + // Offset is constant between two categorical values, here we calculate the number of steps + // that remain constant + let repeats = cat_idx.scan(0, |a, v| { + let im = v + 1 - *a; + *a = v; + Some(im) + }); + + // Calculate the offset to parameter idx due to newly intorduced one-hot vectors + let offset_ = cat_sizes.iter().scan(0, |a, &v| { + *a = *a + v - 1; + Some(*a) + }); + let offset = (0..1).chain(offset_); + + let new_param_idxs: Vec = (0..num_params) + .zip( + repeats + .zip(offset) + .map(|(r, o)| iter::repeat(o).take(r)) + .flatten(), + ) + .map(|(idx, ofst)| idx + ofst) + .collect(); + new_param_idxs +} + +fn validate_col_is_categorical(data: &[T]) -> bool { + for v in data { + if !v.is_valid() { + return false; + } + } + true +} + +/// Encode Categorical variavbles of data matrix to one-hot +#[derive(Debug, Clone)] +pub struct OneHotEncoder { + category_mappers: Vec>, + col_idx_categorical: Vec, +} + +impl OneHotEncoder { + /// Create an encoder instance with categories infered from data matrix + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result + where + T: Categorizable, + M: Matrix, + { + match (params.col_idx_categorical, params.infer_categorical) { + (None, false) => Err(Failed::fit( + "Must pass categorical series ids or infer flag", + )), + + (Some(_idxs), true) => Err(Failed::fit( + "Ambigous parameters, got both infer and categroy ids", + )), + + (Some(mut idxs), false) => { + // make sure categories have same order as data columns + idxs.sort_unstable(); + + let (nrows, _) = data.shape(); + + // col buffer to avoid allocations + let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); + + let mut res: Vec> = Vec::with_capacity(idxs.len()); + + for &idx in &idxs { + data.copy_col_as_vec(idx, &mut col_buf); + if 
!validate_col_is_categorical(&col_buf) { + let msg = format!( + "Column {} of data matrix containts non categorizable (integer) values", + idx + ); + return Err(Failed::fit(&msg[..])); + } + let hashable_col = col_buf.iter().map(|v| v.to_category()); + res.push(CategoryMapper::fit_to_iter(hashable_col)); + } + + Ok(Self { + category_mappers: res, + col_idx_categorical: idxs, + }) + } + + (None, true) => { + todo!("Auto-Inference for Categorical Variables not yet implemented") + } + } + } + + /// Transform categorical variables to one-hot encoded and return a new matrix + pub fn transform(&self, x: &M) -> Result + where + T: Categorizable, + M: Matrix, + { + let (nrows, p) = x.shape(); + let additional_params: Vec = self + .category_mappers + .iter() + .map(|enc| enc.num_categories()) + .collect(); + + // Eac category of size v adds v-1 params + let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); + let mut res = M::zeros(nrows, expandws_p); + + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { + let cidx = new_col_idx[old_cidx]; + let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); + let sencoder = &self.category_mappers[pidx]; + let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); + + for (row, oh_vec) in oh_series.enumerate() { + match oh_vec { + None => { + // Since we support T types, bad value in a series causes in to be invalid + let msg = format!("At least one value in column {} doesn't conform to category definition", old_cidx); + return Err(Failed::transform(&msg[..])); + } + Some(v) => { + // copy one hot vectors to their place in the data matrix; + for (col_ofst, &val) in v.iter().enumerate() { + res.set(row, cidx + col_ofst, val); + } + } + } + } + } + + // copy old data in x to their new location while skipping catergorical vars (already treated) + let mut skip_idx_iter = 
self.col_idx_categorical.iter(); + let mut cur_skip = skip_idx_iter.next(); + + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + // if found treated varible, skip it + if let Some(&v) = cur_skip { + if v == old_p { + cur_skip = skip_idx_iter.next(); + continue; + } + } + + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); + } + } + + Ok(res) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::preprocessing::series_encoder::CategoryMapper; + + #[test] + fn adjust_idxs() { + assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); + // [0,1,2] -> [0, 1, 1, 1, 2] + assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); + } + + fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { + let orig = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { + // Categorical first and last + let orig = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 1.5, 3.0], + &[1.5, 2.0, 1.5, 4.0], + &[1.5, 1.0, 1.5, 5.0], + &[1.5, 2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + #[test] + fn hash_encode_f64_series() { + let series = vec![3.0, 1.0, 2.0, 1.0]; + let hashable_series: Vec = + series.iter().map(|v| v.to_category()).collect(); + let enc = CategoryMapper::from_positional_category_vec(hashable_series); + let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); + let orig_val: f64 = inv.unwrap().into(); + 
assert_eq!(orig_val, 2.0); + } + #[test] + fn test_fit() { + let (x, _) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + assert_eq!(oh_enc.category_mappers.len(), 2); + + let num_cat: Vec = oh_enc + .category_mappers + .iter() + .map(|a| a.num_categories()) + .collect(); + assert_eq!(num_cat, vec![2, 4]); + } + + #[test] + fn matrix_transform_test() { + let (x, expected_x) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); + + let (x, expected_x) = build_cat_first_and_last(); + let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); + } + + #[test] + fn fail_on_bad_category() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let params = OneHotEncoderParams::from_cat_idx(&[1]); + match OneHotEncoder::fit(&m, params) { + Err(_) => { + assert!(true); + } + _ => assert!(false), + } + } +} diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 4a1abf3..32a0cfa 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,5 +1,5 @@ /// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents -pub mod categorical_encoder; +pub mod categorical; mod data_traits; /// Encode a series (column, array) of categorical variables as one-hot vectors pub mod series_encoder; From 6b5bed60928fb2fdd304eca03ff31c0612573164 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 9 Feb 2021 22:01:59 -0800 Subject: [PATCH 72/78] remove old --- src/preprocessing/categorical_encoder.rs | 329 ----------------------- 1 file changed, 329 deletions(-) delete mode 100644 
src/preprocessing/categorical_encoder.rs diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs deleted file mode 100644 index 18e569a..0000000 --- a/src/preprocessing/categorical_encoder.rs +++ /dev/null @@ -1,329 +0,0 @@ -//! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies -//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents -//! -//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html) -//! -//! ### Usage Example -//! ``` -//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; -//! let data = DenseMatrix::from_2d_array(&[ -//! &[1.5, 1.0, 1.5, 3.0], -//! &[1.5, 2.0, 1.5, 4.0], -//! &[1.5, 1.0, 1.5, 5.0], -//! &[1.5, 2.0, 1.5, 6.0], -//! ]); -//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); -//! // Infer number of categories from data and return a reusable encoder -//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); -//! // Transform categorical to one-hot encoded (can transform similar) -//! let oh_data = encoder.transform(&data).unwrap(); -//! // Produces the following: -//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] -//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] -//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] -//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] -//! 
``` -use std::iter; - -use crate::error::Failed; -use crate::linalg::Matrix; - -use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::CategoryMapper; - -/// OneHotEncoder Parameters -#[derive(Debug, Clone)] -pub struct OneHotEncoderParams { - /// Column number that contain categorical variable - pub col_idx_categorical: Option>, - /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables - infer_categorical: bool, -} - -impl OneHotEncoderParams { - /// Generate parameters from categorical variable column numbers - pub fn from_cat_idx(categorical_params: &[usize]) -> Self { - Self { - col_idx_categorical: Some(categorical_params.to_vec()), - infer_categorical: false, - } - } -} - -/// Calculate the offset to parameters to due introduction of one-hot encoding -fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec { - // This functions uses iterators and returns a vector. 
- // In case we get a huge amount of paramenters this might be a problem - // todo: Change this such that it will return an iterator - - let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); - - // Offset is constant between two categorical values, here we calculate the number of steps - // that remain constant - let repeats = cat_idx.scan(0, |a, v| { - let im = v + 1 - *a; - *a = v; - Some(im) - }); - - // Calculate the offset to parameter idx due to newly intorduced one-hot vectors - let offset_ = cat_sizes.iter().scan(0, |a, &v| { - *a = *a + v - 1; - Some(*a) - }); - let offset = (0..1).chain(offset_); - - let new_param_idxs: Vec = (0..num_params) - .zip( - repeats - .zip(offset) - .map(|(r, o)| iter::repeat(o).take(r)) - .flatten(), - ) - .map(|(idx, ofst)| idx + ofst) - .collect(); - new_param_idxs -} - -fn validate_col_is_categorical(data: &[T]) -> bool { - for v in data { - if !v.is_valid() { - return false; - } - } - true -} - -/// Encode Categorical variavbles of data matrix to one-hot -#[derive(Debug, Clone)] -pub struct OneHotEncoder { - category_mappers: Vec>, - col_idx_categorical: Vec, -} - -impl OneHotEncoder { - /// Create an encoder instance with categories infered from data matrix - pub fn fit(data: &M, params: OneHotEncoderParams) -> Result - where - T: Categorizable, - M: Matrix, - { - match (params.col_idx_categorical, params.infer_categorical) { - (None, false) => Err(Failed::fit( - "Must pass categorical series ids or infer flag", - )), - - (Some(_idxs), true) => Err(Failed::fit( - "Ambigous parameters, got both infer and categroy ids", - )), - - (Some(mut idxs), false) => { - // make sure categories have same order as data columns - idxs.sort_unstable(); - - let (nrows, _) = data.shape(); - - // col buffer to avoid allocations - let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - - let mut res: Vec> = Vec::with_capacity(idxs.len()); - - for &idx in &idxs { - data.copy_col_as_vec(idx, &mut col_buf); - if 
!validate_col_is_categorical(&col_buf) { - let msg = format!( - "Column {} of data matrix containts non categorizable (integer) values", - idx - ); - return Err(Failed::fit(&msg[..])); - } - let hashable_col = col_buf.iter().map(|v| v.to_category()); - res.push(CategoryMapper::fit_to_iter(hashable_col)); - } - - Ok(Self { - category_mappers: res, - col_idx_categorical: idxs, - }) - } - - (None, true) => { - todo!("Auto-Inference for Categorical Variables not yet implemented") - } - } - } - - /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform(&self, x: &M) -> Result - where - T: Categorizable, - M: Matrix, - { - let (nrows, p) = x.shape(); - let additional_params: Vec = self - .category_mappers - .iter() - .map(|enc| enc.num_categories()) - .collect(); - - // Eac category of size v adds v-1 params - let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); - - let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); - let mut res = M::zeros(nrows, expandws_p); - - for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { - let cidx = new_col_idx[old_cidx]; - let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); - let sencoder = &self.category_mappers[pidx]; - let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); - - for (row, oh_vec) in oh_series.enumerate() { - match oh_vec { - None => { - // Since we support T types, bad value in a series causes in to be invalid - let msg = format!("At least one value in column {} doesn't conform to category definition", old_cidx); - return Err(Failed::transform(&msg[..])); - } - Some(v) => { - // copy one hot vectors to their place in the data matrix; - for (col_ofst, &val) in v.iter().enumerate() { - res.set(row, cidx + col_ofst, val); - } - } - } - } - } - - // copy old data in x to their new location while skipping catergorical vars (already treated) - let mut skip_idx_iter = 
self.col_idx_categorical.iter(); - let mut cur_skip = skip_idx_iter.next(); - - for (old_p, &new_p) in new_col_idx.iter().enumerate() { - // if found treated varible, skip it - if let Some(&v) = cur_skip { - if v == old_p { - cur_skip = skip_idx_iter.next(); - continue; - } - } - - for r in 0..nrows { - let val = x.get(r, old_p); - res.set(r, new_p, val); - } - } - - Ok(res) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::preprocessing::series_encoder::CategoryMapper; - - #[test] - fn adjust_idxs() { - assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); - // [0,1,2] -> [0, 1, 1, 1, 2] - assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); - } - - fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { - let orig = DenseMatrix::from_2d_array(&[ - &[1.0, 1.5, 3.0], - &[2.0, 1.5, 4.0], - &[1.0, 1.5, 5.0], - &[2.0, 1.5, 6.0], - ]); - - let oh_enc = DenseMatrix::from_2d_array(&[ - &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], - &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], - &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], - &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], - ]); - - (orig, oh_enc) - } - - fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { - // Categorical first and last - let orig = DenseMatrix::from_2d_array(&[ - &[1.5, 1.0, 1.5, 3.0], - &[1.5, 2.0, 1.5, 4.0], - &[1.5, 1.0, 1.5, 5.0], - &[1.5, 2.0, 1.5, 6.0], - ]); - - let oh_enc = DenseMatrix::from_2d_array(&[ - &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], - &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], - &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], - &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], - ]); - - (orig, oh_enc) - } - - #[test] - fn hash_encode_f64_series() { - let series = vec![3.0, 1.0, 2.0, 1.0]; - let hashable_series: Vec = - series.iter().map(|v| v.to_category()).collect(); - let enc = CategoryMapper::from_positional_category_vec(hashable_series); - let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); - let orig_val: f64 = inv.unwrap().into(); - 
assert_eq!(orig_val, 2.0); - } - #[test] - fn test_fit() { - let (x, _) = build_fake_matrix(); - let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - assert_eq!(oh_enc.category_mappers.len(), 2); - - let num_cat: Vec = oh_enc - .category_mappers - .iter() - .map(|a| a.num_categories()) - .collect(); - assert_eq!(num_cat, vec![2, 4]); - } - - #[test] - fn matrix_transform_test() { - let (x, expected_x) = build_fake_matrix(); - let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); - - let (x, expected_x) = build_cat_first_and_last(); - let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); - } - - #[test] - fn fail_on_bad_category() { - let m = DenseMatrix::from_2d_array(&[ - &[1.0, 1.5, 3.0], - &[2.0, 1.5, 4.0], - &[1.0, 1.5, 5.0], - &[2.0, 1.5, 6.0], - ]); - - let params = OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEncoder::fit(&m, params) { - Err(_) => { - assert!(true); - } - _ => assert!(false), - } - } -} From 4af69878e01ab2abc88433573ce52d4473b8c871 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 16 Feb 2021 18:19:14 -0400 Subject: [PATCH 73/78] fix: Fix new clippy warnings (#79) * Fix new clippy warnings * Allow clippy::suspicious-operation-groupings --- src/lib.rs | 3 ++- src/linalg/mod.rs | 3 ++- src/linalg/stats.rs | 2 +- src/linear/lasso_optimizer.rs | 2 +- src/optimization/first_order/lbfgs.rs | 1 + 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6e6205f..c7c99c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,8 @@ #![allow( clippy::type_complexity, clippy::too_many_arguments, - clippy::many_single_char_names + clippy::many_single_char_names, + clippy::unnecessary_wraps )] 
#![warn(missing_docs)] #![warn(missing_doc_code_examples)] diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 264815b..cadbc3a 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -1,3 +1,4 @@ +#![allow(clippy::wrong_self_convention)] //! # Linear Algebra and Matrix Decomposition //! //! Most machine learning algorithms in SmartCore depend on linear algebra and matrix decomposition methods from this module. @@ -265,7 +266,7 @@ pub trait BaseVector: Clone + Debug { sum += xi * xi; } mu /= div; - sum / div - mu * mu + sum / div - mu.powi(2) } /// Computes the standard deviation. fn std(&self) -> T { diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index 45a17af..5a1dd38 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -61,7 +61,7 @@ pub trait MatrixStats: BaseMatrix { sum += a * a; } mu /= div; - *x_i = sum / div - mu * mu; + *x_i = sum / div - mu.powi(2); } x diff --git a/src/linear/lasso_optimizer.rs b/src/linear/lasso_optimizer.rs index 4f5011f..c4340fc 100644 --- a/src/linear/lasso_optimizer.rs +++ b/src/linear/lasso_optimizer.rs @@ -138,7 +138,7 @@ impl> InteriorPointOptimizer { for i in 0..p { self.prb[i] = T::two() + self.d1[i]; - self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i] * self.d2[i]; + self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i].powi(2); } let normg = grad.norm2(); diff --git a/src/optimization/first_order/lbfgs.rs b/src/optimization/first_order/lbfgs.rs index 5dedfe6..322df03 100644 --- a/src/optimization/first_order/lbfgs.rs +++ b/src/optimization/first_order/lbfgs.rs @@ -1,3 +1,4 @@ +#![allow(clippy::suspicious_operation_groupings)] use std::default::Default; use std::fmt::Debug; From a30802ec438cc9da1b439e9897f0de4fa884a5ec Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 16 Feb 2021 22:20:02 -0400 Subject: [PATCH 74/78] fix: Change to compile for wasm32-unknown-unknown target (#80) --- src/dataset/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataset/mod.rs 
b/src/dataset/mod.rs index da790b4..31a12cf 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -83,7 +83,7 @@ pub(crate) fn deserialize_data( ) -> Result<(Vec, Vec, usize, usize), io::Error> { // read the same file back into a Vec of bytes let (num_samples, num_features) = { - let mut buffer = [0u8; 8]; + let mut buffer = [0u8; if cfg!(target_arch = "wasm32") { 4 } else { 8 }]; buffer.copy_from_slice(&bytes[0..8]); let num_features = usize::from_le_bytes(buffer); buffer.copy_from_slice(&bytes[8..16]); From 4fb2625a337646fc01dd904d817f2622572c54bf Mon Sep 17 00:00:00 2001 From: Chris McComb Date: Wed, 17 Feb 2021 21:22:06 -0500 Subject: [PATCH 75/78] Implemented make_moons generator per https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/datasets/_samples_generator.py#L683 --- src/dataset/generator.rs | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index 28a2224..4d454af 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -88,6 +88,44 @@ pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset Dataset { + + let num_samples_out = num_samples / 2; + let num_samples_in = num_samples - num_samples_out; + + let linspace_out = linspace(0.0, std::f32::consts::PI, num_samples_out); + let linspace_in = linspace(0.0, std::f32::consts::PI, num_samples_in); + + let noise = Normal::new(0.0, noise).unwrap(); + let mut rng = rand::thread_rng(); + + let mut x: Vec = Vec::with_capacity(num_samples * 2); + let mut y: Vec = Vec::with_capacity(num_samples); + + for v in linspace_out { + x.push(v.cos() + noise.sample(&mut rng)); + x.push(v.sin() + noise.sample(&mut rng)); + y.push(0.0); + } + + for v in linspace_in { + x.push(1.0 - v.cos() + noise.sample(&mut rng)); + x.push(1.0 - v.sin() + noise.sample(&mut rng) - 0.5); + y.push(1.0); + } + + Dataset { + data: x, + target: y, + num_samples, + num_features: 2, + feature_names: 
(0..2).map(|n| n.to_string()).collect(), + target_names: vec!["label".to_string()], + description: "Two interleaving half circles in 2d".to_string(), + } +} + fn linspace(start: f32, stop: f32, num: usize) -> Vec { let div = num as f32; let delta = stop - start; @@ -123,4 +161,17 @@ mod tests { assert_eq!(dataset.num_features, 2); assert_eq!(dataset.num_samples, 10); } + + #[test] + fn test_make_moons() { + let dataset = make_moons(100, 0.05); + println!("{:?}", dataset.data); + assert_eq!( + dataset.data.len(), + dataset.num_features * dataset.num_samples + ); + assert_eq!(dataset.target.len(), dataset.num_samples); + assert_eq!(dataset.num_features, 2); + assert_eq!(dataset.num_samples, 10); + } } From 483a21bec06269e277eded5a1525d4ce7b3a2648 Mon Sep 17 00:00:00 2001 From: Chris McComb Date: Wed, 17 Feb 2021 21:22:41 -0500 Subject: [PATCH 76/78] Oops, test was failing due to typo. Fixed now. --- src/dataset/generator.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index 4d454af..4367308 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -164,8 +164,7 @@ mod tests { #[test] fn test_make_moons() { - let dataset = make_moons(100, 0.05); - println!("{:?}", dataset.data); + let dataset = make_moons(10, 0.05); assert_eq!( dataset.data.len(), dataset.num_features * dataset.num_samples From fed11f005c4c5acf6194dbca8c517895208b7fa4 Mon Sep 17 00:00:00 2001 From: Chris McComb Date: Wed, 17 Feb 2021 21:29:51 -0500 Subject: [PATCH 77/78] Fixed formatting to pass cargo format check. 
--- src/dataset/generator.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index 4367308..39299a5 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -90,7 +90,6 @@ pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset Dataset { - let num_samples_out = num_samples / 2; let num_samples_in = num_samples - num_samples_out; From 1b42f8a396f52d1df77c8d6773c78afc2951a827 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Thu, 25 Feb 2021 15:44:34 -0400 Subject: [PATCH 78/78] feat: Add getters for naive bayes structs (#74) * feat: Add getters for GaussianNB * Add classes getter to BernoulliNB Add classes getter to CategoricalNB Add classes getter to MultinomialNB * Add feature_log_prob getter to MultinomialNB * Add class_count to NB structs * Add n_features getter for NB * Add feature_count to MultinomialNB and BernoulliNB * Add n_categories to CategoricalNB * Implement feature_log_prob and category_count getter for CategoricalNB * Implement feature_log_prob for BernoulliNB --- src/naive_bayes/bernoulli.rs | 144 +++++++++++++++++++++++++++---- src/naive_bayes/categorical.rs | 153 ++++++++++++++++++++++++++++----- src/naive_bayes/gaussian.rs | 78 ++++++++++++----- src/naive_bayes/multinomial.rs | 122 ++++++++++++++++++++++---- 4 files changed, 420 insertions(+), 77 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 6a7d0b4..286a4a5 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -47,12 +47,44 @@ use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Bearnoulli features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Debug, PartialEq)] +#[derive(Debug)] struct BernoulliNBDistribution { /// class labels known to the classifier class_labels: Vec, + /// number of training samples observed in each class + class_count: Vec, + /// probability of each class class_priors: Vec, - 
feature_prob: Vec>, + /// Number of samples encountered for each (class, feature) + feature_count: Vec>, + /// probability of features per class + feature_log_prob: Vec>, + /// Number of features of each sample + n_features: usize, +} + +impl PartialEq for BernoulliNBDistribution { + fn eq(&self, other: &Self) -> bool { + if self.class_labels == other.class_labels + && self.class_count == other.class_count + && self.class_priors == other.class_priors + && self.feature_count == other.feature_count + && self.n_features == other.n_features + { + for (a, b) in self + .feature_log_prob + .iter() + .zip(other.feature_log_prob.iter()) + { + if !a.approximate_eq(b, T::epsilon()) { + return false; + } + } + true + } else { + false + } + } } impl> NBDistribution for BernoulliNBDistribution { @@ -65,9 +97,9 @@ impl> NBDistribution for BernoulliNBDistributi for feature in 0..j.len() { let value = j.get(feature); if value == T::one() { - likelihood += self.feature_prob[class_index][feature].ln(); + likelihood += self.feature_log_prob[class_index][feature]; } else { - likelihood += (T::one() - self.feature_prob[class_index][feature]).ln(); + likelihood += (T::one() - self.feature_log_prob[class_index][feature].exp()).ln(); } } likelihood @@ -157,10 +189,10 @@ impl BernoulliNBDistribution { let y = y.to_vec(); let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); - let mut class_count = vec![T::zero(); class_labels.len()]; + let mut class_count = vec![0_usize; class_labels.len()]; for class_index in indices.iter() { - class_count[*class_index] += T::one(); + class_count[*class_index] += 1; } let class_priors = if let Some(class_priors) = priors { @@ -173,25 +205,35 @@ impl BernoulliNBDistribution { } else { class_count .iter() - .map(|&c| c / T::from(n_samples).unwrap()) + .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap()) .collect() }; - let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + let mut 
feature_in_class_counter = vec![vec![0_usize; n_features]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices) { for (idx, row_i) in row.iter().enumerate().take(n_features) { - feature_in_class_counter[class_index][idx] += *row_i; + feature_in_class_counter[class_index][idx] += + row_i.to_usize().ok_or_else(|| { + Failed::fit(&format!( + "Elements of the matrix should be 1.0 or 0.0 |found|=[{}]", + row_i + )) + })?; } } - let feature_prob = feature_in_class_counter + let feature_log_prob = feature_in_class_counter .iter() .enumerate() .map(|(class_index, feature_count)| { feature_count .iter() - .map(|&count| (count + alpha) / (class_count[class_index] + alpha * T::two())) + .map(|&count| { + ((T::from(count).unwrap() + alpha) + / (T::from(class_count[class_index]).unwrap() + alpha * T::two())) + .ln() + }) .collect() }) .collect(); @@ -199,7 +241,10 @@ impl BernoulliNBDistribution { Ok(Self { class_labels, class_priors, - feature_prob, + class_count, + feature_count: feature_in_class_counter, + feature_log_prob, + n_features, }) } } @@ -266,6 +311,34 @@ impl> BernoulliNB { self.inner.predict(x) } } + + /// Class labels known to the classifier. + /// Returns a vector of size n_classes. + pub fn classes(&self) -> &Vec { + &self.inner.distribution.class_labels + } + + /// Number of training samples observed in each class. + /// Returns a vector of size n_classes. 
+ pub fn class_count(&self) -> &Vec { + &self.inner.distribution.class_count + } + + /// Number of features of each sample + pub fn n_features(&self) -> usize { + self.inner.distribution.n_features + } + + /// Number of samples encountered for each (class, feature) + /// Returns a 2d vector of shape (n_classes, n_features) + pub fn feature_count(&self) -> &Vec> { + &self.inner.distribution.feature_count + } + + /// Empirical log probability of features given a class + pub fn feature_log_prob(&self) -> &Vec> { + &self.inner.distribution.feature_log_prob + } } #[cfg(test)] @@ -296,10 +369,24 @@ mod tests { assert_eq!(bnb.inner.distribution.class_priors, &[0.75, 0.25]); assert_eq!( - bnb.inner.distribution.feature_prob, + bnb.feature_log_prob(), &[ - &[0.4, 0.8, 0.2, 0.4, 0.4, 0.2], - &[1. / 3.0, 2. / 3.0, 2. / 3.0, 1. / 3.0, 1. / 3.0, 2. / 3.0] + &[ + -0.916290731874155, + -0.2231435513142097, + -1.6094379124341003, + -0.916290731874155, + -0.916290731874155, + -1.6094379124341003 + ], + &[ + -1.0986122886681098, + -0.40546510810816444, + -0.40546510810816444, + -1.0986122886681098, + -1.0986122886681098, + -0.40546510810816444 + ] ] ); @@ -335,13 +422,36 @@ mod tests { let y_hat = bnb.predict(&x).unwrap(); + assert_eq!(bnb.classes(), &[0., 1., 2.]); + assert_eq!(bnb.class_count(), &[7, 3, 5]); + assert_eq!(bnb.n_features(), 10); + assert_eq!( + bnb.feature_count(), + &[ + &[5, 6, 6, 7, 6, 4, 6, 7, 7, 7], + &[3, 3, 3, 1, 3, 2, 3, 2, 2, 3], + &[4, 4, 3, 4, 5, 2, 4, 5, 3, 4] + ] + ); + assert!(bnb .inner .distribution .class_priors .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); - assert!(bnb.inner.distribution.feature_prob[1].approximate_eq( - &vec!(0.8, 0.8, 0.8, 0.4, 0.8, 0.6, 0.8, 0.6, 0.6, 0.8), + assert!(bnb.feature_log_prob()[1].approximate_eq( + &vec![ + -0.22314355, + -0.22314355, + -0.22314355, + -0.91629073, + -0.22314355, + -0.51082562, + -0.22314355, + -0.51082562, + -0.51082562, + -0.22314355 + ], 1e-1 )); assert!(y_hat.approximate_eq( diff --git 
a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index 2161528..e308a01 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -43,14 +43,31 @@ use serde::{Deserialize, Serialize}; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] struct CategoricalNBDistribution { + /// number of training samples observed in each class + class_count: Vec, + /// class labels known to the classifier class_labels: Vec, + /// probability of each class class_priors: Vec, coefficients: Vec>>, + /// Number of features of each sample + n_features: usize, + /// Number of categories for each feature + n_categories: Vec, + /// Holds arrays of shape (n_classes, n_categories of respective feature) + /// for each feature. Each array provides the number of samples + /// encountered for each class and category of the specific feature. + category_count: Vec>>, } impl PartialEq for CategoricalNBDistribution { fn eq(&self, other: &Self) -> bool { - if self.class_labels == other.class_labels && self.class_priors == other.class_priors { + if self.class_labels == other.class_labels + && self.class_priors == other.class_priors + && self.n_features == other.n_features + && self.n_categories == other.n_categories + && self.class_count == other.class_count + { if self.coefficients.len() != other.coefficients.len() { return false; } @@ -90,8 +107,8 @@ impl> NBDistribution for CategoricalNBDistribu let mut likelihood = T::zero(); for feature in 0..j.len() { let value = j.get(feature).floor().to_usize().unwrap(); - if self.coefficients[class_index][feature].len() > value { - likelihood += self.coefficients[class_index][feature][value]; + if self.coefficients[feature][class_index].len() > value { + likelihood += self.coefficients[feature][class_index][value]; } else { return T::zero(); } @@ -149,12 +166,12 @@ impl CategoricalNBDistribution { let class_labels: Vec = (0..*y_max + 1) .map(|label| T::from(label).unwrap()) .collect(); - let 
mut classes_count: Vec = vec![T::zero(); class_labels.len()]; + let mut class_count = vec![0_usize; class_labels.len()]; for elem in y.iter() { - classes_count[*elem] += T::one(); + class_count[*elem] += 1; } - let mut feature_categories: Vec> = Vec::with_capacity(n_features); + let mut n_categories: Vec = Vec::with_capacity(n_features); for feature in 0..n_features { let feature_max = x .get_col_as_vec(feature) @@ -167,18 +184,15 @@ impl CategoricalNBDistribution { feature )) })?; - let feature_types = (0..feature_max + 1) - .map(|feat| T::from(feat).unwrap()) - .collect(); - feature_categories.push(feature_types); + n_categories.push(feature_max + 1); } let mut coefficients: Vec>> = Vec::with_capacity(class_labels.len()); - for (label, label_count) in class_labels.iter().zip(classes_count.iter()) { + let mut category_count: Vec>> = Vec::with_capacity(class_labels.len()); + for (feature_index, &n_categories_i) in n_categories.iter().enumerate().take(n_features) { let mut coef_i: Vec> = Vec::with_capacity(n_features); - for (feature_index, feature_options) in - feature_categories.iter().enumerate().take(n_features) - { + let mut category_count_i: Vec> = Vec::with_capacity(n_features); + for (label, &label_count) in class_labels.iter().zip(class_count.iter()) { let col = x .get_col_as_vec(feature_index) .iter() @@ -186,33 +200,41 @@ impl CategoricalNBDistribution { .filter(|(i, _j)| T::from(y[*i]).unwrap() == *label) .map(|(_, j)| *j) .collect::>(); - let mut feat_count: Vec = vec![T::zero(); feature_options.len()]; + let mut feat_count: Vec = vec![0_usize; n_categories_i]; for row in col.iter() { let index = row.floor().to_usize().unwrap(); - feat_count[index] += T::one(); + feat_count[index] += 1; } + let coef_i_j = feat_count .iter() .map(|c| { - ((*c + alpha) - / (*label_count + T::from(feature_options.len()).unwrap() * alpha)) + ((T::from(*c).unwrap() + alpha) + / (T::from(label_count).unwrap() + + T::from(n_categories_i).unwrap() * alpha)) .ln() }) 
.collect::>(); + category_count_i.push(feat_count); coef_i.push(coef_i_j); } + category_count.push(category_count_i); coefficients.push(coef_i); } - let class_priors = classes_count - .into_iter() - .map(|count| count / T::from(n_samples).unwrap()) + let class_priors = class_count + .iter() + .map(|&count| T::from(count).unwrap() / T::from(n_samples).unwrap()) .collect::>(); Ok(Self { + class_count, class_labels, class_priors, coefficients, + n_categories, + n_features, + category_count, }) } } @@ -287,6 +309,41 @@ impl> CategoricalNB { pub fn predict(&self, x: &M) -> Result { self.inner.predict(x) } + + /// Class labels known to the classifier. + /// Returns a vector of size n_classes. + pub fn classes(&self) -> &Vec { + &self.inner.distribution.class_labels + } + + /// Number of training samples observed in each class. + /// Returns a vector of size n_classes. + pub fn class_count(&self) -> &Vec { + &self.inner.distribution.class_count + } + + /// Number of features of each sample + pub fn n_features(&self) -> usize { + self.inner.distribution.n_features + } + + /// Number of features of each sample + pub fn n_categories(&self) -> &Vec { + &self.inner.distribution.n_categories + } + + /// Holds arrays of shape (n_classes, n_categories of respective feature) + /// for each feature. Each array provides the number of samples + /// encountered for each class and category of the specific feature. + pub fn category_count(&self) -> &Vec>> { + &self.inner.distribution.category_count + } + /// Holds arrays of shape (n_classes, n_categories of respective feature) + /// for each feature. Each array provides the empirical log probability + /// of categories given the respective feature and class, ``P(x_i|y)``. 
+ pub fn feature_log_prob(&self) -> &Vec>> { + &self.inner.distribution.coefficients + } } #[cfg(test)] @@ -315,6 +372,60 @@ mod tests { let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); + + // checking parity with scikit + assert_eq!(cnb.classes(), &[0., 1.]); + assert_eq!(cnb.class_count(), &[5, 9]); + assert_eq!(cnb.n_features(), 4); + assert_eq!(cnb.n_categories(), &[3, 3, 2, 2]); + assert_eq!( + cnb.category_count(), + &vec![ + vec![vec![3, 0, 2], vec![2, 4, 3]], + vec![vec![1, 2, 2], vec![3, 4, 2]], + vec![vec![1, 4], vec![6, 3]], + vec![vec![2, 3], vec![6, 3]] + ] + ); + + assert_eq!( + cnb.feature_log_prob(), + &vec![ + vec![ + vec![ + -0.6931471805599453, + -2.0794415416798357, + -0.9808292530117262 + ], + vec![ + -1.3862943611198906, + -0.8754687373538999, + -1.0986122886681098 + ] + ], + vec![ + vec![ + -1.3862943611198906, + -0.9808292530117262, + -0.9808292530117262 + ], + vec![ + -1.0986122886681098, + -0.8754687373538999, + -1.3862943611198906 + ] + ], + vec![ + vec![-1.252762968495368, -0.3364722366212129], + vec![-0.45198512374305727, -1.0116009116784799] + ], + vec![ + vec![-0.8472978603872037, -0.5596157879354228], + vec![-0.45198512374305727, -1.0116009116784799] + ] + ] + ); + let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]); let y_hat = cnb.predict(&x_test).unwrap(); assert_eq!(y_hat, vec![0., 1.]); diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 28c4785..00c7962 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -39,10 +39,12 @@ use serde::{Deserialize, Serialize}; struct GaussianNBDistribution { /// class labels known to the classifier class_labels: Vec, + /// number of training samples observed in each class + class_count: Vec, /// probability of each class. 
class_priors: Vec, /// variance of each feature per class - sigma: Vec>, + var: Vec>, /// mean of each feature per class theta: Vec>, } @@ -57,18 +59,14 @@ impl> NBDistribution for GaussianNBDistributio } fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { - if class_index < self.class_labels.len() { - let mut likelihood = T::zero(); - for feature in 0..j.len() { - let value = j.get(feature); - let mean = self.theta[class_index][feature]; - let variance = self.sigma[class_index][feature]; - likelihood += self.calculate_log_probability(value, mean, variance); - } - likelihood - } else { - T::zero() + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + let mean = self.theta[class_index][feature]; + let variance = self.var[class_index][feature]; + likelihood += self.calculate_log_probability(value, mean, variance); } + likelihood } fn classes(&self) -> &Vec { @@ -121,12 +119,12 @@ impl GaussianNBDistribution { let y = y.to_vec(); let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); - let mut class_count = vec![T::zero(); class_labels.len()]; + let mut class_count = vec![0_usize; class_labels.len()]; let mut subdataset: Vec>> = vec![vec![]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices.iter()) { - class_count[*class_index] += T::one(); + class_count[*class_index] += 1; subdataset[*class_index].push(row); } @@ -139,8 +137,8 @@ impl GaussianNBDistribution { class_priors } else { class_count - .into_iter() - .map(|c| c / T::from(n_samples).unwrap()) + .iter() + .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap()) .collect() }; @@ -157,15 +155,16 @@ impl GaussianNBDistribution { }) .collect(); - let (sigma, theta): (Vec>, Vec>) = subdataset + let (var, theta): (Vec>, Vec>) = subdataset .iter() .map(|data| (data.var(0), data.mean(0))) .unzip(); Ok(Self { class_labels, + class_count, class_priors, - sigma, + var, theta, }) } @@ -223,6 +222,36 @@ impl> 
GaussianNB { pub fn predict(&self, x: &M) -> Result { self.inner.predict(x) } + + /// Class labels known to the classifier. + /// Returns a vector of size n_classes. + pub fn classes(&self) -> &Vec { + &self.inner.distribution.class_labels + } + + /// Number of training samples observed in each class. + /// Returns a vector of size n_classes. + pub fn class_count(&self) -> &Vec { + &self.inner.distribution.class_count + } + + /// Probability of each class + /// Returns a vector of size n_classes. + pub fn class_priors(&self) -> &Vec { + &self.inner.distribution.class_priors + } + + /// Mean of each feature per class + /// Returns a 2d vector of shape (n_classes, n_features). + pub fn theta(&self) -> &Vec> { + &self.inner.distribution.theta + } + + /// Variance of each feature per class + /// Returns a 2d vector of shape (n_classes, n_features). + pub fn var(&self) -> &Vec> { + &self.inner.distribution.var + } } #[cfg(test)] @@ -245,18 +274,23 @@ mod tests { let gnb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); let y_hat = gnb.predict(&x).unwrap(); assert_eq!(y_hat, y); + + assert_eq!(gnb.classes(), &[1., 2.]); + + assert_eq!(gnb.class_count(), &[3, 3]); + assert_eq!( - gnb.inner.distribution.sigma, + gnb.var(), &[ &[0.666666666666667, 0.22222222222222232], &[0.666666666666667, 0.22222222222222232] ] ); - assert_eq!(gnb.inner.distribution.class_priors, &[0.5, 0.5]); + assert_eq!(gnb.class_priors(), &[0.5, 0.5]); assert_eq!( - gnb.inner.distribution.theta, + gnb.theta(), &[&[-2., -1.3333333333333333], &[2., 1.3333333333333333]] ); } @@ -277,7 +311,7 @@ mod tests { let parameters = GaussianNBParameters::default().with_priors(priors.clone()); let gnb = GaussianNB::fit(&x, &y, parameters).unwrap(); - assert_eq!(gnb.inner.distribution.class_priors, priors); + assert_eq!(gnb.class_priors(), &priors); } #[test] diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 06ee071..87e0ddd 100644 --- a/src/naive_bayes/multinomial.rs +++ 
b/src/naive_bayes/multinomial.rs @@ -51,8 +51,16 @@ use serde::{Deserialize, Serialize}; struct MultinomialNBDistribution { /// class labels known to the classifier class_labels: Vec, + /// number of training samples observed in each class + class_count: Vec, + /// probability of each class class_priors: Vec, - feature_prob: Vec>, + /// Empirical log probability of features given a class + feature_log_prob: Vec>, + /// Number of samples encountered for each (class, feature) + feature_count: Vec>, + /// Number of features of each sample + n_features: usize, } impl> NBDistribution for MultinomialNBDistribution { @@ -64,7 +72,7 @@ impl> NBDistribution for MultinomialNBDistribu let mut likelihood = T::zero(); for feature in 0..j.len() { let value = j.get(feature); - likelihood += value * self.feature_prob[class_index][feature].ln(); + likelihood += value * self.feature_log_prob[class_index][feature]; } likelihood } @@ -144,10 +152,10 @@ impl MultinomialNBDistribution { let y = y.to_vec(); let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); - let mut class_count = vec![T::zero(); class_labels.len()]; + let mut class_count = vec![0_usize; class_labels.len()]; for class_index in indices.iter() { - class_count[*class_index] += T::one(); + class_count[*class_index] += 1; } let class_priors = if let Some(class_priors) = priors { @@ -160,33 +168,46 @@ impl MultinomialNBDistribution { } else { class_count .iter() - .map(|&c| c / T::from(n_samples).unwrap()) + .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap()) .collect() }; - let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + let mut feature_in_class_counter = vec![vec![0_usize; n_features]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices) { for (idx, row_i) in row.iter().enumerate().take(n_features) { - feature_in_class_counter[class_index][idx] += *row_i; + feature_in_class_counter[class_index][idx] += + 
row_i.to_usize().ok_or_else(|| { + Failed::fit(&format!( + "Elements of the matrix should be convertible to usize |found|=[{}]", + row_i + )) + })?; } } - let feature_prob = feature_in_class_counter + let feature_log_prob = feature_in_class_counter .iter() .map(|feature_count| { - let n_c = feature_count.sum(); + let n_c: usize = feature_count.iter().sum(); feature_count .iter() - .map(|&count| (count + alpha) / (n_c + alpha * T::from(n_features).unwrap())) + .map(|&count| { + ((T::from(count).unwrap() + alpha) + / (T::from(n_c).unwrap() + alpha * T::from(n_features).unwrap())) + .ln() + }) .collect() }) .collect(); Ok(Self { + class_count, class_labels, class_priors, - feature_prob, + feature_log_prob, + feature_count: feature_in_class_counter, + n_features, }) } } @@ -240,6 +261,35 @@ impl> MultinomialNB { pub fn predict(&self, x: &M) -> Result { self.inner.predict(x) } + + /// Class labels known to the classifier. + /// Returns a vector of size n_classes. + pub fn classes(&self) -> &Vec { + &self.inner.distribution.class_labels + } + + /// Number of training samples observed in each class. + /// Returns a vector of size n_classes. + pub fn class_count(&self) -> &Vec { + &self.inner.distribution.class_count + } + + /// Empirical log probability of features given a class, P(x_i|y). 
+ /// Returns a 2d vector of shape (n_classes, n_features) + pub fn feature_log_prob(&self) -> &Vec> { + &self.inner.distribution.feature_log_prob + } + + /// Number of features of each sample + pub fn n_features(&self) -> usize { + self.inner.distribution.n_features + } + + /// Number of samples encountered for each (class, feature) + /// Returns a 2d vector of shape (n_classes, n_features) + pub fn feature_count(&self) -> &Vec> { + &self.inner.distribution.feature_count + } } #[cfg(test)] @@ -268,12 +318,29 @@ mod tests { let y = vec![0., 0., 0., 1.]; let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + assert_eq!(mnb.classes(), &[0., 1.]); + assert_eq!(mnb.class_count(), &[3, 1]); + assert_eq!(mnb.inner.distribution.class_priors, &[0.75, 0.25]); assert_eq!( - mnb.inner.distribution.feature_prob, + mnb.feature_log_prob(), &[ - &[1. / 7., 3. / 7., 1. / 14., 1. / 7., 1. / 7., 1. / 14.], - &[1. / 9., 2. / 9.0, 2. / 9.0, 1. / 9.0, 1. / 9.0, 2. / 9.0] + &[ + (1_f64 / 7_f64).ln(), + (3_f64 / 7_f64).ln(), + (1_f64 / 14_f64).ln(), + (1_f64 / 7_f64).ln(), + (1_f64 / 7_f64).ln(), + (1_f64 / 14_f64).ln() + ], + &[ + (1_f64 / 9_f64).ln(), + (2_f64 / 9_f64).ln(), + (2_f64 / 9_f64).ln(), + (1_f64 / 9_f64).ln(), + (1_f64 / 9_f64).ln(), + (2_f64 / 9_f64).ln() + ] ] ); @@ -307,6 +374,16 @@ mod tests { let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.]; let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + assert_eq!(nb.n_features(), 10); + assert_eq!( + nb.feature_count(), + &[ + &[12, 20, 11, 24, 12, 14, 13, 17, 13, 18], + &[9, 6, 9, 4, 7, 3, 8, 5, 4, 9], + &[10, 12, 9, 9, 11, 3, 9, 18, 10, 10] + ] + ); + let y_hat = nb.predict(&x).unwrap(); assert!(nb @@ -314,9 +391,20 @@ mod tests { .distribution .class_priors .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); - assert!(nb.inner.distribution.feature_prob[1].approximate_eq( - &vec!(0.07, 0.12, 0.07, 0.15, 0.07, 0.09, 0.08, 0.10, 0.08, 0.11), - 1e-1 + 
assert!(nb.feature_log_prob()[1].approximate_eq( + &vec![ + -2.00148, + -2.35815494, + -2.00148, + -2.69462718, + -2.22462355, + -2.91777073, + -2.10684052, + -2.51230562, + -2.69462718, + -2.00148 + ], + 1e-5 )); assert!(y_hat.approximate_eq( &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 2.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 2.0),