From df766eaf795455fa030682e82ed0813e62390690 Mon Sep 17 00:00:00 2001 From: Tim Toebrock <35797763+titoeb@users.noreply.github.com> Date: Fri, 26 Aug 2022 16:20:20 +0200 Subject: [PATCH] Implementation of Standard scaler (#143) * docs: Fix typo in doc for categorical transformer. * feat: Add option to take a column from Matrix. I created the method `Matrix::take_column` that uses the `Matrix::take`-interface to extract a single column from a matrix. I need that feature in the implementation of `StandardScaler`. * feat: Add `StandardScaler`. Authored-by: titoeb --- src/linalg/mod.rs | 21 ++ src/preprocessing/mod.rs | 4 +- src/preprocessing/numerical.rs | 404 +++++++++++++++++++++++++++++++++ 3 files changed, 428 insertions(+), 1 deletion(-) create mode 100644 src/preprocessing/numerical.rs diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 59b6089..8e27c0b 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -651,6 +651,10 @@ pub trait BaseMatrix: Clone + Debug { result } + /// Take an individual column from the matrix. + fn take_column(&self, column_index: usize) -> Self { + self.take(&[column_index], 1) + } } /// Generic matrix with additional mixins like various factorization methods. @@ -761,4 +765,21 @@ mod tests { assert_eq!(m.take(&vec!(1, 1, 3), 0), expected_0); assert_eq!(m.take(&vec!(1, 0), 1), expected_1); } + + #[test] + fn take_second_column_from_matrix() { + let four_columns: DenseMatrix = DenseMatrix::from_2d_array(&[ + &[0.0, 1.0, 2.0, 3.0], + &[0.0, 1.0, 2.0, 3.0], + &[0.0, 1.0, 2.0, 3.0], + &[0.0, 1.0, 2.0, 3.0], + ]); + + let second_column = four_columns.take_column(1); + assert_eq!( + second_column, + DenseMatrix::from_2d_array(&[&[1.0], &[1.0], &[1.0], &[1.0]]), + "The second column was not extracted correctly" + ); + } } diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 32a0cfa..915fdab 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,5 +1,7 @@ -/// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents +/// Transform a data matrix by replacing all categorical variables with their one-hot vector equivalents pub mod categorical; mod data_traits; +/// Preprocess numerical matrices. +pub mod numerical; /// Encode a series (column, array) of categorical variables as one-hot vectors pub mod series_encoder; diff --git a/src/preprocessing/numerical.rs b/src/preprocessing/numerical.rs new file mode 100644 index 0000000..cc90b28 --- /dev/null +++ b/src/preprocessing/numerical.rs @@ -0,0 +1,404 @@ +//! # Standard-Scaling For [RealNumber](../../math/num/trait.RealNumber.html) Matricies +//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by removing the mean and scaling to unit variance. +//! +//! ### Usage Example +//! ``` +//! use smartcore::api::{Transformer, UnsupervisedEstimator}; +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::preprocessing::numerical; +//! let data = DenseMatrix::from_2d_vec(&vec![ +//! vec![0.0, 0.0], +//! vec![0.0, 0.0], +//! vec![1.0, 1.0], +//! vec![1.0, 1.0], +//! ]); +//! +//! let standard_scaler = +//! numerical::StandardScaler::fit(&data, numerical::StandardScalerParameters::default()) +//! .unwrap(); +//! let transformed_data = standard_scaler.transform(&data).unwrap(); +//! assert_eq!( +//! transformed_data, +//! DenseMatrix::from_2d_vec(&vec![ +//! vec![-1.0, -1.0], +//! vec![-1.0, -1.0], +//! vec![1.0, 1.0], +//! vec![1.0, 1.0], +//! ]) +//! ); +//! ``` +use crate::api::{Transformer, UnsupervisedEstimator}; +use crate::error::{Failed, FailedError}; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +/// Configure Behaviour of `StandardScaler`. +#[derive(Clone, Debug, Copy, Eq, PartialEq)] +pub struct StandardScalerParameters { + /// Optionaly adjust mean to be zero. + with_mean: bool, + /// Optionally adjust standard-deviation to be one. + with_std: bool, +} +impl Default for StandardScalerParameters { + fn default() -> Self { + StandardScalerParameters { + with_mean: true, + with_std: true, + } + } +} + +/// With the `StandardScaler` data can be adjusted so +/// that every column has a mean of zero and a standard +/// deviation of one. This can improve model training for +/// scaling sensitive models like neural network or nearest +/// neighbors based models. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct StandardScaler { + means: Vec, + stds: Vec, + parameters: StandardScalerParameters, +} +impl StandardScaler { + /// When the mean should be adjusted, the column mean + /// should be kept. Otherwise, replace it by zero. + fn adjust_column_mean(&self, mean: T) -> T { + if self.parameters.with_mean { + mean + } else { + T::zero() + } + } + /// When the standard-deviation should be adjusted, the column + /// standard-deviation should be kept. Otherwise, replace it by one. + fn adjust_column_std(&self, std: T) -> T { + if self.parameters.with_std { + ensure_std_valid(std) + } else { + T::one() + } + } +} + +/// Make sure the standard deviation is valid. If it is +/// negative or zero, it should replaced by the smallest +/// positive value the type can have. That way we can savely +/// divide the columns with the resulting scalar. +fn ensure_std_valid(value: T) -> T { + value.max(T::min_positive_value()) +} + +/// During `fit` the `StandardScaler` computes the column means and standard deviation. +impl> UnsupervisedEstimator + for StandardScaler +{ + fn fit(x: &M, parameters: StandardScalerParameters) -> Result { + Ok(Self { + means: x.column_mean(), + stds: x.std(0), + parameters, + }) + } +} + +/// During `transform` the `StandardScaler` applies the summary statistics +/// computed during `fit` to set the mean of each column to zero and the +/// standard deviation to one. +impl> Transformer for StandardScaler { + fn transform(&self, x: &M) -> Result { + let (_, n_cols) = x.shape(); + if n_cols != self.means.len() { + return Err(Failed::because( + FailedError::TransformFailed, + &format!( + "Expected {} columns, but got {} columns instead.", + self.means.len(), + n_cols, + ), + )); + } + + Ok(build_matrix_from_columns( + self.means + .iter() + .zip(self.stds.iter()) + .enumerate() + .map(|(column_index, (column_mean, column_std))| { + x.take_column(column_index) + .sub_scalar(self.adjust_column_mean(*column_mean)) + .div_scalar(self.adjust_column_std(*column_std)) + }) + .collect(), + ) + .unwrap()) + } +} + +/// From a collection of matrices, that contain columns, construct +/// a matrix by stacking the columns horizontally. +fn build_matrix_from_columns(columns: Vec) -> Option +where + T: RealNumber, + M: Matrix, +{ + if let Some(output_matrix) = columns.first().cloned() { + return Some( + columns + .iter() + .skip(1) + .fold(output_matrix, |current_matrix, new_colum| { + current_matrix.h_stack(new_colum) + }), + ); + } else { + None + } +} + +#[cfg(test)] +mod tests { + + mod helper_functionality { + use super::super::{build_matrix_from_columns, ensure_std_valid}; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn combine_three_columns() { + assert_eq!( + build_matrix_from_columns(vec![ + DenseMatrix::from_2d_vec(&vec![vec![1.0], vec![1.0], vec![1.0],]), + DenseMatrix::from_2d_vec(&vec![vec![2.0], vec![2.0], vec![2.0],]), + DenseMatrix::from_2d_vec(&vec![vec![3.0], vec![3.0], vec![3.0],]) + ]), + Some(DenseMatrix::from_2d_vec(&vec![ + vec![1.0, 2.0, 3.0], + vec![1.0, 2.0, 3.0], + vec![1.0, 2.0, 3.0] + ])) + ) + } + + #[test] + fn negative_value_should_be_replace_with_minimal_positive_value() { + assert_eq!(ensure_std_valid(-1.0), f64::MIN_POSITIVE) + } + + #[test] + fn zero_should_be_replace_with_minimal_positive_value() { + assert_eq!(ensure_std_valid(0.0), f64::MIN_POSITIVE) + } + } + mod standard_scaler { + use super::super::{StandardScaler, StandardScalerParameters}; + use crate::api::{Transformer, UnsupervisedEstimator}; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::linalg::BaseMatrix; + + #[test] + fn dont_adjust_mean_if_used() { + assert_eq!( + (StandardScaler { + means: vec![], + stds: vec![], + parameters: StandardScalerParameters { + with_mean: true, + with_std: true + } + }) + .adjust_column_mean(1.0), + 1.0 + ) + } + #[test] + fn replace_mean_with_zero_if_not_used() { + assert_eq!( + (StandardScaler { + means: vec![], + stds: vec![], + parameters: StandardScalerParameters { + with_mean: false, + with_std: true + } + }) + .adjust_column_mean(1.0), + 0.0 + ) + } + #[test] + fn dont_adjust_std_if_used() { + assert_eq!( + (StandardScaler { + means: vec![], + stds: vec![], + parameters: StandardScalerParameters { + with_mean: true, + with_std: true + } + }) + .adjust_column_std(10.0), + 10.0 + ) + } + #[test] + fn replace_std_with_one_if_not_used() { + assert_eq!( + (StandardScaler { + means: vec![], + stds: vec![], + parameters: StandardScalerParameters { + with_mean: true, + with_std: false + } + }) + .adjust_column_std(10.0), + 1.0 + ) + } + + /// Helper function to apply fit as well as transform at the same time. + fn fit_transform_with_default_standard_scaler( + values_to_be_transformed: &DenseMatrix, + ) -> DenseMatrix { + StandardScaler::fit( + values_to_be_transformed, + StandardScalerParameters::default(), + ) + .unwrap() + .transform(values_to_be_transformed) + .unwrap() + } + + /// Fit transform with random generated values, expected values taken from + /// sklearn. + #[test] + fn fit_transform_random_values() { + let transformed_values = + fit_transform_with_default_standard_scaler(&DenseMatrix::from_2d_array(&[ + &[0.1004222429, 0.2194113576, 0.9310663354, 0.3313593793], + &[0.2045493861, 0.1683865411, 0.5071506765, 0.7257355264], + &[0.5708488802, 0.1846414616, 0.9590802982, 0.5591871046], + &[0.8387612750, 0.5754861361, 0.5537109852, 0.1077646442], + ])); + println!("{}", transformed_values); + assert!(transformed_values.approximate_eq( + &DenseMatrix::from_2d_array(&[ + &[-1.1154020653, -0.4031985330, 0.9284605204, -0.4271473866], + &[-0.7615464283, -0.7076698384, -1.1075452562, 1.2632979631], + &[0.4832504303, -0.6106747444, 1.0630075435, 0.5494084257], + &[1.3936980634, 1.7215431158, -0.8839228078, -1.3855590021], + ]), + 1.0 + )) + } + + /// Test `fit` and `transform` for a column with zero variance. + #[test] + fn fit_transform_with_zero_variance() { + assert_eq!( + fit_transform_with_default_standard_scaler(&DenseMatrix::from_2d_array(&[ + &[1.0], + &[1.0], + &[1.0], + &[1.0] + ])), + DenseMatrix::from_2d_array(&[&[0.0], &[0.0], &[0.0], &[0.0]]), + "When scaling values with zero variance, zero is expected as return value" + ) + } + + /// Test `fit` for columns with nice summary statistics. + #[test] + fn fit_for_simple_values() { + assert_eq!( + StandardScaler::fit( + &DenseMatrix::from_2d_array(&[ + &[1.0, 1.0, 1.0], + &[1.0, 2.0, 5.0], + &[1.0, 1.0, 1.0], + &[1.0, 2.0, 5.0] + ]), + StandardScalerParameters::default(), + ), + Ok(StandardScaler { + means: vec![1.0, 1.5, 3.0], + stds: vec![0.0, 0.5, 2.0], + parameters: StandardScalerParameters { + with_mean: true, + with_std: true + } + }) + ) + } + /// Test `fit` for random generated values. + #[test] + fn fit_for_random_values() { + let fitted_scaler = StandardScaler::fit( + &DenseMatrix::from_2d_array(&[ + &[0.1004222429, 0.2194113576, 0.9310663354, 0.3313593793], + &[0.2045493861, 0.1683865411, 0.5071506765, 0.7257355264], + &[0.5708488802, 0.1846414616, 0.9590802982, 0.5591871046], + &[0.8387612750, 0.5754861361, 0.5537109852, 0.1077646442], + ]), + StandardScalerParameters::default(), + ) + .unwrap(); + + assert_eq!( + fitted_scaler.means, + vec![0.42864544605, 0.2869813741, 0.737752073825, 0.431011663625], + ); + + assert!( + &DenseMatrix::from_2d_vec(&vec![fitted_scaler.stds]).approximate_eq( + &DenseMatrix::from_2d_array(&[&[ + 0.29426447500954, + 0.16758497615485, + 0.20820945786863, + 0.23329718831165 + ],]), + 0.00000000000001 + ) + ) + } + + /// If `with_std` is set to `false` the values should not be + /// adjusted to have a std of one. + #[test] + fn transform_without_std() { + let standard_scaler = StandardScaler { + means: vec![1.0, 3.0], + stds: vec![1.0, 2.0], + parameters: StandardScalerParameters { + with_mean: true, + with_std: false, + }, + }; + + assert_eq!( + standard_scaler.transform(&DenseMatrix::from_2d_array(&[&[0.0, 2.0], &[2.0, 4.0]])), + Ok(DenseMatrix::from_2d_array(&[&[-1.0, -1.0], &[1.0, 1.0]])) + ) + } + + /// If `with_mean` is set to `false` the values should not be adjusted + /// to have a mean of zero. + #[test] + fn transform_without_mean() { + let standard_scaler = StandardScaler { + means: vec![1.0, 2.0], + stds: vec![2.0, 3.0], + parameters: StandardScalerParameters { + with_mean: false, + with_std: true, + }, + }; + + assert_eq!( + standard_scaler + .transform(&DenseMatrix::from_2d_array(&[&[0.0, 9.0], &[4.0, 12.0]])), + Ok(DenseMatrix::from_2d_array(&[&[0.0, 3.0], &[2.0, 4.0]])) + ) + } + } +}