Implementation of Standard scaler (#143)
* docs: Fix typo in doc for categorical transformer. * feat: Add option to take a column from Matrix. I created the method `Matrix::take_column` that uses the `Matrix::take`-interface to extract a single column from a matrix. I need that feature in the implementation of `StandardScaler`. * feat: Add `StandardScaler`. Authored-by: titoeb <timtoebrock@googlemail.com>
This commit is contained in:
@@ -651,6 +651,10 @@ pub trait BaseMatrix<T: RealNumber>: Clone + Debug {
|
|||||||
|
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
/// Take an individual column from the matrix.
|
||||||
|
fn take_column(&self, column_index: usize) -> Self {
|
||||||
|
self.take(&[column_index], 1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Generic matrix with additional mixins like various factorization methods.
|
/// Generic matrix with additional mixins like various factorization methods.
|
||||||
@@ -761,4 +765,21 @@ mod tests {
|
|||||||
assert_eq!(m.take(&vec!(1, 1, 3), 0), expected_0);
|
assert_eq!(m.take(&vec!(1, 1, 3), 0), expected_0);
|
||||||
assert_eq!(m.take(&vec!(1, 0), 1), expected_1);
|
assert_eq!(m.take(&vec!(1, 0), 1), expected_1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn take_second_column_from_matrix() {
    // All rows are identical, so column index 1 holds only the value 1.0.
    let four_columns: DenseMatrix<f64> = DenseMatrix::from_2d_array(&[
        &[0.0, 1.0, 2.0, 3.0],
        &[0.0, 1.0, 2.0, 3.0],
        &[0.0, 1.0, 2.0, 3.0],
        &[0.0, 1.0, 2.0, 3.0],
    ]);
    let expected_column = DenseMatrix::from_2d_array(&[&[1.0], &[1.0], &[1.0], &[1.0]]);

    assert_eq!(
        four_columns.take_column(1),
        expected_column,
        "The second column was not extracted correctly"
    );
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
/// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents
|
/// Transform a data matrix by replacing all categorical variables with their one-hot vector equivalents
|
||||||
pub mod categorical;
|
pub mod categorical;
|
||||||
mod data_traits;
|
mod data_traits;
|
||||||
|
/// Preprocess numerical matrices.
|
||||||
|
pub mod numerical;
|
||||||
/// Encode a series (column, array) of categorical variables as one-hot vectors
|
/// Encode a series (column, array) of categorical variables as one-hot vectors
|
||||||
pub mod series_encoder;
|
pub mod series_encoder;
|
||||||
|
|||||||
@@ -0,0 +1,404 @@
|
|||||||
|
//! # Standard-Scaling For [RealNumber](../../math/num/trait.RealNumber.html) Matrices
|
||||||
|
//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by removing the mean and scaling to unit variance.
|
||||||
|
//!
|
||||||
|
//! ### Usage Example
|
||||||
|
//! ```
|
||||||
|
//! use smartcore::api::{Transformer, UnsupervisedEstimator};
|
||||||
|
//! use smartcore::linalg::naive::dense_matrix::DenseMatrix;
|
||||||
|
//! use smartcore::preprocessing::numerical;
|
||||||
|
//! let data = DenseMatrix::from_2d_vec(&vec![
|
||||||
|
//! vec![0.0, 0.0],
|
||||||
|
//! vec![0.0, 0.0],
|
||||||
|
//! vec![1.0, 1.0],
|
||||||
|
//! vec![1.0, 1.0],
|
||||||
|
//! ]);
|
||||||
|
//!
|
||||||
|
//! let standard_scaler =
|
||||||
|
//! numerical::StandardScaler::fit(&data, numerical::StandardScalerParameters::default())
|
||||||
|
//! .unwrap();
|
||||||
|
//! let transformed_data = standard_scaler.transform(&data).unwrap();
|
||||||
|
//! assert_eq!(
|
||||||
|
//! transformed_data,
|
||||||
|
//! DenseMatrix::from_2d_vec(&vec![
|
||||||
|
//! vec![-1.0, -1.0],
|
||||||
|
//! vec![-1.0, -1.0],
|
||||||
|
//! vec![1.0, 1.0],
|
||||||
|
//! vec![1.0, 1.0],
|
||||||
|
//! ])
|
||||||
|
//! );
|
||||||
|
//! ```
|
||||||
|
use crate::api::{Transformer, UnsupervisedEstimator};
|
||||||
|
use crate::error::{Failed, FailedError};
|
||||||
|
use crate::linalg::Matrix;
|
||||||
|
use crate::math::num::RealNumber;
|
||||||
|
|
||||||
|
/// Configure the behaviour of `StandardScaler`.
///
/// Both fields are public so that callers can opt out of either adjustment,
/// e.g. `StandardScalerParameters { with_mean: false, ..Default::default() }`.
#[derive(Clone, Debug, Copy, Eq, PartialEq)]
pub struct StandardScalerParameters {
    /// Optionally adjust the mean to be zero.
    pub with_mean: bool,
    /// Optionally adjust the standard deviation to be one.
    pub with_std: bool,
}

impl Default for StandardScalerParameters {
    /// By default both the mean and the standard deviation are adjusted.
    fn default() -> Self {
        Self {
            with_mean: true,
            with_std: true,
        }
    }
}
|
||||||
|
|
||||||
|
/// With the `StandardScaler` data can be adjusted so
|
||||||
|
/// that every column has a mean of zero and a standard
|
||||||
|
/// deviation of one. This can improve model training for
|
||||||
|
/// scaling sensitive models like neural network or nearest
|
||||||
|
/// neighbors based models.
|
||||||
|
#[derive(Clone, Debug, Default, Eq, PartialEq)]
|
||||||
|
pub struct StandardScaler<T: RealNumber> {
|
||||||
|
means: Vec<T>,
|
||||||
|
stds: Vec<T>,
|
||||||
|
parameters: StandardScalerParameters,
|
||||||
|
}
|
||||||
|
impl<T: RealNumber> StandardScaler<T> {
|
||||||
|
/// When the mean should be adjusted, the column mean
|
||||||
|
/// should be kept. Otherwise, replace it by zero.
|
||||||
|
fn adjust_column_mean(&self, mean: T) -> T {
|
||||||
|
if self.parameters.with_mean {
|
||||||
|
mean
|
||||||
|
} else {
|
||||||
|
T::zero()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// When the standard-deviation should be adjusted, the column
|
||||||
|
/// standard-deviation should be kept. Otherwise, replace it by one.
|
||||||
|
fn adjust_column_std(&self, std: T) -> T {
|
||||||
|
if self.parameters.with_std {
|
||||||
|
ensure_std_valid(std)
|
||||||
|
} else {
|
||||||
|
T::one()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Make sure the standard deviation is valid. If it is
|
||||||
|
/// negative or zero, it should replaced by the smallest
|
||||||
|
/// positive value the type can have. That way we can savely
|
||||||
|
/// divide the columns with the resulting scalar.
|
||||||
|
fn ensure_std_valid<T: RealNumber>(value: T) -> T {
|
||||||
|
value.max(T::min_positive_value())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// During `fit` the `StandardScaler` computes the column means and standard deviation.
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> UnsupervisedEstimator<M, StandardScalerParameters>
|
||||||
|
for StandardScaler<T>
|
||||||
|
{
|
||||||
|
fn fit(x: &M, parameters: StandardScalerParameters) -> Result<Self, Failed> {
|
||||||
|
Ok(Self {
|
||||||
|
means: x.column_mean(),
|
||||||
|
stds: x.std(0),
|
||||||
|
parameters,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// During `transform` the `StandardScaler` applies the summary statistics
|
||||||
|
/// computed during `fit` to set the mean of each column to zero and the
|
||||||
|
/// standard deviation to one.
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Transformer<M> for StandardScaler<T> {
|
||||||
|
fn transform(&self, x: &M) -> Result<M, Failed> {
|
||||||
|
let (_, n_cols) = x.shape();
|
||||||
|
if n_cols != self.means.len() {
|
||||||
|
return Err(Failed::because(
|
||||||
|
FailedError::TransformFailed,
|
||||||
|
&format!(
|
||||||
|
"Expected {} columns, but got {} columns instead.",
|
||||||
|
self.means.len(),
|
||||||
|
n_cols,
|
||||||
|
),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(build_matrix_from_columns(
|
||||||
|
self.means
|
||||||
|
.iter()
|
||||||
|
.zip(self.stds.iter())
|
||||||
|
.enumerate()
|
||||||
|
.map(|(column_index, (column_mean, column_std))| {
|
||||||
|
x.take_column(column_index)
|
||||||
|
.sub_scalar(self.adjust_column_mean(*column_mean))
|
||||||
|
.div_scalar(self.adjust_column_std(*column_std))
|
||||||
|
})
|
||||||
|
.collect(),
|
||||||
|
)
|
||||||
|
.unwrap())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// From a collection of matrices, that contain columns, construct
|
||||||
|
/// a matrix by stacking the columns horizontally.
|
||||||
|
fn build_matrix_from_columns<T, M>(columns: Vec<M>) -> Option<M>
|
||||||
|
where
|
||||||
|
T: RealNumber,
|
||||||
|
M: Matrix<T>,
|
||||||
|
{
|
||||||
|
if let Some(output_matrix) = columns.first().cloned() {
|
||||||
|
return Some(
|
||||||
|
columns
|
||||||
|
.iter()
|
||||||
|
.skip(1)
|
||||||
|
.fold(output_matrix, |current_matrix, new_colum| {
|
||||||
|
current_matrix.h_stack(new_colum)
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
|
||||||
|
mod helper_functionality {
|
||||||
|
use super::super::{build_matrix_from_columns, ensure_std_valid};
|
||||||
|
use crate::linalg::naive::dense_matrix::DenseMatrix;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn combine_three_columns() {
|
||||||
|
assert_eq!(
|
||||||
|
build_matrix_from_columns(vec![
|
||||||
|
DenseMatrix::from_2d_vec(&vec![vec![1.0], vec![1.0], vec![1.0],]),
|
||||||
|
DenseMatrix::from_2d_vec(&vec![vec![2.0], vec![2.0], vec![2.0],]),
|
||||||
|
DenseMatrix::from_2d_vec(&vec![vec![3.0], vec![3.0], vec![3.0],])
|
||||||
|
]),
|
||||||
|
Some(DenseMatrix::from_2d_vec(&vec![
|
||||||
|
vec![1.0, 2.0, 3.0],
|
||||||
|
vec![1.0, 2.0, 3.0],
|
||||||
|
vec![1.0, 2.0, 3.0]
|
||||||
|
]))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn negative_value_should_be_replace_with_minimal_positive_value() {
|
||||||
|
assert_eq!(ensure_std_valid(-1.0), f64::MIN_POSITIVE)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn zero_should_be_replace_with_minimal_positive_value() {
|
||||||
|
assert_eq!(ensure_std_valid(0.0), f64::MIN_POSITIVE)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mod standard_scaler {
|
||||||
|
use super::super::{StandardScaler, StandardScalerParameters};
|
||||||
|
use crate::api::{Transformer, UnsupervisedEstimator};
|
||||||
|
use crate::linalg::naive::dense_matrix::DenseMatrix;
|
||||||
|
use crate::linalg::BaseMatrix;
|
||||||
|
|
||||||
|
/// With `with_mean` enabled the column mean is passed through untouched.
#[test]
fn dont_adjust_mean_if_used() {
    let scaler: StandardScaler<f64> = StandardScaler {
        means: Vec::new(),
        stds: Vec::new(),
        parameters: StandardScalerParameters {
            with_mean: true,
            with_std: true,
        },
    };

    assert_eq!(scaler.adjust_column_mean(1.0), 1.0)
}
|
||||||
|
/// With `with_mean` disabled the column mean is replaced by zero.
#[test]
fn replace_mean_with_zero_if_not_used() {
    let scaler: StandardScaler<f64> = StandardScaler {
        means: Vec::new(),
        stds: Vec::new(),
        parameters: StandardScalerParameters {
            with_mean: false,
            with_std: true,
        },
    };

    assert_eq!(scaler.adjust_column_mean(1.0), 0.0)
}
|
||||||
|
/// With `with_std` enabled a valid standard deviation is passed through untouched.
#[test]
fn dont_adjust_std_if_used() {
    let scaler: StandardScaler<f64> = StandardScaler {
        means: Vec::new(),
        stds: Vec::new(),
        parameters: StandardScalerParameters {
            with_mean: true,
            with_std: true,
        },
    };

    assert_eq!(scaler.adjust_column_std(10.0), 10.0)
}
|
||||||
|
/// With `with_std` disabled the standard deviation is replaced by one.
#[test]
fn replace_std_with_one_if_not_used() {
    let scaler: StandardScaler<f64> = StandardScaler {
        means: Vec::new(),
        stds: Vec::new(),
        parameters: StandardScalerParameters {
            with_mean: true,
            with_std: false,
        },
    };

    assert_eq!(scaler.adjust_column_std(10.0), 1.0)
}
|
||||||
|
|
||||||
|
/// Helper function to apply fit as well as transform at the same time.
|
||||||
|
fn fit_transform_with_default_standard_scaler(
|
||||||
|
values_to_be_transformed: &DenseMatrix<f64>,
|
||||||
|
) -> DenseMatrix<f64> {
|
||||||
|
StandardScaler::fit(
|
||||||
|
values_to_be_transformed,
|
||||||
|
StandardScalerParameters::default(),
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
.transform(values_to_be_transformed)
|
||||||
|
.unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fit transform with random generated values, expected values taken from
/// sklearn.
#[test]
fn fit_transform_random_values() {
    let transformed_values =
        fit_transform_with_default_standard_scaler(&DenseMatrix::from_2d_array(&[
            &[0.1004222429, 0.2194113576, 0.9310663354, 0.3313593793],
            &[0.2045493861, 0.1683865411, 0.5071506765, 0.7257355264],
            &[0.5708488802, 0.1846414616, 0.9590802982, 0.5591871046],
            &[0.8387612750, 0.5754861361, 0.5537109852, 0.1077646442],
        ]));
    let expected_values = DenseMatrix::from_2d_array(&[
        &[-1.1154020653, -0.4031985330, 0.9284605204, -0.4271473866],
        &[-0.7615464283, -0.7076698384, -1.1075452562, 1.2632979631],
        &[0.4832504303, -0.6106747444, 1.0630075435, 0.5494084257],
        &[1.3936980634, 1.7215431158, -0.8839228078, -1.3855590021],
    ]);

    // The reference values carry ten decimals, so the tolerance only has to
    // absorb their rounding; a tolerance of 1.0 would accept almost any
    // standardized output and verify nothing.
    assert!(
        transformed_values.approximate_eq(&expected_values, 1e-8),
        "Scaled values deviate from the reference values"
    )
}
|
||||||
|
|
||||||
|
/// A column whose values are all equal has zero variance; `fit` followed
/// by `transform` maps every entry of such a column to zero.
#[test]
fn fit_transform_with_zero_variance() {
    let constant_column = DenseMatrix::from_2d_array(&[&[1.0], &[1.0], &[1.0], &[1.0]]);
    let all_zeros = DenseMatrix::from_2d_array(&[&[0.0], &[0.0], &[0.0], &[0.0]]);

    assert_eq!(
        fit_transform_with_default_standard_scaler(&constant_column),
        all_zeros,
        "When scaling values with zero variance, zero is expected as return value"
    )
}
|
||||||
|
|
||||||
|
/// `fit` on columns whose mean and standard deviation are easy to state exactly.
#[test]
fn fit_for_simple_values() {
    let input = DenseMatrix::from_2d_array(&[
        &[1.0, 1.0, 1.0],
        &[1.0, 2.0, 5.0],
        &[1.0, 1.0, 1.0],
        &[1.0, 2.0, 5.0],
    ]);
    let default_parameters = StandardScalerParameters {
        with_mean: true,
        with_std: true,
    };

    assert_eq!(
        StandardScaler::fit(&input, StandardScalerParameters::default()),
        Ok(StandardScaler {
            means: vec![1.0, 1.5, 3.0],
            stds: vec![0.0, 0.5, 2.0],
            parameters: default_parameters
        })
    )
}
|
||||||
|
/// Test `fit` for random generated values.
#[test]
fn fit_for_random_values() {
    let fitted_scaler = StandardScaler::fit(
        &DenseMatrix::from_2d_array(&[
            &[0.1004222429, 0.2194113576, 0.9310663354, 0.3313593793],
            &[0.2045493861, 0.1683865411, 0.5071506765, 0.7257355264],
            &[0.5708488802, 0.1846414616, 0.9590802982, 0.5591871046],
            &[0.8387612750, 0.5754861361, 0.5537109852, 0.1077646442],
        ]),
        StandardScalerParameters::default(),
    )
    .unwrap();

    assert_eq!(
        fitted_scaler.means,
        vec![0.42864544605, 0.2869813741, 0.737752073825, 0.431011663625],
    );

    // Wrap the fitted standard deviations in a one-row matrix so that they
    // can be compared against the reference values with a tolerance.
    let fitted_stds = DenseMatrix::from_2d_vec(&vec![fitted_scaler.stds]);
    let expected_stds = DenseMatrix::from_2d_array(&[&[
        0.29426447500954,
        0.16758497615485,
        0.20820945786863,
        0.23329718831165,
    ]]);

    assert!(fitted_stds.approximate_eq(&expected_stds, 0.00000000000001))
}
|
||||||
|
|
||||||
|
/// With `with_std` disabled only the mean is removed; the values are
/// not divided by the stored standard deviations.
#[test]
fn transform_without_std() {
    let parameters = StandardScalerParameters {
        with_mean: true,
        with_std: false,
    };
    let standard_scaler = StandardScaler {
        means: vec![1.0, 3.0],
        stds: vec![1.0, 2.0],
        parameters,
    };
    let input = DenseMatrix::from_2d_array(&[&[0.0, 2.0], &[2.0, 4.0]]);
    let expected = DenseMatrix::from_2d_array(&[&[-1.0, -1.0], &[1.0, 1.0]]);

    assert_eq!(standard_scaler.transform(&input), Ok(expected))
}
|
||||||
|
|
||||||
|
/// With `with_mean` disabled the values are only divided by the stored
/// standard deviations; the stored means are not subtracted.
#[test]
fn transform_without_mean() {
    let parameters = StandardScalerParameters {
        with_mean: false,
        with_std: true,
    };
    let standard_scaler = StandardScaler {
        means: vec![1.0, 2.0],
        stds: vec![2.0, 3.0],
        parameters,
    };
    let input = DenseMatrix::from_2d_array(&[&[0.0, 9.0], &[4.0, 12.0]]);
    let expected = DenseMatrix::from_2d_array(&[&[0.0, 3.0], &[2.0, 4.0]]);

    assert_eq!(standard_scaler.transform(&input), Ok(expected))
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user