From c987d39d439462e5abc12cf34276d8735afb1145 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:31:09 -0800 Subject: [PATCH] tests + force Categorizable be RealNumber --- src/preprocessing/categorical_encoders.rs | 138 +++++++++++++++++----- src/preprocessing/data_traits.rs | 4 +- 2 files changed, 114 insertions(+), 28 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 585f13a..063aa5c 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -1,6 +1,8 @@ //! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies //! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents //! +//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [SeriesOneHotEncoder](../series_encoder/struct.SeriesOneHotEncoder.html) +//! //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; @@ -22,25 +24,33 @@ //! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] //! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] //! ``` +use std::iter; use crate::error::Failed; -use crate::linalg::{BaseVector, Matrix}; -use crate::math::num::RealNumber; +use crate::linalg::Matrix; +use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; use crate::preprocessing::series_encoder::SeriesOneHotEncoder; -pub type HashableReal = u32; - -fn hashable_num(v: &T) -> HashableReal { - // gaxler: If first 32 bits are the same, assume numbers are the same for the categorical coercion - v.to_f32_bits() -} - +/// OneHotEncoder Parameters #[derive(Debug, Clone)] pub struct OneHotEncoderParams { - pub categorical_param_idxs: Option>, + /// Column number that contain categorical variable + pub col_idx_categorical: Option>, + /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables pub infer_categorical: bool, } + +impl OneHotEncoderParams { + /// Generate parameters from categorical variable column numbers + pub fn from_cat_idx(categorical_params: &[usize]) -> Self { + Self { + col_idx_categorical: Some(categorical_params.to_vec()), + infer_categorical: false, + } + } +} + /// Calculate the offset to parameters to due introduction of one-hot encoding fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { // This functions uses iterators and returns a vector. @@ -75,12 +85,14 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) .collect(); new_param_idxs } + fn validate_col_is_categorical(data: &Vec) -> bool { for v in data { if !v.is_valid() { return false} } true } + /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { series_encoders: Vec>, @@ -167,13 +179,13 @@ impl OneHotEncoder { // Bad value in a series causes in to be invalid // todo: proper error handling, so user can know where the bad value is return None; - } + } Some(v) => { // copy one hot vectors to their place in the data matrix; for (col_ofst, &val) in v.iter().enumerate() { res.set(row, cidx + col_ofst, val); - } - } + } + } } } } @@ -181,21 +193,93 @@ impl OneHotEncoder { } } - fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { - let (nrows, _) = data.shape(); - // let mut res: Vec> = Vec::with_capacity(idxs.len()); - let mut tmp_col: Vec = Vec::with_capacity(nrows); +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::preprocessing::series_encoder::SeriesOneHotEncoder; - let res: Vec> = idxs - .iter() - .map(|&idx| { - data.copy_col_as_vec(idx, &mut tmp_col); - let hashable_col = tmp_col.iter().map(|v| hashable_num::(v)); - SeriesOneHotEncoder::fit_to_iter(hashable_col) - }) - .collect(); - res + #[test] + fn adjust_idxs() { + assert_eq!(find_new_idxs(0, &[], &[]), Vec::new()); + // [0,1,2] -> [0, 1, 1, 1, 2] + assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); } + fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { + let orig = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); -} \ No newline at end of file + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { + // Categorical first and last + let orig = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 1.5, 3.0], + &[1.5, 2.0, 1.5, 4.0], + &[1.5, 1.0, 1.5, 5.0], + &[1.5, 2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + #[test] + fn hash_encode_f64_series() { + let series = vec![3.0, 1.0, 2.0, 1.0]; + let hashable_series: Vec = + series.iter().map(|v| v.to_category()).collect(); + let enc = SeriesOneHotEncoder::from_positional_category_vec(hashable_series); + let inv = enc.invert_one(vec![0.0, 0.0, 1.0]); + let orig_val: f64 = inv.unwrap().into(); + assert_eq!(orig_val, 2.0); + } + #[test] + fn test_fit() { + let (X, _) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + assert_eq!(oh_enc.series_encoders.len(), 2); + + let num_cat: Vec = oh_enc + .series_encoders + .iter() + .map(|a| a.num_categories) + .collect(); + assert_eq!(num_cat, vec![2, 4]); + } + + #[test] + fn matrix_transform_test() { + let (X, expectedX) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let nm = oh_enc.transform(&X).unwrap(); + assert_eq!(nm, expectedX); + + let (X, expectedX) = build_cat_first_and_last(); + let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let nm = oh_enc.transform(&X).unwrap(); + assert_eq!(nm, expectedX); + } +} diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs index 04b534e..16924bb 100644 --- a/src/preprocessing/data_traits.rs +++ b/src/preprocessing/data_traits.rs @@ -1,11 +1,13 @@ //! Traits to indicate that float variables can be viewed as categorical //! This module assumes +use crate::math::num::RealNumber; + pub type CategoricalFloat = u16; // pub struct CategoricalFloat(u16); -pub trait Categorizable { +pub trait Categorizable: RealNumber { type A; fn to_category(self) -> CategoricalFloat;