diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 012f364..0436787 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -1,5 +1,27 @@ -#![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot numeric array. +//! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matrices +//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents +//! +//! ### Usage Example +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::preprocessing::categorical_encoders::{OneHotEncoder, OneHotEncoderParams}; +//! let data = DenseMatrix::from_2d_array(&[ +//! &[1.5, 1.0, 1.5, 3.0], +//! &[1.5, 2.0, 1.5, 4.0], +//! &[1.5, 1.0, 1.5, 5.0], +//! &[1.5, 2.0, 1.5, 6.0], +//! ]); +//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); +//! // Infer number of categories from data and return a reusable encoder +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! // Transform categorical to one-hot encoded (can transform similar) +//! let oh_data = encoder.transform(&data).unwrap(); +//! // Produces the following: +//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] +//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] +//! 
``` use crate::error::Failed; use crate::linalg::{BaseVector, Matrix}; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 4534c6d..c07b982 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,2 +1,5 @@ +/// Transform a data matrix by replacing all categorical variables with their one-hot vector equivalents pub mod categorical_encoders; -pub mod series_encoder; \ No newline at end of file +mod data_traits; +/// Encode a series (column, array) of categorical variables as one-hot vectors +pub mod series_encoder; diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 132d160..321f049 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -1,13 +1,21 @@ #![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot numeric array. +//! # Series Encoder +//! Encode a series of categorical features as a one-hot numeric array. use crate::error::Failed; -use crate::linalg::{BaseVector, Matrix}; +use crate::linalg::BaseVector; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable +/// +/// Example: +/// ``` +/// use smartcore::preprocessing::series_encoder::make_one_hot; +/// let one_hot: Vec = make_one_hot(2, 3); +/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); +/// ``` pub fn make_one_hot>( category_idx: usize, num_categories: usize, @@ -18,7 +26,7 @@ pub fn make_one_hot>( z } -/// Turn a collection of `CategoryType`s into a one-hot vectors. +/// Turn a collection of Hashable objects into one-hot vectors. /// This struct encodes single class per exmample /// /// You can fit_to_iter a category enumeration by passing an iterator of categories. 
@@ -27,7 +35,7 @@ pub fn make_one_hot>( /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; /// /// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; /// let it = fake_categories.iter().map(|&a| a); @@ -42,7 +50,7 @@ pub fn make_one_hot>( /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -60,10 +68,11 @@ pub fn make_one_hot>( pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, + /// Number of categories for categorical variable pub num_categories: usize, } -impl SeriesOneHotEncoder { +impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder { /// Fit an encoder to a lable list pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); @@ -111,20 +120,24 @@ impl SeriesOneHotEncoder { } } - - pub fn transform_iter(&self, cat_it: impl Iterator)-> Vec>> { - cat_it.map(|l| self.transform_one(l)).collect() + /// Take an iterator as a series to transform + pub fn transform_iter( + &self, + cat_it: impl Iterator, + ) -> Vec>> { + cat_it.map(|l| self.transform_one(&l)).collect() } + /// Transform a slice of category types into one-hot vectors /// None is returned if unknown category is encountered pub fn transfrom_series( &self, - categories: &[CategoryType], + categories: &'a [CategoryType], ) -> Vec>> { - self.transform_iter(categories.iter()) + let v = categories.iter().map(|a| a.clone()); + self.transform_iter(v) } - /// Transform a single category type into a one-hot vector pub fn transform_one(&self, category: &CategoryType) -> Option> { match self.category_map.get(category) {