diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index ff9fa6e..a929ab6 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -2,96 +2,86 @@ //! # Encode categorical features as a one-hot or multi-class numeric array. use crate::error::Failed; +use crate::linalg::BaseVector; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable -pub fn make_one_hot>(label_idx: usize, num_labels: usize) -> V { +pub fn make_one_hot>(category_idx: usize, num_categories: usize) -> V { let pos = T::from_f64(1f64).unwrap(); - let mut z = V::zeros(num_labels); - z.set(label_idx, pos); + let mut z = V::zeros(num_categories); + z.set(category_idx, pos); z } + +/// Turn a collection of `CategoryType`s into a one-hot vectors. /// This struct encodes single class per exmample /// -/// You can fit a label enumeration by passing a collection of labels. -/// Label numbers will be assigned in the order they are encountered +/// You can fit_to_series a category enumeration by passing a collection of categories. +/// category numbers will be assigned in the order they are encountered /// /// Example: /// ``` /// use std::collections::HashMap; /// use smartcore::preprocessing::target_encoders::OneHotEncoder; /// -/// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; -/// let enc = OneHotEncoder::::fit(&fake_labels[..]); +/// let fake_categories: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; +/// let enc = OneHotEncoder::::fit_to_series(&fake_categories[..]); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); -/// // notice that 1 is actually a zero-th positional label +/// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); /// ``` /// -/// You can also pass a predefined label enumeration such as a hashmap `HashMap` or a vector `Vec` +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` /// /// /// ``` /// use std::collections::HashMap; /// use smartcore::preprocessing::target_encoders::OneHotEncoder; /// -/// let label_map: HashMap<&str, usize> = +/// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] /// .into_iter() /// .collect(); -/// let label_vec = vec!["background", "dog", "cat"]; +/// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = OneHotEncoder::<&str>::from_positional_label_vec(label_vec); -/// let enc_lm = OneHotEncoder::<&str>::from_label_map(label_map); +/// let enc_lv = OneHotEncoder::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = OneHotEncoder::<&str>::from_category_map(category_map); /// /// // ["background", "dog", "cat"] -/// println!("{:?}", enc_lv.get_labels()); +/// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` -pub struct OneHotEncoder { - label_to_idx: HashMap, - labels: Vec, - num_classes: usize, +pub struct OneHotEncoder { + category_map: HashMap, + categories: Vec, + num_categories: usize, } -enum LabelDefinition { - LabelToClsNumMap(HashMap), - PositionalLabel(Vec), -} - -/// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) -pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { - let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); - (0..num_labels) - .map(|idx| if idx == label_idx { pos } else { neg }) - .collect() -} - -impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { +impl OneHotEncoder { /// Fit an encoder to a lable list - pub fn fit(labels: &[LabelType]) -> Self { - let mut label_map: HashMap = HashMap::new(); - let mut class_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + pub fn fit_to_series(categories: &[CategoryType]) -> Self { + let mut category_map: HashMap = HashMap::new(); + let mut category_num = 0usize; + let mut unique_lables: Vec = Vec::new(); - for l in labels { - if !label_map.contains_key(&l) { - label_map.insert(l.clone(), class_num); + for l in categories { + if !category_map.contains_key(&l) { + category_map.insert(l.clone(), category_num); unique_lables.push(l.clone()); - class_num += 1; + category_num += 1; } } Self { - label_to_idx: label_map, - num_classes: class_num, - labels: unique_lables, + category_map: category_map, + num_categories: category_num, + categories: unique_lables, } } - /// Build an encoder from a predefined (label -> class number) map - pub fn from_label_map(category_map: HashMap) -> Self { + /// Build an encoder from a predefined (category -> class number) map + pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(CategoryType, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); @@ -100,12 +90,11 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { num_categories: categories.len(), categories, category_map, - } + } } - /// Build an encoder from a predefined positional label-class num vector - pub fn from_positional_label_vec(categories: Vec) -> Self { - // Self::from_label_def(LabelDefinition::PositionalLabel(categories)) + /// Build an encoder from a predefined positional category-class num vector + pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories .iter() .enumerate() @@ -118,27 +107,30 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { } } - /// Transform a slice of label types into one-hot vectors - /// None is returned if unknown label is encountered - pub fn transform(&self, labels: &[LabelType]) -> Vec>> { - labels.iter().map(|l| self.transform_one(l)).collect() + /// Transform a slice of category types into one-hot vectors + /// None is returned if unknown category is encountered + pub fn transfrom_series( + &self, + categories: &[CategoryType], + ) -> Vec>> { + categories.iter().map(|l| self.transform_one(l)).collect() } - /// Transform a single label type into a one-hot vector - pub fn transform_one(&self, label: &LabelType) -> Option> { - match self.label_to_idx.get(label) { + /// Transform a single category type into a one-hot vector + pub fn transform_one(&self, category: &CategoryType) -> Option> { + match self.category_map.get(category) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_classes)), + Some(&idx) => Some(make_one_hot(idx, self.num_categories)), } } - /// Get labels ordered by encoder's label enumeration - pub fn get_labels(&self) -> &Vec { - &self.labels + /// Get categories ordered by encoder's category enumeration + pub fn get_categories(&self) -> &Vec { + &self.categories } - /// Invert one-hot vector, back to the label - pub fn invert_one(&self, one_hot: Vec) -> Result { + /// Invert one-hot vector, back to the category + pub fn invert_one(&self, one_hot: Vec) -> Result { let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot @@ -149,7 +141,7 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { if s.len() == 1 { let idx = s[0]; - return Ok(self.labels[idx].clone()); + return Ok(self.categories[idx].clone()); } let pos_entries = format!( "Expected a single positive entry, {} entires found", @@ -157,31 +149,6 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { ); Err(Failed::transform(&pos_entries[..])) } - - fn from_label_def(labels: LabelDefinition) -> Self { - let (label_map, class_num, unique_lables) = match labels { - LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(LabelType, usize)> = - h.iter().map(|(k, v)| (k.clone(), *v)).collect(); - _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); - let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); - (h, unique_lab.len(), unique_lab) - } - LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab - .iter() - .enumerate() - .map(|(v, k)| (k.clone(), v)) - .collect(); - (h, unique_lab.len(), unique_lab) - } - }; - Self { - label_to_idx: label_map, - num_classes: class_num, - labels: unique_lables, - } - } } #[cfg(test)] @@ -189,11 +156,11 @@ mod tests { use super::*; #[test] - fn from_labels() { - let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; - let enc = OneHotEncoder::::fit(&fake_labels[0..]); + fn from_categories() { + let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; + let enc = OneHotEncoder::::fit_to_series(&fake_categories[0..]); let oh_vec: Vec = match enc.transform_one(&1) { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![1f64, 0f64, 0f64, 0f64, 0f64]; @@ -201,19 +168,19 @@ mod tests { } fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { - let fake_label_pos = vec!["background", "dog", "cat"]; - let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + let fake_category_pos = vec!["background", "dog", "cat"]; + let enc = OneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); enc } #[test] - fn label_map_and_vec() { - let label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + fn category_map_and_vec() { + let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = OneHotEncoder::<&str>::from_label_map(label_map); + let enc = OneHotEncoder::<&str>::from_category_map(category_map); let oh_vec: Vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![0f64, 1f64, 0f64]; @@ -221,10 +188,10 @@ mod tests { } #[test] - fn positional_labels_vec() { + fn positional_categories_vec() { let enc = build_fake_str_enc(); let oh_vec: Vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![0.0, 1.0, 0.0]; @@ -244,9 +211,10 @@ mod tests { } #[test] - fn test_many_labels() { + fn test_many_categorys() { let enc = build_fake_str_enc(); - let res: Vec>> = enc.transform(&["dog", "cat", "fish", "background"]); + let res: Vec>> = + enc.transfrom_series(&["dog", "cat", "fish", "background"]); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]),