From 991631876eb0bd55b6acf4fdecd85181b985de63 Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 25 Jan 2021 23:33:48 -0800 Subject: [PATCH 01/35] build one-hot encoder --- src/lib.rs | 2 + src/preprocessing/mod.rs | 1 + src/preprocessing/target_encoders.rs | 209 +++++++++++++++++++++++++++ 3 files changed, 212 insertions(+) create mode 100644 src/preprocessing/mod.rs create mode 100644 src/preprocessing/target_encoders.rs diff --git a/src/lib.rs b/src/lib.rs index 7d2b089..c5802d2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -95,3 +95,5 @@ pub(crate) mod optimization; pub mod svm; /// Supervised tree-based learning methods pub mod tree; +/// Preprocessing utilities +pub mod preprocessing; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs new file mode 100644 index 0000000..e4b5190 --- /dev/null +++ b/src/preprocessing/mod.rs @@ -0,0 +1 @@ +pub mod target_encoders; \ No newline at end of file diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs new file mode 100644 index 0000000..1894361 --- /dev/null +++ b/src/preprocessing/target_encoders.rs @@ -0,0 +1,209 @@ + +#![allow(clippy::ptr_arg)] +//! # Encode categorical features as a one-hot or multi-class numeric array. +//! + +use std::hash::Hash; +use std::collections::HashMap; + +use crate::math::num::RealNumber; +use crate::error::Failed; + + +/// Turn a collection of label types into a one-hot vectors. 
+/// This struct encodes single class per exmample +pub struct OneHotEncoder { + label_to_idx: HashMap, + labels: Vec, + num_classes: usize + +} + +enum LabelDefinition { + LabelToClsNumMap(HashMap), + PositionalLabel(Vec), +} + +/// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) +pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { + let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); + (0..num_labels).map(|idx| if idx == label_idx {pos.clone()} else {neg.clone()}).collect() + +} + +impl<'a, T: Hash + Eq + Clone> OneHotEncoder +{ + + /// Fit an encoder to a lable list + /// + /// Label numbers will be assigned in the order they are encountered + /// Example: + /// ``` + /// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; + /// let enc = OneHotEncoder::::fit(&fake_labels[0..]); + /// let oh_vec = enc.transform_one(&1); // notice that 1 is actually a zero-th positional label + /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]); + /// ``` + pub fn fit(labels: &[T]) -> Self { + + let mut label_map: HashMap = HashMap::new(); + let mut class_num = 0usize; + let mut unique_lables: Vec = Vec::new(); + + for l in labels + { + if !label_map.contains_key(&l) { + label_map.insert(l.clone(), class_num); + unique_lables.push(l.clone()); + class_num += 1; + } + } + Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} + } + + + /// Build an encoder from a predefined (label -> class number) map + /// + /// Definition example: + /// ``` + /// let fake_label_map: HashMap<&str, u32> = vec![("background",0), ("dog", 1), ("cat", 2)] + /// .into_iter() + /// .collect(); + /// let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); + /// ``` + pub fn from_label_map(labels: HashMap) -> Self { + Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels)) + } + /// Build an encoder from a predefined positional label-class num vector + /// + /// Definition example: 
+ /// ``` + /// let fake_label_pos = vec!["background","dog", "cat"]; + /// let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + /// ``` + pub fn from_positional_label_vec(labels: Vec) -> Self { + Self::from_label_def(LabelDefinition::PositionalLabel(labels)) + } + + /// Transform a slice of label types into one-hot vectors + /// None is returned if unknown label is encountered + pub fn transform(&self, labels: &[T]) -> Vec>> { + labels + .into_iter() + .map(|l| self.transform_one(l)) + .collect() + } + + /// Transform a single label type into a one-hot vector + pub fn transform_one(&self, label: &T) -> Option> { + match self.label_to_idx.get(label) { + None => None, + Some(&idx) => Some(make_one_hot(idx, self.num_classes)) + } + } + + /// Invert one-hot vector, back to the label + ///``` + /// let lab = enc.invert_one(res)?; // e.g. res = [0,1,0,0...] "dog" == class 1 + /// assert_eq!(lab, "dog") + /// ``` + pub fn invert_one(&self, one_hot: Vec) -> Result { + let pos = U::from_f64(1f64).unwrap(); + + let s: Vec = one_hot + .into_iter() + .enumerate() + .filter_map(|(idx, v)| if v == pos {Some(idx)} else {None}) + .collect(); + + if s.len() == 1 { + let idx = s[0]; + return Ok(self.labels[idx].clone()) + } + let pos_entries = format!("Expected a single positive entry, {} entires found", s.len()); + Err(Failed::transform(&pos_entries[..])) + } + + + fn from_label_def(labels: LabelDefinition) -> Self { + + let (label_map, class_num, unique_lables) = match labels { + LabelDefinition::LabelToClsNumMap(h) => { + let mut _unique_lab: Vec<(T, usize)> = h.iter().map(|(k,v)| (k.clone(), v.clone())).collect(); + _unique_lab.sort_by(|a,b| a.1.cmp(&b.1)); + let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); + (h, unique_lab.len(), unique_lab) + }, + LabelDefinition::PositionalLabel(unique_lab) => { + let h: HashMap = unique_lab.iter().enumerate().map(|(v, k)| (k.clone(),v)).collect(); + (h, unique_lab.len(), unique_lab) + } + }; + 
Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} + + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_labels() { + let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; + let enc = OneHotEncoder::::fit(&fake_labels[0..]); + let oh_vec = match enc.transform_one(&1) { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![1f64,0f64,0f64,0f64,0f64]; + assert_eq!(oh_vec, res); + } + + + fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str>{ + let fake_label_pos = vec!["background","dog", "cat"]; + let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + enc + } + + #[test] + fn label_map_and_vec() { + let fake_label_map: HashMap<&str, usize> = vec![("background",0), ("dog", 1), ("cat", 2)].into_iter().collect(); + let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![0f64, 1f64,0f64]; + assert_eq!(oh_vec, res); + } + + #[test] + fn positional_labels_vec() { + let enc = build_fake_str_enc(); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![0f64, 1f64,0f64]; + assert_eq!(oh_vec, res); + } + + #[test] + fn invert_label_test() { + let enc = build_fake_str_enc(); + let res: Vec = vec![0f64, 1f64,0f64]; + let lab = enc.invert_one(res).unwrap(); + assert_eq!(lab, "dog"); + + if let Err(e) = enc.invert_one(vec![0.0, 0.0,0.0]) { + let pos_entries = format!("Expected a single positive entry, 0 entires found"); + assert_eq!(e, Failed::transform(&pos_entries[..])); + }; + } + + + +} \ No newline at end of file From dbca6d43cede008cd6be5cb8c60e210c6f25994f Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 25 Jan 2021 23:55:43 -0800 Subject: [PATCH 02/35] fmt fix --- src/lib.rs | 4 +- src/preprocessing/mod.rs | 2 +- src/preprocessing/target_encoders.rs | 148 
++++++++++++++------------- 3 files changed, 79 insertions(+), 75 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c5802d2..6e6205f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -91,9 +91,9 @@ pub mod naive_bayes; /// Supervised neighbors-based learning methods pub mod neighbors; pub(crate) mod optimization; +/// Preprocessing utilities +pub mod preprocessing; /// Support Vector Machines pub mod svm; /// Supervised tree-based learning methods pub mod tree; -/// Preprocessing utilities -pub mod preprocessing; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index e4b5190..c70f7dc 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1 +1 @@ -pub mod target_encoders; \ No newline at end of file +pub mod target_encoders; diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 1894361..81cbdbd 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -1,22 +1,18 @@ - #![allow(clippy::ptr_arg)] //! # Encode categorical features as a one-hot or multi-class numeric array. -//! +//! -use std::hash::Hash; -use std::collections::HashMap; - -use crate::math::num::RealNumber; use crate::error::Failed; - +use crate::math::num::RealNumber; +use std::collections::HashMap; +use std::hash::Hash; /// Turn a collection of label types into a one-hot vectors. 
/// This struct encodes single class per exmample pub struct OneHotEncoder { label_to_idx: HashMap, labels: Vec, - num_classes: usize - + num_classes: usize, } enum LabelDefinition { @@ -27,13 +23,18 @@ enum LabelDefinition { /// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); - (0..num_labels).map(|idx| if idx == label_idx {pos.clone()} else {neg.clone()}).collect() - + (0..num_labels) + .map(|idx| { + if idx == label_idx { + pos.clone() + } else { + neg.clone() + } + }) + .collect() } -impl<'a, T: Hash + Eq + Clone> OneHotEncoder -{ - +impl<'a, T: Hash + Eq + Clone> OneHotEncoder { /// Fit an encoder to a lable list /// /// Label numbers will be assigned in the order they are encountered @@ -45,23 +46,24 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]); /// ``` pub fn fit(labels: &[T]) -> Self { - let mut label_map: HashMap = HashMap::new(); let mut class_num = 0usize; let mut unique_lables: Vec = Vec::new(); - for l in labels - { + for l in labels { if !label_map.contains_key(&l) { label_map.insert(l.clone(), class_num); unique_lables.push(l.clone()); class_num += 1; } } - Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} + Self { + label_to_idx: label_map, + num_classes: class_num, + labels: unique_lables, + } } - /// Build an encoder from a predefined (label -> class number) map /// /// Definition example: @@ -84,21 +86,18 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder pub fn from_positional_label_vec(labels: Vec) -> Self { Self::from_label_def(LabelDefinition::PositionalLabel(labels)) } - + /// Transform a slice of label types into one-hot vectors - /// None is returned if unknown label is encountered + /// None is returned if unknown label is encountered pub fn transform(&self, labels: &[T]) -> 
Vec>> { - labels - .into_iter() - .map(|l| self.transform_one(l)) - .collect() + labels.into_iter().map(|l| self.transform_one(l)).collect() } /// Transform a single label type into a one-hot vector pub fn transform_one(&self, label: &T) -> Option> { match self.label_to_idx.get(label) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_classes)) + Some(&idx) => Some(make_one_hot(idx, self.num_classes)), } } @@ -111,99 +110,104 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot - .into_iter() - .enumerate() - .filter_map(|(idx, v)| if v == pos {Some(idx)} else {None}) - .collect(); - + .into_iter() + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + if s.len() == 1 { let idx = s[0]; - return Ok(self.labels[idx].clone()) + return Ok(self.labels[idx].clone()); } - let pos_entries = format!("Expected a single positive entry, {} entires found", s.len()); + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); Err(Failed::transform(&pos_entries[..])) } - fn from_label_def(labels: LabelDefinition) -> Self { - let (label_map, class_num, unique_lables) = match labels { LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(T, usize)> = h.iter().map(|(k,v)| (k.clone(), v.clone())).collect(); - _unique_lab.sort_by(|a,b| a.1.cmp(&b.1)); + let mut _unique_lab: Vec<(T, usize)> = + h.iter().map(|(k, v)| (k.clone(), v.clone())).collect(); + _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); (h, unique_lab.len(), unique_lab) - }, + } LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab.iter().enumerate().map(|(v, k)| (k.clone(),v)).collect(); + let h: HashMap = unique_lab + .iter() + .enumerate() + .map(|(v, k)| (k.clone(), v)) + .collect(); (h, unique_lab.len(), unique_lab) } }; - Self {label_to_idx: label_map, num_classes: 
class_num, labels:unique_lables} - + Self { + label_to_idx: label_map, + num_classes: class_num, + labels: unique_lables, + } } } - #[cfg(test)] mod tests { use super::*; #[test] fn from_labels() { - let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; + let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let enc = OneHotEncoder::::fit(&fake_labels[0..]); let oh_vec = match enc.transform_one(&1) { None => panic!("Wrong labels"), - Some(v) => v + Some(v) => v, }; - let res: Vec = vec![1f64,0f64,0f64,0f64,0f64]; + let res: Vec = vec![1f64, 0f64, 0f64, 0f64, 0f64]; assert_eq!(oh_vec, res); } - - fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str>{ - let fake_label_pos = vec!["background","dog", "cat"]; + fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { + let fake_label_pos = vec!["background", "dog", "cat"]; let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); enc } #[test] fn label_map_and_vec() { - let fake_label_map: HashMap<&str, usize> = vec![("background",0), ("dog", 1), ("cat", 2)].into_iter().collect(); - let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - let oh_vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), - Some(v) => v - }; - let res: Vec = vec![0f64, 1f64,0f64]; - assert_eq!(oh_vec, res); - } - + let fake_label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + .into_iter() + .collect(); + let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v, + }; + let res: Vec = vec![0f64, 1f64, 0f64]; + assert_eq!(oh_vec, res); + } + #[test] fn positional_labels_vec() { - let enc = build_fake_str_enc(); - let oh_vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), - Some(v) => v - }; - let res: Vec = vec![0f64, 1f64,0f64]; - assert_eq!(oh_vec, res); + let enc = build_fake_str_enc(); + let oh_vec = match enc.transform_one(&"dog") { 
+ None => panic!("Wrong labels"), + Some(v) => v, + }; + let res: Vec = vec![0.0, 1.0, 0.0]; + assert_eq!(oh_vec, res); } #[test] fn invert_label_test() { let enc = build_fake_str_enc(); - let res: Vec = vec![0f64, 1f64,0f64]; + let res: Vec = vec![0.0, 1.0, 0.0]; let lab = enc.invert_one(res).unwrap(); assert_eq!(lab, "dog"); - - if let Err(e) = enc.invert_one(vec![0.0, 0.0,0.0]) { + if let Err(e) = enc.invert_one(vec![0.0, 0.0, 0.0]) { let pos_entries = format!("Expected a single positive entry, 0 entires found"); assert_eq!(e, Failed::transform(&pos_entries[..])); }; } - - - -} \ No newline at end of file +} From 139bbae4564347cc8b44403c89baad14647ff37f Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 00:01:20 -0800 Subject: [PATCH 03/35] cliipy fixes --- src/preprocessing/target_encoders.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 81cbdbd..c282a4d 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -26,9 +26,9 @@ pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec OneHotEncoder { /// Transform a slice of label types into one-hot vectors /// None is returned if unknown label is encountered pub fn transform(&self, labels: &[T]) -> Vec>> { - labels.into_iter().map(|l| self.transform_one(l)).collect() + labels.iter().map(|l| self.transform_one(l)).collect() } /// Transform a single label type into a one-hot vector @@ -130,7 +130,7 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder { let (label_map, class_num, unique_lables) = match labels { LabelDefinition::LabelToClsNumMap(h) => { let mut _unique_lab: Vec<(T, usize)> = - h.iter().map(|(k, v)| (k.clone(), v.clone())).collect(); + h.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); (h, unique_lab.len(), unique_lab) From 
0df797cbae484e50c751910c9c726956ae1a2848 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 00:04:15 -0800 Subject: [PATCH 04/35] fmt fix --- src/preprocessing/target_encoders.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index c282a4d..44a5c05 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -24,13 +24,7 @@ enum LabelDefinition { pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); (0..num_labels) - .map(|idx| { - if idx == label_idx { - pos - } else { - neg - } - }) + .map(|idx| if idx == label_idx { pos } else { neg }) .collect() } From 7daf536aebff1c1d73118bd7d9dfc3bf70cc6b41 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 09:15:24 -0800 Subject: [PATCH 05/35] fixed docs --- src/preprocessing/target_encoders.rs | 125 ++++++++++++++++----------- 1 file changed, 76 insertions(+), 49 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 44a5c05..76f4c92 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -7,11 +7,47 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; -/// Turn a collection of label types into a one-hot vectors. +/// Turn a collection of `LabelType`s into a one-hot vectors. /// This struct encodes single class per exmample -pub struct OneHotEncoder { - label_to_idx: HashMap, - labels: Vec, +/// +/// You can fit a label enumeration by passing a collection of labels. 
+/// Label numbers will be assigned in the order they are encountered +/// +/// Example: +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// +/// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; +/// let enc = OneHotEncoder::::fit(&fake_labels[..]); +/// let oh_vec: Vec = enc.transform_one(&1).unwrap(); +/// // notice that 1 is actually a zero-th positional label +/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); +/// ``` +/// +/// You can also pass a predefined label enumeration such as a hashmap `HashMap` or a vector `Vec` +/// +/// +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// +/// let label_map: HashMap<&str, usize> = +/// vec![("cat", 2), ("background",0), ("dog", 1)] +/// .into_iter() +/// .collect(); +/// let label_vec = vec!["background", "dog", "cat"]; +/// +/// let enc_lv = OneHotEncoder::<&str>::from_positional_label_vec(label_vec); +/// let enc_lm = OneHotEncoder::<&str>::from_label_map(label_map); +/// +/// // ["background", "dog", "cat"] +/// println!("{:?}", enc_lv.get_labels()); +/// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) +/// ``` +pub struct OneHotEncoder { + label_to_idx: HashMap, + labels: Vec, num_classes: usize, } @@ -28,21 +64,12 @@ pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec OneHotEncoder { +impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { /// Fit an encoder to a lable list - /// - /// Label numbers will be assigned in the order they are encountered - /// Example: - /// ``` - /// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; - /// let enc = OneHotEncoder::::fit(&fake_labels[0..]); - /// let oh_vec = enc.transform_one(&1); // notice that 1 is actually a zero-th positional label - /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]); - /// ``` - pub fn fit(labels: &[T]) -> Self { - let mut label_map: HashMap = HashMap::new(); + pub fn 
fit(labels: &[LabelType]) -> Self { + let mut label_map: HashMap = HashMap::new(); let mut class_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + let mut unique_lables: Vec = Vec::new(); for l in labels { if !label_map.contains_key(&l) { @@ -59,48 +86,35 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder { } /// Build an encoder from a predefined (label -> class number) map - /// - /// Definition example: - /// ``` - /// let fake_label_map: HashMap<&str, u32> = vec![("background",0), ("dog", 1), ("cat", 2)] - /// .into_iter() - /// .collect(); - /// let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - /// ``` - pub fn from_label_map(labels: HashMap) -> Self { + pub fn from_label_map(labels: HashMap) -> Self { Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels)) } /// Build an encoder from a predefined positional label-class num vector - /// - /// Definition example: - /// ``` - /// let fake_label_pos = vec!["background","dog", "cat"]; - /// let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); - /// ``` - pub fn from_positional_label_vec(labels: Vec) -> Self { + pub fn from_positional_label_vec(labels: Vec) -> Self { Self::from_label_def(LabelDefinition::PositionalLabel(labels)) } /// Transform a slice of label types into one-hot vectors /// None is returned if unknown label is encountered - pub fn transform(&self, labels: &[T]) -> Vec>> { + pub fn transform(&self, labels: &[LabelType]) -> Vec>> { labels.iter().map(|l| self.transform_one(l)).collect() } /// Transform a single label type into a one-hot vector - pub fn transform_one(&self, label: &T) -> Option> { + pub fn transform_one(&self, label: &LabelType) -> Option> { match self.label_to_idx.get(label) { None => None, Some(&idx) => Some(make_one_hot(idx, self.num_classes)), } } + /// Get labels ordered by encoder's label enumeration + pub fn get_labels(&self) -> &Vec { + &self.labels + } + /// Invert one-hot vector, back to the label - ///``` - /// let lab = 
enc.invert_one(res)?; // e.g. res = [0,1,0,0...] "dog" == class 1 - /// assert_eq!(lab, "dog") - /// ``` - pub fn invert_one(&self, one_hot: Vec) -> Result { + pub fn invert_one(&self, one_hot: Vec) -> Result { let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot @@ -120,17 +134,17 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder { Err(Failed::transform(&pos_entries[..])) } - fn from_label_def(labels: LabelDefinition) -> Self { + fn from_label_def(labels: LabelDefinition) -> Self { let (label_map, class_num, unique_lables) = match labels { LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(T, usize)> = + let mut _unique_lab: Vec<(LabelType, usize)> = h.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); - let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); + let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); (h, unique_lab.len(), unique_lab) } LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab + let h: HashMap = unique_lab .iter() .enumerate() .map(|(v, k)| (k.clone(), v)) @@ -154,7 +168,7 @@ mod tests { fn from_labels() { let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let enc = OneHotEncoder::::fit(&fake_labels[0..]); - let oh_vec = match enc.transform_one(&1) { + let oh_vec: Vec = match enc.transform_one(&1) { None => panic!("Wrong labels"), Some(v) => v, }; @@ -170,11 +184,11 @@ mod tests { #[test] fn label_map_and_vec() { - let fake_label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + let label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - let oh_vec = match enc.transform_one(&"dog") { + let enc = OneHotEncoder::<&str>::from_label_map(label_map); + let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong labels"), Some(v) => v, }; @@ -185,7 +199,7 @@ 
mod tests { #[test] fn positional_labels_vec() { let enc = build_fake_str_enc(); - let oh_vec = match enc.transform_one(&"dog") { + let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong labels"), Some(v) => v, }; @@ -204,4 +218,17 @@ mod tests { assert_eq!(e, Failed::transform(&pos_entries[..])); }; } + + #[test] + fn test_many_labels() { + let enc = build_fake_str_enc(); + let res: Vec>> = enc.transform(&["dog", "cat", "fish", "background"]); + let v = vec![ + Some(vec![0.0, 1.0, 0.0]), + Some(vec![0.0, 0.0, 1.0]), + None, + Some(vec![1.0, 0.0, 0.0]), + ]; + assert_eq!(res, v) + } } From 9833a2f8514bea27e3913bdf144d00637751ec61 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 10:03:33 -0800 Subject: [PATCH 06/35] codecov-fix --- src/preprocessing/target_encoders.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 76f4c92..56a97ed 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -1,6 +1,5 @@ #![allow(clippy::ptr_arg)] //! # Encode categorical features as a one-hot or multi-class numeric array. -//! use crate::error::Failed; use crate::math::num::RealNumber; From 244a72444520cc6ac832779a44538fc93f6b68e3 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:03:13 -0800 Subject: [PATCH 07/35] Genertic make_one_hot. Current implementation returns BaseVector of RealNumber --- src/preprocessing/target_encoders.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 56a97ed..3f2592b 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -6,7 +6,13 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; -/// Turn a collection of `LabelType`s into a one-hot vectors. 
+/// Make a one-hot encoded vector from a categorical variable +pub fn make_one_hot>(label_idx: usize, num_labels: usize) -> V { + let pos = T::from_f64(1f64).unwrap(); + let mut z = V::zeros(num_labels); + z.set(label_idx, pos); + z +} /// This struct encodes single class per exmample /// /// You can fit a label enumeration by passing a collection of labels. From 19088b682a52b81ec8709fc8ec12e25624062a3c Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:06:43 -0800 Subject: [PATCH 08/35] remoe LabelDefinition, looks like unnecesery abstraction for now --- src/preprocessing/target_encoders.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 3f2592b..ff9fa6e 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -91,12 +91,31 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { } /// Build an encoder from a predefined (label -> class number) map - pub fn from_label_map(labels: HashMap) -> Self { - Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels)) + pub fn from_label_map(category_map: HashMap) -> Self { + let mut _unique_cat: Vec<(CategoryType, usize)> = + category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); + _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); + let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); + Self { + num_categories: categories.len(), + categories, + category_map, } + } + /// Build an encoder from a predefined positional label-class num vector - pub fn from_positional_label_vec(labels: Vec) -> Self { - Self::from_label_def(LabelDefinition::PositionalLabel(labels)) + pub fn from_positional_label_vec(categories: Vec) -> Self { + // Self::from_label_def(LabelDefinition::PositionalLabel(categories)) + let category_map: HashMap = categories + .iter() + .enumerate() + .map(|(v, k)| (k.clone(), v)) + .collect(); + Self { + 
num_categories: categories.len(), + category_map, + categories, + } } /// Transform a slice of label types into one-hot vectors From 6109fc5211d0ebba410e66ec8b824992e775c1d5 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:13:45 -0800 Subject: [PATCH 09/35] Renaming fit/transform for API compatibility. Also rename label to category. --- src/preprocessing/target_encoders.rs | 172 +++++++++++---------------- 1 file changed, 70 insertions(+), 102 deletions(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index ff9fa6e..a929ab6 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -2,96 +2,86 @@ //! # Encode categorical features as a one-hot or multi-class numeric array. use crate::error::Failed; +use crate::linalg::BaseVector; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable -pub fn make_one_hot>(label_idx: usize, num_labels: usize) -> V { +pub fn make_one_hot>(category_idx: usize, num_categories: usize) -> V { let pos = T::from_f64(1f64).unwrap(); - let mut z = V::zeros(num_labels); - z.set(label_idx, pos); + let mut z = V::zeros(num_categories); + z.set(category_idx, pos); z } + +/// Turn a collection of `CategoryType`s into a one-hot vectors. /// This struct encodes single class per exmample /// -/// You can fit a label enumeration by passing a collection of labels. -/// Label numbers will be assigned in the order they are encountered +/// You can fit_to_series a category enumeration by passing a collection of categories. 
+/// category numbers will be assigned in the order they are encountered /// /// Example: /// ``` /// use std::collections::HashMap; /// use smartcore::preprocessing::target_encoders::OneHotEncoder; /// -/// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; -/// let enc = OneHotEncoder::::fit(&fake_labels[..]); +/// let fake_categories: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; +/// let enc = OneHotEncoder::::fit_to_series(&fake_categories[..]); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); -/// // notice that 1 is actually a zero-th positional label +/// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); /// ``` /// -/// You can also pass a predefined label enumeration such as a hashmap `HashMap` or a vector `Vec` +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` /// /// /// ``` /// use std::collections::HashMap; /// use smartcore::preprocessing::target_encoders::OneHotEncoder; /// -/// let label_map: HashMap<&str, usize> = +/// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] /// .into_iter() /// .collect(); -/// let label_vec = vec!["background", "dog", "cat"]; +/// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = OneHotEncoder::<&str>::from_positional_label_vec(label_vec); -/// let enc_lm = OneHotEncoder::<&str>::from_label_map(label_map); +/// let enc_lv = OneHotEncoder::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = OneHotEncoder::<&str>::from_category_map(category_map); /// /// // ["background", "dog", "cat"] -/// println!("{:?}", enc_lv.get_labels()); +/// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` -pub struct OneHotEncoder { - label_to_idx: HashMap, - labels: Vec, - num_classes: usize, +pub struct OneHotEncoder { + category_map: HashMap, + categories: Vec, + 
num_categories: usize, } -enum LabelDefinition { - LabelToClsNumMap(HashMap), - PositionalLabel(Vec), -} - -/// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) -pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { - let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); - (0..num_labels) - .map(|idx| if idx == label_idx { pos } else { neg }) - .collect() -} - -impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { +impl OneHotEncoder { /// Fit an encoder to a lable list - pub fn fit(labels: &[LabelType]) -> Self { - let mut label_map: HashMap = HashMap::new(); - let mut class_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + pub fn fit_to_series(categories: &[CategoryType]) -> Self { + let mut category_map: HashMap = HashMap::new(); + let mut category_num = 0usize; + let mut unique_lables: Vec = Vec::new(); - for l in labels { - if !label_map.contains_key(&l) { - label_map.insert(l.clone(), class_num); + for l in categories { + if !category_map.contains_key(&l) { + category_map.insert(l.clone(), category_num); unique_lables.push(l.clone()); - class_num += 1; + category_num += 1; } } Self { - label_to_idx: label_map, - num_classes: class_num, - labels: unique_lables, + category_map: category_map, + num_categories: category_num, + categories: unique_lables, } } - /// Build an encoder from a predefined (label -> class number) map - pub fn from_label_map(category_map: HashMap) -> Self { + /// Build an encoder from a predefined (category -> class number) map + pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(CategoryType, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); @@ -100,12 +90,11 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { num_categories: categories.len(), categories, category_map, - } + } } - /// Build an encoder from a predefined positional label-class num 
vector - pub fn from_positional_label_vec(categories: Vec) -> Self { - // Self::from_label_def(LabelDefinition::PositionalLabel(categories)) + /// Build an encoder from a predefined positional category-class num vector + pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories .iter() .enumerate() @@ -118,27 +107,30 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { } } - /// Transform a slice of label types into one-hot vectors - /// None is returned if unknown label is encountered - pub fn transform(&self, labels: &[LabelType]) -> Vec>> { - labels.iter().map(|l| self.transform_one(l)).collect() + /// Transform a slice of category types into one-hot vectors + /// None is returned if unknown category is encountered + pub fn transfrom_series( + &self, + categories: &[CategoryType], + ) -> Vec>> { + categories.iter().map(|l| self.transform_one(l)).collect() } - /// Transform a single label type into a one-hot vector - pub fn transform_one(&self, label: &LabelType) -> Option> { - match self.label_to_idx.get(label) { + /// Transform a single category type into a one-hot vector + pub fn transform_one(&self, category: &CategoryType) -> Option> { + match self.category_map.get(category) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_classes)), + Some(&idx) => Some(make_one_hot(idx, self.num_categories)), } } - /// Get labels ordered by encoder's label enumeration - pub fn get_labels(&self) -> &Vec { - &self.labels + /// Get categories ordered by encoder's category enumeration + pub fn get_categories(&self) -> &Vec { + &self.categories } - /// Invert one-hot vector, back to the label - pub fn invert_one(&self, one_hot: Vec) -> Result { + /// Invert one-hot vector, back to the category + pub fn invert_one(&self, one_hot: Vec) -> Result { let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot @@ -149,7 +141,7 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { if s.len() == 1 { let idx = s[0]; - 
return Ok(self.labels[idx].clone()); + return Ok(self.categories[idx].clone()); } let pos_entries = format!( "Expected a single positive entry, {} entires found", @@ -157,31 +149,6 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { ); Err(Failed::transform(&pos_entries[..])) } - - fn from_label_def(labels: LabelDefinition) -> Self { - let (label_map, class_num, unique_lables) = match labels { - LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(LabelType, usize)> = - h.iter().map(|(k, v)| (k.clone(), *v)).collect(); - _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); - let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); - (h, unique_lab.len(), unique_lab) - } - LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab - .iter() - .enumerate() - .map(|(v, k)| (k.clone(), v)) - .collect(); - (h, unique_lab.len(), unique_lab) - } - }; - Self { - label_to_idx: label_map, - num_classes: class_num, - labels: unique_lables, - } - } } #[cfg(test)] @@ -189,11 +156,11 @@ mod tests { use super::*; #[test] - fn from_labels() { - let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; - let enc = OneHotEncoder::::fit(&fake_labels[0..]); + fn from_categories() { + let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; + let enc = OneHotEncoder::::fit_to_series(&fake_categories[0..]); let oh_vec: Vec = match enc.transform_one(&1) { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![1f64, 0f64, 0f64, 0f64, 0f64]; @@ -201,19 +168,19 @@ mod tests { } fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { - let fake_label_pos = vec!["background", "dog", "cat"]; - let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + let fake_category_pos = vec!["background", "dog", "cat"]; + let enc = OneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); enc } #[test] - fn label_map_and_vec() { - let label_map: HashMap<&str, 
usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + fn category_map_and_vec() { + let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = OneHotEncoder::<&str>::from_label_map(label_map); + let enc = OneHotEncoder::<&str>::from_category_map(category_map); let oh_vec: Vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![0f64, 1f64, 0f64]; @@ -221,10 +188,10 @@ mod tests { } #[test] - fn positional_labels_vec() { + fn positional_categories_vec() { let enc = build_fake_str_enc(); let oh_vec: Vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![0.0, 1.0, 0.0]; @@ -244,9 +211,10 @@ mod tests { } #[test] - fn test_many_labels() { + fn test_many_categorys() { let enc = build_fake_str_enc(); - let res: Vec>> = enc.transform(&["dog", "cat", "fish", "background"]); + let res: Vec>> = + enc.transfrom_series(&["dog", "cat", "fish", "background"]); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 408b97d8aaa56ce72375f934f8cc56721962ee5b Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 19:31:14 -0800 Subject: [PATCH 10/35] Rename series encoder and move to separate module file --- src/preprocessing/mod.rs | 3 +- .../{target_encoders.rs => series_encoder.rs} | 50 +++++++++++-------- 2 files changed, 32 insertions(+), 21 deletions(-) rename src/preprocessing/{target_encoders.rs => series_encoder.rs} (80%) diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index c70f7dc..4534c6d 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1 +1,2 @@ -pub mod target_encoders; +pub mod categorical_encoders; +pub mod series_encoder; \ No newline at end of file diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/series_encoder.rs similarity 
index 80% rename from src/preprocessing/target_encoders.rs rename to src/preprocessing/series_encoder.rs index a929ab6..132d160 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/series_encoder.rs @@ -1,14 +1,17 @@ #![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot or multi-class numeric array. +//! # Encode categorical features as a one-hot numeric array. use crate::error::Failed; -use crate::linalg::BaseVector; +use crate::linalg::{BaseVector, Matrix}; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable -pub fn make_one_hot>(category_idx: usize, num_categories: usize) -> V { +pub fn make_one_hot>( + category_idx: usize, + num_categories: usize, +) -> V { let pos = T::from_f64(1f64).unwrap(); let mut z = V::zeros(num_categories); z.set(category_idx, pos); @@ -18,16 +21,17 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// Turn a collection of `CategoryType`s into a one-hot vectors. /// This struct encodes single class per exmample /// -/// You can fit_to_series a category enumeration by passing a collection of categories. +/// You can fit_to_iter a category enumeration by passing an iterator of categories. 
/// category numbers will be assigned in the order they are encountered /// /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; /// -/// let fake_categories: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; -/// let enc = OneHotEncoder::::fit_to_series(&fake_categories[..]); +/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; +/// let it = fake_categories.iter().map(|&a| a); +/// let enc = SeriesOneHotEncoder::::fit_to_iter(it); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); /// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); @@ -38,7 +42,7 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -46,22 +50,22 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// .collect(); /// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = OneHotEncoder::<&str>::from_positional_category_vec(category_vec); -/// let enc_lm = OneHotEncoder::<&str>::from_category_map(category_map); +/// let enc_lv = SeriesOneHotEncoder::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = SeriesOneHotEncoder::<&str>::from_category_map(category_map); /// /// // ["background", "dog", "cat"] /// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` -pub struct OneHotEncoder { +pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, - num_categories: usize, + pub num_categories: usize, } -impl OneHotEncoder { +impl SeriesOneHotEncoder { /// Fit an encoder to a 
lable list - pub fn fit_to_series(categories: &[CategoryType]) -> Self { + pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); let mut category_num = 0usize; let mut unique_lables: Vec = Vec::new(); @@ -74,7 +78,7 @@ impl OneHotEncoder { } } Self { - category_map: category_map, + category_map, num_categories: category_num, categories: unique_lables, } @@ -107,15 +111,20 @@ impl OneHotEncoder { } } + + pub fn transform_iter(&self, cat_it: impl Iterator)-> Vec>> { + cat_it.map(|l| self.transform_one(l)).collect() + } /// Transform a slice of category types into one-hot vectors /// None is returned if unknown category is encountered pub fn transfrom_series( &self, categories: &[CategoryType], ) -> Vec>> { - categories.iter().map(|l| self.transform_one(l)).collect() + self.transform_iter(categories.iter()) } + /// Transform a single category type into a one-hot vector pub fn transform_one(&self, category: &CategoryType) -> Option> { match self.category_map.get(category) { @@ -158,7 +167,8 @@ mod tests { #[test] fn from_categories() { let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; - let enc = OneHotEncoder::::fit_to_series(&fake_categories[0..]); + let it = fake_categories.iter().map(|&a| a); + let enc = SeriesOneHotEncoder::::fit_to_iter(it); let oh_vec: Vec = match enc.transform_one(&1) { None => panic!("Wrong categories"), Some(v) => v, @@ -167,9 +177,9 @@ mod tests { assert_eq!(oh_vec, res); } - fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { + fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> { let fake_category_pos = vec!["background", "dog", "cat"]; - let enc = OneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); + let enc = SeriesOneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); enc } @@ -178,7 +188,7 @@ mod tests { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let 
enc = OneHotEncoder::<&str>::from_category_map(category_map); + let enc = SeriesOneHotEncoder::<&str>::from_category_map(category_map); let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong categories"), Some(v) => v, From 5c400f40d258c989659daefab030efcb24cec823 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 19:36:38 -0800 Subject: [PATCH 11/35] Scaffold for turning floats to hashable and fitting to columns --- src/preprocessing/categorical_encoders.rs | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 src/preprocessing/categorical_encoders.rs diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs new file mode 100644 index 0000000..828eeef --- /dev/null +++ b/src/preprocessing/categorical_encoders.rs @@ -0,0 +1,27 @@ +#![allow(clippy::ptr_arg)] +//! # Encode categorical features as a one-hot numeric array. + +use crate::error::Failed; +use crate::linalg::{BaseVector, Matrix}; +use crate::math::num::RealNumber; + +use crate::preprocessing::series_encoder::SeriesOneHotEncoder; + +pub type HashableReal = u32; + +fn hashable_num(v: &T) -> HashableReal { + // gaxler: If first 32 bits are the same, assume numbers are the same for the categorical coercion + v.to_f32_bits() +} + +#[derive(Debug, Clone)] +pub struct OneHotEncoderParams { + pub categorical_param_idxs: Option>, + pub infer_categorical: bool, +} +/// Encode Categorical variavbles of data matrix to one-hot +pub struct OneHotEncoder { + series_encoders: Vec>, + categorical_param_idxs: Vec, +} + From f91b1f99425789b6d11c10941b079b4cd7150f5c Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 19:37:54 -0800 Subject: [PATCH 12/35] fit SeriesOneHotEncoders to predefined columns --- src/preprocessing/categorical_encoders.rs | 42 +++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index
828eeef..012f364 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -25,3 +25,45 @@ pub struct OneHotEncoder { categorical_param_idxs: Vec, } +impl> OneHotEncoder { + /// PlaceHolder + + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result { + match (params.categorical_param_idxs, params.infer_categorical) { + (None, false) => Err(Failed::fit( + "Must pass categorical series ids or infer flag", + )), + + (Some(idxs), true) => Err(Failed::fit( + "Ambigous parameters, got both infer and categroy ids", + )), + + (Some(idxs), false) => Ok(Self { + series_encoders: Self::build_series_encoders::(data, &idxs[..]), + categorical_param_idxs: idxs, + }), + + (None, true) => { + todo!("implement categorical auto-inference") + } + } + } + + fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { + let (nrows, _) = data.shape(); + // let mut res: Vec> = Vec::with_capacity(idxs.len()); + let mut tmp_col: Vec = Vec::with_capacity(nrows); + + let res: Vec> = idxs + .iter() + .map(|&idx| { + data.copy_col_as_vec(idx, &mut tmp_col); + let hashable_col = tmp_col.iter().map(|v| hashable_num::(v)); + SeriesOneHotEncoder::fit_to_iter(hashable_col) + }) + .collect(); + res + } + + +} \ No newline at end of file From 3480e728af5ec16edadc8ec63946e76970eaf2d2 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 16:04:41 -0800 Subject: [PATCH 13/35] Documentation updates --- src/preprocessing/categorical_encoders.rs | 26 ++++++++++++++-- src/preprocessing/mod.rs | 5 ++- src/preprocessing/series_encoder.rs | 37 +++++++++++++++-------- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 012f364..0436787 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -1,5 +1,27 @@ -#![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot numeric array. +//! 
# One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies +//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents +//! +//! ### Usage Example +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; +//! let data = DenseMatrix::from_2d_array(&[ +//! &[1.5, 1.0, 1.5, 3.0], +//! &[1.5, 2.0, 1.5, 4.0], +//! &[1.5, 1.0, 1.5, 5.0], +//! &[1.5, 2.0, 1.5, 6.0], +//! ]); +//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); +//! // Infer number of categories from data and return a reusable encoder +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! // Transform categorical to one-hot encoded (can transform similar) +//! let oh_data = encoder.transform(&data).unwrap(); +//! // Produces the following: +//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] +//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] +//! ``` use crate::error::Failed; use crate::linalg::{BaseVector, Matrix}; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 4534c6d..c07b982 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,2 +1,5 @@ +/// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents pub mod categorical_encoders; -pub mod series_encoder; \ No newline at end of file +mod data_traits; +/// Encode a series (column, array) of categorical variables as one-hot vectors +pub mod series_encoder; diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 132d160..321f049 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -1,13 +1,21 @@ #![allow(clippy::ptr_arg)] -//! 
# Encode categorical features as a one-hot numeric array. +//! # Series Encoder +//! Encode a series of categorical features as a one-hot numeric array. use crate::error::Failed; -use crate::linalg::{BaseVector, Matrix}; +use crate::linalg::BaseVector; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable +/// +/// Example: +/// ``` +/// use smartcore::preprocessing::series_encoder::make_one_hot; +/// let one_hot: Vec = make_one_hot(2, 3); +/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); +/// ``` pub fn make_one_hot>( category_idx: usize, num_categories: usize, @@ -18,7 +26,7 @@ pub fn make_one_hot>( z } -/// Turn a collection of `CategoryType`s into a one-hot vectors. +/// Turn a collection of Hashable objects into a one-hot vectors. /// This struct encodes single class per exmample /// /// You can fit_to_iter a category enumeration by passing an iterator of categories. @@ -27,7 +35,7 @@ pub fn make_one_hot>( /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; /// /// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; /// let it = fake_categories.iter().map(|&a| a); @@ -42,7 +50,7 @@ pub fn make_one_hot>( /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -60,10 +68,11 @@ pub fn make_one_hot>( pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, + /// Number of categories for categorical variable pub num_categories: usize, } -impl SeriesOneHotEncoder { +impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder { /// Fit an encoder to a 
lable list pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); @@ -111,20 +120,24 @@ impl SeriesOneHotEncoder { } } - - pub fn transform_iter(&self, cat_it: impl Iterator)-> Vec>> { - cat_it.map(|l| self.transform_one(l)).collect() + /// Take an iterator as a series to transform + pub fn transform_iter( + &self, + cat_it: impl Iterator, + ) -> Vec>> { + cat_it.map(|l| self.transform_one(&l)).collect() } + /// Transform a slice of category types into one-hot vectors /// None is returned if unknown category is encountered pub fn transfrom_series( &self, - categories: &[CategoryType], + categories: &'a [CategoryType], ) -> Vec>> { - self.transform_iter(categories.iter()) + let v = categories.iter().map(|a| a.clone()); + self.transform_iter(v) } - /// Transform a single category type into a one-hot vector pub fn transform_one(&self, category: &CategoryType) -> Option> { match self.category_map.get(category) { From 3dc8a4283298d6622a6a0c74cd008339d6b8e9c4 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 16:05:45 -0800 Subject: [PATCH 14/35] Adapt column numbers to the new columns introduced by categorical variables. --- src/preprocessing/categorical_encoders.rs | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 0436787..31d3500 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -41,6 +41,40 @@ pub struct OneHotEncoderParams { pub categorical_param_idxs: Option>, pub infer_categorical: bool, } +/// Calculate the offset to parameters to due introduction of one-hot encoding +fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { + // This functions uses iterators and returns a vector. 
+ // In case we get a huge amount of paramenters this might be a problem + // todo: Change this such that it will return an iterator + + let cat_idx = encoded_idxs.iter().copied().chain((num_params..).take(1)); + + // Offset is constant between two categorical values, here we calculate the number of steps + // that remain constant + let repeats = cat_idx.scan(0, |a, v| { + let im = v + 1 - *a; + *a = v; + Some(im) + }); + + // Calculate the offset to parameter idx due to newly intorduced one-hot vectors + let offset_ = cat_sizes.iter().scan(0, |a, &v| { + *a = *a + v - 1; + Some(*a) + }); + let offset = (0..1).chain(offset_); + + let new_param_idxs: Vec = (0..num_params) + .zip( + repeats + .zip(offset) + .map(|(r, o)| iter::repeat(o).take(r)) + .flatten(), + ) + .map(|(idx, ofst)| idx + ofst) + .collect(); + new_param_idxs +} /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { series_encoders: Vec>, From dd39433ff8ddea5445e3b1ca27db2474c002885d Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 18:48:23 -0800 Subject: [PATCH 15/35] Categorizable trait defines logic of turning floats into hashable categorical variables. Since we only support RealNumbers for now, the idea is to treat round numbers as ordinal (or nominal if user chooses to ignore order) categories. --- src/preprocessing/data_traits.rs | 43 ++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 src/preprocessing/data_traits.rs diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs new file mode 100644 index 0000000..04b534e --- /dev/null +++ b/src/preprocessing/data_traits.rs @@ -0,0 +1,43 @@ +//! Traits to indicate that float variables can be viewed as categorical +//! 
This module assumes + +pub type CategoricalFloat = u16; + +// pub struct CategoricalFloat(u16); + +pub trait Categorizable { + type A; + + fn to_category(self) -> CategoricalFloat; + + fn is_valid(self) -> bool; + +} + +impl Categorizable for f32 { + + type A = CategoricalFloat; + + fn to_category(self) -> CategoricalFloat { + self as CategoricalFloat + } + + fn is_valid(self) -> bool { + let a = self.to_category(); + a as f32 == self + } +} + +impl Categorizable for f64 { + + type A = CategoricalFloat; + + fn to_category(self) ->CategoricalFloat { + self as CategoricalFloat + } + + fn is_valid(self) -> bool { + let a = self.to_category(); + a as f64 == self + } +} \ No newline at end of file From cd5611079caae782f148397a0ebad465aea6faef Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:29:33 -0800 Subject: [PATCH 16/35] Fit OneHotEncoder --- src/preprocessing/categorical_encoders.rs | 56 ++++++++++++++++++----- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 31d3500..794c1d6 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -75,32 +75,66 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) .collect(); new_param_idxs } +fn validate_col_is_categorical(data: &Vec) -> bool { + for v in data { + if !v.is_valid() { return false} + } + true +} /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { - series_encoders: Vec>, - categorical_param_idxs: Vec, + series_encoders: Vec>, + col_idx_categorical: Vec, } -impl> OneHotEncoder { +impl OneHotEncoder { /// PlaceHolder - pub fn fit(data: &M, params: OneHotEncoderParams) -> Result { - match (params.categorical_param_idxs, params.infer_categorical) { + pub fn fit>( + data: &M, + params: OneHotEncoderParams, + ) -> Result { + match (params.col_idx_categorical, params.infer_categorical) { 
(None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", )), - (Some(idxs), true) => Err(Failed::fit( + (Some(_idxs), true) => Err(Failed::fit( "Ambigous parameters, got both infer and categroy ids", )), - (Some(idxs), false) => Ok(Self { - series_encoders: Self::build_series_encoders::(data, &idxs[..]), - categorical_param_idxs: idxs, - }), + (Some(mut idxs), false) => { + // make sure categories have same order as data columns + idxs.sort(); + + let (nrows, _) = data.shape(); + + // col buffer to avoid allocations + let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); + + let mut res: Vec> = Vec::with_capacity(idxs.len()); + + for &idx in &idxs { + data.copy_col_as_vec(idx, &mut col_buf); + if !validate_col_is_categorical(&col_buf) { + let msg = format!("Column {} of data matrix containts non categorizable (integer) values", idx); + return Err(Failed::fit(&msg[..])) + } + let hashable_col = col_buf.iter().map(|v| v.to_category()); + res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); + } + + Ok(Self { + series_encoders: res, //Self::build_series_encoders::(data, &idxs[..]), + col_idx_categorical: idxs, + }) + } (None, true) => { - todo!("implement categorical auto-inference") + todo!("Auto-Inference for Categorical Variables not yet implemented") + } + } + } } } } From fd6b2e801479f709870921f192153c6abeeab53d Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:29:58 -0800 Subject: [PATCH 17/35] Transform matrix --- src/preprocessing/categorical_encoders.rs | 42 +++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 794c1d6..585f13a 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -135,9 +135,51 @@ impl OneHotEncoder { } } } + + /// Transform categorical variables to one-hot encoded and return a new matrix + pub fn transform>(&self, x: &M) -> 
Option { + let (nrows, p) = x.shape(); + let additional_params: Vec = self + .series_encoders + .iter() + .map(|enc| enc.num_categories) + .collect(); + + let new_param_num: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); + let mut res = M::zeros(nrows, new_param_num); + // copy old data in x to their new location + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); } } + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { + let cidx = new_col_idx[old_cidx]; + let col_iter = (0..nrows).map(|r| res.get(r, cidx).to_category()); + let sencoder = &self.series_encoders[pidx]; + let oh_series: Vec>> = sencoder.transform_iter(col_iter); + + for (row, oh_vec) in oh_series.iter().enumerate() { + match oh_vec { + None => { + // Bad value in a series causes in to be invalid + // todo: proper error handling, so user can know where the bad value is + return None; + } + Some(v) => { + // copy one hot vectors to their place in the data matrix; + for (col_ofst, &val) in v.iter().enumerate() { + res.set(row, cidx + col_ofst, val); + } } + } + } + } + Some(res) + } +} fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { let (nrows, _) = data.shape(); From c987d39d439462e5abc12cf34276d8735afb1145 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:31:09 -0800 Subject: [PATCH 18/35] tests + force Categorizable be RealNumber --- src/preprocessing/categorical_encoders.rs | 138 +++++++++++++++++----- src/preprocessing/data_traits.rs | 4 +- 2 files changed, 114 insertions(+), 28 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 585f13a..063aa5c 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -1,6 +1,8 @@ //! 
# One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies //! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents //! +//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [SeriesOneHotEncoder](../series_encoder/struct.SeriesOneHotEncoder.html) +//! //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; @@ -22,25 +24,33 @@ //! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] //! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] //! ``` +use std::iter; use crate::error::Failed; -use crate::linalg::{BaseVector, Matrix}; -use crate::math::num::RealNumber; +use crate::linalg::Matrix; +use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; use crate::preprocessing::series_encoder::SeriesOneHotEncoder; -pub type HashableReal = u32; - -fn hashable_num(v: &T) -> HashableReal { - // gaxler: If first 32 bits are the same, assume numbers are the same for the categorical coercion - v.to_f32_bits() -} - +/// OneHotEncoder Parameters #[derive(Debug, Clone)] pub struct OneHotEncoderParams { - pub categorical_param_idxs: Option>, + /// Column number that contain categorical variable + pub col_idx_categorical: Option>, + /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables pub infer_categorical: bool, } + +impl OneHotEncoderParams { + /// Generate parameters from categorical variable column numbers + pub fn from_cat_idx(categorical_params: &[usize]) -> Self { + Self { + col_idx_categorical: Some(categorical_params.to_vec()), + infer_categorical: false, + } + } +} + /// Calculate the offset to parameters to due introduction of one-hot encoding fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { // This functions uses iterators and returns a vector. 
@@ -75,12 +85,14 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) .collect(); new_param_idxs } + fn validate_col_is_categorical(data: &Vec) -> bool { for v in data { if !v.is_valid() { return false} } true } + /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { series_encoders: Vec>, @@ -167,13 +179,13 @@ impl OneHotEncoder { // Bad value in a series causes in to be invalid // todo: proper error handling, so user can know where the bad value is return None; - } + } Some(v) => { // copy one hot vectors to their place in the data matrix; for (col_ofst, &val) in v.iter().enumerate() { res.set(row, cidx + col_ofst, val); - } - } + } + } } } } @@ -181,21 +193,93 @@ impl OneHotEncoder { } } - fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { - let (nrows, _) = data.shape(); - // let mut res: Vec> = Vec::with_capacity(idxs.len()); - let mut tmp_col: Vec = Vec::with_capacity(nrows); +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::preprocessing::series_encoder::SeriesOneHotEncoder; - let res: Vec> = idxs - .iter() - .map(|&idx| { - data.copy_col_as_vec(idx, &mut tmp_col); - let hashable_col = tmp_col.iter().map(|v| hashable_num::(v)); - SeriesOneHotEncoder::fit_to_iter(hashable_col) - }) - .collect(); - res + #[test] + fn adjust_idxs() { + assert_eq!(find_new_idxs(0, &[], &[]), Vec::new()); + // [0,1,2] -> [0, 1, 1, 1, 2] + assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); } + fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { + let orig = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); -} \ No newline at end of file + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + 
+ fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { + // Categorical first and last + let orig = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 1.5, 3.0], + &[1.5, 2.0, 1.5, 4.0], + &[1.5, 1.0, 1.5, 5.0], + &[1.5, 2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + #[test] + fn hash_encode_f64_series() { + let series = vec![3.0, 1.0, 2.0, 1.0]; + let hashable_series: Vec = + series.iter().map(|v| v.to_category()).collect(); + let enc = SeriesOneHotEncoder::from_positional_category_vec(hashable_series); + let inv = enc.invert_one(vec![0.0, 0.0, 1.0]); + let orig_val: f64 = inv.unwrap().into(); + assert_eq!(orig_val, 2.0); + } + #[test] + fn test_fit() { + let (X, _) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + assert_eq!(oh_enc.series_encoders.len(), 2); + + let num_cat: Vec = oh_enc + .series_encoders + .iter() + .map(|a| a.num_categories) + .collect(); + assert_eq!(num_cat, vec![2, 4]); + } + + #[test] + fn matrix_transform_test() { + let (X, expectedX) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let nm = oh_enc.transform(&X).unwrap(); + assert_eq!(nm, expectedX); + + let (X, expectedX) = build_cat_first_and_last(); + let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let nm = oh_enc.transform(&X).unwrap(); + assert_eq!(nm, expectedX); + } +} diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs index 04b534e..16924bb 100644 --- a/src/preprocessing/data_traits.rs +++ b/src/preprocessing/data_traits.rs @@ -1,11 +1,13 @@ //! 
Traits to indicate that float variables can be viewed as categorical //! This module assumes +use crate::math::num::RealNumber; + pub type CategoricalFloat = u16; // pub struct CategoricalFloat(u16); -pub trait Categorizable { +pub trait Categorizable: RealNumber { type A; fn to_category(self) -> CategoricalFloat; From 2f03c1d6d74834d5bad990a5fd9c7cd7962fa351 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:54:42 -0800 Subject: [PATCH 19/35] module name change --- ...cal_encoders.rs => categorical_encoder.rs} | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) rename src/preprocessing/{categorical_encoders.rs => categorical_encoder.rs} (89%) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoder.rs similarity index 89% rename from src/preprocessing/categorical_encoders.rs rename to src/preprocessing/categorical_encoder.rs index 063aa5c..22cd052 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -38,7 +38,7 @@ pub struct OneHotEncoderParams { /// Column number that contain categorical variable pub col_idx_categorical: Option>, /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables - pub infer_categorical: bool, + infer_categorical: bool, } impl OneHotEncoderParams { @@ -86,14 +86,17 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) new_param_idxs } -fn validate_col_is_categorical(data: &Vec) -> bool { +fn validate_col_is_categorical(data: &[T]) -> bool { for v in data { - if !v.is_valid() { return false} + if !v.is_valid() { + return false; + } } true } /// Encode Categorical variavbles of data matrix to one-hot +#[derive(Debug, Clone)] pub struct OneHotEncoder { series_encoders: Vec>, col_idx_categorical: Vec, @@ -102,7 +105,7 @@ pub struct OneHotEncoder { impl OneHotEncoder { /// PlaceHolder - pub fn fit>( + pub fn fit>( data: &M, params: 
OneHotEncoderParams, ) -> Result { @@ -117,20 +120,24 @@ impl OneHotEncoder { (Some(mut idxs), false) => { // make sure categories have same order as data columns - idxs.sort(); + idxs.sort_unstable(); let (nrows, _) = data.shape(); // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - - let mut res: Vec> = Vec::with_capacity(idxs.len()); - + + let mut res: Vec> = + Vec::with_capacity(idxs.len()); + for &idx in &idxs { data.copy_col_as_vec(idx, &mut col_buf); if !validate_col_is_categorical(&col_buf) { - let msg = format!("Column {} of data matrix containts non categorizable (integer) values", idx); - return Err(Failed::fit(&msg[..])) + let msg = format!( + "Column {} of data matrix containts non categorizable (integer) values", + idx + ); + return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| v.to_category()); res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); @@ -149,7 +156,7 @@ impl OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Option { + pub fn transform>(&self, x: &M) -> Option { let (nrows, p) = x.shape(); let additional_params: Vec = self .series_encoders @@ -201,7 +208,7 @@ mod tests { #[test] fn adjust_idxs() { - assert_eq!(find_new_idxs(0, &[], &[]), Vec::new()); + assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); // [0,1,2] -> [0, 1, 1, 1, 2] assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); } @@ -282,4 +289,22 @@ mod tests { let nm = oh_enc.transform(&X).unwrap(); assert_eq!(nm, expectedX); } + + #[test] + fn fail_on_bad_category() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let params = OneHotEncoderParams::from_cat_idx(&[1]); + match OneHotEncoder::fit(&m, params) { + Err(_) => { + assert!(true); + } + _ => assert!(false), + } + } } From ca0816db97d7fa1426b98c5b97b548a8a89d2b12 Mon 
Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:55:04 -0800 Subject: [PATCH 20/35] Clippy fixes --- src/preprocessing/data_traits.rs | 14 ++++++-------- src/preprocessing/mod.rs | 2 +- src/preprocessing/series_encoder.rs | 3 ++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs index 16924bb..38d9e3e 100644 --- a/src/preprocessing/data_traits.rs +++ b/src/preprocessing/data_traits.rs @@ -1,11 +1,12 @@ //! Traits to indicate that float variables can be viewed as categorical -//! This module assumes +//! This module assumes use crate::math::num::RealNumber; pub type CategoricalFloat = u16; // pub struct CategoricalFloat(u16); +const ERROR_MARGIN: f64 = 0.001; pub trait Categorizable: RealNumber { type A; @@ -13,11 +14,9 @@ pub trait Categorizable: RealNumber { fn to_category(self) -> CategoricalFloat; fn is_valid(self) -> bool; - } impl Categorizable for f32 { - type A = CategoricalFloat; fn to_category(self) -> CategoricalFloat { @@ -26,20 +25,19 @@ impl Categorizable for f32 { fn is_valid(self) -> bool { let a = self.to_category(); - a as f32 == self + (a as f32 - self).abs() < (ERROR_MARGIN as f32) } } impl Categorizable for f64 { - type A = CategoricalFloat; - fn to_category(self) ->CategoricalFloat { + fn to_category(self) -> CategoricalFloat { self as CategoricalFloat } fn is_valid(self) -> bool { let a = self.to_category(); - a as f64 == self + (a as f64 - self).abs() < ERROR_MARGIN } -} \ No newline at end of file +} diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index c07b982..4a1abf3 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,5 +1,5 @@ /// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents -pub mod categorical_encoders; +pub mod categorical_encoder; mod data_traits; /// Encode a series (column, array) of categorical variables as one-hot vectors pub mod 
series_encoder; diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 321f049..438d678 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -65,6 +65,7 @@ pub fn make_one_hot>( /// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` +#[derive(Debug, Clone)] pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, @@ -134,7 +135,7 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder &self, categories: &'a [CategoryType], ) -> Vec>> { - let v = categories.iter().map(|a| a.clone()); + let v = categories.iter().cloned(); self.transform_iter(v) } From 863be5ef756518f8d213266f195a4c06b403d5fd Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 20:09:52 -0800 Subject: [PATCH 21/35] style fixes --- src/preprocessing/categorical_encoder.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 22cd052..b05a344 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -262,9 +262,9 @@ mod tests { } #[test] fn test_fit() { - let (X, _) = build_fake_matrix(); + let (x, _) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); assert_eq!(oh_enc.series_encoders.len(), 2); let num_cat: Vec = oh_enc @@ -277,17 +277,17 @@ mod tests { #[test] fn matrix_transform_test() { - let (X, expectedX) = build_fake_matrix(); + let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); - let nm = oh_enc.transform(&X).unwrap(); - assert_eq!(nm, expectedX); + let oh_enc = OneHotEncoder::fit(&x, 
params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); - let (X, expectedX) = build_cat_first_and_last(); + let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); - let nm = oh_enc.transform(&X).unwrap(); - assert_eq!(nm, expectedX); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); } #[test] From f4b5936dcfde9c3e82c4098016c2555a4e6210e2 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 20:18:52 -0800 Subject: [PATCH 22/35] fmt --- src/preprocessing/categorical_encoder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index b05a344..706670b 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -277,17 +277,17 @@ mod tests { #[test] fn matrix_transform_test() { - let (x, expected_x) = build_fake_matrix(); + let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); + assert_eq!(nm, expected_x); - let (x, expected_x) = build_cat_first_and_last(); + let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); + assert_eq!(nm, expected_x); } #[test] From a882741e1273e7e0d2742f48f84920ae759aadaf Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 1 Feb 2021 11:20:03 -0800 Subject: [PATCH 23/35] If transform fails - fail before copying the whole matrix (changed the order of copying, first do the categorical, then copy the rest) ---
src/preprocessing/categorical_encoder.rs | 46 ++++++++++++++++-------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 706670b..7e71119 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -156,7 +156,7 @@ impl OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Option { + pub fn transform>(&self, x: &M) -> Result { let (nrows, p) = x.shape(); let additional_params: Vec = self .series_encoders @@ -164,28 +164,24 @@ impl OneHotEncoder { .map(|enc| enc.num_categories) .collect(); - let new_param_num: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + // Eac category of size v adds v-1 params + let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); - let mut res = M::zeros(nrows, new_param_num); - // copy old data in x to their new location - for (old_p, &new_p) in new_col_idx.iter().enumerate() { - for r in 0..nrows { - let val = x.get(r, old_p); - res.set(r, new_p, val); - } - } + let mut res = M::zeros(nrows, expandws_p); + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { let cidx = new_col_idx[old_cidx]; - let col_iter = (0..nrows).map(|r| res.get(r, cidx).to_category()); + let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); let sencoder = &self.series_encoders[pidx]; let oh_series: Vec>> = sencoder.transform_iter(col_iter); for (row, oh_vec) in oh_series.iter().enumerate() { match oh_vec { None => { - // Bad value in a series causes in to be invalid - // todo: proper error handling, so user can know where the bad value is - return None; + // Since we support T types, bad value in a series causes in to be invalid + let msg = format!("At least one value in 
column {} doesn't conform to category definition", old_cidx); + return Err(Failed::transform(&msg[..])); } Some(v) => { // copy one hot vectors to their place in the data matrix; @@ -196,7 +192,27 @@ impl OneHotEncoder { } } } - Some(res) + + // copy old data in x to their new location while skipping catergorical vars (already treated) + let mut skip_idx_iter = self.col_idx_categorical.iter(); + let mut cur_skip = skip_idx_iter.next(); + + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + // if found treated varible, skip it + if let Some(&v) = cur_skip { + if v == old_p { + cur_skip = skip_idx_iter.next(); + continue; + } + } + + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); + } + } + + Ok(res) } } From 03b9f76e9f9a18910cd59c5859b21571e05bb559 Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 1 Feb 2021 11:24:20 -0800 Subject: [PATCH 24/35] Doc+Naming Improvement --- src/preprocessing/categorical_encoder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 7e71119..7a0f5d9 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -52,12 +52,12 @@ impl OneHotEncoderParams { } /// Calculate the offset to parameters to due introduction of one-hot encoding -fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { +fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec { // This functions uses iterators and returns a vector. 
// In case we get a huge amount of paramenters this might be a problem // todo: Change this such that it will return an iterator - let cat_idx = encoded_idxs.iter().copied().chain((num_params..).take(1)); + let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); // Offset is constant between two categorical values, here we calculate the number of steps // that remain constant @@ -103,8 +103,8 @@ pub struct OneHotEncoder { } impl OneHotEncoder { - /// PlaceHolder - + + /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, From 228b54baf7d04715c1e170af2be506a99caf044e Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 1 Feb 2021 11:24:50 -0800 Subject: [PATCH 25/35] fmt --- src/preprocessing/categorical_encoder.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 7a0f5d9..e3e8ce9 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -103,8 +103,7 @@ pub struct OneHotEncoder { } impl OneHotEncoder { - - /// Create an encoder instance with categories infered from data matrix + /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, From 19ff6df84cd3d55f7accd44b2986289691059fa8 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 17:40:58 -0800 Subject: [PATCH 26/35] Separate mapper object --- src/preprocessing/series_encoder.rs | 67 +++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 438d678..4e9625e 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,6 +8,73 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; +#[derive(Debug, Clone)] +pub struct CategoryMapper { + 
category_map: HashMap, + categories: Vec, + num_categories: usize, +} + +impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { + fn fit_to_iter(categories: impl Iterator) -> Self { + let mut category_map: HashMap = HashMap::new(); + let mut category_num = 0usize; + let mut unique_lables: Vec = Vec::new(); + + for l in categories { + if !category_map.contains_key(&l) { + category_map.insert(l.clone(), category_num); + unique_lables.push(l.clone()); + category_num += 1; + } + } + Self { + category_map, + num_categories: category_num, + categories: unique_lables, + } + } + + fn from_category_map(category_map: HashMap) -> Self { + let mut _unique_cat: Vec<(CategoryType, usize)> = + category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); + _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); + let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); + Self { + num_categories: categories.len(), + categories, + category_map, + } + } + + fn from_positional_category_vec(categories: Vec) -> Self { + let category_map: HashMap = categories + .iter() + .enumerate() + .map(|(v, k)| (k.clone(), v)) + .collect(); + Self { + num_categories: categories.len(), + category_map, + categories, + } + } + + /// Get label num of a category + fn get_num(&self, category: &CategoryType) -> Option<&usize> { + self.category_map.get(category) + } + + /// Return category corresponding to label num + fn get_cat(&self, num: usize) -> &CategoryType { + &self.categories[num] + } + + fn get_categories(&self) -> &[CategoryType] { + &self.categories[..] 
+ } +} + /// Make a one-hot encoded vector from a categorical variable /// /// Example: From d31145b4fe24e0718aef3b0b9371e9e2834b31ce Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:19:36 -0800 Subject: [PATCH 27/35] Define common series encoder behavior --- src/preprocessing/series_encoder.rs | 146 +++++++++++++--------------- 1 file changed, 70 insertions(+), 76 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 4e9625e..4e9ddf9 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -75,6 +75,50 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } } +/// Defines common behavior for series encoders(e.g. OneHot, Ordinal) +pub trait SeriesEncoder: + where + CategoryType:Hash + Eq + Clone +{ + /// Fit an encoder to a lable list + fn fit_to_iter(categories: impl Iterator) -> Self; + + /// Number of categories for categorical variable + fn num_categories(&self) -> usize; + + /// Build an encoder from a predefined (category -> class number) map + fn from_category_map(category_map: HashMap) -> Self; + + /// Build an encoder from a predefined positional category-class num vector + fn from_positional_category_vec(categories: Vec) -> Self; + + /// Transform a single category type into a one-hot vector + fn transform_one>(&self, category: &CategoryType) -> Option; + + /// Invert one-hot vector, back to the category + fn invert_one>(&self, one_hot: V) -> Result; + + /// Get categories ordered by encoder's category enumeration + fn get_categories(&self) -> &[CategoryType]; + + /// Take an iterator as a series to transform + fn transform_iter>( + &self, + cat_it: impl Iterator, + ) -> Vec> { + cat_it.map(|l| self.transform_one(&l)).collect() + } + + /// Transform a slice of category types into one-hot vectors + /// None is returned if unknown category is encountered + fn transfrom_series>( + &self, + categories: &[CategoryType], + ) -> Vec> { + let v = 
categories.iter().cloned(); + self.transform_iter(v) + } +} /// Make a one-hot encoded vector from a categorical variable /// /// Example: @@ -134,104 +178,47 @@ pub fn make_one_hot>( /// ``` #[derive(Debug, Clone)] pub struct SeriesOneHotEncoder { - category_map: HashMap, - categories: Vec, - /// Number of categories for categorical variable - pub num_categories: usize, + mapper: CategoryMapper, } -impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder { - /// Fit an encoder to a lable list - pub fn fit_to_iter(categories: impl Iterator) -> Self { - let mut category_map: HashMap = HashMap::new(); - let mut category_num = 0usize; - let mut unique_lables: Vec = Vec::new(); +impl SeriesEncoder for SeriesOneHotEncoder { - for l in categories { - if !category_map.contains_key(&l) { - category_map.insert(l.clone(), category_num); - unique_lables.push(l.clone()); - category_num += 1; + fn fit_to_iter(categories: impl Iterator) -> Self { + Self {mapper:CategoryMapper::fit_to_iter(categories)} } - } - Self { - category_map, - num_categories: category_num, - categories: unique_lables, - } - } /// Build an encoder from a predefined (category -> class number) map - pub fn from_category_map(category_map: HashMap) -> Self { - let mut _unique_cat: Vec<(CategoryType, usize)> = - category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); - _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); - let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); - Self { - num_categories: categories.len(), - categories, - category_map, + fn from_category_map(category_map: HashMap) -> Self { + Self {mapper: CategoryMapper::from_category_map(category_map)} } - } /// Build an encoder from a predefined positional category-class num vector - pub fn from_positional_category_vec(categories: Vec) -> Self { - let category_map: HashMap = categories - .iter() - .enumerate() - .map(|(v, k)| (k.clone(), v)) - .collect(); - Self { - num_categories: categories.len(), - category_map, - 
categories, + fn from_positional_category_vec(categories: Vec) -> Self { + Self {mapper:CategoryMapper::from_positional_category_vec(categories)} } + + fn num_categories(&self) -> usize { + self.mapper.num_categories } - /// Take an iterator as a series to transform - pub fn transform_iter( - &self, - cat_it: impl Iterator, - ) -> Vec>> { - cat_it.map(|l| self.transform_one(&l)).collect() + fn get_categories(&self) -> &[CategoryType] { + self.mapper.get_categories() } - /// Transform a slice of category types into one-hot vectors - /// None is returned if unknown category is encountered - pub fn transfrom_series( - &self, - categories: &'a [CategoryType], - ) -> Vec>> { - let v = categories.iter().cloned(); - self.transform_iter(v) - } - - /// Transform a single category type into a one-hot vector - pub fn transform_one(&self, category: &CategoryType) -> Option> { - match self.category_map.get(category) { - None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_categories)), - } - } - - /// Get categories ordered by encoder's category enumeration - pub fn get_categories(&self) -> &Vec { - &self.categories - } - - /// Invert one-hot vector, back to the category - pub fn invert_one(&self, one_hot: Vec) -> Result { + fn invert_one>(&self, one_hot: V) -> Result + { let pos = U::from_f64(1f64).unwrap(); + + let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); - let s: Vec = one_hot - .into_iter() + let s: Vec = oh_it .enumerate() .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) .collect(); if s.len() == 1 { let idx = s[0]; - return Ok(self.categories[idx].clone()); + return Ok(self.mapper.get_cat(idx).clone()); } let pos_entries = format!( "Expected a single positive entry, {} entires found", @@ -239,6 +226,13 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder ); Err(Failed::transform(&pos_entries[..])) } + + fn transform_one>(&self, category: &CategoryType) -> Option { + match self.mapper.get_num(category) { + None => 
None, + Some(&idx) => Some(make_one_hot(idx, self.num_categories())), + } + } } #[cfg(test)] From 237b1160b17308252b6040d4c5ca07880079051c Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:20:27 -0800 Subject: [PATCH 28/35] doc update --- src/preprocessing/series_encoder.rs | 64 ++++++++++++++++------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 4e9ddf9..9d7e259 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,6 +8,7 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; +/// Bi-directional map category <-> label num. #[derive(Debug, Clone)] pub struct CategoryMapper { category_map: HashMap, @@ -16,7 +17,9 @@ pub struct CategoryMapper { } impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { - fn fit_to_iter(categories: impl Iterator) -> Self { + + /// Fit an encoder to a lable iterator + pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); let mut category_num = 0usize; let mut unique_lables: Vec = Vec::new(); @@ -34,8 +37,9 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { categories: unique_lables, } } - - fn from_category_map(category_map: HashMap) -> Self { + + /// Build an encoder from a predefined (category -> class number) map + pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(CategoryType, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); @@ -46,8 +50,9 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { category_map, } } - - fn from_positional_category_vec(categories: Vec) -> Self { + + /// Build an encoder from a predefined positional category-class num vector + pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories .iter() 
.enumerate() @@ -61,16 +66,17 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Get label num of a category - fn get_num(&self, category: &CategoryType) -> Option<&usize> { + pub fn get_num(&self, category: &CategoryType) -> Option<&usize> { self.category_map.get(category) } /// Return category corresponding to label num - fn get_cat(&self, num: usize) -> &CategoryType { + pub fn get_cat(&self, num: usize) -> &CategoryType { &self.categories[num] } - fn get_categories(&self) -> &[CategoryType] { + /// List all categories (position = category number) + pub fn get_categories(&self) -> &[CategoryType] { &self.categories[..] } } @@ -80,14 +86,14 @@ pub trait SeriesEncoder: where CategoryType:Hash + Eq + Clone { - /// Fit an encoder to a lable list + /// Fit an encoder to a lable iterator fn fit_to_iter(categories: impl Iterator) -> Self; /// Number of categories for categorical variable fn num_categories(&self) -> usize; /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self; + fn from_category_map(category_map: HashMap) -> Self; /// Build an encoder from a predefined positional category-class num vector fn from_positional_category_vec(categories: Vec) -> Self; @@ -119,6 +125,7 @@ pub trait SeriesEncoder: self.transform_iter(v) } } + /// Make a one-hot encoded vector from a categorical variable /// /// Example: @@ -182,20 +189,20 @@ pub struct SeriesOneHotEncoder { } impl SeriesEncoder for SeriesOneHotEncoder { - + fn fit_to_iter(categories: impl Iterator) -> Self { Self {mapper:CategoryMapper::fit_to_iter(categories)} - } + } /// Build an encoder from a predefined (category -> class number) map fn from_category_map(category_map: HashMap) -> Self { Self {mapper: CategoryMapper::from_category_map(category_map)} - } + } /// Build an encoder from a predefined positional category-class num vector fn from_positional_category_vec(categories: Vec) -> Self { Self 
{mapper:CategoryMapper::from_positional_category_vec(categories)} - } + } fn num_categories(&self) -> usize { self.mapper.num_categories @@ -207,25 +214,25 @@ impl SeriesEncoder for SeriesOneH fn invert_one>(&self, one_hot: V) -> Result { - let pos = U::from_f64(1f64).unwrap(); + let pos = U::from_f64(1f64).unwrap(); let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); - + let s: Vec = oh_it - .enumerate() - .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) - .collect(); - - if s.len() == 1 { - let idx = s[0]; + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + + if s.len() == 1 { + let idx = s[0]; return Ok(self.mapper.get_cat(idx).clone()); + } + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); + Err(Failed::transform(&pos_entries[..])) } - let pos_entries = format!( - "Expected a single positive entry, {} entires found", - s.len() - ); - Err(Failed::transform(&pos_entries[..])) - } fn transform_one>(&self, category: &CategoryType) -> Option { match self.mapper.get_num(category) { @@ -233,6 +240,7 @@ impl SeriesEncoder for SeriesOneH Some(&idx) => Some(make_one_hot(idx, self.num_categories())), } } + } #[cfg(test)] From ef06f45638ec42540d74f41ffd2171f2d97e793f Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:21:06 -0800 Subject: [PATCH 29/35] Switch to use SeriesEncoder trait --- src/preprocessing/categorical_encoder.rs | 35 ++++++++++++++---------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index e3e8ce9..75cbf2b 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -6,7 +6,7 @@ //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; +//! 
use smartcore::preprocessing::categorical_encoder::{OneHotEnc, OneHotEncoderParams}; //! let data = DenseMatrix::from_2d_array(&[ //! &[1.5, 1.0, 1.5, 3.0], //! &[1.5, 2.0, 1.5, 4.0], @@ -15,7 +15,7 @@ //! ]); //! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); //! // Infer number of categories from data and return a reusable encoder -//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! let encoder = OneHotEnc::fit(&data, encoder_params).unwrap(); //! // Transform categorical to one-hot encoded (can transform similar) //! let oh_data = encoder.transform(&data).unwrap(); //! // Produces the following: @@ -30,7 +30,7 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::SeriesOneHotEncoder; +use crate::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; /// OneHotEncoder Parameters #[derive(Debug, Clone)] @@ -97,17 +97,17 @@ fn validate_col_is_categorical(data: &[T]) -> bool { /// Encode Categorical variavbles of data matrix to one-hot #[derive(Debug, Clone)] -pub struct OneHotEncoder { - series_encoders: Vec>, +pub struct OneHotEncoder { + series_encoders: Vec, col_idx_categorical: Vec, } -impl OneHotEncoder { +impl> OneHotEncoder { /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, - ) -> Result { + ) -> Result, Failed> { match (params.col_idx_categorical, params.infer_categorical) { (None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", @@ -126,7 +126,7 @@ impl OneHotEncoder { // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - let mut res: Vec> = + let mut res: Vec = Vec::with_capacity(idxs.len()); for &idx in &idxs { @@ -139,7 +139,7 @@ impl OneHotEncoder { return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| 
v.to_category()); - res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); + res.push(E::fit_to_iter(hashable_col)); } Ok(Self { @@ -160,7 +160,7 @@ impl OneHotEncoder { let additional_params: Vec = self .series_encoders .iter() - .map(|enc| enc.num_categories) + .map(|enc| enc.num_categories()) .collect(); // Eac category of size v adds v-1 params @@ -215,12 +215,17 @@ impl OneHotEncoder { } } +/// Convinince type for common use +pub type OneHotEnc = OneHotEncoder>; + + #[cfg(test)] mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::preprocessing::series_encoder::SeriesOneHotEncoder; + #[test] fn adjust_idxs() { assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); @@ -279,13 +284,13 @@ mod tests { fn test_fit() { let (x, _) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); assert_eq!(oh_enc.series_encoders.len(), 2); let num_cat: Vec = oh_enc .series_encoders .iter() - .map(|a| a.num_categories) + .map(|a| a.num_categories()) .collect(); assert_eq!(num_cat, vec![2, 4]); } @@ -294,13 +299,13 @@ mod tests { fn matrix_transform_test() { let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); } @@ -315,7 +320,7 @@ mod tests { ]); let params = OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEncoder::fit(&m, params) { + match OneHotEnc::fit(&m, params) { Err(_) => { assert!(true); } From 
700d320724c8dad09cdd31e3d73e5cc4d91c33ce Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 10:45:25 -0800 Subject: [PATCH 30/35] simplify SeriesEncoder trait --- src/preprocessing/series_encoder.rs | 134 ++++++++++++++-------------- 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 9d7e259..6975c0d 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -10,19 +10,22 @@ use std::hash::Hash; /// Bi-directional map category <-> label num. #[derive(Debug, Clone)] -pub struct CategoryMapper { - category_map: HashMap, - categories: Vec, +pub struct CategoryMapper { + category_map: HashMap, + categories: Vec, num_categories: usize, } -impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { +impl<'a, C> CategoryMapper +where + C: 'a + Hash + Eq + Clone +{ /// Fit an encoder to a lable iterator - pub fn fit_to_iter(categories: impl Iterator) -> Self { - let mut category_map: HashMap = HashMap::new(); + pub fn fit_to_iter(categories: impl Iterator) -> Self { + let mut category_map: HashMap = HashMap::new(); let mut category_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + let mut unique_lables: Vec = Vec::new(); for l in categories { if !category_map.contains_key(&l) { @@ -39,11 +42,11 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Build an encoder from a predefined (category -> class number) map - pub fn from_category_map(category_map: HashMap) -> Self { - let mut _unique_cat: Vec<(CategoryType, usize)> = + pub fn from_category_map(category_map: HashMap) -> Self { + let mut _unique_cat: Vec<(C, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); - let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); + let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); Self { num_categories: categories.len(), 
categories, @@ -52,8 +55,8 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Build an encoder from a predefined positional category-class num vector - pub fn from_positional_category_vec(categories: Vec) -> Self { - let category_map: HashMap = categories + pub fn from_positional_category_vec(categories: Vec) -> Self { + let category_map: HashMap = categories .iter() .enumerate() .map(|(v, k)| (k.clone(), v)) @@ -66,64 +69,49 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Get label num of a category - pub fn get_num(&self, category: &CategoryType) -> Option<&usize> { + pub fn get_num(&self, category: &C) -> Option<&usize> { self.category_map.get(category) } /// Return category corresponding to label num - pub fn get_cat(&self, num: usize) -> &CategoryType { + pub fn get_cat(&self, num: usize) -> &C { &self.categories[num] } /// List all categories (position = category number) - pub fn get_categories(&self) -> &[CategoryType] { + pub fn get_categories(&self) -> &[C] { &self.categories[..] } } /// Defines common behavior for series encoders(e.g. 
OneHot, Ordinal) -pub trait SeriesEncoder: +pub trait SeriesEncoder: where - CategoryType:Hash + Eq + Clone + C: Hash + Eq + Clone { /// Fit an encoder to a lable iterator - fn fit_to_iter(categories: impl Iterator) -> Self; + fn fit_to_iter(categories: impl Iterator) -> Self; /// Number of categories for categorical variable fn num_categories(&self) -> usize; - /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self; - - /// Build an encoder from a predefined positional category-class num vector - fn from_positional_category_vec(categories: Vec) -> Self; - /// Transform a single category type into a one-hot vector - fn transform_one>(&self, category: &CategoryType) -> Option; + fn transform_one>(&self, category: &C) -> Option; /// Invert one-hot vector, back to the category - fn invert_one>(&self, one_hot: V) -> Result; + fn invert_one>(&self, one_hot: V) -> Result; /// Get categories ordered by encoder's category enumeration - fn get_categories(&self) -> &[CategoryType]; + fn get_categories(&self) -> &[C]; /// Take an iterator as a series to transform + /// None is returned if unknown category is encountered fn transform_iter>( &self, - cat_it: impl Iterator, + cat_it: impl Iterator, ) -> Vec> { cat_it.map(|l| self.transform_one(&l)).collect() } - - /// Transform a slice of category types into one-hot vectors - /// None is returned if unknown category is encountered - fn transfrom_series>( - &self, - categories: &[CategoryType], - ) -> Vec> { - let v = categories.iter().cloned(); - self.transform_iter(v) - } } /// Make a one-hot encoded vector from a categorical variable @@ -153,22 +141,22 @@ pub fn make_one_hot>( /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; /// /// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 
4]; /// let it = fake_categories.iter().map(|&a| a); -/// let enc = SeriesOneHotEncoder::::fit_to_iter(it); +/// let enc: SeriesOneHotEncoder:: = SeriesEncoder::fit_to_iter(it); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); /// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); /// ``` /// -/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` /// /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder, CategoryMapper}; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -176,43 +164,53 @@ pub fn make_one_hot>( /// .collect(); /// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = SeriesOneHotEncoder::<&str>::from_positional_category_vec(category_vec); -/// let enc_lm = SeriesOneHotEncoder::<&str>::from_category_map(category_map); +/// let enc_lv = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_positional_category_vec(category_vec)); +/// let enc_lm = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_category_map(category_map)); /// /// // ["background", "dog", "cat"] /// println!("{:?}", enc_lv.get_categories()); -/// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) +/// let lv: Vec = enc_lv.transform_one(&"dog").unwrap(); +/// let lm: Vec = enc_lm.transform_one(&"dog").unwrap(); +/// assert_eq!(lv, lm); /// ``` #[derive(Debug, Clone)] -pub struct SeriesOneHotEncoder { - mapper: CategoryMapper, +pub struct SeriesOneHotEncoder { + mapper: CategoryMapper, } -impl SeriesEncoder for SeriesOneHotEncoder { +impl SeriesOneHotEncoder +where + C: Hash + Eq + Clone +{ + /// Create SeriesEncoder form 
existing mapper + pub fn new(mapper: CategoryMapper) -> Self { + Self {mapper} + } +} + +impl SeriesEncoder for SeriesOneHotEncoder +where + C: Hash + Eq + Clone +{ - fn fit_to_iter(categories: impl Iterator) -> Self { + + fn fit_to_iter(categories: impl Iterator) -> Self { Self {mapper:CategoryMapper::fit_to_iter(categories)} } - /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self { - Self {mapper: CategoryMapper::from_category_map(category_map)} - } - - /// Build an encoder from a predefined positional category-class num vector - fn from_positional_category_vec(categories: Vec) -> Self { - Self {mapper:CategoryMapper::from_positional_category_vec(categories)} - } - fn num_categories(&self) -> usize { self.mapper.num_categories } - fn get_categories(&self) -> &[CategoryType] { + fn get_categories(&self) -> &[C] { self.mapper.get_categories() } - fn invert_one>(&self, one_hot: V) -> Result + fn invert_one(&self, one_hot: V) -> Result + where + U: RealNumber, + V: BaseVector + { let pos = U::from_f64(1f64).unwrap(); @@ -234,7 +232,11 @@ impl SeriesEncoder for SeriesOneH Err(Failed::transform(&pos_entries[..])) } - fn transform_one>(&self, category: &CategoryType) -> Option { + fn transform_one(&self, category: &C) -> Option + where + U: RealNumber, + V: BaseVector + { match self.mapper.get_num(category) { None => None, Some(&idx) => Some(make_one_hot(idx, self.num_categories())), @@ -262,7 +264,7 @@ mod tests { fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> { let fake_category_pos = vec!["background", "dog", "cat"]; - let enc = SeriesOneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); + let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_positional_category_vec(fake_category_pos)); enc } @@ -271,7 +273,7 @@ mod tests { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = 
SeriesOneHotEncoder::<&str>::from_category_map(category_map); + let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_category_map(category_map)); let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong categories"), Some(v) => v, @@ -306,8 +308,8 @@ mod tests { #[test] fn test_many_categorys() { let enc = build_fake_str_enc(); - let res: Vec>> = - enc.transfrom_series(&["dog", "cat", "fish", "background"]); + let cat_it = ["dog", "cat", "fish", "background"].iter().cloned(); + let res: Vec>> = enc.transform_iter(cat_it); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 3cc20fd400682356ac0dfe1dfeb1206172983123 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:39:26 -0800 Subject: [PATCH 31/35] Move all functionality to CategoryMapper (one-hot and ordinal). --- src/preprocessing/series_encoder.rs | 181 +++++++++------------------- 1 file changed, 58 insertions(+), 123 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 6975c0d..cdbae16 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,7 +8,48 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; -/// Bi-directional map category <-> label num. +/// ## Bi-directional map category <-> label num. +/// Turn Hashable objects into a one-hot vectors or ordinal values. +/// This struct encodes single class per exmample +/// +/// You can fit_to_iter a category enumeration by passing an iterator of categories. 
+/// category numbers will be assigned in the order they are encountered +/// +/// Example: +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::series_encoder::CategoryMapper; +/// +/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; +/// let it = fake_categories.iter().map(|&a| a); +/// let enc = CategoryMapper::::fit_to_iter(it); +/// let oh_vec: Vec = enc.get_one_hot(&1).unwrap(); +/// // notice that 1 is actually a zero-th positional category +/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); +/// ``` +/// +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` +/// +/// +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::series_encoder::CategoryMapper; +/// +/// let category_map: HashMap<&str, usize> = +/// vec![("cat", 2), ("background",0), ("dog", 1)] +/// .into_iter() +/// .collect(); +/// let category_vec = vec!["background", "dog", "cat"]; +/// +/// let enc_lv = CategoryMapper::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = CategoryMapper::<&str>::from_category_map(category_map); +/// +/// // ["background", "dog", "cat"] +/// println!("{:?}", enc_lv.get_categories()); +/// let lv: Vec = enc_lv.get_one_hot(&"dog").unwrap(); +/// let lm: Vec = enc_lm.get_one_hot(&"dog").unwrap(); +/// assert_eq!(lv, lm); +/// ``` #[derive(Debug, Clone)] pub struct CategoryMapper { category_map: HashMap, @@ -16,10 +57,14 @@ pub struct CategoryMapper { num_categories: usize, } -impl<'a, C> CategoryMapper +impl CategoryMapper where - C: 'a + Hash + Eq + Clone + C: Hash + Eq + Clone, { + /// Get the number of categories in the mapper + pub fn num_categories(&self) -> usize { + self.num_categories + } /// Fit an encoder to a lable iterator pub fn fit_to_iter(categories: impl Iterator) -> Self { @@ -82,131 +127,21 @@ where pub fn get_categories(&self) -> &[C] { &self.categories[..] 
} -} -/// Defines common behavior for series encoders(e.g. OneHot, Ordinal) -pub trait SeriesEncoder: + /// Get one-hot encoding of the category + pub fn get_one_hot(&self, category: &C) -> Option where - C: Hash + Eq + Clone + U: RealNumber, + V: BaseVector, { - /// Fit an encoder to a lable iterator - fn fit_to_iter(categories: impl Iterator) -> Self; - - /// Number of categories for categorical variable - fn num_categories(&self) -> usize; - - /// Transform a single category type into a one-hot vector - fn transform_one>(&self, category: &C) -> Option; + match self.get_num(category) { + None => None, + Some(&idx) => Some(make_one_hot::(idx, self.num_categories)), + } +} /// Invert one-hot vector, back to the category - fn invert_one>(&self, one_hot: V) -> Result; - - /// Get categories ordered by encoder's category enumeration - fn get_categories(&self) -> &[C]; - - /// Take an iterator as a series to transform - /// None is returned if unknown category is encountered - fn transform_iter>( - &self, - cat_it: impl Iterator, - ) -> Vec> { - cat_it.map(|l| self.transform_one(&l)).collect() - } -} - -/// Make a one-hot encoded vector from a categorical variable -/// -/// Example: -/// ``` -/// use smartcore::preprocessing::series_encoder::make_one_hot; -/// let one_hot: Vec = make_one_hot(2, 3); -/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); -/// ``` -pub fn make_one_hot>( - category_idx: usize, - num_categories: usize, -) -> V { - let pos = T::from_f64(1f64).unwrap(); - let mut z = V::zeros(num_categories); - z.set(category_idx, pos); - z -} - -/// Turn a collection of Hashable objects into a one-hot vectors. -/// This struct encodes single class per exmample -/// -/// You can fit_to_iter a category enumeration by passing an iterator of categories. 
-/// category numbers will be assigned in the order they are encountered -/// -/// Example: -/// ``` -/// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; -/// -/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; -/// let it = fake_categories.iter().map(|&a| a); -/// let enc: SeriesOneHotEncoder:: = SeriesEncoder::fit_to_iter(it); -/// let oh_vec: Vec = enc.transform_one(&1).unwrap(); -/// // notice that 1 is actually a zero-th positional category -/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); -/// ``` -/// -/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` -/// -/// -/// ``` -/// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder, CategoryMapper}; -/// -/// let category_map: HashMap<&str, usize> = -/// vec![("cat", 2), ("background",0), ("dog", 1)] -/// .into_iter() -/// .collect(); -/// let category_vec = vec!["background", "dog", "cat"]; -/// -/// let enc_lv = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_positional_category_vec(category_vec)); -/// let enc_lm = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_category_map(category_map)); -/// -/// // ["background", "dog", "cat"] -/// println!("{:?}", enc_lv.get_categories()); -/// let lv: Vec = enc_lv.transform_one(&"dog").unwrap(); -/// let lm: Vec = enc_lm.transform_one(&"dog").unwrap(); -/// assert_eq!(lv, lm); -/// ``` -#[derive(Debug, Clone)] -pub struct SeriesOneHotEncoder { - mapper: CategoryMapper, -} - -impl SeriesOneHotEncoder -where - C: Hash + Eq + Clone -{ - /// Create SeriesEncoder form existing mapper - pub fn new(mapper: CategoryMapper) -> Self { - Self {mapper} - } -} - -impl SeriesEncoder for SeriesOneHotEncoder -where - C: Hash + Eq + Clone -{ - - - fn fit_to_iter(categories: impl Iterator) -> Self { - Self {mapper:CategoryMapper::fit_to_iter(categories)} - } - - fn 
num_categories(&self) -> usize { - self.mapper.num_categories - } - - fn get_categories(&self) -> &[C] { - self.mapper.get_categories() - } - - fn invert_one(&self, one_hot: V) -> Result + pub fn invert_one_hot(&self, one_hot: V) -> Result where U: RealNumber, V: BaseVector From 374dfeceb906262a2797967cfa02514b5ca2d48d Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:41:25 -0800 Subject: [PATCH 32/35] No more SeriesEncoders. --- src/preprocessing/series_encoder.rs | 104 +++++++++++++++++----------- 1 file changed, 63 insertions(+), 41 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index cdbae16..e24eca1 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -65,7 +65,7 @@ where pub fn num_categories(&self) -> usize { self.num_categories } - + /// Fit an encoder to a lable iterator pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); @@ -85,7 +85,7 @@ where categories: unique_lables, } } - + /// Build an encoder from a predefined (category -> class number) map pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(C, usize)> = @@ -98,7 +98,7 @@ where category_map, } } - + /// Build an encoder from a predefined positional category-class num vector pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories @@ -130,54 +130,71 @@ where /// Get one-hot encoding of the category pub fn get_one_hot(&self, category: &C) -> Option - where + where U: RealNumber, V: BaseVector, -{ + { match self.get_num(category) { None => None, Some(&idx) => Some(make_one_hot::(idx, self.num_categories)), + } } -} /// Invert one-hot vector, back to the category pub fn invert_one_hot(&self, one_hot: V) -> Result where U: RealNumber, - V: BaseVector + V: BaseVector, + { + let pos = U::one(); - { - let pos = U::from_f64(1f64).unwrap(); - - let oh_it = 
(0..one_hot.len()).map(|idx| one_hot.get(idx)); - - let s: Vec = oh_it - .enumerate() - .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) - .collect(); - - if s.len() == 1 { - let idx = s[0]; - return Ok(self.mapper.get_cat(idx).clone()); - } - let pos_entries = format!( - "Expected a single positive entry, {} entires found", - s.len() - ); - Err(Failed::transform(&pos_entries[..])) + let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); + + let s: Vec = oh_it + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + + if s.len() == 1 { + let idx = s[0]; + return Ok(self.get_cat(idx).clone()); } + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); + Err(Failed::transform(&pos_entries[..])) + } - fn transform_one(&self, category: &C) -> Option + /// Get ordinal encoding of the catergory + pub fn get_ordinal(&self, category: &C) -> Option where U: RealNumber, - V: BaseVector { - match self.mapper.get_num(category) { + match self.get_num(category) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_categories())), + Some(&idx) => U::from_usize(idx), } } - +} + +/// Make a one-hot encoded vector from a categorical variable +/// +/// Example: +/// ``` +/// use smartcore::preprocessing::series_encoder::make_one_hot; +/// let one_hot: Vec = make_one_hot(2, 3); +/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); +/// ``` +pub fn make_one_hot(category_idx: usize, num_categories: usize) -> V +where + T: RealNumber, + V: BaseVector, +{ + let pos = T::one(); + let mut z = V::zeros(num_categories); + z.set(category_idx, pos); + z } #[cfg(test)] @@ -188,8 +205,8 @@ mod tests { fn from_categories() { let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let it = fake_categories.iter().map(|&a| a); - let enc = SeriesOneHotEncoder::::fit_to_iter(it); - let oh_vec: Vec = match enc.transform_one(&1) { + let enc = CategoryMapper::::fit_to_iter(it); + let oh_vec: 
Vec = match enc.get_one_hot(&1) { None => panic!("Wrong categories"), Some(v) => v, }; @@ -197,19 +214,24 @@ mod tests { assert_eq!(oh_vec, res); } - fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> { + fn build_fake_str_enc<'a>() -> CategoryMapper<&'a str> { let fake_category_pos = vec!["background", "dog", "cat"]; - let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_positional_category_vec(fake_category_pos)); + let enc = CategoryMapper::<&str>::from_positional_category_vec(fake_category_pos); enc } + #[test] + fn ordinal_encoding() { + let enc = build_fake_str_enc(); + assert_eq!(1f64, enc.get_ordinal::(&"dog").unwrap()) + } #[test] fn category_map_and_vec() { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_category_map(category_map)); - let oh_vec: Vec = match enc.transform_one(&"dog") { + let enc = CategoryMapper::<&str>::from_category_map(category_map); + let oh_vec: Vec = match enc.get_one_hot(&"dog") { None => panic!("Wrong categories"), Some(v) => v, }; @@ -220,7 +242,7 @@ mod tests { #[test] fn positional_categories_vec() { let enc = build_fake_str_enc(); - let oh_vec: Vec = match enc.transform_one(&"dog") { + let oh_vec: Vec = match enc.get_one_hot(&"dog") { None => panic!("Wrong categories"), Some(v) => v, }; @@ -232,9 +254,9 @@ mod tests { fn invert_label_test() { let enc = build_fake_str_enc(); let res: Vec = vec![0.0, 1.0, 0.0]; - let lab = enc.invert_one(res).unwrap(); + let lab = enc.invert_one_hot(res).unwrap(); assert_eq!(lab, "dog"); - if let Err(e) = enc.invert_one(vec![0.0, 0.0, 0.0]) { + if let Err(e) = enc.invert_one_hot(vec![0.0, 0.0, 0.0]) { let pos_entries = format!("Expected a single positive entry, 0 entires found"); assert_eq!(e, Failed::transform(&pos_entries[..])); }; @@ -244,7 +266,7 @@ mod tests { fn test_many_categorys() { let enc = build_fake_str_enc(); let cat_it = ["dog", 
"cat", "fish", "background"].iter().cloned(); - let res: Vec>> = enc.transform_iter(cat_it); + let res: Vec>> = cat_it.map(|v| enc.get_one_hot(&v)).collect(); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 828df4e338c0a44a38ad2004f3bae349322d1c94 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:42:27 -0800 Subject: [PATCH 33/35] Use CategoryMapper to transform an iterator. No more passing iterator to SeriesEncoders --- src/preprocessing/categorical_encoder.rs | 67 ++++++++++++------------ 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 75cbf2b..18e569a 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -1,12 +1,12 @@ //! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies //! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents //! -//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [SeriesOneHotEncoder](../series_encoder/struct.SeriesOneHotEncoder.html) +//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html) //! //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEnc, OneHotEncoderParams}; +//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; //! let data = DenseMatrix::from_2d_array(&[ //! &[1.5, 1.0, 1.5, 3.0], //! &[1.5, 2.0, 1.5, 4.0], @@ -15,7 +15,7 @@ //! ]); //! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); //! // Infer number of categories from data and return a reusable encoder -//! 
let encoder = OneHotEnc::fit(&data, encoder_params).unwrap(); +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); //! // Transform categorical to one-hot encoded (can transform similar) //! let oh_data = encoder.transform(&data).unwrap(); //! // Produces the following: @@ -30,7 +30,7 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; +use crate::preprocessing::series_encoder::CategoryMapper; /// OneHotEncoder Parameters #[derive(Debug, Clone)] @@ -97,17 +97,18 @@ fn validate_col_is_categorical(data: &[T]) -> bool { /// Encode Categorical variavbles of data matrix to one-hot #[derive(Debug, Clone)] -pub struct OneHotEncoder { - series_encoders: Vec, +pub struct OneHotEncoder { + category_mappers: Vec>, col_idx_categorical: Vec, } -impl> OneHotEncoder { +impl OneHotEncoder { /// Create an encoder instance with categories infered from data matrix - pub fn fit>( - data: &M, - params: OneHotEncoderParams, - ) -> Result, Failed> { + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result + where + T: Categorizable, + M: Matrix, + { match (params.col_idx_categorical, params.infer_categorical) { (None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", @@ -126,8 +127,7 @@ impl> OneHotEncoder { // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - let mut res: Vec = - Vec::with_capacity(idxs.len()); + let mut res: Vec> = Vec::with_capacity(idxs.len()); for &idx in &idxs { data.copy_col_as_vec(idx, &mut col_buf); @@ -139,11 +139,11 @@ impl> OneHotEncoder { return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| v.to_category()); - res.push(E::fit_to_iter(hashable_col)); + res.push(CategoryMapper::fit_to_iter(hashable_col)); } Ok(Self { - series_encoders: res, 
//Self::build_series_encoders::(data, &idxs[..]), + category_mappers: res, col_idx_categorical: idxs, }) } @@ -155,10 +155,14 @@ impl> OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Result { + pub fn transform(&self, x: &M) -> Result + where + T: Categorizable, + M: Matrix, + { let (nrows, p) = x.shape(); let additional_params: Vec = self - .series_encoders + .category_mappers .iter() .map(|enc| enc.num_categories()) .collect(); @@ -172,10 +176,10 @@ impl> OneHotEncoder { for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { let cidx = new_col_idx[old_cidx]; let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); - let sencoder = &self.series_encoders[pidx]; - let oh_series: Vec>> = sencoder.transform_iter(col_iter); + let sencoder = &self.category_mappers[pidx]; + let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); - for (row, oh_vec) in oh_series.iter().enumerate() { + for (row, oh_vec) in oh_series.enumerate() { match oh_vec { None => { // Since we support T types, bad value in a series causes in to be invalid @@ -215,16 +219,11 @@ impl> OneHotEncoder { } } -/// Convinince type for common use -pub type OneHotEnc = OneHotEncoder>; - - #[cfg(test)] mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::preprocessing::series_encoder::SeriesOneHotEncoder; - + use crate::preprocessing::series_encoder::CategoryMapper; #[test] fn adjust_idxs() { @@ -275,8 +274,8 @@ mod tests { let series = vec![3.0, 1.0, 2.0, 1.0]; let hashable_series: Vec = series.iter().map(|v| v.to_category()).collect(); - let enc = SeriesOneHotEncoder::from_positional_category_vec(hashable_series); - let inv = enc.invert_one(vec![0.0, 0.0, 1.0]); + let enc = CategoryMapper::from_positional_category_vec(hashable_series); + let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); let orig_val: f64 = inv.unwrap().into(); assert_eq!(orig_val, 
2.0); } @@ -284,11 +283,11 @@ mod tests { fn test_fit() { let (x, _) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); - assert_eq!(oh_enc.series_encoders.len(), 2); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + assert_eq!(oh_enc.category_mappers.len(), 2); let num_cat: Vec = oh_enc - .series_encoders + .category_mappers .iter() .map(|a| a.num_categories()) .collect(); @@ -299,13 +298,13 @@ mod tests { fn matrix_transform_test() { let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); } @@ -320,7 +319,7 @@ mod tests { ]); let params = OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEnc::fit(&m, params) { + match OneHotEncoder::fit(&m, params) { Err(_) => { assert!(true); } From af6ec2d402c1d3d6aca1881f7c80301487a94cab Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 9 Feb 2021 22:01:34 -0800 Subject: [PATCH 34/35] rename categorical --- src/preprocessing/categorical.rs | 329 +++++++++++++++++++++++++++++++ src/preprocessing/mod.rs | 2 +- 2 files changed, 330 insertions(+), 1 deletion(-) create mode 100644 src/preprocessing/categorical.rs diff --git a/src/preprocessing/categorical.rs b/src/preprocessing/categorical.rs new file mode 100644 index 0000000..8571e74 --- /dev/null +++ b/src/preprocessing/categorical.rs @@ -0,0 +1,329 @@ +//! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies +//! 
Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents +//! +//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html) +//! +//! ### Usage Example +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::preprocessing::categorical::{OneHotEncoder, OneHotEncoderParams}; +//! let data = DenseMatrix::from_2d_array(&[ +//! &[1.5, 1.0, 1.5, 3.0], +//! &[1.5, 2.0, 1.5, 4.0], +//! &[1.5, 1.0, 1.5, 5.0], +//! &[1.5, 2.0, 1.5, 6.0], +//! ]); +//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); +//! // Infer number of categories from data and return a reusable encoder +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! // Transform categorical to one-hot encoded (can transform similar) +//! let oh_data = encoder.transform(&data).unwrap(); +//! // Produces the following: +//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] +//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] +//! 
``` +use std::iter; + +use crate::error::Failed; +use crate::linalg::Matrix; + +use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; +use crate::preprocessing::series_encoder::CategoryMapper; + +/// OneHotEncoder Parameters +#[derive(Debug, Clone)] +pub struct OneHotEncoderParams { + /// Column number that contain categorical variable + pub col_idx_categorical: Option>, + /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables + infer_categorical: bool, +} + +impl OneHotEncoderParams { + /// Generate parameters from categorical variable column numbers + pub fn from_cat_idx(categorical_params: &[usize]) -> Self { + Self { + col_idx_categorical: Some(categorical_params.to_vec()), + infer_categorical: false, + } + } +} + +/// Calculate the offset to parameters to due introduction of one-hot encoding +fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec { + // This functions uses iterators and returns a vector. 
+ // In case we get a huge amount of paramenters this might be a problem + // todo: Change this such that it will return an iterator + + let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); + + // Offset is constant between two categorical values, here we calculate the number of steps + // that remain constant + let repeats = cat_idx.scan(0, |a, v| { + let im = v + 1 - *a; + *a = v; + Some(im) + }); + + // Calculate the offset to parameter idx due to newly intorduced one-hot vectors + let offset_ = cat_sizes.iter().scan(0, |a, &v| { + *a = *a + v - 1; + Some(*a) + }); + let offset = (0..1).chain(offset_); + + let new_param_idxs: Vec = (0..num_params) + .zip( + repeats + .zip(offset) + .map(|(r, o)| iter::repeat(o).take(r)) + .flatten(), + ) + .map(|(idx, ofst)| idx + ofst) + .collect(); + new_param_idxs +} + +fn validate_col_is_categorical(data: &[T]) -> bool { + for v in data { + if !v.is_valid() { + return false; + } + } + true +} + +/// Encode Categorical variavbles of data matrix to one-hot +#[derive(Debug, Clone)] +pub struct OneHotEncoder { + category_mappers: Vec>, + col_idx_categorical: Vec, +} + +impl OneHotEncoder { + /// Create an encoder instance with categories infered from data matrix + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result + where + T: Categorizable, + M: Matrix, + { + match (params.col_idx_categorical, params.infer_categorical) { + (None, false) => Err(Failed::fit( + "Must pass categorical series ids or infer flag", + )), + + (Some(_idxs), true) => Err(Failed::fit( + "Ambigous parameters, got both infer and categroy ids", + )), + + (Some(mut idxs), false) => { + // make sure categories have same order as data columns + idxs.sort_unstable(); + + let (nrows, _) = data.shape(); + + // col buffer to avoid allocations + let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); + + let mut res: Vec> = Vec::with_capacity(idxs.len()); + + for &idx in &idxs { + data.copy_col_as_vec(idx, &mut col_buf); + if 
!validate_col_is_categorical(&col_buf) { + let msg = format!( + "Column {} of data matrix containts non categorizable (integer) values", + idx + ); + return Err(Failed::fit(&msg[..])); + } + let hashable_col = col_buf.iter().map(|v| v.to_category()); + res.push(CategoryMapper::fit_to_iter(hashable_col)); + } + + Ok(Self { + category_mappers: res, + col_idx_categorical: idxs, + }) + } + + (None, true) => { + todo!("Auto-Inference for Categorical Variables not yet implemented") + } + } + } + + /// Transform categorical variables to one-hot encoded and return a new matrix + pub fn transform(&self, x: &M) -> Result + where + T: Categorizable, + M: Matrix, + { + let (nrows, p) = x.shape(); + let additional_params: Vec = self + .category_mappers + .iter() + .map(|enc| enc.num_categories()) + .collect(); + + // Eac category of size v adds v-1 params + let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); + let mut res = M::zeros(nrows, expandws_p); + + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { + let cidx = new_col_idx[old_cidx]; + let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); + let sencoder = &self.category_mappers[pidx]; + let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); + + for (row, oh_vec) in oh_series.enumerate() { + match oh_vec { + None => { + // Since we support T types, bad value in a series causes in to be invalid + let msg = format!("At least one value in column {} doesn't conform to category definition", old_cidx); + return Err(Failed::transform(&msg[..])); + } + Some(v) => { + // copy one hot vectors to their place in the data matrix; + for (col_ofst, &val) in v.iter().enumerate() { + res.set(row, cidx + col_ofst, val); + } + } + } + } + } + + // copy old data in x to their new location while skipping catergorical vars (already treated) + let mut skip_idx_iter = 
self.col_idx_categorical.iter(); + let mut cur_skip = skip_idx_iter.next(); + + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + // if found treated varible, skip it + if let Some(&v) = cur_skip { + if v == old_p { + cur_skip = skip_idx_iter.next(); + continue; + } + } + + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); + } + } + + Ok(res) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::preprocessing::series_encoder::CategoryMapper; + + #[test] + fn adjust_idxs() { + assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); + // [0,1,2] -> [0, 1, 1, 1, 2] + assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); + } + + fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { + let orig = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { + // Categorical first and last + let orig = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 1.5, 3.0], + &[1.5, 2.0, 1.5, 4.0], + &[1.5, 1.0, 1.5, 5.0], + &[1.5, 2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + #[test] + fn hash_encode_f64_series() { + let series = vec![3.0, 1.0, 2.0, 1.0]; + let hashable_series: Vec = + series.iter().map(|v| v.to_category()).collect(); + let enc = CategoryMapper::from_positional_category_vec(hashable_series); + let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); + let orig_val: f64 = inv.unwrap().into(); + 
assert_eq!(orig_val, 2.0); + } + #[test] + fn test_fit() { + let (x, _) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + assert_eq!(oh_enc.category_mappers.len(), 2); + + let num_cat: Vec = oh_enc + .category_mappers + .iter() + .map(|a| a.num_categories()) + .collect(); + assert_eq!(num_cat, vec![2, 4]); + } + + #[test] + fn matrix_transform_test() { + let (x, expected_x) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); + + let (x, expected_x) = build_cat_first_and_last(); + let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); + } + + #[test] + fn fail_on_bad_category() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let params = OneHotEncoderParams::from_cat_idx(&[1]); + match OneHotEncoder::fit(&m, params) { + Err(_) => { + assert!(true); + } + _ => assert!(false), + } + } +} diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 4a1abf3..32a0cfa 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,5 +1,5 @@ /// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents -pub mod categorical_encoder; +pub mod categorical; mod data_traits; /// Encode a series (column, array) of categorical variables as one-hot vectors pub mod series_encoder; From 6b5bed60928fb2fdd304eca03ff31c0612573164 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 9 Feb 2021 22:01:59 -0800 Subject: [PATCH 35/35] remove old --- src/preprocessing/categorical_encoder.rs | 329 ----------------------- 1 file changed, 329 deletions(-) delete mode 100644 
src/preprocessing/categorical_encoder.rs diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs deleted file mode 100644 index 18e569a..0000000 --- a/src/preprocessing/categorical_encoder.rs +++ /dev/null @@ -1,329 +0,0 @@ -//! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies -//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents -//! -//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html) -//! -//! ### Usage Example -//! ``` -//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; -//! let data = DenseMatrix::from_2d_array(&[ -//! &[1.5, 1.0, 1.5, 3.0], -//! &[1.5, 2.0, 1.5, 4.0], -//! &[1.5, 1.0, 1.5, 5.0], -//! &[1.5, 2.0, 1.5, 6.0], -//! ]); -//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); -//! // Infer number of categories from data and return a reusable encoder -//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); -//! // Transform categorical to one-hot encoded (can transform similar) -//! let oh_data = encoder.transform(&data).unwrap(); -//! // Produces the following: -//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] -//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] -//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] -//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] -//! 
``` -use std::iter; - -use crate::error::Failed; -use crate::linalg::Matrix; - -use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::CategoryMapper; - -/// OneHotEncoder Parameters -#[derive(Debug, Clone)] -pub struct OneHotEncoderParams { - /// Column number that contain categorical variable - pub col_idx_categorical: Option>, - /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables - infer_categorical: bool, -} - -impl OneHotEncoderParams { - /// Generate parameters from categorical variable column numbers - pub fn from_cat_idx(categorical_params: &[usize]) -> Self { - Self { - col_idx_categorical: Some(categorical_params.to_vec()), - infer_categorical: false, - } - } -} - -/// Calculate the offset to parameters to due introduction of one-hot encoding -fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec { - // This functions uses iterators and returns a vector. 
- // In case we get a huge amount of paramenters this might be a problem - // todo: Change this such that it will return an iterator - - let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); - - // Offset is constant between two categorical values, here we calculate the number of steps - // that remain constant - let repeats = cat_idx.scan(0, |a, v| { - let im = v + 1 - *a; - *a = v; - Some(im) - }); - - // Calculate the offset to parameter idx due to newly intorduced one-hot vectors - let offset_ = cat_sizes.iter().scan(0, |a, &v| { - *a = *a + v - 1; - Some(*a) - }); - let offset = (0..1).chain(offset_); - - let new_param_idxs: Vec = (0..num_params) - .zip( - repeats - .zip(offset) - .map(|(r, o)| iter::repeat(o).take(r)) - .flatten(), - ) - .map(|(idx, ofst)| idx + ofst) - .collect(); - new_param_idxs -} - -fn validate_col_is_categorical(data: &[T]) -> bool { - for v in data { - if !v.is_valid() { - return false; - } - } - true -} - -/// Encode Categorical variavbles of data matrix to one-hot -#[derive(Debug, Clone)] -pub struct OneHotEncoder { - category_mappers: Vec>, - col_idx_categorical: Vec, -} - -impl OneHotEncoder { - /// Create an encoder instance with categories infered from data matrix - pub fn fit(data: &M, params: OneHotEncoderParams) -> Result - where - T: Categorizable, - M: Matrix, - { - match (params.col_idx_categorical, params.infer_categorical) { - (None, false) => Err(Failed::fit( - "Must pass categorical series ids or infer flag", - )), - - (Some(_idxs), true) => Err(Failed::fit( - "Ambigous parameters, got both infer and categroy ids", - )), - - (Some(mut idxs), false) => { - // make sure categories have same order as data columns - idxs.sort_unstable(); - - let (nrows, _) = data.shape(); - - // col buffer to avoid allocations - let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - - let mut res: Vec> = Vec::with_capacity(idxs.len()); - - for &idx in &idxs { - data.copy_col_as_vec(idx, &mut col_buf); - if 
!validate_col_is_categorical(&col_buf) { - let msg = format!( - "Column {} of data matrix containts non categorizable (integer) values", - idx - ); - return Err(Failed::fit(&msg[..])); - } - let hashable_col = col_buf.iter().map(|v| v.to_category()); - res.push(CategoryMapper::fit_to_iter(hashable_col)); - } - - Ok(Self { - category_mappers: res, - col_idx_categorical: idxs, - }) - } - - (None, true) => { - todo!("Auto-Inference for Categorical Variables not yet implemented") - } - } - } - - /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform(&self, x: &M) -> Result - where - T: Categorizable, - M: Matrix, - { - let (nrows, p) = x.shape(); - let additional_params: Vec = self - .category_mappers - .iter() - .map(|enc| enc.num_categories()) - .collect(); - - // Eac category of size v adds v-1 params - let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); - - let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); - let mut res = M::zeros(nrows, expandws_p); - - for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { - let cidx = new_col_idx[old_cidx]; - let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); - let sencoder = &self.category_mappers[pidx]; - let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); - - for (row, oh_vec) in oh_series.enumerate() { - match oh_vec { - None => { - // Since we support T types, bad value in a series causes in to be invalid - let msg = format!("At least one value in column {} doesn't conform to category definition", old_cidx); - return Err(Failed::transform(&msg[..])); - } - Some(v) => { - // copy one hot vectors to their place in the data matrix; - for (col_ofst, &val) in v.iter().enumerate() { - res.set(row, cidx + col_ofst, val); - } - } - } - } - } - - // copy old data in x to their new location while skipping catergorical vars (already treated) - let mut skip_idx_iter = 
self.col_idx_categorical.iter(); - let mut cur_skip = skip_idx_iter.next(); - - for (old_p, &new_p) in new_col_idx.iter().enumerate() { - // if found treated varible, skip it - if let Some(&v) = cur_skip { - if v == old_p { - cur_skip = skip_idx_iter.next(); - continue; - } - } - - for r in 0..nrows { - let val = x.get(r, old_p); - res.set(r, new_p, val); - } - } - - Ok(res) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::preprocessing::series_encoder::CategoryMapper; - - #[test] - fn adjust_idxs() { - assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); - // [0,1,2] -> [0, 1, 1, 1, 2] - assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); - } - - fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { - let orig = DenseMatrix::from_2d_array(&[ - &[1.0, 1.5, 3.0], - &[2.0, 1.5, 4.0], - &[1.0, 1.5, 5.0], - &[2.0, 1.5, 6.0], - ]); - - let oh_enc = DenseMatrix::from_2d_array(&[ - &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], - &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], - &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], - &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], - ]); - - (orig, oh_enc) - } - - fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { - // Categorical first and last - let orig = DenseMatrix::from_2d_array(&[ - &[1.5, 1.0, 1.5, 3.0], - &[1.5, 2.0, 1.5, 4.0], - &[1.5, 1.0, 1.5, 5.0], - &[1.5, 2.0, 1.5, 6.0], - ]); - - let oh_enc = DenseMatrix::from_2d_array(&[ - &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], - &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], - &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], - &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], - ]); - - (orig, oh_enc) - } - - #[test] - fn hash_encode_f64_series() { - let series = vec![3.0, 1.0, 2.0, 1.0]; - let hashable_series: Vec = - series.iter().map(|v| v.to_category()).collect(); - let enc = CategoryMapper::from_positional_category_vec(hashable_series); - let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); - let orig_val: f64 = inv.unwrap().into(); - 
assert_eq!(orig_val, 2.0); - } - #[test] - fn test_fit() { - let (x, _) = build_fake_matrix(); - let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - assert_eq!(oh_enc.category_mappers.len(), 2); - - let num_cat: Vec = oh_enc - .category_mappers - .iter() - .map(|a| a.num_categories()) - .collect(); - assert_eq!(num_cat, vec![2, 4]); - } - - #[test] - fn matrix_transform_test() { - let (x, expected_x) = build_fake_matrix(); - let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); - - let (x, expected_x) = build_cat_first_and_last(); - let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); - } - - #[test] - fn fail_on_bad_category() { - let m = DenseMatrix::from_2d_array(&[ - &[1.0, 1.5, 3.0], - &[2.0, 1.5, 4.0], - &[1.0, 1.5, 5.0], - &[2.0, 1.5, 6.0], - ]); - - let params = OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEncoder::fit(&m, params) { - Err(_) => { - assert!(true); - } - _ => assert!(false), - } - } -}