From 3d4d5f64f6ebcd9adf037442778639a7b6cbd00c Mon Sep 17 00:00:00 2001
From: morenol
Date: Mon, 9 Nov 2020 15:54:27 -0400
Subject: [PATCH] feat: add Naive Bayes and CategoricalNB (#15)

* feat: Implement Naive Bayes classifier

* Implement CategoricalNB
---
 src/lib.rs                     |   2 +
 src/naive_bayes/categorical.rs | 263 +++++++++++++++++++++++++++++++++
 src/naive_bayes/mod.rs         |  66 +++++++++
 3 files changed, 331 insertions(+)
 create mode 100644 src/naive_bayes/categorical.rs
 create mode 100644 src/naive_bayes/mod.rs

diff --git a/src/lib.rs b/src/lib.rs
index 083b95f..966d5ed 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -85,6 +85,8 @@ pub mod math;
 /// Functions for assessing prediction error.
 pub mod metrics;
 pub mod model_selection;
+/// Supervised learning algorithms based on applying the Bayes theorem with the independence assumptions between predictors
+pub mod naive_bayes;
 /// Supervised neighbors-based learning methods
 pub mod neighbors;
 pub(crate) mod optimization;
diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs
new file mode 100644
index 0000000..f948aeb
--- /dev/null
+++ b/src/naive_bayes/categorical.rs
@@ -0,0 +1,263 @@
+use crate::error::Failed;
+use crate::linalg::BaseVector;
+use crate::linalg::Matrix;
+use crate::math::num::RealNumber;
+use crate::naive_bayes::{BaseNaiveBayes, NBDistribution};
+use serde::{Deserialize, Serialize};
+
+/// Fitted categorical distribution backing the `CategoricalNB` classifier.
+struct CategoricalNBDistribution<T: RealNumber> {
+    /// Distinct class labels, in ascending order.
+    class_labels: Vec<T>,
+    /// Prior of each class: its sample count divided by the number of samples.
+    class_probabilities: Vec<T>,
+    /// Smoothed likelihoods, indexed as `coef[class][feature][category]`.
+    coef: Vec<Vec<Vec<T>>>,
+    /// Categories observed for each feature, as returned by `unique()` on its column.
+    feature_categories: Vec<Vec<T>>,
+}
+
+impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for CategoricalNBDistribution<T> {
+    /// Prior of the class at `class_index`; zero when the index is out of range.
+    fn prior(&self, class_index: usize) -> T {
+        if class_index >= self.class_labels.len() {
+            T::zero()
+        } else {
+            self.class_probabilities[class_index]
+        }
+    }
+
+    /// Product of the per-feature likelihoods of sample `j` under the class at `class_index`.
+    /// Returns zero when the index is out of range or when a feature holds a category that
+    /// was not observed during fitting.
+    fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T {
+        if class_index >= self.class_labels.len() {
+            return T::zero();
+        }
+        let mut prob = T::one();
+        for feature in 0..j.len() {
+            let value = j.get(feature);
+            match self.feature_categories[feature]
+                .iter()
+                .position(|&t| t == value)
+            {
+                Some(category) => prob *= self.coef[class_index][feature][category],
+                None => return T::zero(),
+            }
+        }
+        prob
+    }
+
+    /// Class labels known to the distribution.
+    fn classes(&self) -> &Vec<T> {
+        &self.class_labels
+    }
+}
+
+impl<T: RealNumber> CategoricalNBDistribution<T> {
+    /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features.
+    /// * `x` - training data.
+    /// * `y` - vector with target values (classes) of length N.
+    /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
+    ///
+    /// Returns an error when `alpha` is negative, when `x` and `y` disagree on the number of
+    /// samples, or when there are no samples.
+    pub fn fit<M: Matrix<T>>(x: &M, y: &M::RowVector, alpha: T) -> Result<Self, Failed> {
+        if alpha < T::zero() {
+            return Err(Failed::fit(&format!(
+                "alpha should be >= 0, alpha=[{}]",
+                alpha
+            )));
+        }
+
+        let (n_samples, n_features) = x.shape();
+        let y_samples = y.len();
+        if y_samples != n_samples {
+            return Err(Failed::fit(&format!(
+                "Size of x should equal size of y; |x|=[{}], |y|=[{}]",
+                n_samples, y_samples
+            )));
+        }
+
+        if n_samples == 0 {
+            return Err(Failed::fit(&format!(
+                "Size of x and y should greater than 0; |x|=[{}]",
+                n_samples
+            )));
+        }
+
+        // Sort the labels so that equal labels are adjacent, then run-length encode them:
+        // `class_labels[i]` occurs `classes_count[i]` times in `y`.
+        let mut y_sorted = y.to_vec();
+        y_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        let mut class_labels = Vec::with_capacity(y.len());
+        class_labels.push(y_sorted[0]);
+        let mut classes_count = Vec::with_capacity(y.len());
+        let mut current_count = T::one();
+        for idx in 1..y_samples {
+            if y_sorted[idx] == y_sorted[idx - 1] {
+                current_count += T::one();
+            } else {
+                classes_count.push(current_count);
+                class_labels.push(y_sorted[idx]);
+                current_count = T::one();
+            }
+        }
+        // Close the final run. This must happen exactly once, after the loop, so that
+        // `classes_count` stays index-aligned with `class_labels`.
+        classes_count.push(current_count);
+
+        let mut feature_categories: Vec<Vec<T>> = Vec::with_capacity(n_features);
+        for feature in 0..n_features {
+            feature_categories.push(x.get_col_as_vec(feature).unique());
+        }
+
+        let mut coef: Vec<Vec<Vec<T>>> = Vec::with_capacity(class_labels.len());
+        for (label, label_count) in class_labels.iter().zip(classes_count.iter()) {
+            let mut coef_i: Vec<Vec<T>> = Vec::with_capacity(n_features);
+            for (feature_index, feature_options) in feature_categories.iter().enumerate() {
+                // Values of this feature among the samples labelled `label`.
+                let col = x
+                    .get_col_as_vec(feature_index)
+                    .iter()
+                    .enumerate()
+                    .filter(|(i, _j)| y.get(*i) == *label)
+                    .map(|(_, j)| *j)
+                    .collect::<Vec<T>>();
+                // (count of category in class + alpha) / (class size + n_categories * alpha)
+                let denominator = T::from(*label_count).unwrap()
+                    + T::from(feature_options.len()).unwrap() * alpha;
+                let coef_i_j = feature_options
+                    .iter()
+                    .map(|k| {
+                        let feat_k_count = col.iter().filter(|&v| v == k).count();
+                        (T::from(feat_k_count).unwrap() + alpha) / denominator
+                    })
+                    .collect::<Vec<T>>();
+                coef_i.push(coef_i_j);
+            }
+            coef.push(coef_i);
+        }
+
+        let class_probabilities = classes_count
+            .into_iter()
+            .map(|count| count / T::from(n_samples).unwrap())
+            .collect::<Vec<T>>();
+
+        Ok(Self {
+            class_labels,
+            class_probabilities,
+            coef,
+            feature_categories,
+        })
+    }
+}
+
+/// `CategoricalNB` parameters. Use `Default::default()` for default values.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct CategoricalNBParameters<T: RealNumber> {
+    /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
+    pub alpha: T,
+}
+
+impl<T: RealNumber> CategoricalNBParameters<T> {
+    /// Create CategoricalNBParameters with specific parameters.
+    /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter; must be >= 0.
+    ///
+    /// Zero is accepted (no smoothing), matching the check performed when fitting.
+    pub fn new(alpha: T) -> Result<Self, Failed> {
+        if alpha >= T::zero() {
+            Ok(Self { alpha })
+        } else {
+            Err(Failed::fit(&format!(
+                "alpha should be >= 0, alpha=[{}]",
+                alpha
+            )))
+        }
+    }
+}
+
+impl<T: RealNumber> Default for CategoricalNBParameters<T> {
+    /// Defaults to `alpha = 1`.
+    fn default() -> Self {
+        Self { alpha: T::one() }
+    }
+}
+
+/// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data.
+pub struct CategoricalNB<T: RealNumber, M: Matrix<T>> {
+    inner: BaseNaiveBayes<T, M, CategoricalNBDistribution<T>>,
+}
+
+impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
+    /// Fits CategoricalNB with given data
+    /// * `x` - training data of size NxM where N is the number of samples and M is the number of
+    /// features.
+    /// * `y` - vector with target values (classes) of length N.
+    /// * `parameters` - additional parameters like alpha for smoothing
+    pub fn fit(
+        x: &M,
+        y: &M::RowVector,
+        parameters: CategoricalNBParameters<T>,
+    ) -> Result<Self, Failed> {
+        let alpha = parameters.alpha;
+        let distribution = CategoricalNBDistribution::fit(x, y, alpha)?;
+        let inner = BaseNaiveBayes::fit(distribution)?;
+        Ok(Self { inner })
+    }
+
+    /// Estimates the class labels for the provided data.
+    /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
+    /// Returns a vector of size N with class estimates.
+    pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
+        self.inner.predict(x)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::linalg::naive::dense_matrix::DenseMatrix;
+
+    #[test]
+    fn run_base_naive_bayes() {
+        let x = DenseMatrix::from_2d_array(&[
+            &[0., 2., 1., 0.],
+            &[0., 2., 1., 1.],
+            &[1., 2., 1., 0.],
+            &[2., 1., 1., 0.],
+            &[2., 0., 0., 0.],
+            &[2., 0., 0., 1.],
+            &[1., 0., 0., 1.],
+            &[0., 1., 1., 0.],
+            &[0., 0., 0., 0.],
+            &[2., 1., 0., 0.],
+            &[0., 1., 0., 1.],
+            &[1., 1., 1., 1.],
+            &[1., 2., 0., 0.],
+            &[2., 1., 1., 1.],
+        ]);
+        let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.];
+
+        let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
+        let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]);
+        let y_hat = cnb.predict(&x_test).unwrap();
+        assert_eq!(y_hat, vec![0., 1.]);
+    }
+
+    #[test]
+    fn class_priors_match_label_frequencies() {
+        let x = DenseMatrix::from_2d_array(&[&[0.], &[0.], &[1.], &[1.], &[1.]]);
+        let y: Vec<f64> = vec![0., 0., 0., 1., 1.];
+
+        let distribution = CategoricalNBDistribution::fit(&x, &y, 1.0).unwrap();
+        assert_eq!(distribution.class_labels, vec![0., 1.]);
+        assert_eq!(distribution.class_probabilities, vec![0.6, 0.4]);
+    }
+
+    #[test]
+    fn parameters_accept_zero_alpha() {
+        assert!(CategoricalNBParameters::new(0f64).is_ok());
+        assert!(CategoricalNBParameters::new(-1f64).is_err());
+    }
+}
diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs
new file mode 100644
index 0000000..e9ab792
--- /dev/null
+++ b/src/naive_bayes/mod.rs
@@ -0,0 +1,66 @@
+use crate::error::Failed;
+use crate::linalg::BaseVector;
+use crate::linalg::Matrix;
+use crate::math::num::RealNumber;
+use std::marker::PhantomData;
+
+/// Distribution used in the Naive Bayes classifier.
+pub(crate) trait NBDistribution<T: RealNumber, M: Matrix<T>> {
+    /// Prior of class at the given index.
+    fn prior(&self, class_index: usize) -> T;
+
+    /// Conditional probability of sample j given class in the specified index.
+    fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T;
+
+    /// Possible classes of the distribution.
+    fn classes(&self) -> &Vec<T>;
+}
+
+/// Base struct for the Naive Bayes classifier.
+pub(crate) struct BaseNaiveBayes<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> {
+    distribution: D,
+    _phantom_t: PhantomData<T>,
+    _phantom_m: PhantomData<M>,
+}
+
+impl<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> BaseNaiveBayes<T, M, D> {
+    /// Fits NB classifier to a given NBdistribution.
+    /// * `distribution` - NBDistribution of the training data
+    pub fn fit(distribution: D) -> Result<Self, Failed> {
+        Ok(Self {
+            distribution,
+            _phantom_t: PhantomData,
+            _phantom_m: PhantomData,
+        })
+    }
+
+    /// Estimates the class labels for the provided data.
+    /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
+    /// Returns a vector of size N with class estimates.
+    ///
+    /// Each row is assigned the class that maximises `conditional_probability * prior`.
+    pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
+        let y_classes = self.distribution.classes();
+        let (rows, _) = x.shape();
+        let mut y_hat = M::RowVector::zeros(rows);
+        for row_index in 0..rows {
+            let row = x.get_row(row_index);
+            let (prediction, _probability) = y_classes
+                .iter()
+                .enumerate()
+                .map(|(class_index, class)| {
+                    (
+                        class,
+                        self.distribution.conditional_probability(class_index, &row)
+                            * self.distribution.prior(class_index),
+                    )
+                })
+                .max_by(|(_, p1), (_, p2)| p1.partial_cmp(p2).unwrap())
+                .unwrap();
+            y_hat.set(row_index, *prediction);
+        }
+        Ok(y_hat)
+    }
+}
+mod categorical;
+pub use categorical::{CategoricalNB, CategoricalNBParameters};