diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 0268da6..8b63aaa 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -66,5 +66,8 @@ impl, D: NBDistribution> BaseNaiveBayes { + /// class labels known to the classifier + class_labels: Vec, + class_priors: Vec, + feature_prob: Vec>, +} + +impl> NBDistribution for MultinomialNBDistribution { + fn prior(&self, class_index: usize) -> T { + self.class_priors[class_index] + } + + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + likelihood += value * self.feature_prob[class_index][feature].ln(); + } + likelihood + } + + fn classes(&self) -> &Vec { + &self.class_labels + } +} + +/// `MultinomialNB` parameters. Use `Default::default()` for default values. +#[derive(Serialize, Deserialize, Debug)] +pub struct MultinomialNBParameters { + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub alpha: T, + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub priors: Option>, +} + +impl MultinomialNBParameters { + /// Create MultinomialNBParameters with specific paramaters. + pub fn new(alpha: T, priors: Option>) -> Self { + Self { alpha, priors } + } +} + +impl Default for MultinomialNBParameters { + fn default() -> Self { + Self { + alpha: T::one(), + priors: None, + } + } +} + +impl MultinomialNBDistribution { + /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data. + /// * `y` - vector with target values (classes) of length N. + /// * `priors` - Optional vector with prior probabilities of the classes. If not defined, + /// priors are adjusted according to the data. + /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter. + pub fn fit>( + x: &M, + y: &M::RowVector, + alpha: T, + priors: Option>, + ) -> Result { + let (n_samples, n_features) = x.shape(); + let y_samples = y.len(); + if y_samples != n_samples { + return Err(Failed::fit(&format!( + "Size of x should equal size of y; |x|=[{}], |y|=[{}]", + n_samples, y_samples + ))); + } + + if n_samples == 0 { + return Err(Failed::fit(&format!( + "Size of x and y should greater than 0; |x|=[{}]", + n_samples + ))); + } + if alpha < T::zero() { + return Err(Failed::fit(&format!( + "Alpha should be greater than 0; |alpha|=[{}]", + alpha + ))); + } + + let y = y.to_vec(); + + let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); + let mut class_count = vec![T::zero(); class_labels.len()]; + + for class_index in indices.iter() { + class_count[*class_index] += T::one(); + } + + let class_priors = if let Some(class_priors) = priors { + if class_priors.len() != class_labels.len() { + return Err(Failed::fit( + "Size of priors provided does not match the number of classes of the data.", + )); + } + class_priors + } else { + class_count + .iter() + .map(|&c| c / T::from(n_samples).unwrap()) + .collect() + }; + + let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + + for (row, class_index) in row_iter(x).zip(indices) { + for idx in 0..n_features { + feature_in_class_counter[class_index][idx] += row[idx]; + } + } + + let feature_prob = feature_in_class_counter + .iter() + .map(|feature_count| { + let n_c = feature_count.sum(); + feature_count + .iter() + .map(|&count| (count + alpha) / (n_c + alpha * T::from(n_features).unwrap())) + .collect() + }) + .collect(); + + Ok(Self { + class_labels, + class_priors, + feature_prob, + }) + } +} + +/// MultinomialNB implements the categorical naive Bayes algorithm for categorically distributed data. +#[derive(Serialize, Deserialize, Debug, PartialEq)] +pub struct MultinomialNB> { + inner: BaseNaiveBayes>, +} + +impl> MultinomialNB { + /// Fits MultinomialNB with given data + /// * `x` - training data of size NxM where N is the number of samples and M is the number of + /// features. + /// * `y` - vector with target values (classes) of length N. + /// * `parameters` - additional parameters like class priors, alpha for smoothing and + /// binarizing threshold. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: MultinomialNBParameters, + ) -> Result { + let distribution = + MultinomialNBDistribution::fit(x, y, parameters.alpha, parameters.priors)?; + let inner = BaseNaiveBayes::fit(distribution)?; + Ok(Self { inner }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. + pub fn predict(&self, x: &M) -> Result { + self.inner.predict(x) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn run_multinomial_naive_bayes() { + // Tests that MultinomialNB when alpha=1.0 gives the same values as + // those given for the toy example in Manning, Raghavan, and + // Schuetze's "Introduction to Information Retrieval" book: + // https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html + + // Training data points are: + // Chinese Beijing Chinese (class: China) + // Chinese Chinese Shanghai (class: China) + // Chinese Macao (class: China) + // Tokyo Japan Chinese (class: Japan) + let x = DenseMatrix::::from_2d_array(&[ + &[1., 2., 0., 0., 0., 0.], + &[0., 2., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + + assert_eq!(mnb.inner.distribution.class_priors, &[0.75, 0.25]); + assert_eq!( + mnb.inner.distribution.feature_prob, + &[ + &[1. / 7., 3. / 7., 1. / 14., 1. / 7., 1. / 7., 1. / 14.], + &[1. / 9., 2. / 9.0, 2. / 9.0, 1. / 9.0, 1. / 9.0, 2. / 9.0] + ] + ); + + // Testing data point is: + // Chinese Chinese Chinese Tokyo Japan + let x_test = DenseMatrix::::from_2d_array(&[&[0., 3., 1., 0., 0., 1.]]); + let y_hat = mnb.predict(&x_test).unwrap(); + + assert_eq!(y_hat, &[0.]); + } + + #[test] + fn multinomial_nb_scikit_parity() { + let x = DenseMatrix::::from_2d_array(&[ + &[2., 4., 0., 0., 2., 1., 2., 4., 2., 0.], + &[3., 4., 0., 2., 1., 0., 1., 4., 0., 3.], + &[1., 4., 2., 4., 1., 0., 1., 2., 3., 2.], + &[0., 3., 3., 4., 1., 0., 3., 1., 1., 1.], + &[0., 2., 1., 4., 3., 4., 1., 2., 3., 1.], + &[3., 2., 4., 1., 3., 0., 2., 4., 0., 2.], + &[3., 1., 3., 0., 2., 0., 4., 4., 3., 4.], + &[2., 2., 2., 0., 1., 1., 2., 1., 0., 1.], + &[3., 3., 2., 2., 0., 2., 3., 2., 2., 3.], + &[4., 3., 4., 4., 4., 2., 2., 0., 1., 4.], + &[3., 4., 2., 2., 1., 4., 4., 4., 1., 3.], + &[3., 0., 1., 4., 4., 0., 0., 3., 2., 4.], + &[2., 0., 3., 3., 1., 2., 0., 2., 4., 1.], + &[2., 4., 0., 4., 2., 4., 1., 3., 1., 4.], + &[0., 2., 2., 3., 4., 0., 4., 4., 4., 4.], + ]); + let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.]; + let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + + let y_hat = nb.predict(&x).unwrap(); + + assert!(nb + .inner + .distribution + .class_priors + .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); + assert!(nb.inner.distribution.feature_prob[1].approximate_eq( + &vec!(0.07, 0.12, 0.07, 0.15, 0.07, 0.09, 0.08, 0.10, 0.08, 0.11), + 1e-1 + )); + assert!(y_hat.approximate_eq( + &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 2.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 2.0), + 1e-5 + )); + } + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[1., 1., 0., 0., 0., 0.], + &[0., 1., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + + let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + let deserialized_mnb: MultinomialNB> = + serde_json::from_str(&serde_json::to_string(&mnb).unwrap()).unwrap(); + + assert_eq!(mnb, deserialized_mnb); + } +}