diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs
index 6a7d0b4..286a4a5 100644
--- a/src/naive_bayes/bernoulli.rs
+++ b/src/naive_bayes/bernoulli.rs
@@ -47,12 +47,44 @@ use serde::{Deserialize, Serialize};
 
 /// Naive Bayes classifier for Bernoulli features
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
-#[derive(Debug, PartialEq)]
+#[derive(Debug)]
 struct BernoulliNBDistribution<T: RealNumber> {
     /// class labels known to the classifier
     class_labels: Vec<T>,
+    /// number of training samples observed in each class
+    class_count: Vec<usize>,
+    /// probability of each class
     class_priors: Vec<T>,
-    feature_prob: Vec<Vec<T>>,
+    /// Number of samples encountered for each (class, feature)
+    feature_count: Vec<Vec<usize>>,
+    /// log probability of features per class
+    feature_log_prob: Vec<Vec<T>>,
+    /// Number of features of each sample
+    n_features: usize,
+}
+
+impl<T: RealNumber> PartialEq for BernoulliNBDistribution<T> {
+    fn eq(&self, other: &Self) -> bool {
+        if self.class_labels == other.class_labels
+            && self.class_count == other.class_count
+            && self.class_priors == other.class_priors
+            && self.feature_count == other.feature_count
+            && self.n_features == other.n_features
+        {
+            for (a, b) in self
+                .feature_log_prob
+                .iter()
+                .zip(other.feature_log_prob.iter())
+            {
+                if !a.approximate_eq(b, T::epsilon()) {
+                    return false;
+                }
+            }
+            true
+        } else {
+            false
+        }
+    }
 }
 
 impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for BernoulliNBDistribution<T> {
@@ -65,9 +97,9 @@ impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for BernoulliNBDistribution<T> {
         for feature in 0..j.len() {
             let value = j.get(feature);
             if value == T::one() {
-                likelihood += self.feature_prob[class_index][feature].ln();
+                likelihood += self.feature_log_prob[class_index][feature];
             } else {
-                likelihood += (T::one() - self.feature_prob[class_index][feature]).ln();
+                likelihood += (T::one() - self.feature_log_prob[class_index][feature].exp()).ln();
             }
         }
         likelihood
@@ -157,10 +189,10 @@ impl<T: RealNumber> BernoulliNBDistribution<T> {
         let y = y.to_vec();
 
         let (class_labels, indices) = <Vec<T> as RealNumberVector<T>>::unique_with_indices(&y);
-        let mut class_count = vec![T::zero(); class_labels.len()];
+        let mut class_count = vec![0_usize; class_labels.len()];
 
         for class_index in indices.iter() {
-            class_count[*class_index] += T::one();
+            class_count[*class_index] += 1;
         }
 
         let class_priors = if let Some(class_priors) = priors {
@@ -173,25 +205,35 @@ impl<T: RealNumber> BernoulliNBDistribution<T> {
         } else {
             class_count
                 .iter()
-                .map(|&c| c / T::from(n_samples).unwrap())
+                .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap())
                 .collect()
         };
 
-        let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()];
+        let mut feature_in_class_counter = vec![vec![0_usize; n_features]; class_labels.len()];
 
         for (row, class_index) in row_iter(x).zip(indices) {
             for (idx, row_i) in row.iter().enumerate().take(n_features) {
-                feature_in_class_counter[class_index][idx] += *row_i;
+                feature_in_class_counter[class_index][idx] +=
+                    row_i.to_usize().ok_or_else(|| {
+                        Failed::fit(&format!(
+                            "Elements of the matrix should be 1.0 or 0.0 |found|=[{}]",
+                            row_i
+                        ))
+                    })?;
             }
         }
 
-        let feature_prob = feature_in_class_counter
+        let feature_log_prob = feature_in_class_counter
             .iter()
             .enumerate()
             .map(|(class_index, feature_count)| {
                 feature_count
                     .iter()
-                    .map(|&count| (count + alpha) / (class_count[class_index] + alpha * T::two()))
+                    .map(|&count| {
+                        ((T::from(count).unwrap() + alpha)
+                            / (T::from(class_count[class_index]).unwrap() + alpha * T::two()))
+                        .ln()
+                    })
                     .collect()
             })
             .collect();
@@ -199,7 +241,10 @@ impl<T: RealNumber> BernoulliNBDistribution<T> {
 
         Ok(Self {
             class_labels,
             class_priors,
-            feature_prob,
+            class_count,
+            feature_count: feature_in_class_counter,
+            feature_log_prob,
+            n_features,
         })
     }
 }
@@ -266,6 +311,34 @@ impl<T: RealNumber, M: Matrix<T>> BernoulliNB<T, M> {
     pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
         self.inner.predict(x)
     }
+
+    /// Class labels known to the classifier.
+    /// Returns a vector of size n_classes.
+    pub fn classes(&self) -> &Vec<T> {
+        &self.inner.distribution.class_labels
+    }
+
+    /// Number of training samples observed in each class.
+    /// Returns a vector of size n_classes.
+    pub fn class_count(&self) -> &Vec<usize> {
+        &self.inner.distribution.class_count
+    }
+
+    /// Number of features of each sample
+    pub fn n_features(&self) -> usize {
+        self.inner.distribution.n_features
+    }
+
+    /// Number of samples encountered for each (class, feature)
+    /// Returns a 2d vector of shape (n_classes, n_features)
+    pub fn feature_count(&self) -> &Vec<Vec<usize>> {
+        &self.inner.distribution.feature_count
+    }
+
+    /// Empirical log probability of features given a class
+    pub fn feature_log_prob(&self) -> &Vec<Vec<T>> {
+        &self.inner.distribution.feature_log_prob
+    }
 }
 
 #[cfg(test)]
@@ -296,10 +369,24 @@ mod tests {
 
         assert_eq!(bnb.inner.distribution.class_priors, &[0.75, 0.25]);
         assert_eq!(
-            bnb.inner.distribution.feature_prob,
+            bnb.feature_log_prob(),
             &[
-                &[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
-                &[1. / 3.0, 2. / 3.0, 2. / 3.0, 1. / 3.0, 1. / 3.0, 2. / 3.0]
+                &[
+                    -0.916290731874155,
+                    -0.2231435513142097,
+                    -1.6094379124341003,
+                    -0.916290731874155,
+                    -0.916290731874155,
+                    -1.6094379124341003
+                ],
+                &[
+                    -1.0986122886681098,
+                    -0.40546510810816444,
+                    -0.40546510810816444,
+                    -1.0986122886681098,
+                    -1.0986122886681098,
+                    -0.40546510810816444
+                ]
             ]
         );
 
@@ -335,13 +422,36 @@ mod tests {
 
         let y_hat = bnb.predict(&x).unwrap();
 
+        assert_eq!(bnb.classes(), &[0., 1., 2.]);
+        assert_eq!(bnb.class_count(), &[7, 3, 5]);
+        assert_eq!(bnb.n_features(), 10);
+        assert_eq!(
+            bnb.feature_count(),
+            &[
+                &[5, 6, 6, 7, 6, 4, 6, 7, 7, 7],
+                &[3, 3, 3, 1, 3, 2, 3, 2, 2, 3],
+                &[4, 4, 3, 4, 5, 2, 4, 5, 3, 4]
+            ]
+        );
+
         assert!(bnb
             .inner
             .distribution
             .class_priors
             .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2));
-        assert!(bnb.inner.distribution.feature_prob[1].approximate_eq(
-            &vec!(0.8, 0.8, 0.8, 0.4, 0.8, 0.6, 0.8, 0.6, 0.6, 0.8),
+        assert!(bnb.feature_log_prob()[1].approximate_eq(
+            &vec![
+                -0.22314355,
+                -0.22314355,
+                -0.22314355,
+                -0.91629073,
+                -0.22314355,
+                -0.51082562,
+                -0.22314355,
+                -0.51082562,
+                -0.51082562,
+                -0.22314355
+            ],
             1e-1
         ));
         assert!(y_hat.approximate_eq(
diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs
index 2161528..e308a01 100644
--- a/src/naive_bayes/categorical.rs
+++ b/src/naive_bayes/categorical.rs
@@ -43,14 +43,31 @@ use serde::{Deserialize, Serialize};
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Debug)]
 struct CategoricalNBDistribution<T: RealNumber> {
+    /// number of training samples observed in each class
+    class_count: Vec<usize>,
+    /// class labels known to the classifier
     class_labels: Vec<T>,
+    /// probability of each class
     class_priors: Vec<T>,
     coefficients: Vec<Vec<Vec<T>>>,
+    /// Number of features of each sample
+    n_features: usize,
+    /// Number of categories for each feature
+    n_categories: Vec<usize>,
+    /// Holds arrays of shape (n_classes, n_categories of respective feature)
+    /// for each feature. Each array provides the number of samples
+    /// encountered for each class and category of the specific feature.
+    category_count: Vec<Vec<Vec<usize>>>,
 }
 
 impl<T: RealNumber> PartialEq for CategoricalNBDistribution<T> {
     fn eq(&self, other: &Self) -> bool {
-        if self.class_labels == other.class_labels && self.class_priors == other.class_priors {
+        if self.class_labels == other.class_labels
+            && self.class_priors == other.class_priors
+            && self.n_features == other.n_features
+            && self.n_categories == other.n_categories
+            && self.class_count == other.class_count
+        {
             if self.coefficients.len() != other.coefficients.len() {
                 return false;
             }
@@ -90,8 +107,8 @@ impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for CategoricalNBDistribution<T> {
         let mut likelihood = T::zero();
         for feature in 0..j.len() {
             let value = j.get(feature).floor().to_usize().unwrap();
-            if self.coefficients[class_index][feature].len() > value {
-                likelihood += self.coefficients[class_index][feature][value];
+            if self.coefficients[feature][class_index].len() > value {
+                likelihood += self.coefficients[feature][class_index][value];
             } else {
                 return T::zero();
             }
@@ -149,12 +166,12 @@ impl<T: RealNumber> CategoricalNBDistribution<T> {
         let class_labels: Vec<T> = (0..*y_max + 1)
             .map(|label| T::from(label).unwrap())
             .collect();
-        let mut classes_count: Vec<T> = vec![T::zero(); class_labels.len()];
+        let mut class_count = vec![0_usize; class_labels.len()];
 
         for elem in y.iter() {
-            classes_count[*elem] += T::one();
+            class_count[*elem] += 1;
         }
-        let mut feature_categories: Vec<Vec<T>> = Vec::with_capacity(n_features);
+        let mut n_categories: Vec<usize> = Vec::with_capacity(n_features);
         for feature in 0..n_features {
             let feature_max = x
                 .get_col_as_vec(feature)
@@ -167,18 +184,15 @@ impl<T: RealNumber> CategoricalNBDistribution<T> {
                     feature
                 ))
             })?;
-            let feature_types = (0..feature_max + 1)
-                .map(|feat| T::from(feat).unwrap())
-                .collect();
-            feature_categories.push(feature_types);
+            n_categories.push(feature_max + 1);
         }
 
         let mut coefficients: Vec<Vec<Vec<T>>> = Vec::with_capacity(class_labels.len());
-        for (label, label_count) in class_labels.iter().zip(classes_count.iter()) {
+        let mut category_count: Vec<Vec<Vec<usize>>> = Vec::with_capacity(class_labels.len());
+        for (feature_index, &n_categories_i) in n_categories.iter().enumerate().take(n_features) {
             let mut coef_i: Vec<Vec<T>> = Vec::with_capacity(n_features);
-            for (feature_index, feature_options) in
-                feature_categories.iter().enumerate().take(n_features)
-            {
+            let mut category_count_i: Vec<Vec<usize>> = Vec::with_capacity(n_features);
+            for (label, &label_count) in class_labels.iter().zip(class_count.iter()) {
                 let col = x
                     .get_col_as_vec(feature_index)
                     .iter()
@@ -186,33 +200,41 @@ impl<T: RealNumber> CategoricalNBDistribution<T> {
                     .filter(|(i, _j)| T::from(y[*i]).unwrap() == *label)
                     .map(|(_, j)| *j)
                     .collect::<Vec<T>>();
-                let mut feat_count: Vec<T> = vec![T::zero(); feature_options.len()];
+                let mut feat_count: Vec<usize> = vec![0_usize; n_categories_i];
 
                 for row in col.iter() {
                     let index = row.floor().to_usize().unwrap();
-                    feat_count[index] += T::one();
+                    feat_count[index] += 1;
                 }
+
                 let coef_i_j = feat_count
                     .iter()
                     .map(|c| {
-                        ((*c + alpha)
-                            / (*label_count + T::from(feature_options.len()).unwrap() * alpha))
+                        ((T::from(*c).unwrap() + alpha)
+                            / (T::from(label_count).unwrap()
+                                + T::from(n_categories_i).unwrap() * alpha))
                             .ln()
                     })
                     .collect::<Vec<T>>();
+                category_count_i.push(feat_count);
                 coef_i.push(coef_i_j);
             }
+            category_count.push(category_count_i);
             coefficients.push(coef_i);
         }
-        let class_priors = classes_count
-            .into_iter()
-            .map(|count| count / T::from(n_samples).unwrap())
+        let class_priors = class_count
+            .iter()
+            .map(|&count| T::from(count).unwrap() / T::from(n_samples).unwrap())
            .collect::<Vec<T>>();
 
         Ok(Self {
+            class_count,
             class_labels,
             class_priors,
             coefficients,
+            n_categories,
+            n_features,
+            category_count,
         })
     }
 }
@@ -287,6 +309,41 @@ impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
     pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
         self.inner.predict(x)
     }
+
+    /// Class labels known to the classifier.
+    /// Returns a vector of size n_classes.
+    pub fn classes(&self) -> &Vec<T> {
+        &self.inner.distribution.class_labels
+    }
+
+    /// Number of training samples observed in each class.
+    /// Returns a vector of size n_classes.
+    pub fn class_count(&self) -> &Vec<usize> {
+        &self.inner.distribution.class_count
+    }
+
+    /// Number of features of each sample
+    pub fn n_features(&self) -> usize {
+        self.inner.distribution.n_features
+    }
+
+    /// Number of categories for each feature
+    pub fn n_categories(&self) -> &Vec<usize> {
+        &self.inner.distribution.n_categories
+    }
+
+    /// Holds arrays of shape (n_classes, n_categories of respective feature)
+    /// for each feature. Each array provides the number of samples
+    /// encountered for each class and category of the specific feature.
+    pub fn category_count(&self) -> &Vec<Vec<Vec<usize>>> {
+        &self.inner.distribution.category_count
+    }
+    /// Holds arrays of shape (n_classes, n_categories of respective feature)
+    /// for each feature. Each array provides the empirical log probability
+    /// of categories given the respective feature and class, ``P(x_i|y)``.
+    pub fn feature_log_prob(&self) -> &Vec<Vec<Vec<T>>> {
+        &self.inner.distribution.coefficients
+    }
 }
 
 #[cfg(test)]
@@ -315,6 +372,60 @@ mod tests {
         let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.];
 
         let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
+
+        // checking parity with scikit-learn
+        assert_eq!(cnb.classes(), &[0., 1.]);
+        assert_eq!(cnb.class_count(), &[5, 9]);
+        assert_eq!(cnb.n_features(), 4);
+        assert_eq!(cnb.n_categories(), &[3, 3, 2, 2]);
+        assert_eq!(
+            cnb.category_count(),
+            &vec![
+                vec![vec![3, 0, 2], vec![2, 4, 3]],
+                vec![vec![1, 2, 2], vec![3, 4, 2]],
+                vec![vec![1, 4], vec![6, 3]],
+                vec![vec![2, 3], vec![6, 3]]
+            ]
+        );
+
+        assert_eq!(
+            cnb.feature_log_prob(),
+            &vec![
+                vec![
+                    vec![
+                        -0.6931471805599453,
+                        -2.0794415416798357,
+                        -0.9808292530117262
+                    ],
+                    vec![
+                        -1.3862943611198906,
+                        -0.8754687373538999,
+                        -1.0986122886681098
+                    ]
+                ],
+                vec![
+                    vec![
+                        -1.3862943611198906,
+                        -0.9808292530117262,
+                        -0.9808292530117262
+                    ],
+                    vec![
+                        -1.0986122886681098,
+                        -0.8754687373538999,
+                        -1.3862943611198906
+                    ]
+                ],
+                vec![
+                    vec![-1.252762968495368, -0.3364722366212129],
+                    vec![-0.45198512374305727, -1.0116009116784799]
+                ],
+                vec![
+                    vec![-0.8472978603872037, -0.5596157879354228],
+                    vec![-0.45198512374305727, -1.0116009116784799]
+                ]
+            ]
+        );
+
         let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]);
         let y_hat = cnb.predict(&x_test).unwrap();
         assert_eq!(y_hat, vec![0., 1.]);
diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs
index 28c4785..00c7962 100644
--- a/src/naive_bayes/gaussian.rs
+++ b/src/naive_bayes/gaussian.rs
@@ -39,10 +39,12 @@ use serde::{Deserialize, Serialize};
 struct GaussianNBDistribution<T: RealNumber> {
     /// class labels known to the classifier
     class_labels: Vec<T>,
+    /// number of training samples observed in each class
+    class_count: Vec<usize>,
     /// probability of each class.
     class_priors: Vec<T>,
     /// variance of each feature per class
-    sigma: Vec<Vec<T>>,
+    var: Vec<Vec<T>>,
     /// mean of each feature per class
     theta: Vec<Vec<T>>,
 }
@@ -57,18 +59,14 @@ impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for GaussianNBDistribution<T> {
     }
 
     fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T {
-        if class_index < self.class_labels.len() {
-            let mut likelihood = T::zero();
-            for feature in 0..j.len() {
-                let value = j.get(feature);
-                let mean = self.theta[class_index][feature];
-                let variance = self.sigma[class_index][feature];
-                likelihood += self.calculate_log_probability(value, mean, variance);
-            }
-            likelihood
-        } else {
-            T::zero()
-        }
+        let mut likelihood = T::zero();
+        for feature in 0..j.len() {
+            let value = j.get(feature);
+            let mean = self.theta[class_index][feature];
+            let variance = self.var[class_index][feature];
+            likelihood += self.calculate_log_probability(value, mean, variance);
+        }
+        likelihood
     }
 
     fn classes(&self) -> &Vec<T> {
@@ -121,12 +119,12 @@ impl<T: RealNumber> GaussianNBDistribution<T> {
         let y = y.to_vec();
 
         let (class_labels, indices) = <Vec<T> as RealNumberVector<T>>::unique_with_indices(&y);
-        let mut class_count = vec![T::zero(); class_labels.len()];
+        let mut class_count = vec![0_usize; class_labels.len()];
 
         let mut subdataset: Vec<Vec<M::RowVector>> = vec![vec![]; class_labels.len()];
 
         for (row, class_index) in row_iter(x).zip(indices.iter()) {
-            class_count[*class_index] += T::one();
+            class_count[*class_index] += 1;
             subdataset[*class_index].push(row);
         }
@@ -139,8 +137,8 @@ impl<T: RealNumber> GaussianNBDistribution<T> {
             class_priors
         } else {
             class_count
-                .into_iter()
-                .map(|c| c / T::from(n_samples).unwrap())
+                .iter()
+                .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap())
                 .collect()
         };
 
@@ -157,15 +155,16 @@ impl<T: RealNumber> GaussianNBDistribution<T> {
             })
             .collect();
 
-        let (sigma, theta): (Vec<Vec<T>>, Vec<Vec<T>>) = subdataset
+        let (var, theta): (Vec<Vec<T>>, Vec<Vec<T>>) = subdataset
             .iter()
             .map(|data| (data.var(0), data.mean(0)))
             .unzip();
 
         Ok(Self {
             class_labels,
+            class_count,
             class_priors,
-            sigma,
+            var,
             theta,
         })
     }
@@ -223,6 +222,36 @@ impl<T: RealNumber, M: Matrix<T>> GaussianNB<T, M> {
     pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
         self.inner.predict(x)
     }
+
+    /// Class labels known to the classifier.
+    /// Returns a vector of size n_classes.
+    pub fn classes(&self) -> &Vec<T> {
+        &self.inner.distribution.class_labels
+    }
+
+    /// Number of training samples observed in each class.
+    /// Returns a vector of size n_classes.
+    pub fn class_count(&self) -> &Vec<usize> {
+        &self.inner.distribution.class_count
+    }
+
+    /// Probability of each class
+    /// Returns a vector of size n_classes.
+    pub fn class_priors(&self) -> &Vec<T> {
+        &self.inner.distribution.class_priors
+    }
+
+    /// Mean of each feature per class
+    /// Returns a 2d vector of shape (n_classes, n_features).
+    pub fn theta(&self) -> &Vec<Vec<T>> {
+        &self.inner.distribution.theta
+    }
+
+    /// Variance of each feature per class
+    /// Returns a 2d vector of shape (n_classes, n_features).
+    pub fn var(&self) -> &Vec<Vec<T>> {
+        &self.inner.distribution.var
+    }
 }
 
 #[cfg(test)]
@@ -245,18 +274,23 @@ mod tests {
         let gnb = GaussianNB::fit(&x, &y, Default::default()).unwrap();
         let y_hat = gnb.predict(&x).unwrap();
         assert_eq!(y_hat, y);
+
+        assert_eq!(gnb.classes(), &[1., 2.]);
+
+        assert_eq!(gnb.class_count(), &[3, 3]);
+
         assert_eq!(
-            gnb.inner.distribution.sigma,
+            gnb.var(),
             &[
                 &[0.666666666666667, 0.22222222222222232],
                 &[0.666666666666667, 0.22222222222222232]
             ]
         );
 
-        assert_eq!(gnb.inner.distribution.class_priors, &[0.5, 0.5]);
+        assert_eq!(gnb.class_priors(), &[0.5, 0.5]);
         assert_eq!(
-            gnb.inner.distribution.theta,
+            gnb.theta(),
             &[&[-2., -1.3333333333333333], &[2., 1.3333333333333333]]
         );
     }
 
@@ -277,7 +311,7 @@ mod tests {
         let parameters = GaussianNBParameters::default().with_priors(priors.clone());
         let gnb = GaussianNB::fit(&x, &y, parameters).unwrap();
 
-        assert_eq!(gnb.inner.distribution.class_priors, priors);
+        assert_eq!(gnb.class_priors(), &priors);
     }
 
     #[test]
diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs
index 06ee071..87e0ddd 100644
--- a/src/naive_bayes/multinomial.rs
+++ b/src/naive_bayes/multinomial.rs
@@ -51,8 +51,16 @@ use serde::{Deserialize, Serialize};
 struct MultinomialNBDistribution<T: RealNumber> {
     /// class labels known to the classifier
     class_labels: Vec<T>,
+    /// number of training samples observed in each class
+    class_count: Vec<usize>,
+    /// probability of each class
     class_priors: Vec<T>,
-    feature_prob: Vec<Vec<T>>,
+    /// Empirical log probability of features given a class
+    feature_log_prob: Vec<Vec<T>>,
+    /// Number of samples encountered for each (class, feature)
+    feature_count: Vec<Vec<usize>>,
+    /// Number of features of each sample
+    n_features: usize,
 }
 
 impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for MultinomialNBDistribution<T> {
@@ -64,7 +72,7 @@ impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for MultinomialNBDistribution<T> {
         let mut likelihood = T::zero();
         for feature in 0..j.len() {
             let value = j.get(feature);
-            likelihood += value * self.feature_prob[class_index][feature].ln();
+            likelihood += value * self.feature_log_prob[class_index][feature];
         }
         likelihood
     }
@@ -144,10 +152,10 @@ impl<T: RealNumber> MultinomialNBDistribution<T> {
         let y = y.to_vec();
 
         let (class_labels, indices) = <Vec<T> as RealNumberVector<T>>::unique_with_indices(&y);
-        let mut class_count = vec![T::zero(); class_labels.len()];
+        let mut class_count = vec![0_usize; class_labels.len()];
 
         for class_index in indices.iter() {
-            class_count[*class_index] += T::one();
+            class_count[*class_index] += 1;
         }
 
         let class_priors = if let Some(class_priors) = priors {
@@ -160,33 +168,46 @@ impl<T: RealNumber> MultinomialNBDistribution<T> {
         } else {
             class_count
                 .iter()
-                .map(|&c| c / T::from(n_samples).unwrap())
+                .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap())
                 .collect()
         };
 
-        let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()];
+        let mut feature_in_class_counter = vec![vec![0_usize; n_features]; class_labels.len()];
 
         for (row, class_index) in row_iter(x).zip(indices) {
             for (idx, row_i) in row.iter().enumerate().take(n_features) {
-                feature_in_class_counter[class_index][idx] += *row_i;
+                feature_in_class_counter[class_index][idx] +=
+                    row_i.to_usize().ok_or_else(|| {
+                        Failed::fit(&format!(
+                            "Elements of the matrix should be convertible to usize |found|=[{}]",
+                            row_i
+                        ))
+                    })?;
             }
         }
 
-        let feature_prob = feature_in_class_counter
+        let feature_log_prob = feature_in_class_counter
             .iter()
             .map(|feature_count| {
-                let n_c = feature_count.sum();
+                let n_c: usize = feature_count.iter().sum();
                 feature_count
                     .iter()
-                    .map(|&count| (count + alpha) / (n_c + alpha * T::from(n_features).unwrap()))
+                    .map(|&count| {
+                        ((T::from(count).unwrap() + alpha)
+                            / (T::from(n_c).unwrap() + alpha * T::from(n_features).unwrap()))
+                        .ln()
+                    })
                     .collect()
             })
             .collect();
 
         Ok(Self {
+            class_count,
             class_labels,
             class_priors,
-            feature_prob,
+            feature_log_prob,
+            feature_count: feature_in_class_counter,
+            n_features,
         })
     }
 }
@@ -240,6 +261,35 @@ impl<T: RealNumber, M: Matrix<T>> MultinomialNB<T, M> {
     pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
         self.inner.predict(x)
     }
+
+    /// Class labels known to the classifier.
+    /// Returns a vector of size n_classes.
+    pub fn classes(&self) -> &Vec<T> {
+        &self.inner.distribution.class_labels
+    }
+
+    /// Number of training samples observed in each class.
+    /// Returns a vector of size n_classes.
+    pub fn class_count(&self) -> &Vec<usize> {
+        &self.inner.distribution.class_count
+    }
+
+    /// Empirical log probability of features given a class, P(x_i|y).
+    /// Returns a 2d vector of shape (n_classes, n_features)
+    pub fn feature_log_prob(&self) -> &Vec<Vec<T>> {
+        &self.inner.distribution.feature_log_prob
+    }
+
+    /// Number of features of each sample
+    pub fn n_features(&self) -> usize {
+        self.inner.distribution.n_features
+    }
+
+    /// Number of samples encountered for each (class, feature)
+    /// Returns a 2d vector of shape (n_classes, n_features)
+    pub fn feature_count(&self) -> &Vec<Vec<usize>> {
+        &self.inner.distribution.feature_count
+    }
 }
 
 #[cfg(test)]
@@ -268,12 +318,29 @@ mod tests {
         let y = vec![0., 0., 0., 1.];
         let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap();
 
+        assert_eq!(mnb.classes(), &[0., 1.]);
+        assert_eq!(mnb.class_count(), &[3, 1]);
+
         assert_eq!(mnb.inner.distribution.class_priors, &[0.75, 0.25]);
         assert_eq!(
-            mnb.inner.distribution.feature_prob,
+            mnb.feature_log_prob(),
             &[
-                &[1. / 7., 3. / 7., 1. / 14., 1. / 7., 1. / 7., 1. / 14.],
-                &[1. / 9., 2. / 9.0, 2. / 9.0, 1. / 9.0, 1. / 9.0, 2. / 9.0]
+                &[
+                    (1_f64 / 7_f64).ln(),
+                    (3_f64 / 7_f64).ln(),
+                    (1_f64 / 14_f64).ln(),
+                    (1_f64 / 7_f64).ln(),
+                    (1_f64 / 7_f64).ln(),
+                    (1_f64 / 14_f64).ln()
+                ],
+                &[
+                    (1_f64 / 9_f64).ln(),
+                    (2_f64 / 9_f64).ln(),
+                    (2_f64 / 9_f64).ln(),
+                    (1_f64 / 9_f64).ln(),
+                    (1_f64 / 9_f64).ln(),
+                    (2_f64 / 9_f64).ln()
+                ]
             ]
         );
 
@@ -307,6 +374,16 @@ mod tests {
         let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.];
         let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap();
 
+        assert_eq!(nb.n_features(), 10);
+        assert_eq!(
+            nb.feature_count(),
+            &[
+                &[12, 20, 11, 24, 12, 14, 13, 17, 13, 18],
+                &[9, 6, 9, 4, 7, 3, 8, 5, 4, 9],
+                &[10, 12, 9, 9, 11, 3, 9, 18, 10, 10]
+            ]
+        );
+
         let y_hat = nb.predict(&x).unwrap();
 
         assert!(nb
@@ -314,9 +391,20 @@ mod tests {
             .distribution
             .class_priors
             .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2));
-        assert!(nb.inner.distribution.feature_prob[1].approximate_eq(
-            &vec!(0.07, 0.12, 0.07, 0.15, 0.07, 0.09, 0.08, 0.10, 0.08, 0.11),
-            1e-1
+        assert!(nb.feature_log_prob()[1].approximate_eq(
+            &vec![
+                -2.00148,
+                -2.35815494,
+                -2.00148,
+                -2.69462718,
+                -2.22462355,
+                -2.91777073,
+                -2.10684052,
+                -2.51230562,
+                -2.69462718,
+                -2.00148
+            ],
+            1e-5
        ));
         assert!(y_hat.approximate_eq(
             &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 2.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 2.0),
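
// ---------------------------------------------------------------------------
// Editor's usage sketch, not part of the diff above. It shows how the new
// introspection getters are intended to be called once this change lands,
// instead of reaching into `model.inner.distribution` as the old tests did.
// Assumption: the smartcore 0.2-era import paths below, which are not shown
// in the diff itself; the expected values come from the Gaussian test in
// this patch.

use smartcore::linalg::naive::dense_matrix::DenseMatrix;
use smartcore::naive_bayes::gaussian::GaussianNB;

fn inspect_fitted_model() {
    // Two classes, two features; a toy dataset consistent with the
    // theta/var values asserted in the gaussian.rs test.
    let x = DenseMatrix::from_2d_array(&[
        &[-1., -1.],
        &[-2., -1.],
        &[-3., -2.],
        &[1., 1.],
        &[2., 1.],
        &[3., 2.],
    ]);
    let y = vec![1., 1., 1., 2., 2., 2.];

    let gnb = GaussianNB::fit(&x, &y, Default::default()).unwrap();

    assert_eq!(gnb.classes(), &[1., 2.]); // class labels
    assert_eq!(gnb.class_count(), &[3, 3]); // training samples per class
    assert_eq!(gnb.class_priors(), &[0.5, 0.5]); // empirical priors
    let _theta = gnb.theta(); // per-class feature means, shape (n_classes, n_features)
    let _var = gnb.var(); // per-class feature variances, shape (n_classes, n_features)
}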