feat: add Naive Bayes and CategoricalNB (#15)

* feat: Implement Naive Bayes classifier

* Implement CategoricalNB
morenol
2020-11-09 15:54:27 -04:00
committed by GitHub
parent 4efad85f8a
commit 3d4d5f64f6
3 changed files with 303 additions and 0 deletions
+2
@@ -85,6 +85,8 @@ pub mod math;
/// Functions for assessing prediction error.
pub mod metrics;
pub mod model_selection;
/// Supervised learning algorithms based on applying Bayes' theorem with the (naive) assumption of independence between predictors
pub mod naive_bayes;
/// Supervised neighbors-based learning methods
pub mod neighbors;
pub(crate) mod optimization;
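For orientation, here is a minimal downstream sketch of the new public path (the crate name `smartcore` is assumed here; the diff itself does not name the crate):

use smartcore::linalg::naive::dense_matrix::DenseMatrix;
use smartcore::naive_bayes::{CategoricalNB, CategoricalNBParameters};

fn main() {
    // Three samples with two categorical features each; class labels in `y`.
    let x = DenseMatrix::from_2d_array(&[&[0., 1.], &[1., 0.], &[1., 1.]]);
    let y = vec![0., 1., 1.];
    let model = CategoricalNB::fit(&x, &y, CategoricalNBParameters::default()).unwrap();
    assert_eq!(model.predict(&x).unwrap(), vec![0., 1., 1.]);
}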
+232
@@ -0,0 +1,232 @@
use crate::error::Failed;
use crate::linalg::BaseVector;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use crate::naive_bayes::{BaseNaiveBayes, NBDistribution};
use serde::{Deserialize, Serialize};
/// Distribution of categorical features used by the categorical Naive Bayes classifier
struct CategoricalNBDistribution<T: RealNumber> {
class_labels: Vec<T>,
class_probabilities: Vec<T>,
coef: Vec<Vec<Vec<T>>>,
feature_categories: Vec<Vec<T>>,
}
impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for CategoricalNBDistribution<T> {
fn prior(&self, class_index: usize) -> T {
if class_index >= self.class_labels.len() {
T::zero()
} else {
self.class_probabilities[class_index]
}
}
fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T {
if class_index < self.class_labels.len() {
let mut prob = T::one();
for feature in 0..j.len() {
let value = j.get(feature);
                match self.feature_categories[feature]
                    .iter()
                    .position(|&t| t == value)
                {
                    Some(i) => prob *= self.coef[class_index][feature][i],
                    None => return T::zero(),
                }
}
prob
} else {
T::zero()
}
}
fn classes(&self) -> &Vec<T> {
&self.class_labels
}
}
impl<T: RealNumber> CategoricalNBDistribution<T> {
    /// Fits the distribution to an NxM matrix where N is the number of samples and M is the number of features.
/// * `x` - training data.
/// * `y` - vector with target values (classes) of length N.
/// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
pub fn fit<M: Matrix<T>>(x: &M, y: &M::RowVector, alpha: T) -> Result<Self, Failed> {
if alpha < T::zero() {
return Err(Failed::fit(&format!(
"alpha should be >= 0, alpha=[{}]",
alpha
)));
}
let (n_samples, n_features) = x.shape();
let y_samples = y.len();
if y_samples != n_samples {
return Err(Failed::fit(&format!(
"Size of x should equal size of y; |x|=[{}], |y|=[{}]",
n_samples, y_samples
)));
}
if n_samples == 0 {
return Err(Failed::fit(&format!(
"Size of x and y should greater than 0; |x|=[{}]",
n_samples
)));
}
let mut y_sorted = y.to_vec();
y_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
let mut class_labels = Vec::with_capacity(y.len());
class_labels.push(y_sorted[0]);
let mut classes_count = Vec::with_capacity(y.len());
let mut current_count = T::one();
        for idx in 1..y_samples {
            if y_sorted[idx] == y_sorted[idx - 1] {
                current_count += T::one();
            } else {
                classes_count.push(current_count);
                class_labels.push(y_sorted[idx]);
                current_count = T::one();
            }
        }
        // Record the count for the final class once the scan is complete.
        classes_count.push(current_count);
let mut feature_categories: Vec<Vec<T>> = Vec::with_capacity(n_features);
for feature in 0..n_features {
let feature_types = x.get_col_as_vec(feature).unique();
feature_categories.push(feature_types);
}
let mut coef: Vec<Vec<Vec<T>>> = Vec::with_capacity(class_labels.len());
for (label, label_count) in class_labels.iter().zip(classes_count.iter()) {
let mut coef_i: Vec<Vec<T>> = Vec::with_capacity(n_features);
for (feature_index, feature_options) in
feature_categories.iter().enumerate().take(n_features)
{
let col = x
.get_col_as_vec(feature_index)
.iter()
.enumerate()
.filter(|(i, _j)| y.get(*i) == *label)
.map(|(_, j)| *j)
.collect::<Vec<T>>();
let mut feat_count: Vec<usize> = Vec::with_capacity(feature_options.len());
for k in feature_options.iter() {
let feat_k_count = col.iter().filter(|&v| v == k).count();
feat_count.push(feat_k_count);
}
let coef_i_j = feat_count
.iter()
.map(|c| {
(T::from(*c).unwrap() + alpha)
/ (T::from(*label_count).unwrap()
+ T::from(feature_options.len()).unwrap() * alpha)
})
.collect::<Vec<T>>();
coef_i.push(coef_i_j);
}
coef.push(coef_i);
}
let class_probabilities = classes_count
.into_iter()
.map(|count| count / T::from(n_samples).unwrap())
.collect::<Vec<T>>();
Ok(Self {
class_labels,
class_probabilities,
coef,
feature_categories,
})
}
}
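#[cfg(test)]
mod smoothing_note {
    // Editorial sketch, not part of this commit: a worked check of the
    // additive-smoothing estimate computed in `fit` above. Each coefficient is
    //     P(x_j = k | y = c) = (N_ck + alpha) / (N_c + alpha * K_j),
    // where N_ck counts class-c samples with category k in feature j, N_c is
    // the class count, and K_j is the number of categories of feature j.
    #[test]
    fn smoothed_estimate_arithmetic() {
        let (n_ck, n_c, k_j, alpha) = (2.0_f64, 5.0, 3.0, 1.0);
        let p = (n_ck + alpha) / (n_c + alpha * k_j);
        assert!((p - 0.375).abs() < 1e-12); // (2 + 1) / (5 + 3) = 3/8
    }
}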
/// `CategoricalNB` parameters. Use `Default::default()` for default values.
#[derive(Serialize, Deserialize, Debug)]
pub struct CategoricalNBParameters<T: RealNumber> {
/// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
pub alpha: T,
}
impl<T: RealNumber> CategoricalNBParameters<T> {
    /// Create CategoricalNBParameters with specific parameters.
pub fn new(alpha: T) -> Result<Self, Failed> {
        if alpha >= T::zero() {
Ok(Self { alpha })
} else {
Err(Failed::fit(&format!(
"alpha should be >= 0, alpha=[{}]",
alpha
)))
}
}
}
impl<T: RealNumber> Default for CategoricalNBParameters<T> {
fn default() -> Self {
Self { alpha: T::one() }
}
}
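// Editorial sketch, not part of this commit: choosing the smoothing level.
//
//     let laplace: CategoricalNBParameters<f64> = Default::default(); // alpha = 1
//     let lidstone = CategoricalNBParameters::new(0.5_f64).unwrap();  // alpha = 0.5
//
// `new` rejects negative values with the same bound that `fit` enforces.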
/// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data.
pub struct CategoricalNB<T: RealNumber, M: Matrix<T>> {
inner: BaseNaiveBayes<T, M, CategoricalNBDistribution<T>>,
}
impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
/// Fits CategoricalNB with given data
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
/// features.
/// * `y` - vector with target values (classes) of length N.
/// * `parameters` - additional parameters like alpha for smoothing
pub fn fit(
x: &M,
y: &M::RowVector,
parameters: CategoricalNBParameters<T>,
) -> Result<Self, Failed> {
let alpha = parameters.alpha;
let distribution = CategoricalNBDistribution::fit(x, y, alpha)?;
let inner = BaseNaiveBayes::fit(distribution)?;
Ok(Self { inner })
}
/// Estimates the class labels for the provided data.
/// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
/// Returns a vector of size N with class estimates.
pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
self.inner.predict(x)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::linalg::naive::dense_matrix::DenseMatrix;
#[test]
    fn run_categorical_naive_bayes() {
let x = DenseMatrix::from_2d_array(&[
&[0., 2., 1., 0.],
&[0., 2., 1., 1.],
&[1., 2., 1., 0.],
&[2., 1., 1., 0.],
&[2., 0., 0., 0.],
&[2., 0., 0., 1.],
&[1., 0., 0., 1.],
&[0., 1., 1., 0.],
&[0., 0., 0., 0.],
&[2., 1., 0., 0.],
&[0., 1., 0., 1.],
&[1., 1., 1., 1.],
&[1., 2., 0., 0.],
&[2., 1., 1., 1.],
]);
let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.];
let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]);
let y_hat = cnb.predict(&x_test).unwrap();
assert_eq!(y_hat, vec![0., 1.]);
}
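    // Editorial sketch, not part of this commit: the same API with an
    // explicit, non-default smoothing parameter built via `new`.
    #[test]
    fn run_categorical_naive_bayes_with_alpha() {
        let x = DenseMatrix::from_2d_array(&[&[0., 0.], &[0., 1.], &[1., 0.], &[1., 1.]]);
        let y = vec![0., 0., 1., 1.];
        let params = CategoricalNBParameters::new(0.5).unwrap();
        let cnb = CategoricalNB::fit(&x, &y, params).unwrap();
        let y_hat = cnb.predict(&x).unwrap();
        assert_eq!(y_hat, vec![0., 0., 1., 1.]);
    }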
}
+69
@@ -0,0 +1,69 @@
use crate::error::Failed;
use crate::linalg::BaseVector;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use std::marker::PhantomData;
/// Distribution used in the Naive Bayes classifier.
pub(crate) trait NBDistribution<T: RealNumber, M: Matrix<T>> {
/// Prior of class at the given index.
fn prior(&self, class_index: usize) -> T;
    /// Conditional probability of sample `j` given the class at the specified index.
fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T;
/// Possible classes of the distribution.
fn classes(&self) -> &Vec<T>;
}
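// Editorial sketch, not part of this commit: a minimal implementer of the
// trait, to make the contract concrete. A uniform distribution ignores the
// sample entirely, so every class receives the same score and `predict`
// degenerates to a tie (broken by `max_by`, which keeps the last maximum).
#[cfg(test)]
#[allow(dead_code)]
struct UniformDistribution<T: RealNumber> {
    classes: Vec<T>,
}

#[cfg(test)]
impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for UniformDistribution<T> {
    fn prior(&self, _class_index: usize) -> T {
        T::one() / T::from(self.classes.len()).unwrap()
    }
    fn conditional_probability(&self, _class_index: usize, _j: &M::RowVector) -> T {
        T::one()
    }
    fn classes(&self) -> &Vec<T> {
        &self.classes
    }
}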
/// Base struct for the Naive Bayes classifier.
pub(crate) struct BaseNaiveBayes<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> {
distribution: D,
_phantom_t: PhantomData<T>,
_phantom_m: PhantomData<M>,
}
impl<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> BaseNaiveBayes<T, M, D> {
    /// Fits the NB classifier to a given NBDistribution.
    /// * `distribution` - NBDistribution of the training data.
pub fn fit(distribution: D) -> Result<Self, Failed> {
Ok(Self {
distribution,
_phantom_t: PhantomData,
_phantom_m: PhantomData,
})
}
/// Estimates the class labels for the provided data.
/// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
/// Returns a vector of size N with class estimates.
pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
let y_classes = self.distribution.classes();
let (rows, _) = x.shape();
let predictions = (0..rows)
.map(|row_index| {
let row = x.get_row(row_index);
let (prediction, _probability) = y_classes
.iter()
.enumerate()
.map(|(class_index, class)| {
(
class,
self.distribution.conditional_probability(class_index, &row)
* self.distribution.prior(class_index),
)
})
.max_by(|(_, p1), (_, p2)| p1.partial_cmp(p2).unwrap())
.unwrap();
*prediction
})
.collect::<Vec<T>>();
let mut y_hat = M::RowVector::zeros(rows);
for (i, prediction) in predictions.iter().enumerate().take(rows) {
y_hat.set(i, *prediction);
}
Ok(y_hat)
}
}
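// Editorial note, not part of this commit: `predict` above implements the
// maximum a posteriori rule
//     y_hat = argmax_c prior(c) * conditional_probability(c, x),
// i.e. the class with the largest unnormalized posterior wins. For example,
// with priors (0.6, 0.4) and likelihoods (0.10, 0.20) the scores are
// (0.06, 0.08), so the second class is chosen despite its smaller prior.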
mod categorical;
pub use categorical::{CategoricalNB, CategoricalNBParameters};