feat: add Naive Bayes and CategoricalNB (#15)
* feat: Implement Naive Bayes classifier * Implement CategoricalNB
This commit is contained in:
@@ -85,6 +85,8 @@ pub mod math;
|
||||
/// Functions for assessing prediction error.
|
||||
pub mod metrics;
|
||||
pub mod model_selection;
|
||||
/// Supervised learning algorithms based on applying Bayes' theorem with the assumption of independence between predictors
|
||||
pub mod naive_bayes;
|
||||
/// Supervised neighbors-based learning methods
|
||||
pub mod neighbors;
|
||||
pub(crate) mod optimization;
|
||||
|
||||
@@ -0,0 +1,232 @@
|
||||
use crate::error::Failed;
|
||||
use crate::linalg::BaseVector;
|
||||
use crate::linalg::Matrix;
|
||||
use crate::math::num::RealNumber;
|
||||
use crate::naive_bayes::{BaseNaiveBayes, NBDistribution};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Naive Bayes classifier for categorical features
|
||||
struct CategoricalNBDistribution<T: RealNumber> {
|
||||
class_labels: Vec<T>,
|
||||
class_probabilities: Vec<T>,
|
||||
coef: Vec<Vec<Vec<T>>>,
|
||||
feature_categories: Vec<Vec<T>>,
|
||||
}
|
||||
|
||||
impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for CategoricalNBDistribution<T> {
    /// Prior of the class at `class_index`; zero when the index is not a known class.
    fn prior(&self, class_index: usize) -> T {
        if class_index < self.class_labels.len() {
            self.class_probabilities[class_index]
        } else {
            T::zero()
        }
    }

    /// Product of the stored per-feature probabilities of sample `j` for the class at
    /// `class_index`.
    ///
    /// Returns zero when `class_index` is not a known class or when any value of `j`
    /// is not among the categories recorded for its feature.
    fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T {
        if class_index >= self.class_labels.len() {
            return T::zero();
        }

        let mut likelihood = T::one();
        for feature in 0..j.len() {
            let observed = j.get(feature);
            let category = self.feature_categories[feature]
                .iter()
                .position(|&known| known == observed);
            if let Some(category_index) = category {
                likelihood *= self.coef[class_index][feature][category_index];
            } else {
                // A value that is not a recorded category makes the whole product zero.
                return T::zero();
            }
        }
        likelihood
    }

    /// Class labels known to this distribution.
    fn classes(&self) -> &Vec<T> {
        &self.class_labels
    }
}
|
||||
|
||||
impl<T: RealNumber> CategoricalNBDistribution<T> {
|
||||
/// Fits the distribution to a NxM matrix where N is number of samples and M is number of features.
|
||||
/// * `x` - training data.
|
||||
/// * `y` - vector with target values (classes) of length N.
|
||||
/// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
|
||||
pub fn fit<M: Matrix<T>>(x: &M, y: &M::RowVector, alpha: T) -> Result<Self, Failed> {
|
||||
if alpha < T::zero() {
|
||||
return Err(Failed::fit(&format!(
|
||||
"alpha should be >= 0, alpha=[{}]",
|
||||
alpha
|
||||
)));
|
||||
}
|
||||
|
||||
let (n_samples, n_features) = x.shape();
|
||||
let y_samples = y.len();
|
||||
if y_samples != n_samples {
|
||||
return Err(Failed::fit(&format!(
|
||||
"Size of x should equal size of y; |x|=[{}], |y|=[{}]",
|
||||
n_samples, y_samples
|
||||
)));
|
||||
}
|
||||
|
||||
if n_samples == 0 {
|
||||
return Err(Failed::fit(&format!(
|
||||
"Size of x and y should greater than 0; |x|=[{}]",
|
||||
n_samples
|
||||
)));
|
||||
}
|
||||
|
||||
let mut y_sorted = y.to_vec();
|
||||
y_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
let mut class_labels = Vec::with_capacity(y.len());
|
||||
class_labels.push(y_sorted[0]);
|
||||
let mut classes_count = Vec::with_capacity(y.len());
|
||||
let mut current_count = T::one();
|
||||
for idx in 1..y_samples {
|
||||
if y_sorted[idx] == y_sorted[idx - 1] {
|
||||
current_count += T::one();
|
||||
} else {
|
||||
classes_count.push(current_count);
|
||||
class_labels.push(y_sorted[idx]);
|
||||
current_count = T::one()
|
||||
}
|
||||
classes_count.push(current_count);
|
||||
}
|
||||
|
||||
let mut feature_categories: Vec<Vec<T>> = Vec::with_capacity(n_features);
|
||||
|
||||
for feature in 0..n_features {
|
||||
let feature_types = x.get_col_as_vec(feature).unique();
|
||||
feature_categories.push(feature_types);
|
||||
}
|
||||
let mut coef: Vec<Vec<Vec<T>>> = Vec::with_capacity(class_labels.len());
|
||||
for (label, label_count) in class_labels.iter().zip(classes_count.iter()) {
|
||||
let mut coef_i: Vec<Vec<T>> = Vec::with_capacity(n_features);
|
||||
for (feature_index, feature_options) in
|
||||
feature_categories.iter().enumerate().take(n_features)
|
||||
{
|
||||
let col = x
|
||||
.get_col_as_vec(feature_index)
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(i, _j)| y.get(*i) == *label)
|
||||
.map(|(_, j)| *j)
|
||||
.collect::<Vec<T>>();
|
||||
let mut feat_count: Vec<usize> = Vec::with_capacity(feature_options.len());
|
||||
for k in feature_options.iter() {
|
||||
let feat_k_count = col.iter().filter(|&v| v == k).count();
|
||||
feat_count.push(feat_k_count);
|
||||
}
|
||||
|
||||
let coef_i_j = feat_count
|
||||
.iter()
|
||||
.map(|c| {
|
||||
(T::from(*c).unwrap() + alpha)
|
||||
/ (T::from(*label_count).unwrap()
|
||||
+ T::from(feature_options.len()).unwrap() * alpha)
|
||||
})
|
||||
.collect::<Vec<T>>();
|
||||
coef_i.push(coef_i_j);
|
||||
}
|
||||
coef.push(coef_i);
|
||||
}
|
||||
let class_probabilities = classes_count
|
||||
.into_iter()
|
||||
.map(|count| count / T::from(n_samples).unwrap())
|
||||
.collect::<Vec<T>>();
|
||||
|
||||
Ok(Self {
|
||||
class_labels,
|
||||
class_probabilities,
|
||||
coef,
|
||||
feature_categories,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// `CategoricalNB` parameters. Use `Default::default()` for default values.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct CategoricalNBParameters<T: RealNumber> {
|
||||
/// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
|
||||
pub alpha: T,
|
||||
}
|
||||
|
||||
impl<T: RealNumber> CategoricalNBParameters<T> {
|
||||
/// Create CategoricalNBParameters with specific paramaters.
|
||||
pub fn new(alpha: T) -> Result<Self, Failed> {
|
||||
if alpha > T::zero() {
|
||||
Ok(Self { alpha })
|
||||
} else {
|
||||
Err(Failed::fit(&format!(
|
||||
"alpha should be >= 0, alpha=[{}]",
|
||||
alpha
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<T: RealNumber> Default for CategoricalNBParameters<T> {
    /// Default parameters: `alpha` set to one.
    fn default() -> Self {
        let alpha = T::one();
        Self { alpha }
    }
}
|
||||
|
||||
/// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data.
|
||||
pub struct CategoricalNB<T: RealNumber, M: Matrix<T>> {
|
||||
inner: BaseNaiveBayes<T, M, CategoricalNBDistribution<T>>,
|
||||
}
|
||||
|
||||
impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
|
||||
/// Fits CategoricalNB with given data
|
||||
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
||||
/// features.
|
||||
/// * `y` - vector with target values (classes) of length N.
|
||||
/// * `parameters` - additional parameters like alpha for smoothing
|
||||
pub fn fit(
|
||||
x: &M,
|
||||
y: &M::RowVector,
|
||||
parameters: CategoricalNBParameters<T>,
|
||||
) -> Result<Self, Failed> {
|
||||
let alpha = parameters.alpha;
|
||||
let distribution = CategoricalNBDistribution::fit(x, y, alpha)?;
|
||||
let inner = BaseNaiveBayes::fit(distribution)?;
|
||||
Ok(Self { inner })
|
||||
}
|
||||
|
||||
/// Estimates the class labels for the provided data.
|
||||
/// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
|
||||
/// Returns a vector of size N with class estimates.
|
||||
pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||
self.inner.predict(x)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::linalg::naive::dense_matrix::DenseMatrix;

    /// Fits on a 14-sample, 4-feature categorical data set and checks the predicted
    /// class of two query rows.
    #[test]
    fn run_base_naive_bayes() {
        let train_x = DenseMatrix::from_2d_array(&[
            &[0., 2., 1., 0.],
            &[0., 2., 1., 1.],
            &[1., 2., 1., 0.],
            &[2., 1., 1., 0.],
            &[2., 0., 0., 0.],
            &[2., 0., 0., 1.],
            &[1., 0., 0., 1.],
            &[0., 1., 1., 0.],
            &[0., 0., 0., 0.],
            &[2., 1., 0., 0.],
            &[0., 1., 0., 1.],
            &[1., 1., 1., 1.],
            &[1., 2., 0., 0.],
            &[2., 1., 1., 1.],
        ]);
        let train_y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.];

        let model = CategoricalNB::fit(&train_x, &train_y, Default::default()).unwrap();

        let queries = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]);
        let predicted = model.predict(&queries).unwrap();
        assert_eq!(predicted, vec![0., 1.]);
    }
}
|
||||
@@ -0,0 +1,69 @@
|
||||
use crate::error::Failed;
|
||||
use crate::linalg::BaseVector;
|
||||
use crate::linalg::Matrix;
|
||||
use crate::math::num::RealNumber;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
/// Distribution used in the Naive Bayes classifier.
|
||||
pub(crate) trait NBDistribution<T: RealNumber, M: Matrix<T>> {
|
||||
/// Prior of class at the given index.
|
||||
fn prior(&self, class_index: usize) -> T;
|
||||
|
||||
/// Conditional probability of sample j given class in the specified index.
|
||||
fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T;
|
||||
|
||||
/// Possible classes of the distribution.
|
||||
fn classes(&self) -> &Vec<T>;
|
||||
}
|
||||
|
||||
/// Base struct for the Naive Bayes classifier.
|
||||
pub(crate) struct BaseNaiveBayes<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> {
|
||||
distribution: D,
|
||||
_phantom_t: PhantomData<T>,
|
||||
_phantom_m: PhantomData<M>,
|
||||
}
|
||||
|
||||
impl<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> BaseNaiveBayes<T, M, D> {
|
||||
/// Fits NB classifier to a given NBdistribution.
|
||||
/// * `distribution` - NBDistribution of the training data
|
||||
pub fn fit(distribution: D) -> Result<Self, Failed> {
|
||||
Ok(Self {
|
||||
distribution,
|
||||
_phantom_t: PhantomData,
|
||||
_phantom_m: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// Estimates the class labels for the provided data.
|
||||
/// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
|
||||
/// Returns a vector of size N with class estimates.
|
||||
pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||
let y_classes = self.distribution.classes();
|
||||
let (rows, _) = x.shape();
|
||||
let predictions = (0..rows)
|
||||
.map(|row_index| {
|
||||
let row = x.get_row(row_index);
|
||||
let (prediction, _probability) = y_classes
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(class_index, class)| {
|
||||
(
|
||||
class,
|
||||
self.distribution.conditional_probability(class_index, &row)
|
||||
* self.distribution.prior(class_index),
|
||||
)
|
||||
})
|
||||
.max_by(|(_, p1), (_, p2)| p1.partial_cmp(p2).unwrap())
|
||||
.unwrap();
|
||||
*prediction
|
||||
})
|
||||
.collect::<Vec<T>>();
|
||||
let mut y_hat = M::RowVector::zeros(rows);
|
||||
for (i, prediction) in predictions.iter().enumerate().take(rows) {
|
||||
y_hat.set(i, *prediction);
|
||||
}
|
||||
Ok(y_hat)
|
||||
}
|
||||
}
|
||||
mod categorical;
|
||||
pub use categorical::{CategoricalNB, CategoricalNBParameters};
|
||||
Reference in New Issue
Block a user