feat: add Naive Bayes and CategoricalNB (#15)
* feat: Implement Naive Bayes classifier * Implement CategoricalNB
This commit is contained in:
@@ -85,6 +85,8 @@ pub mod math;
|
|||||||
/// Functions for assessing prediction error.
|
/// Functions for assessing prediction error.
|
||||||
pub mod metrics;
|
pub mod metrics;
|
||||||
pub mod model_selection;
|
pub mod model_selection;
|
||||||
|
/// Supervised learning algorithms based on applying the Bayes theorem with the independence assumptions between predictors
|
||||||
|
pub mod naive_bayes;
|
||||||
/// Supervised neighbors-based learning methods
|
/// Supervised neighbors-based learning methods
|
||||||
pub mod neighbors;
|
pub mod neighbors;
|
||||||
pub(crate) mod optimization;
|
pub(crate) mod optimization;
|
||||||
|
|||||||
@@ -0,0 +1,232 @@
|
|||||||
|
use crate::error::Failed;
|
||||||
|
use crate::linalg::BaseVector;
|
||||||
|
use crate::linalg::Matrix;
|
||||||
|
use crate::math::num::RealNumber;
|
||||||
|
use crate::naive_bayes::{BaseNaiveBayes, NBDistribution};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// Naive Bayes classifier for categorical features
|
||||||
|
struct CategoricalNBDistribution<T: RealNumber> {
|
||||||
|
class_labels: Vec<T>,
|
||||||
|
class_probabilities: Vec<T>,
|
||||||
|
coef: Vec<Vec<Vec<T>>>,
|
||||||
|
feature_categories: Vec<Vec<T>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for CategoricalNBDistribution<T> {
|
||||||
|
fn prior(&self, class_index: usize) -> T {
|
||||||
|
if class_index >= self.class_labels.len() {
|
||||||
|
T::zero()
|
||||||
|
} else {
|
||||||
|
self.class_probabilities[class_index]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T {
|
||||||
|
if class_index < self.class_labels.len() {
|
||||||
|
let mut prob = T::one();
|
||||||
|
for feature in 0..j.len() {
|
||||||
|
let value = j.get(feature);
|
||||||
|
match self.feature_categories[feature]
|
||||||
|
.iter()
|
||||||
|
.position(|&t| t == value)
|
||||||
|
{
|
||||||
|
Some(_i) => prob *= self.coef[class_index][feature][_i],
|
||||||
|
None => return T::zero(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
prob
|
||||||
|
} else {
|
||||||
|
T::zero()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn classes(&self) -> &Vec<T> {
|
||||||
|
&self.class_labels
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber> CategoricalNBDistribution<T> {
|
||||||
|
/// Fits the distribution to a NxM matrix where N is number of samples and M is number of features.
|
||||||
|
/// * `x` - training data.
|
||||||
|
/// * `y` - vector with target values (classes) of length N.
|
||||||
|
/// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
|
||||||
|
pub fn fit<M: Matrix<T>>(x: &M, y: &M::RowVector, alpha: T) -> Result<Self, Failed> {
|
||||||
|
if alpha < T::zero() {
|
||||||
|
return Err(Failed::fit(&format!(
|
||||||
|
"alpha should be >= 0, alpha=[{}]",
|
||||||
|
alpha
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
let (n_samples, n_features) = x.shape();
|
||||||
|
let y_samples = y.len();
|
||||||
|
if y_samples != n_samples {
|
||||||
|
return Err(Failed::fit(&format!(
|
||||||
|
"Size of x should equal size of y; |x|=[{}], |y|=[{}]",
|
||||||
|
n_samples, y_samples
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if n_samples == 0 {
|
||||||
|
return Err(Failed::fit(&format!(
|
||||||
|
"Size of x and y should greater than 0; |x|=[{}]",
|
||||||
|
n_samples
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut y_sorted = y.to_vec();
|
||||||
|
y_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||||
|
let mut class_labels = Vec::with_capacity(y.len());
|
||||||
|
class_labels.push(y_sorted[0]);
|
||||||
|
let mut classes_count = Vec::with_capacity(y.len());
|
||||||
|
let mut current_count = T::one();
|
||||||
|
for idx in 1..y_samples {
|
||||||
|
if y_sorted[idx] == y_sorted[idx - 1] {
|
||||||
|
current_count += T::one();
|
||||||
|
} else {
|
||||||
|
classes_count.push(current_count);
|
||||||
|
class_labels.push(y_sorted[idx]);
|
||||||
|
current_count = T::one()
|
||||||
|
}
|
||||||
|
classes_count.push(current_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut feature_categories: Vec<Vec<T>> = Vec::with_capacity(n_features);
|
||||||
|
|
||||||
|
for feature in 0..n_features {
|
||||||
|
let feature_types = x.get_col_as_vec(feature).unique();
|
||||||
|
feature_categories.push(feature_types);
|
||||||
|
}
|
||||||
|
let mut coef: Vec<Vec<Vec<T>>> = Vec::with_capacity(class_labels.len());
|
||||||
|
for (label, label_count) in class_labels.iter().zip(classes_count.iter()) {
|
||||||
|
let mut coef_i: Vec<Vec<T>> = Vec::with_capacity(n_features);
|
||||||
|
for (feature_index, feature_options) in
|
||||||
|
feature_categories.iter().enumerate().take(n_features)
|
||||||
|
{
|
||||||
|
let col = x
|
||||||
|
.get_col_as_vec(feature_index)
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(i, _j)| y.get(*i) == *label)
|
||||||
|
.map(|(_, j)| *j)
|
||||||
|
.collect::<Vec<T>>();
|
||||||
|
let mut feat_count: Vec<usize> = Vec::with_capacity(feature_options.len());
|
||||||
|
for k in feature_options.iter() {
|
||||||
|
let feat_k_count = col.iter().filter(|&v| v == k).count();
|
||||||
|
feat_count.push(feat_k_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
let coef_i_j = feat_count
|
||||||
|
.iter()
|
||||||
|
.map(|c| {
|
||||||
|
(T::from(*c).unwrap() + alpha)
|
||||||
|
/ (T::from(*label_count).unwrap()
|
||||||
|
+ T::from(feature_options.len()).unwrap() * alpha)
|
||||||
|
})
|
||||||
|
.collect::<Vec<T>>();
|
||||||
|
coef_i.push(coef_i_j);
|
||||||
|
}
|
||||||
|
coef.push(coef_i);
|
||||||
|
}
|
||||||
|
let class_probabilities = classes_count
|
||||||
|
.into_iter()
|
||||||
|
.map(|count| count / T::from(n_samples).unwrap())
|
||||||
|
.collect::<Vec<T>>();
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
class_labels,
|
||||||
|
class_probabilities,
|
||||||
|
coef,
|
||||||
|
feature_categories,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `CategoricalNB` parameters. Use `Default::default()` for default values.
|
||||||
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
pub struct CategoricalNBParameters<T: RealNumber> {
|
||||||
|
/// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
|
||||||
|
pub alpha: T,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber> CategoricalNBParameters<T> {
|
||||||
|
/// Create CategoricalNBParameters with specific paramaters.
|
||||||
|
pub fn new(alpha: T) -> Result<Self, Failed> {
|
||||||
|
if alpha > T::zero() {
|
||||||
|
Ok(Self { alpha })
|
||||||
|
} else {
|
||||||
|
Err(Failed::fit(&format!(
|
||||||
|
"alpha should be >= 0, alpha=[{}]",
|
||||||
|
alpha
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl<T: RealNumber> Default for CategoricalNBParameters<T> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self { alpha: T::one() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data.
|
||||||
|
pub struct CategoricalNB<T: RealNumber, M: Matrix<T>> {
|
||||||
|
inner: BaseNaiveBayes<T, M, CategoricalNBDistribution<T>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
|
||||||
|
/// Fits CategoricalNB with given data
|
||||||
|
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
||||||
|
/// features.
|
||||||
|
/// * `y` - vector with target values (classes) of length N.
|
||||||
|
/// * `parameters` - additional parameters like alpha for smoothing
|
||||||
|
pub fn fit(
|
||||||
|
x: &M,
|
||||||
|
y: &M::RowVector,
|
||||||
|
parameters: CategoricalNBParameters<T>,
|
||||||
|
) -> Result<Self, Failed> {
|
||||||
|
let alpha = parameters.alpha;
|
||||||
|
let distribution = CategoricalNBDistribution::fit(x, y, alpha)?;
|
||||||
|
let inner = BaseNaiveBayes::fit(distribution)?;
|
||||||
|
Ok(Self { inner })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimates the class labels for the provided data.
|
||||||
|
/// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
|
||||||
|
/// Returns a vector of size N with class estimates.
|
||||||
|
pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.inner.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::linalg::naive::dense_matrix::DenseMatrix;

    /// Fits the classifier on a small four-feature data set with two classes and checks the
    /// predicted labels of two samples.
    #[test]
    fn run_base_naive_bayes() {
        // One row per training sample, one column per categorical feature.
        let x_train = DenseMatrix::from_2d_array(&[
            &[0., 2., 1., 0.],
            &[0., 2., 1., 1.],
            &[1., 2., 1., 0.],
            &[2., 1., 1., 0.],
            &[2., 0., 0., 0.],
            &[2., 0., 0., 1.],
            &[1., 0., 0., 1.],
            &[0., 1., 1., 0.],
            &[0., 0., 0., 0.],
            &[2., 1., 0., 0.],
            &[0., 1., 0., 1.],
            &[1., 1., 1., 1.],
            &[1., 2., 0., 0.],
            &[2., 1., 1., 1.],
        ]);
        let y_train = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.];

        let model = CategoricalNB::fit(&x_train, &y_train, Default::default()).unwrap();

        let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]);
        let predictions = model.predict(&x_test).unwrap();
        assert_eq!(predictions, vec![0., 1.]);
    }
}
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
use crate::error::Failed;
|
||||||
|
use crate::linalg::BaseVector;
|
||||||
|
use crate::linalg::Matrix;
|
||||||
|
use crate::math::num::RealNumber;
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
|
||||||
|
/// Distribution used in the Naive Bayes classifier.
|
||||||
|
pub(crate) trait NBDistribution<T: RealNumber, M: Matrix<T>> {
|
||||||
|
/// Prior of class at the given index.
|
||||||
|
fn prior(&self, class_index: usize) -> T;
|
||||||
|
|
||||||
|
/// Conditional probability of sample j given class in the specified index.
|
||||||
|
fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T;
|
||||||
|
|
||||||
|
/// Possible classes of the distribution.
|
||||||
|
fn classes(&self) -> &Vec<T>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Base struct for the Naive Bayes classifier.
|
||||||
|
pub(crate) struct BaseNaiveBayes<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> {
|
||||||
|
distribution: D,
|
||||||
|
_phantom_t: PhantomData<T>,
|
||||||
|
_phantom_m: PhantomData<M>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> BaseNaiveBayes<T, M, D> {
|
||||||
|
/// Fits NB classifier to a given NBdistribution.
|
||||||
|
/// * `distribution` - NBDistribution of the training data
|
||||||
|
pub fn fit(distribution: D) -> Result<Self, Failed> {
|
||||||
|
Ok(Self {
|
||||||
|
distribution,
|
||||||
|
_phantom_t: PhantomData,
|
||||||
|
_phantom_m: PhantomData,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimates the class labels for the provided data.
|
||||||
|
/// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
|
||||||
|
/// Returns a vector of size N with class estimates.
|
||||||
|
pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
let y_classes = self.distribution.classes();
|
||||||
|
let (rows, _) = x.shape();
|
||||||
|
let predictions = (0..rows)
|
||||||
|
.map(|row_index| {
|
||||||
|
let row = x.get_row(row_index);
|
||||||
|
let (prediction, _probability) = y_classes
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(class_index, class)| {
|
||||||
|
(
|
||||||
|
class,
|
||||||
|
self.distribution.conditional_probability(class_index, &row)
|
||||||
|
* self.distribution.prior(class_index),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.max_by(|(_, p1), (_, p2)| p1.partial_cmp(p2).unwrap())
|
||||||
|
.unwrap();
|
||||||
|
*prediction
|
||||||
|
})
|
||||||
|
.collect::<Vec<T>>();
|
||||||
|
let mut y_hat = M::RowVector::zeros(rows);
|
||||||
|
for (i, prediction) in predictions.iter().enumerate().take(rows) {
|
||||||
|
y_hat.set(i, *prediction);
|
||||||
|
}
|
||||||
|
Ok(y_hat)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mod categorical;
|
||||||
|
pub use categorical::{CategoricalNB, CategoricalNBParameters};
|
||||||
Reference in New Issue
Block a user