//! # Logistic Regression //! //! As [Linear Regression](../linear_regression/index.html), logistic regression explains your outcome as a linear combination of predictor variables \\(X\\) but rather than modeling this response directly, //! logistic regression models the probability that \\(y\\) belongs to a particular category, \\(Pr(y = 1|X) \\), as: //! //! \\[ Pr(y=1) \approx \frac{e^{\beta_0 + \sum_{i=1}^n \beta_iX_i}}{1 + e^{\beta_0 + \sum_{i=1}^n \beta_iX_i}} \\] //! //! SmartCore uses [limited memory BFGS](https://en.wikipedia.org/wiki/Limited-memory_BFGS) method to find estimates of regression coefficients, \\(\beta\\) //! //! Example: //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; //! use smartcore::linear::logistic_regression::*; //! //! //Iris data //! let x = DenseMatrix::from_2d_array(&[ //! &[5.1, 3.5, 1.4, 0.2], //! &[4.9, 3.0, 1.4, 0.2], //! &[4.7, 3.2, 1.3, 0.2], //! &[4.6, 3.1, 1.5, 0.2], //! &[5.0, 3.6, 1.4, 0.2], //! &[5.4, 3.9, 1.7, 0.4], //! &[4.6, 3.4, 1.4, 0.3], //! &[5.0, 3.4, 1.5, 0.2], //! &[4.4, 2.9, 1.4, 0.2], //! &[4.9, 3.1, 1.5, 0.1], //! &[7.0, 3.2, 4.7, 1.4], //! &[6.4, 3.2, 4.5, 1.5], //! &[6.9, 3.1, 4.9, 1.5], //! &[5.5, 2.3, 4.0, 1.3], //! &[6.5, 2.8, 4.6, 1.5], //! &[5.7, 2.8, 4.5, 1.3], //! &[6.3, 3.3, 4.7, 1.6], //! &[4.9, 2.4, 3.3, 1.0], //! &[6.6, 2.9, 4.6, 1.3], //! &[5.2, 2.7, 3.9, 1.4], //! ]); //! let y: Vec = vec![ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! //! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! ``` //! //! ## References: //! * ["Pattern Recognition and Machine Learning", C.M. Bishop, Linear Models for Classification](https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf) //! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 4.3 Logistic Regression](http://faculty.marshall.usc.edu/gareth-james/ISL/) //! * ["On the Limited Memory Method for Large Scale Optimization", Nocedal et al., Mathematical Programming, 1989](http://users.iems.northwestern.edu/~nocedal/PDFfiles/limited.pdf) //! //! //! use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::optimization::first_order::lbfgs::LBFGS; use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult}; use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; /// Logistic Regression parameters #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] pub struct LogisticRegressionParameters {} /// Logistic Regression #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] pub struct LogisticRegression> { coefficients: M, intercept: M, classes: Vec, num_attributes: usize, num_classes: usize, } trait ObjectiveFunction> { fn f(&self, w_bias: &M) -> T; fn df(&self, g: &mut M, w_bias: &M); fn partial_dot(w: &M, x: &M, v_col: usize, m_row: usize) -> T { let mut sum = T::zero(); let p = x.shape().1; for i in 0..p { sum += x.get(m_row, i) * w.get(0, i + v_col); } sum + w.get(0, p + v_col) } } struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { x: &'a M, y: Vec, phantom: PhantomData<&'a T>, } impl Default for LogisticRegressionParameters { fn default() -> Self { LogisticRegressionParameters {} } } impl> PartialEq for LogisticRegression { fn eq(&self, other: &Self) -> bool { if self.num_classes != other.num_classes || self.num_attributes != other.num_attributes || self.classes.len() != other.classes.len() { false } else { for i in 0..self.classes.len() { if (self.classes[i] - other.classes[i]).abs() > T::epsilon() { return false; } } self.coefficients == other.coefficients && self.intercept == other.intercept } } } impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction for BinaryObjectiveFunction<'a, T, M> { fn f(&self, w_bias: &M) -> T { let mut f = T::zero(); let (n, _) = self.x.shape(); for i in 0..n { let wx = BinaryObjectiveFunction::partial_dot(w_bias, self.x, 0, i); f += wx.ln_1pe() - (T::from(self.y[i]).unwrap()) * wx; } f } fn df(&self, g: &mut M, w_bias: &M) { g.copy_from(&M::zeros(1, g.shape().1)); let (n, p) = self.x.shape(); for i in 0..n { let wx = BinaryObjectiveFunction::partial_dot(w_bias, self.x, 0, i); let dyi = (T::from(self.y[i]).unwrap()) - wx.sigmoid(); for j in 0..p { g.set(0, j, g.get(0, j) - dyi * self.x.get(i, j)); } g.set(0, p, g.get(0, p) - dyi); } } } struct MultiClassObjectiveFunction<'a, T: RealNumber, M: Matrix> { x: &'a M, y: Vec, k: usize, phantom: PhantomData<&'a T>, } impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction for MultiClassObjectiveFunction<'a, T, M> { fn f(&self, w_bias: &M) -> T { let mut f = T::zero(); let mut prob = M::zeros(1, self.k); let (n, p) = self.x.shape(); for i in 0..n { for j in 0..self.k { prob.set( 0, j, MultiClassObjectiveFunction::partial_dot(w_bias, self.x, j * (p + 1), i), ); } prob.softmax_mut(); f -= prob.get(0, self.y[i]).ln(); } f } fn df(&self, g: &mut M, w: &M) { g.copy_from(&M::zeros(1, g.shape().1)); let mut prob = M::zeros(1, self.k); let (n, p) = self.x.shape(); for i in 0..n { for j in 0..self.k { prob.set( 0, j, MultiClassObjectiveFunction::partial_dot(w, self.x, j * (p + 1), i), ); } prob.softmax_mut(); for j in 0..self.k { let yi = (if self.y[i] == j { T::one() } else { T::zero() }) - prob.get(0, j); for l in 0..p { let pos = j * (p + 1); g.set(0, pos + l, g.get(0, pos + l) - yi * self.x.get(i, l)); } g.set(0, j * (p + 1) + p, g.get(0, j * (p + 1) + p) - yi); } } } } impl> SupervisedEstimator for LogisticRegression { fn fit( x: &M, y: &M::RowVector, parameters: LogisticRegressionParameters, ) -> Result { LogisticRegression::fit(x, y, parameters) } } impl> Predictor for LogisticRegression { fn predict(&self, x: &M) -> Result { self.predict(x) } } impl> LogisticRegression { /// Fits Logistic Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. pub fn fit( x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); if x_nrows != y_nrows { return Err(Failed::fit( &"Number of rows of X doesn\'t match number of rows of Y".to_string(), )); } let classes = y_m.unique(); let k = classes.len(); let mut yi: Vec = vec![0; y_nrows]; for (i, yi_i) in yi.iter_mut().enumerate().take(y_nrows) { let yc = y_m.get(0, i); *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } match k.cmp(&2) { Ordering::Less => Err(Failed::fit(&format!( "incorrect number of classes: {}. Should be >= 2.", k ))), Ordering::Equal => { let x0 = M::zeros(1, num_attributes + 1); let objective = BinaryObjectiveFunction { x, y: yi, phantom: PhantomData, }; let result = LogisticRegression::minimize(x0, objective); let weights = result.x; Ok(LogisticRegression { coefficients: weights.slice(0..1, 0..num_attributes), intercept: weights.slice(0..1, num_attributes..num_attributes + 1), classes, num_attributes, num_classes: k, }) } Ordering::Greater => { let x0 = M::zeros(1, (num_attributes + 1) * k); let objective = MultiClassObjectiveFunction { x, y: yi, k, phantom: PhantomData, }; let result = LogisticRegression::minimize(x0, objective); let weights = result.x.reshape(k, num_attributes + 1); Ok(LogisticRegression { coefficients: weights.slice(0..k, 0..num_attributes), intercept: weights.slice(0..k, num_attributes..num_attributes + 1), classes, num_attributes, num_classes: k, }) } } } /// Predict class labels for samples in `x`. /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. pub fn predict(&self, x: &M) -> Result { let n = x.shape().0; let mut result = M::zeros(1, n); if self.num_classes == 2 { let y_hat: Vec = x.ab(false, &self.coefficients, true).get_col_as_vec(0); let intercept = self.intercept.get(0, 0); for (i, y_hat_i) in y_hat.iter().enumerate().take(n) { result.set( 0, i, self.classes[if (*y_hat_i + intercept).sigmoid() > T::half() { 1 } else { 0 }], ); } } else { let mut y_hat = x.matmul(&self.coefficients.transpose()); for r in 0..n { for c in 0..self.num_classes { y_hat.set(r, c, y_hat.get(r, c) + self.intercept.get(c, 0)); } } let class_idxs = y_hat.argmax(); for (i, class_i) in class_idxs.iter().enumerate().take(n) { result.set(0, i, self.classes[*class_i]); } } Ok(result.to_row_vector()) } /// Get estimates regression coefficients pub fn coefficients(&self) -> &M { &self.coefficients } /// Get estimate of intercept pub fn intercept(&self) -> &M { &self.intercept } fn minimize(x0: M, objective: impl ObjectiveFunction) -> OptimizerResult { let f = |w: &M| -> T { objective.f(w) }; let df = |g: &mut M, w: &M| objective.df(g, w); let ls: Backtracking = Backtracking { order: FunctionOrder::THIRD, ..Default::default() }; let optimizer: LBFGS = Default::default(); optimizer.optimize(&f, &df, &x0, &ls) } } #[cfg(test)] mod tests { use super::*; use crate::dataset::generator::make_blobs; use crate::linalg::naive::dense_matrix::*; use crate::metrics::accuracy; #[test] fn multiclass_objective_f() { let x = DenseMatrix::from_2d_array(&[ &[1., -5.], &[2., 5.], &[3., -2.], &[1., 2.], &[2., 0.], &[6., -5.], &[7., 5.], &[6., -2.], &[7., 2.], &[6., 0.], &[8., -5.], &[9., 5.], &[10., -2.], &[8., 2.], &[9., 0.], ]); let y = vec![0, 0, 1, 1, 2, 1, 1, 0, 0, 2, 1, 1, 0, 0, 1]; let objective = MultiClassObjectiveFunction { x: &x, y, k: 3, phantom: PhantomData, }; let mut g: DenseMatrix = DenseMatrix::zeros(1, 9); objective.df( &mut g, &DenseMatrix::row_vector_from_array(&[1., 2., 3., 4., 5., 6., 7., 8., 9.]), ); objective.df( &mut g, &DenseMatrix::row_vector_from_array(&[1., 2., 3., 4., 5., 6., 7., 8., 9.]), ); assert!((g.get(0, 0) + 33.000068218163484).abs() < std::f64::EPSILON); let f = objective.f(&DenseMatrix::row_vector_from_array(&[ 1., 2., 3., 4., 5., 6., 7., 8., 9., ])); assert!((f - 408.0052230582765).abs() < std::f64::EPSILON); } #[test] fn binary_objective_f() { let x = DenseMatrix::from_2d_array(&[ &[1., -5.], &[2., 5.], &[3., -2.], &[1., 2.], &[2., 0.], &[6., -5.], &[7., 5.], &[6., -2.], &[7., 2.], &[6., 0.], &[8., -5.], &[9., 5.], &[10., -2.], &[8., 2.], &[9., 0.], ]); let y = vec![0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1]; let objective = BinaryObjectiveFunction { x: &x, y, phantom: PhantomData, }; let mut g: DenseMatrix = DenseMatrix::zeros(1, 3); objective.df(&mut g, &DenseMatrix::row_vector_from_array(&[1., 2., 3.])); objective.df(&mut g, &DenseMatrix::row_vector_from_array(&[1., 2., 3.])); assert!((g.get(0, 0) - 26.051064349381285).abs() < std::f64::EPSILON); assert!((g.get(0, 1) - 10.239000702928523).abs() < std::f64::EPSILON); assert!((g.get(0, 2) - 3.869294270156324).abs() < std::f64::EPSILON); let f = objective.f(&DenseMatrix::row_vector_from_array(&[1., 2., 3.])); assert!((f - 59.76994756647412).abs() < std::f64::EPSILON); } #[test] fn lr_fit_predict() { let x = DenseMatrix::from_2d_array(&[ &[1., -5.], &[2., 5.], &[3., -2.], &[1., 2.], &[2., 0.], &[6., -5.], &[7., 5.], &[6., -2.], &[7., 2.], &[6., 0.], &[8., -5.], &[9., 5.], &[10., -2.], &[8., 2.], &[9., 0.], ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); assert_eq!(lr.coefficients().shape(), (3, 2)); assert_eq!(lr.intercept().shape(), (3, 1)); assert!((lr.coefficients().get(0, 0) - 0.0435).abs() < 1e-4); assert!((lr.intercept().get(0, 0) - 0.1250).abs() < 1e-4); let y_hat = lr.predict(&x).unwrap(); assert_eq!( y_hat, vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] ); } #[test] fn lr_fit_predict_multiclass() { let blobs = make_blobs(15, 4, 3); let x = DenseMatrix::from_vec(15, 4, &blobs.data); let y = blobs.target; let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); assert!(accuracy(&y_hat, &y) > 0.9); } #[test] fn lr_fit_predict_binary() { let blobs = make_blobs(20, 4, 2); let x = DenseMatrix::from_vec(20, 4, &blobs.data); let y = blobs.target; let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); assert!(accuracy(&y_hat, &y) > 0.9); } #[test] #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[1., -5.], &[2., 5.], &[3., -2.], &[1., 2.], &[2., 0.], &[6., -5.], &[7., 5.], &[6., -2.], &[7., 2.], &[6., 0.], &[8., -5.], &[9., 5.], &[10., -2.], &[8., 2.], &[9., 0.], ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let deserialized_lr: LogisticRegression> = serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); assert_eq!(lr, deserialized_lr); } #[test] fn lr_fit_predict_iris() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2], &[4.7, 3.2, 1.3, 0.2], &[4.6, 3.1, 1.5, 0.2], &[5.0, 3.6, 1.4, 0.2], &[5.4, 3.9, 1.7, 0.4], &[4.6, 3.4, 1.4, 0.3], &[5.0, 3.4, 1.5, 0.2], &[4.4, 2.9, 1.4, 0.2], &[4.9, 3.1, 1.5, 0.1], &[7.0, 3.2, 4.7, 1.4], &[6.4, 3.2, 4.5, 1.5], &[6.9, 3.1, 4.9, 1.5], &[5.5, 2.3, 4.0, 1.3], &[6.5, 2.8, 4.6, 1.5], &[5.7, 2.8, 4.5, 1.3], &[6.3, 3.3, 4.7, 1.6], &[4.9, 2.4, 3.3, 1.0], &[6.6, 2.9, 4.6, 1.3], &[5.2, 2.7, 3.9, 1.4], ]); let y: Vec = vec![ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); let error: f64 = y .into_iter() .zip(y_hat.into_iter()) .map(|(a, b)| (a - b).abs()) .sum(); assert!(error <= 1.0); } }