@@ -6,6 +6,7 @@
|
|||||||
//! use smartcore::algorithm::neighbour::cover_tree::*;
|
//! use smartcore::algorithm::neighbour::cover_tree::*;
|
||||||
//! use smartcore::math::distance::Distance;
|
//! use smartcore::math::distance::Distance;
|
||||||
//!
|
//!
|
||||||
|
//! #[derive(Clone)]
|
||||||
//! struct SimpleDistance {} // Our distance function
|
//! struct SimpleDistance {} // Our distance function
|
||||||
//!
|
//!
|
||||||
//! impl Distance<i32, f64> for SimpleDistance {
|
//! impl Distance<i32, f64> for SimpleDistance {
|
||||||
@@ -453,7 +454,7 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
use crate::math::distance::Distances;
|
use crate::math::distance::Distances;
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
struct SimpleDistance {}
|
struct SimpleDistance {}
|
||||||
|
|
||||||
impl Distance<i32, f64> for SimpleDistance {
|
impl Distance<i32, f64> for SimpleDistance {
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
//! use smartcore::algorithm::neighbour::linear_search::*;
|
//! use smartcore::algorithm::neighbour::linear_search::*;
|
||||||
//! use smartcore::math::distance::Distance;
|
//! use smartcore::math::distance::Distance;
|
||||||
//!
|
//!
|
||||||
|
//! #[derive(Clone)]
|
||||||
//! struct SimpleDistance {} // Our distance function
|
//! struct SimpleDistance {} // Our distance function
|
||||||
//!
|
//!
|
||||||
//! impl Distance<i32, f64> for SimpleDistance {
|
//! impl Distance<i32, f64> for SimpleDistance {
|
||||||
@@ -137,6 +138,7 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
use crate::math::distance::Distances;
|
use crate::math::distance::Distances;
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
struct SimpleDistance {}
|
struct SimpleDistance {}
|
||||||
|
|
||||||
impl Distance<i32, f64> for SimpleDistance {
|
impl Distance<i32, f64> for SimpleDistance {
|
||||||
|
|||||||
+10
@@ -0,0 +1,10 @@
|
|||||||
|
//! # Common Interfaces and methods
|
||||||
|
//!
|
||||||
|
//! This module consolidates the common interfaces and the uniform basic API that are used elsewhere in the crate.
|
||||||
|
|
||||||
|
use crate::error::Failed;
|
||||||
|
|
||||||
|
/// Implements method predict that offers a way to estimate target value from new data
|
||||||
|
pub trait Predictor<X, Y> {
|
||||||
|
fn predict(&self, x: &X) -> Result<Y, Failed>;
|
||||||
|
}
|
||||||
+23
-7
@@ -15,11 +15,9 @@
|
|||||||
//! let blobs = generator::make_blobs(100, 2, 3);
|
//! let blobs = generator::make_blobs(100, 2, 3);
|
||||||
//! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data);
|
//! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data);
|
||||||
//! // Fit the algorithm and predict cluster labels
|
//! // Fit the algorithm and predict cluster labels
|
||||||
//! let labels = DBSCAN::fit(&x, Distances::euclidian(), DBSCANParameters{
|
//! let labels = DBSCAN::fit(&x, Distances::euclidian(),
|
||||||
//! min_samples: 5,
|
//! DBSCANParameters::default().with_eps(3.0)).
|
||||||
//! eps: 3.0,
|
//! and_then(|dbscan| dbscan.predict(&x));
|
||||||
//! algorithm: KNNAlgorithmName::CoverTree
|
|
||||||
//! }).and_then(|dbscan| dbscan.predict(&x));
|
|
||||||
//!
|
//!
|
||||||
//! println!("{:?}", labels);
|
//! println!("{:?}", labels);
|
||||||
//! ```
|
//! ```
|
||||||
@@ -53,14 +51,32 @@ pub struct DBSCAN<T: RealNumber, D: Distance<Vec<T>, T>> {
|
|||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
/// DBSCAN clustering algorithm parameters
|
/// DBSCAN clustering algorithm parameters
|
||||||
pub struct DBSCANParameters<T: RealNumber> {
|
pub struct DBSCANParameters<T: RealNumber> {
|
||||||
/// Maximum number of iterations of the k-means algorithm for a single run.
|
/// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
|
||||||
pub min_samples: usize,
|
pub min_samples: usize,
|
||||||
/// The number of samples in a neighborhood for a point to be considered as a core point.
|
/// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
|
||||||
pub eps: T,
|
pub eps: T,
|
||||||
/// KNN algorithm to use.
|
/// KNN algorithm to use.
|
||||||
pub algorithm: KNNAlgorithmName,
|
pub algorithm: KNNAlgorithmName,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber> DBSCANParameters<T> {
|
||||||
|
/// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
|
||||||
|
pub fn with_min_samples(mut self, min_samples: usize) -> Self {
|
||||||
|
self.min_samples = min_samples;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
|
||||||
|
pub fn with_eps(mut self, eps: T) -> Self {
|
||||||
|
self.eps = eps;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// KNN algorithm to use.
|
||||||
|
pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self {
|
||||||
|
self.algorithm = algorithm;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
|
impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
self.cluster_labels.len() == other.cluster_labels.len()
|
self.cluster_labels.len() == other.cluster_labels.len()
|
||||||
|
|||||||
@@ -105,6 +105,14 @@ pub struct KMeansParameters {
|
|||||||
pub max_iter: usize,
|
pub max_iter: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl KMeansParameters {
|
||||||
|
/// Maximum number of iterations of the k-means algorithm for a single run.
|
||||||
|
pub fn with_max_iter(mut self, max_iter: usize) -> Self {
|
||||||
|
self.max_iter = max_iter;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Default for KMeansParameters {
|
impl Default for KMeansParameters {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
KMeansParameters { max_iter: 100 }
|
KMeansParameters { max_iter: 100 }
|
||||||
|
|||||||
@@ -88,6 +88,15 @@ pub struct PCAParameters {
|
|||||||
pub use_correlation_matrix: bool,
|
pub use_correlation_matrix: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl PCAParameters {
|
||||||
|
/// By default, covariance matrix is used to compute principal components.
|
||||||
|
/// Enable this flag if you want to use correlation matrix instead.
|
||||||
|
pub fn with_use_correlation_matrix(mut self, use_correlation_matrix: bool) -> Self {
|
||||||
|
self.use_correlation_matrix = use_correlation_matrix;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Default for PCAParameters {
|
impl Default for PCAParameters {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
PCAParameters {
|
PCAParameters {
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
//!
|
//!
|
||||||
//! ```
|
//! ```
|
||||||
//! use smartcore::linalg::naive::dense_matrix::*;
|
//! use smartcore::linalg::naive::dense_matrix::*;
|
||||||
//! use smartcore::ensemble::random_forest_classifier::*;
|
//! use smartcore::ensemble::random_forest_classifier::RandomForestClassifier;
|
||||||
//!
|
//!
|
||||||
//! // Iris dataset
|
//! // Iris dataset
|
||||||
//! let x = DenseMatrix::from_2d_array(&[
|
//! let x = DenseMatrix::from_2d_array(&[
|
||||||
@@ -51,6 +51,7 @@ use std::fmt::Debug;
|
|||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
@@ -84,6 +85,39 @@ pub struct RandomForestClassifier<T: RealNumber> {
|
|||||||
classes: Vec<T>,
|
classes: Vec<T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl RandomForestClassifierParameters {
|
||||||
|
/// Split criteria to use when building a tree. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
|
||||||
|
pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self {
|
||||||
|
self.criterion = criterion;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
|
||||||
|
pub fn with_max_depth(mut self, max_depth: u16) -> Self {
|
||||||
|
self.max_depth = Some(max_depth);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
|
||||||
|
pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self {
|
||||||
|
self.min_samples_leaf = min_samples_leaf;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
|
||||||
|
pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self {
|
||||||
|
self.min_samples_split = min_samples_split;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The number of trees in the forest.
|
||||||
|
pub fn with_n_trees(mut self, n_trees: u16) -> Self {
|
||||||
|
self.n_trees = n_trees;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Number of random sample of predictors to use as split candidates.
|
||||||
|
pub fn with_m(mut self, m: usize) -> Self {
|
||||||
|
self.m = Some(m);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> PartialEq for RandomForestClassifier<T> {
|
impl<T: RealNumber> PartialEq for RandomForestClassifier<T> {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
if self.classes.len() != other.classes.len() || self.trees.len() != other.trees.len() {
|
if self.classes.len() != other.classes.len() || self.trees.len() != other.trees.len() {
|
||||||
@@ -117,6 +151,12 @@ impl Default for RandomForestClassifierParameters {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for RandomForestClassifier<T> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> RandomForestClassifier<T> {
|
impl<T: RealNumber> RandomForestClassifier<T> {
|
||||||
/// Build a forest of trees from the training set.
|
/// Build a forest of trees from the training set.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ use std::fmt::Debug;
|
|||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
@@ -79,6 +80,34 @@ pub struct RandomForestRegressor<T: RealNumber> {
|
|||||||
trees: Vec<DecisionTreeRegressor<T>>,
|
trees: Vec<DecisionTreeRegressor<T>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl RandomForestRegressorParameters {
|
||||||
|
/// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
|
||||||
|
pub fn with_max_depth(mut self, max_depth: u16) -> Self {
|
||||||
|
self.max_depth = Some(max_depth);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
|
||||||
|
pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self {
|
||||||
|
self.min_samples_leaf = min_samples_leaf;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
|
||||||
|
pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self {
|
||||||
|
self.min_samples_split = min_samples_split;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The number of trees in the forest.
|
||||||
|
pub fn with_n_trees(mut self, n_trees: usize) -> Self {
|
||||||
|
self.n_trees = n_trees;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Number of random sample of predictors to use as split candidates.
|
||||||
|
pub fn with_m(mut self, m: usize) -> Self {
|
||||||
|
self.m = Some(m);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Default for RandomForestRegressorParameters {
|
impl Default for RandomForestRegressorParameters {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
RandomForestRegressorParameters {
|
RandomForestRegressorParameters {
|
||||||
@@ -106,6 +135,12 @@ impl<T: RealNumber> PartialEq for RandomForestRegressor<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for RandomForestRegressor<T> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> RandomForestRegressor<T> {
|
impl<T: RealNumber> RandomForestRegressor<T> {
|
||||||
/// Build a forest of trees from the training set.
|
/// Build a forest of trees from the training set.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
|
|||||||
+2
-1
@@ -63,7 +63,7 @@
|
|||||||
//! let y = vec![2., 2., 2., 3., 3.];
|
//! let y = vec![2., 2., 2., 3., 3.];
|
||||||
//!
|
//!
|
||||||
//! // Train classifier
|
//! // Train classifier
|
||||||
//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap();
|
//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
|
||||||
//!
|
//!
|
||||||
//! // Predict classes
|
//! // Predict classes
|
||||||
//! let y_hat = knn.predict(&x).unwrap();
|
//! let y_hat = knn.predict(&x).unwrap();
|
||||||
@@ -71,6 +71,7 @@
|
|||||||
|
|
||||||
/// Various algorithms and helper methods that are used elsewhere in SmartCore
|
/// Various algorithms and helper methods that are used elsewhere in SmartCore
|
||||||
pub mod algorithm;
|
pub mod algorithm;
|
||||||
|
pub(crate) mod base;
|
||||||
/// Algorithms for clustering of unlabeled data
|
/// Algorithms for clustering of unlabeled data
|
||||||
pub mod cluster;
|
pub mod cluster;
|
||||||
/// Various datasets
|
/// Various datasets
|
||||||
|
|||||||
@@ -274,6 +274,19 @@ pub trait BaseVector<T: RealNumber>: Clone + Debug {
|
|||||||
|
|
||||||
/// Copies content of `other` vector.
|
/// Copies content of `other` vector.
|
||||||
fn copy_from(&mut self, other: &Self);
|
fn copy_from(&mut self, other: &Self);
|
||||||
|
|
||||||
|
/// Take elements from an array.
|
||||||
|
fn take(&self, index: &[usize]) -> Self {
|
||||||
|
let n = index.len();
|
||||||
|
|
||||||
|
let mut result = Self::zeros(n);
|
||||||
|
|
||||||
|
for (i, idx) in index.iter().enumerate() {
|
||||||
|
result.set(i, self.get(*idx));
|
||||||
|
}
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Generic matrix type.
|
/// Generic matrix type.
|
||||||
@@ -611,6 +624,32 @@ pub trait BaseMatrix<T: RealNumber>: Clone + Debug {
|
|||||||
|
|
||||||
/// Calculates the covariance matrix
|
/// Calculates the covariance matrix
|
||||||
fn cov(&self) -> Self;
|
fn cov(&self) -> Self;
|
||||||
|
|
||||||
|
/// Take elements from an array along an axis.
|
||||||
|
fn take(&self, index: &[usize], axis: u8) -> Self {
|
||||||
|
let (n, p) = self.shape();
|
||||||
|
|
||||||
|
let k = match axis {
|
||||||
|
0 => p,
|
||||||
|
_ => n,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut result = match axis {
|
||||||
|
0 => Self::zeros(index.len(), p),
|
||||||
|
_ => Self::zeros(n, index.len()),
|
||||||
|
};
|
||||||
|
|
||||||
|
for (i, idx) in index.iter().enumerate() {
|
||||||
|
for j in 0..k {
|
||||||
|
match axis {
|
||||||
|
0 => result.set(i, j, self.get(*idx, j)),
|
||||||
|
_ => result.set(j, i, self.get(j, *idx)),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Generic matrix with additional mixins like various factorization methods.
|
/// Generic matrix with additional mixins like various factorization methods.
|
||||||
@@ -662,6 +701,8 @@ impl<'a, T: RealNumber, M: BaseMatrix<T>> Iterator for RowIter<'a, T, M> {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use crate::linalg::naive::dense_matrix::DenseMatrix;
|
||||||
|
use crate::linalg::BaseMatrix;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -684,4 +725,35 @@ mod tests {
|
|||||||
|
|
||||||
assert!((m.var() - 1.25f64).abs() < std::f64::EPSILON);
|
assert!((m.var() - 1.25f64).abs() < std::f64::EPSILON);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn vec_take() {
    let m = vec![1., 2., 3., 4., 5.];

    // Positions may repeat; the result follows the order of the index slice.
    assert_eq!(m.take(&[0, 0, 4, 4]), vec![1., 1., 5., 5.]);
}
|
||||||
|
|
||||||
|
#[test]
fn take() {
    let m = DenseMatrix::from_2d_array(&[
        &[1.0, 2.0],
        &[3.0, 4.0],
        &[5.0, 6.0],
        &[7.0, 8.0],
        &[9.0, 10.0],
    ]);

    // Axis 0: rows 1, 1 and 3 are copied, in that order.
    let expected_0 = DenseMatrix::from_2d_array(&[&[3.0, 4.0], &[3.0, 4.0], &[7.0, 8.0]]);

    // Axis 1: columns 1 and 0 are copied, i.e. the two columns are swapped.
    let expected_1 = DenseMatrix::from_2d_array(&[
        &[2.0, 1.0],
        &[4.0, 3.0],
        &[6.0, 5.0],
        &[8.0, 7.0],
        &[10.0, 9.0],
    ]);

    assert_eq!(m.take(&[1, 1, 3], 0), expected_0);
    assert_eq!(m.take(&[1, 0], 1), expected_1);
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,7 +36,7 @@
|
|||||||
//! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.
|
//! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.
|
||||||
//! ]);
|
//! ]);
|
||||||
//!
|
//!
|
||||||
//! let lr = LogisticRegression::fit(&x, &y).unwrap();
|
//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
//! let y_hat = lr.predict(&x).unwrap();
|
//! let y_hat = lr.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
use std::iter::Sum;
|
use std::iter::Sum;
|
||||||
@@ -917,7 +917,7 @@ mod tests {
|
|||||||
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let lr = LogisticRegression::fit(&x, &y).unwrap();
|
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let y_hat = lr.predict(&x).unwrap();
|
let y_hat = lr.predict(&x).unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -58,6 +58,7 @@ use std::fmt::Debug;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
@@ -66,7 +67,7 @@ use crate::math::num::RealNumber;
|
|||||||
use crate::linear::lasso_optimizer::InteriorPointOptimizer;
|
use crate::linear::lasso_optimizer::InteriorPointOptimizer;
|
||||||
|
|
||||||
/// Elastic net parameters
|
/// Elastic net parameters
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct ElasticNetParameters<T: RealNumber> {
|
pub struct ElasticNetParameters<T: RealNumber> {
|
||||||
/// Regularization parameter.
|
/// Regularization parameter.
|
||||||
pub alpha: T,
|
pub alpha: T,
|
||||||
@@ -89,6 +90,36 @@ pub struct ElasticNet<T: RealNumber, M: Matrix<T>> {
|
|||||||
intercept: T,
|
intercept: T,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builder-style setters for [`ElasticNetParameters`].
///
/// Every setter takes the parameters by value, overrides a single field and
/// returns the updated parameters, so calls can be chained.
impl<T: RealNumber> ElasticNetParameters<T> {
    /// Regularization parameter.
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_alpha(mut self, alpha: T) -> Self {
        self.alpha = alpha;
        self
    }

    /// The elastic net mixing parameter, with 0 <= l1_ratio <= 1.
    /// For l1_ratio = 0 the penalty is an L2 penalty.
    /// For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_l1_ratio(mut self, l1_ratio: T) -> Self {
        self.l1_ratio = l1_ratio;
        self
    }

    /// If `true`, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation.
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_normalize(mut self, normalize: bool) -> Self {
        self.normalize = normalize;
        self
    }

    /// The tolerance for the optimization
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_tol(mut self, tol: T) -> Self {
        self.tol = tol;
        self
    }

    /// The maximum number of iterations
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_max_iter(mut self, max_iter: usize) -> Self {
        self.max_iter = max_iter;
        self
    }
}
|
||||||
|
|
||||||
impl<T: RealNumber> Default for ElasticNetParameters<T> {
|
impl<T: RealNumber> Default for ElasticNetParameters<T> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
ElasticNetParameters {
|
ElasticNetParameters {
|
||||||
@@ -108,6 +139,12 @@ impl<T: RealNumber, M: Matrix<T>> PartialEq for ElasticNet<T, M> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for ElasticNet<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> ElasticNet<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> ElasticNet<T, M> {
|
||||||
/// Fits elastic net regression to your data.
|
/// Fits elastic net regression to your data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
|
|||||||
+31
-1
@@ -26,6 +26,7 @@ use std::fmt::Debug;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
@@ -33,7 +34,7 @@ use crate::linear::lasso_optimizer::InteriorPointOptimizer;
|
|||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
|
|
||||||
/// Lasso regression parameters
|
/// Lasso regression parameters
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct LassoParameters<T: RealNumber> {
|
pub struct LassoParameters<T: RealNumber> {
|
||||||
/// Controls the strength of the penalty to the loss function.
|
/// Controls the strength of the penalty to the loss function.
|
||||||
pub alpha: T,
|
pub alpha: T,
|
||||||
@@ -53,6 +54,29 @@ pub struct Lasso<T: RealNumber, M: Matrix<T>> {
|
|||||||
intercept: T,
|
intercept: T,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builder-style setters for [`LassoParameters`].
///
/// Every setter takes the parameters by value, overrides a single field and
/// returns the updated parameters, so calls can be chained.
impl<T: RealNumber> LassoParameters<T> {
    /// Controls the strength of the penalty to the loss function.
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_alpha(mut self, alpha: T) -> Self {
        self.alpha = alpha;
        self
    }

    /// If `true`, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation.
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_normalize(mut self, normalize: bool) -> Self {
        self.normalize = normalize;
        self
    }

    /// The tolerance for the optimization
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_tol(mut self, tol: T) -> Self {
        self.tol = tol;
        self
    }

    /// The maximum number of iterations
    #[must_use = "builder setters return the updated parameters; dropping the result discards the change"]
    pub fn with_max_iter(mut self, max_iter: usize) -> Self {
        self.max_iter = max_iter;
        self
    }
}
|
||||||
|
|
||||||
impl<T: RealNumber> Default for LassoParameters<T> {
|
impl<T: RealNumber> Default for LassoParameters<T> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
LassoParameters {
|
LassoParameters {
|
||||||
@@ -71,6 +95,12 @@ impl<T: RealNumber, M: Matrix<T>> PartialEq for Lasso<T, M> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for Lasso<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> Lasso<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> Lasso<T, M> {
|
||||||
/// Fits Lasso regression to your data.
|
/// Fits Lasso regression to your data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
|
|||||||
@@ -45,9 +45,9 @@
|
|||||||
//! let y: Vec<f64> = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0,
|
//! let y: Vec<f64> = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0,
|
||||||
//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9];
|
//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9];
|
||||||
//!
|
//!
|
||||||
//! let lr = LinearRegression::fit(&x, &y, LinearRegressionParameters {
|
//! let lr = LinearRegression::fit(&x, &y,
|
||||||
//! solver: LinearRegressionSolverName::QR, // or SVD
|
//! LinearRegressionParameters::default().
|
||||||
//! }).unwrap();
|
//! with_solver(LinearRegressionSolverName::QR)).unwrap();
|
||||||
//!
|
//!
|
||||||
//! let y_hat = lr.predict(&x).unwrap();
|
//! let y_hat = lr.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
@@ -64,11 +64,12 @@ use std::fmt::Debug;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
/// Approach to use for estimation of regression coefficients. QR is more efficient but SVD is more stable.
|
/// Approach to use for estimation of regression coefficients. QR is more efficient but SVD is more stable.
|
||||||
pub enum LinearRegressionSolverName {
|
pub enum LinearRegressionSolverName {
|
||||||
/// QR decomposition, see [QR](../../linalg/qr/index.html)
|
/// QR decomposition, see [QR](../../linalg/qr/index.html)
|
||||||
@@ -78,7 +79,7 @@ pub enum LinearRegressionSolverName {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Linear Regression parameters
|
/// Linear Regression parameters
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct LinearRegressionParameters {
|
pub struct LinearRegressionParameters {
|
||||||
/// Solver to use for estimation of regression coefficients.
|
/// Solver to use for estimation of regression coefficients.
|
||||||
pub solver: LinearRegressionSolverName,
|
pub solver: LinearRegressionSolverName,
|
||||||
@@ -92,6 +93,14 @@ pub struct LinearRegression<T: RealNumber, M: Matrix<T>> {
|
|||||||
solver: LinearRegressionSolverName,
|
solver: LinearRegressionSolverName,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl LinearRegressionParameters {
|
||||||
|
/// Solver to use for estimation of regression coefficients.
|
||||||
|
pub fn with_solver(mut self, solver: LinearRegressionSolverName) -> Self {
|
||||||
|
self.solver = solver;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Default for LinearRegressionParameters {
|
impl Default for LinearRegressionParameters {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
LinearRegressionParameters {
|
LinearRegressionParameters {
|
||||||
@@ -107,6 +116,12 @@ impl<T: RealNumber, M: Matrix<T>> PartialEq for LinearRegression<T, M> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for LinearRegression<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> LinearRegression<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> LinearRegression<T, M> {
|
||||||
/// Fits Linear Regression to your data.
|
/// Fits Linear Regression to your data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
|
|||||||
@@ -40,7 +40,7 @@
|
|||||||
//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
||||||
//! ];
|
//! ];
|
||||||
//!
|
//!
|
||||||
//! let lr = LogisticRegression::fit(&x, &y).unwrap();
|
//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
//!
|
//!
|
||||||
//! let y_hat = lr.predict(&x).unwrap();
|
//! let y_hat = lr.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
@@ -58,6 +58,7 @@ use std::marker::PhantomData;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
@@ -66,6 +67,10 @@ use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult};
|
|||||||
use crate::optimization::line_search::Backtracking;
|
use crate::optimization::line_search::Backtracking;
|
||||||
use crate::optimization::FunctionOrder;
|
use crate::optimization::FunctionOrder;
|
||||||
|
|
||||||
|
/// Logistic Regression parameters
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
|
pub struct LogisticRegressionParameters {}
|
||||||
|
|
||||||
/// Logistic Regression
|
/// Logistic Regression
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
pub struct LogisticRegression<T: RealNumber, M: Matrix<T>> {
|
pub struct LogisticRegression<T: RealNumber, M: Matrix<T>> {
|
||||||
@@ -97,6 +102,12 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix<T>> {
|
|||||||
phantom: PhantomData<&'a T>,
|
phantom: PhantomData<&'a T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Default for LogisticRegressionParameters {
|
||||||
|
fn default() -> Self {
|
||||||
|
LogisticRegressionParameters {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> PartialEq for LogisticRegression<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> PartialEq for LogisticRegression<T, M> {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
if self.num_classes != other.num_classes
|
if self.num_classes != other.num_classes
|
||||||
@@ -207,11 +218,22 @@ impl<'a, T: RealNumber, M: Matrix<T>> ObjectiveFunction<T, M>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for LogisticRegression<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> LogisticRegression<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> LogisticRegression<T, M> {
|
||||||
/// Fits Logistic Regression to your data.
|
/// Fits Logistic Regression to your data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
/// * `y` - target class values
|
/// * `y` - target class values
|
||||||
pub fn fit(x: &M, y: &M::RowVector) -> Result<LogisticRegression<T, M>, Failed> {
|
/// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
|
||||||
|
pub fn fit(
|
||||||
|
x: &M,
|
||||||
|
y: &M::RowVector,
|
||||||
|
_parameters: LogisticRegressionParameters,
|
||||||
|
) -> Result<LogisticRegression<T, M>, Failed> {
|
||||||
let y_m = M::from_row_vector(y.clone());
|
let y_m = M::from_row_vector(y.clone());
|
||||||
let (x_nrows, num_attributes) = x.shape();
|
let (x_nrows, num_attributes) = x.shape();
|
||||||
let (_, y_nrows) = y_m.shape();
|
let (_, y_nrows) = y_m.shape();
|
||||||
@@ -461,7 +483,7 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let y: Vec<f64> = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.];
|
let y: Vec<f64> = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.];
|
||||||
|
|
||||||
let lr = LogisticRegression::fit(&x, &y).unwrap();
|
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
assert_eq!(lr.coefficients().shape(), (3, 2));
|
assert_eq!(lr.coefficients().shape(), (3, 2));
|
||||||
assert_eq!(lr.intercept().shape(), (3, 1));
|
assert_eq!(lr.intercept().shape(), (3, 1));
|
||||||
@@ -484,7 +506,7 @@ mod tests {
|
|||||||
let x = DenseMatrix::from_vec(15, 4, &blobs.data);
|
let x = DenseMatrix::from_vec(15, 4, &blobs.data);
|
||||||
let y = blobs.target;
|
let y = blobs.target;
|
||||||
|
|
||||||
let lr = LogisticRegression::fit(&x, &y).unwrap();
|
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let y_hat = lr.predict(&x).unwrap();
|
let y_hat = lr.predict(&x).unwrap();
|
||||||
|
|
||||||
@@ -498,7 +520,7 @@ mod tests {
|
|||||||
let x = DenseMatrix::from_vec(20, 4, &blobs.data);
|
let x = DenseMatrix::from_vec(20, 4, &blobs.data);
|
||||||
let y = blobs.target;
|
let y = blobs.target;
|
||||||
|
|
||||||
let lr = LogisticRegression::fit(&x, &y).unwrap();
|
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let y_hat = lr.predict(&x).unwrap();
|
let y_hat = lr.predict(&x).unwrap();
|
||||||
|
|
||||||
@@ -526,7 +548,7 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let y: Vec<f64> = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.];
|
let y: Vec<f64> = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.];
|
||||||
|
|
||||||
let lr = LogisticRegression::fit(&x, &y).unwrap();
|
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let deserialized_lr: LogisticRegression<f64, DenseMatrix<f64>> =
|
let deserialized_lr: LogisticRegression<f64, DenseMatrix<f64>> =
|
||||||
serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap();
|
serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap();
|
||||||
@@ -562,7 +584,7 @@ mod tests {
|
|||||||
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
||||||
];
|
];
|
||||||
|
|
||||||
let lr = LogisticRegression::fit(&x, &y).unwrap();
|
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let y_hat = lr.predict(&x).unwrap();
|
let y_hat = lr.predict(&x).unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -45,11 +45,8 @@
|
|||||||
//! let y: Vec<f64> = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0,
|
//! let y: Vec<f64> = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0,
|
||||||
//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9];
|
//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9];
|
||||||
//!
|
//!
|
||||||
//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters {
|
//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters::default().with_alpha(0.1)).
|
||||||
//! solver: RidgeRegressionSolverName::Cholesky,
|
//! and_then(|lr| lr.predict(&x)).unwrap();
|
||||||
//! alpha: 0.1,
|
|
||||||
//! normalize: true
|
|
||||||
//! }).and_then(|lr| lr.predict(&x)).unwrap();
|
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! ## References:
|
//! ## References:
|
||||||
@@ -63,12 +60,13 @@ use std::fmt::Debug;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
/// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable.
|
/// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable.
|
||||||
pub enum RidgeRegressionSolverName {
|
pub enum RidgeRegressionSolverName {
|
||||||
/// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html)
|
/// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html)
|
||||||
@@ -78,7 +76,7 @@ pub enum RidgeRegressionSolverName {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Ridge Regression parameters
|
/// Ridge Regression parameters
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct RidgeRegressionParameters<T: RealNumber> {
|
pub struct RidgeRegressionParameters<T: RealNumber> {
|
||||||
/// Solver to use for estimation of regression coefficients.
|
/// Solver to use for estimation of regression coefficients.
|
||||||
pub solver: RidgeRegressionSolverName,
|
pub solver: RidgeRegressionSolverName,
|
||||||
@@ -97,6 +95,24 @@ pub struct RidgeRegression<T: RealNumber, M: Matrix<T>> {
|
|||||||
solver: RidgeRegressionSolverName,
|
solver: RidgeRegressionSolverName,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber> RidgeRegressionParameters<T> {
|
||||||
|
/// Regularization parameter.
|
||||||
|
pub fn with_alpha(mut self, alpha: T) -> Self {
|
||||||
|
self.alpha = alpha;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Solver to use for estimation of regression coefficients.
|
||||||
|
pub fn with_solver(mut self, solver: RidgeRegressionSolverName) -> Self {
|
||||||
|
self.solver = solver;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation.
|
||||||
|
pub fn with_normalize(mut self, normalize: bool) -> Self {
|
||||||
|
self.normalize = normalize;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> Default for RidgeRegressionParameters<T> {
|
impl<T: RealNumber> Default for RidgeRegressionParameters<T> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
RidgeRegressionParameters {
|
RidgeRegressionParameters {
|
||||||
@@ -114,6 +130,12 @@ impl<T: RealNumber, M: Matrix<T>> PartialEq for RidgeRegression<T, M> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for RidgeRegression<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> RidgeRegression<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> RidgeRegression<T, M> {
|
||||||
/// Fits ridge regression to your data.
|
/// Fits ridge regression to your data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ use crate::math::num::RealNumber;
|
|||||||
use super::Distance;
|
use super::Distance;
|
||||||
|
|
||||||
/// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space.
|
/// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space.
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct Euclidian {}
|
pub struct Euclidian {}
|
||||||
|
|
||||||
impl Euclidian {
|
impl Euclidian {
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ use crate::math::num::RealNumber;
|
|||||||
use super::Distance;
|
use super::Distance;
|
||||||
|
|
||||||
/// While comparing two integer-valued vectors of equal length, Hamming distance is the number of bit positions in which the two bits are different
|
/// While comparing two integer-valued vectors of equal length, Hamming distance is the number of bit positions in which the two bits are different
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct Hamming {}
|
pub struct Hamming {}
|
||||||
|
|
||||||
impl<T: PartialEq, F: RealNumber> Distance<Vec<T>, F> for Hamming {
|
impl<T: PartialEq, F: RealNumber> Distance<Vec<T>, F> for Hamming {
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ use super::Distance;
|
|||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
|
|
||||||
/// Mahalanobis distance.
|
/// Mahalanobis distance.
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct Mahalanobis<T: RealNumber, M: Matrix<T>> {
|
pub struct Mahalanobis<T: RealNumber, M: Matrix<T>> {
|
||||||
/// covariance matrix of the dataset
|
/// covariance matrix of the dataset
|
||||||
pub sigma: M,
|
pub sigma: M,
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ use crate::math::num::RealNumber;
|
|||||||
use super::Distance;
|
use super::Distance;
|
||||||
|
|
||||||
/// Manhattan distance
|
/// Manhattan distance
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct Manhattan {}
|
pub struct Manhattan {}
|
||||||
|
|
||||||
impl<T: RealNumber> Distance<Vec<T>, T> for Manhattan {
|
impl<T: RealNumber> Distance<Vec<T>, T> for Manhattan {
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ use crate::math::num::RealNumber;
|
|||||||
use super::Distance;
|
use super::Distance;
|
||||||
|
|
||||||
/// Defines the Minkowski distance of order `p`
|
/// Defines the Minkowski distance of order `p`
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct Minkowski {
|
pub struct Minkowski {
|
||||||
/// order, integer
|
/// order, integer
|
||||||
pub p: u16,
|
pub p: u16,
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ use crate::linalg::Matrix;
|
|||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
|
|
||||||
/// Distance metric, a function that calculates distance between two points
|
/// Distance metric, a function that calculates distance between two points
|
||||||
pub trait Distance<T, F: RealNumber> {
|
pub trait Distance<T, F: RealNumber>: Clone {
|
||||||
/// Calculates distance between _a_ and _b_
|
/// Calculates distance between _a_ and _b_
|
||||||
fn distance(&self, a: &T, b: &T) -> F;
|
fn distance(&self, a: &T, b: &T) -> F;
|
||||||
}
|
}
|
||||||
|
|||||||
+1
-1
@@ -42,7 +42,7 @@
|
|||||||
//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
||||||
//! ];
|
//! ];
|
||||||
//!
|
//!
|
||||||
//! let lr = LogisticRegression::fit(&x, &y).unwrap();
|
//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
|
||||||
//!
|
//!
|
||||||
//! let y_hat = lr.predict(&x).unwrap();
|
//! let y_hat = lr.predict(&x).unwrap();
|
||||||
//!
|
//!
|
||||||
|
|||||||
@@ -0,0 +1,269 @@
|
|||||||
|
//! # KFold
|
||||||
|
//!
|
||||||
|
//! Defines k-fold cross validator.
|
||||||
|
|
||||||
|
use crate::linalg::Matrix;
|
||||||
|
use crate::math::num::RealNumber;
|
||||||
|
use crate::model_selection::BaseKFold;
|
||||||
|
use rand::seq::SliceRandom;
|
||||||
|
use rand::thread_rng;
|
||||||
|
|
||||||
|
/// K-Folds cross-validator
///
/// Configuration for splitting the rows of a dataset into `n_splits` folds.
/// Derives `Debug` and `Clone` so a configured splitter can be printed and
/// reused, matching the other parameter types in this crate.
#[derive(Debug, Clone)]
pub struct KFold {
    /// Number of folds. Must be at least 2.
    pub n_splits: usize, // cannot exceed std::usize::MAX
    /// Whether to shuffle the data before splitting into batches
    pub shuffle: bool,
}
|
||||||
|
|
||||||
|
impl KFold {
|
||||||
|
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>> {
|
||||||
|
// number of samples (rows) in the matrix
|
||||||
|
let n_samples: usize = x.shape().0;
|
||||||
|
|
||||||
|
// initialise indices
|
||||||
|
let mut indices: Vec<usize> = (0..n_samples).collect();
|
||||||
|
if self.shuffle {
|
||||||
|
indices.shuffle(&mut thread_rng());
|
||||||
|
}
|
||||||
|
// return a new array of given shape n_split, filled with each element of n_samples divided by n_splits.
|
||||||
|
let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits];
|
||||||
|
|
||||||
|
// increment by one if odd
|
||||||
|
for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) {
|
||||||
|
*fold_size += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// generate the right array of arrays for test indices
|
||||||
|
let mut return_values: Vec<Vec<usize>> = Vec::with_capacity(self.n_splits);
|
||||||
|
let mut current: usize = 0;
|
||||||
|
for fold_size in fold_sizes.drain(..) {
|
||||||
|
let stop = current + fold_size;
|
||||||
|
return_values.push(indices[current..stop].to_vec());
|
||||||
|
current = stop
|
||||||
|
}
|
||||||
|
|
||||||
|
return_values
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>> {
|
||||||
|
let mut return_values: Vec<Vec<bool>> = Vec::with_capacity(self.n_splits);
|
||||||
|
for test_index in self.test_indices(x).drain(..) {
|
||||||
|
// init mask
|
||||||
|
let mut test_mask = vec![false; x.shape().0];
|
||||||
|
// set mask's indices to true according to test indices
|
||||||
|
for i in test_index {
|
||||||
|
test_mask[i] = true; // can be implemented with map()
|
||||||
|
}
|
||||||
|
return_values.push(test_mask);
|
||||||
|
}
|
||||||
|
return_values
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for KFold {
|
||||||
|
fn default() -> KFold {
|
||||||
|
KFold {
|
||||||
|
n_splits: 3,
|
||||||
|
shuffle: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KFold {
|
||||||
|
/// Number of folds. Must be at least 2.
|
||||||
|
pub fn with_n_splits(mut self, n_splits: usize) -> Self {
|
||||||
|
self.n_splits = n_splits;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Whether to shuffle the data before splitting into batches
|
||||||
|
pub fn with_shuffle(mut self, shuffle: bool) -> Self {
|
||||||
|
self.shuffle = shuffle;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An iterator over indices that split data into training and test set.
///
/// Each item is a `(train_indices, test_indices)` pair for one fold.
#[derive(Debug, Clone)]
pub struct KFoldIter {
    /// One entry per row of the dataset; only its length is used when iterating.
    indices: Vec<usize>,
    /// Test masks of the folds that have not been yielded yet. Folds are taken
    /// from the back, so the next fold to yield is the last element.
    test_indices: Vec<Vec<bool>>,
}

impl Iterator for KFoldIter {
    type Item = (Vec<usize>, Vec<usize>);

    /// Yields the `(train, test)` row positions of the next fold, or `None` once
    /// every fold has been produced.
    ///
    /// # Panics
    /// Panics if a mask is shorter than `indices`.
    fn next(&mut self) -> Option<Self::Item> {
        let mask = self.test_indices.pop()?;

        // A single pass over the row positions: positions flagged in the mask go
        // to the test set, all others to the training set. Both vectors keep
        // ascending order.
        let (test_index, train_index): (Vec<usize>, Vec<usize>) =
            (0..self.indices.len()).partition(|&row| mask[row]);

        Some((train_index, test_index))
    }
}
|
||||||
|
|
||||||
|
/// Abstract class for all KFold functionalities
|
||||||
|
impl BaseKFold for KFold {
|
||||||
|
type Output = KFoldIter;
|
||||||
|
|
||||||
|
fn n_splits(&self) -> usize {
|
||||||
|
self.n_splits
|
||||||
|
}
|
||||||
|
|
||||||
|
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Self::Output {
|
||||||
|
if self.n_splits < 2 {
|
||||||
|
panic!("Number of splits is too small: {}", self.n_splits);
|
||||||
|
}
|
||||||
|
let n_samples: usize = x.shape().0;
|
||||||
|
let indices: Vec<usize> = (0..n_samples).collect();
|
||||||
|
let mut test_indices = self.test_masks(x);
|
||||||
|
test_indices.reverse();
|
||||||
|
|
||||||
|
KFoldIter {
|
||||||
|
indices,
|
||||||
|
test_indices,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {

    use super::*;
    use crate::linalg::naive::dense_matrix::*;

    /// Builds a splitter that keeps rows in their original order.
    fn sequential(n_splits: usize) -> KFold {
        KFold {
            n_splits,
            shuffle: false,
        }
    }

    /// Builds a splitter that uses the default `shuffle` setting.
    fn with_default_shuffle(n_splits: usize) -> KFold {
        KFold {
            n_splits,
            ..KFold::default()
        }
    }

    /// Collects a half-open index range into a vector.
    fn span(from: usize, to: usize) -> Vec<usize> {
        (from..to).collect()
    }

    #[test]
    fn run_kfold_return_test_indices_simple() {
        let x: DenseMatrix<f64> = DenseMatrix::rand(33, 100);
        let folds = sequential(3).test_indices(&x);

        assert_eq!(folds[0], span(0, 11));
        assert_eq!(folds[1], span(11, 22));
        assert_eq!(folds[2], span(22, 33));
    }

    #[test]
    fn run_kfold_return_test_indices_odd() {
        // 34 rows over 3 folds: the first fold takes the extra row.
        let x: DenseMatrix<f64> = DenseMatrix::rand(34, 100);
        let folds = sequential(3).test_indices(&x);

        assert_eq!(folds[0], span(0, 12));
        assert_eq!(folds[1], span(12, 23));
        assert_eq!(folds[2], span(23, 34));
    }

    #[test]
    fn run_kfold_return_test_mask_simple() {
        let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
        let masks = sequential(2).test_masks(&x);

        assert!(masks[0][0..11].iter().all(|&flag| flag));
        assert!(masks[0][11..22].iter().all(|&flag| !flag));

        assert!(masks[1][0..11].iter().all(|&flag| !flag));
        assert!(masks[1][11..22].iter().all(|&flag| flag));
    }

    #[test]
    fn run_kfold_return_split_simple() {
        let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
        let splits: Vec<(Vec<usize>, Vec<usize>)> = sequential(2).split(&x).collect();

        let (first_train, first_test) = &splits[0];
        let (second_train, second_test) = &splits[1];

        assert_eq!(*first_test, span(0, 11));
        assert_eq!(*first_train, span(11, 22));
        assert_eq!(*second_train, span(0, 11));
        assert_eq!(*second_test, span(11, 22));
    }

    #[test]
    fn run_kfold_return_split_simple_shuffle() {
        let x: DenseMatrix<f64> = DenseMatrix::rand(23, 100);
        let splits: Vec<(Vec<usize>, Vec<usize>)> = with_default_shuffle(2).split(&x).collect();

        // Only the sizes are deterministic once the rows are shuffled.
        assert_eq!(splits[0].1.len(), 12_usize);
        assert_eq!(splits[0].0.len(), 11_usize);
        assert_eq!(splits[1].0.len(), 12_usize);
        assert_eq!(splits[1].1.len(), 11_usize);
    }

    #[test]
    fn numpy_parity_test() {
        let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
        let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
            (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
            (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
            (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
        ];

        for ((train, test), (expected_train, expected_test)) in
            sequential(3).split(&x).zip(expected)
        {
            assert_eq!(test, expected_test);
            assert_eq!(train, expected_train);
        }
    }

    #[test]
    fn numpy_parity_test_shuffle() {
        let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
        let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
            (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
            (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
            (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
        ];

        // With shuffling only the fold sizes can be compared.
        for ((train, test), (expected_train, expected_test)) in
            with_default_shuffle(3).split(&x).zip(expected)
        {
            assert_eq!(test.len(), expected_test.len());
            assert_eq!(train.len(), expected_train.len());
        }
    }
}
|
||||||
+254
-239
@@ -9,21 +9,39 @@
|
|||||||
//!
|
//!
|
||||||
//! In SmartCore you can split your data into training and test datasets using `train_test_split` function.
|
//! In SmartCore you can split your data into training and test datasets using `train_test_split` function.
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
|
use crate::error::Failed;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
use rand::seq::SliceRandom;
|
use rand::seq::SliceRandom;
|
||||||
use rand::thread_rng;
|
use rand::thread_rng;
|
||||||
use rand::Rng;
|
|
||||||
|
pub(crate) mod kfold;
|
||||||
|
|
||||||
|
pub use kfold::{KFold, KFoldIter};
|
||||||
|
|
||||||
|
/// An interface for the K-Folds cross-validator
|
||||||
|
pub trait BaseKFold {
|
||||||
|
/// An iterator over indices that split data into training and test set.
|
||||||
|
type Output: Iterator<Item = (Vec<usize>, Vec<usize>)>;
|
||||||
|
/// Return a tuple containing the the training set indices for that split and
|
||||||
|
/// the testing set indices for that split.
|
||||||
|
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Self::Output;
|
||||||
|
/// Returns the number of splits
|
||||||
|
fn n_splits(&self) -> usize;
|
||||||
|
}
|
||||||
|
|
||||||
/// Splits data into 2 disjoint datasets.
|
/// Splits data into 2 disjoint datasets.
|
||||||
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
|
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
|
||||||
/// * `y` - target values, should be of size _M_
|
/// * `y` - target values, should be of size _N_
|
||||||
/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
|
/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
|
||||||
|
/// * `shuffle`, - whether or not to shuffle the data before splitting
|
||||||
pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
||||||
x: &M,
|
x: &M,
|
||||||
y: &M::RowVector,
|
y: &M::RowVector,
|
||||||
test_size: f32,
|
test_size: f32,
|
||||||
|
shuffle: bool,
|
||||||
) -> (M, M, M::RowVector, M::RowVector) {
|
) -> (M, M, M::RowVector, M::RowVector) {
|
||||||
if x.shape().0 != y.len() {
|
if x.shape().0 != y.len() {
|
||||||
panic!(
|
panic!(
|
||||||
@@ -38,155 +56,131 @@ pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
let n = y.len();
|
let n = y.len();
|
||||||
let m = x.shape().1;
|
|
||||||
|
|
||||||
let mut rng = rand::thread_rng();
|
let n_test = ((n as f32) * test_size) as usize;
|
||||||
let mut n_test = 0;
|
|
||||||
let mut index = vec![false; n];
|
|
||||||
|
|
||||||
for index_i in index.iter_mut().take(n) {
|
if n_test < 1 {
|
||||||
let p_test: f32 = rng.gen();
|
panic!("number of sample is too small {}", n);
|
||||||
if p_test <= test_size {
|
|
||||||
*index_i = true;
|
|
||||||
n_test += 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let n_train = n - n_test;
|
let mut indices: Vec<usize> = (0..n).collect();
|
||||||
|
|
||||||
let mut x_train = M::zeros(n_train, m);
|
if shuffle {
|
||||||
let mut x_test = M::zeros(n_test, m);
|
indices.shuffle(&mut thread_rng());
|
||||||
let mut y_train = M::RowVector::zeros(n_train);
|
|
||||||
let mut y_test = M::RowVector::zeros(n_test);
|
|
||||||
|
|
||||||
let mut r_train = 0;
|
|
||||||
let mut r_test = 0;
|
|
||||||
|
|
||||||
for (r, index_r) in index.iter().enumerate().take(n) {
|
|
||||||
if *index_r {
|
|
||||||
//sample belongs to test
|
|
||||||
for c in 0..m {
|
|
||||||
x_test.set(r_test, c, x.get(r, c));
|
|
||||||
y_test.set(r_test, y.get(r));
|
|
||||||
}
|
|
||||||
r_test += 1;
|
|
||||||
} else {
|
|
||||||
for c in 0..m {
|
|
||||||
x_train.set(r_train, c, x.get(r, c));
|
|
||||||
y_train.set(r_train, y.get(r));
|
|
||||||
}
|
|
||||||
r_train += 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let x_train = x.take(&indices[n_test..n], 0);
|
||||||
|
let x_test = x.take(&indices[0..n_test], 0);
|
||||||
|
let y_train = y.take(&indices[n_test..n]);
|
||||||
|
let y_test = y.take(&indices[0..n_test]);
|
||||||
|
|
||||||
(x_train, x_test, y_train, y_test)
|
(x_train, x_test, y_train, y_test)
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
/// Cross validation results.
|
||||||
/// KFold Cross-Validation
|
#[derive(Clone, Debug)]
|
||||||
///
|
pub struct CrossValidationResult<T: RealNumber> {
|
||||||
pub trait BaseKFold {
|
/// Vector with test scores on each cv split
|
||||||
/// Returns integer indices corresponding to test sets
|
pub test_score: Vec<T>,
|
||||||
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>>;
|
/// Vector with training scores on each cv split
|
||||||
|
pub train_score: Vec<T>,
|
||||||
/// Returns masksk corresponding to test sets
|
|
||||||
fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>>;
|
|
||||||
|
|
||||||
/// Return a tuple containing the the training set indices for that split and
|
|
||||||
/// the testing set indices for that split.
|
|
||||||
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)>;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
impl<T: RealNumber> CrossValidationResult<T> {
|
||||||
/// An implementation of KFold
|
/// Average test score
|
||||||
///
|
pub fn mean_test_score(&self) -> T {
|
||||||
pub struct KFold {
|
self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap()
|
||||||
n_splits: usize, // cannot exceed std::usize::MAX
|
}
|
||||||
shuffle: bool,
|
/// Average training score
|
||||||
// TODO: to be implemented later
|
pub fn mean_train_score(&self) -> T {
|
||||||
// random_state: i32,
|
self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap()
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for KFold {
|
|
||||||
fn default() -> KFold {
|
|
||||||
KFold {
|
|
||||||
n_splits: 3_usize,
|
|
||||||
shuffle: true,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
/// Evaluate an estimator by cross-validation using given metric.
|
||||||
/// Abstract class for all KFold functionalities
|
/// * `fit_estimator` - a `fit` function of an estimator
|
||||||
///
|
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
|
||||||
impl BaseKFold for KFold {
|
/// * `y` - target values, should be of size _N_
|
||||||
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>> {
|
/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters.
|
||||||
// number of samples (rows) in the matrix
|
/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html)
|
||||||
let n_samples: usize = x.shape().0;
|
/// * `score` - a metric to use for evaluation, see [metrics](../metrics/index.html)
|
||||||
|
pub fn cross_validate<T, M, H, E, K, F, S>(
|
||||||
|
fit_estimator: F,
|
||||||
|
x: &M,
|
||||||
|
y: &M::RowVector,
|
||||||
|
parameters: H,
|
||||||
|
cv: K,
|
||||||
|
score: S,
|
||||||
|
) -> Result<CrossValidationResult<T>, Failed>
|
||||||
|
where
|
||||||
|
T: RealNumber,
|
||||||
|
M: Matrix<T>,
|
||||||
|
H: Clone,
|
||||||
|
E: Predictor<M, M::RowVector>,
|
||||||
|
K: BaseKFold,
|
||||||
|
F: Fn(&M, &M::RowVector, H) -> Result<E, Failed>,
|
||||||
|
S: Fn(&M::RowVector, &M::RowVector) -> T,
|
||||||
|
{
|
||||||
|
let k = cv.n_splits();
|
||||||
|
let mut test_score = Vec::with_capacity(k);
|
||||||
|
let mut train_score = Vec::with_capacity(k);
|
||||||
|
|
||||||
// initialise indices
|
for (train_idx, test_idx) in cv.split(x) {
|
||||||
let mut indices: Vec<usize> = (0..n_samples).collect();
|
let train_x = x.take(&train_idx, 0);
|
||||||
if self.shuffle {
|
let train_y = y.take(&train_idx);
|
||||||
indices.shuffle(&mut thread_rng());
|
let test_x = x.take(&test_idx, 0);
|
||||||
}
|
let test_y = y.take(&test_idx);
|
||||||
// return a new array of given shape n_split, filled with each element of n_samples divided by n_splits.
|
|
||||||
let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits];
|
|
||||||
|
|
||||||
// increment by one if odd
|
let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?;
|
||||||
for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) {
|
|
||||||
*fold_size += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// generate the right array of arrays for test indices
|
train_score.push(score(&train_y, &estimator.predict(&train_x)?));
|
||||||
let mut return_values: Vec<Vec<usize>> = Vec::with_capacity(self.n_splits);
|
test_score.push(score(&test_y, &estimator.predict(&test_x)?));
|
||||||
let mut current: usize = 0;
|
|
||||||
for fold_size in fold_sizes.drain(..) {
|
|
||||||
let stop = current + fold_size;
|
|
||||||
return_values.push(indices[current..stop].to_vec());
|
|
||||||
current = stop
|
|
||||||
}
|
|
||||||
|
|
||||||
return_values
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>> {
|
Ok(CrossValidationResult {
|
||||||
let mut return_values: Vec<Vec<bool>> = Vec::with_capacity(self.n_splits);
|
test_score,
|
||||||
for test_index in self.test_indices(x).drain(..) {
|
train_score,
|
||||||
// init mask
|
})
|
||||||
let mut test_mask = vec![false; x.shape().0];
|
}
|
||||||
// set mask's indices to true according to test indices
|
|
||||||
for i in test_index {
|
/// Generate cross-validated estimates for each input data point.
|
||||||
test_mask[i] = true; // can be implemented with map()
|
/// The data is split according to the cv parameter. Each sample belongs to exactly one test set, and its prediction is computed with an estimator fitted on the corresponding training set.
|
||||||
}
|
/// * `fit_estimator` - a `fit` function of an estimator
|
||||||
return_values.push(test_mask);
|
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
|
||||||
|
/// * `y` - target values, should be of size _N_
|
||||||
|
/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters.
|
||||||
|
/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html)
|
||||||
|
pub fn cross_val_predict<T, M, H, E, K, F>(
|
||||||
|
fit_estimator: F,
|
||||||
|
x: &M,
|
||||||
|
y: &M::RowVector,
|
||||||
|
parameters: H,
|
||||||
|
cv: K,
|
||||||
|
) -> Result<M::RowVector, Failed>
|
||||||
|
where
|
||||||
|
T: RealNumber,
|
||||||
|
M: Matrix<T>,
|
||||||
|
H: Clone,
|
||||||
|
E: Predictor<M, M::RowVector>,
|
||||||
|
K: BaseKFold,
|
||||||
|
F: Fn(&M, &M::RowVector, H) -> Result<E, Failed>,
|
||||||
|
{
|
||||||
|
let mut y_hat = M::RowVector::zeros(y.len());
|
||||||
|
|
||||||
|
for (train_idx, test_idx) in cv.split(x) {
|
||||||
|
let train_x = x.take(&train_idx, 0);
|
||||||
|
let train_y = y.take(&train_idx);
|
||||||
|
let test_x = x.take(&test_idx, 0);
|
||||||
|
|
||||||
|
let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?;
|
||||||
|
|
||||||
|
let y_test_hat = estimator.predict(&test_x)?;
|
||||||
|
for (i, &idx) in test_idx.iter().enumerate() {
|
||||||
|
y_hat.set(idx, y_test_hat.get(i));
|
||||||
}
|
}
|
||||||
return_values
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)> {
|
Ok(y_hat)
|
||||||
let n_samples: usize = x.shape().0;
|
|
||||||
let indices: Vec<usize> = (0..n_samples).collect();
|
|
||||||
|
|
||||||
let mut return_values: Vec<(Vec<usize>, Vec<usize>)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs
|
|
||||||
|
|
||||||
for test_index in self.test_masks(x).drain(..) {
|
|
||||||
let train_index = indices
|
|
||||||
.clone()
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.filter(|&(idx, _)| !test_index[idx])
|
|
||||||
.map(|(idx, _)| idx)
|
|
||||||
.collect::<Vec<usize>>(); // filter train indices out according to mask
|
|
||||||
let test_index = indices
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.filter(|&(idx, _)| test_index[idx])
|
|
||||||
.map(|(idx, _)| idx)
|
|
||||||
.collect::<Vec<usize>>(); // filter tests indices out according to mask
|
|
||||||
return_values.push((train_index, test_index))
|
|
||||||
}
|
|
||||||
return_values
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -194,14 +188,17 @@ mod tests {
|
|||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::linalg::naive::dense_matrix::*;
|
use crate::linalg::naive::dense_matrix::*;
|
||||||
|
use crate::metrics::{accuracy, mean_absolute_error};
|
||||||
|
use crate::model_selection::kfold::KFold;
|
||||||
|
use crate::neighbors::knn_regressor::KNNRegressor;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn run_train_test_split() {
|
fn run_train_test_split() {
|
||||||
let n = 100;
|
let n = 123;
|
||||||
let x: DenseMatrix<f64> = DenseMatrix::rand(100, 3);
|
let x: DenseMatrix<f64> = DenseMatrix::rand(n, 3);
|
||||||
let y = vec![0f64; 100];
|
let y = vec![0f64; n];
|
||||||
|
|
||||||
let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2);
|
let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true);
|
||||||
|
|
||||||
assert!(
|
assert!(
|
||||||
x_train.shape().0 > (n as f64 * 0.65) as usize
|
x_train.shape().0 > (n as f64 * 0.65) as usize
|
||||||
@@ -215,126 +212,144 @@ mod tests {
|
|||||||
assert_eq!(x_test.shape().0, y_test.len());
|
assert_eq!(x_test.shape().0, y_test.len());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[derive(Clone)]
|
||||||
fn run_kfold_return_test_indices_simple() {
|
struct NoParameters {}
|
||||||
let k = KFold {
|
|
||||||
n_splits: 3,
|
|
||||||
shuffle: false,
|
|
||||||
};
|
|
||||||
let x: DenseMatrix<f64> = DenseMatrix::rand(33, 100);
|
|
||||||
let test_indices = k.test_indices(&x);
|
|
||||||
|
|
||||||
assert_eq!(test_indices[0], (0..11).collect::<Vec<usize>>());
|
#[test]
|
||||||
assert_eq!(test_indices[1], (11..22).collect::<Vec<usize>>());
|
fn test_cross_validate_biased() {
|
||||||
assert_eq!(test_indices[2], (22..33).collect::<Vec<usize>>());
|
struct BiasedEstimator {}
|
||||||
|
|
||||||
|
impl BiasedEstimator {
|
||||||
|
fn fit<M: Matrix<f32>>(
|
||||||
|
_: &M,
|
||||||
|
_: &M::RowVector,
|
||||||
|
_: NoParameters,
|
||||||
|
) -> Result<BiasedEstimator, Failed> {
|
||||||
|
Ok(BiasedEstimator {})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<M: Matrix<f32>> Predictor<M, M::RowVector> for BiasedEstimator {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
let (n, _) = x.shape();
|
||||||
|
Ok(M::RowVector::zeros(n))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let x = DenseMatrix::from_2d_array(&[
|
||||||
|
&[5.1, 3.5, 1.4, 0.2],
|
||||||
|
&[4.9, 3.0, 1.4, 0.2],
|
||||||
|
&[4.7, 3.2, 1.3, 0.2],
|
||||||
|
&[4.6, 3.1, 1.5, 0.2],
|
||||||
|
&[5.0, 3.6, 1.4, 0.2],
|
||||||
|
&[5.4, 3.9, 1.7, 0.4],
|
||||||
|
&[4.6, 3.4, 1.4, 0.3],
|
||||||
|
&[5.0, 3.4, 1.5, 0.2],
|
||||||
|
&[4.4, 2.9, 1.4, 0.2],
|
||||||
|
&[4.9, 3.1, 1.5, 0.1],
|
||||||
|
&[7.0, 3.2, 4.7, 1.4],
|
||||||
|
&[6.4, 3.2, 4.5, 1.5],
|
||||||
|
&[6.9, 3.1, 4.9, 1.5],
|
||||||
|
&[5.5, 2.3, 4.0, 1.3],
|
||||||
|
&[6.5, 2.8, 4.6, 1.5],
|
||||||
|
&[5.7, 2.8, 4.5, 1.3],
|
||||||
|
&[6.3, 3.3, 4.7, 1.6],
|
||||||
|
&[4.9, 2.4, 3.3, 1.0],
|
||||||
|
&[6.6, 2.9, 4.6, 1.3],
|
||||||
|
&[5.2, 2.7, 3.9, 1.4],
|
||||||
|
]);
|
||||||
|
let y = vec![
|
||||||
|
0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
||||||
|
];
|
||||||
|
|
||||||
|
let cv = KFold {
|
||||||
|
n_splits: 5,
|
||||||
|
..KFold::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let results =
|
||||||
|
cross_validate(BiasedEstimator::fit, &x, &y, NoParameters {}, cv, &accuracy).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(0.4, results.mean_test_score());
|
||||||
|
assert_eq!(0.4, results.mean_train_score());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn run_kfold_return_test_indices_odd() {
|
fn test_cross_validate_knn() {
|
||||||
let k = KFold {
|
let x = DenseMatrix::from_2d_array(&[
|
||||||
n_splits: 3,
|
&[234.289, 235.6, 159., 107.608, 1947., 60.323],
|
||||||
shuffle: false,
|
&[259.426, 232.5, 145.6, 108.632, 1948., 61.122],
|
||||||
};
|
&[258.054, 368.2, 161.6, 109.773, 1949., 60.171],
|
||||||
let x: DenseMatrix<f64> = DenseMatrix::rand(34, 100);
|
&[284.599, 335.1, 165., 110.929, 1950., 61.187],
|
||||||
let test_indices = k.test_indices(&x);
|
&[328.975, 209.9, 309.9, 112.075, 1951., 63.221],
|
||||||
|
&[346.999, 193.2, 359.4, 113.27, 1952., 63.639],
|
||||||
|
&[365.385, 187., 354.7, 115.094, 1953., 64.989],
|
||||||
|
&[363.112, 357.8, 335., 116.219, 1954., 63.761],
|
||||||
|
&[397.469, 290.4, 304.8, 117.388, 1955., 66.019],
|
||||||
|
&[419.18, 282.2, 285.7, 118.734, 1956., 67.857],
|
||||||
|
&[442.769, 293.6, 279.8, 120.445, 1957., 68.169],
|
||||||
|
&[444.546, 468.1, 263.7, 121.95, 1958., 66.513],
|
||||||
|
&[482.704, 381.3, 255.2, 123.366, 1959., 68.655],
|
||||||
|
&[502.601, 393.1, 251.4, 125.368, 1960., 69.564],
|
||||||
|
&[518.173, 480.6, 257.2, 127.852, 1961., 69.331],
|
||||||
|
&[554.894, 400.7, 282.7, 130.081, 1962., 70.551],
|
||||||
|
]);
|
||||||
|
let y = vec![
|
||||||
|
83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6,
|
||||||
|
114.2, 115.7, 116.9,
|
||||||
|
];
|
||||||
|
|
||||||
assert_eq!(test_indices[0], (0..12).collect::<Vec<usize>>());
|
let cv = KFold {
|
||||||
assert_eq!(test_indices[1], (12..23).collect::<Vec<usize>>());
|
n_splits: 5,
|
||||||
assert_eq!(test_indices[2], (23..34).collect::<Vec<usize>>());
|
..KFold::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let results = cross_validate(
|
||||||
|
KNNRegressor::fit,
|
||||||
|
&x,
|
||||||
|
&y,
|
||||||
|
Default::default(),
|
||||||
|
cv,
|
||||||
|
&mean_absolute_error,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert!(results.mean_test_score() < 15.0);
|
||||||
|
assert!(results.mean_train_score() < results.mean_test_score());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn run_kfold_return_test_mask_simple() {
|
fn test_cross_val_predict_knn() {
|
||||||
let k = KFold {
|
let x = DenseMatrix::from_2d_array(&[
|
||||||
n_splits: 2,
|
&[234.289, 235.6, 159., 107.608, 1947., 60.323],
|
||||||
shuffle: false,
|
&[259.426, 232.5, 145.6, 108.632, 1948., 61.122],
|
||||||
};
|
&[258.054, 368.2, 161.6, 109.773, 1949., 60.171],
|
||||||
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
|
&[284.599, 335.1, 165., 110.929, 1950., 61.187],
|
||||||
let test_masks = k.test_masks(&x);
|
&[328.975, 209.9, 309.9, 112.075, 1951., 63.221],
|
||||||
|
&[346.999, 193.2, 359.4, 113.27, 1952., 63.639],
|
||||||
|
&[365.385, 187., 354.7, 115.094, 1953., 64.989],
|
||||||
|
&[363.112, 357.8, 335., 116.219, 1954., 63.761],
|
||||||
|
&[397.469, 290.4, 304.8, 117.388, 1955., 66.019],
|
||||||
|
&[419.18, 282.2, 285.7, 118.734, 1956., 67.857],
|
||||||
|
&[442.769, 293.6, 279.8, 120.445, 1957., 68.169],
|
||||||
|
&[444.546, 468.1, 263.7, 121.95, 1958., 66.513],
|
||||||
|
&[482.704, 381.3, 255.2, 123.366, 1959., 68.655],
|
||||||
|
&[502.601, 393.1, 251.4, 125.368, 1960., 69.564],
|
||||||
|
&[518.173, 480.6, 257.2, 127.852, 1961., 69.331],
|
||||||
|
&[554.894, 400.7, 282.7, 130.081, 1962., 70.551],
|
||||||
|
]);
|
||||||
|
let y = vec![
|
||||||
|
83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6,
|
||||||
|
114.2, 115.7, 116.9,
|
||||||
|
];
|
||||||
|
|
||||||
for t in &test_masks[0][0..11] {
|
let cv = KFold {
|
||||||
// TODO: this can be prob done better
|
|
||||||
assert_eq!(*t, true)
|
|
||||||
}
|
|
||||||
for t in &test_masks[0][11..22] {
|
|
||||||
assert_eq!(*t, false)
|
|
||||||
}
|
|
||||||
|
|
||||||
for t in &test_masks[1][0..11] {
|
|
||||||
assert_eq!(*t, false)
|
|
||||||
}
|
|
||||||
for t in &test_masks[1][11..22] {
|
|
||||||
assert_eq!(*t, true)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn run_kfold_return_split_simple() {
|
|
||||||
let k = KFold {
|
|
||||||
n_splits: 2,
|
|
||||||
shuffle: false,
|
|
||||||
};
|
|
||||||
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
|
|
||||||
let train_test_splits = k.split(&x);
|
|
||||||
|
|
||||||
assert_eq!(train_test_splits[0].1, (0..11).collect::<Vec<usize>>());
|
|
||||||
assert_eq!(train_test_splits[0].0, (11..22).collect::<Vec<usize>>());
|
|
||||||
assert_eq!(train_test_splits[1].0, (0..11).collect::<Vec<usize>>());
|
|
||||||
assert_eq!(train_test_splits[1].1, (11..22).collect::<Vec<usize>>());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn run_kfold_return_split_simple_shuffle() {
|
|
||||||
let k = KFold {
|
|
||||||
n_splits: 2,
|
n_splits: 2,
|
||||||
..KFold::default()
|
..KFold::default()
|
||||||
};
|
};
|
||||||
let x: DenseMatrix<f64> = DenseMatrix::rand(23, 100);
|
|
||||||
let train_test_splits = k.split(&x);
|
|
||||||
|
|
||||||
assert_eq!(train_test_splits[0].1.len(), 12_usize);
|
let y_hat = cross_val_predict(KNNRegressor::fit, &x, &y, Default::default(), cv).unwrap();
|
||||||
assert_eq!(train_test_splits[0].0.len(), 11_usize);
|
|
||||||
assert_eq!(train_test_splits[1].0.len(), 12_usize);
|
|
||||||
assert_eq!(train_test_splits[1].1.len(), 11_usize);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
assert!(mean_absolute_error(&y, &y_hat) < 10.0);
|
||||||
fn numpy_parity_test() {
|
|
||||||
let k = KFold {
|
|
||||||
n_splits: 3,
|
|
||||||
shuffle: false,
|
|
||||||
};
|
|
||||||
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
|
|
||||||
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
|
|
||||||
(vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
|
|
||||||
(vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
|
|
||||||
(vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
|
|
||||||
];
|
|
||||||
for ((train, test), (expected_train, expected_test)) in
|
|
||||||
k.split(&x).into_iter().zip(expected)
|
|
||||||
{
|
|
||||||
assert_eq!(test, expected_test);
|
|
||||||
assert_eq!(train, expected_train);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn numpy_parity_test_shuffle() {
|
|
||||||
let k = KFold {
|
|
||||||
n_splits: 3,
|
|
||||||
..KFold::default()
|
|
||||||
};
|
|
||||||
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
|
|
||||||
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
|
|
||||||
(vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
|
|
||||||
(vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
|
|
||||||
(vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
|
|
||||||
];
|
|
||||||
for ((train, test), (expected_train, expected_test)) in
|
|
||||||
k.split(&x).into_iter().zip(expected)
|
|
||||||
{
|
|
||||||
assert_eq!(test.len(), expected_test.len());
|
|
||||||
assert_eq!(train.len(), expected_train.len());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -33,6 +33,7 @@
|
|||||||
//! ## References:
|
//! ## References:
|
||||||
//!
|
//!
|
||||||
//! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
|
//! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::row_iter;
|
use crate::linalg::row_iter;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
@@ -87,13 +88,20 @@ pub struct BernoulliNBParameters<T: RealNumber> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> BernoulliNBParameters<T> {
|
impl<T: RealNumber> BernoulliNBParameters<T> {
|
||||||
/// Create BernoulliNBParameters with specific paramaters.
|
/// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
|
||||||
pub fn new(alpha: T, priors: Option<Vec<T>>, binarize: Option<T>) -> Self {
|
pub fn with_alpha(mut self, alpha: T) -> Self {
|
||||||
Self {
|
self.alpha = alpha;
|
||||||
alpha,
|
self
|
||||||
priors,
|
}
|
||||||
binarize,
|
/// Prior probabilities of the classes. If specified the priors are not adjusted according to the data
|
||||||
}
|
pub fn with_priors(mut self, priors: Vec<T>) -> Self {
|
||||||
|
self.priors = Some(priors);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors.
|
||||||
|
pub fn with_binarize(mut self, binarize: T) -> Self {
|
||||||
|
self.binarize = Some(binarize);
|
||||||
|
self
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -200,6 +208,12 @@ pub struct BernoulliNB<T: RealNumber, M: Matrix<T>> {
|
|||||||
binarize: Option<T>,
|
binarize: Option<T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for BernoulliNB<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> BernoulliNB<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> BernoulliNB<T, M> {
|
||||||
/// Fits BernoulliNB with given data
|
/// Fits BernoulliNB with given data
|
||||||
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
||||||
|
|||||||
@@ -30,6 +30,7 @@
|
|||||||
//! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
|
//! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
|
||||||
//! let y_hat = nb.predict(&x).unwrap();
|
//! let y_hat = nb.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
@@ -222,18 +223,13 @@ pub struct CategoricalNBParameters<T: RealNumber> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> CategoricalNBParameters<T> {
|
impl<T: RealNumber> CategoricalNBParameters<T> {
|
||||||
/// Create CategoricalNBParameters with specific paramaters.
|
/// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
|
||||||
pub fn new(alpha: T) -> Result<Self, Failed> {
|
pub fn with_alpha(mut self, alpha: T) -> Self {
|
||||||
if alpha > T::zero() {
|
self.alpha = alpha;
|
||||||
Ok(Self { alpha })
|
self
|
||||||
} else {
|
|
||||||
Err(Failed::fit(&format!(
|
|
||||||
"alpha should be >= 0, alpha=[{}]",
|
|
||||||
alpha
|
|
||||||
)))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> Default for CategoricalNBParameters<T> {
|
impl<T: RealNumber> Default for CategoricalNBParameters<T> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self { alpha: T::one() }
|
Self { alpha: T::one() }
|
||||||
@@ -246,6 +242,12 @@ pub struct CategoricalNB<T: RealNumber, M: Matrix<T>> {
|
|||||||
inner: BaseNaiveBayes<T, M, CategoricalNBDistribution<T>>,
|
inner: BaseNaiveBayes<T, M, CategoricalNBDistribution<T>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for CategoricalNB<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
|
||||||
/// Fits CategoricalNB with given data
|
/// Fits CategoricalNB with given data
|
||||||
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
||||||
|
|||||||
@@ -22,6 +22,7 @@
|
|||||||
//! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap();
|
//! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap();
|
||||||
//! let y_hat = nb.predict(&x).unwrap();
|
//! let y_hat = nb.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::row_iter;
|
use crate::linalg::row_iter;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
@@ -81,9 +82,10 @@ pub struct GaussianNBParameters<T: RealNumber> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> GaussianNBParameters<T> {
|
impl<T: RealNumber> GaussianNBParameters<T> {
|
||||||
/// Create GaussianNBParameters with specific paramaters.
|
/// Prior probabilities of the classes. If specified the priors are not adjusted according to the data
|
||||||
pub fn new(priors: Option<Vec<T>>) -> Self {
|
pub fn with_priors(mut self, priors: Vec<T>) -> Self {
|
||||||
Self { priors }
|
self.priors = Some(priors);
|
||||||
|
self
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -181,6 +183,12 @@ pub struct GaussianNB<T: RealNumber, M: Matrix<T>> {
|
|||||||
inner: BaseNaiveBayes<T, M, GaussianNBDistribution<T>>,
|
inner: BaseNaiveBayes<T, M, GaussianNBDistribution<T>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for GaussianNB<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> GaussianNB<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> GaussianNB<T, M> {
|
||||||
/// Fits GaussianNB with given data
|
/// Fits GaussianNB with given data
|
||||||
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
||||||
@@ -254,7 +262,7 @@ mod tests {
|
|||||||
let y = vec![1., 1., 1., 2., 2., 2.];
|
let y = vec![1., 1., 1., 2., 2., 2.];
|
||||||
|
|
||||||
let priors = vec![0.3, 0.7];
|
let priors = vec![0.3, 0.7];
|
||||||
let parameters = GaussianNBParameters::new(Some(priors.clone()));
|
let parameters = GaussianNBParameters::default().with_priors(priors.clone());
|
||||||
let gnb = GaussianNB::fit(&x, &y, parameters).unwrap();
|
let gnb = GaussianNB::fit(&x, &y, parameters).unwrap();
|
||||||
|
|
||||||
assert_eq!(gnb.inner.distribution.class_priors, priors);
|
assert_eq!(gnb.inner.distribution.class_priors, priors);
|
||||||
|
|||||||
@@ -33,6 +33,7 @@
|
|||||||
//! ## References:
|
//! ## References:
|
||||||
//!
|
//!
|
||||||
//! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
|
//! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::row_iter;
|
use crate::linalg::row_iter;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
@@ -81,9 +82,15 @@ pub struct MultinomialNBParameters<T: RealNumber> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> MultinomialNBParameters<T> {
|
impl<T: RealNumber> MultinomialNBParameters<T> {
|
||||||
/// Create MultinomialNBParameters with specific paramaters.
|
/// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
|
||||||
pub fn new(alpha: T, priors: Option<Vec<T>>) -> Self {
|
pub fn with_alpha(mut self, alpha: T) -> Self {
|
||||||
Self { alpha, priors }
|
self.alpha = alpha;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Prior probabilities of the classes. If specified the priors are not adjusted according to the data
|
||||||
|
pub fn with_priors(mut self, priors: Vec<T>) -> Self {
|
||||||
|
self.priors = Some(priors);
|
||||||
|
self
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -187,6 +194,12 @@ pub struct MultinomialNB<T: RealNumber, M: Matrix<T>> {
|
|||||||
inner: BaseNaiveBayes<T, M, MultinomialNBDistribution<T>>,
|
inner: BaseNaiveBayes<T, M, MultinomialNBDistribution<T>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for MultinomialNB<T, M> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> MultinomialNB<T, M> {
|
impl<T: RealNumber, M: Matrix<T>> MultinomialNB<T, M> {
|
||||||
/// Fits MultinomialNB with given data
|
/// Fits MultinomialNB with given data
|
||||||
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
/// * `x` - training data of size NxM where N is the number of samples and M is the number of
|
||||||
|
|||||||
@@ -25,31 +25,40 @@
|
|||||||
//! &[9., 10.]]);
|
//! &[9., 10.]]);
|
||||||
//! let y = vec![2., 2., 2., 3., 3.]; //your class labels
|
//! let y = vec![2., 2., 2., 3., 3.]; //your class labels
|
||||||
//!
|
//!
|
||||||
//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap();
|
//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
|
||||||
//! let y_hat = knn.predict(&x).unwrap();
|
//! let y_hat = knn.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! variable `y_hat` will hold a vector with estimates of class labels
|
//! variable `y_hat` will hold a vector with estimates of class labels
|
||||||
//!
|
//!
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
|
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::{row_iter, Matrix};
|
use crate::linalg::{row_iter, Matrix};
|
||||||
use crate::math::distance::Distance;
|
use crate::math::distance::euclidian::Euclidian;
|
||||||
|
use crate::math::distance::{Distance, Distances};
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
use crate::neighbors::KNNWeightFunction;
|
use crate::neighbors::KNNWeightFunction;
|
||||||
|
|
||||||
/// `KNNClassifier` parameters. Use `Default::default()` for default values.
|
/// `KNNClassifier` parameters. Use `Default::default()` for default values.
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct KNNClassifierParameters {
|
pub struct KNNClassifierParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
|
||||||
|
/// a function that defines a distance between each pair of point in training data.
|
||||||
|
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
||||||
|
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
|
||||||
|
pub distance: D,
|
||||||
/// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default.
|
/// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default.
|
||||||
pub algorithm: KNNAlgorithmName,
|
pub algorithm: KNNAlgorithmName,
|
||||||
/// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`.
|
/// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`.
|
||||||
pub weight: KNNWeightFunction,
|
pub weight: KNNWeightFunction,
|
||||||
/// number of training samples to consider when estimating class for new point. Default value is 3.
|
/// number of training samples to consider when estimating class for new point. Default value is 3.
|
||||||
pub k: usize,
|
pub k: usize,
|
||||||
|
/// this parameter is not used
|
||||||
|
t: PhantomData<T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// K Nearest Neighbors Classifier
|
/// K Nearest Neighbors Classifier
|
||||||
@@ -62,12 +71,47 @@ pub struct KNNClassifier<T: RealNumber, D: Distance<Vec<T>, T>> {
|
|||||||
k: usize,
|
k: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for KNNClassifierParameters {
|
impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifierParameters<T, D> {
|
||||||
|
/// number of training samples to consider when estimating class for new point. Default value is 3.
|
||||||
|
pub fn with_k(mut self, k: usize) -> Self {
|
||||||
|
self.k = k;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// a function that defines a distance between each pair of point in training data.
|
||||||
|
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
||||||
|
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
|
||||||
|
pub fn with_distance<DD: Distance<Vec<T>, T>>(
|
||||||
|
self,
|
||||||
|
distance: DD,
|
||||||
|
) -> KNNClassifierParameters<T, DD> {
|
||||||
|
KNNClassifierParameters {
|
||||||
|
distance,
|
||||||
|
algorithm: self.algorithm,
|
||||||
|
weight: self.weight,
|
||||||
|
k: self.k,
|
||||||
|
t: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default.
|
||||||
|
pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self {
|
||||||
|
self.algorithm = algorithm;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`.
|
||||||
|
pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self {
|
||||||
|
self.weight = weight;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber> Default for KNNClassifierParameters<T, Euclidian> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
KNNClassifierParameters {
|
KNNClassifierParameters {
|
||||||
|
distance: Distances::euclidian(),
|
||||||
algorithm: KNNAlgorithmName::CoverTree,
|
algorithm: KNNAlgorithmName::CoverTree,
|
||||||
weight: KNNWeightFunction::Uniform,
|
weight: KNNWeightFunction::Uniform,
|
||||||
k: 3,
|
k: 3,
|
||||||
|
t: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -95,19 +139,23 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for KNNClassifier<T, D> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>, D: Distance<Vec<T>, T>> Predictor<M, M::RowVector>
|
||||||
|
for KNNClassifier<T, D>
|
||||||
|
{
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
|
impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
|
||||||
/// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features.
|
/// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features.
|
||||||
/// * `x` - training data
|
/// * `x` - training data
|
||||||
/// * `y` - vector with target values (classes) of length N
|
/// * `y` - vector with target values (classes) of length N
|
||||||
/// * `distance` - a function that defines a distance between each pair of point in training data.
|
|
||||||
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
|
||||||
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
|
|
||||||
/// * `parameters` - additional parameters like search algorithm and k
|
/// * `parameters` - additional parameters like search algorithm and k
|
||||||
pub fn fit<M: Matrix<T>>(
|
pub fn fit<M: Matrix<T>>(
|
||||||
x: &M,
|
x: &M,
|
||||||
y: &M::RowVector,
|
y: &M::RowVector,
|
||||||
distance: D,
|
parameters: KNNClassifierParameters<T, D>,
|
||||||
parameters: KNNClassifierParameters,
|
|
||||||
) -> Result<KNNClassifier<T, D>, Failed> {
|
) -> Result<KNNClassifier<T, D>, Failed> {
|
||||||
let y_m = M::from_row_vector(y.clone());
|
let y_m = M::from_row_vector(y.clone());
|
||||||
|
|
||||||
@@ -142,7 +190,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
|
|||||||
classes,
|
classes,
|
||||||
y: yi,
|
y: yi,
|
||||||
k: parameters.k,
|
k: parameters.k,
|
||||||
knn_algorithm: parameters.algorithm.fit(data, distance)?,
|
knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?,
|
||||||
weight: parameters.weight,
|
weight: parameters.weight,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -187,14 +235,13 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::linalg::naive::dense_matrix::DenseMatrix;
|
use crate::linalg::naive::dense_matrix::DenseMatrix;
|
||||||
use crate::math::distance::Distances;
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn knn_fit_predict() {
|
fn knn_fit_predict() {
|
||||||
let x =
|
let x =
|
||||||
DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
|
DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
|
||||||
let y = vec![2., 2., 2., 3., 3.];
|
let y = vec![2., 2., 2., 3., 3.];
|
||||||
let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap();
|
let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
|
||||||
let y_hat = knn.predict(&x).unwrap();
|
let y_hat = knn.predict(&x).unwrap();
|
||||||
assert_eq!(5, Vec::len(&y_hat));
|
assert_eq!(5, Vec::len(&y_hat));
|
||||||
assert_eq!(y.to_vec(), y_hat);
|
assert_eq!(y.to_vec(), y_hat);
|
||||||
@@ -207,12 +254,10 @@ mod tests {
|
|||||||
let knn = KNNClassifier::fit(
|
let knn = KNNClassifier::fit(
|
||||||
&x,
|
&x,
|
||||||
&y,
|
&y,
|
||||||
Distances::euclidian(),
|
KNNClassifierParameters::default()
|
||||||
KNNClassifierParameters {
|
.with_k(5)
|
||||||
k: 5,
|
.with_algorithm(KNNAlgorithmName::LinearSearch)
|
||||||
algorithm: KNNAlgorithmName::LinearSearch,
|
.with_weight(KNNWeightFunction::Distance),
|
||||||
weight: KNNWeightFunction::Distance,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let y_hat = knn.predict(&DenseMatrix::from_2d_array(&[&[4.1]])).unwrap();
|
let y_hat = knn.predict(&DenseMatrix::from_2d_array(&[&[4.1]])).unwrap();
|
||||||
@@ -225,7 +270,7 @@ mod tests {
|
|||||||
DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
|
DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
|
||||||
let y = vec![2., 2., 2., 3., 3.];
|
let y = vec![2., 2., 2., 3., 3.];
|
||||||
|
|
||||||
let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap();
|
let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap();
|
let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -27,31 +27,41 @@
|
|||||||
//! &[5., 5.]]);
|
//! &[5., 5.]]);
|
||||||
//! let y = vec![1., 2., 3., 4., 5.]; //your target values
|
//! let y = vec![1., 2., 3., 4., 5.]; //your target values
|
||||||
//!
|
//!
|
||||||
//! let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap();
|
//! let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap();
|
||||||
//! let y_hat = knn.predict(&x).unwrap();
|
//! let y_hat = knn.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! variable `y_hat` will hold predicted value
|
//! variable `y_hat` will hold predicted value
|
||||||
//!
|
//!
|
||||||
//!
|
//!
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
|
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::{row_iter, BaseVector, Matrix};
|
use crate::linalg::{row_iter, BaseVector, Matrix};
|
||||||
use crate::math::distance::Distance;
|
use crate::math::distance::euclidian::Euclidian;
|
||||||
|
use crate::math::distance::{Distance, Distances};
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
use crate::neighbors::KNNWeightFunction;
|
use crate::neighbors::KNNWeightFunction;
|
||||||
|
|
||||||
/// `KNNRegressor` parameters. Use `Default::default()` for default values.
|
/// `KNNRegressor` parameters. Use `Default::default()` for default values.
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct KNNRegressorParameters {
|
pub struct KNNRegressorParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
|
||||||
|
/// a function that defines a distance between each pair of point in training data.
|
||||||
|
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
||||||
|
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
|
||||||
|
distance: D,
|
||||||
/// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default.
|
/// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default.
|
||||||
pub algorithm: KNNAlgorithmName,
|
pub algorithm: KNNAlgorithmName,
|
||||||
/// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`.
|
/// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`.
|
||||||
pub weight: KNNWeightFunction,
|
pub weight: KNNWeightFunction,
|
||||||
/// number of training samples to consider when estimating class for new point. Default value is 3.
|
/// number of training samples to consider when estimating class for new point. Default value is 3.
|
||||||
pub k: usize,
|
pub k: usize,
|
||||||
|
/// this parameter is not used
|
||||||
|
t: PhantomData<T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// K Nearest Neighbors Regressor
|
/// K Nearest Neighbors Regressor
|
||||||
@@ -63,12 +73,47 @@ pub struct KNNRegressor<T: RealNumber, D: Distance<Vec<T>, T>> {
|
|||||||
k: usize,
|
k: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for KNNRegressorParameters {
|
impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNRegressorParameters<T, D> {
|
||||||
|
/// number of training samples to consider when estimating class for new point. Default value is 3.
|
||||||
|
pub fn with_k(mut self, k: usize) -> Self {
|
||||||
|
self.k = k;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// a function that defines a distance between each pair of point in training data.
|
||||||
|
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
||||||
|
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
|
||||||
|
pub fn with_distance<DD: Distance<Vec<T>, T>>(
|
||||||
|
self,
|
||||||
|
distance: DD,
|
||||||
|
) -> KNNRegressorParameters<T, DD> {
|
||||||
|
KNNRegressorParameters {
|
||||||
|
distance,
|
||||||
|
algorithm: self.algorithm,
|
||||||
|
weight: self.weight,
|
||||||
|
k: self.k,
|
||||||
|
t: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default.
|
||||||
|
pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self {
|
||||||
|
self.algorithm = algorithm;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`.
|
||||||
|
pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self {
|
||||||
|
self.weight = weight;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber> Default for KNNRegressorParameters<T, Euclidian> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
KNNRegressorParameters {
|
KNNRegressorParameters {
|
||||||
|
distance: Distances::euclidian(),
|
||||||
algorithm: KNNAlgorithmName::CoverTree,
|
algorithm: KNNAlgorithmName::CoverTree,
|
||||||
weight: KNNWeightFunction::Uniform,
|
weight: KNNWeightFunction::Uniform,
|
||||||
k: 3,
|
k: 3,
|
||||||
|
t: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -88,19 +133,23 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for KNNRegressor<T, D> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>, D: Distance<Vec<T>, T>> Predictor<M, M::RowVector>
|
||||||
|
for KNNRegressor<T, D>
|
||||||
|
{
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNRegressor<T, D> {
|
impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNRegressor<T, D> {
|
||||||
/// Fits KNN regressor to a NxM matrix where N is number of samples and M is number of features.
|
/// Fits KNN regressor to a NxM matrix where N is number of samples and M is number of features.
|
||||||
/// * `x` - training data
|
/// * `x` - training data
|
||||||
/// * `y` - vector with real values
|
/// * `y` - vector with real values
|
||||||
/// * `distance` - a function that defines a distance between each pair of point in training data.
|
|
||||||
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
|
||||||
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
|
|
||||||
/// * `parameters` - additional parameters like search algorithm and k
|
/// * `parameters` - additional parameters like search algorithm and k
|
||||||
pub fn fit<M: Matrix<T>>(
|
pub fn fit<M: Matrix<T>>(
|
||||||
x: &M,
|
x: &M,
|
||||||
y: &M::RowVector,
|
y: &M::RowVector,
|
||||||
distance: D,
|
parameters: KNNRegressorParameters<T, D>,
|
||||||
parameters: KNNRegressorParameters,
|
|
||||||
) -> Result<KNNRegressor<T, D>, Failed> {
|
) -> Result<KNNRegressor<T, D>, Failed> {
|
||||||
let y_m = M::from_row_vector(y.clone());
|
let y_m = M::from_row_vector(y.clone());
|
||||||
|
|
||||||
@@ -126,7 +175,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNRegressor<T, D> {
|
|||||||
Ok(KNNRegressor {
|
Ok(KNNRegressor {
|
||||||
y: y.to_vec(),
|
y: y.to_vec(),
|
||||||
k: parameters.k,
|
k: parameters.k,
|
||||||
knn_algorithm: parameters.algorithm.fit(data, distance)?,
|
knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?,
|
||||||
weight: parameters.weight,
|
weight: parameters.weight,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -176,12 +225,11 @@ mod tests {
|
|||||||
let knn = KNNRegressor::fit(
|
let knn = KNNRegressor::fit(
|
||||||
&x,
|
&x,
|
||||||
&y,
|
&y,
|
||||||
Distances::euclidian(),
|
KNNRegressorParameters::default()
|
||||||
KNNRegressorParameters {
|
.with_k(3)
|
||||||
k: 3,
|
.with_distance(Distances::euclidian())
|
||||||
algorithm: KNNAlgorithmName::LinearSearch,
|
.with_algorithm(KNNAlgorithmName::LinearSearch)
|
||||||
weight: KNNWeightFunction::Distance,
|
.with_weight(KNNWeightFunction::Distance),
|
||||||
},
|
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let y_hat = knn.predict(&x).unwrap();
|
let y_hat = knn.predict(&x).unwrap();
|
||||||
@@ -197,7 +245,7 @@ mod tests {
|
|||||||
DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
|
DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
|
||||||
let y: Vec<f64> = vec![1., 2., 3., 4., 5.];
|
let y: Vec<f64> = vec![1., 2., 3., 4., 5.];
|
||||||
let y_exp = vec![2., 2., 3., 4., 4.];
|
let y_exp = vec![2., 2., 3., 4., 4.];
|
||||||
let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap();
|
let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap();
|
||||||
let y_hat = knn.predict(&x).unwrap();
|
let y_hat = knn.predict(&x).unwrap();
|
||||||
assert_eq!(5, Vec::len(&y_hat));
|
assert_eq!(5, Vec::len(&y_hat));
|
||||||
for i in 0..y_hat.len() {
|
for i in 0..y_hat.len() {
|
||||||
@@ -211,7 +259,7 @@ mod tests {
|
|||||||
DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
|
DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
|
||||||
let y = vec![1., 2., 3., 4., 5.];
|
let y = vec![1., 2., 3., 4., 5.];
|
||||||
|
|
||||||
let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap();
|
let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap();
|
let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ pub mod knn_regressor;
|
|||||||
pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName;
|
pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName;
|
||||||
|
|
||||||
/// Weight function that is used to determine estimated value.
|
/// Weight function that is used to determine estimated value.
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub enum KNNWeightFunction {
|
pub enum KNNWeightFunction {
|
||||||
/// All k nearest points are weighted equally
|
/// All k nearest points are weighted equally
|
||||||
Uniform,
|
Uniform,
|
||||||
|
|||||||
+4
-1
@@ -93,16 +93,18 @@ impl Kernels {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Linear Kernel
|
/// Linear Kernel
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct LinearKernel {}
|
pub struct LinearKernel {}
|
||||||
|
|
||||||
/// Radial basis function (Gaussian) kernel
|
/// Radial basis function (Gaussian) kernel
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct RBFKernel<T: RealNumber> {
|
pub struct RBFKernel<T: RealNumber> {
|
||||||
/// kernel coefficient
|
/// kernel coefficient
|
||||||
pub gamma: T,
|
pub gamma: T,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Polynomial kernel
|
/// Polynomial kernel
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct PolynomialKernel<T: RealNumber> {
|
pub struct PolynomialKernel<T: RealNumber> {
|
||||||
/// degree of the polynomial
|
/// degree of the polynomial
|
||||||
pub degree: T,
|
pub degree: T,
|
||||||
@@ -113,6 +115,7 @@ pub struct PolynomialKernel<T: RealNumber> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Sigmoid (hyperbolic tangent) kernel
|
/// Sigmoid (hyperbolic tangent) kernel
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct SigmoidKernel<T: RealNumber> {
|
pub struct SigmoidKernel<T: RealNumber> {
|
||||||
/// kernel coefficient
|
/// kernel coefficient
|
||||||
pub gamma: T,
|
pub gamma: T,
|
||||||
|
|||||||
+62
-36
@@ -57,13 +57,7 @@
|
|||||||
//! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0.,
|
//! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0.,
|
||||||
//! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.];
|
//! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.];
|
||||||
//!
|
//!
|
||||||
//! let svr = SVC::fit(&x, &y,
|
//! let svr = SVC::fit(&x, &y, SVCParameters::default().with_c(200.0)).unwrap();
|
||||||
//! Kernels::linear(),
|
|
||||||
//! SVCParameters {
|
|
||||||
//! epoch: 2,
|
|
||||||
//! c: 200.0,
|
|
||||||
//! tol: 1e-3,
|
|
||||||
//! }).unwrap();
|
|
||||||
//!
|
//!
|
||||||
//! let y_hat = svr.predict(&x).unwrap();
|
//! let y_hat = svr.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
@@ -84,22 +78,26 @@ use rand::seq::SliceRandom;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
use crate::svm::Kernel;
|
use crate::svm::{Kernel, Kernels, LinearKernel};
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
/// SVC Parameters
|
/// SVC Parameters
|
||||||
pub struct SVCParameters<T: RealNumber> {
|
pub struct SVCParameters<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> {
|
||||||
/// Number of epochs
|
/// Number of epochs.
|
||||||
pub epoch: usize,
|
pub epoch: usize,
|
||||||
/// Regularization parameter.
|
/// Regularization parameter.
|
||||||
pub c: T,
|
pub c: T,
|
||||||
/// Tolerance for stopping criterion
|
/// Tolerance for stopping criterion.
|
||||||
pub tol: T,
|
pub tol: T,
|
||||||
|
/// The kernel function.
|
||||||
|
pub kernel: K,
|
||||||
|
/// Unused parameter.
|
||||||
|
m: PhantomData<M>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
@@ -136,7 +134,7 @@ struct Cache<'a, T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> {
|
|||||||
struct Optimizer<'a, T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> {
|
struct Optimizer<'a, T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> {
|
||||||
x: &'a M,
|
x: &'a M,
|
||||||
y: &'a M::RowVector,
|
y: &'a M::RowVector,
|
||||||
parameters: &'a SVCParameters<T>,
|
parameters: &'a SVCParameters<T, M, K>,
|
||||||
svmin: usize,
|
svmin: usize,
|
||||||
svmax: usize,
|
svmax: usize,
|
||||||
gmin: T,
|
gmin: T,
|
||||||
@@ -147,27 +145,63 @@ struct Optimizer<'a, T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> {
|
|||||||
recalculate_minmax_grad: bool,
|
recalculate_minmax_grad: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> Default for SVCParameters<T> {
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVCParameters<T, M, K> {
|
||||||
|
/// Number of epochs.
|
||||||
|
pub fn with_epoch(mut self, epoch: usize) -> Self {
|
||||||
|
self.epoch = epoch;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Regularization parameter.
|
||||||
|
pub fn with_c(mut self, c: T) -> Self {
|
||||||
|
self.c = c;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Tolerance for stopping criterion.
|
||||||
|
pub fn with_tol(mut self, tol: T) -> Self {
|
||||||
|
self.tol = tol;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The kernel function.
|
||||||
|
pub fn with_kernel<KK: Kernel<T, M::RowVector>>(&self, kernel: KK) -> SVCParameters<T, M, KK> {
|
||||||
|
SVCParameters {
|
||||||
|
epoch: self.epoch,
|
||||||
|
c: self.c,
|
||||||
|
tol: self.tol,
|
||||||
|
kernel,
|
||||||
|
m: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Default for SVCParameters<T, M, LinearKernel> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
SVCParameters {
|
SVCParameters {
|
||||||
epoch: 2,
|
epoch: 2,
|
||||||
c: T::one(),
|
c: T::one(),
|
||||||
tol: T::from_f64(1e-3).unwrap(),
|
tol: T::from_f64(1e-3).unwrap(),
|
||||||
|
kernel: Kernels::linear(),
|
||||||
|
m: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector>
|
||||||
|
for SVC<T, M, K>
|
||||||
|
{
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVC<T, M, K> {
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVC<T, M, K> {
|
||||||
/// Fits SVC to your data.
|
/// Fits SVC to your data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
/// * `y` - class labels
|
/// * `y` - class labels
|
||||||
/// * `kernel` - the kernel function
|
|
||||||
/// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values.
|
/// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values.
|
||||||
pub fn fit(
|
pub fn fit(
|
||||||
x: &M,
|
x: &M,
|
||||||
y: &M::RowVector,
|
y: &M::RowVector,
|
||||||
kernel: K,
|
parameters: SVCParameters<T, M, K>,
|
||||||
parameters: SVCParameters<T>,
|
|
||||||
) -> Result<SVC<T, M, K>, Failed> {
|
) -> Result<SVC<T, M, K>, Failed> {
|
||||||
let (n, _) = x.shape();
|
let (n, _) = x.shape();
|
||||||
|
|
||||||
@@ -198,13 +232,13 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVC<T, M, K> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let optimizer = Optimizer::new(x, &y, &kernel, ¶meters);
|
let optimizer = Optimizer::new(x, &y, ¶meters.kernel, ¶meters);
|
||||||
|
|
||||||
let (support_vectors, weight, b) = optimizer.optimize();
|
let (support_vectors, weight, b) = optimizer.optimize();
|
||||||
|
|
||||||
Ok(SVC {
|
Ok(SVC {
|
||||||
classes,
|
classes,
|
||||||
kernel,
|
kernel: parameters.kernel,
|
||||||
instances: support_vectors,
|
instances: support_vectors,
|
||||||
w: weight,
|
w: weight,
|
||||||
b,
|
b,
|
||||||
@@ -321,7 +355,7 @@ impl<'a, T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Optimizer<'a,
|
|||||||
x: &'a M,
|
x: &'a M,
|
||||||
y: &'a M::RowVector,
|
y: &'a M::RowVector,
|
||||||
kernel: &'a K,
|
kernel: &'a K,
|
||||||
parameters: &'a SVCParameters<T>,
|
parameters: &'a SVCParameters<T, M, K>,
|
||||||
) -> Optimizer<'a, T, M, K> {
|
) -> Optimizer<'a, T, M, K> {
|
||||||
let (n, _) = x.shape();
|
let (n, _) = x.shape();
|
||||||
|
|
||||||
@@ -711,18 +745,13 @@ mod tests {
|
|||||||
let y_hat = SVC::fit(
|
let y_hat = SVC::fit(
|
||||||
&x,
|
&x,
|
||||||
&y,
|
&y,
|
||||||
Kernels::linear(),
|
SVCParameters::default()
|
||||||
SVCParameters {
|
.with_c(200.0)
|
||||||
epoch: 2,
|
.with_kernel(Kernels::linear()),
|
||||||
c: 200.0,
|
|
||||||
tol: 1e-3,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
.and_then(|lr| lr.predict(&x))
|
.and_then(|lr| lr.predict(&x))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
println!("{:?}", y_hat);
|
|
||||||
|
|
||||||
assert!(accuracy(&y_hat, &y) >= 0.9);
|
assert!(accuracy(&y_hat, &y) >= 0.9);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -759,12 +788,9 @@ mod tests {
|
|||||||
let y_hat = SVC::fit(
|
let y_hat = SVC::fit(
|
||||||
&x,
|
&x,
|
||||||
&y,
|
&y,
|
||||||
Kernels::rbf(0.7),
|
SVCParameters::default()
|
||||||
SVCParameters {
|
.with_c(1.0)
|
||||||
epoch: 2,
|
.with_kernel(Kernels::rbf(0.7)),
|
||||||
c: 1.0,
|
|
||||||
tol: 1e-3,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
.and_then(|lr| lr.predict(&x))
|
.and_then(|lr| lr.predict(&x))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -801,7 +827,7 @@ mod tests {
|
|||||||
-1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
-1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
||||||
];
|
];
|
||||||
|
|
||||||
let svr = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap();
|
let svr = SVC::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let deserialized_svr: SVC<f64, DenseMatrix<f64>, LinearKernel> =
|
let deserialized_svr: SVC<f64, DenseMatrix<f64>, LinearKernel> =
|
||||||
serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap();
|
serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap();
|
||||||
|
|||||||
+59
-32
@@ -49,13 +49,7 @@
|
|||||||
//! let y: Vec<f64> = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0,
|
//! let y: Vec<f64> = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0,
|
||||||
//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9];
|
//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9];
|
||||||
//!
|
//!
|
||||||
//! let svr = SVR::fit(&x, &y,
|
//! let svr = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)).unwrap();
|
||||||
//! LinearKernel {},
|
|
||||||
//! SVRParameters {
|
|
||||||
//! eps: 2.0,
|
|
||||||
//! c: 10.0,
|
|
||||||
//! tol: 1e-3,
|
|
||||||
//! }).unwrap();
|
|
||||||
//!
|
//!
|
||||||
//! let y_hat = svr.predict(&x).unwrap();
|
//! let y_hat = svr.predict(&x).unwrap();
|
||||||
//! ```
|
//! ```
|
||||||
@@ -72,25 +66,30 @@
|
|||||||
|
|
||||||
use std::cell::{Ref, RefCell};
|
use std::cell::{Ref, RefCell};
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
use crate::svm::Kernel;
|
use crate::svm::{Kernel, Kernels, LinearKernel};
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
/// SVR Parameters
|
/// SVR Parameters
|
||||||
pub struct SVRParameters<T: RealNumber> {
|
pub struct SVRParameters<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> {
|
||||||
/// Epsilon in the epsilon-SVR model
|
/// Epsilon in the epsilon-SVR model.
|
||||||
pub eps: T,
|
pub eps: T,
|
||||||
/// Regularization parameter.
|
/// Regularization parameter.
|
||||||
pub c: T,
|
pub c: T,
|
||||||
/// Tolerance for stopping criterion
|
/// Tolerance for stopping criterion.
|
||||||
pub tol: T,
|
pub tol: T,
|
||||||
|
/// The kernel function.
|
||||||
|
pub kernel: K,
|
||||||
|
/// Unused parameter.
|
||||||
|
m: PhantomData<M>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
@@ -135,16 +134,54 @@ struct Cache<T: Clone> {
|
|||||||
data: Vec<RefCell<Option<Vec<T>>>>,
|
data: Vec<RefCell<Option<Vec<T>>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> Default for SVRParameters<T> {
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVRParameters<T, M, K> {
|
||||||
|
/// Epsilon in the epsilon-SVR model.
|
||||||
|
pub fn with_eps(mut self, eps: T) -> Self {
|
||||||
|
self.eps = eps;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Regularization parameter.
|
||||||
|
pub fn with_c(mut self, c: T) -> Self {
|
||||||
|
self.c = c;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Tolerance for stopping criterion.
|
||||||
|
pub fn with_tol(mut self, tol: T) -> Self {
|
||||||
|
self.tol = tol;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The kernel function.
|
||||||
|
pub fn with_kernel<KK: Kernel<T, M::RowVector>>(&self, kernel: KK) -> SVRParameters<T, M, KK> {
|
||||||
|
SVRParameters {
|
||||||
|
eps: self.eps,
|
||||||
|
c: self.c,
|
||||||
|
tol: self.tol,
|
||||||
|
kernel,
|
||||||
|
m: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Default for SVRParameters<T, M, LinearKernel> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
SVRParameters {
|
SVRParameters {
|
||||||
eps: T::from_f64(0.1).unwrap(),
|
eps: T::from_f64(0.1).unwrap(),
|
||||||
c: T::one(),
|
c: T::one(),
|
||||||
tol: T::from_f64(1e-3).unwrap(),
|
tol: T::from_f64(1e-3).unwrap(),
|
||||||
|
kernel: Kernels::linear(),
|
||||||
|
m: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector>
|
||||||
|
for SVR<T, M, K>
|
||||||
|
{
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVR<T, M, K> {
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVR<T, M, K> {
|
||||||
/// Fits SVR to your data.
|
/// Fits SVR to your data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
@@ -154,8 +191,7 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVR<T, M, K> {
|
|||||||
pub fn fit(
|
pub fn fit(
|
||||||
x: &M,
|
x: &M,
|
||||||
y: &M::RowVector,
|
y: &M::RowVector,
|
||||||
kernel: K,
|
parameters: SVRParameters<T, M, K>,
|
||||||
parameters: SVRParameters<T>,
|
|
||||||
) -> Result<SVR<T, M, K>, Failed> {
|
) -> Result<SVR<T, M, K>, Failed> {
|
||||||
let (n, _) = x.shape();
|
let (n, _) = x.shape();
|
||||||
|
|
||||||
@@ -165,12 +201,12 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVR<T, M, K> {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
let optimizer = Optimizer::new(x, y, &kernel, ¶meters);
|
let optimizer = Optimizer::new(x, y, ¶meters.kernel, ¶meters);
|
||||||
|
|
||||||
let (support_vectors, weight, b) = optimizer.smo();
|
let (support_vectors, weight, b) = optimizer.smo();
|
||||||
|
|
||||||
Ok(SVR {
|
Ok(SVR {
|
||||||
kernel,
|
kernel: parameters.kernel,
|
||||||
instances: support_vectors,
|
instances: support_vectors,
|
||||||
w: weight,
|
w: weight,
|
||||||
b,
|
b,
|
||||||
@@ -243,7 +279,7 @@ impl<'a, T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Optimizer<'a,
|
|||||||
x: &M,
|
x: &M,
|
||||||
y: &M::RowVector,
|
y: &M::RowVector,
|
||||||
kernel: &'a K,
|
kernel: &'a K,
|
||||||
parameters: &SVRParameters<T>,
|
parameters: &SVRParameters<T, M, K>,
|
||||||
) -> Optimizer<'a, T, M, K> {
|
) -> Optimizer<'a, T, M, K> {
|
||||||
let (n, _) = x.shape();
|
let (n, _) = x.shape();
|
||||||
|
|
||||||
@@ -510,18 +546,9 @@ mod tests {
|
|||||||
114.2, 115.7, 116.9,
|
114.2, 115.7, 116.9,
|
||||||
];
|
];
|
||||||
|
|
||||||
let y_hat = SVR::fit(
|
let y_hat = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0))
|
||||||
&x,
|
.and_then(|lr| lr.predict(&x))
|
||||||
&y,
|
.unwrap();
|
||||||
LinearKernel {},
|
|
||||||
SVRParameters {
|
|
||||||
eps: 2.0,
|
|
||||||
c: 10.0,
|
|
||||||
tol: 1e-3,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.and_then(|lr| lr.predict(&x))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert!(mean_squared_error(&y_hat, &y) < 2.5);
|
assert!(mean_squared_error(&y_hat, &y) < 2.5);
|
||||||
}
|
}
|
||||||
@@ -552,7 +579,7 @@ mod tests {
|
|||||||
114.2, 115.7, 116.9,
|
114.2, 115.7, 116.9,
|
||||||
];
|
];
|
||||||
|
|
||||||
let svr = SVR::fit(&x, &y, LinearKernel {}, Default::default()).unwrap();
|
let svr = SVR::fit(&x, &y, Default::default()).unwrap();
|
||||||
|
|
||||||
let deserialized_svr: SVR<f64, DenseMatrix<f64>, LinearKernel> =
|
let deserialized_svr: SVR<f64, DenseMatrix<f64>, LinearKernel> =
|
||||||
serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap();
|
serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap();
|
||||||
|
|||||||
@@ -71,11 +71,12 @@ use rand::seq::SliceRandom;
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::algorithm::sort::quick_sort::QuickArgSort;
|
use crate::algorithm::sort::quick_sort::QuickArgSort;
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
/// Parameters of Decision Tree
|
/// Parameters of Decision Tree
|
||||||
pub struct DecisionTreeClassifierParameters {
|
pub struct DecisionTreeClassifierParameters {
|
||||||
/// Split criteria to use when building a tree.
|
/// Split criteria to use when building a tree.
|
||||||
@@ -160,6 +161,29 @@ impl<T: RealNumber> PartialEq for Node<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl DecisionTreeClassifierParameters {
|
||||||
|
/// Split criteria to use when building a tree.
|
||||||
|
pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self {
|
||||||
|
self.criterion = criterion;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The maximum depth of the tree.
|
||||||
|
pub fn with_max_depth(mut self, max_depth: u16) -> Self {
|
||||||
|
self.max_depth = Some(max_depth);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The minimum number of samples required to be at a leaf node.
|
||||||
|
pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self {
|
||||||
|
self.min_samples_leaf = min_samples_leaf;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The minimum number of samples required to split an internal node.
|
||||||
|
pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self {
|
||||||
|
self.min_samples_split = min_samples_split;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Default for DecisionTreeClassifierParameters {
|
impl Default for DecisionTreeClassifierParameters {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
DecisionTreeClassifierParameters {
|
DecisionTreeClassifierParameters {
|
||||||
@@ -269,6 +293,12 @@ pub(in crate) fn which_max(x: &[usize]) -> usize {
|
|||||||
which
|
which
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for DecisionTreeClassifier<T> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> DecisionTreeClassifier<T> {
|
impl<T: RealNumber> DecisionTreeClassifier<T> {
|
||||||
/// Build a decision tree classifier from the training data.
|
/// Build a decision tree classifier from the training data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
|
|||||||
@@ -66,11 +66,12 @@ use rand::seq::SliceRandom;
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::algorithm::sort::quick_sort::QuickArgSort;
|
use crate::algorithm::sort::quick_sort::QuickArgSort;
|
||||||
|
use crate::base::Predictor;
|
||||||
use crate::error::Failed;
|
use crate::error::Failed;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
/// Parameters of Regression Tree
|
/// Parameters of Regression Tree
|
||||||
pub struct DecisionTreeRegressorParameters {
|
pub struct DecisionTreeRegressorParameters {
|
||||||
/// The maximum depth of the tree.
|
/// The maximum depth of the tree.
|
||||||
@@ -100,6 +101,24 @@ struct Node<T: RealNumber> {
|
|||||||
false_child: Option<usize>,
|
false_child: Option<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl DecisionTreeRegressorParameters {
|
||||||
|
/// The maximum depth of the tree.
|
||||||
|
pub fn with_max_depth(mut self, max_depth: u16) -> Self {
|
||||||
|
self.max_depth = Some(max_depth);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The minimum number of samples required to be at a leaf node.
|
||||||
|
pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self {
|
||||||
|
self.min_samples_leaf = min_samples_leaf;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// The minimum number of samples required to split an internal node.
|
||||||
|
pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self {
|
||||||
|
self.min_samples_split = min_samples_split;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Default for DecisionTreeRegressorParameters {
|
impl Default for DecisionTreeRegressorParameters {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
DecisionTreeRegressorParameters {
|
DecisionTreeRegressorParameters {
|
||||||
@@ -189,6 +208,12 @@ impl<'a, T: RealNumber, M: Matrix<T>> NodeVisitor<'a, T, M> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for DecisionTreeRegressor<T> {
|
||||||
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
|
self.predict(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> DecisionTreeRegressor<T> {
|
impl<T: RealNumber> DecisionTreeRegressor<T> {
|
||||||
/// Build a decision tree regressor from the training data.
|
/// Build a decision tree regressor from the training data.
|
||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
|
|||||||
Reference in New Issue
Block a user