diff --git a/.gitignore b/.gitignore index d770de9..e4ee4c2 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ Cargo.lock .idea .project .vscode +smartcore.code-workspace # OS .DS_Store diff --git a/Cargo.toml b/Cargo.toml index 422abe5..e339654 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,12 @@ [package] name = "smartcore" version = "0.1.0" -authors = ["Vlad Orlov"] +authors = ["SmartCore Developers"] edition = "2018" [dependencies] ndarray = "0.13" -nalgebra = "0.21.1" +nalgebra = "0.22.0" num-traits = "0.2.12" num = "0.3.0" rand = "0.7.3" diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 3be1133..413e4d8 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -66,7 +66,7 @@ impl> CoverTree { } } - pub fn new_node(&mut self, parent: Option, data: T) -> NodeId { + fn new_node(&mut self, parent: Option, data: T) -> NodeId { let next_index = self.nodes.len(); let node_id = NodeId { index: next_index }; self.nodes.push(Node { @@ -300,7 +300,7 @@ impl> CoverTree { } #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -pub struct NodeId { +struct NodeId { index: usize, } diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 1caf84b..52d117d 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -1,3 +1,3 @@ -pub mod bbd_tree; +pub(crate) mod bbd_tree; pub mod cover_tree; pub mod linear_search; diff --git a/src/common/mod.rs b/src/common/mod.rs deleted file mode 100644 index a70976d..0000000 --- a/src/common/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -use ndarray::ScalarOperand; -use num_traits::{FromPrimitive, Num, One, ToPrimitive, Zero}; -use std::fmt::Debug; -use std::hash::Hash; - -pub trait AnyNumber: Num + ScalarOperand + ToPrimitive + FromPrimitive {} - -pub trait Nominal: - PartialEq + Zero + One + Eq + Hash + ToPrimitive + FromPrimitive + Debug + 'static + Clone -{ -} - -impl AnyNumber for T 
where T: Num + ScalarOperand + ToPrimitive + FromPrimitive {} - -impl Nominal for T where - T: PartialEq + Zero + One + Eq + Hash + ToPrimitive + Debug + FromPrimitive + 'static + Clone -{ -} diff --git a/src/lib.rs b/src/lib.rs index f085a64..6822702 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,87 @@ +#![warn(missing_docs)] +#![warn(missing_doc_code_examples)] + +//! # SmartCore +//! +//! Welcome to SmartCore library, the most complete machine learning library for Rust! +//! +//! In SmartCore you will find implementation of these ML algorithms: +//! * Regression: Linear Regression (OLS), Decision Tree Regressor, Random Forest Regressor +//! * Classification: Logistic Regressor, Decision Tree Classifier, Random Forest Classifier, Unsupervised Nearest Neighbors (KNN) +//! * Clustering: K-Means +//! * Matrix decomposition: PCA, LU, QR, SVD, EVD +//! * Distance Metrics: Euclidian, Minkowski, Manhattan, Hamming, Mahalanobis +//! * Evaluation Metrics: Accuracy, AUC, Recall, Precision, F1, Mean Absolute Error, Mean Squared Error, R2 +//! +//! Most of the algorithms implemented in SmartCore operate on n-dimensional arrays. While you can use Rust vectors with all functions defined in this library +//! we recommend going with one of the popular linear algebra libraries available in Rust. At this moment we support these packages: +//! * [ndarray](https://docs.rs/ndarray) +//! * [nalgebra](https://docs.rs/nalgebra/) +//! +//! ## Getting Started +//! +//! To start using SmartCore simply add the following to your Cargo.toml file: +//! ```ignore +//! [dependencies] +//! smartcore = "0.1.0" +//! ``` +//! +//! All ML algorithms in SmartCore are grouped into these generic categories: +//! * [Clustering](cluster/index.html), unsupervised clustering of unlabeled data. +//! * [Matrix Decomposition](decomposition/index.html), various methods for matrix decomposition. +//! 
* [Linear Models](linear/index.html), regression and classification methods where output is assumed to have linear relation to explanatory variables +//! * [Ensemble Models](ensemble/index.html), variety of regression and classification ensemble models +//! * [Tree-based Models](tree/index.html), classification and regression trees +//! * [Nearest Neighbors](neighbors/index.html), K Nearest Neighbors for classification and regression +//! +//! Each category is assigned to a separate module. +//! +//! For example, KNN classifier is defined in [smartcore::neighbors::knn](neighbors/knn/index.html). To train and run it using standard Rust vectors you will +//! run this code: +//! +//! ``` +//! // DenseMatrix definition +//! use smartcore::linalg::naive::dense_matrix::*; +//! // KNNClassifier +//! use smartcore::neighbors::knn::*; +//! // Various distance metrics +//! use smartcore::math::distance::*; +//! +//! // Turn Rust vectors with samples into a matrix +//! let x = DenseMatrix::from_array(&[ +//! &[1., 2.], +//! &[3., 4.], +//! &[5., 6.], +//! &[7., 8.], +//! &[9., 10.]]); +//! // Our classes are defined as a Vector +//! let y = vec![2., 2., 2., 3., 3.]; +//! +//! // Train classifier +//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()); +//! +//! // Predict classes +//! let y_hat = knn.predict(&x); +//! ``` + +/// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; +/// Algorithms for clustering of unlabeled data pub mod cluster; -pub mod common; +/// Matrix decomposition algorithms pub mod decomposition; +/// Ensemble methods, including Random Forest classifier and regressor pub mod ensemble; +/// Diverse collection of linear algebra abstractions and methods that power SmartCore algorithms pub mod linalg; +/// Supervised classification and regression models that assume linear relationship between dependent and explanatory variables. 
pub mod linear; +/// Multitude of helper methods and classes, including definitions of distance metrics pub mod math; +/// Functions for assessing prediction error. pub mod metrics; +/// Supervised neighbors-based learning methods pub mod neighbors; -pub mod optimization; +pub(crate) mod optimization; +/// Supervised tree-based learning methods pub mod tree; diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 271480c..fdbaa16 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -6,16 +6,29 @@ use crate::linalg::Matrix; use crate::math::num::FloatExt; #[derive(Serialize, Deserialize, Debug)] -pub enum LinearRegressionSolver { +pub enum LinearRegressionSolverName { QR, SVD, } +#[derive(Serialize, Deserialize, Debug)] +pub struct LinearRegressionParameters { + solver: LinearRegressionSolverName, +} + #[derive(Serialize, Deserialize, Debug)] pub struct LinearRegression> { coefficients: M, intercept: T, - solver: LinearRegressionSolver, + solver: LinearRegressionSolverName, +} + +impl Default for LinearRegressionParameters { + fn default() -> Self { + LinearRegressionParameters { + solver: LinearRegressionSolverName::SVD + } + } } impl> PartialEq for LinearRegression { @@ -26,7 +39,7 @@ impl> PartialEq for LinearRegression { } impl> LinearRegression { - pub fn fit(x: &M, y: &M::RowVector, solver: LinearRegressionSolver) -> LinearRegression { + pub fn fit(x: &M, y: &M::RowVector, parameters: LinearRegressionParameters) -> LinearRegression { let y_m = M::from_row_vector(y.clone()); let b = y_m.transpose(); let (x_nrows, num_attributes) = x.shape(); @@ -38,9 +51,9 @@ impl> LinearRegression { let a = x.v_stack(&M::ones(x_nrows, 1)); - let w = match solver { - LinearRegressionSolver::QR => a.qr_solve_mut(b), - LinearRegressionSolver::SVD => a.svd_solve_mut(b), + let w = match parameters.solver { + LinearRegressionSolverName::QR => a.qr_solve_mut(b), + LinearRegressionSolverName::SVD => a.svd_solve_mut(b), }; 
let wights = w.slice(0..num_attributes, 0..1); @@ -48,7 +61,7 @@ impl> LinearRegression { LinearRegression { intercept: w.get(num_attributes, 0), coefficients: wights, - solver: solver, + solver: parameters.solver, } } @@ -90,9 +103,9 @@ mod tests { 114.2, 115.7, 116.9, ]); - let y_hat_qr = LinearRegression::fit(&x, &y, LinearRegressionSolver::QR).predict(&x); + let y_hat_qr = LinearRegression::fit(&x, &y, LinearRegressionParameters{solver: LinearRegressionSolverName::QR}).predict(&x); - let y_hat_svd = LinearRegression::fit(&x, &y, LinearRegressionSolver::SVD).predict(&x); + let y_hat_svd = LinearRegression::fit(&x, &y, Default::default()).predict(&x); assert!(y .iter() @@ -130,9 +143,9 @@ mod tests { 114.2, 115.7, 116.9, ]; - let y_hat_qr = LinearRegression::fit(&x, &y, LinearRegressionSolver::QR).predict(&x); + let y_hat_qr = LinearRegression::fit(&x, &y, LinearRegressionParameters{solver: LinearRegressionSolverName::QR}).predict(&x); - let y_hat_svd = LinearRegression::fit(&x, &y, LinearRegressionSolver::SVD).predict(&x); + let y_hat_svd = LinearRegression::fit(&x, &y, Default::default()).predict(&x); assert!(y .iter() @@ -170,7 +183,7 @@ mod tests { 114.2, 115.7, 116.9, ]; - let lr = LinearRegression::fit(&x, &y, LinearRegressionSolver::QR); + let lr = LinearRegression::fit(&x, &y, Default::default()); let deserialized_lr: LinearRegression> = serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); diff --git a/src/math/mod.rs b/src/math/mod.rs index da8b7ba..2c6e226 100644 --- a/src/math/mod.rs +++ b/src/math/mod.rs @@ -1,2 +1,2 @@ pub mod distance; -pub mod num; +pub(crate) mod num; diff --git a/src/neighbors/knn.rs b/src/neighbors/knn.rs index d160296..9af48f0 100644 --- a/src/neighbors/knn.rs +++ b/src/neighbors/knn.rs @@ -7,44 +7,60 @@ use crate::math::distance::Distance; use crate::math::num::FloatExt; #[derive(Serialize, Deserialize, Debug)] -pub struct KNNClassifier, T>> { - classes: Vec, - y: Vec, - knn_algorithm: KNNAlgorithmV, - k: 
usize, -} - pub enum KNNAlgorithmName { LinearSearch, CoverTree, } #[derive(Serialize, Deserialize, Debug)] -pub enum KNNAlgorithmV, T>> { +pub struct KNNClassifierParameters { + pub algorithm: KNNAlgorithmName, + pub k: usize +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct KNNClassifier, T>> { + classes: Vec, + y: Vec, + knn_algorithm: KNNAlgorithm, + k: usize, +} + +#[derive(Serialize, Deserialize, Debug)] +enum KNNAlgorithm, T>> { LinearSearch(LinearKNNSearch, T, D>), CoverTree(CoverTree, T, D>), } +impl Default for KNNClassifierParameters { + fn default() -> Self { + KNNClassifierParameters { + algorithm: KNNAlgorithmName::CoverTree, + k: 3 + } + } +} + impl KNNAlgorithmName { fn fit, T>>( &self, data: Vec>, distance: D, - ) -> KNNAlgorithmV { + ) -> KNNAlgorithm { match *self { KNNAlgorithmName::LinearSearch => { - KNNAlgorithmV::LinearSearch(LinearKNNSearch::new(data, distance)) + KNNAlgorithm::LinearSearch(LinearKNNSearch::new(data, distance)) } - KNNAlgorithmName::CoverTree => KNNAlgorithmV::CoverTree(CoverTree::new(data, distance)), + KNNAlgorithmName::CoverTree => KNNAlgorithm::CoverTree(CoverTree::new(data, distance)), } } } -impl, T>> KNNAlgorithmV { +impl, T>> KNNAlgorithm { fn find(&self, from: &Vec, k: usize) -> Vec { match *self { - KNNAlgorithmV::LinearSearch(ref linear) => linear.find(from, k), - KNNAlgorithmV::CoverTree(ref cover) => cover.find(from, k), + KNNAlgorithm::LinearSearch(ref linear) => linear.find(from, k), + KNNAlgorithm::CoverTree(ref cover) => cover.find(from, k), } } } @@ -76,9 +92,8 @@ impl, T>> KNNClassifier { pub fn fit>( x: &M, y: &M::RowVector, - k: usize, distance: D, - algorithm: KNNAlgorithmName, + parameters: KNNClassifierParameters ) -> KNNClassifier { let y_m = M::from_row_vector(y.clone()); @@ -103,13 +118,13 @@ impl, T>> KNNClassifier { ) ); - assert!(k > 1, format!("k should be > 1, k=[{}]", k)); + assert!(parameters.k > 1, format!("k should be > 1, k=[{}]", parameters.k)); KNNClassifier { classes: 
classes, y: yi, - k: k, - knn_algorithm: algorithm.fit(data, distance), + k: parameters.k, + knn_algorithm: parameters.algorithm.fit(data, distance), } } @@ -153,9 +168,8 @@ mod tests { let knn = KNNClassifier::fit( &x, &y, - 3, Distances::euclidian(), - KNNAlgorithmName::LinearSearch, + KNNClassifierParameters{k: 3, algorithm: KNNAlgorithmName::LinearSearch} ); let r = knn.predict(&x); assert_eq!(5, Vec::len(&r)); @@ -169,10 +183,9 @@ mod tests { let knn = KNNClassifier::fit( &x, - &y, - 3, + &y, Distances::euclidian(), - KNNAlgorithmName::CoverTree, + Default::default() ); let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/optimization/mod.rs b/src/optimization/mod.rs index f592d86..feb0322 100644 --- a/src/optimization/mod.rs +++ b/src/optimization/mod.rs @@ -5,8 +5,7 @@ pub type F<'a, T, X> = dyn for<'b> Fn(&'b X) -> T + 'a; pub type DF<'a, X> = dyn for<'b> Fn(&'b mut X, &'b X) + 'a; #[derive(Debug, PartialEq)] -pub enum FunctionOrder { - FIRST, +pub enum FunctionOrder { SECOND, THIRD, }