diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 708b415..f228aed 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -26,7 +26,8 @@ //! * ["Faster cover trees." Izbicki et al., Proceedings of the 32nd International Conference on Machine Learning, ICML'15 (2015)](http://www.cs.ucr.edu/~cshelton/papers/index.cgi%3FIzbShe15) //! * ["The Elements of Statistical Learning: Data Mining, Inference, and Prediction" Trevor et al., 2nd edition, chapter 13](https://web.stanford.edu/~hastie/ElemStatLearn/) //! -//! +//! +//! pub(crate) mod bbd_tree; /// tree data structure for fast nearest neighbor search diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index b565ce4..9b1f540 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -43,7 +43,8 @@ //! //! ``` //! -//! +//! +//! use std::fmt::Debug; use serde::{Deserialize, Serialize}; diff --git a/src/ensemble/mod.rs b/src/ensemble/mod.rs index 249d33c..1ddf4b4 100644 --- a/src/ensemble/mod.rs +++ b/src/ensemble/mod.rs @@ -1,2 +1,22 @@ +//! # Ensemble Methods +//! +//! Combining predictions of several base estimators is a general-purpose procedure for reducing the variance of a statistical learning method. +//! When combined with bagging, ensemble models achieve superior performance to individual estimators. +//! +//! The main idea behind bagging (or bootstrap aggregation) is to fit the same base model to a big number of random subsets of the original training +//! set and then aggregate their individual predictions to form a final prediction. In a classification setting the overall prediction is the most commonly +//! occurring majority class among the individual predictions. +//! +//! In SmartCore you will find an implementation of RandomForest - a popular averaging algorithm based on randomized [decision trees](../tree/index.html). +//! 
Random forests provide an improvement over bagged trees by way of a small tweak that decorrelates the trees. As in bagging, we build a number of +//! decision trees on bootstrapped training samples. But when building these decision trees, each time a split in a tree is considered, +//! a random sample of _m_ predictors is chosen as split candidates from the full set of _p_ predictors. +//! +//! ## References: +//! +//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 8.2 Bagging, Random Forests, Boosting](http://faculty.marshall.usc.edu/gareth-james/ISL/) + +/// Random forest classifier pub mod random_forest_classifier; +/// Random forest regressor pub mod random_forest_regressor; diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index b99a0da..418f583 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -1,3 +1,50 @@ +//! # Random Forest Classifier +//! A random forest is an ensemble estimator that fits multiple [decision trees](../../tree/index.html) to random subsets of the dataset and averages predictions +//! to improve the predictive accuracy and control over-fitting. See [ensemble models](../index.html) for more details. +//! +//! Bigger number of estimators in general improves performance of the algorithm with an increased cost of training time. +//! The random sample of _m_ predictors is typically set to be \\(\sqrt{p}\\) from the full set of _p_ predictors. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::ensemble::random_forest_classifier::*; +//! +//! // Iris dataset +//! let x = DenseMatrix::from_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! 
&[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! let y = vec![ +//! 0., 0., 0., 0., 0., 0., 0., 0., +//! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., +//! ]; +//! +//! let classifier = RandomForestClassifier::fit(&x, &y, Default::default()); +//! let y_hat = classifier.predict(&x); // use the same data for prediction +//! ``` +//! +//! +//! extern crate rand; use std::default::Default; @@ -12,16 +59,25 @@ use crate::tree::decision_tree_classifier::{ which_max, DecisionTreeClassifier, DecisionTreeClassifierParameters, SplitCriterion, }; +/// Parameters of the Random Forest algorithm. +/// Some parameters here are passed directly into base estimator. #[derive(Serialize, Deserialize, Debug, Clone)] pub struct RandomForestClassifierParameters { + /// Split criteria to use when building a tree. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) pub criterion: SplitCriterion, + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) pub max_depth: Option, + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) pub min_samples_leaf: usize, + /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) pub min_samples_split: usize, + /// The number of trees in the forest. pub n_trees: u16, - pub mtry: Option, + /// Number of random sample of predictors to use as split candidates. 
+ pub m: Option, } +/// Random Forest Classifier #[derive(Serialize, Deserialize, Debug)] pub struct RandomForestClassifier { parameters: RandomForestClassifierParameters, @@ -57,12 +113,15 @@ impl Default for RandomForestClassifierParameters { min_samples_leaf: 1, min_samples_split: 2, n_trees: 100, - mtry: Option::None, + m: Option::None, } } } impl RandomForestClassifier { + /// Build a forest of trees from the training set. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - the target class values pub fn fit>( x: &M, y: &M::RowVector, @@ -79,7 +138,7 @@ impl RandomForestClassifier { yi[i] = classes.iter().position(|c| yc == *c).unwrap(); } - let mtry = parameters.mtry.unwrap_or( + let mtry = parameters.m.unwrap_or( (T::from(num_attributes).unwrap()) .sqrt() .floor() @@ -110,6 +169,8 @@ impl RandomForestClassifier { } } + /// Predict class for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. pub fn predict>(&self, x: &M) -> M::RowVector { let mut result = M::zeros(1, x.shape().0); @@ -199,7 +260,7 @@ mod tests { min_samples_leaf: 1, min_samples_split: 2, n_trees: 1000, - mtry: Option::None, + m: Option::None, }, ); diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index d651fb7..447ff3b 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -1,3 +1,47 @@ +//! # Random Forest Regressor +//! A random forest is an ensemble estimator that fits multiple [decision trees](../../tree/index.html) to random subsets of the dataset and averages predictions +//! to improve the predictive accuracy and control over-fitting. See [ensemble models](../index.html) for more details. +//! +//! Bigger number of estimators in general improves performance of the algorithm with an increased cost of training time. +//! 
The random sample of _m_ predictors is typically set to be \\(\sqrt{p}\\) from the full set of _p_ predictors. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::ensemble::random_forest_regressor::*; +//! +//! // Longley dataset (https://www.statsmodels.org/stable/datasets/generated/longley.html) +//! let x = DenseMatrix::from_array(&[ +//! &[234.289, 235.6, 159., 107.608, 1947., 60.323], +//! &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], +//! &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], +//! &[284.599, 335.1, 165., 110.929, 1950., 61.187], +//! &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], +//! &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], +//! &[365.385, 187., 354.7, 115.094, 1953., 64.989], +//! &[363.112, 357.8, 335., 116.219, 1954., 63.761], +//! &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], +//! &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], +//! &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], +//! &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], +//! &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], +//! &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], +//! &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], +//! &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], +//! ]); +//! let y = vec![ +//! 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, +//! 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9 +//! ]; +//! +//! let regressor = RandomForestRegressor::fit(&x, &y, Default::default()); +//! +//! let y_hat = regressor.predict(&x); // use the same data for prediction +//! ``` +//! +//! +//! extern crate rand; use std::default::Default; @@ -13,14 +57,22 @@ use crate::tree::decision_tree_regressor::{ }; #[derive(Serialize, Deserialize, Debug, Clone)] +/// Parameters of the Random Forest Regressor +/// Some parameters here are passed directly into base estimator. pub struct RandomForestRegressorParameters { + /// Tree max depth. 
See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) pub max_depth: Option, + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) pub min_samples_leaf: usize, + /// The minimum number of samples required to split an internal node. See [Decision Tree Regressor](../../tree/decision_tree_regressor/index.html) pub min_samples_split: usize, + /// The number of trees in the forest. pub n_trees: usize, - pub mtry: Option, + /// Number of randomly selected predictors to use as split candidates. + pub m: Option, } +/// Random Forest Regressor #[derive(Serialize, Deserialize, Debug)] pub struct RandomForestRegressor { parameters: RandomForestRegressorParameters, @@ -34,7 +86,7 @@ impl Default for RandomForestRegressorParameters { min_samples_leaf: 1, min_samples_split: 2, n_trees: 10, - mtry: Option::None, + m: Option::None, } } } @@ -55,6 +107,9 @@ impl PartialEq for RandomForestRegressor { } impl RandomForestRegressor { + /// Build a forest of trees from the training set. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - the target values pub fn fit>( x: &M, y: &M::RowVector, @@ -63,7 +118,7 @@ impl RandomForestRegressor { let (n_rows, num_attributes) = x.shape(); let mtry = parameters - .mtry + .m .unwrap_or((num_attributes as f64).sqrt().floor() as usize); let mut trees: Vec> = Vec::new(); @@ -85,6 +140,8 @@ impl RandomForestRegressor { } } + /// Predict regression value for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. 
pub fn predict>(&self, x: &M) -> M::RowVector { let mut result = M::zeros(1, x.shape().0); @@ -162,7 +219,7 @@ mod tests { min_samples_leaf: 1, min_samples_split: 2, n_trees: 1000, - mtry: Option::None, + m: Option::None, }, ) .predict(&x); diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index e9a2592..d5dd653 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -58,7 +58,8 @@ //! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 3. Linear Regression](http://faculty.marshall.usc.edu/gareth-james/ISL/) //! * ["Numerical Recipes: The Art of Scientific Computing", Press W.H., Teukolsky S.A., Vetterling W.T, Flannery B.P, 3rd ed., Section 15.4 General Linear Least Squares](http://numerical.recipes/) //! -//! +//! +//! use std::fmt::Debug; use serde::{Deserialize, Serialize}; diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 0e7e2f8..1e362de 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -49,7 +49,9 @@ //! * ["Pattern Recognition and Machine Learning", C.M. Bishop, Linear Models for Classification](https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf) //! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 4.3 Logistic Regression](http://faculty.marshall.usc.edu/gareth-james/ISL/) //! * ["On the Limited Memory Method for Large Scale Optimization", Nocedal et al., Mathematical Programming, 1989](http://users.iems.northwestern.edu/~nocedal/PDFfiles/limited.pdf) -//! +//! +//! +//! use std::fmt::Debug; use std::marker::PhantomData; diff --git a/src/linear/mod.rs b/src/linear/mod.rs index 1965d42..54bbca0 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -17,7 +17,8 @@ //! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 3. 
Linear Regression](http://faculty.marshall.usc.edu/gareth-james/ISL/) //! * ["The Statistical Sleuth, A Course in Methods of Data Analysis", Ramsey F.L., Schafer D.W., Ch 7, 8, 3rd edition, 2013](http://www.statisticalsleuth.com/) //! -//! +//! +//! pub mod linear_regression; pub mod logistic_regression; diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 1517850..42688c6 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -16,7 +16,8 @@ //! let l2: f64 = Euclidian{}.distance(&x, &y); //! ``` //! -//! +//! +//! use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index eebae55..4028259 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -16,7 +16,8 @@ //! //! ``` //! -//! +//! +//! use serde::{Deserialize, Serialize}; diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index 3d499a3..5f940fa 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -38,7 +38,8 @@ //! * ["Introduction to Multivariate Statistical Analysis in Chemometrics", Varmuza, K., Filzmoser, P., 2016, p.46](https://www.taylorfrancis.com/books/9780429145049) //! * ["Example of Calculating the Mahalanobis Distance", McCaffrey, J.D.](https://jamesmccaffrey.wordpress.com/2017/11/09/example-of-calculating-the-mahalanobis-distance/) //! -//! +//! +//! #![allow(non_snake_case)] use std::marker::PhantomData; diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index cbf1a92..9b46a0c 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -15,7 +15,8 @@ //! //! let l1: f64 = Manhattan {}.distance(&x, &y); //! ``` -//! +//! +//! 
use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index e345fce..667e0db 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -18,7 +18,8 @@ //! let l2: f64 = Minkowski { p: 2 }.distance(&x, &y); //! //! ``` -//! +//! +//! use serde::{Deserialize, Serialize}; diff --git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 7b5f1f8..0532e86 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -10,7 +10,8 @@ //! //! A good distance metric helps to improve the performance of classification, clustering and information retrieval algorithms significantly. //! -//! +//! +//! /// Euclidean Distance is the straight-line distance between two points in Euclidean spacere that presents the shortest distance between these points. pub mod euclidian; diff --git a/src/metrics/accuracy.rs b/src/metrics/accuracy.rs index e04a5e6..ef7028f 100644 --- a/src/metrics/accuracy.rs +++ b/src/metrics/accuracy.rs @@ -14,7 +14,8 @@ //! let score: f64 = Accuracy {}.get_score(&y_pred, &y_true); //! ``` //! -//! +//! +//! use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; diff --git a/src/metrics/f1.rs b/src/metrics/f1.rs index deb9bd0..5c8537c 100644 --- a/src/metrics/f1.rs +++ b/src/metrics/f1.rs @@ -16,7 +16,8 @@ //! let score: f64 = F1 {beta: 1.0}.get_score(&y_pred, &y_true); //! ``` //! -//! +//! +//! use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; diff --git a/src/metrics/mean_absolute_error.rs b/src/metrics/mean_absolute_error.rs index eaab033..55132cd 100644 --- a/src/metrics/mean_absolute_error.rs +++ b/src/metrics/mean_absolute_error.rs @@ -16,7 +16,8 @@ //! let mse: f64 = MeanAbsoluteError {}.get_score(&y_pred, &y_true); //! ``` //! -//! +//! +//! 
use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; diff --git a/src/metrics/mean_squared_error.rs b/src/metrics/mean_squared_error.rs index ad06fbe..2b4c5be 100644 --- a/src/metrics/mean_squared_error.rs +++ b/src/metrics/mean_squared_error.rs @@ -16,7 +16,8 @@ //! let mse: f64 = MeanSquareError {}.get_score(&y_pred, &y_true); //! ``` //! -//! +//! +//! use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; diff --git a/src/metrics/precision.rs b/src/metrics/precision.rs index 627c541..3524e7f 100644 --- a/src/metrics/precision.rs +++ b/src/metrics/precision.rs @@ -16,7 +16,8 @@ //! let score: f64 = Precision {}.get_score(&y_pred, &y_true); //! ``` //! -//! +//! +//! use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; diff --git a/src/metrics/r2.rs b/src/metrics/r2.rs index 415dd5d..e689c6f 100644 --- a/src/metrics/r2.rs +++ b/src/metrics/r2.rs @@ -16,7 +16,8 @@ //! let mse: f64 = MeanAbsoluteError {}.get_score(&y_pred, &y_true); //! ``` //! -//! +//! +//! use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; diff --git a/src/metrics/recall.rs b/src/metrics/recall.rs index b6ae73c..4d2be95 100644 --- a/src/metrics/recall.rs +++ b/src/metrics/recall.rs @@ -16,7 +16,8 @@ //! let score: f64 = Recall {}.get_score(&y_pred, &y_true); //! ``` //! -//! +//! +//! use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index ec3a71a..bc4d688 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -29,7 +29,8 @@ //! * ["Nearest Neighbor Pattern Classification" Cover, T.M., IEEE Transactions on Information Theory (1967)](http://ssg.mit.edu/cal/abs/2000_spring/np_dens/classification/cover67.pdf) //! * ["The Elements of Statistical Learning: Data Mining, Inference, and Prediction" Trevor et al., 2nd edition, chapter 13](https://web.stanford.edu/~hastie/ElemStatLearn/) //! -//! +//! +//! 
use crate::algorithm::neighbour::cover_tree::CoverTree; use crate::algorithm::neighbour::linear_search::LinearKNNSearch; diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 7c5b640..996d98d 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -55,7 +55,8 @@ //! * ["Classification and regression trees", Breiman, L, Friedman, J H, Olshen, R A, and Stone, C J, 1984](https://www.sciencebase.gov/catalog/item/545d07dfe4b0ba8303f728c1) //! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., Chapter 8](http://faculty.marshall.usc.edu/gareth-james/ISL/) //! -//! +//! +//! use std::collections::LinkedList; use std::default::Default; @@ -187,7 +188,7 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } impl DecisionTreeRegressor { - /// Build a regression tree regressor from the training data. + /// Build a decision tree regressor from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - the target values pub fn fit>( diff --git a/src/tree/mod.rs b/src/tree/mod.rs index a5739f4..700dc76 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -6,7 +6,7 @@ //! and fit a simple prediction model within each region. In order to make a prediction for a given observation, \\(\hat{y}\\) //! decision tree typically use the mean or the mode of the training observations in the region \\(R_j\\) to which it belongs. //! -//! Decision trees often does not deliver best prediction accuracy when compared to other supervised learning approaches, such as linear and logistic regression. +//! Decision trees suffer from high variance and often do not deliver the best prediction accuracy when compared to other supervised learning approaches, such as linear and logistic regression. //! 
Hence some techniques such as [Random Forests](../ensemble/index.html) use more than one decision tree to improve performance of the algorithm. //! //! SmartCore uses [CART](https://en.wikipedia.org/wiki/Predictive_analytics#Classification_and_regression_trees_.28CART.29) learning technique to build both classification and regression trees. @@ -16,7 +16,8 @@ //! * ["Classification and regression trees", Breiman, L, Friedman, J H, Olshen, R A, and Stone, C J, 1984](https://www.sciencebase.gov/catalog/item/545d07dfe4b0ba8303f728c1) //! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., Chapter 8](http://faculty.marshall.usc.edu/gareth-james/ISL/) //! -//! +//! +//! /// Classification tree for dependent variables that take a finite number of unordered values. pub mod decision_tree_classifier;