diff --git a/src/ensemble/mod.rs b/src/ensemble/mod.rs
index 5f273ad..eb02323 100644
--- a/src/ensemble/mod.rs
+++ b/src/ensemble/mod.rs
@@ -1 +1,2 @@
-pub mod random_forest;
\ No newline at end of file
+pub mod random_forest_classifier;
+pub mod random_forest_regressor;
\ No newline at end of file
diff --git a/src/ensemble/random_forest.rs b/src/ensemble/random_forest_classifier.rs
similarity index 87%
rename from src/ensemble/random_forest.rs
rename to src/ensemble/random_forest_classifier.rs
index 09b47f6..9175b28 100644
--- a/src/ensemble/random_forest.rs
+++ b/src/ensemble/random_forest_classifier.rs
@@ -6,7 +6,7 @@ use crate::linalg::Matrix;
 use crate::tree::decision_tree_classifier::{DecisionTreeClassifier, DecisionTreeClassifierParameters, SplitCriterion, which_max};
 
 #[derive(Debug, Clone)]
-pub struct RandomForestParameters {
+pub struct RandomForestClassifierParameters {
     pub criterion: SplitCriterion,
     pub max_depth: Option<u16>,
     pub min_samples_leaf: u16,
@@ -16,15 +16,15 @@ pub struct RandomForestParameters {
 }
 
 #[derive(Debug)]
-pub struct RandomForest {
-    parameters: RandomForestParameters,
+pub struct RandomForestClassifier {
+    parameters: RandomForestClassifierParameters,
     trees: Vec<DecisionTreeClassifier>,
     classes: Vec<f64>
 }
 
-impl Default for RandomForestParameters {
+impl Default for RandomForestClassifierParameters {
     fn default() -> Self {
-        RandomForestParameters {
+        RandomForestClassifierParameters {
             criterion: SplitCriterion::Gini,
             max_depth: None,
             min_samples_leaf: 1,
@@ -35,9 +35,9 @@ impl Default for RandomForestParameters {
     }
 }
 
-impl RandomForest {
+impl RandomForestClassifier {
 
-    pub fn fit<M: Matrix>(x: &M, y: &M::RowVector, parameters: RandomForestParameters) -> RandomForest {
+    pub fn fit<M: Matrix>(x: &M, y: &M::RowVector, parameters: RandomForestClassifierParameters) -> RandomForestClassifier {
         let (_, num_attributes) = x.shape();
         let y_m = M::from_row_vector(y.clone());
         let (_, y_ncols) = y_m.shape();
@@ -56,7 +56,7 @@ impl RandomForest {
         let mut trees: Vec<DecisionTreeClassifier> = Vec::new();
 
         for _ in 0..parameters.n_trees {
-            let samples = RandomForest::sample_with_replacement(&yi, k);
+            let samples = RandomForestClassifier::sample_with_replacement(&yi, k);
             let params = DecisionTreeClassifierParameters{
                 criterion: parameters.criterion.clone(),
                 max_depth: parameters.max_depth,
@@ -67,7 +67,7 @@ impl RandomForest {
             trees.push(tree);
         }
 
-        RandomForest {
+        RandomForestClassifier {
             parameters: parameters,
             trees: trees,
             classes
@@ -154,9 +154,9 @@ mod tests {
             &[5.2, 2.7, 3.9, 1.4]]);
         let y = vec![0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.];
 
-        RandomForest::fit(&x, &y, Default::default());
+        RandomForestClassifier::fit(&x, &y, Default::default());
 
-        assert_eq!(y, RandomForest::fit(&x, &y, Default::default()).predict(&x));
+        assert_eq!(y, RandomForestClassifier::fit(&x, &y, Default::default()).predict(&x));
 
     }
 
diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs
new file mode 100644
index 0000000..8b15ad9
--- /dev/null
+++ b/src/ensemble/random_forest_regressor.rs
@@ -0,0 +1,141 @@
+extern crate rand;
+
+use rand::Rng;
+use std::default::Default;
+use crate::linalg::Matrix;
+use crate::tree::decision_tree_regressor::{DecisionTreeRegressor, DecisionTreeRegressorParameters};
+
+#[derive(Debug, Clone)]
+pub struct RandomForestRegressorParameters {
+    pub max_depth: Option<u16>,
+    pub min_samples_leaf: usize,
+    pub min_samples_split: usize,
+    pub n_trees: usize,
+    pub mtry: Option<usize>
+}
+
+#[derive(Debug)]
+pub struct RandomForestRegressor {
+    parameters: RandomForestRegressorParameters,
+    trees: Vec<DecisionTreeRegressor>
+}
+
+impl Default for RandomForestRegressorParameters {
+    fn default() -> Self {
+        RandomForestRegressorParameters {
+            max_depth: None,
+            min_samples_leaf: 1,
+            min_samples_split: 2,
+            n_trees: 10,
+            mtry: Option::None
+        }
+    }
+}
+
+impl RandomForestRegressor {
+
+    pub fn fit<M: Matrix>(x: &M, y: &M::RowVector, parameters: RandomForestRegressorParameters) -> RandomForestRegressor {
+        let (n_rows, num_attributes) = x.shape();
+
+        let mtry = parameters.mtry.unwrap_or((num_attributes as f64).sqrt().floor() as usize);
+
+        let mut trees: Vec<DecisionTreeRegressor> = Vec::new();
+
+        for _ in 0..parameters.n_trees {
+            let samples = RandomForestRegressor::sample_with_replacement(n_rows);
+            let params = DecisionTreeRegressorParameters{
+                max_depth: parameters.max_depth,
+                min_samples_leaf: parameters.min_samples_leaf,
+                min_samples_split: parameters.min_samples_split
+            };
+            let tree = DecisionTreeRegressor::fit_weak_learner(x, y, samples, mtry, params);
+            trees.push(tree);
+        }
+
+        RandomForestRegressor {
+            parameters: parameters,
+            trees: trees
+        }
+    }
+
+    pub fn predict<M: Matrix>(&self, x: &M) -> M::RowVector {
+        let mut result = M::zeros(1, x.shape().0);
+
+        let (n, _) = x.shape();
+
+        for i in 0..n {
+            result.set(0, i, self.predict_for_row(x, i));
+        }
+
+        result.to_row_vector()
+    }
+
+    fn predict_for_row<M: Matrix>(&self, x: &M, row: usize) -> f64 {
+
+        let n_trees = self.trees.len();
+
+        let mut result = 0f64;
+
+        for tree in self.trees.iter() {
+            result += tree.predict_for_row(x, row);
+        }
+
+        result / n_trees as f64
+
+    }
+
+    fn sample_with_replacement(nrows: usize) -> Vec<usize>{
+        let mut rng = rand::thread_rng();
+        let mut samples = vec![0; nrows];
+        for _ in 0..nrows {
+            let xi = rng.gen_range(0, nrows);
+            samples[xi] += 1;
+        }
+        samples
+    }
+
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::linalg::naive::dense_matrix::DenseMatrix;
+
+    #[test]
+    fn fit_longley() {
+
+        let x = DenseMatrix::from_array(&[
+            &[ 234.289, 235.6, 159., 107.608, 1947., 60.323],
+            &[ 259.426, 232.5, 145.6, 108.632, 1948., 61.122],
+            &[ 258.054, 368.2, 161.6, 109.773, 1949., 60.171],
+            &[ 284.599, 335.1, 165., 110.929, 1950., 61.187],
+            &[ 328.975, 209.9, 309.9, 112.075, 1951., 63.221],
+            &[ 346.999, 193.2, 359.4, 113.27 , 1952., 63.639],
+            &[ 365.385, 187., 354.7, 115.094, 1953., 64.989],
+            &[ 363.112, 357.8, 335., 116.219, 1954., 63.761],
+            &[ 397.469, 290.4, 304.8, 117.388, 1955., 66.019],
+            &[ 419.18 , 282.2, 285.7, 118.734, 1956., 67.857],
+            &[ 442.769, 293.6, 279.8, 120.445, 1957., 68.169],
+            &[ 444.546, 468.1, 263.7, 121.95 , 1958., 66.513],
+            &[ 482.704, 381.3, 255.2, 123.366, 1959., 68.655],
+            &[ 502.601, 393.1, 251.4, 125.368, 1960., 69.564],
+            &[ 518.173, 480.6, 257.2, 127.852, 1961., 69.331],
+            &[ 554.894, 400.7, 282.7, 130.081, 1962., 70.551]]);
+        let y = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9];
+
+        let expected_y = vec![85., 88., 88., 89., 97., 98., 99., 99., 102., 104., 109., 110., 113., 114., 115., 116.];
+
+        let y_hat = RandomForestRegressor::fit(&x, &y,
+            RandomForestRegressorParameters{max_depth: None,
+                min_samples_leaf: 1,
+                min_samples_split: 2,
+                n_trees: 1000,
+                mtry: Option::None}).predict(&x);
+
+        for i in 0..y_hat.len() {
+            assert!((y_hat[i] - expected_y[i]).abs() < 1.0);
+        }
+
+    }
+
+}
\ No newline at end of file