From d28f13d849bc3644f2f973f6786839a005001664 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sun, 13 Sep 2020 16:23:30 -0700 Subject: [PATCH] feat: adds train/test split function; fixes bug in random forest --- src/ensemble/random_forest_classifier.rs | 18 ++-- src/lib.rs | 1 + src/linalg/mod.rs | 9 ++ src/linalg/naive/dense_matrix.rs | 12 +++ src/linalg/nalgebra_bindings.rs | 26 +++++- src/linalg/ndarray_bindings.rs | 12 +++ src/model_selection/mod.rs | 109 +++++++++++++++++++++++ src/tree/decision_tree_classifier.rs | 5 ++ src/tree/decision_tree_regressor.rs | 5 ++ 9 files changed, 187 insertions(+), 10 deletions(-) create mode 100644 src/model_selection/mod.rs diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 1b3e66f..973229f 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -199,19 +199,19 @@ impl RandomForestClassifier { let nrows = y.len(); let mut samples = vec![0; nrows]; for l in 0..num_classes { - let mut nj = 0; - let mut cj: Vec = Vec::new(); + let mut n_samples = 0; + let mut index: Vec = Vec::new(); for i in 0..nrows { if y[i] == l { - cj.push(i); - nj += 1; + index.push(i); + n_samples += 1; } } - let size = ((nj as f64) / class_weight[l]) as usize; + let size = ((n_samples as f64) / class_weight[l]) as usize; for _ in 0..size { - let xi: usize = rng.gen_range(0, nj); - samples[cj[xi]] += 1; + let xi: usize = rng.gen_range(0, n_samples); + samples[index[xi]] += 1; } } samples @@ -260,12 +260,12 @@ mod tests { max_depth: None, min_samples_leaf: 1, min_samples_split: 2, - n_trees: 1000, + n_trees: 100, m: Option::None, }, ); - assert!(accuracy(&y, &classifier.predict(&x)) > 0.9); + assert!(accuracy(&y, &classifier.predict(&x)) >= 0.95); } #[test] diff --git a/src/lib.rs b/src/lib.rs index b67d0f6..c21b989 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -83,6 +83,7 @@ pub mod linear; pub mod math; /// Functions for assessing prediction error. 
pub mod metrics; +pub mod model_selection; /// Supervised neighbors-based learning methods pub mod neighbors; pub(crate) mod optimization; diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 0de70dd..af8191e 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -76,6 +76,15 @@ pub trait BaseVector: Clone + Debug { /// Return a vector with the elements of the one-dimensional array. fn to_vec(&self) -> Vec; + + /// Create new vector with zeros of size `len`. + fn zeros(len: usize) -> Self; + + /// Create new vector with ones of size `len`. + fn ones(len: usize) -> Self; + + /// Create new vector of size `len` where each element is set to `value`. + fn fill(len: usize, value: T) -> Self; } /// Generic matrix type. diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index c098ee0..c2d7928 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -32,6 +32,18 @@ impl BaseVector for Vec { let v = self.clone(); v } + + fn zeros(len: usize) -> Self { + vec![T::zero(); len] + } + + fn ones(len: usize) -> Self { + vec![T::one(); len] + } + + fn fill(len: usize, value: T) -> Self { + vec![value; len] + } } /// Column-major, dense matrix. See [Simple Dense Matrix](../index.html). 
diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index 4c1ba56..2e5ced6 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -40,7 +40,7 @@ use std::iter::Sum; use std::ops::{AddAssign, DivAssign, MulAssign, Range, SubAssign}; -use nalgebra::{DMatrix, Dynamic, Matrix, MatrixMN, Scalar, VecStorage, U1}; +use nalgebra::{DMatrix, Dynamic, Matrix, MatrixMN, RowDVector, Scalar, VecStorage, U1}; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; @@ -65,6 +65,20 @@ impl BaseVector for MatrixMN { fn to_vec(&self) -> Vec { self.row(0).iter().map(|v| *v).collect() } + + fn zeros(len: usize) -> Self { + RowDVector::zeros(len) + } + + fn ones(len: usize) -> Self { + BaseVector::fill(len, T::one()) + } + + fn fill(len: usize, value: T) -> Self { + let mut m = RowDVector::zeros(len); + m.fill(value); + m + } } impl @@ -446,6 +460,16 @@ mod tests { assert_eq!(vec![1., 2., 3.], v.to_vec()); } + #[test] + fn vec_init() { + let zeros: RowDVector = BaseVector::zeros(3); + let ones: RowDVector = BaseVector::ones(3); + let twos: RowDVector = BaseVector::fill(3, 2.); + assert_eq!(zeros, RowDVector::from_vec(vec![0., 0., 0.])); + assert_eq!(ones, RowDVector::from_vec(vec![1., 1., 1.])); + assert_eq!(twos, RowDVector::from_vec(vec![2., 2., 2.])); + } + #[test] fn get_set_dynamic() { let mut m = DMatrix::from_row_slice(2, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 68b228d..989ece1 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -72,6 +72,18 @@ impl BaseVector for ArrayBase, Ix1> { fn to_vec(&self) -> Vec { self.to_owned().to_vec() } + + fn zeros(len: usize) -> Self { + Array::zeros(len) + } + + fn ones(len: usize) -> Self { + Array::ones(len) + } + + fn fill(len: usize, value: T) -> Self { + Array::from_elem(len, value) + } } impl diff --git a/src/model_selection/mod.rs 
b/src/model_selection/mod.rs new file mode 100644 index 0000000..1895296 --- /dev/null +++ b/src/model_selection/mod.rs @@ -0,0 +1,109 @@ +//! # Model Selection methods +//! +//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes validation data), +//! and fit our model on the training data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting the model to our data. +//! Overfitting is bad because the model we trained fits the training data too well and can’t make any inferences on new data. +//! Underfitting is bad because the model is undertrained and does not fit the training data well. +//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for +//! your data. +//! +//! In SmartCore you can split your data into training and test datasets using the `train_test_split` function. +extern crate rand; + +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use rand::Rng; + +/// Splits data into 2 disjoint datasets. +/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. +/// * `y` - target values, should be of size _N_ +/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. +pub fn train_test_split>( + x: &M, + y: &M::RowVector, + test_size: f32, +) -> (M, M, M::RowVector, M::RowVector) { + if x.shape().0 != y.len() { + panic!( + "x and y should have the same number of samples. |x|: {}, |y|: {}", + x.shape().0, + y.len() + ); + } + + if test_size <= 0. 
|| test_size > 1.0 { + panic!("test_size should be between 0 and 1"); + } + + let n = y.len(); + let m = x.shape().1; + + let mut rng = rand::thread_rng(); + let mut n_test = 0; + let mut index = vec![false; n]; + + for i in 0..n { + let p_test: f32 = rng.gen(); + if p_test <= test_size { + index[i] = true; + n_test += 1; + } + } + + let n_train = n - n_test; + + let mut x_train = M::zeros(n_train, m); + let mut x_test = M::zeros(n_test, m); + let mut y_train = M::RowVector::zeros(n_train); + let mut y_test = M::RowVector::zeros(n_test); + + let mut r_train = 0; + let mut r_test = 0; + + for r in 0..n { + if index[r] { + //sample belongs to test + for c in 0..m { + x_test.set(r_test, c, x.get(r, c)); + y_test.set(r_test, y.get(r)); + } + r_test += 1; + } else { + for c in 0..m { + x_train.set(r_train, c, x.get(r, c)); + y_train.set(r_train, y.get(r)); + } + r_train += 1; + } + } + + (x_train, x_test, y_train, y_test) +} + +#[cfg(test)] +mod tests { + + use super::*; + use crate::linalg::naive::dense_matrix::*; + + #[test] + fn run_train_test_split() { + let n = 100; + let x: DenseMatrix = DenseMatrix::rand(100, 3); + let y = vec![0f64; 100]; + + let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2); + + assert!( + x_train.shape().0 > (n as f64 * 0.65) as usize + && x_train.shape().0 < (n as f64 * 0.95) as usize + ); + assert!( + x_test.shape().0 > (n as f64 * 0.05) as usize + && x_test.shape().0 < (n as f64 * 0.35) as usize + ); + assert_eq!(x_train.shape().0, y_train.len()); + assert_eq!(x_test.shape().0, y_test.len()); + } +} diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 0d1bffa..b8f8b95 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -67,6 +67,7 @@ use std::default::Default; use std::fmt::Debug; use std::marker::PhantomData; +use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; @@ 
-431,6 +432,10 @@ impl DecisionTreeClassifier { variables[i] = i; } + if mtry < n_attr { + variables.shuffle(&mut rand::thread_rng()); + } + for j in 0..mtry { self.find_best_split( visitor, diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index ef54503..911b8a8 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -62,6 +62,7 @@ use std::collections::LinkedList; use std::default::Default; use std::fmt::Debug; +use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; @@ -320,6 +321,10 @@ impl DecisionTreeRegressor { variables[i] = i; } + if mtry < n_attr { + variables.shuffle(&mut rand::thread_rng()); + } + let parent_gain = T::from(n).unwrap() * self.nodes[visitor.node].output * self.nodes[visitor.node].output;