From d28f13d849bc3644f2f973f6786839a005001664 Mon Sep 17 00:00:00 2001
From: Volodymyr Orlov <vorl@Echo.local>
Date: Sun, 13 Sep 2020 16:23:30 -0700
Subject: [PATCH] feat: adds train/test split function; fixes bug in random
 forest

---
 src/ensemble/random_forest_classifier.rs |  18 ++--
 src/lib.rs                               |   1 +
 src/linalg/mod.rs                        |   9 ++
 src/linalg/naive/dense_matrix.rs         |  12 +++
 src/linalg/nalgebra_bindings.rs          |  26 +++++-
 src/linalg/ndarray_bindings.rs           |  12 +++
 src/model_selection/mod.rs               | 109 +++++++++++++++++++++++
 src/tree/decision_tree_classifier.rs     |   5 ++
 src/tree/decision_tree_regressor.rs      |   5 ++
 9 files changed, 187 insertions(+), 10 deletions(-)
 create mode 100644 src/model_selection/mod.rs
diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs
index 1b3e66f..973229f 100644
--- a/src/ensemble/random_forest_classifier.rs
+++ b/src/ensemble/random_forest_classifier.rs
@@ -199,19 +199,19 @@ impl<T: RealNumber> RandomForestClassifier<T> {
         let nrows = y.len();
         let mut samples = vec![0; nrows];
         for l in 0..num_classes {
-            let mut nj = 0;
-            let mut cj: Vec<usize> = Vec::new();
+            let mut n_samples = 0;
+            let mut index: Vec<usize> = Vec::new();
             for i in 0..nrows {
                 if y[i] == l {
-                    cj.push(i);
-                    nj += 1;
+                    index.push(i);
+                    n_samples += 1;
                 }
             }
 
-            let size = ((nj as f64) / class_weight[l]) as usize;
+            let size = ((n_samples as f64) / class_weight[l]) as usize;
             for _ in 0..size {
-                let xi: usize = rng.gen_range(0, nj);
-                samples[cj[xi]] += 1;
+                let xi: usize = rng.gen_range(0, n_samples);
+                samples[index[xi]] += 1;
             }
         }
         samples
@@ -260,12 +260,12 @@ mod tests {
                 max_depth: None,
                 min_samples_leaf: 1,
                 min_samples_split: 2,
-                n_trees: 1000,
+                n_trees: 100,
                 m: Option::None,
             },
         );
 
-        assert!(accuracy(&y, &classifier.predict(&x)) > 0.9);
+        assert!(accuracy(&y, &classifier.predict(&x)) >= 0.95);
     }
 
     #[test]
diff --git a/src/lib.rs b/src/lib.rs
index b67d0f6..c21b989 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -83,6 +83,7 @@ pub mod linear;
 pub mod math;
 /// Functions for assessing prediction error.
 pub mod metrics;
+pub mod model_selection;
 /// Supervised neighbors-based learning methods
 pub mod neighbors;
 pub(crate) mod optimization;
diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs
index 0de70dd..af8191e 100644
--- a/src/linalg/mod.rs
+++ b/src/linalg/mod.rs
@@ -76,6 +76,15 @@ pub trait BaseVector<T: RealNumber>: Clone + Debug {
 
     /// Return a vector with the elements of the one-dimensional array.
     fn to_vec(&self) -> Vec<T>;
+
+    /// Create new vector with zeros of size `len`.
+    fn zeros(len: usize) -> Self;
+
+    /// Create new vector with ones of size `len`.
+    fn ones(len: usize) -> Self;
+
+    /// Create new vector of size `len` where each element is set to `value`.
+    fn fill(len: usize, value: T) -> Self;
 }
 
 /// Generic matrix type.
diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs
index c098ee0..c2d7928 100644
--- a/src/linalg/naive/dense_matrix.rs
+++ b/src/linalg/naive/dense_matrix.rs
@@ -32,6 +32,18 @@ impl<T: RealNumber> BaseVector<T> for Vec<T> {
         let v = self.clone();
         v
     }
+
+    fn zeros(len: usize) -> Self {
+        vec![T::zero(); len]
+    }
+
+    fn ones(len: usize) -> Self {
+        vec![T::one(); len]
+    }
+
+    fn fill(len: usize, value: T) -> Self {
+        vec![value; len]
+    }
 }
 
 /// Column-major, dense matrix. See [Simple Dense Matrix](../index.html).
diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs
index 4c1ba56..2e5ced6 100644
--- a/src/linalg/nalgebra_bindings.rs
+++ b/src/linalg/nalgebra_bindings.rs
@@ -40,7 +40,7 @@
 use std::iter::Sum;
 use std::ops::{AddAssign, DivAssign, MulAssign, Range, SubAssign};
 
-use nalgebra::{DMatrix, Dynamic, Matrix, MatrixMN, Scalar, VecStorage, U1};
+use nalgebra::{DMatrix, Dynamic, Matrix, MatrixMN, RowDVector, Scalar, VecStorage, U1};
 
 use crate::linalg::evd::EVDDecomposableMatrix;
 use crate::linalg::lu::LUDecomposableMatrix;
@@ -65,6 +65,20 @@ impl<T: RealNumber + 'static> BaseVector<T> for MatrixMN<T, U1, Dynamic> {
     fn to_vec(&self) -> Vec<T> {
         self.row(0).iter().map(|v| *v).collect()
     }
+
+    fn zeros(len: usize) -> Self {
+        RowDVector::zeros(len)
+    }
+
+    fn ones(len: usize) -> Self {
+        BaseVector::fill(len, T::one())
+    }
+
+    fn fill(len: usize, value: T) -> Self {
+        // Construct the row vector already populated with `value` instead of allocating a
+        // zeroed vector first and then overwriting every element.
+        RowDVector::from_element(len, value)
+    }
 }
 
 impl<T: RealNumber + Scalar + AddAssign + SubAssign + MulAssign + DivAssign + Sum + 'static>
@@ -446,6 +460,16 @@ mod tests {
         assert_eq!(vec![1., 2., 3.], v.to_vec());
     }
 
+    #[test]
+    fn vec_init() {
+        let zeros: RowDVector<f32> = BaseVector::zeros(3);
+        let ones: RowDVector<f32> = BaseVector::ones(3);
+        let twos: RowDVector<f32> = BaseVector::fill(3, 2.);
+        assert_eq!(zeros, RowDVector::from_vec(vec![0., 0., 0.]));
+        assert_eq!(ones, RowDVector::from_vec(vec![1., 1., 1.]));
+        assert_eq!(twos, RowDVector::from_vec(vec![2., 2., 2.]));
+    }
+
     #[test]
     fn get_set_dynamic() {
         let mut m = DMatrix::from_row_slice(2, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs
index 68b228d..989ece1 100644
--- a/src/linalg/ndarray_bindings.rs
+++ b/src/linalg/ndarray_bindings.rs
@@ -72,6 +72,18 @@ impl<T: RealNumber> BaseVector<T> for ArrayBase<OwnedRepr<T>, Ix1> {
     fn to_vec(&self) -> Vec<T> {
         self.to_owned().to_vec()
     }
+
+    fn zeros(len: usize) -> Self {
+        Array::zeros(len)
+    }
+
+    fn ones(len: usize) -> Self {
+        Array::ones(len)
+    }
+
+    fn fill(len: usize, value: T) -> Self {
+        Array::from_elem(len, value)
+    }
 }
 
 impl<T: RealNumber + ScalarOperand + AddAssign + SubAssign + MulAssign + DivAssign + Sum>
diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs
new file mode 100644
index 0000000..1895296
--- /dev/null
+++ b/src/model_selection/mod.rs
@@ -0,0 +1,109 @@
+//! # Model Selection methods
+//!
+//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes validation data),
+//! and fit our model on the training data in order to make predictions on the test data. We do that to avoid overfitting or underfitting the model to our data.
+//! Overfitting is bad because the model fits the training data too well and can’t make any inferences on new data.
+//! Underfitting is bad because the model is undertrained and does not fit the training data well.
+//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for
+//! your data.
+//!
+//! In SmartCore you can split your data into training and test datasets using `train_test_split` function.
+extern crate rand;
+
+use crate::linalg::BaseVector;
+use crate::linalg::Matrix;
+use crate::math::num::RealNumber;
+use rand::Rng;
+
+/// Splits data into 2 disjoint datasets.
+///
+/// Each sample lands in the test set independently with probability `test_size`, so split sizes
+/// are random. Panics if `x` and `y` differ in sample count or `test_size` is outside (0, 1].
+/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
+/// * `y` - target values, should be of size _N_
+/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
+pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
+    x: &M,
+    y: &M::RowVector,
+    test_size: f32,
+) -> (M, M, M::RowVector, M::RowVector) {
+    if x.shape().0 != y.len() {
+        panic!(
+            "x and y should have the same number of samples. |x|: {}, |y|: {}",
+            x.shape().0,
+            y.len()
+        );
+    }
+
+    if test_size.is_nan() || test_size <= 0. || test_size > 1.0 {
+        panic!("test_size should be between 0 and 1");
+    }
+
+    let n = y.len();
+    let m = x.shape().1;
+
+    let mut rng = rand::thread_rng();
+    let mut n_test = 0;
+    let mut index = vec![false; n];
+
+    for i in 0..n {
+        if rng.gen::<f32>() <= test_size {
+            index[i] = true;
+            n_test += 1;
+        }
+    }
+
+    let n_train = n - n_test;
+    let mut x_train = M::zeros(n_train, m);
+    let mut x_test = M::zeros(n_test, m);
+    let mut y_train = M::RowVector::zeros(n_train);
+    let mut y_test = M::RowVector::zeros(n_test);
+    let mut r_train = 0;
+    let mut r_test = 0;
+
+    for r in 0..n {
+        if index[r] {
+            // Sample belongs to test; copy the target once per row, not once per column.
+            y_test.set(r_test, y.get(r));
+            for c in 0..m {
+                x_test.set(r_test, c, x.get(r, c));
+            }
+            r_test += 1;
+        } else {
+            y_train.set(r_train, y.get(r));
+            for c in 0..m {
+                x_train.set(r_train, c, x.get(r, c));
+            }
+            r_train += 1;
+        }
+    }
+
+    (x_train, x_test, y_train, y_test)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::linalg::naive::dense_matrix::*;
+
+    #[test]
+    fn run_train_test_split() {
+        let n = 100;
+        let x: DenseMatrix<f64> = DenseMatrix::rand(n, 3);
+        let y = vec![0f64; n];
+
+        let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2);
+        // The split is random, so only loose bounds around the expected 80/20 sizes are checked.
+        assert!(
+            x_train.shape().0 > (n as f64 * 0.65) as usize
+                && x_train.shape().0 < (n as f64 * 0.95) as usize
+        );
+        assert!(
+            x_test.shape().0 > (n as f64 * 0.05) as usize
+                && x_test.shape().0 < (n as f64 * 0.35) as usize
+        );
+        assert_eq!(x_train.shape().0, y_train.len());
+        assert_eq!(x_test.shape().0, y_test.len());
+        assert_eq!(x_train.shape().0 + x_test.shape().0, n);
+    }
+}
diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs
index 0d1bffa..b8f8b95 100644
--- a/src/tree/decision_tree_classifier.rs
+++ b/src/tree/decision_tree_classifier.rs
@@ -67,6 +67,7 @@ use std::default::Default;
 use std::fmt::Debug;
 use std::marker::PhantomData;
 
+use rand::seq::SliceRandom;
 use serde::{Deserialize, Serialize};
 
 use crate::algorithm::sort::quick_sort::QuickArgSort;
@@ -431,6 +432,10 @@ impl<T: RealNumber> DecisionTreeClassifier<T> {
             variables[i] = i;
         }
 
+        if mtry < n_attr {
+            variables.shuffle(&mut rand::thread_rng());
+        }
+
         for j in 0..mtry {
             self.find_best_split(
                 visitor,
diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs
index ef54503..911b8a8 100644
--- a/src/tree/decision_tree_regressor.rs
+++ b/src/tree/decision_tree_regressor.rs
@@ -62,6 +62,7 @@ use std::collections::LinkedList;
 use std::default::Default;
 use std::fmt::Debug;
 
+use rand::seq::SliceRandom;
 use serde::{Deserialize, Serialize};
 
 use crate::algorithm::sort::quick_sort::QuickArgSort;
@@ -320,6 +321,10 @@ impl<T: RealNumber> DecisionTreeRegressor<T> {
             variables[i] = i;
         }
 
+        if mtry < n_attr {
+            variables.shuffle(&mut rand::thread_rng());
+        }
+
         let parent_gain =
             T::from(n).unwrap() * self.nodes[visitor.node].output * self.nodes[visitor.node].output;