Merge potential next release v0.4 (#187)

Breaking Changes

* First draft of the new n-dimensional arrays + NB use case
* Improves default implementation of multiple Array methods
* Refactors tree methods
* Adds matrix decomposition routines
* Adds matrix decomposition methods to ndarray and nalgebra bindings
* Refactoring + linear regression now uses array2
* Ridge & Linear regression
* LBFGS optimizer & logistic regression
* Changes linear methods, metrics and model selection methods to new n-dimensional arrays
* Switches KNN and clustering algorithms to new n-d array layer
* Refactors distance metrics
* Optimizes KNN and clustering methods
* Refactors metrics module
* Switches decomposition methods to n-dimensional arrays
* Linalg refactoring - cleanup rng merge (#172)
* Remove legacy DenseMatrix and BaseMatrix implementations. Port the new Number, FloatNumber and Array implementations into the module structure.
* Exclude AUC metrics. Needs reimplementation
* Improve developers walkthrough. New traits system in place at `src/numbers` and `src/linalg`. Co-authored-by: Lorenzo <tunedconsulting@gmail.com>
* Provide SupervisedEstimator with a constructor to avoid explicit dynamic Box allocation in `cross_validate` and `cross_validate_predict`, as required by the use of `dyn` in Rust 2021
* Implement getters to use as_ref() in src/neighbors
* Implement getters to use as_ref() in src/naive_bayes
* Implement getters to use as_ref() in src/linear
* Add Clone to src/naive_bayes
* Change signatures of cross_validate and other model_selection functions to abide by the use of `dyn` in Rust 2021
* Implement ndarray bindings. Remove FloatNumber from implementations
* Drop nalgebra bindings support (as decided in the conf call to go with ndarray)
* Remove benches. Benches will have their own repo at smartcore-benches
* Implement SVC
* Implement SVC serialization. Move search parameters into a dedicated module
* Implement SVR. Definitely too slow
* Fix compilation issues for wasm (#202). Co-authored-by: Luis Moreno <morenol@users.noreply.github.com>
* Fix tests (#203)
* Port linalg/traits/stats.rs
* Improve method naming
* Improve Display for DenseMatrix

Co-authored-by: Montana Low <montanalow@users.noreply.github.com>
Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
+121 −72
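Before the diff, a minimal sketch of the new calling convention this change introduces, mirroring the updated doctest and tests below: `DenseMatrix` now comes from `linalg::basic::matrix`, distance metrics live under `metrics::distance`, and class labels are integer vectors rather than floats (the bare integer literals default to `i32`; any label type satisfying `Number + Ord` should work):

```
use smartcore::linalg::basic::matrix::DenseMatrix;
use smartcore::neighbors::knn_classifier::KNNClassifier;

fn main() {
    // Five training samples, two numeric features each.
    let x = DenseMatrix::from_2d_array(&[
        &[1., 2.],
        &[3., 4.],
        &[5., 6.],
        &[7., 8.],
        &[9., 10.],
    ]);
    // Class labels are now integers, not floats (breaking change).
    let y = vec![2, 2, 2, 3, 3];

    let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
    let y_hat = knn.predict(&x).unwrap();
    assert_eq!(5, y_hat.len());
}
```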
@@ -12,9 +12,9 @@
 //! To fit the model to a 5 x 2 matrix with 5 training samples, 2 features per sample:
 //!
 //! ```
-//! use smartcore::linalg::naive::dense_matrix::*;
+//! use smartcore::linalg::basic::matrix::DenseMatrix;
 //! use smartcore::neighbors::knn_classifier::*;
-//! use smartcore::math::distance::*;
+//! use smartcore::metrics::distance::*;
 //!
 //! // your explanatory variables. Each row is a training sample with 2 numerical features
 //! let x = DenseMatrix::from_2d_array(&[
@@ -23,7 +23,7 @@
 //!     &[5., 6.],
 //!     &[7., 8.],
 //!     &[9., 10.]]);
-//! let y = vec![2., 2., 2., 3., 3.]; // your class labels
+//! let y = vec![2, 2, 2, 3, 3]; // your class labels
 //!
 //! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
 //! let y_hat = knn.predict(&x).unwrap();
@@ -39,16 +39,16 @@ use serde::{Deserialize, Serialize};
 use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
 use crate::api::{Predictor, SupervisedEstimator};
 use crate::error::Failed;
-use crate::linalg::{row_iter, Matrix};
-use crate::math::distance::euclidian::Euclidian;
-use crate::math::distance::{Distance, Distances};
-use crate::math::num::RealNumber;
+use crate::linalg::basic::arrays::{Array1, Array2};
+use crate::metrics::distance::euclidian::Euclidian;
+use crate::metrics::distance::{Distance, Distances};
 use crate::neighbors::KNNWeightFunction;
+use crate::numbers::basenum::Number;

 /// `KNNClassifier` parameters. Use `Default::default()` for default values.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Debug, Clone)]
-pub struct KNNClassifierParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
+pub struct KNNClassifierParameters<T: Number, D: Distance<Vec<T>>> {
     #[cfg_attr(feature = "serde", serde(default))]
     /// a function that defines a distance between each pair of points in training data.
     /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
@@ -71,15 +71,44 @@ pub struct KNNClassifierParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
 /// K Nearest Neighbors Classifier
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Debug)]
-pub struct KNNClassifier<T: RealNumber, D: Distance<Vec<T>, T>> {
-    classes: Vec<T>,
-    y: Vec<usize>,
-    knn_algorithm: KNNAlgorithm<T, D>,
-    weight: KNNWeightFunction,
-    k: usize,
+pub struct KNNClassifier<
+    TX: Number,
+    TY: Number + Ord,
+    X: Array2<TX>,
+    Y: Array1<TY>,
+    D: Distance<Vec<TX>>,
+> {
+    classes: Option<Vec<TY>>,
+    y: Option<Vec<usize>>,
+    knn_algorithm: Option<KNNAlgorithm<TX, D>>,
+    weight: Option<KNNWeightFunction>,
+    k: Option<usize>,
+    _phantom_tx: PhantomData<TX>,
+    _phantom_x: PhantomData<X>,
+    _phantom_y: PhantomData<Y>,
 }

-impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifierParameters<T, D> {
+impl<TX: Number, TY: Number + Ord, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
+    KNNClassifier<TX, TY, X, Y, D>
+{
+    fn classes(&self) -> &Vec<TY> {
+        self.classes.as_ref().unwrap()
+    }
+    fn y(&self) -> &Vec<usize> {
+        self.y.as_ref().unwrap()
+    }
+    fn knn_algorithm(&self) -> &KNNAlgorithm<TX, D> {
+        self.knn_algorithm.as_ref().unwrap()
+    }
+    fn weight(&self) -> &KNNWeightFunction {
+        self.weight.as_ref().unwrap()
+    }
+    fn k(&self) -> usize {
+        self.k.unwrap()
+    }
+}
+
+impl<T: Number, D: Distance<Vec<T>>> KNNClassifierParameters<T, D> {
     /// number of training samples to consider when estimating class for new point. Default value is 3.
     pub fn with_k(mut self, k: usize) -> Self {
         self.k = k;
@@ -88,7 +117,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifierParameters<T, D> {
     /// a function that defines a distance between each pair of points in training data.
     /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
     /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
-    pub fn with_distance<DD: Distance<Vec<T>, T>>(
+    pub fn with_distance<DD: Distance<Vec<T>>>(
         self,
         distance: DD,
     ) -> KNNClassifierParameters<T, DD> {
@@ -112,7 +141,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifierParameters<T, D> {
     }
 }

-impl<T: RealNumber> Default for KNNClassifierParameters<T, Euclidian> {
+impl<T: Number> Default for KNNClassifierParameters<T, Euclidian<T>> {
     fn default() -> Self {
         KNNClassifierParameters {
             distance: Distances::euclidian(),
@@ -124,21 +153,23 @@ impl<T: RealNumber> Default for KNNClassifierParameters<T, Euclidian> {
     }
 }

-impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for KNNClassifier<T, D> {
+impl<TX: Number, TY: Number + Ord, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> PartialEq
+    for KNNClassifier<TX, TY, X, Y, D>
+{
     fn eq(&self, other: &Self) -> bool {
-        if self.classes.len() != other.classes.len()
-            || self.k != other.k
-            || self.y.len() != other.y.len()
+        if self.classes().len() != other.classes().len()
+            || self.k() != other.k()
+            || self.y().len() != other.y().len()
         {
             false
         } else {
-            for i in 0..self.classes.len() {
-                if (self.classes[i] - other.classes[i]).abs() > T::epsilon() {
+            for i in 0..self.classes().len() {
+                if self.classes()[i] != other.classes()[i] {
                     return false;
                 }
             }
-            for i in 0..self.y.len() {
-                if self.y[i] != other.y[i] {
+            for i in 0..self.y().len() {
+                if self.y().get(i) != other.y().get(i) {
                     return false;
                 }
             }
@@ -147,48 +178,59 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for KNNClassifier<T, D> {
     }
 }

-impl<T: RealNumber, M: Matrix<T>, D: Distance<Vec<T>, T>>
-    SupervisedEstimator<M, M::RowVector, KNNClassifierParameters<T, D>> for KNNClassifier<T, D>
+impl<TX: Number, TY: Number + Ord, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
+    SupervisedEstimator<X, Y, KNNClassifierParameters<TX, D>> for KNNClassifier<TX, TY, X, Y, D>
 {
-    fn fit(
-        x: &M,
-        y: &M::RowVector,
-        parameters: KNNClassifierParameters<T, D>,
-    ) -> Result<Self, Failed> {
+    fn new() -> Self {
+        Self {
+            classes: Option::None,
+            y: Option::None,
+            knn_algorithm: Option::None,
+            weight: Option::None,
+            k: Option::None,
+            _phantom_tx: PhantomData,
+            _phantom_x: PhantomData,
+            _phantom_y: PhantomData,
+        }
+    }
+    fn fit(x: &X, y: &Y, parameters: KNNClassifierParameters<TX, D>) -> Result<Self, Failed> {
         KNNClassifier::fit(x, y, parameters)
     }
 }

-impl<T: RealNumber, M: Matrix<T>, D: Distance<Vec<T>, T>> Predictor<M, M::RowVector>
-    for KNNClassifier<T, D>
+impl<TX: Number, TY: Number + Ord, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
+    Predictor<X, Y> for KNNClassifier<TX, TY, X, Y, D>
 {
-    fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
+    fn predict(&self, x: &X) -> Result<Y, Failed> {
         self.predict(x)
     }
 }

-impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
+impl<TX: Number, TY: Number + Ord, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
+    KNNClassifier<TX, TY, X, Y, D>
+{
     /// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features.
     /// * `x` - training data
     /// * `y` - vector with target values (classes) of length N
     /// * `parameters` - additional parameters like search algorithm and k
-    pub fn fit<M: Matrix<T>>(
-        x: &M,
-        y: &M::RowVector,
-        parameters: KNNClassifierParameters<T, D>,
-    ) -> Result<KNNClassifier<T, D>, Failed> {
-        let y_m = M::from_row_vector(y.clone());
-
-        let (_, y_n) = y_m.shape();
+    pub fn fit(
+        x: &X,
+        y: &Y,
+        parameters: KNNClassifierParameters<TX, D>,
+    ) -> Result<KNNClassifier<TX, TY, X, Y, D>, Failed> {
+        let y_n = y.shape();
         let (x_n, _) = x.shape();

-        let data = row_iter(x).collect();
+        let data = x
+            .row_iter()
+            .map(|row| row.iterator(0).copied().collect())
+            .collect();

         let mut yi: Vec<usize> = vec![0; y_n];
-        let classes = y_m.unique();
+        let classes = y.unique();

         for (i, yi_i) in yi.iter_mut().enumerate().take(y_n) {
-            let yc = y_m.get(0, i);
+            let yc = *y.get(i);
             *yi_i = classes.iter().position(|c| yc == *c).unwrap();
         }
@@ -207,43 +249,50 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
         }

         Ok(KNNClassifier {
-            classes,
-            y: yi,
-            k: parameters.k,
-            knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?,
-            weight: parameters.weight,
+            classes: Some(classes),
+            y: Some(yi),
+            k: Some(parameters.k),
+            knn_algorithm: Some(parameters.algorithm.fit(data, parameters.distance)?),
+            weight: Some(parameters.weight),
+            _phantom_tx: PhantomData,
+            _phantom_x: PhantomData,
+            _phantom_y: PhantomData,
         })
     }

     /// Estimates the class labels for the provided data.
     /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
     /// Returns a vector of size N with class estimates.
-    pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> {
-        let mut result = M::zeros(1, x.shape().0);
+    pub fn predict(&self, x: &X) -> Result<Y, Failed> {
+        let mut result = Y::zeros(x.shape().0);

-        for (i, x) in row_iter(x).enumerate() {
-            result.set(0, i, self.classes[self.predict_for_row(x)?]);
+        let mut row_vec = vec![TX::zero(); x.shape().1];
+        for (i, row) in x.row_iter().enumerate() {
+            row.iterator(0)
+                .zip(row_vec.iter_mut())
+                .for_each(|(&s, v)| *v = s);
+            result.set(i, self.classes()[self.predict_for_row(&row_vec)?]);
         }

-        Ok(result.to_row_vector())
+        Ok(result)
     }

-    fn predict_for_row(&self, x: Vec<T>) -> Result<usize, Failed> {
-        let search_result = self.knn_algorithm.find(&x, self.k)?;
+    fn predict_for_row(&self, row: &Vec<TX>) -> Result<usize, Failed> {
+        let search_result = self.knn_algorithm().find(row, self.k())?;

         let weights = self
-            .weight
+            .weight()
             .calc_weights(search_result.iter().map(|v| v.1).collect());
-        let w_sum = weights.iter().copied().sum();
+        let w_sum: f64 = weights.iter().copied().sum();

-        let mut c = vec![T::zero(); self.classes.len()];
-        let mut max_c = T::zero();
+        let mut c = vec![0f64; self.classes().len()];
+        let mut max_c = 0f64;
         let mut max_i = 0;
         for (r, w) in search_result.iter().zip(weights.iter()) {
-            c[self.y[r.0]] += *w / w_sum;
-            if c[self.y[r.0]] > max_c {
-                max_c = c[self.y[r.0]];
-                max_i = self.y[r.0];
+            c[self.y()[r.0]] += *w / w_sum;
+            if c[self.y()[r.0]] > max_c {
+                max_c = c[self.y()[r.0]];
+                max_i = self.y()[r.0];
             }
         }
@@ -254,14 +303,14 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::linalg::naive::dense_matrix::DenseMatrix;
+    use crate::linalg::basic::matrix::DenseMatrix;

     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
     #[test]
     fn knn_fit_predict() {
         let x =
             DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
-        let y = vec![2., 2., 2., 3., 3.];
+        let y = vec![2, 2, 2, 3, 3];
         let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
         let y_hat = knn.predict(&x).unwrap();
         assert_eq!(5, Vec::len(&y_hat));
@@ -272,7 +321,7 @@ mod tests {
     #[test]
     fn knn_fit_predict_weighted() {
         let x = DenseMatrix::from_2d_array(&[&[1.], &[2.], &[3.], &[4.], &[5.]]);
-        let y = vec![2., 2., 2., 3., 3.];
+        let y = vec![2, 2, 2, 3, 3];
         let knn = KNNClassifier::fit(
             &x,
             &y,
@@ -283,7 +332,7 @@ mod tests {
         )
         .unwrap();
         let y_hat = knn.predict(&DenseMatrix::from_2d_array(&[&[4.1]])).unwrap();
-        assert_eq!(vec![3.0], y_hat);
+        assert_eq!(vec![3], y_hat);
     }

     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
@@ -292,7 +341,7 @@ mod tests {
     fn serde() {
         let x =
             DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
-        let y = vec![2., 2., 2., 3., 3.];
+        let y = vec![2, 2, 2, 3, 3];

         let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
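A note on the recurring pattern in this diff: every fitted field of `KNNClassifier` is wrapped in `Option`, `SupervisedEstimator::new()` returns an unfitted placeholder, and private getters (`self.classes.as_ref().unwrap()` and friends) recover the values after `fit`. That is what lets `cross_validate` and `cross_validate_predict` construct estimators by type, without `Box<dyn ...>`, under Rust 2021. A standalone sketch of the idea, with illustrative names that are not part of the smartcore API:

```
/// Illustrative only: a stripped-down estimator using the same
/// Option-wrapped-state pattern this diff applies to KNNClassifier.
struct MeanModel {
    mean: Option<f64>, // None until fit() runs
}

impl MeanModel {
    /// Unfitted placeholder, analogous to SupervisedEstimator::new().
    fn new() -> Self {
        Self { mean: None }
    }

    fn fit(x: &[f64]) -> Self {
        Self {
            mean: Some(x.iter().sum::<f64>() / x.len() as f64),
        }
    }

    /// Getter in the style of the diff's classes()/y()/k();
    /// panics if called before fit(), exactly like those getters.
    fn mean(&self) -> f64 {
        self.mean.unwrap()
    }

    fn predict(&self, n: usize) -> Vec<f64> {
        vec![self.mean(); n]
    }
}

fn main() {
    let _slot = MeanModel::new(); // typed and allocatable, but not yet usable
    let model = MeanModel::fit(&[1.0, 2.0, 3.0]);
    assert_eq!(model.predict(2), vec![2.0, 2.0]);
}
```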