Merge potential next release v0.4 (#187) Breaking Changes

* First draft of the new n-dimensional arrays + NB use case * Improves default implementation of multiple Array methods * Refactors tree methods * Adds matrix decomposition routines * Adds matrix decomposition methods to ndarray and nalgebra bindings * Refactoring + linear regression now uses array2 * Ridge & Linear regression * LBFGS optimizer & logistic regression * LBFGS optimizer & logistic regression * Changes linear methods, metrics and model selection methods to new n-dimensional arrays * Switches KNN and clustering algorithms to new n-d array layer * Refactors distance metrics * Optimizes knn and clustering methods * Refactors metrics module * Switches decomposition methods to n-dimensional arrays * Linalg refactoring - cleanup rng merge (#172) * Remove legacy DenseMatrix and BaseMatrix implementation. Port the new Number, FloatNumber and Array implementation into module structure. * Exclude AUC metrics. Needs reimplementation * Improve developers walkthrough New traits system in place at `src/numbers` and `src/linalg` Co-authored-by: Lorenzo <tunedconsulting@gmail.com> * Provide SupervisedEstimator with a constructor to avoid explicit dynamical box allocation in 'cross_validate' and 'cross_validate_predict' as required by the use of 'dyn' as per Rust 2021 * Implement getters to use as_ref() in src/neighbors * Implement getters to use as_ref() in src/naive_bayes * Implement getters to use as_ref() in src/linear * Add Clone to src/naive_bayes * Change signature for cross_validate and other model_selection functions to abide to use of dyn in Rust 2021 * Implement ndarray-bindings. Remove FloatNumber from implementations * Drop nalgebra-bindings support (as decided in conf-call to go for ndarray) * Remove benches. Benches will have their own repo at smartcore-benches * Implement SVC * Implement SVC serialization. Move search parameters in dedicated module * Implement SVR. 
Definitely too slow * Fix compilation issues for wasm (#202) Co-authored-by: Luis Moreno <morenol@users.noreply.github.com> * Fix tests (#203) * Port linalg/traits/stats.rs * Improve methods naming * Improve Display for DenseMatrix Co-authored-by: Montana Low <montanalow@users.noreply.github.com> Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
2022-10-31 10:44:57 +00:00
parent bb71656137
commit 52eb6ce023
110 changed files with 10327 additions and 9107 deletions
@@ -19,18 +19,19 @@
 //! Example:
 //!
 //! ```
-//! use smartcore::linalg::naive::dense_matrix::*;
+//! use smartcore::linalg::basic::matrix::DenseMatrix;
+//! use smartcore::linalg::basic::arrays::Array2;
 //! use smartcore::cluster::dbscan::*;
-//! use smartcore::math::distance::Distances;
+//! use smartcore::metrics::distance::Distances;
 //! use smartcore::neighbors::KNNAlgorithmName;
 //! use smartcore::dataset::generator;
 //!
 //! // Generate three blobs
 //! let blobs = generator::make_blobs(100, 2, 3);
-//! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data);
+//! let x: DenseMatrix<f32> = DenseMatrix::from_iterator(blobs.data.into_iter(), 100, 2, 0);
 //! // Fit the algorithm and predict cluster labels
-//! let labels = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0)).
-//!     and_then(|dbscan| dbscan.predict(&x));
+//! let labels: Vec<u32> = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0)).
+//!     and_then(|dbscan| dbscan.predict(&x)).unwrap();
 //!
 //! println!("{:?}", labels);
 //! ```
@@ -41,7 +42,7 @@
 //! * ["Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and its Applications", Sander J., Ester M., Kriegel HP., Xu X.](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.1629&rep=rep1&type=pdf)

 use std::fmt::Debug;
-use std::iter::Sum;
+use std::marker::PhantomData;

 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
@@ -49,26 +50,29 @@ use serde::{Deserialize, Serialize};
 use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
 use crate::api::{Predictor, UnsupervisedEstimator};
 use crate::error::Failed;
-use crate::linalg::{row_iter, Matrix};
-use crate::math::distance::euclidian::Euclidian;
-use crate::math::distance::{Distance, Distances};
-use crate::math::num::RealNumber;
+use crate::linalg::basic::arrays::{Array1, Array2};
+use crate::metrics::distance::euclidian::Euclidian;
+use crate::metrics::distance::{Distance, Distances};
+use crate::numbers::basenum::Number;
 use crate::tree::decision_tree_classifier::which_max;

 /// DBSCAN clustering algorithm
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Debug)]
-pub struct DBSCAN<T: RealNumber, D: Distance<Vec<T>, T>> {
+pub struct DBSCAN<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> {
    cluster_labels: Vec<i16>,
    num_classes: usize,
-    knn_algorithm: KNNAlgorithm<T, D>,
-    eps: T,
+    knn_algorithm: KNNAlgorithm<TX, D>,
+    eps: f64,
+    _phantom_ty: PhantomData<TY>,
+    _phantom_x: PhantomData<X>,
+    _phantom_y: PhantomData<Y>,
 }

 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Debug, Clone)]
 /// DBSCAN clustering algorithm parameters
-pub struct DBSCANParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
+pub struct DBSCANParameters<T: Number, D: Distance<Vec<T>>> {
    #[cfg_attr(feature = "serde", serde(default))]
    /// a function that defines a distance between each pair of points in training data.
    /// This function should implement the [`Distance`](../../metrics/distance/trait.Distance.html) trait.
@@ -79,22 +83,25 @@ pub struct DBSCANParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
    pub min_samples: usize,
    #[cfg_attr(feature = "serde", serde(default))]
    /// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
-    pub eps: T,
+    pub eps: f64,
    #[cfg_attr(feature = "serde", serde(default))]
    /// KNN algorithm to use.
    pub algorithm: KNNAlgorithmName,
+    #[cfg_attr(feature = "serde", serde(default))]
+    _phantom_t: PhantomData<T>,
 }

-impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
+impl<T: Number, D: Distance<Vec<T>>> DBSCANParameters<T, D> {
    /// a function that defines a distance between each pair of points in training data.
    /// This function should implement the [`Distance`](../../metrics/distance/trait.Distance.html) trait.
    /// See [`Distances`](../../metrics/distance/struct.Distances.html) for a list of available functions.
-    pub fn with_distance<DD: Distance<Vec<T>, T>>(self, distance: DD) -> DBSCANParameters<T, DD> {
+    pub fn with_distance<DD: Distance<Vec<T>>>(self, distance: DD) -> DBSCANParameters<T, DD> {
        DBSCANParameters {
            distance,
            min_samples: self.min_samples,
            eps: self.eps,
            algorithm: self.algorithm,
+            _phantom_t: PhantomData,
        }
    }
    /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
@@ -103,7 +110,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
        self
    }
    /// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
-    pub fn with_eps(mut self, eps: T) -> Self {
+    pub fn with_eps(mut self, eps: f64) -> Self {
        self.eps = eps;
        self
    }
@@ -117,7 +124,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
 /// DBSCAN grid search parameters
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[derive(Debug, Clone)]
-pub struct DBSCANSearchParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
+pub struct DBSCANSearchParameters<T: Number, D: Distance<Vec<T>>> {
    #[cfg_attr(feature = "serde", serde(default))]
    /// a function that defines a distance between each pair of points in training data.
    /// This function should implement the [`Distance`](../../metrics/distance/trait.Distance.html) trait.
@@ -128,14 +135,15 @@ pub struct DBSCANSearchParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
    pub min_samples: Vec<usize>,
    #[cfg_attr(feature = "serde", serde(default))]
    /// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
-    pub eps: Vec<T>,
+    pub eps: Vec<f64>,
    #[cfg_attr(feature = "serde", serde(default))]
    /// KNN algorithm to use.
    pub algorithm: Vec<KNNAlgorithmName>,
+    _phantom_t: PhantomData<T>,
 }

 /// DBSCAN grid search iterator
-pub struct DBSCANSearchParametersIterator<T: RealNumber, D: Distance<Vec<T>, T>> {
+pub struct DBSCANSearchParametersIterator<T: Number, D: Distance<Vec<T>>> {
    dbscan_search_parameters: DBSCANSearchParameters<T, D>,
    current_distance: usize,
    current_min_samples: usize,
@@ -143,7 +151,7 @@ pub struct DBSCANSearchParametersIterator<T: RealNumber, D: Distance<Vec<T>, T>>
    current_algorithm: usize,
 }

-impl<T: RealNumber, D: Distance<Vec<T>, T>> IntoIterator for DBSCANSearchParameters<T, D> {
+impl<T: Number, D: Distance<Vec<T>>> IntoIterator for DBSCANSearchParameters<T, D> {
    type Item = DBSCANParameters<T, D>;
    type IntoIter = DBSCANSearchParametersIterator<T, D>;

@@ -158,7 +166,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> IntoIterator for DBSCANSearchParamet
    }
 }

-impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersIterator<T, D> {
+impl<T: Number, D: Distance<Vec<T>>> Iterator for DBSCANSearchParametersIterator<T, D> {
    type Item = DBSCANParameters<T, D>;

    fn next(&mut self) -> Option<Self::Item> {
@@ -175,6 +183,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersI
            min_samples: self.dbscan_search_parameters.min_samples[self.current_min_samples],
            eps: self.dbscan_search_parameters.eps[self.current_eps],
            algorithm: self.dbscan_search_parameters.algorithm[self.current_algorithm].clone(),
+            _phantom_t: PhantomData,
        };

        if self.current_distance + 1 < self.dbscan_search_parameters.distance.len() {
@@ -202,7 +211,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersI
    }
 }

-impl<T: RealNumber> Default for DBSCANSearchParameters<T, Euclidian> {
+impl<T: Number> Default for DBSCANSearchParameters<T, Euclidian<T>> {
    fn default() -> Self {
        let default_params = DBSCANParameters::default();

@@ -211,11 +220,14 @@ impl<T: RealNumber> Default for DBSCANSearchParameters<T, Euclidian> {
            min_samples: vec![default_params.min_samples],
            eps: vec![default_params.eps],
            algorithm: vec![default_params.algorithm],
+            _phantom_t: PhantomData,
        }
    }
 }

-impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
+impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> PartialEq
+    for DBSCAN<TX, TY, X, Y, D>
+{
    fn eq(&self, other: &Self) -> bool {
        self.cluster_labels.len() == other.cluster_labels.len()
            && self.num_classes == other.num_classes
@@ -224,47 +236,50 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
    }
 }

-impl<T: RealNumber> Default for DBSCANParameters<T, Euclidian> {
+impl<T: Number> Default for DBSCANParameters<T, Euclidian<T>> {
    fn default() -> Self {
        DBSCANParameters {
            distance: Distances::euclidian(),
            min_samples: 5,
-            eps: T::half(),
+            eps: 0.5f64,
            algorithm: KNNAlgorithmName::default(),
+            _phantom_t: PhantomData,
        }
    }
 }

-impl<T: RealNumber + Sum, M: Matrix<T>, D: Distance<Vec<T>, T>>
-    UnsupervisedEstimator<M, DBSCANParameters<T, D>> for DBSCAN<T, D>
+impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
+    UnsupervisedEstimator<X, DBSCANParameters<TX, D>> for DBSCAN<TX, TY, X, Y, D>
 {
-    fn fit(x: &M, parameters: DBSCANParameters<T, D>) -> Result<Self, Failed> {
+    fn fit(x: &X, parameters: DBSCANParameters<TX, D>) -> Result<Self, Failed> {
        DBSCAN::fit(x, parameters)
    }
 }

-impl<T: RealNumber, M: Matrix<T>, D: Distance<Vec<T>, T>> Predictor<M, M::RowVector>
-    for DBSCAN<T, D>
+impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> Predictor<X, Y>
+    for DBSCAN<TX, TY, X, Y, D>
 {
-    fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
+    fn predict(&self, x: &X) -> Result<Y, Failed> {
        self.predict(x)
    }
 }

-impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
+impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
+    DBSCAN<TX, TY, X, Y, D>
+{
    /// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features.
    /// * `x` - training instances to cluster, one sample per row
    /// * `parameters` - cluster parameters; fitting fails if `min_samples` is less than 1
    ///   or `eps` is zero or negative
-    pub fn fit<M: Matrix<T>>(
-        x: &M,
-        parameters: DBSCANParameters<T, D>,
-    ) -> Result<DBSCAN<T, D>, Failed> {
+    pub fn fit(
+        x: &X,
+        parameters: DBSCANParameters<TX, D>,
+    ) -> Result<DBSCAN<TX, TY, X, Y, D>, Failed> {
        if parameters.min_samples < 1 {
            return Err(Failed::fit("Invalid minPts"));
        }

-        if parameters.eps <= T::zero() {
+        if parameters.eps <= 0f64 {
            return Err(Failed::fit("Invalid radius: "));
        }

@@ -276,13 +291,19 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
        let n = x.shape().0;
        let mut y = vec![undefined; n];

-        let algo = parameters
-            .algorithm
-            .fit(row_iter(x).collect(), parameters.distance)?;
+        let algo = parameters.algorithm.fit(
+            x.row_iter()
+                .map(|row| row.iterator(0).cloned().collect())
+                .collect(),
+            parameters.distance,
+        )?;

-        for (i, e) in row_iter(x).enumerate() {
+        let mut row = vec![TX::zero(); x.shape().1];
+
+        for (i, e) in x.row_iter().enumerate() {
            if y[i] == undefined {
-                let mut neighbors = algo.find_radius(&e, parameters.eps)?;
+                e.iterator(0).zip(row.iter_mut()).for_each(|(&x, r)| *r = x);
+                let mut neighbors = algo.find_radius(&row, parameters.eps)?;
                if neighbors.len() < parameters.min_samples {
                    y[i] = outlier;
                } else {
@@ -333,18 +354,25 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
            num_classes: k as usize,
            knn_algorithm: algo,
            eps: parameters.eps,
+            _phantom_ty: PhantomData,
+            _phantom_x: PhantomData,
+            _phantom_y: PhantomData,
        })
    }

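    /// Predict clusters for `x`. Labels are shifted by one: cluster `c` is reported as `c + 1`, and `0` is reported when no cluster is chosen.
    /// * `x` - matrix with new data to label of size _KxM_ , where _K_ is number of new samples and _M_ is number of features.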
-    pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> {
-        let (n, m) = x.shape();
-        let mut result = M::zeros(1, n);
-        let mut row = vec![T::zero(); m];
+    pub fn predict(&self, x: &X) -> Result<Y, Failed> {
+        let (n, _) = x.shape();
+        let mut result = Y::zeros(n);
+
+        let mut row = vec![TX::zero(); x.shape().1];

        for i in 0..n {
-            x.copy_row_as_vec(i, &mut row);
+            x.get_row(i)
+                .iterator(0)
+                .zip(row.iter_mut())
+                .for_each(|(&x, r)| *r = x);
            let neighbors = self.knn_algorithm.find_radius(&row, self.eps)?;
            let mut label = vec![0usize; self.num_classes + 1];
            for neighbor in neighbors {
@@ -357,26 +385,26 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
            }
            let class = which_max(&label);
            if class != self.num_classes {
-                result.set(0, i, T::from(class).unwrap());
+                result.set(i, TY::from(class + 1).unwrap());
            } else {
-                result.set(0, i, -T::one());
+                result.set(i, TY::zero());
            }
        }

-        Ok(result.to_row_vector())
+        Ok(result)
    }
 }

 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::linalg::naive::dense_matrix::DenseMatrix;
+    use crate::linalg::basic::matrix::DenseMatrix;
    #[cfg(feature = "serde")]
-    use crate::math::distance::euclidian::Euclidian;
+    use crate::metrics::distance::euclidian::Euclidian;

    #[test]
    fn search_parameters() {
-        let parameters = DBSCANSearchParameters {
+        let parameters: DBSCANSearchParameters<f64, Euclidian<f64>> = DBSCANSearchParameters {
            min_samples: vec![10, 100],
            eps: vec![1., 2.],
            ..Default::default()
@@ -414,7 +442,7 @@ mod tests {
            &[3.0, 5.0],
        ]);

-        let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0];
+        let expected_labels = vec![1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0];

        let dbscan = DBSCAN::fit(
            &x,
@@ -424,7 +452,7 @@ mod tests {
        )
        .unwrap();

-        let predicted_labels = dbscan.predict(&x).unwrap();
+        let predicted_labels: Vec<i32> = dbscan.predict(&x).unwrap();

        assert_eq!(expected_labels, predicted_labels);
    }
@@ -458,9 +486,23 @@ mod tests {

        let dbscan = DBSCAN::fit(&x, Default::default()).unwrap();

-        let deserialized_dbscan: DBSCAN<f64, Euclidian> =
+        let deserialized_dbscan: DBSCAN<f32, f32, DenseMatrix<f32>, Vec<f32>, Euclidian<f32>> =
            serde_json::from_str(&serde_json::to_string(&dbscan).unwrap()).unwrap();

        assert_eq!(dbscan, deserialized_dbscan);
    }
+    use crate::dataset::generator;
+
+    #[test]
+    fn from_vec() {
+        // Generate three blobs
+        let blobs = generator::make_blobs(100, 2, 3);
+        let x: DenseMatrix<f32> = DenseMatrix::from_iterator(blobs.data.into_iter(), 100, 2, 0);
+        // Fit the algorithm and predict cluster labels
+        let labels: Vec<i32> = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0))
+            .and_then(|dbscan| dbscan.predict(&x))
+            .unwrap();
+
+        println!("{:?}", labels);
+    }
 }