Merge potential next release v0.4 (#187) Breaking Changes

* First draft of the new n-dimensional arrays + NB use case * Improves default implementation of multiple Array methods * Refactors tree methods * Adds matrix decomposition routines * Adds matrix decomposition methods to ndarray and nalgebra bindings * Refactoring + linear regression now uses array2 * Ridge & Linear regression * LBFGS optimizer & logistic regression * LBFGS optimizer & logistic regression * Changes linear methods, metrics and model selection methods to new n-dimensional arrays * Switches KNN and clustering algorithms to new n-d array layer * Refactors distance metrics * Optimizes knn and clustering methods * Refactors metrics module * Switches decomposition methods to n-dimensional arrays * Linalg refactoring - cleanup rng merge (#172) * Remove legacy DenseMatrix and BaseMatrix implementation. Port the new Number, FloatNumber and Array implementation into module structure. * Exclude AUC metrics. Needs reimplementation * Improve developers walkthrough New traits system in place at `src/numbers` and `src/linalg` Co-authored-by: Lorenzo <tunedconsulting@gmail.com> * Provide SupervisedEstimator with a constructor to avoid explicit dynamical box allocation in 'cross_validate' and 'cross_validate_predict' as required by the use of 'dyn' as per Rust 2021 * Implement getters to use as_ref() in src/neighbors * Implement getters to use as_ref() in src/naive_bayes * Implement getters to use as_ref() in src/linear * Add Clone to src/naive_bayes * Change signature for cross_validate and other model_selection functions to abide to use of dyn in Rust 2021 * Implement ndarray-bindings. Remove FloatNumber from implementations * Drop nalgebra-bindings support (as decided in conf-call to go for ndarray) * Remove benches. Benches will have their own repo at smartcore-benches * Implement SVC * Implement SVC serialization. Move search parameters in dedicated module * Implement SVR. 
Definitely too slow * Fix compilation issues for wasm (#202) Co-authored-by: Luis Moreno <morenol@users.noreply.github.com> * Fix tests (#203) * Port linalg/traits/stats.rs * Improve methods naming * Improve Display for DenseMatrix Co-authored-by: Montana Low <montanalow@users.noreply.github.com> Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
2022-10-31 10:44:57 +00:00
parent a32eb66a6a
commit a7fa0585eb
110 changed files with 10327 additions and 9107 deletions
@@ -0,0 +1,89 @@
+//! # Euclidian Metric Distance
+//!
+//! The Euclidean distance (L2) between two points \\( x \\) and \\( y \\) in n-space is defined as
+//!
+//! \\[ d(x, y) = \sqrt{\sum_{i=1}^n (x_i - y_i)^2} \\]
+//!
+//! Example:
+//!
+//! ```
+//! use smartcore::metrics::distance::Distance;
+//! use smartcore::metrics::distance::euclidian::Euclidian;
+//!
+//! let x = vec![1., 1.];
+//! let y = vec![2., 2.];
+//!
+//! let l2: f64 = Euclidian::new().distance(&x, &y);
+//! ```
+//!
+//! <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
+//! <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+use std::marker::PhantomData;
+
+use crate::linalg::basic::arrays::ArrayView1;
+use crate::numbers::basenum::Number;
+
+use super::Distance;
+
+/// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space.
+///
+/// The struct holds no data; `T` only records the element type of the vectors being compared.
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
+pub struct Euclidian<T> {
+    // Zero-sized marker that ties the metric to the element type `T`.
+    _t: PhantomData<T>,
+}
+
+impl<T: Number> Default for Euclidian<T> {
+    /// Produces the same stateless value as `Euclidian::new`.
+    fn default() -> Self {
+        Euclidian { _t: PhantomData }
+    }
+}
+
+impl<T: Number> Euclidian<T> {
+    /// Instantiates the metric. The struct carries no runtime state.
+    pub fn new() -> Euclidian<T> {
+        Euclidian { _t: PhantomData }
+    }
+
+    /// Returns the sum of squared element-wise differences between `x` and `y`,
+    /// i.e. the squared Euclidean distance.
+    ///
+    /// # Panics
+    /// Panics if `x` and `y` have different shapes, or if an element cannot be
+    /// converted to `f64`.
+    #[inline]
+    pub(crate) fn squared_distance<A: ArrayView1<T>>(x: &A, y: &A) -> f64 {
+        if x.shape() != y.shape() {
+            panic!("Input vector sizes are different.");
+        }
+
+        // Convert to f64 *before* subtracting and squaring: doing the arithmetic in `T`
+        // underflows for unsigned types when a < b and overflows for narrow integers.
+        x.iterator(0)
+            .zip(y.iterator(0))
+            .map(|(&a, &b)| {
+                let r = a.to_f64().unwrap() - b.to_f64().unwrap();
+                r * r
+            })
+            .sum()
+    }
+}
+
+impl<T: Number, A: ArrayView1<T>> Distance<A> for Euclidian<T> {
+    /// Euclidean (L2) distance: the square root of `squared_distance(x, y)`.
+    fn distance(&self, x: &A, y: &A) -> f64 {
+        let squared = Self::squared_distance(x, y);
+        squared.sqrt()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Despite its name, this test exercises `distance` (the square root of the
+    // squared distance), using integer-valued input vectors.
+    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+    #[test]
+    fn squared_distance() {
+        let a = vec![1, 2, 3];
+        let b = vec![4, 5, 6];
+
+        let l2: f64 = Euclidian::new().distance(&a, &b);
+
+        // Every coordinate differs by 3, so the expected value is sqrt(27).
+        assert!((l2 - 5.19615242).abs() < 1e-8);
+    }
+}
@@ -0,0 +1,83 @@
+//! # Hamming Distance
+//!
+//! Hamming Distance measures the similarity between two integer-valued vectors of the same length.
+//! Given two vectors \\( x \in ℝ^n \\), \\( y \in ℝ^n \\) the hamming distance between \\( x \\) and \\( y \\), \\( d(x, y) \\), is the number of places where \\( x \\) and \\( y \\) differ, divided by the vector length \\( n \\).
+//!
+//! Example:
+//!
+//! ```
+//! use smartcore::metrics::distance::Distance;
+//! use smartcore::metrics::distance::hamming::Hamming;
+//!
+//! let a = vec![1, 0, 0, 1, 0, 0, 1];
+//! let b = vec![1, 1, 0, 0, 1, 0, 1];
+//!
+//! let h: f64 = Hamming::new().distance(&a, &b);
+//!
+//! ```
+//!
+//! <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
+//! <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+use std::marker::PhantomData;
+
+use super::Distance;
+use crate::linalg::basic::arrays::ArrayView1;
+use crate::numbers::basenum::Number;
+
+/// Hamming distance between two integer-valued vectors of equal length.
+///
+/// The `Distance` implementation for this type returns the number of positions at which the two
+/// vectors differ, divided by the vector length.
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
+pub struct Hamming<T: Number> {
+    // Zero-sized marker that ties the metric to the element type `T`.
+    _t: PhantomData<T>,
+}
+
+impl<T: Number> Hamming<T> {
+    /// Instantiates the metric. The struct carries no runtime state.
+    pub fn new() -> Hamming<T> {
+        Self { _t: PhantomData }
+    }
+}
+
+impl<T: Number> Default for Hamming<T> {
+    /// Produces the same stateless value as `Hamming::new`.
+    fn default() -> Self {
+        Hamming { _t: PhantomData }
+    }
+}
+
+impl<T: Number, A: ArrayView1<T>> Distance<A> for Hamming<T> {
+    /// Returns the normalized Hamming distance: the number of positions at which `x` and `y`
+    /// differ, divided by the vector length. Two empty vectors are at distance `0.0`.
+    ///
+    /// # Panics
+    /// Panics if `x` and `y` have different shapes.
+    fn distance(&self, x: &A, y: &A) -> f64 {
+        if x.shape() != y.shape() {
+            panic!("Input vector sizes are different");
+        }
+
+        let len = x.shape();
+        // Guard the division below: 0 / 0 would otherwise produce NaN for empty input.
+        if len == 0 {
+            return 0f64;
+        }
+
+        let mismatches = x
+            .iterator(0)
+            .zip(y.iterator(0))
+            .filter(|(a, b)| a != b)
+            .count();
+
+        mismatches as f64 / len as f64
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Checks the normalized Hamming distance on two 7-element binary vectors.
+    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+    #[test]
+    fn hamming_distance() {
+        let a = vec![1, 0, 0, 1, 0, 0, 1];
+        let b = vec![1, 1, 0, 0, 1, 0, 1];
+
+        let h: f64 = Hamming::new().distance(&a, &b);
+
+        // The vectors differ at indices 1, 3 and 4, so the expected value is 3 / 7.
+        assert!((h - 0.42857142).abs() < 1e-8);
+    }
+}
@@ -0,0 +1,162 @@
+//! # Mahalanobis Distance
+//!
+//! The Mahalanobis distance (MD) is the distance between two points in multivariate space.
+//! In a regular Euclidean space the distance between any two points can be measured with [Euclidean distance](../euclidian/index.html).
+//! For uncorrelated variables, the Euclidean distance equals the MD. However, if two or more variables are correlated the measurements become impossible
+//! with Euclidean distance because the axes are no longer at right angles to each other. MD on the other hand, is scale-invariant,
+//! it takes into account the covariance matrix of the dataset when calculating distance between 2 points that belong to the same space as the dataset.
+//!
+//! MD between two vectors \\( x \in ℝ^n \\) and \\( y \in ℝ^n \\) is defined as
+//! \\[ d(x, y) = \sqrt{(x - y)^TS^{-1}(x - y)}\\]
+//!
+//! where \\( S \\) is the covariance matrix of the dataset.
+//!
+//! Example:
+//!
+//! ```
+//! use smartcore::linalg::basic::matrix::DenseMatrix;
+//! use smartcore::linalg::basic::arrays::ArrayView2;
+//! use smartcore::metrics::distance::Distance;
+//! use smartcore::metrics::distance::mahalanobis::Mahalanobis;
+//!
+//! let data = DenseMatrix::from_2d_array(&[
+//!                   &[64., 580., 29.],
+//!                   &[66., 570., 33.],
+//!                   &[68., 590., 37.],
+//!                   &[69., 660., 46.],
+//!                   &[73., 600., 55.],
+//! ]);
+//!
+//! let a = data.mean_by(0);
+//! let b = vec![66., 640., 44.];
+//!
+//! let mahalanobis = Mahalanobis::new(&data);
+//!
+//! mahalanobis.distance(&a, &b);
+//! ```
+//!
+//! ## References
+//! * ["Introduction to Multivariate Statistical Analysis in Chemometrics", Varmuza, K., Filzmoser, P., 2016, p.46](https://www.taylorfrancis.com/books/9780429145049)
+//! * ["Example of Calculating the Mahalanobis Distance", McCaffrey, J.D.](https://jamesmccaffrey.wordpress.com/2017/11/09/example-of-calculating-the-mahalanobis-distance/)
+//!
+//! <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
+//! <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+#![allow(non_snake_case)]
+
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+use std::marker::PhantomData;
+
+use super::Distance;
+use crate::linalg::basic::arrays::{Array, Array2, ArrayView1};
+use crate::linalg::basic::matrix::DenseMatrix;
+use crate::linalg::traits::lu::LUDecomposable;
+use crate::numbers::basenum::Number;
+
+/// Mahalanobis distance.
+///
+/// Stores a covariance matrix together with its inverse so that the inverse is computed once,
+/// at construction time, rather than on every `distance` call.
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
+pub struct Mahalanobis<T: Number, M: Array2<f64>> {
+    /// covariance matrix of the dataset
+    pub sigma: M,
+    /// inverse of the covariance matrix
+    pub sigmaInv: M,
+    // Zero-sized marker that ties the metric to the element type `T` of the input vectors.
+    _t: PhantomData<T>,
+}
+
+impl<T: Number, M: Array2<f64> + LUDecomposable<f64>> Mahalanobis<T, M> {
+    /// Builds a `Mahalanobis` metric from a dataset by computing its covariance matrix
+    /// * `data` - a matrix of _NxM_ where _N_ is number of observations and _M_ is number of attributes
+    ///
+    /// # Panics
+    /// Panics if the LU decomposition or the inversion of the covariance matrix returns an error.
+    pub fn new<X: Array2<T>>(data: &X) -> Mahalanobis<T, M> {
+        let n_attributes = data.shape().1;
+        let mut sigma = M::zeros(n_attributes, n_attributes);
+        data.cov(&mut sigma);
+        let sigmaInv = sigma.lu().unwrap().inverse().unwrap();
+        Self {
+            sigma,
+            sigmaInv,
+            _t: PhantomData,
+        }
+    }
+
+    /// Builds a `Mahalanobis` metric from an already computed covariance matrix
+    /// * `cov` - a covariance matrix; it is cloned and stored alongside its inverse
+    ///
+    /// Note that the matrix type of the result is `X`; the `M` parameter of this `impl` block
+    /// does not appear in the signature.
+    ///
+    /// # Panics
+    /// Panics if the LU decomposition or the inversion of `cov` returns an error.
+    pub fn new_from_covariance<X: Array2<f64> + LUDecomposable<f64>>(cov: &X) -> Mahalanobis<T, X> {
+        let sigma = cov.clone();
+        let sigmaInv = sigma.lu().unwrap().inverse().unwrap();
+        Mahalanobis {
+            sigma,
+            sigmaInv,
+            _t: PhantomData,
+        }
+    }
+}
+
+impl<T: Number, A: ArrayView1<T>> Distance<A> for Mahalanobis<T, DenseMatrix<f64>> {
+    /// Returns the Mahalanobis distance `sqrt((x - y)^T * sigmaInv * (x - y))`.
+    ///
+    /// # Panics
+    /// Panics if the length of `x` or `y` differs from the number of rows of `sigma`, or if an
+    /// element cannot be converted to `f64`.
+    fn distance(&self, x: &A, y: &A) -> f64 {
+        let (nrows, ncols) = self.sigma.shape();
+        if x.shape() != nrows {
+            panic!(
+                "Array x[{}] has different dimension with Sigma[{}][{}].",
+                x.shape(),
+                nrows,
+                ncols
+            );
+        }
+
+        if y.shape() != nrows {
+            panic!(
+                "Array y[{}] has different dimension with Sigma[{}][{}].",
+                y.shape(),
+                nrows,
+                ncols
+            );
+        }
+
+        // Convert to f64 *before* subtracting: doing the subtraction in `T` underflows for
+        // unsigned types when a < b.
+        let z: Vec<f64> = x
+            .iterator(0)
+            .zip(y.iterator(0))
+            .map(|(&a, &b)| a.to_f64().unwrap() - b.to_f64().unwrap())
+            .collect();
+
+        // Quadratic form z^T * sigmaInv * z, i.e. np.dot(np.dot((a-b),VI),(a-b).T)
+        let mut s = 0f64;
+        for (j, zj) in z.iter().enumerate() {
+            for (i, zi) in z.iter().enumerate() {
+                s += *self.sigmaInv.get((i, j)) * zi * zj;
+            }
+        }
+
+        s.sqrt()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::linalg::basic::arrays::ArrayView2;
+    use crate::linalg::basic::matrix::DenseMatrix;
+
+    // Builds the metric from a 5x3 dataset and measures the distance between
+    // `data.mean_by(0)` and a fixed 3-element point.
+    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+    #[test]
+    fn mahalanobis_distance() {
+        let data = DenseMatrix::from_2d_array(&[
+            &[64., 580., 29.],
+            &[66., 570., 33.],
+            &[68., 590., 37.],
+            &[69., 660., 46.],
+            &[73., 600., 55.],
+        ]);
+
+        let a = data.mean_by(0);
+        let b = vec![66., 640., 44.];
+
+        let mahalanobis = Mahalanobis::new(&data);
+
+        let md: f64 = mahalanobis.distance(&a, &b);
+
+        // Compared against the reference value with a tolerance of 1e-2.
+        assert!((md - 5.33).abs() < 1e-2);
+    }
+}
@@ -0,0 +1,79 @@
+//! # Manhattan Distance
+//!
+//! The Manhattan distance between two points \\(x \in ℝ^n \\) and \\( y \in ℝ^n \\) in n-dimensional space is the sum of the distances in each dimension.
+//!
+//! \\[ d(x, y) = \sum_{i=1}^n \lvert x_i - y_i \rvert \\]
+//!
+//! Example:
+//!
+//! ```
+//! use smartcore::metrics::distance::Distance;
+//! use smartcore::metrics::distance::manhattan::Manhattan;
+//!
+//! let x = vec![1., 1.];
+//! let y = vec![2., 2.];
+//!
+//! let l1: f64 = Manhattan::new().distance(&x, &y);
+//! ```
+//! <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
+//! <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+use std::marker::PhantomData;
+
+use crate::linalg::basic::arrays::ArrayView1;
+use crate::numbers::basenum::Number;
+
+use super::Distance;
+
+/// Manhattan distance
+///
+/// The struct holds no data; `T` only records the element type of the vectors being compared.
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
+pub struct Manhattan<T: Number> {
+    // Zero-sized marker that ties the metric to the element type `T`.
+    _t: PhantomData<T>,
+}
+
+impl<T: Number> Manhattan<T> {
+    /// Instantiates the metric. The struct carries no runtime state.
+    pub fn new() -> Manhattan<T> {
+        Self { _t: PhantomData }
+    }
+}
+
+impl<T: Number> Default for Manhattan<T> {
+    /// Produces the same stateless value as `Manhattan::new`.
+    fn default() -> Self {
+        Manhattan { _t: PhantomData }
+    }
+}
+
+impl<T: Number, A: ArrayView1<T>> Distance<A> for Manhattan<T> {
+    /// Returns the Manhattan (L1) distance: the sum of absolute element-wise differences.
+    ///
+    /// # Panics
+    /// Panics if `x` and `y` have different shapes, or if an element cannot be converted to `f64`.
+    fn distance(&self, x: &A, y: &A) -> f64 {
+        if x.shape() != y.shape() {
+            panic!("Input vector sizes are different");
+        }
+
+        // Convert to f64 *before* subtracting: doing the subtraction in `T` underflows for
+        // unsigned types when a < b.
+        x.iterator(0)
+            .zip(y.iterator(0))
+            .map(|(&a, &b)| (a.to_f64().unwrap() - b.to_f64().unwrap()).abs())
+            .sum()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Checks the L1 distance on two 3-element float vectors.
+    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+    #[test]
+    fn manhattan_distance() {
+        let a = vec![1., 2., 3.];
+        let b = vec![4., 5., 6.];
+
+        let l1: f64 = Manhattan::new().distance(&a, &b);
+
+        // Every coordinate differs by 3, so the expected value is 3 + 3 + 3 = 9.
+        assert!((l1 - 9.0).abs() < 1e-8);
+    }
+}
@@ -0,0 +1,97 @@
+//! # Minkowski Distance
+//!
+//! The Minkowski distance  of order _p_ (where _p_ is an integer) is a metric in a normed vector space which can be considered as a generalization of both the Euclidean distance and the Manhattan distance.
+//! The Minkowski distance between two points \\(x \in ℝ^n \\) and \\( y \in ℝ^n \\) in n-dimensional space is defined as:
+//!
+//! \\[ d(x, y) = \left(\sum_{i=1}^n \lvert x_i - y_i \rvert^p\right)^{1/p} \\]
+//!
+//! Example:
+//!
+//! ```
+//! use smartcore::metrics::distance::Distance;
+//! use smartcore::metrics::distance::minkowski::Minkowski;
+//!
+//! let x = vec![1., 1.];
+//! let y = vec![2., 2.];
+//!
+//! let l1: f64 = Minkowski::new(1).distance(&x, &y);
+//! let l2: f64 = Minkowski::new(2).distance(&x, &y);
+//!
+//! ```
+//! <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
+//! <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+use std::marker::PhantomData;
+
+use crate::linalg::basic::arrays::ArrayView1;
+use crate::numbers::basenum::Number;
+
+use super::Distance;
+
+/// Defines the Minkowski distance of order `p`
+///
+/// The order is not validated at construction; the `Distance` implementation panics when
+/// `p` is smaller than 1.
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
+pub struct Minkowski<T: Number> {
+    /// order, integer
+    pub p: u16,
+    // Zero-sized marker that ties the metric to the element type `T`.
+    _t: PhantomData<T>,
+}
+
+impl<T: Number> Minkowski<T> {
+    /// Instantiates the metric with order `p`. The value of `p` is stored as given.
+    pub fn new(p: u16) -> Minkowski<T> {
+        Self {
+            p,
+            _t: PhantomData,
+        }
+    }
+}
+
+impl<T: Number, A: ArrayView1<T>> Distance<A> for Minkowski<T> {
+    /// Returns the Minkowski distance of order `p`: the `p`-th root of the sum of absolute
+    /// element-wise differences raised to the power `p`.
+    ///
+    /// # Panics
+    /// Panics if `x` and `y` have different shapes, if `p` is smaller than 1, or if an element
+    /// cannot be converted to `f64`.
+    fn distance(&self, x: &A, y: &A) -> f64 {
+        if x.shape() != y.shape() {
+            panic!("Input vector sizes are different");
+        }
+        if self.p < 1 {
+            panic!("p must be at least 1");
+        }
+
+        let p_t = self.p as f64;
+
+        // Convert to f64 *before* subtracting: doing the subtraction in `T` underflows for
+        // unsigned types when a < b.
+        let dist: f64 = x
+            .iterator(0)
+            .zip(y.iterator(0))
+            .map(|(&a, &b)| (a.to_f64().unwrap() - b.to_f64().unwrap()).abs().powf(p_t))
+            .sum();
+
+        dist.powf(1f64 / p_t)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Checks orders 1, 2 and 3 on the same pair of 3-element vectors.
+    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+    #[test]
+    fn minkowski_distance() {
+        let a = vec![1., 2., 3.];
+        let b = vec![4., 5., 6.];
+
+        let l1: f64 = Minkowski::new(1).distance(&a, &b);
+        let l2: f64 = Minkowski::new(2).distance(&a, &b);
+        let l3: f64 = Minkowski::new(3).distance(&a, &b);
+
+        // Every coordinate differs by 3: 9, sqrt(27) and cbrt(81) respectively.
+        assert!((l1 - 9.0).abs() < 1e-8);
+        assert!((l2 - 5.19615242).abs() < 1e-8);
+        assert!((l3 - 4.32674871).abs() < 1e-8);
+    }
+
+    // `p` is a `u16`, so the invalid order exercised here is 0 rather than a negative value.
+    #[test]
+    #[should_panic(expected = "p must be at least 1")]
+    fn minkowski_distance_negative_p() {
+        let a = vec![1., 2., 3.];
+        let b = vec![4., 5., 6.];
+
+        let _: f64 = Minkowski::new(0).distance(&a, &b);
+    }
+}
@@ -0,0 +1,68 @@
+//! # Collection of Distance Functions
+//!
+//! Many algorithms in machine learning require a measure of distance between data points. Distance metric (or metric) is a function that defines a distance between a pair of point elements of a set.
+//! Formally, the distance can be any metric measure that is defined as \\( d(x, y) \geq 0\\) and follows three conditions:
+//! 1. \\( d(x, y) = 0 \\) if and only if \\( x = y \\), positive definiteness
+//! 1. \\( d(x, y) = d(y, x) \\), symmetry
+//! 1. \\( d(x, y) \leq d(x, z) + d(z, y) \\), subadditivity or triangle inequality
+//!
+//! for all \\(x, y, z \in Z \\)
+//!
+//! A good distance metric helps to improve the performance of classification, clustering and information retrieval algorithms significantly.
+//!
+//! <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
+//! <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+
+/// Euclidean Distance is the straight-line distance between two points in Euclidean space that represents the shortest distance between these points.
+pub mod euclidian;
+/// Hamming Distance between two strings is the number of positions at which the corresponding symbols are different.
+pub mod hamming;
+/// The Mahalanobis distance is the distance between two points in multivariate space.
+pub mod mahalanobis;
+/// Also known as rectilinear distance, city block distance, taxicab metric.
+pub mod manhattan;
+/// A generalization of both the Euclidean distance and the Manhattan distance.
+pub mod minkowski;
+
+use crate::linalg::basic::arrays::Array2;
+use crate::linalg::traits::lu::LUDecomposable;
+use crate::numbers::basenum::Number;
+
+/// Distance metric, a function that calculates distance between two points
+///
+/// `T` is the type of the points being compared. Implementors must also be `Clone`.
+pub trait Distance<T>: Clone {
+    /// Calculates distance between _a_ and _b_
+    /// * `a` - first point
+    /// * `b` - second point
+    ///
+    /// Returns the distance as an `f64`.
+    fn distance(&self, a: &T, b: &T) -> f64;
+}
+
+/// Multitude of distance metric functions
+///
+/// Field-less type used as a namespace for the associated constructor functions of the
+/// individual metrics.
+pub struct Distances {}
+
+impl Distances {
+    /// Creates a Euclidian distance metric, see [`Euclidian`](euclidian/index.html)
+    pub fn euclidian<T: Number>() -> euclidian::Euclidian<T> {
+        euclidian::Euclidian::<T>::new()
+    }
+
+    /// Creates a Minkowski distance metric, see [`Minkowski`](minkowski/index.html)
+    /// * `p` - function order. Should be >= 1
+    pub fn minkowski<T: Number>(p: u16) -> minkowski::Minkowski<T> {
+        minkowski::Minkowski::<T>::new(p)
+    }
+
+    /// Creates a Manhattan distance metric, see [`Manhattan`](manhattan/index.html)
+    pub fn manhattan<T: Number>() -> manhattan::Manhattan<T> {
+        manhattan::Manhattan::<T>::new()
+    }
+
+    /// Creates a Hamming distance metric, see [`Hamming`](hamming/index.html)
+    pub fn hamming<T: Number>() -> hamming::Hamming<T> {
+        hamming::Hamming::<T>::new()
+    }
+
+    /// Creates a Mahalanobis distance metric from a dataset, see [`Mahalanobis`](mahalanobis/index.html)
+    /// * `data` - dataset passed on to `Mahalanobis::new`
+    pub fn mahalanobis<T: Number, M: Array2<T>, C: Array2<f64> + LUDecomposable<f64>>(
+        data: &M,
+    ) -> mahalanobis::Mahalanobis<T, C> {
+        mahalanobis::Mahalanobis::<T, C>::new(data)
+    }
+}