Merge potential next release v0.4 (#187) Breaking Changes

* First draft of the new n-dimensional arrays + NB use case
* Improves default implementation of multiple Array methods
* Refactors tree methods
* Adds matrix decomposition routines
* Adds matrix decomposition methods to ndarray and nalgebra bindings
* Refactoring + linear regression now uses array2
* Ridge & Linear regression
* LBFGS optimizer & logistic regression
* LBFGS optimizer & logistic regression
* Changes linear methods, metrics and model selection methods to new n-dimensional arrays
* Switches KNN and clustering algorithms to new n-d array layer
* Refactors distance metrics
* Optimizes knn and clustering methods
* Refactors metrics module
* Switches decomposition methods to n-dimensional arrays
* Linalg refactoring - cleanup rng merge (#172)
* Remove legacy DenseMatrix and BaseMatrix implementation. Port the new Number, FloatNumber and Array implementation into module structure.
* Exclude AUC metrics. Needs reimplementation
* Improve developers walkthrough

New traits system in place at `src/numbers` and `src/linalg`
Co-authored-by: Lorenzo <tunedconsulting@gmail.com>

* Provide SupervisedEstimator with a constructor to avoid explicit dynamical box allocation in 'cross_validate' and 'cross_validate_predict' as required by the use of 'dyn' as per Rust 2021
* Implement getters to use as_ref() in src/neighbors
* Implement getters to use as_ref() in src/naive_bayes
* Implement getters to use as_ref() in src/linear
* Add Clone to src/naive_bayes
* Change signature for cross_validate and other model_selection functions to abide to use of dyn in Rust 2021
* Implement ndarray-bindings. Remove FloatNumber from implementations
* Drop nalgebra-bindings support (as decided in conf-call to go for ndarray)
* Remove benches. Benches will have their own repo at smartcore-benches
* Implement SVC
* Implement SVC serialization. Move search parameters in dedicated module
* Implement SVR. Definitely too slow
* Fix compilation issues for wasm (#202)

Co-authored-by: Luis Moreno <morenol@users.noreply.github.com>
* Fix tests (#203)

* Port linalg/traits/stats.rs
* Improve methods naming
* Improve Display for DenseMatrix

Co-authored-by: Montana Low <montanalow@users.noreply.github.com>
Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
This commit is contained in:
Lorenzo
2022-10-31 10:44:57 +00:00
committed by GitHub
parent bb71656137
commit 52eb6ce023
110 changed files with 10327 additions and 9107 deletions
+109 -79
View File
@@ -10,7 +10,7 @@
//!
//! Example:
//! ```
//! use smartcore::linalg::naive::dense_matrix::*;
//! use smartcore::linalg::basic::matrix::DenseMatrix;
//! use smartcore::decomposition::pca::*;
//!
//! // Iris data
@@ -52,24 +52,33 @@ use serde::{Deserialize, Serialize};
use crate::api::{Transformer, UnsupervisedEstimator};
use crate::error::Failed;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use crate::linalg::basic::arrays::Array2;
use crate::linalg::traits::evd::EVDDecomposable;
use crate::linalg::traits::svd::SVDDecomposable;
use crate::numbers::basenum::Number;
use crate::numbers::realnum::RealNumber;
/// Principal components analysis algorithm
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug)]
pub struct PCA<T: RealNumber, M: Matrix<T>> {
eigenvectors: M,
pub struct PCA<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>> {
eigenvectors: X,
eigenvalues: Vec<T>,
projection: M,
projection: X,
mu: Vec<T>,
pmu: Vec<T>,
}
impl<T: RealNumber, M: Matrix<T>> PartialEq for PCA<T, M> {
impl<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>> PartialEq
for PCA<T, X>
{
fn eq(&self, other: &Self) -> bool {
if self.eigenvectors != other.eigenvectors
|| self.eigenvalues.len() != other.eigenvalues.len()
if self.eigenvalues.len() != other.eigenvalues.len()
|| self
.eigenvectors
.iterator(0)
.zip(other.eigenvectors.iterator(0))
.any(|(&a, &b)| (a - b).abs() > T::epsilon())
{
false
} else {
@@ -196,24 +205,28 @@ impl Default for PCASearchParameters {
}
}
impl<T: RealNumber, M: Matrix<T>> UnsupervisedEstimator<M, PCAParameters> for PCA<T, M> {
fn fit(x: &M, parameters: PCAParameters) -> Result<Self, Failed> {
impl<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>>
UnsupervisedEstimator<X, PCAParameters> for PCA<T, X>
{
fn fit(x: &X, parameters: PCAParameters) -> Result<Self, Failed> {
PCA::fit(x, parameters)
}
}
impl<T: RealNumber, M: Matrix<T>> Transformer<M> for PCA<T, M> {
fn transform(&self, x: &M) -> Result<M, Failed> {
impl<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>> Transformer<X>
for PCA<T, X>
{
fn transform(&self, x: &X) -> Result<X, Failed> {
self.transform(x)
}
}
impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
impl<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>> PCA<T, X> {
/// Fits PCA to your data.
/// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
/// * `n_components` - number of components to keep.
/// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
pub fn fit(data: &M, parameters: PCAParameters) -> Result<PCA<T, M>, Failed> {
pub fn fit(data: &X, parameters: PCAParameters) -> Result<PCA<T, X>, Failed> {
let (m, n) = data.shape();
if parameters.n_components > n {
@@ -223,13 +236,17 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
)));
}
let mu = data.column_mean();
let mu: Vec<T> = data
.mean_by(0)
.iter()
.map(|&v| T::from_f64(v).unwrap())
.collect();
let mut x = data.clone();
for (c, mu_c) in mu.iter().enumerate().take(n) {
for (c, &mu_c) in mu.iter().enumerate().take(n) {
for r in 0..m {
x.sub_element_mut(r, c, *mu_c);
x.sub_element_mut((r, c), mu_c);
}
}
@@ -245,33 +262,33 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
eigenvectors = svd.V;
} else {
let mut cov = M::zeros(n, n);
let mut cov = X::zeros(n, n);
for k in 0..m {
for i in 0..n {
for j in 0..=i {
cov.add_element_mut(i, j, x.get(k, i) * x.get(k, j));
cov.add_element_mut((i, j), *x.get((k, i)) * *x.get((k, j)));
}
}
}
for i in 0..n {
for j in 0..=i {
cov.div_element_mut(i, j, T::from(m).unwrap());
cov.set(j, i, cov.get(i, j));
cov.div_element_mut((i, j), T::from(m).unwrap());
cov.set((j, i), *cov.get((i, j)));
}
}
if parameters.use_correlation_matrix {
let mut sd = vec![T::zero(); n];
for (i, sd_i) in sd.iter_mut().enumerate().take(n) {
*sd_i = cov.get(i, i).sqrt();
*sd_i = cov.get((i, i)).sqrt();
}
for i in 0..n {
for j in 0..=i {
cov.div_element_mut(i, j, sd[i] * sd[j]);
cov.set(j, i, cov.get(i, j));
cov.div_element_mut((i, j), sd[i] * sd[j]);
cov.set((j, i), *cov.get((i, j)));
}
}
@@ -283,7 +300,7 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
for (i, sd_i) in sd.iter().enumerate().take(n) {
for j in 0..n {
eigenvectors.div_element_mut(i, j, *sd_i);
eigenvectors.div_element_mut((i, j), *sd_i);
}
}
} else {
@@ -295,17 +312,17 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
}
}
let mut projection = M::zeros(parameters.n_components, n);
let mut projection = X::zeros(parameters.n_components, n);
for i in 0..n {
for j in 0..parameters.n_components {
projection.set(j, i, eigenvectors.get(i, j));
projection.set((j, i), *eigenvectors.get((i, j)));
}
}
let mut pmu = vec![T::zero(); parameters.n_components];
for (k, mu_k) in mu.iter().enumerate().take(n) {
for (i, pmu_i) in pmu.iter_mut().enumerate().take(parameters.n_components) {
*pmu_i += projection.get(i, k) * (*mu_k);
*pmu_i += *projection.get((i, k)) * (*mu_k);
}
}
@@ -320,7 +337,7 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
/// Run dimensionality reduction for `x`
/// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features.
pub fn transform(&self, x: &M) -> Result<M, Failed> {
pub fn transform(&self, x: &X) -> Result<X, Failed> {
let (nrows, ncols) = x.shape();
let (_, n_components) = self.projection.shape();
if ncols != self.mu.len() {
@@ -334,14 +351,14 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
let mut x_transformed = x.matmul(&self.projection);
for r in 0..nrows {
for c in 0..n_components {
x_transformed.sub_element_mut(r, c, self.pmu[c]);
x_transformed.sub_element_mut((r, c), self.pmu[c]);
}
}
Ok(x_transformed)
}
/// Get a projection matrix
pub fn components(&self) -> &M {
pub fn components(&self) -> &X {
&self.projection
}
}
@@ -349,7 +366,8 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
#[cfg(test)]
mod tests {
use super::*;
use crate::linalg::naive::dense_matrix::*;
use crate::linalg::basic::matrix::DenseMatrix;
use approx::relative_eq;
#[test]
fn search_parameters() {
@@ -442,7 +460,11 @@ mod tests {
let pca = PCA::fit(&us_arrests, Default::default()).unwrap();
assert!(expected.approximate_eq(&pca.components().abs(), 0.4));
assert!(relative_eq!(
expected,
pca.components().abs(),
epsilon = 1e-3
));
}
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
#[test]
@@ -538,10 +560,11 @@ mod tests {
let pca = PCA::fit(&us_arrests, PCAParameters::default().with_n_components(4)).unwrap();
assert!(pca
.eigenvectors
.abs()
.approximate_eq(&expected_eigenvectors.abs(), 1e-4));
assert!(relative_eq!(
pca.eigenvectors.abs(),
&expected_eigenvectors.abs(),
epsilon = 1e-4
));
for i in 0..pca.eigenvalues.len() {
assert!((pca.eigenvalues[i].abs() - expected_eigenvalues[i].abs()).abs() < 1e-8);
@@ -549,9 +572,11 @@ mod tests {
let us_arrests_t = pca.transform(&us_arrests).unwrap();
assert!(us_arrests_t
.abs()
.approximate_eq(&expected_projection.abs(), 1e-4));
assert!(relative_eq!(
us_arrests_t.abs(),
&expected_projection.abs(),
epsilon = 1e-4
));
}
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
@@ -654,10 +679,11 @@ mod tests {
)
.unwrap();
assert!(pca
.eigenvectors
.abs()
.approximate_eq(&expected_eigenvectors.abs(), 1e-4));
assert!(relative_eq!(
pca.eigenvectors.abs(),
&expected_eigenvectors.abs(),
epsilon = 1e-4
));
for i in 0..pca.eigenvalues.len() {
assert!((pca.eigenvalues[i].abs() - expected_eigenvalues[i].abs()).abs() < 1e-8);
@@ -665,43 +691,47 @@ mod tests {
let us_arrests_t = pca.transform(&us_arrests).unwrap();
assert!(us_arrests_t
.abs()
.approximate_eq(&expected_projection.abs(), 1e-4));
assert!(relative_eq!(
us_arrests_t.abs(),
&expected_projection.abs(),
epsilon = 1e-4
));
}
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
#[test]
#[cfg(feature = "serde")]
fn serde() {
let iris = DenseMatrix::from_2d_array(&[
&[5.1, 3.5, 1.4, 0.2],
&[4.9, 3.0, 1.4, 0.2],
&[4.7, 3.2, 1.3, 0.2],
&[4.6, 3.1, 1.5, 0.2],
&[5.0, 3.6, 1.4, 0.2],
&[5.4, 3.9, 1.7, 0.4],
&[4.6, 3.4, 1.4, 0.3],
&[5.0, 3.4, 1.5, 0.2],
&[4.4, 2.9, 1.4, 0.2],
&[4.9, 3.1, 1.5, 0.1],
&[7.0, 3.2, 4.7, 1.4],
&[6.4, 3.2, 4.5, 1.5],
&[6.9, 3.1, 4.9, 1.5],
&[5.5, 2.3, 4.0, 1.3],
&[6.5, 2.8, 4.6, 1.5],
&[5.7, 2.8, 4.5, 1.3],
&[6.3, 3.3, 4.7, 1.6],
&[4.9, 2.4, 3.3, 1.0],
&[6.6, 2.9, 4.6, 1.3],
&[5.2, 2.7, 3.9, 1.4],
]);
// Disable this test for now
// TODO: implement deserialization for new DenseMatrix
// #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
// #[test]
// #[cfg(feature = "serde")]
// fn pca_serde() {
// let iris = DenseMatrix::from_2d_array(&[
// &[5.1, 3.5, 1.4, 0.2],
// &[4.9, 3.0, 1.4, 0.2],
// &[4.7, 3.2, 1.3, 0.2],
// &[4.6, 3.1, 1.5, 0.2],
// &[5.0, 3.6, 1.4, 0.2],
// &[5.4, 3.9, 1.7, 0.4],
// &[4.6, 3.4, 1.4, 0.3],
// &[5.0, 3.4, 1.5, 0.2],
// &[4.4, 2.9, 1.4, 0.2],
// &[4.9, 3.1, 1.5, 0.1],
// &[7.0, 3.2, 4.7, 1.4],
// &[6.4, 3.2, 4.5, 1.5],
// &[6.9, 3.1, 4.9, 1.5],
// &[5.5, 2.3, 4.0, 1.3],
// &[6.5, 2.8, 4.6, 1.5],
// &[5.7, 2.8, 4.5, 1.3],
// &[6.3, 3.3, 4.7, 1.6],
// &[4.9, 2.4, 3.3, 1.0],
// &[6.6, 2.9, 4.6, 1.3],
// &[5.2, 2.7, 3.9, 1.4],
// ]);
let pca = PCA::fit(&iris, Default::default()).unwrap();
// let pca = PCA::fit(&iris, Default::default()).unwrap();
let deserialized_pca: PCA<f64, DenseMatrix<f64>> =
serde_json::from_str(&serde_json::to_string(&pca).unwrap()).unwrap();
// let deserialized_pca: PCA<f64, DenseMatrix<f64>> =
// serde_json::from_str(&serde_json::to_string(&pca).unwrap()).unwrap();
assert_eq!(pca, deserialized_pca);
}
// assert_eq!(pca, deserialized_pca);
// }
}