Merge potential next release v0.4 (#187) Breaking Changes

* First draft of the new n-dimensional arrays + NB use case
* Improves default implementation of multiple Array methods
* Refactors tree methods
* Adds matrix decomposition routines
* Adds matrix decomposition methods to ndarray and nalgebra bindings
* Refactoring + linear regression now uses array2
* Ridge & Linear regression
* LBFGS optimizer & logistic regression
* LBFGS optimizer & logistic regression
* Changes linear methods, metrics and model selection methods to new n-dimensional arrays
* Switches KNN and clustering algorithms to new n-d array layer
* Refactors distance metrics
* Optimizes knn and clustering methods
* Refactors metrics module
* Switches decomposition methods to n-dimensional arrays
* Linalg refactoring - cleanup rng merge (#172)
* Remove legacy DenseMatrix and BaseMatrix implementation. Port the new Number, FloatNumber and Array implementation into module structure.
* Exclude AUC metrics. Needs reimplementation
* Improve developers walkthrough

New traits system in place at `src/numbers` and `src/linalg`
Co-authored-by: Lorenzo <tunedconsulting@gmail.com>

* Provide SupervisedEstimator with a constructor to avoid explicit dynamical box allocation in 'cross_validate' and 'cross_validate_predict' as required by the use of 'dyn' as per Rust 2021
* Implement getters to use as_ref() in src/neighbors
* Implement getters to use as_ref() in src/naive_bayes
* Implement getters to use as_ref() in src/linear
* Add Clone to src/naive_bayes
* Change signature for cross_validate and other model_selection functions to abide to use of dyn in Rust 2021
* Implement ndarray-bindings. Remove FloatNumber from implementations
* Drop nalgebra-bindings support (as decided in conf-call to go for ndarray)
* Remove benches. Benches will have their own repo at smartcore-benches
* Implement SVC
* Implement SVC serialization. Move search parameters in dedicated module
* Implement SVR. Definitely too slow
* Fix compilation issues for wasm (#202)

Co-authored-by: Luis Moreno <morenol@users.noreply.github.com>
* Fix tests (#203)

* Port linalg/traits/stats.rs
* Improve methods naming
* Improve Display for DenseMatrix

Co-authored-by: Montana Low <montanalow@users.noreply.github.com>
Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
This commit is contained in:
Lorenzo
2022-10-31 10:44:57 +00:00
committed by GitHub
parent bb71656137
commit 52eb6ce023
110 changed files with 10327 additions and 9107 deletions
+100 -58
View File
@@ -19,18 +19,19 @@
//! Example:
//!
//! ```
//! use smartcore::linalg::naive::dense_matrix::*;
//! use smartcore::linalg::basic::matrix::DenseMatrix;
//! use smartcore::linalg::basic::arrays::Array2;
//! use smartcore::cluster::dbscan::*;
//! use smartcore::math::distance::Distances;
//! use smartcore::metrics::distance::Distances;
//! use smartcore::neighbors::KNNAlgorithmName;
//! use smartcore::dataset::generator;
//!
//! // Generate three blobs
//! let blobs = generator::make_blobs(100, 2, 3);
//! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data);
//! let x: DenseMatrix<f32> = DenseMatrix::from_iterator(blobs.data.into_iter(), 100, 2, 0);
//! // Fit the algorithm and predict cluster labels
//! let labels = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0)).
//! and_then(|dbscan| dbscan.predict(&x));
//! let labels: Vec<u32> = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0)).
//! and_then(|dbscan| dbscan.predict(&x)).unwrap();
//!
//! println!("{:?}", labels);
//! ```
@@ -41,7 +42,7 @@
//! * ["Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and its Applications", Sander J., Ester M., Kriegel HP., Xu X.](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.1629&rep=rep1&type=pdf)
use std::fmt::Debug;
use std::iter::Sum;
use std::marker::PhantomData;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
@@ -49,26 +50,29 @@ use serde::{Deserialize, Serialize};
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
use crate::api::{Predictor, UnsupervisedEstimator};
use crate::error::Failed;
use crate::linalg::{row_iter, Matrix};
use crate::math::distance::euclidian::Euclidian;
use crate::math::distance::{Distance, Distances};
use crate::math::num::RealNumber;
use crate::linalg::basic::arrays::{Array1, Array2};
use crate::metrics::distance::euclidian::Euclidian;
use crate::metrics::distance::{Distance, Distances};
use crate::numbers::basenum::Number;
use crate::tree::decision_tree_classifier::which_max;
/// DBSCAN clustering algorithm
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug)]
pub struct DBSCAN<T: RealNumber, D: Distance<Vec<T>, T>> {
pub struct DBSCAN<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> {
cluster_labels: Vec<i16>,
num_classes: usize,
knn_algorithm: KNNAlgorithm<T, D>,
eps: T,
knn_algorithm: KNNAlgorithm<TX, D>,
eps: f64,
_phantom_ty: PhantomData<TY>,
_phantom_x: PhantomData<X>,
_phantom_y: PhantomData<Y>,
}
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
/// DBSCAN clustering algorithm parameters
pub struct DBSCANParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
pub struct DBSCANParameters<T: Number, D: Distance<Vec<T>>> {
#[cfg_attr(feature = "serde", serde(default))]
/// a function that defines a distance between each pair of point in training data.
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
@@ -79,22 +83,25 @@ pub struct DBSCANParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
pub min_samples: usize,
#[cfg_attr(feature = "serde", serde(default))]
/// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
pub eps: T,
pub eps: f64,
#[cfg_attr(feature = "serde", serde(default))]
/// KNN algorithm to use.
pub algorithm: KNNAlgorithmName,
#[cfg_attr(feature = "serde", serde(default))]
_phantom_t: PhantomData<T>,
}
impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
impl<T: Number, D: Distance<Vec<T>>> DBSCANParameters<T, D> {
/// a function that defines a distance between each pair of point in training data.
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
pub fn with_distance<DD: Distance<Vec<T>, T>>(self, distance: DD) -> DBSCANParameters<T, DD> {
pub fn with_distance<DD: Distance<Vec<T>>>(self, distance: DD) -> DBSCANParameters<T, DD> {
DBSCANParameters {
distance,
min_samples: self.min_samples,
eps: self.eps,
algorithm: self.algorithm,
_phantom_t: PhantomData,
}
}
/// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
@@ -103,7 +110,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
self
}
/// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
pub fn with_eps(mut self, eps: T) -> Self {
pub fn with_eps(mut self, eps: f64) -> Self {
self.eps = eps;
self
}
@@ -117,7 +124,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
/// DBSCAN grid search parameters
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct DBSCANSearchParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
pub struct DBSCANSearchParameters<T: Number, D: Distance<Vec<T>>> {
#[cfg_attr(feature = "serde", serde(default))]
/// a function that defines a distance between each pair of point in training data.
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
@@ -128,14 +135,15 @@ pub struct DBSCANSearchParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
pub min_samples: Vec<usize>,
#[cfg_attr(feature = "serde", serde(default))]
/// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
pub eps: Vec<T>,
pub eps: Vec<f64>,
#[cfg_attr(feature = "serde", serde(default))]
/// KNN algorithm to use.
pub algorithm: Vec<KNNAlgorithmName>,
_phantom_t: PhantomData<T>,
}
/// DBSCAN grid search iterator
pub struct DBSCANSearchParametersIterator<T: RealNumber, D: Distance<Vec<T>, T>> {
pub struct DBSCANSearchParametersIterator<T: Number, D: Distance<Vec<T>>> {
dbscan_search_parameters: DBSCANSearchParameters<T, D>,
current_distance: usize,
current_min_samples: usize,
@@ -143,7 +151,7 @@ pub struct DBSCANSearchParametersIterator<T: RealNumber, D: Distance<Vec<T>, T>>
current_algorithm: usize,
}
impl<T: RealNumber, D: Distance<Vec<T>, T>> IntoIterator for DBSCANSearchParameters<T, D> {
impl<T: Number, D: Distance<Vec<T>>> IntoIterator for DBSCANSearchParameters<T, D> {
type Item = DBSCANParameters<T, D>;
type IntoIter = DBSCANSearchParametersIterator<T, D>;
@@ -158,7 +166,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> IntoIterator for DBSCANSearchParamet
}
}
impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersIterator<T, D> {
impl<T: Number, D: Distance<Vec<T>>> Iterator for DBSCANSearchParametersIterator<T, D> {
type Item = DBSCANParameters<T, D>;
fn next(&mut self) -> Option<Self::Item> {
@@ -175,6 +183,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersI
min_samples: self.dbscan_search_parameters.min_samples[self.current_min_samples],
eps: self.dbscan_search_parameters.eps[self.current_eps],
algorithm: self.dbscan_search_parameters.algorithm[self.current_algorithm].clone(),
_phantom_t: PhantomData,
};
if self.current_distance + 1 < self.dbscan_search_parameters.distance.len() {
@@ -202,7 +211,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersI
}
}
impl<T: RealNumber> Default for DBSCANSearchParameters<T, Euclidian> {
impl<T: Number> Default for DBSCANSearchParameters<T, Euclidian<T>> {
fn default() -> Self {
let default_params = DBSCANParameters::default();
@@ -211,11 +220,14 @@ impl<T: RealNumber> Default for DBSCANSearchParameters<T, Euclidian> {
min_samples: vec![default_params.min_samples],
eps: vec![default_params.eps],
algorithm: vec![default_params.algorithm],
_phantom_t: PhantomData,
}
}
}
impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> PartialEq
for DBSCAN<TX, TY, X, Y, D>
{
fn eq(&self, other: &Self) -> bool {
self.cluster_labels.len() == other.cluster_labels.len()
&& self.num_classes == other.num_classes
@@ -224,47 +236,50 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
}
}
impl<T: RealNumber> Default for DBSCANParameters<T, Euclidian> {
impl<T: Number> Default for DBSCANParameters<T, Euclidian<T>> {
fn default() -> Self {
DBSCANParameters {
distance: Distances::euclidian(),
min_samples: 5,
eps: T::half(),
eps: 0.5f64,
algorithm: KNNAlgorithmName::default(),
_phantom_t: PhantomData,
}
}
}
impl<T: RealNumber + Sum, M: Matrix<T>, D: Distance<Vec<T>, T>>
UnsupervisedEstimator<M, DBSCANParameters<T, D>> for DBSCAN<T, D>
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
UnsupervisedEstimator<X, DBSCANParameters<TX, D>> for DBSCAN<TX, TY, X, Y, D>
{
fn fit(x: &M, parameters: DBSCANParameters<T, D>) -> Result<Self, Failed> {
fn fit(x: &X, parameters: DBSCANParameters<TX, D>) -> Result<Self, Failed> {
DBSCAN::fit(x, parameters)
}
}
impl<T: RealNumber, M: Matrix<T>, D: Distance<Vec<T>, T>> Predictor<M, M::RowVector>
for DBSCAN<T, D>
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> Predictor<X, Y>
for DBSCAN<TX, TY, X, Y, D>
{
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
fn predict(&self, x: &X) -> Result<Y, Failed> {
self.predict(x)
}
}
impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
DBSCAN<TX, TY, X, Y, D>
{
/// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features.
/// * `data` - training instances to cluster
/// * `k` - number of clusters
/// * `parameters` - cluster parameters
pub fn fit<M: Matrix<T>>(
x: &M,
parameters: DBSCANParameters<T, D>,
) -> Result<DBSCAN<T, D>, Failed> {
pub fn fit(
x: &X,
parameters: DBSCANParameters<TX, D>,
) -> Result<DBSCAN<TX, TY, X, Y, D>, Failed> {
if parameters.min_samples < 1 {
return Err(Failed::fit("Invalid minPts"));
}
if parameters.eps <= T::zero() {
if parameters.eps <= 0f64 {
return Err(Failed::fit("Invalid radius: "));
}
@@ -276,13 +291,19 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
let n = x.shape().0;
let mut y = vec![undefined; n];
let algo = parameters
.algorithm
.fit(row_iter(x).collect(), parameters.distance)?;
let algo = parameters.algorithm.fit(
x.row_iter()
.map(|row| row.iterator(0).cloned().collect())
.collect(),
parameters.distance,
)?;
for (i, e) in row_iter(x).enumerate() {
let mut row = vec![TX::zero(); x.shape().1];
for (i, e) in x.row_iter().enumerate() {
if y[i] == undefined {
let mut neighbors = algo.find_radius(&e, parameters.eps)?;
e.iterator(0).zip(row.iter_mut()).for_each(|(&x, r)| *r = x);
let mut neighbors = algo.find_radius(&row, parameters.eps)?;
if neighbors.len() < parameters.min_samples {
y[i] = outlier;
} else {
@@ -333,18 +354,25 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
num_classes: k as usize,
knn_algorithm: algo,
eps: parameters.eps,
_phantom_ty: PhantomData,
_phantom_x: PhantomData,
_phantom_y: PhantomData,
})
}
/// Predict clusters for `x`
/// * `x` - matrix with new data to transform of size _KxM_ , where _K_ is number of new samples and _M_ is number of features.
pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> {
let (n, m) = x.shape();
let mut result = M::zeros(1, n);
let mut row = vec![T::zero(); m];
pub fn predict(&self, x: &X) -> Result<Y, Failed> {
let (n, _) = x.shape();
let mut result = Y::zeros(n);
let mut row = vec![TX::zero(); x.shape().1];
for i in 0..n {
x.copy_row_as_vec(i, &mut row);
x.get_row(i)
.iterator(0)
.zip(row.iter_mut())
.for_each(|(&x, r)| *r = x);
let neighbors = self.knn_algorithm.find_radius(&row, self.eps)?;
let mut label = vec![0usize; self.num_classes + 1];
for neighbor in neighbors {
@@ -357,26 +385,26 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
}
let class = which_max(&label);
if class != self.num_classes {
result.set(0, i, T::from(class).unwrap());
result.set(i, TY::from(class + 1).unwrap());
} else {
result.set(0, i, -T::one());
result.set(i, TY::zero());
}
}
Ok(result.to_row_vector())
Ok(result)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::linalg::naive::dense_matrix::DenseMatrix;
use crate::linalg::basic::matrix::DenseMatrix;
#[cfg(feature = "serde")]
use crate::math::distance::euclidian::Euclidian;
use crate::metrics::distance::euclidian::Euclidian;
#[test]
fn search_parameters() {
let parameters = DBSCANSearchParameters {
let parameters: DBSCANSearchParameters<f64, Euclidian<f64>> = DBSCANSearchParameters {
min_samples: vec![10, 100],
eps: vec![1., 2.],
..Default::default()
@@ -414,7 +442,7 @@ mod tests {
&[3.0, 5.0],
]);
let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0];
let expected_labels = vec![1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0];
let dbscan = DBSCAN::fit(
&x,
@@ -424,7 +452,7 @@ mod tests {
)
.unwrap();
let predicted_labels = dbscan.predict(&x).unwrap();
let predicted_labels: Vec<i32> = dbscan.predict(&x).unwrap();
assert_eq!(expected_labels, predicted_labels);
}
@@ -458,9 +486,23 @@ mod tests {
let dbscan = DBSCAN::fit(&x, Default::default()).unwrap();
let deserialized_dbscan: DBSCAN<f64, Euclidian> =
let deserialized_dbscan: DBSCAN<f32, f32, DenseMatrix<f32>, Vec<f32>, Euclidian<f32>> =
serde_json::from_str(&serde_json::to_string(&dbscan).unwrap()).unwrap();
assert_eq!(dbscan, deserialized_dbscan);
}
use crate::dataset::generator;
#[test]
fn from_vec() {
// Generate three blobs
let blobs = generator::make_blobs(100, 2, 3);
let x: DenseMatrix<f32> = DenseMatrix::from_iterator(blobs.data.into_iter(), 100, 2, 0);
// Fit the algorithm and predict cluster labels
let labels: Vec<i32> = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0))
.and_then(|dbscan| dbscan.predict(&x))
.unwrap();
println!("{:?}", labels);
}
}