Merge potential next release v0.4 (#187) Breaking Changes
* First draft of the new n-dimensional arrays + NB use case * Improves default implementation of multiple Array methods * Refactors tree methods * Adds matrix decomposition routines * Adds matrix decomposition methods to ndarray and nalgebra bindings * Refactoring + linear regression now uses array2 * Ridge & Linear regression * LBFGS optimizer & logistic regression * LBFGS optimizer & logistic regression * Changes linear methods, metrics and model selection methods to new n-dimensional arrays * Switches KNN and clustering algorithms to new n-d array layer * Refactors distance metrics * Optimizes knn and clustering methods * Refactors metrics module * Switches decomposition methods to n-dimensional arrays * Linalg refactoring - cleanup rng merge (#172) * Remove legacy DenseMatrix and BaseMatrix implementation. Port the new Number, FloatNumber and Array implementation into module structure. * Exclude AUC metrics. Needs reimplementation * Improve developers walkthrough New traits system in place at `src/numbers` and `src/linalg` Co-authored-by: Lorenzo <tunedconsulting@gmail.com> * Provide SupervisedEstimator with a constructor to avoid explicit dynamical box allocation in 'cross_validate' and 'cross_validate_predict' as required by the use of 'dyn' as per Rust 2021 * Implement getters to use as_ref() in src/neighbors * Implement getters to use as_ref() in src/naive_bayes * Implement getters to use as_ref() in src/linear * Add Clone to src/naive_bayes * Change signature for cross_validate and other model_selection functions to abide to use of dyn in Rust 2021 * Implement ndarray-bindings. Remove FloatNumber from implementations * Drop nalgebra-bindings support (as decided in conf-call to go for ndarray) * Remove benches. Benches will have their own repo at smartcore-benches * Implement SVC * Implement SVC serialization. Move search parameters in dedicated module * Implement SVR. Definitely too slow * Fix compilation issues for wasm (#202) Co-authored-by: Luis Moreno <morenol@users.noreply.github.com> * Fix tests (#203) * Port linalg/traits/stats.rs * Improve methods naming * Improve Display for DenseMatrix Co-authored-by: Montana Low <montanalow@users.noreply.github.com> Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
This commit is contained in:
+100
-58
@@ -19,18 +19,19 @@
|
||||
//! Example:
|
||||
//!
|
||||
//! ```
|
||||
//! use smartcore::linalg::naive::dense_matrix::*;
|
||||
//! use smartcore::linalg::basic::matrix::DenseMatrix;
|
||||
//! use smartcore::linalg::basic::arrays::Array2;
|
||||
//! use smartcore::cluster::dbscan::*;
|
||||
//! use smartcore::math::distance::Distances;
|
||||
//! use smartcore::metrics::distance::Distances;
|
||||
//! use smartcore::neighbors::KNNAlgorithmName;
|
||||
//! use smartcore::dataset::generator;
|
||||
//!
|
||||
//! // Generate three blobs
|
||||
//! let blobs = generator::make_blobs(100, 2, 3);
|
||||
//! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data);
|
||||
//! let x: DenseMatrix<f32> = DenseMatrix::from_iterator(blobs.data.into_iter(), 100, 2, 0);
|
||||
//! // Fit the algorithm and predict cluster labels
|
||||
//! let labels = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0)).
|
||||
//! and_then(|dbscan| dbscan.predict(&x));
|
||||
//! let labels: Vec<u32> = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0)).
|
||||
//! and_then(|dbscan| dbscan.predict(&x)).unwrap();
|
||||
//!
|
||||
//! println!("{:?}", labels);
|
||||
//! ```
|
||||
@@ -41,7 +42,7 @@
|
||||
//! * ["Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and its Applications", Sander J., Ester M., Kriegel HP., Xu X.](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.1629&rep=rep1&type=pdf)
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::iter::Sum;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
#[cfg(feature = "serde")]
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -49,26 +50,29 @@ use serde::{Deserialize, Serialize};
|
||||
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
|
||||
use crate::api::{Predictor, UnsupervisedEstimator};
|
||||
use crate::error::Failed;
|
||||
use crate::linalg::{row_iter, Matrix};
|
||||
use crate::math::distance::euclidian::Euclidian;
|
||||
use crate::math::distance::{Distance, Distances};
|
||||
use crate::math::num::RealNumber;
|
||||
use crate::linalg::basic::arrays::{Array1, Array2};
|
||||
use crate::metrics::distance::euclidian::Euclidian;
|
||||
use crate::metrics::distance::{Distance, Distances};
|
||||
use crate::numbers::basenum::Number;
|
||||
use crate::tree::decision_tree_classifier::which_max;
|
||||
|
||||
/// DBSCAN clustering algorithm
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[derive(Debug)]
|
||||
pub struct DBSCAN<T: RealNumber, D: Distance<Vec<T>, T>> {
|
||||
pub struct DBSCAN<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> {
|
||||
cluster_labels: Vec<i16>,
|
||||
num_classes: usize,
|
||||
knn_algorithm: KNNAlgorithm<T, D>,
|
||||
eps: T,
|
||||
knn_algorithm: KNNAlgorithm<TX, D>,
|
||||
eps: f64,
|
||||
_phantom_ty: PhantomData<TY>,
|
||||
_phantom_x: PhantomData<X>,
|
||||
_phantom_y: PhantomData<Y>,
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[derive(Debug, Clone)]
|
||||
/// DBSCAN clustering algorithm parameters
|
||||
pub struct DBSCANParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
|
||||
pub struct DBSCANParameters<T: Number, D: Distance<Vec<T>>> {
|
||||
#[cfg_attr(feature = "serde", serde(default))]
|
||||
/// a function that defines a distance between each pair of point in training data.
|
||||
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
||||
@@ -79,22 +83,25 @@ pub struct DBSCANParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
|
||||
pub min_samples: usize,
|
||||
#[cfg_attr(feature = "serde", serde(default))]
|
||||
/// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
|
||||
pub eps: T,
|
||||
pub eps: f64,
|
||||
#[cfg_attr(feature = "serde", serde(default))]
|
||||
/// KNN algorithm to use.
|
||||
pub algorithm: KNNAlgorithmName,
|
||||
#[cfg_attr(feature = "serde", serde(default))]
|
||||
_phantom_t: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
|
||||
impl<T: Number, D: Distance<Vec<T>>> DBSCANParameters<T, D> {
|
||||
/// a function that defines a distance between each pair of point in training data.
|
||||
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
||||
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
|
||||
pub fn with_distance<DD: Distance<Vec<T>, T>>(self, distance: DD) -> DBSCANParameters<T, DD> {
|
||||
pub fn with_distance<DD: Distance<Vec<T>>>(self, distance: DD) -> DBSCANParameters<T, DD> {
|
||||
DBSCANParameters {
|
||||
distance,
|
||||
min_samples: self.min_samples,
|
||||
eps: self.eps,
|
||||
algorithm: self.algorithm,
|
||||
_phantom_t: PhantomData,
|
||||
}
|
||||
}
|
||||
/// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
|
||||
@@ -103,7 +110,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
|
||||
self
|
||||
}
|
||||
/// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
|
||||
pub fn with_eps(mut self, eps: T) -> Self {
|
||||
pub fn with_eps(mut self, eps: f64) -> Self {
|
||||
self.eps = eps;
|
||||
self
|
||||
}
|
||||
@@ -117,7 +124,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> DBSCANParameters<T, D> {
|
||||
/// DBSCAN grid search parameters
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DBSCANSearchParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
|
||||
pub struct DBSCANSearchParameters<T: Number, D: Distance<Vec<T>>> {
|
||||
#[cfg_attr(feature = "serde", serde(default))]
|
||||
/// a function that defines a distance between each pair of point in training data.
|
||||
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
|
||||
@@ -128,14 +135,15 @@ pub struct DBSCANSearchParameters<T: RealNumber, D: Distance<Vec<T>, T>> {
|
||||
pub min_samples: Vec<usize>,
|
||||
#[cfg_attr(feature = "serde", serde(default))]
|
||||
/// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
|
||||
pub eps: Vec<T>,
|
||||
pub eps: Vec<f64>,
|
||||
#[cfg_attr(feature = "serde", serde(default))]
|
||||
/// KNN algorithm to use.
|
||||
pub algorithm: Vec<KNNAlgorithmName>,
|
||||
_phantom_t: PhantomData<T>,
|
||||
}
|
||||
|
||||
/// DBSCAN grid search iterator
|
||||
pub struct DBSCANSearchParametersIterator<T: RealNumber, D: Distance<Vec<T>, T>> {
|
||||
pub struct DBSCANSearchParametersIterator<T: Number, D: Distance<Vec<T>>> {
|
||||
dbscan_search_parameters: DBSCANSearchParameters<T, D>,
|
||||
current_distance: usize,
|
||||
current_min_samples: usize,
|
||||
@@ -143,7 +151,7 @@ pub struct DBSCANSearchParametersIterator<T: RealNumber, D: Distance<Vec<T>, T>>
|
||||
current_algorithm: usize,
|
||||
}
|
||||
|
||||
impl<T: RealNumber, D: Distance<Vec<T>, T>> IntoIterator for DBSCANSearchParameters<T, D> {
|
||||
impl<T: Number, D: Distance<Vec<T>>> IntoIterator for DBSCANSearchParameters<T, D> {
|
||||
type Item = DBSCANParameters<T, D>;
|
||||
type IntoIter = DBSCANSearchParametersIterator<T, D>;
|
||||
|
||||
@@ -158,7 +166,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> IntoIterator for DBSCANSearchParamet
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersIterator<T, D> {
|
||||
impl<T: Number, D: Distance<Vec<T>>> Iterator for DBSCANSearchParametersIterator<T, D> {
|
||||
type Item = DBSCANParameters<T, D>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
@@ -175,6 +183,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersI
|
||||
min_samples: self.dbscan_search_parameters.min_samples[self.current_min_samples],
|
||||
eps: self.dbscan_search_parameters.eps[self.current_eps],
|
||||
algorithm: self.dbscan_search_parameters.algorithm[self.current_algorithm].clone(),
|
||||
_phantom_t: PhantomData,
|
||||
};
|
||||
|
||||
if self.current_distance + 1 < self.dbscan_search_parameters.distance.len() {
|
||||
@@ -202,7 +211,7 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> Iterator for DBSCANSearchParametersI
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber> Default for DBSCANSearchParameters<T, Euclidian> {
|
||||
impl<T: Number> Default for DBSCANSearchParameters<T, Euclidian<T>> {
|
||||
fn default() -> Self {
|
||||
let default_params = DBSCANParameters::default();
|
||||
|
||||
@@ -211,11 +220,14 @@ impl<T: RealNumber> Default for DBSCANSearchParameters<T, Euclidian> {
|
||||
min_samples: vec![default_params.min_samples],
|
||||
eps: vec![default_params.eps],
|
||||
algorithm: vec![default_params.algorithm],
|
||||
_phantom_t: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
|
||||
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> PartialEq
|
||||
for DBSCAN<TX, TY, X, Y, D>
|
||||
{
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cluster_labels.len() == other.cluster_labels.len()
|
||||
&& self.num_classes == other.num_classes
|
||||
@@ -224,47 +236,50 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber> Default for DBSCANParameters<T, Euclidian> {
|
||||
impl<T: Number> Default for DBSCANParameters<T, Euclidian<T>> {
|
||||
fn default() -> Self {
|
||||
DBSCANParameters {
|
||||
distance: Distances::euclidian(),
|
||||
min_samples: 5,
|
||||
eps: T::half(),
|
||||
eps: 0.5f64,
|
||||
algorithm: KNNAlgorithmName::default(),
|
||||
_phantom_t: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber + Sum, M: Matrix<T>, D: Distance<Vec<T>, T>>
|
||||
UnsupervisedEstimator<M, DBSCANParameters<T, D>> for DBSCAN<T, D>
|
||||
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
|
||||
UnsupervisedEstimator<X, DBSCANParameters<TX, D>> for DBSCAN<TX, TY, X, Y, D>
|
||||
{
|
||||
fn fit(x: &M, parameters: DBSCANParameters<T, D>) -> Result<Self, Failed> {
|
||||
fn fit(x: &X, parameters: DBSCANParameters<TX, D>) -> Result<Self, Failed> {
|
||||
DBSCAN::fit(x, parameters)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber, M: Matrix<T>, D: Distance<Vec<T>, T>> Predictor<M, M::RowVector>
|
||||
for DBSCAN<T, D>
|
||||
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>> Predictor<X, Y>
|
||||
for DBSCAN<TX, TY, X, Y, D>
|
||||
{
|
||||
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||
fn predict(&self, x: &X) -> Result<Y, Failed> {
|
||||
self.predict(x)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
|
||||
impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: Distance<Vec<TX>>>
|
||||
DBSCAN<TX, TY, X, Y, D>
|
||||
{
|
||||
/// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features.
|
||||
/// * `data` - training instances to cluster
|
||||
/// * `k` - number of clusters
|
||||
/// * `parameters` - cluster parameters
|
||||
pub fn fit<M: Matrix<T>>(
|
||||
x: &M,
|
||||
parameters: DBSCANParameters<T, D>,
|
||||
) -> Result<DBSCAN<T, D>, Failed> {
|
||||
pub fn fit(
|
||||
x: &X,
|
||||
parameters: DBSCANParameters<TX, D>,
|
||||
) -> Result<DBSCAN<TX, TY, X, Y, D>, Failed> {
|
||||
if parameters.min_samples < 1 {
|
||||
return Err(Failed::fit("Invalid minPts"));
|
||||
}
|
||||
|
||||
if parameters.eps <= T::zero() {
|
||||
if parameters.eps <= 0f64 {
|
||||
return Err(Failed::fit("Invalid radius: "));
|
||||
}
|
||||
|
||||
@@ -276,13 +291,19 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
|
||||
let n = x.shape().0;
|
||||
let mut y = vec![undefined; n];
|
||||
|
||||
let algo = parameters
|
||||
.algorithm
|
||||
.fit(row_iter(x).collect(), parameters.distance)?;
|
||||
let algo = parameters.algorithm.fit(
|
||||
x.row_iter()
|
||||
.map(|row| row.iterator(0).cloned().collect())
|
||||
.collect(),
|
||||
parameters.distance,
|
||||
)?;
|
||||
|
||||
for (i, e) in row_iter(x).enumerate() {
|
||||
let mut row = vec![TX::zero(); x.shape().1];
|
||||
|
||||
for (i, e) in x.row_iter().enumerate() {
|
||||
if y[i] == undefined {
|
||||
let mut neighbors = algo.find_radius(&e, parameters.eps)?;
|
||||
e.iterator(0).zip(row.iter_mut()).for_each(|(&x, r)| *r = x);
|
||||
let mut neighbors = algo.find_radius(&row, parameters.eps)?;
|
||||
if neighbors.len() < parameters.min_samples {
|
||||
y[i] = outlier;
|
||||
} else {
|
||||
@@ -333,18 +354,25 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
|
||||
num_classes: k as usize,
|
||||
knn_algorithm: algo,
|
||||
eps: parameters.eps,
|
||||
_phantom_ty: PhantomData,
|
||||
_phantom_x: PhantomData,
|
||||
_phantom_y: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// Predict clusters for `x`
|
||||
/// * `x` - matrix with new data to transform of size _KxM_ , where _K_ is number of new samples and _M_ is number of features.
|
||||
pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||
let (n, m) = x.shape();
|
||||
let mut result = M::zeros(1, n);
|
||||
let mut row = vec![T::zero(); m];
|
||||
pub fn predict(&self, x: &X) -> Result<Y, Failed> {
|
||||
let (n, _) = x.shape();
|
||||
let mut result = Y::zeros(n);
|
||||
|
||||
let mut row = vec![TX::zero(); x.shape().1];
|
||||
|
||||
for i in 0..n {
|
||||
x.copy_row_as_vec(i, &mut row);
|
||||
x.get_row(i)
|
||||
.iterator(0)
|
||||
.zip(row.iter_mut())
|
||||
.for_each(|(&x, r)| *r = x);
|
||||
let neighbors = self.knn_algorithm.find_radius(&row, self.eps)?;
|
||||
let mut label = vec![0usize; self.num_classes + 1];
|
||||
for neighbor in neighbors {
|
||||
@@ -357,26 +385,26 @@ impl<T: RealNumber + Sum, D: Distance<Vec<T>, T>> DBSCAN<T, D> {
|
||||
}
|
||||
let class = which_max(&label);
|
||||
if class != self.num_classes {
|
||||
result.set(0, i, T::from(class).unwrap());
|
||||
result.set(i, TY::from(class + 1).unwrap());
|
||||
} else {
|
||||
result.set(0, i, -T::one());
|
||||
result.set(i, TY::zero());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result.to_row_vector())
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::linalg::naive::dense_matrix::DenseMatrix;
|
||||
use crate::linalg::basic::matrix::DenseMatrix;
|
||||
#[cfg(feature = "serde")]
|
||||
use crate::math::distance::euclidian::Euclidian;
|
||||
use crate::metrics::distance::euclidian::Euclidian;
|
||||
|
||||
#[test]
|
||||
fn search_parameters() {
|
||||
let parameters = DBSCANSearchParameters {
|
||||
let parameters: DBSCANSearchParameters<f64, Euclidian<f64>> = DBSCANSearchParameters {
|
||||
min_samples: vec![10, 100],
|
||||
eps: vec![1., 2.],
|
||||
..Default::default()
|
||||
@@ -414,7 +442,7 @@ mod tests {
|
||||
&[3.0, 5.0],
|
||||
]);
|
||||
|
||||
let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0];
|
||||
let expected_labels = vec![1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0];
|
||||
|
||||
let dbscan = DBSCAN::fit(
|
||||
&x,
|
||||
@@ -424,7 +452,7 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let predicted_labels = dbscan.predict(&x).unwrap();
|
||||
let predicted_labels: Vec<i32> = dbscan.predict(&x).unwrap();
|
||||
|
||||
assert_eq!(expected_labels, predicted_labels);
|
||||
}
|
||||
@@ -458,9 +486,23 @@ mod tests {
|
||||
|
||||
let dbscan = DBSCAN::fit(&x, Default::default()).unwrap();
|
||||
|
||||
let deserialized_dbscan: DBSCAN<f64, Euclidian> =
|
||||
let deserialized_dbscan: DBSCAN<f32, f32, DenseMatrix<f32>, Vec<f32>, Euclidian<f32>> =
|
||||
serde_json::from_str(&serde_json::to_string(&dbscan).unwrap()).unwrap();
|
||||
|
||||
assert_eq!(dbscan, deserialized_dbscan);
|
||||
}
|
||||
use crate::dataset::generator;
|
||||
|
||||
#[test]
|
||||
fn from_vec() {
|
||||
// Generate three blobs
|
||||
let blobs = generator::make_blobs(100, 2, 3);
|
||||
let x: DenseMatrix<f32> = DenseMatrix::from_iterator(blobs.data.into_iter(), 100, 2, 0);
|
||||
// Fit the algorithm and predict cluster labels
|
||||
let labels: Vec<i32> = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0))
|
||||
.and_then(|dbscan| dbscan.predict(&x))
|
||||
.unwrap();
|
||||
|
||||
println!("{:?}", labels);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user