diff --git a/Cargo.toml b/Cargo.toml index 2978238..069e223 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ default = ["datasets"] ndarray-bindings = ["ndarray"] nalgebra-bindings = ["nalgebra"] datasets = [] +fp_bench = [] [dependencies] ndarray = { version = "0.15", optional = true } @@ -26,12 +27,14 @@ num = "0.4" rand = "0.8" rand_distr = "0.4" serde = { version = "1", features = ["derive"], optional = true } +itertools = "0.10.3" [target.'cfg(target_arch = "wasm32")'.dependencies] getrandom = { version = "0.2", features = ["js"] } [dev-dependencies] -criterion = "0.3" +smartcore = { path = ".", features = ["fp_bench"] } +criterion = { version = "0.4", default-features = false } serde_json = "1.0" bincode = "1.3.1" @@ -46,3 +49,8 @@ harness = false name = "naive_bayes" harness = false required-features = ["ndarray-bindings", "nalgebra-bindings"] + +[[bench]] +name = "fastpair" +harness = false +required-features = ["fp_bench"] diff --git a/benches/fastpair.rs b/benches/fastpair.rs new file mode 100644 index 0000000..baa0e90 --- /dev/null +++ b/benches/fastpair.rs @@ -0,0 +1,56 @@ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; + +// to run this bench you have to change the declaraion in mod.rs ---> pub mod fastpair; +use smartcore::algorithm::neighbour::fastpair::FastPair; +use smartcore::linalg::naive::dense_matrix::*; +use std::time::Duration; + +fn closest_pair_bench(n: usize, m: usize) -> () { + let x = DenseMatrix::::rand(n, m); + let fastpair = FastPair::new(&x); + let result = fastpair.unwrap(); + + result.closest_pair(); +} + +fn closest_pair_brute_bench(n: usize, m: usize) -> () { + let x = DenseMatrix::::rand(n, m); + let fastpair = FastPair::new(&x); + let result = fastpair.unwrap(); + + result.closest_pair_brute(); +} + +fn bench_fastpair(c: &mut Criterion) { + let mut group = c.benchmark_group("FastPair"); + + // with full samples size (100) the test will take too long + group.significance_level(0.1).sample_size(30); + // increase from default 5.0 secs + group.measurement_time(Duration::from_secs(60)); + + for n_samples in [100_usize, 1000_usize].iter() { + for n_features in [10_usize, 100_usize, 1000_usize].iter() { + group.bench_with_input( + BenchmarkId::from_parameter(format!( + "fastpair --- n_samples: {}, n_features: {}", + n_samples, n_features + )), + n_samples, + |b, _| b.iter(|| closest_pair_bench(*n_samples, *n_features)), + ); + group.bench_with_input( + BenchmarkId::from_parameter(format!( + "brute --- n_samples: {}, n_features: {}", + n_samples, n_features + )), + n_samples, + |b, _| b.iter(|| closest_pair_brute_bench(*n_samples, *n_features)), + ); + } + } + group.finish(); +} + +criterion_group!(benches, bench_fastpair); +criterion_main!(benches); diff --git a/src/algorithm/neighbour/bbd_tree.rs b/src/algorithm/neighbour/bbd_tree.rs index 293a822..93ea050 100644 --- a/src/algorithm/neighbour/bbd_tree.rs +++ b/src/algorithm/neighbour/bbd_tree.rs @@ -59,7 +59,7 @@ impl BBDTree { tree } - pub(in crate) fn clustering( + pub(crate) fn clustering( &self, centroids: &[Vec], sums: &mut Vec>, diff --git a/src/algorithm/neighbour/distances.rs b/src/algorithm/neighbour/distances.rs new file mode 100644 index 0000000..56a7ed6 --- /dev/null +++ b/src/algorithm/neighbour/distances.rs @@ -0,0 +1,48 @@ +//! +//! Dissimilarities for vector-vector distance +//! +//! Representing distances as pairwise dissimilarities, so to build a +//! graph of closest neighbours. This representation can be reused for +//! different implementations (initially used in this library for FastPair). +use std::cmp::{Eq, Ordering, PartialOrd}; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::math::num::RealNumber; + +/// +/// The edge of the subgraph is defined by `PairwiseDistance`. +/// The calling algorithm can store a list of distsances as +/// a list of these structures. +/// +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone, Copy)] +pub struct PairwiseDistance { + /// index of the vector in the original `Matrix` or list + pub node: usize, + + /// index of the closest neighbor in the original `Matrix` or same list + pub neighbour: Option, + + /// measure of distance, according to the algorithm distance function + /// if the distance is None, the edge has value "infinite" or max distance + /// each algorithm has to match + pub distance: Option, +} + +impl Eq for PairwiseDistance {} + +impl PartialEq for PairwiseDistance { + fn eq(&self, other: &Self) -> bool { + self.node == other.node + && self.neighbour == other.neighbour + && self.distance == other.distance + } +} + +impl PartialOrd for PairwiseDistance { + fn partial_cmp(&self, other: &Self) -> Option { + self.distance.partial_cmp(&other.distance) + } +} diff --git a/src/algorithm/neighbour/fastpair.rs b/src/algorithm/neighbour/fastpair.rs new file mode 100644 index 0000000..e14c2b3 --- /dev/null +++ b/src/algorithm/neighbour/fastpair.rs @@ -0,0 +1,570 @@ +#![allow(non_snake_case)] +use itertools::Itertools; +/// +/// # FastPair: Data-structure for the dynamic closest-pair problem. +/// +/// Reference: +/// Eppstein, David: Fast hierarchical clustering and other applications of +/// dynamic closest pairs. Journal of Experimental Algorithmics 5 (2000) 1. +/// +/// Example: +/// ``` +/// use smartcore::algorithm::neighbour::distances::PairwiseDistance; +/// use smartcore::linalg::naive::dense_matrix::DenseMatrix; +/// use smartcore::algorithm::neighbour::fastpair::FastPair; +/// let x = DenseMatrix::::from_2d_array(&[ +/// &[5.1, 3.5, 1.4, 0.2], +/// &[4.9, 3.0, 1.4, 0.2], +/// &[4.7, 3.2, 1.3, 0.2], +/// &[4.6, 3.1, 1.5, 0.2], +/// &[5.0, 3.6, 1.4, 0.2], +/// &[5.4, 3.9, 1.7, 0.4], +/// ]); +/// let fastpair = FastPair::new(&x); +/// let closest_pair: PairwiseDistance = fastpair.unwrap().closest_pair(); +/// ``` +/// +/// +use std::collections::HashMap; + +use crate::algorithm::neighbour::distances::PairwiseDistance; +use crate::error::{Failed, FailedError}; +use crate::linalg::Matrix; +use crate::math::distance::euclidian::Euclidian; +use crate::math::num::RealNumber; + +/// +/// Inspired by Python implementation: +/// +/// MIT License (MIT) Copyright (c) 2016 Carson Farmer +/// +/// affinity used is Euclidean so to allow linkage with single, ward, complete and average +/// +#[derive(Debug, Clone)] +pub struct FastPair<'a, T: RealNumber, M: Matrix> { + /// initial matrix + samples: &'a M, + /// closest pair hashmap (connectivity matrix for closest pairs) + pub distances: HashMap>, + /// conga line used to keep track of the closest pair + pub neighbours: Vec, +} + +impl<'a, T: RealNumber, M: Matrix> FastPair<'a, T, M> { + /// + /// Constructor + /// Instantiate and inizialise the algorithm + /// + pub fn new(m: &'a M) -> Result { + if m.shape().0 < 3 { + return Err(Failed::because( + FailedError::FindFailed, + "min number of rows should be 3", + )); + } + + let mut init = Self { + samples: m, + // to be computed in init(..) + distances: HashMap::with_capacity(m.shape().0), + neighbours: Vec::with_capacity(m.shape().0 + 1), + }; + init.init(); + Ok(init) + } + + /// + /// Initialise `FastPair` by passing a `Matrix`. + /// Build a FastPairs data-structure from a set of (new) points. + /// + fn init(&mut self) { + // basic measures + let len = self.samples.shape().0; + let max_index = self.samples.shape().0 - 1; + + // Store all closest neighbors + let _distances = Box::new(HashMap::with_capacity(len)); + let _neighbours = Box::new(Vec::with_capacity(len)); + + let mut distances = *_distances; + let mut neighbours = *_neighbours; + + // fill neighbours with -1 values + neighbours.extend(0..len); + + // init closest neighbour pairwise data + for index_row_i in 0..(max_index) { + distances.insert( + index_row_i, + PairwiseDistance { + node: index_row_i, + neighbour: None, + distance: Some(T::max_value()), + }, + ); + } + + // loop through indeces and neighbours + for index_row_i in 0..(len) { + // start looking for the neighbour in the second element + let mut index_closest = index_row_i + 1; // closest neighbour index + let mut nbd: Option = distances[&index_row_i].distance; // init neighbour distance + for index_row_j in (index_row_i + 1)..len { + distances.insert( + index_row_j, + PairwiseDistance { + node: index_row_j, + neighbour: Some(index_row_i), + distance: nbd, + }, + ); + + let d = Euclidian::squared_distance( + &(self.samples.get_row_as_vec(index_row_i)), + &(self.samples.get_row_as_vec(index_row_j)), + ); + if d < nbd.unwrap() { + // set this j-value to be the closest neighbour + index_closest = index_row_j; + nbd = Some(d); + } + } + + // Add that edge + distances.entry(index_row_i).and_modify(|e| { + e.distance = nbd; + e.neighbour = Some(index_closest); + }); + } + // No more neighbors, terminate conga line. + // Last person on the line has no neigbors + distances.get_mut(&max_index).unwrap().neighbour = Some(max_index); + distances.get_mut(&(len - 1)).unwrap().distance = Some(T::max_value()); + + // compute sparse matrix (connectivity matrix) + let mut sparse_matrix = M::zeros(len, len); + for (_, p) in distances.iter() { + sparse_matrix.set(p.node, p.neighbour.unwrap(), p.distance.unwrap()); + } + + self.distances = distances; + self.neighbours = neighbours; + } + + /// + /// Find closest pair by scanning list of nearest neighbors. + /// + #[allow(dead_code)] + pub fn closest_pair(&self) -> PairwiseDistance { + let mut a = self.neighbours[0]; // Start with first point + let mut d = self.distances[&a].distance; + for p in self.neighbours.iter() { + if self.distances[p].distance < d { + a = *p; // Update `a` and distance `d` + d = self.distances[p].distance; + } + } + let b = self.distances[&a].neighbour; + PairwiseDistance { + node: a, + neighbour: b, + distance: d, + } + } + + /// + /// Brute force algorithm, used only for comparison and testing + /// + #[cfg(feature = "fp_bench")] + pub fn closest_pair_brute(&self) -> PairwiseDistance { + let m = self.samples.shape().0; + + let mut closest_pair = PairwiseDistance { + node: 0, + neighbour: None, + distance: Some(T::max_value()), + }; + for pair in (0..m).combinations(2) { + let d = Euclidian::squared_distance( + &(self.samples.get_row_as_vec(pair[0])), + &(self.samples.get_row_as_vec(pair[1])), + ); + if d < closest_pair.distance.unwrap() { + closest_pair.node = pair[0]; + closest_pair.neighbour = Some(pair[1]); + closest_pair.distance = Some(d); + } + } + closest_pair + } + + // + // Compute distances from input to all other points in data-structure. + // input is the row index of the sample matrix + // + #[allow(dead_code)] + fn distances_from(&self, index_row: usize) -> Vec> { + let mut distances = Vec::>::with_capacity(self.samples.shape().0); + for other in self.neighbours.iter() { + if index_row != *other { + distances.push(PairwiseDistance { + node: index_row, + neighbour: Some(*other), + distance: Some(Euclidian::squared_distance( + &(self.samples.get_row_as_vec(index_row)), + &(self.samples.get_row_as_vec(*other)), + )), + }) + } + } + distances + } +} + +#[cfg(test)] +mod tests_fastpair { + + use super::*; + use crate::linalg::naive::dense_matrix::*; + + #[test] + fn fastpair_init() { + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let _fastpair = FastPair::new(&x); + assert!(_fastpair.is_ok()); + + let fastpair = _fastpair.unwrap(); + + let distances = fastpair.distances; + let neighbours = fastpair.neighbours; + + assert!(distances.len() != 0); + assert!(neighbours.len() != 0); + + assert_eq!(10, neighbours.len()); + assert_eq!(10, distances.len()); + } + + #[test] + fn dataset_has_at_least_three_points() { + // Create a dataset which consists of only two points: + // A(0.0, 0.0) and B(1.0, 1.0). + let dataset = DenseMatrix::::from_2d_array(&[&[0.0, 0.0], &[1.0, 1.0]]); + + // We expect an error when we run `FastPair` on this dataset, + // becuase `FastPair` currently only works on a minimum of 3 + // points. + let _fastpair = FastPair::new(&dataset); + + match _fastpair { + Err(e) => { + let expected_error = + Failed::because(FailedError::FindFailed, "min number of rows should be 3"); + assert_eq!(e, expected_error) + } + _ => { + assert!(false); + } + } + } + + #[test] + fn one_dimensional_dataset_minimal() { + let dataset = DenseMatrix::::from_2d_array(&[&[0.0], &[2.0], &[9.0]]); + + let result = FastPair::new(&dataset); + assert!(result.is_ok()); + + let fastpair = result.unwrap(); + let closest_pair = fastpair.closest_pair(); + let expected_closest_pair = PairwiseDistance { + node: 0, + neighbour: Some(1), + distance: Some(4.0), + }; + assert_eq!(closest_pair, expected_closest_pair); + + let closest_pair_brute = fastpair.closest_pair_brute(); + assert_eq!(closest_pair_brute, expected_closest_pair); + } + + #[test] + fn one_dimensional_dataset_2() { + let dataset = DenseMatrix::::from_2d_array(&[&[27.0], &[0.0], &[9.0], &[2.0]]); + + let result = FastPair::new(&dataset); + assert!(result.is_ok()); + + let fastpair = result.unwrap(); + let closest_pair = fastpair.closest_pair(); + let expected_closest_pair = PairwiseDistance { + node: 1, + neighbour: Some(3), + distance: Some(4.0), + }; + assert_eq!(closest_pair, fastpair.closest_pair_brute()); + assert_eq!(closest_pair, expected_closest_pair); + } + + #[test] + fn fastpair_new() { + // compute + let x = DenseMatrix::::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + ]); + let fastpair = FastPair::new(&x); + assert!(fastpair.is_ok()); + + // unwrap results + let result = fastpair.unwrap(); + + // list of minimal pairwise dissimilarities + let dissimilarities = vec![ + ( + 1, + PairwiseDistance { + node: 1, + neighbour: Some(9), + distance: Some(0.030000000000000037), + }, + ), + ( + 10, + PairwiseDistance { + node: 10, + neighbour: Some(12), + distance: Some(0.07000000000000003), + }, + ), + ( + 11, + PairwiseDistance { + node: 11, + neighbour: Some(14), + distance: Some(0.18000000000000013), + }, + ), + ( + 12, + PairwiseDistance { + node: 12, + neighbour: Some(14), + distance: Some(0.34000000000000086), + }, + ), + ( + 13, + PairwiseDistance { + node: 13, + neighbour: Some(14), + distance: Some(1.6499999999999997), + }, + ), + ( + 14, + PairwiseDistance { + node: 14, + neighbour: Some(14), + distance: Some(f64::MAX), + }, + ), + ( + 6, + PairwiseDistance { + node: 6, + neighbour: Some(7), + distance: Some(0.18000000000000027), + }, + ), + ( + 0, + PairwiseDistance { + node: 0, + neighbour: Some(4), + distance: Some(0.01999999999999995), + }, + ), + ( + 8, + PairwiseDistance { + node: 8, + neighbour: Some(9), + distance: Some(0.3100000000000001), + }, + ), + ( + 2, + PairwiseDistance { + node: 2, + neighbour: Some(3), + distance: Some(0.0600000000000001), + }, + ), + ( + 3, + PairwiseDistance { + node: 3, + neighbour: Some(8), + distance: Some(0.08999999999999982), + }, + ), + ( + 7, + PairwiseDistance { + node: 7, + neighbour: Some(9), + distance: Some(0.10999999999999982), + }, + ), + ( + 9, + PairwiseDistance { + node: 9, + neighbour: Some(13), + distance: Some(8.69), + }, + ), + ( + 4, + PairwiseDistance { + node: 4, + neighbour: Some(7), + distance: Some(0.050000000000000086), + }, + ), + ( + 5, + PairwiseDistance { + node: 5, + neighbour: Some(7), + distance: Some(0.4900000000000002), + }, + ), + ]; + + let expected: HashMap<_, _> = dissimilarities.into_iter().collect(); + + for i in 0..(x.shape().0 - 1) { + let input_node = result.samples.get_row_as_vec(i); + let input_neighbour: usize = expected.get(&i).unwrap().neighbour.unwrap(); + let distance = Euclidian::squared_distance( + &input_node, + &result.samples.get_row_as_vec(input_neighbour), + ); + + assert_eq!(i, expected.get(&i).unwrap().node); + assert_eq!( + input_neighbour, + expected.get(&i).unwrap().neighbour.unwrap() + ); + assert_eq!(distance, expected.get(&i).unwrap().distance.unwrap()); + } + } + + #[test] + fn fastpair_closest_pair() { + let x = DenseMatrix::::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + ]); + // compute + let fastpair = FastPair::new(&x); + assert!(fastpair.is_ok()); + + let dissimilarity = fastpair.unwrap().closest_pair(); + let closest = PairwiseDistance { + node: 0, + neighbour: Some(4), + distance: Some(0.01999999999999995), + }; + + assert_eq!(closest, dissimilarity); + } + + #[test] + fn fastpair_closest_pair_random_matrix() { + let x = DenseMatrix::::rand(200, 25); + // compute + let fastpair = FastPair::new(&x); + assert!(fastpair.is_ok()); + + let result = fastpair.unwrap(); + + let dissimilarity1 = result.closest_pair(); + let dissimilarity2 = result.closest_pair_brute(); + + assert_eq!(dissimilarity1, dissimilarity2); + } + + #[test] + fn fastpair_distances() { + let x = DenseMatrix::::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + ]); + // compute + let fastpair = FastPair::new(&x); + assert!(fastpair.is_ok()); + + let dissimilarities = fastpair.unwrap().distances_from(0); + + let mut min_dissimilarity = PairwiseDistance { + node: 0, + neighbour: None, + distance: Some(f64::MAX), + }; + for p in dissimilarities.iter() { + if p.distance.unwrap() < min_dissimilarity.distance.unwrap() { + min_dissimilarity = p.clone() + } + } + + let closest = PairwiseDistance { + node: 0, + neighbour: Some(4), + distance: Some(0.01999999999999995), + }; + + assert_eq!(closest, min_dissimilarity); + } +} diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 321ec01..42ab7bc 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -41,6 +41,10 @@ use serde::{Deserialize, Serialize}; pub(crate) mod bbd_tree; /// tree data structure for fast nearest neighbor search pub mod cover_tree; +/// dissimilarities for vector-vector distance. Linkage algorithms used in fastpair +pub mod distances; +/// fastpair closest neighbour algorithm +pub mod fastpair; /// very simple algorithm that sequentially checks each element of the list until a match is found or the whole list has been searched. pub mod linear_search; diff --git a/src/algorithm/sort/heap_select.rs b/src/algorithm/sort/heap_select.rs index beb698f..bc880bc 100644 --- a/src/algorithm/sort/heap_select.rs +++ b/src/algorithm/sort/heap_select.rs @@ -12,7 +12,7 @@ pub struct HeapSelection { heap: Vec, } -impl<'a, T: PartialOrd + Debug> HeapSelection { +impl HeapSelection { pub fn with_capacity(k: usize) -> HeapSelection { HeapSelection { k, diff --git a/src/linalg/evd.rs b/src/linalg/evd.rs index bf195a0..fdca1fb 100644 --- a/src/linalg/evd.rs +++ b/src/linalg/evd.rs @@ -25,6 +25,19 @@ //! let eigenvectors: DenseMatrix = evd.V; //! let eigenvalues: Vec = evd.d; //! ``` +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::linalg::evd::*; +//! +//! let A = DenseMatrix::from_2d_array(&[ +//! &[-5.0, 2.0], +//! &[-7.0, 4.0], +//! ]); +//! +//! let evd = A.evd(false).unwrap(); +//! let eigenvectors: DenseMatrix = evd.V; +//! let eigenvalues: Vec = evd.d; +//! ``` //! //! ## References: //! * ["Numerical Recipes: The Art of Scientific Computing", Press W.H., Teukolsky S.A., Vetterling W.T, Flannery B.P, 3rd ed., Section 11 Eigensystems](http://numerical.recipes/) @@ -799,10 +812,10 @@ fn sort>(d: &mut [T], e: &mut [T], V: &mut M) { } i -= 1; } - d[i as usize + 1] = real; - e[i as usize + 1] = img; + d[(i + 1) as usize] = real; + e[(i + 1) as usize] = img; for (k, temp_k) in temp.iter().enumerate().take(n) { - V.set(k, i as usize + 1, *temp_k); + V.set(k, (i + 1) as usize, *temp_k); } } } diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 59b6089..8e27c0b 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -651,6 +651,10 @@ pub trait BaseMatrix: Clone + Debug { result } + /// Take an individual column from the matrix. + fn take_column(&self, column_index: usize) -> Self { + self.take(&[column_index], 1) + } } /// Generic matrix with additional mixins like various factorization methods. @@ -761,4 +765,21 @@ mod tests { assert_eq!(m.take(&vec!(1, 1, 3), 0), expected_0); assert_eq!(m.take(&vec!(1, 0), 1), expected_1); } + + #[test] + fn take_second_column_from_matrix() { + let four_columns: DenseMatrix = DenseMatrix::from_2d_array(&[ + &[0.0, 1.0, 2.0, 3.0], + &[0.0, 1.0, 2.0, 3.0], + &[0.0, 1.0, 2.0, 3.0], + &[0.0, 1.0, 2.0, 3.0], + ]); + + let second_column = four_columns.take_column(1); + assert_eq!( + second_column, + DenseMatrix::from_2d_array(&[&[1.0], &[1.0], &[1.0], &[1.0]]), + "The second column was not extracted correctly" + ); + } } diff --git a/src/linear/lasso_optimizer.rs b/src/linear/lasso_optimizer.rs index c4340fc..aa09128 100644 --- a/src/linear/lasso_optimizer.rs +++ b/src/linear/lasso_optimizer.rs @@ -211,9 +211,7 @@ impl> InteriorPointOptimizer { } } -impl<'a, T: RealNumber, M: Matrix> BiconjugateGradientSolver - for InteriorPointOptimizer -{ +impl> BiconjugateGradientSolver for InteriorPointOptimizer { fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { let (_, p) = a.shape(); diff --git a/src/math/num.rs b/src/math/num.rs index 7199949..c454b9d 100644 --- a/src/math/num.rs +++ b/src/math/num.rs @@ -46,8 +46,11 @@ pub trait RealNumber: self * self } - /// Raw transmutation to u64 + /// Raw transmutation to u32 fn to_f32_bits(self) -> u32; + + /// Raw transmutation to u64 + fn to_f64_bits(self) -> u64; } impl RealNumber for f64 { @@ -89,6 +92,10 @@ impl RealNumber for f64 { fn to_f32_bits(self) -> u32 { self.to_bits() as u32 } + + fn to_f64_bits(self) -> u64 { + self.to_bits() + } } impl RealNumber for f32 { @@ -130,6 +137,10 @@ impl RealNumber for f32 { fn to_f32_bits(self) -> u32 { self.to_bits() } + + fn to_f64_bits(self) -> u64 { + self.to_bits() as u64 + } } #[cfg(test)] diff --git a/src/metrics/precision.rs b/src/metrics/precision.rs index a0171aa..a2bad30 100644 --- a/src/metrics/precision.rs +++ b/src/metrics/precision.rs @@ -18,6 +18,8 @@ //! //! //! +use std::collections::HashSet; + #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -42,34 +44,33 @@ impl Precision { ); } + let mut classes = HashSet::new(); + for i in 0..y_true.len() { + classes.insert(y_true.get(i).to_f64_bits()); + } + let classes = classes.len(); + let mut tp = 0; - let mut p = 0; - let n = y_true.len(); - for i in 0..n { - if y_true.get(i) != T::zero() && y_true.get(i) != T::one() { - panic!( - "Precision can only be applied to binary classification: {}", - y_true.get(i) - ); - } - - if y_pred.get(i) != T::zero() && y_pred.get(i) != T::one() { - panic!( - "Precision can only be applied to binary classification: {}", - y_pred.get(i) - ); - } - - if y_pred.get(i) == T::one() { - p += 1; - - if y_true.get(i) == T::one() { + let mut fp = 0; + for i in 0..y_true.len() { + if y_pred.get(i) == y_true.get(i) { + if classes == 2 { + if y_true.get(i) == T::one() { + tp += 1; + } + } else { tp += 1; } + } else if classes == 2 { + if y_true.get(i) == T::one() { + fp += 1; + } + } else { + fp += 1; } } - T::from_i64(tp).unwrap() / T::from_i64(p).unwrap() + T::from_i64(tp).unwrap() / (T::from_i64(tp).unwrap() + T::from_i64(fp).unwrap()) } } @@ -88,5 +89,24 @@ mod tests { assert!((score1 - 0.5).abs() < 1e-8); assert!((score2 - 1.0).abs() < 1e-8); + + let y_pred: Vec = vec![0., 0., 1., 1., 1., 1.]; + let y_true: Vec = vec![0., 1., 1., 0., 1., 0.]; + + let score3: f64 = Precision {}.get_score(&y_pred, &y_true); + assert!((score3 - 0.5).abs() < 1e-8); + } + + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[test] + fn precision_multiclass() { + let y_true: Vec = vec![0., 0., 0., 1., 1., 1., 2., 2., 2.]; + let y_pred: Vec = vec![0., 1., 2., 0., 1., 2., 0., 1., 2.]; + + let score1: f64 = Precision {}.get_score(&y_pred, &y_true); + let score2: f64 = Precision {}.get_score(&y_pred, &y_pred); + + assert!((score1 - 0.333333333).abs() < 1e-8); + assert!((score2 - 1.0).abs() < 1e-8); } } diff --git a/src/metrics/recall.rs b/src/metrics/recall.rs index 18863ae..48ddeeb 100644 --- a/src/metrics/recall.rs +++ b/src/metrics/recall.rs @@ -18,6 +18,9 @@ //! //! //! +use std::collections::HashSet; +use std::convert::TryInto; + #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -42,34 +45,32 @@ impl Recall { ); } + let mut classes = HashSet::new(); + for i in 0..y_true.len() { + classes.insert(y_true.get(i).to_f64_bits()); + } + let classes: i64 = classes.len().try_into().unwrap(); + let mut tp = 0; - let mut p = 0; - let n = y_true.len(); - for i in 0..n { - if y_true.get(i) != T::zero() && y_true.get(i) != T::one() { - panic!( - "Recall can only be applied to binary classification: {}", - y_true.get(i) - ); - } - - if y_pred.get(i) != T::zero() && y_pred.get(i) != T::one() { - panic!( - "Recall can only be applied to binary classification: {}", - y_pred.get(i) - ); - } - - if y_true.get(i) == T::one() { - p += 1; - - if y_pred.get(i) == T::one() { + let mut fne = 0; + for i in 0..y_true.len() { + if y_pred.get(i) == y_true.get(i) { + if classes == 2 { + if y_true.get(i) == T::one() { + tp += 1; + } + } else { tp += 1; } + } else if classes == 2 { + if y_true.get(i) != T::one() { + fne += 1; + } + } else { + fne += 1; } } - - T::from_i64(tp).unwrap() / T::from_i64(p).unwrap() + T::from_i64(tp).unwrap() / (T::from_i64(tp).unwrap() + T::from_i64(fne).unwrap()) } } @@ -88,5 +89,24 @@ mod tests { assert!((score1 - 0.5).abs() < 1e-8); assert!((score2 - 1.0).abs() < 1e-8); + + let y_pred: Vec = vec![0., 0., 1., 1., 1., 1.]; + let y_true: Vec = vec![0., 1., 1., 0., 1., 0.]; + + let score3: f64 = Recall {}.get_score(&y_pred, &y_true); + assert!((score3 - 0.66666666).abs() < 1e-8); + } + + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[test] + fn recall_multiclass() { + let y_true: Vec = vec![0., 0., 0., 1., 1., 1., 2., 2., 2.]; + let y_pred: Vec = vec![0., 1., 2., 0., 1., 2., 0., 1., 2.]; + + let score1: f64 = Recall {}.get_score(&y_pred, &y_true); + let score2: f64 = Recall {}.get_score(&y_pred, &y_pred); + + assert!((score1 - 0.333333333).abs() < 1e-8); + assert!((score2 - 1.0).abs() < 1e-8); } } diff --git a/src/optimization/mod.rs b/src/optimization/mod.rs index b0be9d6..127b534 100644 --- a/src/optimization/mod.rs +++ b/src/optimization/mod.rs @@ -5,7 +5,7 @@ pub type F<'a, T, X> = dyn for<'b> Fn(&'b X) -> T + 'a; pub type DF<'a, X> = dyn for<'b> Fn(&'b mut X, &'b X) + 'a; #[allow(clippy::upper_case_acronyms)] -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Eq)] pub enum FunctionOrder { SECOND, THIRD, diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 32a0cfa..915fdab 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,5 +1,7 @@ -/// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents +/// Transform a data matrix by replacing all categorical variables with their one-hot vector equivalents pub mod categorical; mod data_traits; +/// Preprocess numerical matrices. +pub mod numerical; /// Encode a series (column, array) of categorical variables as one-hot vectors pub mod series_encoder; diff --git a/src/preprocessing/numerical.rs b/src/preprocessing/numerical.rs new file mode 100644 index 0000000..e2205c3 --- /dev/null +++ b/src/preprocessing/numerical.rs @@ -0,0 +1,447 @@ +//! # Standard-Scaling For [RealNumber](../../math/num/trait.RealNumber.html) Matricies +//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by removing the mean and scaling to unit variance. +//! +//! ### Usage Example +//! ``` +//! use smartcore::api::{Transformer, UnsupervisedEstimator}; +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::preprocessing::numerical; +//! let data = DenseMatrix::from_2d_vec(&vec![ +//! vec![0.0, 0.0], +//! vec![0.0, 0.0], +//! vec![1.0, 1.0], +//! vec![1.0, 1.0], +//! ]); +//! +//! let standard_scaler = +//! numerical::StandardScaler::fit(&data, numerical::StandardScalerParameters::default()) +//! .unwrap(); +//! let transformed_data = standard_scaler.transform(&data).unwrap(); +//! assert_eq!( +//! transformed_data, +//! DenseMatrix::from_2d_vec(&vec![ +//! vec![-1.0, -1.0], +//! vec![-1.0, -1.0], +//! vec![1.0, 1.0], +//! vec![1.0, 1.0], +//! ]) +//! ); +//! ``` +use crate::api::{Transformer, UnsupervisedEstimator}; +use crate::error::{Failed, FailedError}; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +/// Configure Behaviour of `StandardScaler`. +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Debug, Copy, Eq, PartialEq)] +pub struct StandardScalerParameters { + /// Optionaly adjust mean to be zero. + with_mean: bool, + /// Optionally adjust standard-deviation to be one. + with_std: bool, +} +impl Default for StandardScalerParameters { + fn default() -> Self { + StandardScalerParameters { + with_mean: true, + with_std: true, + } + } +} + +/// With the `StandardScaler` data can be adjusted so +/// that every column has a mean of zero and a standard +/// deviation of one. This can improve model training for +/// scaling sensitive models like neural network or nearest +/// neighbors based models. +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct StandardScaler { + means: Vec, + stds: Vec, + parameters: StandardScalerParameters, +} +impl StandardScaler { + /// When the mean should be adjusted, the column mean + /// should be kept. Otherwise, replace it by zero. + fn adjust_column_mean(&self, mean: T) -> T { + if self.parameters.with_mean { + mean + } else { + T::zero() + } + } + /// When the standard-deviation should be adjusted, the column + /// standard-deviation should be kept. Otherwise, replace it by one. + fn adjust_column_std(&self, std: T) -> T { + if self.parameters.with_std { + ensure_std_valid(std) + } else { + T::one() + } + } +} + +/// Make sure the standard deviation is valid. If it is +/// negative or zero, it should replaced by the smallest +/// positive value the type can have. That way we can savely +/// divide the columns with the resulting scalar. +fn ensure_std_valid(value: T) -> T { + value.max(T::min_positive_value()) +} + +/// During `fit` the `StandardScaler` computes the column means and standard deviation. +impl> UnsupervisedEstimator + for StandardScaler +{ + fn fit(x: &M, parameters: StandardScalerParameters) -> Result { + Ok(Self { + means: x.column_mean(), + stds: x.std(0), + parameters, + }) + } +} + +/// During `transform` the `StandardScaler` applies the summary statistics +/// computed during `fit` to set the mean of each column to zero and the +/// standard deviation to one. +impl> Transformer for StandardScaler { + fn transform(&self, x: &M) -> Result { + let (_, n_cols) = x.shape(); + if n_cols != self.means.len() { + return Err(Failed::because( + FailedError::TransformFailed, + &format!( + "Expected {} columns, but got {} columns instead.", + self.means.len(), + n_cols, + ), + )); + } + + Ok(build_matrix_from_columns( + self.means + .iter() + .zip(self.stds.iter()) + .enumerate() + .map(|(column_index, (column_mean, column_std))| { + x.take_column(column_index) + .sub_scalar(self.adjust_column_mean(*column_mean)) + .div_scalar(self.adjust_column_std(*column_std)) + }) + .collect(), + ) + .unwrap()) + } +} + +/// From a collection of matrices, that contain columns, construct +/// a matrix by stacking the columns horizontally. +fn build_matrix_from_columns(columns: Vec) -> Option +where + T: RealNumber, + M: Matrix, +{ + if let Some(output_matrix) = columns.first().cloned() { + return Some( + columns + .iter() + .skip(1) + .fold(output_matrix, |current_matrix, new_colum| { + current_matrix.h_stack(new_colum) + }), + ); + } else { + None + } +} + +#[cfg(test)] +mod tests { + + mod helper_functionality { + use super::super::{build_matrix_from_columns, ensure_std_valid}; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn combine_three_columns() { + assert_eq!( + build_matrix_from_columns(vec![ + DenseMatrix::from_2d_vec(&vec![vec![1.0], vec![1.0], vec![1.0],]), + DenseMatrix::from_2d_vec(&vec![vec![2.0], vec![2.0], vec![2.0],]), + DenseMatrix::from_2d_vec(&vec![vec![3.0], vec![3.0], vec![3.0],]) + ]), + Some(DenseMatrix::from_2d_vec(&vec![ + vec![1.0, 2.0, 3.0], + vec![1.0, 2.0, 3.0], + vec![1.0, 2.0, 3.0] + ])) + ) + } + + #[test] + fn negative_value_should_be_replace_with_minimal_positive_value() { + assert_eq!(ensure_std_valid(-1.0), f64::MIN_POSITIVE) + } + + #[test] + fn zero_should_be_replace_with_minimal_positive_value() { + assert_eq!(ensure_std_valid(0.0), f64::MIN_POSITIVE) + } + } + mod standard_scaler { + use super::super::{StandardScaler, StandardScalerParameters}; + use crate::api::{Transformer, UnsupervisedEstimator}; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::linalg::BaseMatrix; + + #[test] + fn dont_adjust_mean_if_used() { + assert_eq!( + (StandardScaler { + means: vec![], + stds: vec![], + parameters: StandardScalerParameters { + with_mean: true, + with_std: true + } + }) + .adjust_column_mean(1.0), + 1.0 + ) + } + #[test] + fn replace_mean_with_zero_if_not_used() { + assert_eq!( + (StandardScaler { + means: vec![], + stds: vec![], + parameters: StandardScalerParameters { + with_mean: false, + with_std: true + } + }) + .adjust_column_mean(1.0), + 0.0 + ) + } + #[test] + fn dont_adjust_std_if_used() { + assert_eq!( + (StandardScaler { + means: vec![], + stds: vec![], + parameters: StandardScalerParameters { + with_mean: true, + with_std: true + } + }) + .adjust_column_std(10.0), + 10.0 + ) + } + #[test] + fn replace_std_with_one_if_not_used() { + assert_eq!( + (StandardScaler { + means: vec![], + stds: vec![], + parameters: StandardScalerParameters { + with_mean: true, + with_std: false + } + }) + .adjust_column_std(10.0), + 1.0 + ) + } + + /// Helper function to apply fit as well as transform at the same time. + fn fit_transform_with_default_standard_scaler( + values_to_be_transformed: &DenseMatrix, + ) -> DenseMatrix { + StandardScaler::fit( + values_to_be_transformed, + StandardScalerParameters::default(), + ) + .unwrap() + .transform(values_to_be_transformed) + .unwrap() + } + + /// Fit transform with random generated values, expected values taken from + /// sklearn. + #[test] + fn fit_transform_random_values() { + let transformed_values = + fit_transform_with_default_standard_scaler(&DenseMatrix::from_2d_array(&[ + &[0.1004222429, 0.2194113576, 0.9310663354, 0.3313593793], + &[0.2045493861, 0.1683865411, 0.5071506765, 0.7257355264], + &[0.5708488802, 0.1846414616, 0.9590802982, 0.5591871046], + &[0.8387612750, 0.5754861361, 0.5537109852, 0.1077646442], + ])); + println!("{}", transformed_values); + assert!(transformed_values.approximate_eq( + &DenseMatrix::from_2d_array(&[ + &[-1.1154020653, -0.4031985330, 0.9284605204, -0.4271473866], + &[-0.7615464283, -0.7076698384, -1.1075452562, 1.2632979631], + &[0.4832504303, -0.6106747444, 1.0630075435, 0.5494084257], + &[1.3936980634, 1.7215431158, -0.8839228078, -1.3855590021], + ]), + 1.0 + )) + } + + /// Test `fit` and `transform` for a column with zero variance. + #[test] + fn fit_transform_with_zero_variance() { + assert_eq!( + fit_transform_with_default_standard_scaler(&DenseMatrix::from_2d_array(&[ + &[1.0], + &[1.0], + &[1.0], + &[1.0] + ])), + DenseMatrix::from_2d_array(&[&[0.0], &[0.0], &[0.0], &[0.0]]), + "When scaling values with zero variance, zero is expected as return value" + ) + } + + /// Test `fit` for columns with nice summary statistics. + #[test] + fn fit_for_simple_values() { + assert_eq!( + StandardScaler::fit( + &DenseMatrix::from_2d_array(&[ + &[1.0, 1.0, 1.0], + &[1.0, 2.0, 5.0], + &[1.0, 1.0, 1.0], + &[1.0, 2.0, 5.0] + ]), + StandardScalerParameters::default(), + ), + Ok(StandardScaler { + means: vec![1.0, 1.5, 3.0], + stds: vec![0.0, 0.5, 2.0], + parameters: StandardScalerParameters { + with_mean: true, + with_std: true + } + }) + ) + } + /// Test `fit` for random generated values. + #[test] + fn fit_for_random_values() { + let fitted_scaler = StandardScaler::fit( + &DenseMatrix::from_2d_array(&[ + &[0.1004222429, 0.2194113576, 0.9310663354, 0.3313593793], + &[0.2045493861, 0.1683865411, 0.5071506765, 0.7257355264], + &[0.5708488802, 0.1846414616, 0.9590802982, 0.5591871046], + &[0.8387612750, 0.5754861361, 0.5537109852, 0.1077646442], + ]), + StandardScalerParameters::default(), + ) + .unwrap(); + + assert_eq!( + fitted_scaler.means, + vec![0.42864544605, 0.2869813741, 0.737752073825, 0.431011663625], + ); + + assert!( + &DenseMatrix::from_2d_vec(&vec![fitted_scaler.stds]).approximate_eq( + &DenseMatrix::from_2d_array(&[&[ + 0.29426447500954, + 0.16758497615485, + 0.20820945786863, + 0.23329718831165 + ],]), + 0.00000000000001 + ) + ) + } + + /// If `with_std` is set to `false` the values should not be + /// adjusted to have a std of one. + #[test] + fn transform_without_std() { + let standard_scaler = StandardScaler { + means: vec![1.0, 3.0], + stds: vec![1.0, 2.0], + parameters: StandardScalerParameters { + with_mean: true, + with_std: false, + }, + }; + + assert_eq!( + standard_scaler.transform(&DenseMatrix::from_2d_array(&[&[0.0, 2.0], &[2.0, 4.0]])), + Ok(DenseMatrix::from_2d_array(&[&[-1.0, -1.0], &[1.0, 1.0]])) + ) + } + + /// If `with_mean` is set to `false` the values should not be adjusted + /// to have a mean of zero. + #[test] + fn transform_without_mean() { + let standard_scaler = StandardScaler { + means: vec![1.0, 2.0], + stds: vec![2.0, 3.0], + parameters: StandardScalerParameters { + with_mean: false, + with_std: true, + }, + }; + + assert_eq!( + standard_scaler + .transform(&DenseMatrix::from_2d_array(&[&[0.0, 9.0], &[4.0, 12.0]])), + Ok(DenseMatrix::from_2d_array(&[&[0.0, 3.0], &[2.0, 4.0]])) + ) + } + + /// Same as `fit_for_random_values` test, but using a `StandardScaler` that has been + /// serialized and deserialized. + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[test] + #[cfg(feature = "serde")] + fn serde_fit_for_random_values() { + let fitted_scaler = StandardScaler::fit( + &DenseMatrix::from_2d_array(&[ + &[0.1004222429, 0.2194113576, 0.9310663354, 0.3313593793], + &[0.2045493861, 0.1683865411, 0.5071506765, 0.7257355264], + &[0.5708488802, 0.1846414616, 0.9590802982, 0.5591871046], + &[0.8387612750, 0.5754861361, 0.5537109852, 0.1077646442], + ]), + StandardScalerParameters::default(), + ) + .unwrap(); + + let deserialized_scaler: StandardScaler = + serde_json::from_str(&serde_json::to_string(&fitted_scaler).unwrap()).unwrap(); + + assert_eq!( + deserialized_scaler.means, + vec![0.42864544605, 0.2869813741, 0.737752073825, 0.431011663625], + ); + + assert!( + &DenseMatrix::from_2d_vec(&vec![deserialized_scaler.stds]).approximate_eq( + &DenseMatrix::from_2d_array(&[&[ + 0.29426447500954, + 0.16758497615485, + 0.20820945786863, + 0.23329718831165 + ],]), + 0.00000000000001 + ) + ) + } + } +} diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 3257111..18c73d1 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -242,7 +242,7 @@ impl, K: Kernel> SVR { Ok(y_hat) } - pub(in crate) fn predict_for_row(&self, x: M::RowVector) -> T { + pub(crate) fn predict_for_row(&self, x: M::RowVector) -> T { let mut f = self.b; for i in 0..self.instances.len() { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index d86f59a..35889e4 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -285,7 +285,7 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } -pub(in crate) fn which_max(x: &[usize]) -> usize { +pub(crate) fn which_max(x: &[usize]) -> usize { let mut m = x[0]; let mut which = 0; @@ -421,7 +421,7 @@ impl DecisionTreeClassifier { Ok(result.to_row_vector()) } - pub(in crate) fn predict_for_row>(&self, x: &M, row: usize) -> usize { + pub(crate) fn predict_for_row>(&self, x: &M, row: usize) -> usize { let mut result = 0; let mut queue: LinkedList = LinkedList::new(); diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 94fa0f8..25f5e7e 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -321,7 +321,7 @@ impl DecisionTreeRegressor { Ok(result.to_row_vector()) } - pub(in crate) fn predict_for_row>(&self, x: &M, row: usize) -> T { + pub(crate) fn predict_for_row>(&self, x: &M, row: usize) -> T { let mut result = T::zero(); let mut queue: LinkedList = LinkedList::new();