diff --git a/benches/distance.rs b/benches/distance.rs index 407b4db..927f8e4 100644 --- a/benches/distance.rs +++ b/benches/distance.rs @@ -1,17 +1,14 @@ #[macro_use] extern crate criterion; extern crate smartcore; -extern crate ndarray; -use ndarray::{Array, Array1}; -use smartcore::math::distance::Distance; use criterion::Criterion; use criterion::black_box; fn criterion_benchmark(c: &mut Criterion) { - let a = Array::from_vec(vec![1., 2., 3.]); + let a = vec![1., 2., 3.]; - c.bench_function("Euclidean Distance", move |b| b.iter(|| Array1::distance(black_box(&a), black_box(&a)))); + c.bench_function("Euclidean Distance", move |b| b.iter(|| smartcore::math::distance::euclidian::distance(black_box(&a), black_box(&a)))); } criterion_group!(benches, criterion_benchmark); diff --git a/src/algorithm/neighbour/bbd_tree.rs b/src/algorithm/neighbour/bbd_tree.rs index c7a7e86..ac288f9 100644 --- a/src/algorithm/neighbour/bbd_tree.rs +++ b/src/algorithm/neighbour/bbd_tree.rs @@ -1,6 +1,5 @@ -use std::collections::LinkedList; - use crate::linalg::Matrix; +use crate::math::distance::euclidian; #[derive(Debug)] pub struct BBDTree { @@ -77,10 +76,10 @@ impl BBDTree { let d = centroids[0].len(); // Determine which mean the node mean is closest to - let mut min_dist = BBDTree::squared_distance(&self.nodes[node].center, &centroids[candidates[0]]); + let mut min_dist = euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[0]]); let mut closest = candidates[0]; for i in 1..k { - let dist = BBDTree::squared_distance(&self.nodes[node].center, &centroids[candidates[i]]); + let dist = euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[i]]); if dist < min_dist { min_dist = dist; closest = candidates[i]; @@ -146,20 +145,7 @@ impl BBDTree { } return lhs >= 2f64 * rhs; - } - - fn squared_distance(x: &Vec,y: &Vec) -> f64 { - if x.len() != y.len() { - panic!("Input vector sizes are different."); - } - - let mut sum = 0f64; - for i in 0..x.len() { - sum += (x[i] 
- y[i]).powf(2.); - } - - return sum; - } + } fn build_node(&mut self, data: &M, begin: usize, end: usize) -> usize { let (_, d) = data.shape(); diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index 3314a09..a6eb2a8 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -72,9 +72,8 @@ impl Eq for KNNPoint {} #[cfg(test)] mod tests { - use super::*; - use crate::math::distance::Distance; - use ndarray::{arr1, Array1}; + use super::*; + use crate::math::distance::euclidian; struct SimpleDistance{} @@ -92,11 +91,11 @@ mod tests { assert_eq!(vec!(1, 2, 0), algorithm1.find(&2, 3)); - let data2 = vec!(arr1(&[1, 1]), arr1(&[2, 2]), arr1(&[3, 3]), arr1(&[4, 4]), arr1(&[5, 5])); + let data2 = vec!(vec![1., 1.], vec![2., 2.], vec![3., 3.], vec![4., 4.], vec![5., 5.]); - let algorithm2 = LinearKNNSearch::new(data2, &Array1::distance); + let algorithm2 = LinearKNNSearch::new(data2, &euclidian::distance); - assert_eq!(vec!(2, 3, 1), algorithm2.find(&arr1(&[3, 3]), 3)); + assert_eq!(vec!(2, 3, 1), algorithm2.find(&vec![3., 3.], 3)); } #[test] diff --git a/src/classification/knn.rs b/src/classification/knn.rs index 98730f4..02be3a6 100644 --- a/src/classification/knn.rs +++ b/src/classification/knn.rs @@ -1,61 +1,67 @@ -use super::Classifier; -use std::collections::HashSet; +use crate::linalg::Matrix; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::algorithm::neighbour::linear_search::LinearKNNSearch; use crate::algorithm::neighbour::cover_tree::CoverTree; use crate::common::Nominal; use ndarray::{ArrayBase, Data, Ix1, Ix2}; -use std::fmt::Debug; -type F = dyn Fn(&X, &X) -> f64; +type F = dyn Fn(&Vec, &Vec) -> f64; -pub struct KNNClassifier<'a, X, Y> -where - Y: Nominal, - X: Debug -{ - classes: Vec, +pub struct KNNClassifier<'a> { + classes: Vec, y: Vec, - knn_algorithm: Box + 'a>, + knn_algorithm: Box> + 'a>, k: usize, } -impl<'a, X, Y> 
KNNClassifier<'a, X, Y> -where - Y: Nominal, - X: Debug -{ +impl<'a> KNNClassifier<'a> { - pub fn fit(x: Vec, y: Vec, k: usize, distance: &'a F, algorithm: KNNAlgorithmName) -> KNNClassifier { + pub fn fit(x: &M, y: &M::RowVector, k: usize, distance: &'a F, algorithm: KNNAlgorithmName) -> KNNClassifier<'a> { - assert!(Vec::len(&x) == Vec::len(&y), format!("Size of x should equal size of y; |x|=[{}], |y|=[{}]", Vec::len(&x), Vec::len(&y))); + let y_m = M::from_row_vector(y.clone()); - assert!(k > 1, format!("k should be > 1, k=[{}]", k)); - - let c_hash: HashSet = y.clone().into_iter().collect(); - let classes: Vec = c_hash.into_iter().collect(); - let y_i:Vec = y.into_iter().map(|y| classes.iter().position(|yy| yy == &y).unwrap()).collect(); + let (_, y_n) = y_m.shape(); + let (x_n, _) = x.shape(); - let knn_algorithm: Box + 'a> = match algorithm { - KNNAlgorithmName::CoverTree => Box::new(CoverTree::::new(x, distance)), - KNNAlgorithmName::LinearSearch => Box::new(LinearKNNSearch::::new(x, distance)) + let data = x.to_vector(); + + let mut yi: Vec = vec![0; y_n]; + let classes = y_m.unique(); + + for i in 0..y_n { + let yc = y_m.get(0, i); + yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + } + + assert!(x_n == y_n, format!("Size of x should equal size of y; |x|=[{}], |y|=[{}]", x_n, y_n)); + + assert!(k > 1, format!("k should be > 1, k=[{}]", k)); + + let knn_algorithm: Box> + 'a> = match algorithm { + KNNAlgorithmName::CoverTree => Box::new(CoverTree::>::new(data, distance)), + KNNAlgorithmName::LinearSearch => Box::new(LinearKNNSearch::>::new(data, distance)) }; - KNNClassifier{classes:classes, y: y_i, k: k, knn_algorithm: knn_algorithm} + KNNClassifier{classes:classes, y: yi, k: k, knn_algorithm: knn_algorithm} } - -} -impl<'a, X, Y> Classifier for KNNClassifier<'a, X, Y> -where - Y: Nominal, - X: Debug - { + pub fn predict(&self, x: &M) -> M::RowVector { + let mut result = M::zeros(1, x.shape().0); - fn predict(&self, x: &X) -> Y { - let idxs = 
self.knn_algorithm.find(x, self.k); + let (n, _) = x.shape(); + + for i in 0..n { + result.set(0, i, self.classes[self.predict_for_row(x, i)]); + } + + result.to_row_vector() + } + + pub(in crate) fn predict_for_row(&self, x: &M, row: usize) -> usize { + + let idxs = self.knn_algorithm.find(&x.get_row_as_vec(row), self.k); let mut c = vec![0; self.classes.len()]; let mut max_c = 0; let mut max_i = 0; @@ -65,41 +71,31 @@ where max_c = c[self.y[i]]; max_i = self.y[i]; } - } + } + + max_i - self.classes[max_i].clone() - } - -} - -pub struct NDArrayUtils { - -} - -impl NDArrayUtils { - - pub fn array2_to_vec(x: &ArrayBase) -> Vec> - where - E: Nominal, - S: Data, - std::vec::Vec>: std::iter::FromIterator, Ix1>>{ - let x_vec: Vec> = x.outer_iter().map(|x| x.to_owned()).collect(); - x_vec } + } #[cfg(test)] mod tests { - use super::*; - use crate::math::distance::Distance; - use ndarray::{arr1, arr2, Array1}; + use super::*; + use crate::math::distance::euclidian; + use crate::linalg::naive::dense_matrix::DenseMatrix; #[test] fn knn_fit_predict() { - let x = arr2(&[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]); - let y = arr1(&[2, 2, 2, 3, 3]); - let knn = KNNClassifier::fit(NDArrayUtils::array2_to_vec(&x), y.to_vec(), 3, &Array1::distance, KNNAlgorithmName::LinearSearch); - let r = knn.predict_vec(&NDArrayUtils::array2_to_vec(&x)); + let x = DenseMatrix::from_array(&[ + &[1., 2.], + &[3., 4.], + &[5., 6.], + &[7., 8.], + &[9., 10.]]); + let y = vec![2., 2., 2., 3., 3.]; + let knn = KNNClassifier::fit(&x, &y, 3, &euclidian::distance, KNNAlgorithmName::LinearSearch); + let r = knn.predict(&x); assert_eq!(5, Vec::len(&r)); assert_eq!(y.to_vec(), r); } diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index f59efbd..ae314b9 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -3,6 +3,7 @@ extern crate rand; use rand::Rng; use crate::linalg::Matrix; +use crate::math::distance::euclidian; use crate::algorithm::neighbour::bbd_tree::BBDTree; #[derive(Debug)] 
@@ -101,7 +102,7 @@ impl KMeans{ let mut best_cluster = 0; for j in 0..self.k { - let dist = KMeans::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]); + let dist = euclidian::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]); if dist < min_dist { min_dist = dist; best_cluster = j; @@ -127,7 +128,7 @@ impl KMeans{ // the distance from each sample to its closest center in scores. for i in 0..n { // compute the distance between this sample and the current center - let dist = KMeans::squared_distance(&data.get_row_as_vec(i), &centroid); + let dist = euclidian::squared_distance(&data.get_row_as_vec(i), &centroid); if dist < d[i] { d[i] = dist; @@ -151,7 +152,7 @@ impl KMeans{ for i in 0..n { // compute the distance between this sample and the current center - let dist = KMeans::squared_distance(&data.get_row_as_vec(i), &centroid); + let dist = euclidian::squared_distance(&data.get_row_as_vec(i), &centroid); if dist < d[i] { d[i] = dist; @@ -161,19 +162,6 @@ impl KMeans{ y } - - fn squared_distance(x: &Vec,y: &Vec) -> f64 { - if x.len() != y.len() { - panic!("Input vector sizes are different."); - } - - let mut sum = 0f64; - for i in 0..x.len() { - sum += (x[i] - y[i]).powf(2.); - } - - return sum; - } } diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 798cc90..6ff1167 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -18,6 +18,18 @@ pub trait Matrix: Clone + Debug { fn get_col_as_vec(&self, col: usize) -> Vec; + fn to_vector(&self) -> Vec> { + + let (n, _) = self.shape(); + let mut data = Vec::new(); + + for i in 0..n { + data.push(self.get_row_as_vec(i)); + } + + data + } + fn set(&mut self, row: usize, col: usize, x: f64); fn qr_solve_mut(&mut self, b: Self) -> Self; diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 83945a4..a449f74 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -1,56 +1,33 @@ -use crate::math::distance::Distance; -use ndarray::{ArrayBase, Data, Dimension}; -use 
crate::common::AnyNumber; +pub fn distance(x: &Vec, y: &Vec) -> f64 { + return squared_distance(x, y).sqrt(); +} -impl Distance> for ArrayBase -where - A: AnyNumber, - S1: Data, - S2: Data, - D: Dimension -{ - fn distance_to(&self, other: &Self) -> f64 - { - Self::distance(self, other) +pub fn squared_distance(x: &Vec,y: &Vec) -> f64 { + if x.len() != y.len() { + panic!("Input vector sizes are different."); } - fn distance(a: &Self, b: &ArrayBase) -> f64 - { - if a.len() != b.len() { - panic!("vectors a and b have different length"); - } else { - ((a - b)*(a - b)).sum().to_f64().unwrap().sqrt() - } + let mut sum = 0f64; + for i in 0..x.len() { + sum += (x[i] - y[i]).powf(2.); } - + return sum; } #[cfg(test)] mod tests { - use super::*; - use ndarray::{Array1, ArrayView1, arr1}; + use super::*; #[test] fn measure_simple_euclidian_distance() { - let a = arr1(&[1, 2, 3]); - let b = arr1(&[4, 5, 6]); + let a = vec![1., 2., 3.]; + let b = vec![4., 5., 6.]; - let d_arr = Array1::distance(&a, &b); - let d_view = ArrayView1::distance(&a.view(), &b.view()); + let d_arr = distance(&a, &b); - assert!((d_arr - 5.19615242).abs() < 1e-8); - assert!((d_view - 5.19615242).abs() < 1e-8); + assert!((d_arr - 5.19615242).abs() < 1e-8); } - #[test] - fn measure_simple_euclidian_distance_static() { - let a = arr1(&[-2.1968219, -0.9559913, -0.0431738, 1.0567679, 0.3853515]); - let b = arr1(&[-1.7781325, -0.6659839, 0.9526148, -0.9460919, -0.3925300]); - - let d = Array1::distance(&a, &b); - - assert!((d - 2.422302).abs() < 1e-6); - } } \ No newline at end of file diff --git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 7b3104e..987f06e 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -1,9 +1 @@ -pub mod euclidian; - -pub trait Distance { - - fn distance_to(&self, other: &Self) -> f64; - - fn distance(a: &Self, b: &T) -> f64; - -} \ No newline at end of file +pub mod euclidian; \ No newline at end of file