fix: refactors knn and distance functions

This commit is contained in:
Volodymyr Orlov
2020-02-21 18:54:50 -08:00
parent 0e89113297
commit fe50509d3b
8 changed files with 101 additions and 154 deletions
+2 -5
View File
@@ -1,17 +1,14 @@
#[macro_use] #[macro_use]
extern crate criterion; extern crate criterion;
extern crate smartcore; extern crate smartcore;
extern crate ndarray;
use ndarray::{Array, Array1};
use smartcore::math::distance::Distance;
use criterion::Criterion; use criterion::Criterion;
use criterion::black_box; use criterion::black_box;
fn criterion_benchmark(c: &mut Criterion) { fn criterion_benchmark(c: &mut Criterion) {
let a = Array::from_vec(vec![1., 2., 3.]); let a = vec![1., 2., 3.];
c.bench_function("Euclidean Distance", move |b| b.iter(|| Array1::distance(black_box(&a), black_box(&a)))); c.bench_function("Euclidean Distance", move |b| b.iter(|| smartcore::math::distance::euclidian::distance(black_box(&a), black_box(&a))));
} }
criterion_group!(benches, criterion_benchmark); criterion_group!(benches, criterion_benchmark);
+4 -18
View File
@@ -1,6 +1,5 @@
use std::collections::LinkedList;
use crate::linalg::Matrix; use crate::linalg::Matrix;
use crate::math::distance::euclidian;
#[derive(Debug)] #[derive(Debug)]
pub struct BBDTree { pub struct BBDTree {
@@ -77,10 +76,10 @@ impl BBDTree {
let d = centroids[0].len(); let d = centroids[0].len();
// Determine which mean the node mean is closest to // Determine which mean the node mean is closest to
let mut min_dist = BBDTree::squared_distance(&self.nodes[node].center, &centroids[candidates[0]]); let mut min_dist = euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[0]]);
let mut closest = candidates[0]; let mut closest = candidates[0];
for i in 1..k { for i in 1..k {
let dist = BBDTree::squared_distance(&self.nodes[node].center, &centroids[candidates[i]]); let dist = euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[i]]);
if dist < min_dist { if dist < min_dist {
min_dist = dist; min_dist = dist;
closest = candidates[i]; closest = candidates[i];
@@ -146,20 +145,7 @@ impl BBDTree {
} }
return lhs >= 2f64 * rhs; return lhs >= 2f64 * rhs;
} }
/// Returns the squared Euclidean distance between `x` and `y`, i.e. the sum
/// of the squared element-wise differences.
///
/// # Panics
///
/// Panics with "Input vector sizes are different." when the two vectors do
/// not have the same length.
fn squared_distance(x: &Vec<f64>,y: &Vec<f64>) -> f64 {
    // Check the lengths once up front: `zip` below would otherwise silently
    // stop at the end of the shorter vector.
    assert!(x.len() == y.len(), "Input vector sizes are different.");

    // Accumulate left to right starting from 0.0, like a plain running sum.
    x.iter()
        .zip(y.iter())
        .fold(0f64, |sum, (a, b)| sum + (a - b).powf(2.))
}
fn build_node<M: Matrix>(&mut self, data: &M, begin: usize, end: usize) -> usize { fn build_node<M: Matrix>(&mut self, data: &M, begin: usize, end: usize) -> usize {
let (_, d) = data.shape(); let (_, d) = data.shape();
+5 -6
View File
@@ -72,9 +72,8 @@ impl Eq for KNNPoint {}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::math::distance::Distance; use crate::math::distance::euclidian;
use ndarray::{arr1, Array1};
struct SimpleDistance{} struct SimpleDistance{}
@@ -92,11 +91,11 @@ mod tests {
assert_eq!(vec!(1, 2, 0), algorithm1.find(&2, 3)); assert_eq!(vec!(1, 2, 0), algorithm1.find(&2, 3));
let data2 = vec!(arr1(&[1, 1]), arr1(&[2, 2]), arr1(&[3, 3]), arr1(&[4, 4]), arr1(&[5, 5])); let data2 = vec!(vec![1., 1.], vec![2., 2.], vec![3., 3.], vec![4., 4.], vec![5., 5.]);
let algorithm2 = LinearKNNSearch::new(data2, &Array1::distance); let algorithm2 = LinearKNNSearch::new(data2, &euclidian::distance);
assert_eq!(vec!(2, 3, 1), algorithm2.find(&arr1(&[3, 3]), 3)); assert_eq!(vec!(2, 3, 1), algorithm2.find(&vec![3., 3.], 3));
} }
#[test] #[test]
+58 -62
View File
@@ -1,61 +1,67 @@
use super::Classifier; use crate::linalg::Matrix;
use std::collections::HashSet;
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
use crate::algorithm::neighbour::linear_search::LinearKNNSearch; use crate::algorithm::neighbour::linear_search::LinearKNNSearch;
use crate::algorithm::neighbour::cover_tree::CoverTree; use crate::algorithm::neighbour::cover_tree::CoverTree;
use crate::common::Nominal; use crate::common::Nominal;
use ndarray::{ArrayBase, Data, Ix1, Ix2}; use ndarray::{ArrayBase, Data, Ix1, Ix2};
use std::fmt::Debug;
type F<X> = dyn Fn(&X, &X) -> f64; type F = dyn Fn(&Vec<f64>, &Vec<f64>) -> f64;
pub struct KNNClassifier<'a, X, Y> pub struct KNNClassifier<'a> {
where classes: Vec<f64>,
Y: Nominal,
X: Debug
{
classes: Vec<Y>,
y: Vec<usize>, y: Vec<usize>,
knn_algorithm: Box<dyn KNNAlgorithm<X> + 'a>, knn_algorithm: Box<dyn KNNAlgorithm<Vec<f64>> + 'a>,
k: usize, k: usize,
} }
impl<'a, X, Y> KNNClassifier<'a, X, Y> impl<'a> KNNClassifier<'a> {
where
Y: Nominal,
X: Debug
{
pub fn fit(x: Vec<X>, y: Vec<Y>, k: usize, distance: &'a F<X>, algorithm: KNNAlgorithmName) -> KNNClassifier<X, Y> { pub fn fit<M: Matrix>(x: &M, y: &M::RowVector, k: usize, distance: &'a F, algorithm: KNNAlgorithmName) -> KNNClassifier<'a> {
assert!(Vec::len(&x) == Vec::len(&y), format!("Size of x should equal size of y; |x|=[{}], |y|=[{}]", Vec::len(&x), Vec::len(&y))); let y_m = M::from_row_vector(y.clone());
assert!(k > 1, format!("k should be > 1, k=[{}]", k)); let (_, y_n) = y_m.shape();
let (x_n, _) = x.shape();
let c_hash: HashSet<Y> = y.clone().into_iter().collect();
let classes: Vec<Y> = c_hash.into_iter().collect();
let y_i:Vec<usize> = y.into_iter().map(|y| classes.iter().position(|yy| yy == &y).unwrap()).collect();
let knn_algorithm: Box<dyn KNNAlgorithm<X> + 'a> = match algorithm { let data = x.to_vector();
KNNAlgorithmName::CoverTree => Box::new(CoverTree::<X>::new(x, distance)),
KNNAlgorithmName::LinearSearch => Box::new(LinearKNNSearch::<X>::new(x, distance)) let mut yi: Vec<usize> = vec![0; y_n];
let classes = y_m.unique();
for i in 0..y_n {
let yc = y_m.get(0, i);
yi[i] = classes.iter().position(|c| yc == *c).unwrap();
}
assert!(x_n == y_n, format!("Size of x should equal size of y; |x|=[{}], |y|=[{}]", x_n, y_n));
assert!(k > 1, format!("k should be > 1, k=[{}]", k));
let knn_algorithm: Box<dyn KNNAlgorithm<Vec<f64>> + 'a> = match algorithm {
KNNAlgorithmName::CoverTree => Box::new(CoverTree::<Vec<f64>>::new(data, distance)),
KNNAlgorithmName::LinearSearch => Box::new(LinearKNNSearch::<Vec<f64>>::new(data, distance))
}; };
KNNClassifier{classes:classes, y: y_i, k: k, knn_algorithm: knn_algorithm} KNNClassifier{classes:classes, y: yi, k: k, knn_algorithm: knn_algorithm}
} }
}
impl<'a, X, Y> Classifier<X, Y> for KNNClassifier<'a, X, Y> pub fn predict<M: Matrix>(&self, x: &M) -> M::RowVector {
where let mut result = M::zeros(1, x.shape().0);
Y: Nominal,
X: Debug
{
fn predict(&self, x: &X) -> Y { let (n, _) = x.shape();
let idxs = self.knn_algorithm.find(x, self.k);
for i in 0..n {
result.set(0, i, self.classes[self.predict_for_row(x, i)]);
}
result.to_row_vector()
}
pub(in crate) fn predict_for_row<M: Matrix>(&self, x: &M, row: usize) -> usize {
let idxs = self.knn_algorithm.find(&x.get_row_as_vec(row), self.k);
let mut c = vec![0; self.classes.len()]; let mut c = vec![0; self.classes.len()];
let mut max_c = 0; let mut max_c = 0;
let mut max_i = 0; let mut max_i = 0;
@@ -65,41 +71,31 @@ where
max_c = c[self.y[i]]; max_c = c[self.y[i]];
max_i = self.y[i]; max_i = self.y[i];
} }
} }
max_i
self.classes[max_i].clone()
}
}
pub struct NDArrayUtils {
}
impl NDArrayUtils {
pub fn array2_to_vec<E, S>(x: &ArrayBase<S, Ix2>) -> Vec<ArrayBase<S, Ix1>>
where
E: Nominal,
S: Data<Elem = E>,
std::vec::Vec<ArrayBase<S, Ix1>>: std::iter::FromIterator<ndarray::ArrayBase<ndarray::OwnedRepr<E>, Ix1>>{
let x_vec: Vec<ArrayBase<S, Ix1>> = x.outer_iter().map(|x| x.to_owned()).collect();
x_vec
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::math::distance::Distance; use crate::math::distance::euclidian;
use ndarray::{arr1, arr2, Array1}; use crate::linalg::naive::dense_matrix::DenseMatrix;
#[test] #[test]
fn knn_fit_predict() { fn knn_fit_predict() {
let x = arr2(&[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]); let x = DenseMatrix::from_array(&[
let y = arr1(&[2, 2, 2, 3, 3]); &[1., 2.],
let knn = KNNClassifier::fit(NDArrayUtils::array2_to_vec(&x), y.to_vec(), 3, &Array1::distance, KNNAlgorithmName::LinearSearch); &[3., 4.],
let r = knn.predict_vec(&NDArrayUtils::array2_to_vec(&x)); &[5., 6.],
&[7., 8.],
&[9., 10.]]);
let y = vec![2., 2., 2., 3., 3.];
let knn = KNNClassifier::fit(&x, &y, 3, &euclidian::distance, KNNAlgorithmName::LinearSearch);
let r = knn.predict(&x);
assert_eq!(5, Vec::len(&r)); assert_eq!(5, Vec::len(&r));
assert_eq!(y.to_vec(), r); assert_eq!(y.to_vec(), r);
} }
+4 -16
View File
@@ -3,6 +3,7 @@ extern crate rand;
use rand::Rng; use rand::Rng;
use crate::linalg::Matrix; use crate::linalg::Matrix;
use crate::math::distance::euclidian;
use crate::algorithm::neighbour::bbd_tree::BBDTree; use crate::algorithm::neighbour::bbd_tree::BBDTree;
#[derive(Debug)] #[derive(Debug)]
@@ -101,7 +102,7 @@ impl KMeans{
let mut best_cluster = 0; let mut best_cluster = 0;
for j in 0..self.k { for j in 0..self.k {
let dist = KMeans::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]); let dist = euclidian::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]);
if dist < min_dist { if dist < min_dist {
min_dist = dist; min_dist = dist;
best_cluster = j; best_cluster = j;
@@ -127,7 +128,7 @@ impl KMeans{
// the distance from each sample to its closest center in scores. // the distance from each sample to its closest center in scores.
for i in 0..n { for i in 0..n {
// compute the distance between this sample and the current center // compute the distance between this sample and the current center
let dist = KMeans::squared_distance(&data.get_row_as_vec(i), &centroid); let dist = euclidian::squared_distance(&data.get_row_as_vec(i), &centroid);
if dist < d[i] { if dist < d[i] {
d[i] = dist; d[i] = dist;
@@ -151,7 +152,7 @@ impl KMeans{
for i in 0..n { for i in 0..n {
// compute the distance between this sample and the current center // compute the distance between this sample and the current center
let dist = KMeans::squared_distance(&data.get_row_as_vec(i), &centroid); let dist = euclidian::squared_distance(&data.get_row_as_vec(i), &centroid);
if dist < d[i] { if dist < d[i] {
d[i] = dist; d[i] = dist;
@@ -161,19 +162,6 @@ impl KMeans{
y y
} }
/// Computes the sum of squared element-wise differences between `x` and `y`
/// (the squared Euclidean distance).
///
/// # Panics
///
/// Panics with "Input vector sizes are different." if `x` and `y` differ in
/// length.
fn squared_distance(x: &Vec<f64>,y: &Vec<f64>) -> f64 {
    // A single length check replaces the per-element index bounds checks and
    // keeps `zip` from quietly truncating to the shorter input.
    assert!(x.len() == y.len(), "Input vector sizes are different.");

    let mut total = 0f64;
    for (a, b) in x.iter().zip(y) {
        total += (a - b).powf(2.);
    }
    total
}
} }
+12
View File
@@ -18,6 +18,18 @@ pub trait Matrix: Clone + Debug {
fn get_col_as_vec(&self, col: usize) -> Vec<f64>; fn get_col_as_vec(&self, col: usize) -> Vec<f64>;
/// Collects the matrix into a vector of rows.
///
/// The first component of `shape()` is used as the row count; entry `i` of
/// the result is the value returned by `get_row_as_vec(i)`, in ascending
/// row order.
fn to_vector(&self) -> Vec<Vec<f64>> {
    let row_count = self.shape().0;
    (0..row_count)
        .map(|row| self.get_row_as_vec(row))
        .collect()
}
fn set(&mut self, row: usize, col: usize, x: f64); fn set(&mut self, row: usize, col: usize, x: f64);
fn qr_solve_mut(&mut self, b: Self) -> Self; fn qr_solve_mut(&mut self, b: Self) -> Self;
+15 -38
View File
@@ -1,56 +1,33 @@
use crate::math::distance::Distance; pub fn distance(x: &Vec<f64>, y: &Vec<f64>) -> f64 {
use ndarray::{ArrayBase, Data, Dimension}; return squared_distance(x, y).sqrt();
use crate::common::AnyNumber; }
impl<A, S1, S2, D> Distance<ArrayBase<S2, D>> for ArrayBase<S1, D> pub fn squared_distance(x: &Vec<f64>,y: &Vec<f64>) -> f64 {
where if x.len() != y.len() {
A: AnyNumber, panic!("Input vector sizes are different.");
S1: Data<Elem = A>,
S2: Data<Elem = A>,
D: Dimension
{
fn distance_to(&self, other: &Self) -> f64
{
Self::distance(self, other)
} }
fn distance(a: &Self, b: &ArrayBase<S2, D>) -> f64 let mut sum = 0f64;
{ for i in 0..x.len() {
if a.len() != b.len() { sum += (x[i] - y[i]).powf(2.);
panic!("vectors a and b have different length");
} else {
((a - b)*(a - b)).sum().to_f64().unwrap().sqrt()
}
} }
return sum;
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use ndarray::{Array1, ArrayView1, arr1};
#[test] #[test]
fn measure_simple_euclidian_distance() { fn measure_simple_euclidian_distance() {
let a = arr1(&[1, 2, 3]); let a = vec![1., 2., 3.];
let b = arr1(&[4, 5, 6]); let b = vec![4., 5., 6.];
let d_arr = Array1::distance(&a, &b); let d_arr = distance(&a, &b);
let d_view = ArrayView1::distance(&a.view(), &b.view());
assert!((d_arr - 5.19615242).abs() < 1e-8); assert!((d_arr - 5.19615242).abs() < 1e-8);
assert!((d_view - 5.19615242).abs() < 1e-8);
} }
#[test]
fn measure_simple_euclidian_distance_static() {
let a = arr1(&[-2.1968219, -0.9559913, -0.0431738, 1.0567679, 0.3853515]);
let b = arr1(&[-1.7781325, -0.6659839, 0.9526148, -0.9460919, -0.3925300]);
let d = Array1::distance(&a, &b);
assert!((d - 2.422302).abs() < 1e-6);
}
} }
+1 -9
View File
@@ -1,9 +1 @@
pub mod euclidian; pub mod euclidian;
pub trait Distance<T> {
fn distance_to(&self, other: &Self) -> f64;
fn distance(a: &Self, b: &T) -> f64;
}