fix: refactors knn and distance functions
This commit is contained in:
+2
-5
@@ -1,17 +1,14 @@
|
|||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate criterion;
|
extern crate criterion;
|
||||||
extern crate smartcore;
|
extern crate smartcore;
|
||||||
extern crate ndarray;
|
|
||||||
use ndarray::{Array, Array1};
|
|
||||||
use smartcore::math::distance::Distance;
|
|
||||||
|
|
||||||
use criterion::Criterion;
|
use criterion::Criterion;
|
||||||
use criterion::black_box;
|
use criterion::black_box;
|
||||||
|
|
||||||
fn criterion_benchmark(c: &mut Criterion) {
|
fn criterion_benchmark(c: &mut Criterion) {
|
||||||
let a = Array::from_vec(vec![1., 2., 3.]);
|
let a = vec![1., 2., 3.];
|
||||||
|
|
||||||
c.bench_function("Euclidean Distance", move |b| b.iter(|| Array1::distance(black_box(&a), black_box(&a))));
|
c.bench_function("Euclidean Distance", move |b| b.iter(|| smartcore::math::distance::euclidian::distance(black_box(&a), black_box(&a))));
|
||||||
}
|
}
|
||||||
|
|
||||||
criterion_group!(benches, criterion_benchmark);
|
criterion_group!(benches, criterion_benchmark);
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
use std::collections::LinkedList;
|
|
||||||
|
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
|
use crate::math::distance::euclidian;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct BBDTree {
|
pub struct BBDTree {
|
||||||
@@ -77,10 +76,10 @@ impl BBDTree {
|
|||||||
let d = centroids[0].len();
|
let d = centroids[0].len();
|
||||||
|
|
||||||
// Determine which mean the node mean is closest to
|
// Determine which mean the node mean is closest to
|
||||||
let mut min_dist = BBDTree::squared_distance(&self.nodes[node].center, &centroids[candidates[0]]);
|
let mut min_dist = euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[0]]);
|
||||||
let mut closest = candidates[0];
|
let mut closest = candidates[0];
|
||||||
for i in 1..k {
|
for i in 1..k {
|
||||||
let dist = BBDTree::squared_distance(&self.nodes[node].center, &centroids[candidates[i]]);
|
let dist = euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[i]]);
|
||||||
if dist < min_dist {
|
if dist < min_dist {
|
||||||
min_dist = dist;
|
min_dist = dist;
|
||||||
closest = candidates[i];
|
closest = candidates[i];
|
||||||
@@ -148,19 +147,6 @@ impl BBDTree {
|
|||||||
return lhs >= 2f64 * rhs;
|
return lhs >= 2f64 * rhs;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn squared_distance(x: &Vec<f64>,y: &Vec<f64>) -> f64 {
|
|
||||||
if x.len() != y.len() {
|
|
||||||
panic!("Input vector sizes are different.");
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut sum = 0f64;
|
|
||||||
for i in 0..x.len() {
|
|
||||||
sum += (x[i] - y[i]).powf(2.);
|
|
||||||
}
|
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_node<M: Matrix>(&mut self, data: &M, begin: usize, end: usize) -> usize {
|
fn build_node<M: Matrix>(&mut self, data: &M, begin: usize, end: usize) -> usize {
|
||||||
let (_, d) = data.shape();
|
let (_, d) = data.shape();
|
||||||
|
|
||||||
|
|||||||
@@ -73,8 +73,7 @@ impl Eq for KNNPoint {}
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::math::distance::Distance;
|
use crate::math::distance::euclidian;
|
||||||
use ndarray::{arr1, Array1};
|
|
||||||
|
|
||||||
struct SimpleDistance{}
|
struct SimpleDistance{}
|
||||||
|
|
||||||
@@ -92,11 +91,11 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(vec!(1, 2, 0), algorithm1.find(&2, 3));
|
assert_eq!(vec!(1, 2, 0), algorithm1.find(&2, 3));
|
||||||
|
|
||||||
let data2 = vec!(arr1(&[1, 1]), arr1(&[2, 2]), arr1(&[3, 3]), arr1(&[4, 4]), arr1(&[5, 5]));
|
let data2 = vec!(vec![1., 1.], vec![2., 2.], vec![3., 3.], vec![4., 4.], vec![5., 5.]);
|
||||||
|
|
||||||
let algorithm2 = LinearKNNSearch::new(data2, &Array1::distance);
|
let algorithm2 = LinearKNNSearch::new(data2, &euclidian::distance);
|
||||||
|
|
||||||
assert_eq!(vec!(2, 3, 1), algorithm2.find(&arr1(&[3, 3]), 3));
|
assert_eq!(vec!(2, 3, 1), algorithm2.find(&vec![3., 3.], 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
+53
-57
@@ -1,61 +1,67 @@
|
|||||||
use super::Classifier;
|
use crate::linalg::Matrix;
|
||||||
use std::collections::HashSet;
|
|
||||||
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
|
use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName};
|
||||||
use crate::algorithm::neighbour::linear_search::LinearKNNSearch;
|
use crate::algorithm::neighbour::linear_search::LinearKNNSearch;
|
||||||
use crate::algorithm::neighbour::cover_tree::CoverTree;
|
use crate::algorithm::neighbour::cover_tree::CoverTree;
|
||||||
use crate::common::Nominal;
|
use crate::common::Nominal;
|
||||||
use ndarray::{ArrayBase, Data, Ix1, Ix2};
|
use ndarray::{ArrayBase, Data, Ix1, Ix2};
|
||||||
use std::fmt::Debug;
|
|
||||||
|
|
||||||
|
|
||||||
type F<X> = dyn Fn(&X, &X) -> f64;
|
type F = dyn Fn(&Vec<f64>, &Vec<f64>) -> f64;
|
||||||
|
|
||||||
pub struct KNNClassifier<'a, X, Y>
|
pub struct KNNClassifier<'a> {
|
||||||
where
|
classes: Vec<f64>,
|
||||||
Y: Nominal,
|
|
||||||
X: Debug
|
|
||||||
{
|
|
||||||
classes: Vec<Y>,
|
|
||||||
y: Vec<usize>,
|
y: Vec<usize>,
|
||||||
knn_algorithm: Box<dyn KNNAlgorithm<X> + 'a>,
|
knn_algorithm: Box<dyn KNNAlgorithm<Vec<f64>> + 'a>,
|
||||||
k: usize,
|
k: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, X, Y> KNNClassifier<'a, X, Y>
|
impl<'a> KNNClassifier<'a> {
|
||||||
where
|
|
||||||
Y: Nominal,
|
|
||||||
X: Debug
|
|
||||||
{
|
|
||||||
|
|
||||||
pub fn fit(x: Vec<X>, y: Vec<Y>, k: usize, distance: &'a F<X>, algorithm: KNNAlgorithmName) -> KNNClassifier<X, Y> {
|
pub fn fit<M: Matrix>(x: &M, y: &M::RowVector, k: usize, distance: &'a F, algorithm: KNNAlgorithmName) -> KNNClassifier<'a> {
|
||||||
|
|
||||||
assert!(Vec::len(&x) == Vec::len(&y), format!("Size of x should equal size of y; |x|=[{}], |y|=[{}]", Vec::len(&x), Vec::len(&y)));
|
let y_m = M::from_row_vector(y.clone());
|
||||||
|
|
||||||
|
let (_, y_n) = y_m.shape();
|
||||||
|
let (x_n, _) = x.shape();
|
||||||
|
|
||||||
|
let data = x.to_vector();
|
||||||
|
|
||||||
|
let mut yi: Vec<usize> = vec![0; y_n];
|
||||||
|
let classes = y_m.unique();
|
||||||
|
|
||||||
|
for i in 0..y_n {
|
||||||
|
let yc = y_m.get(0, i);
|
||||||
|
yi[i] = classes.iter().position(|c| yc == *c).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(x_n == y_n, format!("Size of x should equal size of y; |x|=[{}], |y|=[{}]", x_n, y_n));
|
||||||
|
|
||||||
assert!(k > 1, format!("k should be > 1, k=[{}]", k));
|
assert!(k > 1, format!("k should be > 1, k=[{}]", k));
|
||||||
|
|
||||||
let c_hash: HashSet<Y> = y.clone().into_iter().collect();
|
let knn_algorithm: Box<dyn KNNAlgorithm<Vec<f64>> + 'a> = match algorithm {
|
||||||
let classes: Vec<Y> = c_hash.into_iter().collect();
|
KNNAlgorithmName::CoverTree => Box::new(CoverTree::<Vec<f64>>::new(data, distance)),
|
||||||
let y_i:Vec<usize> = y.into_iter().map(|y| classes.iter().position(|yy| yy == &y).unwrap()).collect();
|
KNNAlgorithmName::LinearSearch => Box::new(LinearKNNSearch::<Vec<f64>>::new(data, distance))
|
||||||
|
|
||||||
let knn_algorithm: Box<dyn KNNAlgorithm<X> + 'a> = match algorithm {
|
|
||||||
KNNAlgorithmName::CoverTree => Box::new(CoverTree::<X>::new(x, distance)),
|
|
||||||
KNNAlgorithmName::LinearSearch => Box::new(LinearKNNSearch::<X>::new(x, distance))
|
|
||||||
};
|
};
|
||||||
|
|
||||||
KNNClassifier{classes:classes, y: y_i, k: k, knn_algorithm: knn_algorithm}
|
KNNClassifier{classes:classes, y: yi, k: k, knn_algorithm: knn_algorithm}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
pub fn predict<M: Matrix>(&self, x: &M) -> M::RowVector {
|
||||||
|
let mut result = M::zeros(1, x.shape().0);
|
||||||
|
|
||||||
impl<'a, X, Y> Classifier<X, Y> for KNNClassifier<'a, X, Y>
|
let (n, _) = x.shape();
|
||||||
where
|
|
||||||
Y: Nominal,
|
|
||||||
X: Debug
|
|
||||||
{
|
|
||||||
|
|
||||||
fn predict(&self, x: &X) -> Y {
|
for i in 0..n {
|
||||||
let idxs = self.knn_algorithm.find(x, self.k);
|
result.set(0, i, self.classes[self.predict_for_row(x, i)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
result.to_row_vector()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(in crate) fn predict_for_row<M: Matrix>(&self, x: &M, row: usize) -> usize {
|
||||||
|
|
||||||
|
let idxs = self.knn_algorithm.find(&x.get_row_as_vec(row), self.k);
|
||||||
let mut c = vec![0; self.classes.len()];
|
let mut c = vec![0; self.classes.len()];
|
||||||
let mut max_c = 0;
|
let mut max_c = 0;
|
||||||
let mut max_i = 0;
|
let mut max_i = 0;
|
||||||
@@ -67,39 +73,29 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
self.classes[max_i].clone()
|
max_i
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct NDArrayUtils {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
impl NDArrayUtils {
|
|
||||||
|
|
||||||
pub fn array2_to_vec<E, S>(x: &ArrayBase<S, Ix2>) -> Vec<ArrayBase<S, Ix1>>
|
|
||||||
where
|
|
||||||
E: Nominal,
|
|
||||||
S: Data<Elem = E>,
|
|
||||||
std::vec::Vec<ArrayBase<S, Ix1>>: std::iter::FromIterator<ndarray::ArrayBase<ndarray::OwnedRepr<E>, Ix1>>{
|
|
||||||
let x_vec: Vec<ArrayBase<S, Ix1>> = x.outer_iter().map(|x| x.to_owned()).collect();
|
|
||||||
x_vec
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::math::distance::Distance;
|
use crate::math::distance::euclidian;
|
||||||
use ndarray::{arr1, arr2, Array1};
|
use crate::linalg::naive::dense_matrix::DenseMatrix;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn knn_fit_predict() {
|
fn knn_fit_predict() {
|
||||||
let x = arr2(&[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]);
|
let x = DenseMatrix::from_array(&[
|
||||||
let y = arr1(&[2, 2, 2, 3, 3]);
|
&[1., 2.],
|
||||||
let knn = KNNClassifier::fit(NDArrayUtils::array2_to_vec(&x), y.to_vec(), 3, &Array1::distance, KNNAlgorithmName::LinearSearch);
|
&[3., 4.],
|
||||||
let r = knn.predict_vec(&NDArrayUtils::array2_to_vec(&x));
|
&[5., 6.],
|
||||||
|
&[7., 8.],
|
||||||
|
&[9., 10.]]);
|
||||||
|
let y = vec![2., 2., 2., 3., 3.];
|
||||||
|
let knn = KNNClassifier::fit(&x, &y, 3, &euclidian::distance, KNNAlgorithmName::LinearSearch);
|
||||||
|
let r = knn.predict(&x);
|
||||||
assert_eq!(5, Vec::len(&r));
|
assert_eq!(5, Vec::len(&r));
|
||||||
assert_eq!(y.to_vec(), r);
|
assert_eq!(y.to_vec(), r);
|
||||||
}
|
}
|
||||||
|
|||||||
+4
-16
@@ -3,6 +3,7 @@ extern crate rand;
|
|||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
|
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
|
use crate::math::distance::euclidian;
|
||||||
use crate::algorithm::neighbour::bbd_tree::BBDTree;
|
use crate::algorithm::neighbour::bbd_tree::BBDTree;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@@ -101,7 +102,7 @@ impl KMeans{
|
|||||||
let mut best_cluster = 0;
|
let mut best_cluster = 0;
|
||||||
|
|
||||||
for j in 0..self.k {
|
for j in 0..self.k {
|
||||||
let dist = KMeans::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]);
|
let dist = euclidian::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]);
|
||||||
if dist < min_dist {
|
if dist < min_dist {
|
||||||
min_dist = dist;
|
min_dist = dist;
|
||||||
best_cluster = j;
|
best_cluster = j;
|
||||||
@@ -127,7 +128,7 @@ impl KMeans{
|
|||||||
// the distance from each sample to its closest center in scores.
|
// the distance from each sample to its closest center in scores.
|
||||||
for i in 0..n {
|
for i in 0..n {
|
||||||
// compute the distance between this sample and the current center
|
// compute the distance between this sample and the current center
|
||||||
let dist = KMeans::squared_distance(&data.get_row_as_vec(i), &centroid);
|
let dist = euclidian::squared_distance(&data.get_row_as_vec(i), &centroid);
|
||||||
|
|
||||||
if dist < d[i] {
|
if dist < d[i] {
|
||||||
d[i] = dist;
|
d[i] = dist;
|
||||||
@@ -151,7 +152,7 @@ impl KMeans{
|
|||||||
|
|
||||||
for i in 0..n {
|
for i in 0..n {
|
||||||
// compute the distance between this sample and the current center
|
// compute the distance between this sample and the current center
|
||||||
let dist = KMeans::squared_distance(&data.get_row_as_vec(i), &centroid);
|
let dist = euclidian::squared_distance(&data.get_row_as_vec(i), &centroid);
|
||||||
|
|
||||||
if dist < d[i] {
|
if dist < d[i] {
|
||||||
d[i] = dist;
|
d[i] = dist;
|
||||||
@@ -162,19 +163,6 @@ impl KMeans{
|
|||||||
y
|
y
|
||||||
}
|
}
|
||||||
|
|
||||||
fn squared_distance(x: &Vec<f64>,y: &Vec<f64>) -> f64 {
|
|
||||||
if x.len() != y.len() {
|
|
||||||
panic!("Input vector sizes are different.");
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut sum = 0f64;
|
|
||||||
for i in 0..x.len() {
|
|
||||||
sum += (x[i] - y[i]).powf(2.);
|
|
||||||
}
|
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,18 @@ pub trait Matrix: Clone + Debug {
|
|||||||
|
|
||||||
fn get_col_as_vec(&self, col: usize) -> Vec<f64>;
|
fn get_col_as_vec(&self, col: usize) -> Vec<f64>;
|
||||||
|
|
||||||
|
fn to_vector(&self) -> Vec<Vec<f64>> {
|
||||||
|
|
||||||
|
let (n, _) = self.shape();
|
||||||
|
let mut data = Vec::new();
|
||||||
|
|
||||||
|
for i in 0..n {
|
||||||
|
data.push(self.get_row_as_vec(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
data
|
||||||
|
}
|
||||||
|
|
||||||
fn set(&mut self, row: usize, col: usize, x: f64);
|
fn set(&mut self, row: usize, col: usize, x: f64);
|
||||||
|
|
||||||
fn qr_solve_mut(&mut self, b: Self) -> Self;
|
fn qr_solve_mut(&mut self, b: Self) -> Self;
|
||||||
|
|||||||
@@ -1,56 +1,33 @@
|
|||||||
use crate::math::distance::Distance;
|
pub fn distance(x: &Vec<f64>, y: &Vec<f64>) -> f64 {
|
||||||
use ndarray::{ArrayBase, Data, Dimension};
|
return squared_distance(x, y).sqrt();
|
||||||
use crate::common::AnyNumber;
|
}
|
||||||
|
|
||||||
impl<A, S1, S2, D> Distance<ArrayBase<S2, D>> for ArrayBase<S1, D>
|
pub fn squared_distance(x: &Vec<f64>,y: &Vec<f64>) -> f64 {
|
||||||
where
|
if x.len() != y.len() {
|
||||||
A: AnyNumber,
|
panic!("Input vector sizes are different.");
|
||||||
S1: Data<Elem = A>,
|
|
||||||
S2: Data<Elem = A>,
|
|
||||||
D: Dimension
|
|
||||||
{
|
|
||||||
fn distance_to(&self, other: &Self) -> f64
|
|
||||||
{
|
|
||||||
Self::distance(self, other)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn distance(a: &Self, b: &ArrayBase<S2, D>) -> f64
|
let mut sum = 0f64;
|
||||||
{
|
for i in 0..x.len() {
|
||||||
if a.len() != b.len() {
|
sum += (x[i] - y[i]).powf(2.);
|
||||||
panic!("vectors a and b have different length");
|
|
||||||
} else {
|
|
||||||
((a - b)*(a - b)).sum().to_f64().unwrap().sqrt()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use ndarray::{Array1, ArrayView1, arr1};
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn measure_simple_euclidian_distance() {
|
fn measure_simple_euclidian_distance() {
|
||||||
let a = arr1(&[1, 2, 3]);
|
let a = vec![1., 2., 3.];
|
||||||
let b = arr1(&[4, 5, 6]);
|
let b = vec![4., 5., 6.];
|
||||||
|
|
||||||
let d_arr = Array1::distance(&a, &b);
|
let d_arr = distance(&a, &b);
|
||||||
let d_view = ArrayView1::distance(&a.view(), &b.view());
|
|
||||||
|
|
||||||
assert!((d_arr - 5.19615242).abs() < 1e-8);
|
assert!((d_arr - 5.19615242).abs() < 1e-8);
|
||||||
assert!((d_view - 5.19615242).abs() < 1e-8);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn measure_simple_euclidian_distance_static() {
|
|
||||||
let a = arr1(&[-2.1968219, -0.9559913, -0.0431738, 1.0567679, 0.3853515]);
|
|
||||||
let b = arr1(&[-1.7781325, -0.6659839, 0.9526148, -0.9460919, -0.3925300]);
|
|
||||||
|
|
||||||
let d = Array1::distance(&a, &b);
|
|
||||||
|
|
||||||
assert!((d - 2.422302).abs() < 1e-6);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
@@ -1,9 +1 @@
|
|||||||
pub mod euclidian;
|
pub mod euclidian;
|
||||||
|
|
||||||
pub trait Distance<T> {
|
|
||||||
|
|
||||||
fn distance_to(&self, other: &Self) -> f64;
|
|
||||||
|
|
||||||
fn distance(a: &Self, b: &T) -> f64;
|
|
||||||
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user