From 367ea62608d218e7f0ae3f9aa6f10c092eaa7ed7 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 28 Aug 2020 15:30:52 -0700 Subject: [PATCH] feat: new distance function parameter in KNN, extends KNN documentation --- src/algorithm/neighbour/cover_tree.rs | 14 ++-- src/algorithm/neighbour/linear_search.rs | 19 +++-- src/math/num.rs | 3 +- src/neighbors/knn_classifier.rs | 49 +++++++++---- src/neighbors/knn_regressor.rs | 89 ++++++++++++++++++++++-- src/neighbors/mod.rs | 31 ++++++++- 6 files changed, 172 insertions(+), 33 deletions(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 413e4d8..e82a2e7 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -78,7 +78,7 @@ impl> CoverTree { node_id } - pub fn find(&self, p: &T, k: usize) -> Vec { + pub fn find(&self, p: &T, k: usize) -> Vec<(usize, F)> { let mut qi_p_ds = vec![(self.root(), self.distance.distance(&p, &self.root().data))]; for i in (self.min_level..self.max_level + 1).rev() { let i_d = self.base.powf(F::from(i).unwrap()); @@ -92,7 +92,7 @@ impl> CoverTree { qi_p_ds.sort_by(|(_, d1), (_, d2)| d1.partial_cmp(d2).unwrap()); qi_p_ds[..usize::min(qi_p_ds.len(), k)] .iter() - .map(|(n, _)| n.index.index) + .map(|(n, d)| (n.index.index, *d)) .collect() } @@ -353,12 +353,14 @@ mod tests { } let mut nearest_3_to_5 = tree.find(&5, 3); - nearest_3_to_5.sort(); - assert_eq!(vec!(3, 4, 5), nearest_3_to_5); + nearest_3_to_5.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + let nearest_3_to_5_indexes: Vec = nearest_3_to_5.iter().map(|v| v.0).collect(); + assert_eq!(vec!(4, 5, 3), nearest_3_to_5_indexes); let mut nearest_3_to_15 = tree.find(&15, 3); - nearest_3_to_15.sort(); - assert_eq!(vec!(13, 14, 15), nearest_3_to_15); + nearest_3_to_15.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + let nearest_3_to_15_indexes: Vec = nearest_3_to_15.iter().map(|v| v.0).collect(); + assert_eq!(vec!(14, 13, 15), 
nearest_3_to_15_indexes); assert_eq!(-1, tree.min_level); assert_eq!(100, tree.max_level); diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index 9f1ae3d..65cc62c 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -22,7 +22,7 @@ impl> LinearKNNSearch { } } - pub fn find(&self, from: &T, k: usize) -> Vec { + pub fn find(&self, from: &T, k: usize) -> Vec<(usize, F)> { if k < 1 || k > self.data.len() { panic!("k should be >= 1 and <= length(data)"); } @@ -48,7 +48,10 @@ impl> LinearKNNSearch { heap.sort(); - heap.get().into_iter().flat_map(|x| x.index).collect() + heap.get() + .into_iter() + .flat_map(|x| x.index.map(|i| (i, x.distance))) + .collect() } } @@ -91,7 +94,9 @@ mod tests { let algorithm1 = LinearKNNSearch::new(data1, SimpleDistance {}); - assert_eq!(vec!(1, 2, 0), algorithm1.find(&2, 3)); + let found_idxs1: Vec = algorithm1.find(&2, 3).iter().map(|v| v.0).collect(); + + assert_eq!(vec!(1, 2, 0), found_idxs1); let data2 = vec![ vec![1., 1.], @@ -103,7 +108,13 @@ mod tests { let algorithm2 = LinearKNNSearch::new(data2, Distances::euclidian()); - assert_eq!(vec!(2, 3, 1), algorithm2.find(&vec![3., 3.], 3)); + let found_idxs2: Vec = algorithm2 + .find(&vec![3., 3.], 3) + .iter() + .map(|v| v.0) + .collect(); + + assert_eq!(vec!(2, 3, 1), found_idxs2); } #[test] diff --git a/src/math/num.rs b/src/math/num.rs index 4623779..a0b690f 100644 --- a/src/math/num.rs +++ b/src/math/num.rs @@ -1,8 +1,9 @@ use num_traits::{Float, FromPrimitive}; use rand::prelude::*; use std::fmt::{Debug, Display}; +use std::iter::{Product, Sum}; -pub trait FloatExt: Float + FromPrimitive + Debug + Display + Copy { +pub trait FloatExt: Float + FromPrimitive + Debug + Display + Copy + Sum + Product { fn copysign(self, sign: Self) -> Self; fn ln_1pe(self) -> Self; diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 2e11697..f7de583 100644 --- 
a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -37,13 +37,15 @@ use serde::{Deserialize, Serialize}; use crate::linalg::{row_iter, Matrix}; use crate::math::distance::Distance; use crate::math::num::FloatExt; -use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName}; +use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName, KNNWeightFunction}; /// `KNNClassifier` parameters. Use `Default::default()` for default values. #[derive(Serialize, Deserialize, Debug)] pub struct KNNClassifierParameters { /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, } @@ -54,6 +56,7 @@ pub struct KNNClassifier, T>> { classes: Vec, y: Vec, knn_algorithm: KNNAlgorithm, + weight: KNNWeightFunction, k: usize, } @@ -61,6 +64,7 @@ impl Default for KNNClassifierParameters { fn default() -> Self { KNNClassifierParameters { algorithm: KNNAlgorithmName::CoverTree, + weight: KNNWeightFunction::Uniform, k: 3, } } @@ -90,7 +94,7 @@ impl, T>> PartialEq for KNNClassifier { } impl, T>> KNNClassifier { - /// Fits KNN Classifier to a NxM matrix where N is number of samples and M is number of features. + /// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features. /// * `x` - training data /// * `y` - vector with target values (classes) of length N /// * `distance` - a function that defines a distance between each pair of point in training data. 
@@ -136,6 +140,7 @@ impl, T>> KNNClassifier { y: yi, k: parameters.k, knn_algorithm: parameters.algorithm.fit(data, distance), + weight: parameters.weight, } } @@ -153,15 +158,21 @@ impl, T>> KNNClassifier { } fn predict_for_row(&self, x: Vec) -> usize { - let idxs = self.knn_algorithm.find(&x, self.k); - let mut c = vec![0; self.classes.len()]; - let mut max_c = 0; + let search_result = self.knn_algorithm.find(&x, self.k); + + let weights = self + .weight + .calc_weights(search_result.iter().map(|v| v.1).collect()); + let w_sum = weights.iter().map(|w| *w).sum(); + + let mut c = vec![T::zero(); self.classes.len()]; + let mut max_c = T::zero(); let mut max_i = 0; - for i in idxs { - c[self.y[i]] += 1; - if c[self.y[i]] > max_c { - max_c = c[self.y[i]]; - max_i = self.y[i]; + for (r, w) in search_result.iter().zip(weights.iter()) { + c[self.y[r.0]] = c[self.y[r.0]] + (*w / w_sum); + if c[self.y[r.0]] > max_c { + max_c = c[self.y[r.0]]; + max_i = self.y[r.0]; } } @@ -179,18 +190,28 @@ mod tests { fn knn_fit_predict() { let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; + let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()); + let y_hat = knn.predict(&x); + assert_eq!(5, Vec::len(&y_hat)); + assert_eq!(y.to_vec(), y_hat); + } + + #[test] + fn knn_fit_predict_weighted() { + let x = DenseMatrix::from_array(&[&[1.], &[2.], &[3.], &[4.], &[5.]]); + let y = vec![2., 2., 2., 3., 3.]; let knn = KNNClassifier::fit( &x, &y, Distances::euclidian(), KNNClassifierParameters { - k: 3, + k: 5, algorithm: KNNAlgorithmName::LinearSearch, + weight: KNNWeightFunction::Distance, }, ); - let r = knn.predict(&x); - assert_eq!(5, Vec::len(&r)); - assert_eq!(y.to_vec(), r); + let y_hat = knn.predict(&DenseMatrix::from_array(&[&[4.1]])); + assert_eq!(vec![3.0], y_hat); } #[test] diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 61df138..5724f87 100644 --- 
a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -1,20 +1,63 @@ +//! # K Nearest Neighbors Regressor +//! +//! Regressor that predicts estimated values as a function of k nearest neighbors. +//! +//! `KNNRegressor` relies on 2 backend algorithms to speed up KNN queries: +//! * [`LinearSearch`](../../algorithm/neighbour/linear_search/index.html) +//! * [`CoverTree`](../../algorithm/neighbour/cover_tree/index.html) +//! +//! The parameter `k` controls the stability of the KNN estimate: when `k` is small the algorithm is sensitive to the noise in data. When `k` increases the estimator becomes more stable. +//! In terms of the bias variance trade-off the variance decreases with `k` and the bias is likely to increase with `k`. +//! +//! When you don't know which search algorithm and `k` value to use, go with default parameters defined by `Default::default()` +//! +//! To fit the model to a 5 x 2 matrix with 5 training samples, 2 features per sample: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::neighbors::knn_regressor::*; +//! use smartcore::math::distance::*; +//! +//! //your explanatory variables. Each row is a training sample with 2 numerical features +//! let x = DenseMatrix::from_array(&[ +//! &[1., 1.], +//! &[2., 2.], +//! &[3., 3.], +//! &[4., 4.], +//! &[5., 5.]]); +//! let y = vec![1., 2., 3., 4., 5.]; //your target values +//! +//! let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()); +//! let y_hat = knn.predict(&x); +//! ``` +//! +//! variable `y_hat` will hold predicted values +//! +//! use serde::{Deserialize, Serialize}; use crate::linalg::{row_iter, BaseVector, Matrix}; use crate::math::distance::Distance; use crate::math::num::FloatExt; -use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName}; +use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName, KNNWeightFunction}; +/// `KNNRegressor` parameters. Use `Default::default()` for default values.
#[derive(Serialize, Deserialize, Debug)] pub struct KNNRegressorParameters { + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub weight: KNNWeightFunction, + /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, } +/// K Nearest Neighbors Regressor #[derive(Serialize, Deserialize, Debug)] pub struct KNNRegressor, T>> { y: Vec, knn_algorithm: KNNAlgorithm, + weight: KNNWeightFunction, k: usize, } @@ -22,6 +65,7 @@ impl Default for KNNRegressorParameters { fn default() -> Self { KNNRegressorParameters { algorithm: KNNAlgorithmName::CoverTree, + weight: KNNWeightFunction::Uniform, k: 3, } } @@ -43,6 +87,13 @@ impl, T>> PartialEq for KNNRegressor { } impl, T>> KNNRegressor { + /// Fits KNN regressor to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data + /// * `y` - vector with real values + /// * `distance` - a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, @@ -73,9 +124,13 @@ impl, T>> KNNRegressor { y: y.to_vec(), k: parameters.k, knn_algorithm: parameters.algorithm.fit(data, distance), + weight: parameters.weight, } } + /// Predict the target for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with estimates. 
pub fn predict>(&self, x: &M) -> M::RowVector { let mut result = M::zeros(1, x.shape().0); @@ -87,13 +142,19 @@ impl, T>> KNNRegressor { } fn predict_for_row(&self, x: Vec) -> T { - let idxs = self.knn_algorithm.find(&x, self.k); + let search_result = self.knn_algorithm.find(&x, self.k); let mut result = T::zero(); - for i in idxs { - result = result + self.y[i]; + + let weights = self + .weight + .calc_weights(search_result.iter().map(|v| v.1).collect()); + let w_sum = weights.iter().map(|w| *w).sum(); + + for (r, w) in search_result.iter().zip(weights.iter()) { + result = result + self.y[r.0] * (*w / w_sum); } - result / T::from_usize(self.k).unwrap() + result } } @@ -104,10 +165,10 @@ mod tests { use crate::math::distance::Distances; #[test] - fn knn_fit_predict() { + fn knn_fit_predict_weighted() { let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y: Vec = vec![1., 2., 3., 4., 5.]; - let y_exp = vec![2., 2., 3., 4., 4.]; + let y_exp = vec![1., 2., 3., 4., 5.]; let knn = KNNRegressor::fit( &x, &y, @@ -115,6 +176,7 @@ mod tests { KNNRegressorParameters { k: 3, algorithm: KNNAlgorithmName::LinearSearch, + weight: KNNWeightFunction::Distance, }, ); let y_hat = knn.predict(&x); @@ -124,6 +186,19 @@ mod tests { } } + #[test] + fn knn_fit_predict_uniform() { + let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); + let y: Vec = vec![1., 2., 3., 4., 5.]; + let y_exp = vec![2., 2., 3., 4., 4.]; + let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()); + let y_hat = knn.predict(&x); + assert_eq!(5, Vec::len(&y_hat)); + for i in 0..y_hat.len() { + assert!((y_hat[i] - y_exp[i]).abs() < std::f64::EPSILON); + } + } + #[test] fn serde() { let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index c9aca51..9abedf6 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs 
@@ -52,12 +52,41 @@ pub enum KNNAlgorithmName { CoverTree, } +/// Weight function that is used to determine estimated value. +#[derive(Serialize, Deserialize, Debug)] +pub enum KNNWeightFunction { + /// All k nearest points are weighted equally + Uniform, + /// k nearest points are weighted by the inverse of their distance. Closer neighbors will have a greater influence than neighbors which are further away. + Distance, +} + #[derive(Serialize, Deserialize, Debug)] enum KNNAlgorithm, T>> { LinearSearch(LinearKNNSearch, T, D>), CoverTree(CoverTree, T, D>), } +impl KNNWeightFunction { + fn calc_weights(&self, distances: Vec) -> std::vec::Vec { + match *self { + KNNWeightFunction::Distance => { + // if there are any points that have zero distance from one or more training points, + // those training points are weighted as 1.0 and the other points as 0.0 + if distances.iter().any(|&e| e == T::zero()) { + distances + .iter() + .map(|e| if *e == T::zero() { T::one() } else { T::zero() }) + .collect() + } else { + distances.iter().map(|e| T::one() / *e).collect() + } + } + KNNWeightFunction::Uniform => vec![T::one(); distances.len()], + } + } +} + impl KNNAlgorithmName { fn fit, T>>( &self, @@ -74,7 +103,7 @@ impl KNNAlgorithmName { } impl, T>> KNNAlgorithm { - fn find(&self, from: &Vec, k: usize) -> Vec { + fn find(&self, from: &Vec, k: usize) -> Vec<(usize, T)> { match *self { KNNAlgorithm::LinearSearch(ref linear) => linear.find(from, k), KNNAlgorithm::CoverTree(ref cover) => cover.find(from, k),