feat: new distance function parameter in KNN, extends KNN documentation

Volodymyr Orlov
2020-08-28 15:30:52 -07:00
parent dcf636a5f1
commit 367ea62608
6 changed files with 172 additions and 33 deletions
+35 -14
@@ -37,13 +37,15 @@ use serde::{Deserialize, Serialize};
use crate::linalg::{row_iter, Matrix};
use crate::math::distance::Distance;
use crate::math::num::FloatExt;
use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName};
use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName, KNNWeightFunction};
/// `KNNClassifier` parameters. Use `Default::default()` for default values.
#[derive(Serialize, Deserialize, Debug)]
pub struct KNNClassifierParameters {
/// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is the default.
pub algorithm: KNNAlgorithmName,
/// weighting function that is used to calculate the estimated class value. Default function is `KNNWeightFunction::Uniform`.
pub weight: KNNWeightFunction,
/// number of training samples to consider when estimating the class for a new point. Default value is 3.
pub k: usize,
}
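For context, a minimal sketch of filling in these parameters by hand, assembled from the test further down in this diff. It assumes `KNNAlgorithmName` and `KNNWeightFunction` are reachable via `smartcore::neighbors`, as the in-crate imports above suggest:

```
use smartcore::linalg::naive::dense_matrix::*;
use smartcore::math::distance::*;
use smartcore::neighbors::knn_classifier::*;
use smartcore::neighbors::{KNNAlgorithmName, KNNWeightFunction};

let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
let y = vec![2., 2., 2., 3., 3.];

// Spell out all three fields; `Default::default()` is equivalent to
// CoverTree + Uniform weighting + k = 3.
let knn = KNNClassifier::fit(
    &x,
    &y,
    Distances::euclidian(),
    KNNClassifierParameters {
        algorithm: KNNAlgorithmName::LinearSearch,
        weight: KNNWeightFunction::Distance,
        k: 3,
    },
);
let y_hat = knn.predict(&x);
```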
@@ -54,6 +56,7 @@ pub struct KNNClassifier<T: FloatExt, D: Distance<Vec<T>, T>> {
classes: Vec<T>,
y: Vec<usize>,
knn_algorithm: KNNAlgorithm<T, D>,
weight: KNNWeightFunction,
k: usize,
}
@@ -61,6 +64,7 @@ impl Default for KNNClassifierParameters {
fn default() -> Self {
KNNClassifierParameters {
algorithm: KNNAlgorithmName::CoverTree,
weight: KNNWeightFunction::Uniform,
k: 3,
}
}
@@ -90,7 +94,7 @@ impl<T: FloatExt, D: Distance<Vec<T>, T>> PartialEq for KNNClassifier<T, D> {
}
impl<T: FloatExt, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
/// Fits KNN Classifier to a NxM matrix where N is number of samples and M is number of features.
/// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features.
/// * `x` - training data
/// * `y` - vector with target values (classes) of length N
/// * `distance` - a function that defines a distance between each pair of points in the training data.
@@ -136,6 +140,7 @@ impl<T: FloatExt, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
y: yi,
k: parameters.k,
knn_algorithm: parameters.algorithm.fit(data, distance),
weight: parameters.weight,
}
}
@@ -153,15 +158,21 @@ impl<T: FloatExt, D: Distance<Vec<T>, T>> KNNClassifier<T, D> {
}
fn predict_for_row(&self, x: Vec<T>) -> usize {
let idxs = self.knn_algorithm.find(&x, self.k);
let mut c = vec![0; self.classes.len()];
let mut max_c = 0;
let search_result = self.knn_algorithm.find(&x, self.k);
let weights = self
.weight
.calc_weights(search_result.iter().map(|v| v.1).collect());
let w_sum = weights.iter().map(|w| *w).sum();
let mut c = vec![T::zero(); self.classes.len()];
let mut max_c = T::zero();
let mut max_i = 0;
for i in idxs {
c[self.y[i]] += 1;
if c[self.y[i]] > max_c {
max_c = c[self.y[i]];
max_i = self.y[i];
for (r, w) in search_result.iter().zip(weights.iter()) {
c[self.y[r.0]] = c[self.y[r.0]] + (*w / w_sum);
if c[self.y[r.0]] > max_c {
max_c = c[self.y[r.0]];
max_i = self.y[r.0];
}
}
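For intuition, here is a small self-contained restatement (plain `f64` rather than the generic `T`, not crate code) of the vote this new loop computes: each neighbor adds its normalized weight to its class bucket, and the largest bucket wins.

```
// Hypothetical numbers: three neighbors at distances 1.0, 2.0 and 4.0
// carrying classes 0, 0 and 1.
let distances = vec![1.0_f64, 2.0, 4.0];
let labels = vec![0usize, 0, 1];

// Inverse-distance weights, as `KNNWeightFunction::Distance` produces.
let weights: Vec<f64> = distances.iter().map(|d| 1.0 / d).collect();
let w_sum: f64 = weights.iter().sum();

// Accumulate each neighbor's normalized weight into its class bucket.
let mut votes = vec![0.0_f64; 2];
for (&label, &w) in labels.iter().zip(weights.iter()) {
    votes[label] += w / w_sum;
}
// votes ≈ [0.857, 0.143]: class 0 wins by weight, not just by count.
assert!(votes[0] > votes[1]);
```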
@@ -179,18 +190,28 @@ mod tests {
fn knn_fit_predict() {
let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
let y = vec![2., 2., 2., 3., 3.];
let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default());
let y_hat = knn.predict(&x);
assert_eq!(5, Vec::len(&y_hat));
assert_eq!(y.to_vec(), y_hat);
}
#[test]
fn knn_fit_predict_weighted() {
let x = DenseMatrix::from_array(&[&[1.], &[2.], &[3.], &[4.], &[5.]]);
let y = vec![2., 2., 2., 3., 3.];
let knn = KNNClassifier::fit(
&x,
&y,
Distances::euclidian(),
KNNClassifierParameters {
k: 3,
k: 5,
algorithm: KNNAlgorithmName::LinearSearch,
weight: KNNWeightFunction::Distance,
},
);
let r = knn.predict(&x);
assert_eq!(5, Vec::len(&r));
assert_eq!(y.to_vec(), r);
let y_hat = knn.predict(&DenseMatrix::from_array(&[&[4.1]]));
assert_eq!(vec![3.0], y_hat);
}
#[test]
+82 -7
@@ -1,20 +1,63 @@
//! # K Nearest Neighbors Regressor
//!
//! Regressor that predicts estimated values as a function of the k nearest neighbours.
//!
//! `KNNRegressor` relies on 2 backend algorithms to speed up KNN queries:
//! * [`LinearSearch`](../../algorithm/neighbour/linear_search/index.html)
//! * [`CoverTree`](../../algorithm/neighbour/cover_tree/index.html)
//!
//! The parameter `k` controls the stability of the KNN estimate: when `k` is small the estimate is sensitive to noise in the data, and as `k` increases the estimator becomes more stable.
//! In terms of the bias-variance trade-off, the variance decreases with `k` while the bias is likely to increase with `k`.
//!
//! When you don't know which search algorithm and `k` value to use, go with the default parameters defined by `Default::default()`.
//!
//! To fit the model to a 5 x 2 matrix with 5 training samples, 2 features per sample:
//!
//! ```
//! use smartcore::linalg::naive::dense_matrix::*;
//! use smartcore::neighbors::knn_regressor::*;
//! use smartcore::math::distance::*;
//!
//! // your explanatory variables. Each row is a training sample with 2 numerical features
//! let x = DenseMatrix::from_array(&[
//! &[1., 1.],
//! &[2., 2.],
//! &[3., 3.],
//! &[4., 4.],
//! &[5., 5.]]);
//! let y = vec![1., 2., 3., 4., 5.]; // your target values
//!
//! let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default());
//! let y_hat = knn.predict(&x);
//! ```
//!
//! The variable `y_hat` will hold the predicted values.
//!
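The example above sticks to the defaults. Below is a sketch of the same fit with the new weighting parameter set explicitly, mirroring the updated test at the bottom of this file; with inverse-distance weighting, a training point at zero distance dominates its own prediction, so `y_hat` reproduces `y` exactly. The `smartcore::neighbors` re-exports are assumed, as above:

```
use smartcore::linalg::naive::dense_matrix::*;
use smartcore::math::distance::*;
use smartcore::neighbors::knn_regressor::*;
use smartcore::neighbors::{KNNAlgorithmName, KNNWeightFunction};

let x = DenseMatrix::from_array(&[
    &[1., 1.],
    &[2., 2.],
    &[3., 3.],
    &[4., 4.],
    &[5., 5.]]);
let y = vec![1., 2., 3., 4., 5.];

let knn = KNNRegressor::fit(
    &x,
    &y,
    Distances::euclidian(),
    KNNRegressorParameters {
        algorithm: KNNAlgorithmName::LinearSearch,
        weight: KNNWeightFunction::Distance,
        k: 3,
    },
);
let y_hat = knn.predict(&x); // ≈ [1., 2., 3., 4., 5.]
```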
use serde::{Deserialize, Serialize};
use crate::linalg::{row_iter, BaseVector, Matrix};
use crate::math::distance::Distance;
use crate::math::num::FloatExt;
use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName};
use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName, KNNWeightFunction};
/// `KNNRegressor` parameters. Use `Default::default()` for default values.
#[derive(Serialize, Deserialize, Debug)]
pub struct KNNRegressorParameters {
/// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is the default.
pub algorithm: KNNAlgorithmName,
/// weighting function that is used to calculate the estimated target value. Default function is `KNNWeightFunction::Uniform`.
pub weight: KNNWeightFunction,
/// number of training samples to consider when estimating the target for a new point. Default value is 3.
pub k: usize,
}
/// K Nearest Neighbors Regressor
#[derive(Serialize, Deserialize, Debug)]
pub struct KNNRegressor<T: FloatExt, D: Distance<Vec<T>, T>> {
y: Vec<T>,
knn_algorithm: KNNAlgorithm<T, D>,
weight: KNNWeightFunction,
k: usize,
}
@@ -22,6 +65,7 @@ impl Default for KNNRegressorParameters {
fn default() -> Self {
KNNRegressorParameters {
algorithm: KNNAlgorithmName::CoverTree,
weight: KNNWeightFunction::Uniform,
k: 3,
}
}
@@ -43,6 +87,13 @@ impl<T: FloatExt, D: Distance<Vec<T>, T>> PartialEq for KNNRegressor<T, D> {
}
impl<T: FloatExt, D: Distance<Vec<T>, T>> KNNRegressor<T, D> {
/// Fits KNN regressor to a NxM matrix where N is number of samples and M is number of features.
/// * `x` - training data
/// * `y` - vector with real target values of length N
/// * `distance` - a function that defines a distance between each pair of points in the training data.
/// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait.
/// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions.
/// * `parameters` - additional parameters like search algorithm and k
pub fn fit<M: Matrix<T>>(
x: &M,
y: &M::RowVector,
@@ -73,9 +124,13 @@ impl<T: FloatExt, D: Distance<Vec<T>, T>> KNNRegressor<T, D> {
y: y.to_vec(),
k: parameters.k,
knn_algorithm: parameters.algorithm.fit(data, distance),
weight: parameters.weight,
}
}
/// Predict the target for the provided data.
/// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features.
/// Returns a vector of size N with estimates.
pub fn predict<M: Matrix<T>>(&self, x: &M) -> M::RowVector {
let mut result = M::zeros(1, x.shape().0);
@@ -87,13 +142,19 @@ impl<T: FloatExt, D: Distance<Vec<T>, T>> KNNRegressor<T, D> {
}
fn predict_for_row(&self, x: Vec<T>) -> T {
let idxs = self.knn_algorithm.find(&x, self.k);
let search_result = self.knn_algorithm.find(&x, self.k);
let mut result = T::zero();
for i in idxs {
result = result + self.y[i];
let weights = self
.weight
.calc_weights(search_result.iter().map(|v| v.1).collect());
let w_sum = weights.iter().map(|w| *w).sum();
for (r, w) in search_result.iter().zip(weights.iter()) {
result = result + self.y[r.0] * (*w / w_sum);
}
result / T::from_usize(self.k).unwrap()
result
}
}
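In other words, the new body computes the weighted mean y_hat = sum(w_i * y_i) / sum(w_i) over the k neighbors. With `KNNWeightFunction::Uniform` every w_i = 1 and w_sum = k, so it reduces exactly to the old `sum / k`. A plain-`f64` sketch of the arithmetic, with hypothetical numbers:

```
// k = 3 neighbors at distances 1, 2 and 4.
let targets = vec![3.0_f64, 4.0, 5.0]; // y values of the neighbors
let weights = vec![1.0_f64, 0.5, 0.25]; // their inverse distances
let w_sum: f64 = weights.iter().sum(); // 1.75

let y_hat: f64 = targets
    .iter()
    .zip(weights.iter())
    .map(|(y, w)| y * (w / w_sum))
    .sum();

// (3*1.0 + 4*0.5 + 5*0.25) / 1.75 = 6.25 / 1.75 ≈ 3.571
assert!((y_hat - 6.25 / 1.75).abs() < 1e-12);
```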
@@ -104,10 +165,10 @@ mod tests {
use crate::math::distance::Distances;
#[test]
fn knn_fit_predict() {
fn knn_fit_predict_weighted() {
let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
let y: Vec<f64> = vec![1., 2., 3., 4., 5.];
let y_exp = vec![2., 2., 3., 4., 4.];
let y_exp = vec![1., 2., 3., 4., 5.];
let knn = KNNRegressor::fit(
&x,
&y,
@@ -115,6 +176,7 @@ mod tests {
KNNRegressorParameters {
k: 3,
algorithm: KNNAlgorithmName::LinearSearch,
weight: KNNWeightFunction::Distance,
},
);
let y_hat = knn.predict(&x);
@@ -124,6 +186,19 @@ mod tests {
}
}
#[test]
fn knn_fit_predict_uniform() {
let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
let y: Vec<f64> = vec![1., 2., 3., 4., 5.];
let y_exp = vec![2., 2., 3., 4., 4.];
let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default());
let y_hat = knn.predict(&x);
assert_eq!(5, Vec::len(&y_hat));
for i in 0..y_hat.len() {
assert!((y_hat[i] - y_exp[i]).abs() < std::f64::EPSILON);
}
}
#[test]
fn serde() {
let x = DenseMatrix::from_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]);
+30 -1
@@ -52,12 +52,41 @@ pub enum KNNAlgorithmName {
CoverTree,
}
/// Weighting function that is used to determine the estimated value.
#[derive(Serialize, Deserialize, Debug)]
pub enum KNNWeightFunction {
/// All k nearest points are weighted equally
Uniform,
/// k nearest points are weighted by the inverse of their distance. Closer neighbors will have a greater influence than neighbors that are farther away.
Distance,
}
#[derive(Serialize, Deserialize, Debug)]
enum KNNAlgorithm<T: FloatExt, D: Distance<Vec<T>, T>> {
LinearSearch(LinearKNNSearch<Vec<T>, T, D>),
CoverTree(CoverTree<Vec<T>, T, D>),
}
impl KNNWeightFunction {
fn calc_weights<T: FloatExt>(&self, distances: Vec<T>) -> Vec<T> {
match *self {
KNNWeightFunction::Distance => {
// if the query point has zero distance to one or more training points,
// those training points are weighted as 1.0 and all other points as 0.0
if distances.iter().any(|&e| e == T::zero()) {
distances
.iter()
.map(|e| if *e == T::zero() { T::one() } else { T::zero() })
.collect()
} else {
distances.iter().map(|e| T::one() / *e).collect()
}
}
KNNWeightFunction::Uniform => vec![T::one(); distances.len()],
}
}
}
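The zero-distance special case is easiest to see with concrete numbers. Below is a standalone restatement of the same rule (a hypothetical helper, since `calc_weights` is private to the module), specialized to `f64`:

```
// Mirrors the Distance arm of calc_weights above.
fn inverse_distance_weights(distances: &[f64]) -> Vec<f64> {
    if distances.iter().any(|&d| d == 0.0) {
        // An exact match takes all the weight; everything else gets none.
        distances
            .iter()
            .map(|&d| if d == 0.0 { 1.0 } else { 0.0 })
            .collect()
    } else {
        distances.iter().map(|&d| 1.0 / d).collect()
    }
}

assert_eq!(inverse_distance_weights(&[1.0, 2.0, 4.0]), vec![1.0, 0.5, 0.25]);
// A query that coincides with a training point short-circuits the weighting:
assert_eq!(inverse_distance_weights(&[0.0, 2.0, 4.0]), vec![1.0, 0.0, 0.0]);
```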
impl KNNAlgorithmName {
fn fit<T: FloatExt, D: Distance<Vec<T>, T>>(
&self,
@@ -74,7 +103,7 @@ impl KNNAlgorithmName {
}
impl<T: FloatExt, D: Distance<Vec<T>, T>> KNNAlgorithm<T, D> {
fn find(&self, from: &Vec<T>, k: usize) -> Vec<usize> {
fn find(&self, from: &Vec<T>, k: usize) -> Vec<(usize, T)> {
match *self {
KNNAlgorithm::LinearSearch(ref linear) => linear.find(from, k),
KNNAlgorithm::CoverTree(ref cover) => cover.find(from, k),