feat: documents KNN algorithms section
This commit is contained in:
@@ -1,2 +1,2 @@
|
|||||||
pub mod neighbour;
|
pub mod neighbour;
|
||||||
pub mod sort;
|
pub(crate) mod sort;
|
||||||
|
|||||||
@@ -1,3 +1,26 @@
|
|||||||
|
//! # Cover Tree
|
||||||
|
//!
|
||||||
|
//! The Cover Tree data structure is specifically designed to facilitate the speed-up of a nearest neighbor search, see [KNN algorithms](../index.html).
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
|
//! use smartcore::algorithm::neighbour::cover_tree::*;
|
||||||
|
//! use smartcore::math::distance::Distance;
|
||||||
|
//!
|
||||||
|
//! struct SimpleDistance {} // Our distance function
|
||||||
|
//!
|
||||||
|
//! impl Distance<i32, f64> for SimpleDistance {
|
||||||
|
//! fn distance(&self, a: &i32, b: &i32) -> f64 { // simple symmetric scalar distance
|
||||||
|
//! (a - b).abs() as f64
|
||||||
|
//! }
|
||||||
|
//! }
|
||||||
|
//!
|
||||||
|
//! let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; // data points
|
||||||
|
//!
|
||||||
|
//! let mut tree = CoverTree::new(data, SimpleDistance {});
|
||||||
|
//!
|
||||||
|
//! tree.find(&5, 3); // find 3 knn points from 5
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
use core::hash::{Hash, Hasher};
|
use core::hash::{Hash, Hasher};
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
@@ -9,6 +32,7 @@ use crate::algorithm::sort::heap_select::HeapSelect;
|
|||||||
use crate::math::distance::Distance;
|
use crate::math::distance::Distance;
|
||||||
use crate::math::num::FloatExt;
|
use crate::math::num::FloatExt;
|
||||||
|
|
||||||
|
/// Implements Cover Tree algorithm
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
pub struct CoverTree<T, F: FloatExt, D: Distance<T, F>> {
|
pub struct CoverTree<T, F: FloatExt, D: Distance<T, F>> {
|
||||||
base: F,
|
base: F,
|
||||||
@@ -19,6 +43,9 @@ pub struct CoverTree<T, F: FloatExt, D: Distance<T, F>> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T: Debug, F: FloatExt, D: Distance<T, F>> CoverTree<T, F, D> {
|
impl<T: Debug, F: FloatExt, D: Distance<T, F>> CoverTree<T, F, D> {
|
||||||
|
/// Construct a cover tree.
|
||||||
|
/// * `data` - vector of data points to search for.
|
||||||
|
/// * `distance` - distance metric to use for searching. This function should extend [`Distance`](../algorithm/neighbour/index.html) interface.
|
||||||
pub fn new(mut data: Vec<T>, distance: D) -> CoverTree<T, F, D> {
|
pub fn new(mut data: Vec<T>, distance: D) -> CoverTree<T, F, D> {
|
||||||
let mut tree = CoverTree {
|
let mut tree = CoverTree {
|
||||||
base: F::two(),
|
base: F::two(),
|
||||||
@@ -34,6 +61,8 @@ impl<T: Debug, F: FloatExt, D: Distance<T, F>> CoverTree<T, F, D> {
|
|||||||
tree
|
tree
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Insert new data point into the cover tree.
|
||||||
|
/// * `p` - new data point.
|
||||||
pub fn insert(&mut self, p: T) {
|
pub fn insert(&mut self, p: T) {
|
||||||
if self.nodes.is_empty() {
|
if self.nodes.is_empty() {
|
||||||
self.new_node(None, p);
|
self.new_node(None, p);
|
||||||
@@ -78,6 +107,9 @@ impl<T: Debug, F: FloatExt, D: Distance<T, F>> CoverTree<T, F, D> {
|
|||||||
node_id
|
node_id
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Find k nearest neighbors of `p`
|
||||||
|
/// * `p` - look for k nearest points to `p`
|
||||||
|
/// * `k` - the number of nearest neighbors to return
|
||||||
pub fn find(&self, p: &T, k: usize) -> Vec<(usize, F)> {
|
pub fn find(&self, p: &T, k: usize) -> Vec<(usize, F)> {
|
||||||
let mut qi_p_ds = vec![(self.root(), self.distance.distance(&p, &self.root().data))];
|
let mut qi_p_ds = vec![(self.root(), self.distance.distance(&p, &self.root().data))];
|
||||||
for i in (self.min_level..self.max_level + 1).rev() {
|
for i in (self.min_level..self.max_level + 1).rev() {
|
||||||
|
|||||||
@@ -1,3 +1,26 @@
|
|||||||
|
//! # Brute Force Linear Search
|
||||||
|
//!
|
||||||
|
//! see [KNN algorithms](../index.html)
|
||||||
|
//! ```
|
||||||
|
//! use smartcore::algorithm::neighbour::linear_search::*;
|
||||||
|
//! use smartcore::math::distance::Distance;
|
||||||
|
//!
|
||||||
|
//! struct SimpleDistance {} // Our distance function
|
||||||
|
//!
|
||||||
|
//! impl Distance<i32, f64> for SimpleDistance {
|
||||||
|
//! fn distance(&self, a: &i32, b: &i32) -> f64 { // simple symmetric scalar distance
|
||||||
|
//! (a - b).abs() as f64
|
||||||
|
//! }
|
||||||
|
//! }
|
||||||
|
//!
|
||||||
|
//! let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; // data points
|
||||||
|
//!
|
||||||
|
//! let knn = LinearKNNSearch::new(data, SimpleDistance {});
|
||||||
|
//!
|
||||||
|
//! knn.find(&5, 3); // find 3 knn points from 5
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::cmp::{Ordering, PartialOrd};
|
use std::cmp::{Ordering, PartialOrd};
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
@@ -6,6 +29,7 @@ use crate::algorithm::sort::heap_select::HeapSelect;
|
|||||||
use crate::math::distance::Distance;
|
use crate::math::distance::Distance;
|
||||||
use crate::math::num::FloatExt;
|
use crate::math::num::FloatExt;
|
||||||
|
|
||||||
|
/// Implements Linear Search algorithm, see [KNN algorithms](../index.html)
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
pub struct LinearKNNSearch<T, F: FloatExt, D: Distance<T, F>> {
|
pub struct LinearKNNSearch<T, F: FloatExt, D: Distance<T, F>> {
|
||||||
distance: D,
|
distance: D,
|
||||||
@@ -14,6 +38,9 @@ pub struct LinearKNNSearch<T, F: FloatExt, D: Distance<T, F>> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T, F: FloatExt, D: Distance<T, F>> LinearKNNSearch<T, F, D> {
|
impl<T, F: FloatExt, D: Distance<T, F>> LinearKNNSearch<T, F, D> {
|
||||||
|
/// Initializes algorithm.
|
||||||
|
/// * `data` - vector of data points to search for.
|
||||||
|
/// * `distance` - distance metric to use for searching. This function should extend [`Distance`](../algorithm/neighbour/index.html) interface.
|
||||||
pub fn new(data: Vec<T>, distance: D) -> LinearKNNSearch<T, F, D> {
|
pub fn new(data: Vec<T>, distance: D) -> LinearKNNSearch<T, F, D> {
|
||||||
LinearKNNSearch {
|
LinearKNNSearch {
|
||||||
data: data,
|
data: data,
|
||||||
@@ -22,6 +49,9 @@ impl<T, F: FloatExt, D: Distance<T, F>> LinearKNNSearch<T, F, D> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Find k nearest neighbors
|
||||||
|
/// * `from` - look for k nearest points to `from`
|
||||||
|
/// * `k` - the number of nearest neighbors to return
|
||||||
pub fn find(&self, from: &T, k: usize) -> Vec<(usize, F)> {
|
pub fn find(&self, from: &T, k: usize) -> Vec<(usize, F)> {
|
||||||
if k < 1 || k > self.data.len() {
|
if k < 1 || k > self.data.len() {
|
||||||
panic!("k should be >= 1 and <= length(data)");
|
panic!("k should be >= 1 and <= length(data)");
|
||||||
|
|||||||
@@ -1,3 +1,35 @@
|
|||||||
|
//! # Nearest Neighbors Search Algorithms and Data Structures
|
||||||
|
//!
|
||||||
|
//! Nearest neighbor search is a basic computational tool that is particularly relevant to machine learning,
|
||||||
|
//! where it is often believed that high-dimensional datasets have low-dimensional intrinsic structure.
|
||||||
|
//! The basic nearest neighbor problem is formalized as follows: given a set \\( S \\) of \\( n \\) points in some metric space \\( (X, d) \\),
|
||||||
|
//! the problem is to preprocess \\( S \\) so that given a query point \\( p \in X \\), one can efficiently find a point \\( q \in S \\)
|
||||||
|
//! which minimizes \\( d(p, q) \\).
|
||||||
|
//!
|
||||||
|
//! [The most straightforward nearest neighbor search algorithm](linear_search/index.html) finds k nearest points using the brute-force approach where distances between all
|
||||||
|
//! pairs of points in the dataset are calculated. This approach scales as \\( O(dn^2) \\) where \\( n = \lvert S \rvert \\) is the number of samples and \\( d \\) is the number
|
||||||
|
//! of dimensions in the metric space. As the number of samples grows, the brute-force approach quickly becomes infeasible.
|
||||||
|
//!
|
||||||
|
//! [Cover Tree](cover_tree/index.html) is a data structure that partitions metric spaces to speed up nearest neighbor search. Cover tree requires \\( O(n) \\) space and
|
||||||
|
//! has nice theoretical properties:
|
||||||
|
//!
|
||||||
|
//! * construction time: \\( O(c^6n \log n) \\),
|
||||||
|
//! * insertion time: \\( O(c^6 \log n) \\),
|
||||||
|
//! * removal time: \\( O(c^6 \log n) \\),
|
||||||
|
//! * query time: \\( O(c^{12} \log n) \\),
|
||||||
|
//!
|
||||||
|
//! Where \\( c \\) is a constant.
|
||||||
|
//!
|
||||||
|
//! ## References:
|
||||||
|
//! * ["The Art of Computer Programming" Knuth, D, Vol. 3, 2nd ed, Sorting and Searching, 1998](https://www-cs-faculty.stanford.edu/~knuth/taocp.html)
|
||||||
|
//! * ["Cover Trees for Nearest Neighbor" Beygelzimer et al., Proceedings of the 23rd international conference on Machine learning, ICML'06 (2006)](https://homes.cs.washington.edu/~sham/papers/ml/cover_tree.pdf)
|
||||||
|
//! * ["Faster cover trees." Izbicki et al., Proceedings of the 32nd International Conference on Machine Learning, ICML'15 (2015)](http://www.cs.ucr.edu/~cshelton/papers/index.cgi%3FIzbShe15)
|
||||||
|
//! * ["The Elements of Statistical Learning: Data Mining, Inference, and Prediction" Trevor et al., 2nd edition, chapter 13](https://web.stanford.edu/~hastie/ElemStatLearn/)
|
||||||
|
//!
|
||||||
|
//! <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS_CHTML"></script>
|
||||||
|
|
||||||
pub(crate) mod bbd_tree;
|
pub(crate) mod bbd_tree;
|
||||||
|
/// tree data structure for fast nearest neighbor search
|
||||||
pub mod cover_tree;
|
pub mod cover_tree;
|
||||||
|
/// very simple algorithm that sequentially checks each element of the list until a match is found or the whole list has been searched.
|
||||||
pub mod linear_search;
|
pub mod linear_search;
|
||||||
|
|||||||
@@ -1,7 +1,5 @@
|
|||||||
//! # Nearest Neighbors
|
//! # Nearest Neighbors
|
||||||
//!
|
//!
|
||||||
//! <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS_CHTML"></script>
|
|
||||||
//!
|
|
||||||
//! The k-nearest neighbors (KNN) algorithm is a simple supervised machine learning algorithm that can be used to solve both classification and regression problems.
|
//! The k-nearest neighbors (KNN) algorithm is a simple supervised machine learning algorithm that can be used to solve both classification and regression problems.
|
||||||
//! KNN is a non-parametric method that assumes that similar things exist in close proximity.
|
//! KNN is a non-parametric method that assumes that similar things exist in close proximity.
|
||||||
//!
|
//!
|
||||||
@@ -30,6 +28,8 @@
|
|||||||
//! ## References:
|
//! ## References:
|
||||||
//! * ["Nearest Neighbor Pattern Classification" Cover, T.M., IEEE Transactions on Information Theory (1967)](http://ssg.mit.edu/cal/abs/2000_spring/np_dens/classification/cover67.pdf)
|
//! * ["Nearest Neighbor Pattern Classification" Cover, T.M., IEEE Transactions on Information Theory (1967)](http://ssg.mit.edu/cal/abs/2000_spring/np_dens/classification/cover67.pdf)
|
||||||
//! * ["The Elements of Statistical Learning: Data Mining, Inference, and Prediction" Trevor et al., 2nd edition, chapter 13](https://web.stanford.edu/~hastie/ElemStatLearn/)
|
//! * ["The Elements of Statistical Learning: Data Mining, Inference, and Prediction" Trevor et al., 2nd edition, chapter 13](https://web.stanford.edu/~hastie/ElemStatLearn/)
|
||||||
|
//!
|
||||||
|
//! <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS_CHTML"></script>
|
||||||
|
|
||||||
use crate::algorithm::neighbour::cover_tree::CoverTree;
|
use crate::algorithm::neighbour::cover_tree::CoverTree;
|
||||||
use crate::algorithm::neighbour::linear_search::LinearKNNSearch;
|
use crate::algorithm::neighbour::linear_search::LinearKNNSearch;
|
||||||
@@ -43,7 +43,7 @@ pub mod knn_classifier;
|
|||||||
pub mod knn_regressor;
|
pub mod knn_regressor;
|
||||||
|
|
||||||
/// Both, KNN classifier and regressor benefits from underlying search algorithms that helps to speed up queries.
|
/// Both, KNN classifier and regressor benefits from underlying search algorithms that helps to speed up queries.
|
||||||
/// `KNNAlgorithmName` maintains a list of supported search algorithms
|
/// `KNNAlgorithmName` maintains a list of supported search algorithms, see [KNN algorithms](../algorithm/neighbour/index.html)
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
pub enum KNNAlgorithmName {
|
pub enum KNNAlgorithmName {
|
||||||
/// Heap Search algorithm, see [`LinearSearch`](../algorithm/neighbour/linear_search/index.html)
|
/// Heap Search algorithm, see [`LinearSearch`](../algorithm/neighbour/linear_search/index.html)
|
||||||
|
|||||||
Reference in New Issue
Block a user