diff --git a/src/algorithm/mod.rs b/src/algorithm/mod.rs index 00d752d..7741a40 100644 --- a/src/algorithm/mod.rs +++ b/src/algorithm/mod.rs @@ -1,2 +1,2 @@ pub mod neighbour; -pub mod sort; +pub(crate) mod sort; diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index e82a2e7..be57006 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -1,3 +1,26 @@ +//! # Cover Tree +//! +//! The Cover Tree data structure is specifically designed to facilitate the speed-up of a nearest neighbor search, see [KNN algorithms](../index.html). +//! +//! ``` +//! use smartcore::algorithm::neighbour::cover_tree::*; +//! use smartcore::math::distance::Distance; +//! +//! struct SimpleDistance {} // Our distance function +//! +//! impl Distance for SimpleDistance { +//! fn distance(&self, a: &i32, b: &i32) -> f64 { // simple symmetrical scalar distance +//! (a - b).abs() as f64 +//! } +//! } +//! +//! let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; // data points +//! +//! let mut tree = CoverTree::new(data, SimpleDistance {}); +//! +//! tree.find(&5, 3); // find 3 knn points from 5 +//! +//! ``` use core::hash::{Hash, Hasher}; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; @@ -9,6 +32,7 @@ use crate::algorithm::sort::heap_select::HeapSelect; use crate::math::distance::Distance; use crate::math::num::FloatExt; +/// Implements Cover Tree algorithm #[derive(Serialize, Deserialize, Debug)] pub struct CoverTree> { base: F, @@ -19,6 +43,9 @@ pub struct CoverTree> { } impl> CoverTree { + /// Construct a cover tree. + /// * `data` - vector of data points to search for. + /// * `distance` - distance metric to use for searching. This function should extend [`Distance`](../algorithm/neighbour/index.html) interface. pub fn new(mut data: Vec, distance: D) -> CoverTree { let mut tree = CoverTree { base: F::two(), @@ -34,6 +61,8 @@ impl> CoverTree { tree } + /// Insert new data point into the cover tree. 
+ /// * `p` - new data point. pub fn insert(&mut self, p: T) { if self.nodes.is_empty() { self.new_node(None, p); @@ -78,6 +107,9 @@ impl> CoverTree { node_id } + /// Find k nearest neighbors of `p` + /// * `p` - look for k nearest points to `p` + /// * `k` - the number of nearest neighbors to return pub fn find(&self, p: &T, k: usize) -> Vec<(usize, F)> { let mut qi_p_ds = vec![(self.root(), self.distance.distance(&p, &self.root().data))]; for i in (self.min_level..self.max_level + 1).rev() { diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index 65cc62c..39e1b40 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -1,3 +1,26 @@ +//! # Brute Force Linear Search +//! +//! see [KNN algorithms](../index.html) +//! ``` +//! use smartcore::algorithm::neighbour::linear_search::*; +//! use smartcore::math::distance::Distance; +//! +//! struct SimpleDistance {} // Our distance function +//! +//! impl Distance for SimpleDistance { +//! fn distance(&self, a: &i32, b: &i32) -> f64 { // simple symmetrical scalar distance +//! (a - b).abs() as f64 +//! } +//! } +//! +//! let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; // data points +//! +//! let knn = LinearKNNSearch::new(data, SimpleDistance {}); +//! +//! knn.find(&5, 3); // find 3 knn points from 5 +//! +//! ``` + use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::marker::PhantomData; @@ -6,6 +29,7 @@ use crate::algorithm::sort::heap_select::HeapSelect; use crate::math::distance::Distance; use crate::math::num::FloatExt; +/// Implements Linear Search algorithm, see [KNN algorithms](../index.html) #[derive(Serialize, Deserialize, Debug)] pub struct LinearKNNSearch> { distance: D, @@ -14,6 +38,9 @@ pub struct LinearKNNSearch> { } impl> LinearKNNSearch { + /// Initializes algorithm. + /// * `data` - vector of data points to search for. + /// * `distance` - distance metric to use for searching. 
This function should extend [`Distance`](../algorithm/neighbour/index.html) interface. pub fn new(data: Vec, distance: D) -> LinearKNNSearch { LinearKNNSearch { data: data, @@ -22,6 +49,9 @@ impl> LinearKNNSearch { } } + /// Find k nearest neighbors + /// * `from` - look for k nearest points to `from` + /// * `k` - the number of nearest neighbors to return pub fn find(&self, from: &T, k: usize) -> Vec<(usize, F)> { if k < 1 || k > self.data.len() { panic!("k should be >= 1 and <= length(data)"); diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 52d117d..708b415 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -1,3 +1,35 @@ +//! # Nearest Neighbors Search Algorithms and Data Structures +//! +//! Nearest neighbor search is a basic computational tool that is particularly relevant to machine learning, +//! where it is often believed that high-dimensional datasets have low-dimensional intrinsic structure. +//! The basic nearest neighbor problem is formalized as follows: given a set \\( S \\) of \\( n \\) points in some metric space \\( (X, d) \\), +//! the problem is to preprocess \\( S \\) so that given a query point \\( p \in X \\), one can efficiently find a point \\( q \in S \\) +//! which minimizes \\( d(p, q) \\). +//! +//! [The most straightforward nearest neighbor search algorithm](linear_search/index.html) finds k nearest points using the brute-force approach where distances between all +//! pairs of points in the dataset are calculated. This approach scales as \\( O(dn^2) \\) where \\( n = \lvert S \rvert \\) is the number of samples and \\( d \\) is the number +//! of dimensions in the metric space. As the number of samples grows, the brute-force approach quickly becomes infeasible. +//! +//! [Cover Tree](cover_tree/index.html) is a data structure that partitions metric spaces to speed up nearest neighbor search. Cover tree requires \\( O(n) \\) space and +//! has nice theoretical properties: +//! +//! 
* construction time: \\( O(c^6n \log n) \\), +//! * insertion time: \\( O(c^6 \log n) \\), +//! * removal time: \\( O(c^6 \log n) \\), +//! * query time: \\( O(c^{12} \log n) \\), +//! +//! where \\( c \\) is the expansion constant of the dataset. +//! +//! ## References: +//! * ["The Art of Computer Programming" Knuth, D, Vol. 3, 2nd ed, Sorting and Searching, 1998](https://www-cs-faculty.stanford.edu/~knuth/taocp.html) +//! * ["Cover Trees for Nearest Neighbor" Beygelzimer et al., Proceedings of the 23rd international conference on Machine learning, ICML'06 (2006)](https://homes.cs.washington.edu/~sham/papers/ml/cover_tree.pdf) +//! * ["Faster cover trees." Izbicki et al., Proceedings of the 32nd International Conference on Machine Learning, ICML'15 (2015)](http://www.cs.ucr.edu/~cshelton/papers/index.cgi%3FIzbShe15) +//! * ["The Elements of Statistical Learning: Data Mining, Inference, and Prediction" Trevor et al., 2nd edition, chapter 13](https://web.stanford.edu/~hastie/ElemStatLearn/) +//! +//! + pub(crate) mod bbd_tree; +/// tree data structure for fast nearest neighbor search pub mod cover_tree; +/// very simple algorithm that sequentially checks each element of the list until a match is found or the whole list has been searched. pub mod linear_search; diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index 9abedf6..37cd310 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -1,7 +1,5 @@ //! # Nearest Neighbors //! -//! -//! //! The k-nearest neighbors (KNN) algorithm is a simple supervised machine learning algorithm that can be used to solve both classification and regression problems. //! KNN is a non-parametric method that assumes that similar things exist in close proximity. //! @@ -30,6 +28,8 @@ //! ## References: //! * ["Nearest Neighbor Pattern Classification" Cover, T.M., IEEE Transactions on Information Theory (1967)](http://ssg.mit.edu/cal/abs/2000_spring/np_dens/classification/cover67.pdf) //! 
* ["The Elements of Statistical Learning: Data Mining, Inference, and Prediction" Trevor et al., 2nd edition, chapter 13](https://web.stanford.edu/~hastie/ElemStatLearn/) +//! +//! use crate::algorithm::neighbour::cover_tree::CoverTree; use crate::algorithm::neighbour::linear_search::LinearKNNSearch; @@ -43,7 +43,7 @@ pub mod knn_classifier; pub mod knn_regressor; /// Both, KNN classifier and regressor benefits from underlying search algorithms that helps to speed up queries. -/// `KNNAlgorithmName` maintains a list of supported search algorithms +/// `KNNAlgorithmName` maintains a list of supported search algorithms, see [KNN algorithms](../algorithm/neighbour/index.html) #[derive(Serialize, Deserialize, Debug)] pub enum KNNAlgorithmName { /// Heap Search algorithm, see [`LinearSearch`](../algorithm/neighbour/linear_search/index.html)