Merge potential next release v0.4 (#187) Breaking Changes

* First draft of the new n-dimensional arrays + NB use case
* Improves default implementation of multiple Array methods
* Refactors tree methods
* Adds matrix decomposition routines
* Adds matrix decomposition methods to ndarray and nalgebra bindings
* Refactoring + linear regression now uses array2
* Ridge & Linear regression
* LBFGS optimizer & logistic regression
* LBFGS optimizer & logistic regression
* Changes linear methods, metrics and model selection methods to new n-dimensional arrays
* Switches KNN and clustering algorithms to new n-d array layer
* Refactors distance metrics
* Optimizes knn and clustering methods
* Refactors metrics module
* Switches decomposition methods to n-dimensional arrays
* Linalg refactoring - cleanup rng merge (#172)
* Remove legacy DenseMatrix and BaseMatrix implementation. Port the new Number, FloatNumber and Array implementation into module structure.
* Exclude AUC metrics. Needs reimplementation
* Improve developers walkthrough

New traits system in place at `src/numbers` and `src/linalg`
Co-authored-by: Lorenzo <tunedconsulting@gmail.com>

* Provide SupervisedEstimator with a constructor to avoid explicit dynamic box allocation in 'cross_validate' and 'cross_validate_predict' as required by the use of 'dyn' as per Rust 2021
* Implement getters to use as_ref() in src/neighbors
* Implement getters to use as_ref() in src/naive_bayes
* Implement getters to use as_ref() in src/linear
* Add Clone to src/naive_bayes
* Change signature for cross_validate and other model_selection functions to abide by the use of dyn in Rust 2021
* Implement ndarray-bindings. Remove FloatNumber from implementations
* Drop nalgebra-bindings support (as decided in conf-call to go for ndarray)
* Remove benches. Benches will have their own repo at smartcore-benches
* Implement SVC
* Implement SVC serialization. Move search parameters in dedicated module
* Implement SVR. Definitely too slow
* Fix compilation issues for wasm (#202)

Co-authored-by: Luis Moreno <morenol@users.noreply.github.com>
* Fix tests (#203)

* Port linalg/traits/stats.rs
* Improve methods naming
* Improve Display for DenseMatrix

Co-authored-by: Montana Low <montanalow@users.noreply.github.com>
Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
This commit is contained in:
Lorenzo
2022-10-31 10:44:57 +00:00
committed by GitHub
parent bb71656137
commit 52eb6ce023
110 changed files with 10327 additions and 9107 deletions
+57 -55
View File
@@ -1,45 +1,45 @@
use std::fmt::Debug;
use crate::linalg::Matrix;
use crate::math::distance::euclidian::*;
use crate::math::num::RealNumber;
use crate::linalg::basic::arrays::Array2;
use crate::metrics::distance::euclidian::*;
use crate::numbers::basenum::Number;
#[derive(Debug)]
pub struct BBDTree<T: RealNumber> {
nodes: Vec<BBDTreeNode<T>>,
pub struct BBDTree {
nodes: Vec<BBDTreeNode>,
index: Vec<usize>,
root: usize,
}
#[derive(Debug)]
struct BBDTreeNode<T: RealNumber> {
struct BBDTreeNode {
count: usize,
index: usize,
center: Vec<T>,
radius: Vec<T>,
sum: Vec<T>,
cost: T,
center: Vec<f64>,
radius: Vec<f64>,
sum: Vec<f64>,
cost: f64,
lower: Option<usize>,
upper: Option<usize>,
}
impl<T: RealNumber> BBDTreeNode<T> {
fn new(d: usize) -> BBDTreeNode<T> {
impl BBDTreeNode {
fn new(d: usize) -> BBDTreeNode {
BBDTreeNode {
count: 0,
index: 0,
center: vec![T::zero(); d],
radius: vec![T::zero(); d],
sum: vec![T::zero(); d],
cost: T::zero(),
center: vec![0f64; d],
radius: vec![0f64; d],
sum: vec![0f64; d],
cost: 0f64,
lower: Option::None,
upper: Option::None,
}
}
}
impl<T: RealNumber> BBDTree<T> {
pub fn new<M: Matrix<T>>(data: &M) -> BBDTree<T> {
impl BBDTree {
pub fn new<T: Number, M: Array2<T>>(data: &M) -> BBDTree {
let nodes = Vec::new();
let (n, _) = data.shape();
@@ -61,18 +61,18 @@ impl<T: RealNumber> BBDTree<T> {
pub(crate) fn clustering(
&self,
centroids: &[Vec<T>],
sums: &mut Vec<Vec<T>>,
centroids: &[Vec<f64>],
sums: &mut Vec<Vec<f64>>,
counts: &mut Vec<usize>,
membership: &mut Vec<usize>,
) -> T {
) -> f64 {
let k = centroids.len();
counts.iter_mut().for_each(|v| *v = 0);
let mut candidates = vec![0; k];
for i in 0..k {
candidates[i] = i;
sums[i].iter_mut().for_each(|v| *v = T::zero());
sums[i].iter_mut().for_each(|v| *v = 0f64);
}
self.filter(
@@ -89,13 +89,13 @@ impl<T: RealNumber> BBDTree<T> {
fn filter(
&self,
node: usize,
centroids: &[Vec<T>],
centroids: &[Vec<f64>],
candidates: &[usize],
k: usize,
sums: &mut Vec<Vec<T>>,
sums: &mut Vec<Vec<f64>>,
counts: &mut Vec<usize>,
membership: &mut Vec<usize>,
) -> T {
) -> f64 {
let d = centroids[0].len();
let mut min_dist =
@@ -163,9 +163,9 @@ impl<T: RealNumber> BBDTree<T> {
}
fn prune(
center: &[T],
radius: &[T],
centroids: &[Vec<T>],
center: &[f64],
radius: &[f64],
centroids: &[Vec<f64>],
best_index: usize,
test_index: usize,
) -> bool {
@@ -177,22 +177,22 @@ impl<T: RealNumber> BBDTree<T> {
let best = &centroids[best_index];
let test = &centroids[test_index];
let mut lhs = T::zero();
let mut rhs = T::zero();
let mut lhs = 0f64;
let mut rhs = 0f64;
for i in 0..d {
let diff = test[i] - best[i];
lhs += diff * diff;
if diff > T::zero() {
if diff > 0f64 {
rhs += (center[i] + radius[i] - best[i]) * diff;
} else {
rhs += (center[i] - radius[i] - best[i]) * diff;
}
}
lhs >= T::two() * rhs
lhs >= 2f64 * rhs
}
fn build_node<M: Matrix<T>>(&mut self, data: &M, begin: usize, end: usize) -> usize {
fn build_node<T: Number, M: Array2<T>>(&mut self, data: &M, begin: usize, end: usize) -> usize {
let (_, d) = data.shape();
let mut node = BBDTreeNode::new(d);
@@ -200,17 +200,17 @@ impl<T: RealNumber> BBDTree<T> {
node.count = end - begin;
node.index = begin;
let mut lower_bound = vec![T::zero(); d];
let mut upper_bound = vec![T::zero(); d];
let mut lower_bound = vec![0f64; d];
let mut upper_bound = vec![0f64; d];
for i in 0..d {
lower_bound[i] = data.get(self.index[begin], i);
upper_bound[i] = data.get(self.index[begin], i);
lower_bound[i] = data.get((self.index[begin], i)).to_f64().unwrap();
upper_bound[i] = data.get((self.index[begin], i)).to_f64().unwrap();
}
for i in begin..end {
for j in 0..d {
let c = data.get(self.index[i], j);
let c = data.get((self.index[i], j)).to_f64().unwrap();
if lower_bound[j] > c {
lower_bound[j] = c;
}
@@ -220,32 +220,32 @@ impl<T: RealNumber> BBDTree<T> {
}
}
let mut max_radius = T::from(-1.).unwrap();
let mut max_radius = -1f64;
let mut split_index = 0;
for i in 0..d {
node.center[i] = (lower_bound[i] + upper_bound[i]) / T::two();
node.radius[i] = (upper_bound[i] - lower_bound[i]) / T::two();
node.center[i] = (lower_bound[i] + upper_bound[i]) / 2f64;
node.radius[i] = (upper_bound[i] - lower_bound[i]) / 2f64;
if node.radius[i] > max_radius {
max_radius = node.radius[i];
split_index = i;
}
}
if max_radius < T::from(1E-10).unwrap() {
if max_radius < 1E-10 {
node.lower = Option::None;
node.upper = Option::None;
for i in 0..d {
node.sum[i] = data.get(self.index[begin], i);
node.sum[i] = data.get((self.index[begin], i)).to_f64().unwrap();
}
if end > begin + 1 {
let len = end - begin;
for i in 0..d {
node.sum[i] *= T::from(len).unwrap();
node.sum[i] *= len as f64;
}
}
node.cost = T::zero();
node.cost = 0f64;
return self.add_node(node);
}
@@ -254,8 +254,10 @@ impl<T: RealNumber> BBDTree<T> {
let mut i2 = end - 1;
let mut size = 0;
while i1 <= i2 {
let mut i1_good = data.get(self.index[i1], split_index) < split_cutoff;
let mut i2_good = data.get(self.index[i2], split_index) >= split_cutoff;
let mut i1_good =
data.get((self.index[i1], split_index)).to_f64().unwrap() < split_cutoff;
let mut i2_good =
data.get((self.index[i2], split_index)).to_f64().unwrap() >= split_cutoff;
if !i1_good && !i2_good {
self.index.swap(i1, i2);
@@ -281,9 +283,9 @@ impl<T: RealNumber> BBDTree<T> {
self.nodes[node.lower.unwrap()].sum[i] + self.nodes[node.upper.unwrap()].sum[i];
}
let mut mean = vec![T::zero(); d];
let mut mean = vec![0f64; d];
for (i, mean_i) in mean.iter_mut().enumerate().take(d) {
*mean_i = node.sum[i] / T::from(node.count).unwrap();
*mean_i = node.sum[i] / node.count as f64;
}
node.cost = BBDTree::node_cost(&self.nodes[node.lower.unwrap()], &mean)
@@ -292,17 +294,17 @@ impl<T: RealNumber> BBDTree<T> {
self.add_node(node)
}
fn node_cost(node: &BBDTreeNode<T>, center: &[T]) -> T {
fn node_cost(node: &BBDTreeNode, center: &[f64]) -> f64 {
let d = center.len();
let mut scatter = T::zero();
let mut scatter = 0f64;
for (i, center_i) in center.iter().enumerate().take(d) {
let x = (node.sum[i] / T::from(node.count).unwrap()) - *center_i;
let x = (node.sum[i] / node.count as f64) - *center_i;
scatter += x * x;
}
node.cost + T::from(node.count).unwrap() * scatter
node.cost + node.count as f64 * scatter
}
fn add_node(&mut self, new_node: BBDTreeNode<T>) -> usize {
fn add_node(&mut self, new_node: BBDTreeNode) -> usize {
let idx = self.nodes.len();
self.nodes.push(new_node);
idx
@@ -312,7 +314,7 @@ impl<T: RealNumber> BBDTree<T> {
#[cfg(test)]
mod tests {
use super::*;
use crate::linalg::naive::dense_matrix::DenseMatrix;
use crate::linalg::basic::matrix::DenseMatrix;
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
#[test]
+74 -75
View File
@@ -4,12 +4,12 @@
//!
//! ```
//! use smartcore::algorithm::neighbour::cover_tree::*;
//! use smartcore::math::distance::Distance;
//! use smartcore::metrics::distance::Distance;
//!
//! #[derive(Clone)]
//! struct SimpleDistance {} // Our distance function
//!
//! impl Distance<i32, f64> for SimpleDistance {
//! impl Distance<i32> for SimpleDistance {
//! fn distance(&self, a: &i32, b: &i32) -> f64 { // simple symmetrical scalar distance
//! (a - b).abs() as f64
//! }
@@ -29,28 +29,27 @@ use serde::{Deserialize, Serialize};
use crate::algorithm::sort::heap_select::HeapSelection;
use crate::error::{Failed, FailedError};
use crate::math::distance::Distance;
use crate::math::num::RealNumber;
use crate::metrics::distance::Distance;
/// Implements Cover Tree algorithm
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug)]
pub struct CoverTree<T, F: RealNumber, D: Distance<T, F>> {
base: F,
inv_log_base: F,
pub struct CoverTree<T, D: Distance<T>> {
base: f64,
inv_log_base: f64,
distance: D,
root: Node<F>,
root: Node,
data: Vec<T>,
identical_excluded: bool,
}
impl<T, F: RealNumber, D: Distance<T, F>> PartialEq for CoverTree<T, F, D> {
impl<T, D: Distance<T>> PartialEq for CoverTree<T, D> {
fn eq(&self, other: &Self) -> bool {
if self.data.len() != other.data.len() {
return false;
}
for i in 0..self.data.len() {
if self.distance.distance(&self.data[i], &other.data[i]) != F::zero() {
if self.distance.distance(&self.data[i], &other.data[i]) != 0f64 {
return false;
}
}
@@ -60,36 +59,36 @@ impl<T, F: RealNumber, D: Distance<T, F>> PartialEq for CoverTree<T, F, D> {
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug)]
struct Node<F: RealNumber> {
struct Node {
idx: usize,
max_dist: F,
parent_dist: F,
children: Vec<Node<F>>,
_scale: i64,
max_dist: f64,
parent_dist: f64,
children: Vec<Node>,
scale: i64,
}
#[derive(Debug)]
struct DistanceSet<F: RealNumber> {
struct DistanceSet {
idx: usize,
dist: Vec<F>,
dist: Vec<f64>,
}
impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D> {
impl<T: Debug + PartialEq, D: Distance<T>> CoverTree<T, D> {
/// Construct a cover tree.
/// * `data` - vector of data points to search for.
/// * `distance` - distance metric to use for searching. This type should implement the [`Distance`](../../../metrics/distance/index.html) trait.
pub fn new(data: Vec<T>, distance: D) -> Result<CoverTree<T, F, D>, Failed> {
let base = F::from_f64(1.3).unwrap();
pub fn new(data: Vec<T>, distance: D) -> Result<CoverTree<T, D>, Failed> {
let base = 1.3f64;
let root = Node {
idx: 0,
max_dist: F::zero(),
parent_dist: F::zero(),
max_dist: 0f64,
parent_dist: 0f64,
children: Vec::new(),
_scale: 0,
scale: 0,
};
let mut tree = CoverTree {
base,
inv_log_base: F::one() / base.ln(),
inv_log_base: 1f64 / base.ln(),
distance,
root,
data,
@@ -104,7 +103,7 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
/// Find k nearest neighbors of `p`
/// * `p` - look for k nearest points to `p`
/// * `k` - the number of nearest neighbors to return
pub fn find(&self, p: &T, k: usize) -> Result<Vec<(usize, F, &T)>, Failed> {
pub fn find(&self, p: &T, k: usize) -> Result<Vec<(usize, f64, &T)>, Failed> {
if k == 0 {
return Err(Failed::because(FailedError::FindFailed, "k should be > 0"));
}
@@ -119,13 +118,13 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
let e = self.get_data_value(self.root.idx);
let mut d = self.distance.distance(e, p);
let mut current_cover_set: Vec<(F, &Node<F>)> = Vec::new();
let mut zero_set: Vec<(F, &Node<F>)> = Vec::new();
let mut current_cover_set: Vec<(f64, &Node)> = Vec::new();
let mut zero_set: Vec<(f64, &Node)> = Vec::new();
current_cover_set.push((d, &self.root));
let mut heap = HeapSelection::with_capacity(k);
heap.add(F::max_value());
heap.add(std::f64::MAX);
let mut empty_heap = true;
if !self.identical_excluded || self.get_data_value(self.root.idx) != p {
@@ -134,7 +133,7 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
}
while !current_cover_set.is_empty() {
let mut next_cover_set: Vec<(F, &Node<F>)> = Vec::new();
let mut next_cover_set: Vec<(f64, &Node)> = Vec::new();
for par in current_cover_set {
let parent = par.1;
for c in 0..parent.children.len() {
@@ -146,7 +145,7 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
}
let upper_bound = if empty_heap {
F::infinity()
std::f64::INFINITY
} else {
*heap.peek()
};
@@ -169,7 +168,7 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
current_cover_set = next_cover_set;
}
let mut neighbors: Vec<(usize, F, &T)> = Vec::new();
let mut neighbors: Vec<(usize, f64, &T)> = Vec::new();
let upper_bound = *heap.peek();
for ds in zero_set {
if ds.0 <= upper_bound {
@@ -189,25 +188,25 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
/// Find all nearest neighbors within radius `radius` from `p`
/// * `p` - look for all points that lie within `radius` of `p`
/// * `radius` - radius of the search
pub fn find_radius(&self, p: &T, radius: F) -> Result<Vec<(usize, F, &T)>, Failed> {
if radius <= F::zero() {
pub fn find_radius(&self, p: &T, radius: f64) -> Result<Vec<(usize, f64, &T)>, Failed> {
if radius <= 0f64 {
return Err(Failed::because(
FailedError::FindFailed,
"radius should be > 0",
));
}
let mut neighbors: Vec<(usize, F, &T)> = Vec::new();
let mut neighbors: Vec<(usize, f64, &T)> = Vec::new();
let mut current_cover_set: Vec<(F, &Node<F>)> = Vec::new();
let mut zero_set: Vec<(F, &Node<F>)> = Vec::new();
let mut current_cover_set: Vec<(f64, &Node)> = Vec::new();
let mut zero_set: Vec<(f64, &Node)> = Vec::new();
let e = self.get_data_value(self.root.idx);
let mut d = self.distance.distance(e, p);
current_cover_set.push((d, &self.root));
while !current_cover_set.is_empty() {
let mut next_cover_set: Vec<(F, &Node<F>)> = Vec::new();
let mut next_cover_set: Vec<(f64, &Node)> = Vec::new();
for par in current_cover_set {
let parent = par.1;
for c in 0..parent.children.len() {
@@ -240,23 +239,23 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
Ok(neighbors)
}
fn new_leaf(&self, idx: usize) -> Node<F> {
fn new_leaf(&self, idx: usize) -> Node {
Node {
idx,
max_dist: F::zero(),
parent_dist: F::zero(),
max_dist: 0f64,
parent_dist: 0f64,
children: Vec::new(),
_scale: 100,
scale: 100,
}
}
fn build_cover_tree(&mut self) {
let mut point_set: Vec<DistanceSet<F>> = Vec::new();
let mut consumed_set: Vec<DistanceSet<F>> = Vec::new();
let mut point_set: Vec<DistanceSet> = Vec::new();
let mut consumed_set: Vec<DistanceSet> = Vec::new();
let point = &self.data[0];
let idx = 0;
let mut max_dist = -F::one();
let mut max_dist = -1f64;
for i in 1..self.data.len() {
let dist = self.distance.distance(point, &self.data[i]);
@@ -284,16 +283,16 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
p: usize,
max_scale: i64,
top_scale: i64,
point_set: &mut Vec<DistanceSet<F>>,
consumed_set: &mut Vec<DistanceSet<F>>,
) -> Node<F> {
point_set: &mut Vec<DistanceSet>,
consumed_set: &mut Vec<DistanceSet>,
) -> Node {
if point_set.is_empty() {
self.new_leaf(p)
} else {
let max_dist = self.max(point_set);
let next_scale = (max_scale - 1).min(self.get_scale(max_dist));
if next_scale == std::i64::MIN {
let mut children: Vec<Node<F>> = Vec::new();
let mut children: Vec<Node> = Vec::new();
let mut leaf = self.new_leaf(p);
children.push(leaf);
while !point_set.is_empty() {
@@ -304,13 +303,13 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
}
Node {
idx: p,
max_dist: F::zero(),
parent_dist: F::zero(),
max_dist: 0f64,
parent_dist: 0f64,
children,
_scale: 100,
scale: 100,
}
} else {
let mut far: Vec<DistanceSet<F>> = Vec::new();
let mut far: Vec<DistanceSet> = Vec::new();
self.split(point_set, &mut far, max_scale);
let child = self.batch_insert(p, next_scale, top_scale, point_set, consumed_set);
@@ -319,14 +318,14 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
point_set.append(&mut far);
child
} else {
let mut children: Vec<Node<F>> = vec![child];
let mut new_point_set: Vec<DistanceSet<F>> = Vec::new();
let mut new_consumed_set: Vec<DistanceSet<F>> = Vec::new();
let mut children: Vec<Node> = vec![child];
let mut new_point_set: Vec<DistanceSet> = Vec::new();
let mut new_consumed_set: Vec<DistanceSet> = Vec::new();
while !point_set.is_empty() {
let set: DistanceSet<F> = point_set.remove(point_set.len() - 1);
let set: DistanceSet = point_set.remove(point_set.len() - 1);
let new_dist: F = set.dist[set.dist.len() - 1];
let new_dist = set.dist[set.dist.len() - 1];
self.dist_split(
point_set,
@@ -374,9 +373,9 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
Node {
idx: p,
max_dist: self.max(consumed_set),
parent_dist: F::zero(),
parent_dist: 0f64,
children,
_scale: (top_scale - max_scale),
scale: (top_scale - max_scale),
}
}
}
@@ -385,12 +384,12 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
fn split(
&self,
point_set: &mut Vec<DistanceSet<F>>,
far_set: &mut Vec<DistanceSet<F>>,
point_set: &mut Vec<DistanceSet>,
far_set: &mut Vec<DistanceSet>,
max_scale: i64,
) {
let fmax = self.get_cover_radius(max_scale);
let mut new_set: Vec<DistanceSet<F>> = Vec::new();
let mut new_set: Vec<DistanceSet> = Vec::new();
for n in point_set.drain(0..) {
if n.dist[n.dist.len() - 1] <= fmax {
new_set.push(n);
@@ -404,13 +403,13 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
fn dist_split(
&self,
point_set: &mut Vec<DistanceSet<F>>,
new_point_set: &mut Vec<DistanceSet<F>>,
point_set: &mut Vec<DistanceSet>,
new_point_set: &mut Vec<DistanceSet>,
new_point: &T,
max_scale: i64,
) {
let fmax = self.get_cover_radius(max_scale);
let mut new_set: Vec<DistanceSet<F>> = Vec::new();
let mut new_set: Vec<DistanceSet> = Vec::new();
for mut n in point_set.drain(0..) {
let new_dist = self
.distance
@@ -426,24 +425,24 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
point_set.append(&mut new_set);
}
fn get_cover_radius(&self, s: i64) -> F {
self.base.powf(F::from_i64(s).unwrap())
fn get_cover_radius(&self, s: i64) -> f64 {
self.base.powf(s as f64)
}
/// Returns a reference to the data point stored at position `idx` in `self.data`.
/// Indexing panics if `idx` is out of bounds.
fn get_data_value(&self, idx: usize) -> &T {
&self.data[idx]
}
fn get_scale(&self, d: F) -> i64 {
if d == F::zero() {
fn get_scale(&self, d: f64) -> i64 {
if d == 0f64 {
std::i64::MIN
} else {
(self.inv_log_base * d.ln()).ceil().to_i64().unwrap()
(self.inv_log_base * d.ln()).ceil() as i64
}
}
fn max(&self, distance_set: &[DistanceSet<F>]) -> F {
let mut max = F::zero();
fn max(&self, distance_set: &[DistanceSet]) -> f64 {
let mut max = 0f64;
for n in distance_set {
if max < n.dist[n.dist.len() - 1] {
max = n.dist[n.dist.len() - 1];
@@ -457,13 +456,13 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
mod tests {
use super::*;
use crate::math::distance::Distances;
use crate::metrics::distance::Distances;
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
struct SimpleDistance {}
impl Distance<i32, f64> for SimpleDistance {
impl Distance<i32> for SimpleDistance {
fn distance(&self, a: &i32, b: &i32) -> f64 {
(a - b).abs() as f64
}
@@ -513,7 +512,7 @@ mod tests {
let tree = CoverTree::new(data, SimpleDistance {}).unwrap();
let deserialized_tree: CoverTree<i32, f64, SimpleDistance> =
let deserialized_tree: CoverTree<i32, SimpleDistance> =
serde_json::from_str(&serde_json::to_string(&tree).unwrap()).unwrap();
assert_eq!(tree, deserialized_tree);
+1 -1
View File
@@ -9,7 +9,7 @@ use std::cmp::{Eq, Ordering, PartialOrd};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use crate::math::num::RealNumber;
use crate::numbers::realnum::RealNumber;
///
/// The edge of the subgraph is defined by `PairwiseDistance`.
+12 -11
View File
@@ -27,9 +27,10 @@ use std::collections::HashMap;
use crate::algorithm::neighbour::distances::PairwiseDistance;
use crate::error::{Failed, FailedError};
use crate::linalg::Matrix;
use crate::math::distance::euclidian::Euclidian;
use crate::math::num::RealNumber;
use crate::linalg::basic::arrays::Array2;
use crate::metrics::distance::euclidian::Euclidian;
use crate::numbers::realnum::RealNumber;
use crate::numbers::floatnum::FloatNumber;
///
/// Inspired by Python implementation:
@@ -39,7 +40,7 @@ use crate::math::num::RealNumber;
/// affinity used is Euclidean so to allow linkage with single, ward, complete and average
///
#[derive(Debug, Clone)]
pub struct FastPair<'a, T: RealNumber, M: Matrix<T>> {
pub struct FastPair<'a, T: RealNumber + FloatNumber, M: Array2<T>> {
/// initial matrix
samples: &'a M,
/// closest pair hashmap (connectivity matrix for closest pairs)
@@ -48,7 +49,7 @@ pub struct FastPair<'a, T: RealNumber, M: Matrix<T>> {
pub neighbours: Vec<usize>,
}
impl<'a, T: RealNumber, M: Matrix<T>> FastPair<'a, T, M> {
impl<'a, T: RealNumber + FloatNumber, M: Array2<T>> FastPair<'a, T, M> {
///
/// Constructor
/// Instantiate and initialise the algorithm
@@ -72,7 +73,7 @@ impl<'a, T: RealNumber, M: Matrix<T>> FastPair<'a, T, M> {
}
///
/// Initialise `FastPair` by passing a `Matrix`.
/// Initialise `FastPair` by passing an `Array2`.
/// Build a FastPairs data-structure from a set of (new) points.
///
fn init(&mut self) {
@@ -96,8 +97,8 @@ impl<'a, T: RealNumber, M: Matrix<T>> FastPair<'a, T, M> {
index_row_i,
PairwiseDistance {
node: index_row_i,
neighbour: None,
distance: Some(T::max_value()),
neighbour: Option::None,
distance: Some(T::MAX),
},
);
}
@@ -142,7 +143,7 @@ impl<'a, T: RealNumber, M: Matrix<T>> FastPair<'a, T, M> {
// compute sparse matrix (connectivity matrix)
let mut sparse_matrix = M::zeros(len, len);
for (_, p) in distances.iter() {
sparse_matrix.set(p.node, p.neighbour.unwrap(), p.distance.unwrap());
sparse_matrix.set((p.node, p.neighbour.unwrap()), p.distance.unwrap());
}
self.distances = distances;
@@ -180,7 +181,7 @@ impl<'a, T: RealNumber, M: Matrix<T>> FastPair<'a, T, M> {
let mut closest_pair = PairwiseDistance {
node: 0,
neighbour: None,
neighbour: Option::None,
distance: Some(T::max_value()),
};
for pair in (0..m).combinations(2) {
@@ -549,7 +550,7 @@ mod tests_fastpair {
let mut min_dissimilarity = PairwiseDistance {
node: 0,
neighbour: None,
neighbour: Option::None,
distance: Some(f64::MAX),
};
for p in dissimilarities.iter() {
+20 -27
View File
@@ -3,12 +3,12 @@
//! see [KNN algorithms](../index.html)
//! ```
//! use smartcore::algorithm::neighbour::linear_search::*;
//! use smartcore::math::distance::Distance;
//! use smartcore::metrics::distance::Distance;
//!
//! #[derive(Clone)]
//! struct SimpleDistance {} // Our distance function
//!
//! impl Distance<i32, f64> for SimpleDistance {
//! impl Distance<i32> for SimpleDistance {
//! fn distance(&self, a: &i32, b: &i32) -> f64 { // simple symmetrical scalar distance
//! (a - b).abs() as f64
//! }
@@ -25,38 +25,31 @@
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::cmp::{Ordering, PartialOrd};
use std::marker::PhantomData;
use crate::algorithm::sort::heap_select::HeapSelection;
use crate::error::{Failed, FailedError};
use crate::math::distance::Distance;
use crate::math::num::RealNumber;
use crate::metrics::distance::Distance;
/// Implements Linear Search algorithm, see [KNN algorithms](../index.html)
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug)]
pub struct LinearKNNSearch<T, F: RealNumber, D: Distance<T, F>> {
pub struct LinearKNNSearch<T, D: Distance<T>> {
distance: D,
data: Vec<T>,
f: PhantomData<F>,
}
impl<T, F: RealNumber, D: Distance<T, F>> LinearKNNSearch<T, F, D> {
impl<T, D: Distance<T>> LinearKNNSearch<T, D> {
/// Initializes algorithm.
/// * `data` - vector of data points to search for.
/// * `distance` - distance metric to use for searching. This type should implement the [`Distance`](../../../metrics/distance/index.html) trait.
pub fn new(data: Vec<T>, distance: D) -> Result<LinearKNNSearch<T, F, D>, Failed> {
Ok(LinearKNNSearch {
data,
distance,
f: PhantomData,
})
pub fn new(data: Vec<T>, distance: D) -> Result<LinearKNNSearch<T, D>, Failed> {
Ok(LinearKNNSearch { data, distance })
}
/// Find k nearest neighbors
/// * `from` - look for k nearest points to `from`
/// * `k` - the number of nearest neighbors to return
pub fn find(&self, from: &T, k: usize) -> Result<Vec<(usize, F, &T)>, Failed> {
pub fn find(&self, from: &T, k: usize) -> Result<Vec<(usize, f64, &T)>, Failed> {
if k < 1 || k > self.data.len() {
return Err(Failed::because(
FailedError::FindFailed,
@@ -64,11 +57,11 @@ impl<T, F: RealNumber, D: Distance<T, F>> LinearKNNSearch<T, F, D> {
));
}
let mut heap = HeapSelection::<KNNPoint<F>>::with_capacity(k);
let mut heap = HeapSelection::<KNNPoint>::with_capacity(k);
for _ in 0..k {
heap.add(KNNPoint {
distance: F::infinity(),
distance: std::f64::INFINITY,
index: None,
});
}
@@ -93,15 +86,15 @@ impl<T, F: RealNumber, D: Distance<T, F>> LinearKNNSearch<T, F, D> {
/// Find all nearest neighbors within radius `radius` from `from`
/// * `from` - look for all points that lie within `radius` of `from`
/// * `radius` - radius of the search
pub fn find_radius(&self, from: &T, radius: F) -> Result<Vec<(usize, F, &T)>, Failed> {
if radius <= F::zero() {
pub fn find_radius(&self, from: &T, radius: f64) -> Result<Vec<(usize, f64, &T)>, Failed> {
if radius <= 0f64 {
return Err(Failed::because(
FailedError::FindFailed,
"radius should be > 0",
));
}
let mut neighbors: Vec<(usize, F, &T)> = Vec::new();
let mut neighbors: Vec<(usize, f64, &T)> = Vec::new();
for i in 0..self.data.len() {
let d = self.distance.distance(from, &self.data[i]);
@@ -116,35 +109,35 @@ impl<T, F: RealNumber, D: Distance<T, F>> LinearKNNSearch<T, F, D> {
}
#[derive(Debug)]
struct KNNPoint<F: RealNumber> {
distance: F,
struct KNNPoint {
distance: f64,
index: Option<usize>,
}
impl<F: RealNumber> PartialOrd for KNNPoint<F> {
impl PartialOrd for KNNPoint {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
self.distance.partial_cmp(&other.distance)
}
}
impl<F: RealNumber> PartialEq for KNNPoint<F> {
impl PartialEq for KNNPoint {
fn eq(&self, other: &Self) -> bool {
self.distance == other.distance
}
}
impl<F: RealNumber> Eq for KNNPoint<F> {}
impl Eq for KNNPoint {}
#[cfg(test)]
mod tests {
use super::*;
use crate::math::distance::Distances;
use crate::metrics::distance::Distances;
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
struct SimpleDistance {}
impl Distance<i32, f64> for SimpleDistance {
impl Distance<i32> for SimpleDistance {
fn distance(&self, a: &i32, b: &i32) -> f64 {
(a - b).abs() as f64
}
+12 -11
View File
@@ -33,8 +33,8 @@
use crate::algorithm::neighbour::cover_tree::CoverTree;
use crate::algorithm::neighbour::linear_search::LinearKNNSearch;
use crate::error::Failed;
use crate::math::distance::Distance;
use crate::math::num::RealNumber;
use crate::metrics::distance::Distance;
use crate::numbers::basenum::Number;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
@@ -44,7 +44,7 @@ pub mod cover_tree;
/// dissimilarities for vector-vector distance. Linkage algorithms used in fastpair
pub mod distances;
/// fastpair closest neighbour algorithm
pub mod fastpair;
// pub mod fastpair;
/// very simple algorithm that sequentially checks each element of the list until a match is found or the whole list has been searched.
pub mod linear_search;
@@ -67,13 +67,14 @@ impl Default for KNNAlgorithmName {
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug)]
pub(crate) enum KNNAlgorithm<T: RealNumber, D: Distance<Vec<T>, T>> {
LinearSearch(LinearKNNSearch<Vec<T>, T, D>),
CoverTree(CoverTree<Vec<T>, T, D>),
pub(crate) enum KNNAlgorithm<T: Number, D: Distance<Vec<T>>> {
LinearSearch(LinearKNNSearch<Vec<T>, D>),
CoverTree(CoverTree<Vec<T>, D>),
}
// TODO: missing documentation
impl KNNAlgorithmName {
pub(crate) fn fit<T: RealNumber, D: Distance<Vec<T>, T>>(
pub(crate) fn fit<T: Number, D: Distance<Vec<T>>>(
&self,
data: Vec<Vec<T>>,
distance: D,
@@ -89,8 +90,8 @@ impl KNNAlgorithmName {
}
}
impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNAlgorithm<T, D> {
pub fn find(&self, from: &Vec<T>, k: usize) -> Result<Vec<(usize, T, &Vec<T>)>, Failed> {
impl<T: Number, D: Distance<Vec<T>>> KNNAlgorithm<T, D> {
pub fn find(&self, from: &Vec<T>, k: usize) -> Result<Vec<(usize, f64, &Vec<T>)>, Failed> {
match *self {
KNNAlgorithm::LinearSearch(ref linear) => linear.find(from, k),
KNNAlgorithm::CoverTree(ref cover) => cover.find(from, k),
@@ -100,8 +101,8 @@ impl<T: RealNumber, D: Distance<Vec<T>, T>> KNNAlgorithm<T, D> {
pub fn find_radius(
&self,
from: &Vec<T>,
radius: T,
) -> Result<Vec<(usize, T, &Vec<T>)>, Failed> {
radius: f64,
) -> Result<Vec<(usize, f64, &Vec<T>)>, Failed> {
match *self {
KNNAlgorithm::LinearSearch(ref linear) => linear.find_radius(from, radius),
KNNAlgorithm::CoverTree(ref cover) => cover.find_radius(from, radius),
+2 -2
View File
@@ -1,4 +1,4 @@
use num_traits::Float;
use num_traits::Num;
pub trait QuickArgSort {
fn quick_argsort_mut(&mut self) -> Vec<usize>;
@@ -6,7 +6,7 @@ pub trait QuickArgSort {
fn quick_argsort(&self) -> Vec<usize>;
}
impl<T: Float> QuickArgSort for Vec<T> {
impl<T: Num + PartialOrd + Copy> QuickArgSort for Vec<T> {
fn quick_argsort(&self) -> Vec<usize> {
let mut v = self.clone();
v.quick_argsort_mut()