diff --git a/benches/distance.rs b/benches/distance.rs index 927f8e4..d9020f3 100644 --- a/benches/distance.rs +++ b/benches/distance.rs @@ -4,11 +4,12 @@ extern crate smartcore; use criterion::Criterion; use criterion::black_box; +use smartcore::math::distance::euclidian::*; fn criterion_benchmark(c: &mut Criterion) { let a = vec![1., 2., 3.]; - c.bench_function("Euclidean Distance", move |b| b.iter(|| smartcore::math::distance::euclidian::distance(black_box(&a), black_box(&a)))); + c.bench_function("Euclidean Distance", move |b| b.iter(|| Euclidian::distance(black_box(&a), black_box(&a)))); } criterion_group!(benches, criterion_benchmark); diff --git a/src/algorithm/neighbour/bbd_tree.rs b/src/algorithm/neighbour/bbd_tree.rs index d0cea26..0e52350 100644 --- a/src/algorithm/neighbour/bbd_tree.rs +++ b/src/algorithm/neighbour/bbd_tree.rs @@ -2,7 +2,7 @@ use std::fmt::Debug; use crate::math::num::FloatExt; use crate::linalg::Matrix; -use crate::math::distance::euclidian; +use crate::math::distance::euclidian::*; #[derive(Debug)] pub struct BBDTree { @@ -79,10 +79,10 @@ impl BBDTree { let d = centroids[0].len(); // Determine which mean the node mean is closest to - let mut min_dist = euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[0]]); + let mut min_dist = Euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[0]]); let mut closest = candidates[0]; for i in 1..k { - let dist = euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[i]]); + let dist = Euclidian::squared_distance(&self.nodes[node].center, &centroids[candidates[i]]); if dist < min_dist { min_dist = dist; closest = candidates[i]; diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 600f47a..dd73d18 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -3,25 +3,26 @@ use std::iter::FromIterator; use std::fmt::Debug; use core::hash::{Hash, Hasher}; +use
serde::{Serialize, Deserialize}; + use crate::math::num::FloatExt; -use crate::algorithm::neighbour::KNNAlgorithm; +use crate::math::distance::Distance; use crate::algorithm::sort::heap_select::HeapSelect; -pub struct CoverTree<'a, T, F: FloatExt> -where T: Debug +#[derive(Serialize, Deserialize, Debug)] +pub struct CoverTree> { base: F, max_level: i8, min_level: i8, - distance: &'a dyn Fn(&T, &T) -> F, + distance: D, nodes: Vec> } -impl<'a, T, F: FloatExt> CoverTree<'a, T, F> -where T: Debug +impl> CoverTree { - pub fn new(mut data: Vec, distance: &'a dyn Fn(&T, &T) -> F) -> CoverTree { + pub fn new(mut data: Vec, distance: D) -> CoverTree { let mut tree = CoverTree { base: F::two(), max_level: 100, @@ -43,7 +44,7 @@ where T: Debug } else { let mut parent: Option = Option::None; let mut p_i = 0; - let mut qi_p_ds = vec!((self.root(), (self.distance)(&p, &self.root().data))); + let mut qi_p_ds = vec!((self.root(), D::distance(&p, &self.root().data))); let mut i = self.max_level; loop { let i_d = self.base.powf(F::from(i).unwrap()); @@ -82,6 +83,18 @@ where T: Debug node_id } + pub fn find(&self, p: &T, k: usize) -> Vec{ + let mut qi_p_ds = vec!((self.root(), D::distance(&p, &self.root().data))); + for i in (self.min_level..self.max_level+1).rev() { + let i_d = self.base.powf(F::from(i).unwrap()); + let mut q_p_ds = self.get_children_dist(&p, &qi_p_ds, i); + let d_p_q = self.min_k_by_distance(&mut q_p_ds, k); + qi_p_ds = q_p_ds.into_iter().filter(|(_, d)| d <= &(d_p_q + i_d)).collect(); + } + qi_p_ds.sort_by(|(_, d1), (_, d2)| d1.partial_cmp(d2).unwrap()); + qi_p_ds[..usize::min(qi_p_ds.len(), k)].iter().map(|(n, _)| n.index.index).collect() + } + fn split(&self, p_id: NodeId, r: F, s1: &mut Vec, s2: Option<&mut Vec>) -> (Vec, Vec){ let mut my_near = (Vec::new(), Vec::new()); @@ -102,7 +115,7 @@ where T: Debug let p = &self.nodes.get(p_id.index).unwrap().data; let mut i = 0; while i != s.len() { - let d = (self.distance)(p, &s[i]); + let d = D::distance(p, &s[i]); 
if d <= r { my_near.0.push(s.remove(i)); } else if d > r && d <= F::two() * r{ @@ -156,7 +169,7 @@ where T: Debug let q: Vec<&Node> = qi_p_ds.iter().flat_map(|(n, _)| self.get_child(n, i)).collect(); - children.extend(q.into_iter().map(|n| (n, (self.distance)(&n.data, &p)))); + children.extend(q.into_iter().map(|n| (n, D::distance(&n.data, &p)))); children @@ -180,7 +193,7 @@ where T: Debug } #[allow(dead_code)] - fn check_invariant(&self, invariant: fn(&CoverTree, &Vec<&Node>, &Vec<&Node>, i8) -> ()) { + fn check_invariant(&self, invariant: fn(&CoverTree, &Vec<&Node>, &Vec<&Node>, i8) -> ()) { let mut current_nodes: Vec<&Node> = Vec::new(); current_nodes.push(self.root()); for i in (self.min_level..self.max_level+1).rev() { @@ -193,7 +206,7 @@ where T: Debug } #[allow(dead_code)] - fn nesting_invariant(_: &CoverTree, nodes: &Vec<&Node>, next_nodes: &Vec<&Node>, _: i8) { + fn nesting_invariant(_: &CoverTree, nodes: &Vec<&Node>, next_nodes: &Vec<&Node>, _: i8) { let nodes_set: HashSet<&Node> = HashSet::from_iter(nodes.into_iter().map(|n| *n)); let next_nodes_set: HashSet<&Node> = HashSet::from_iter(next_nodes.into_iter().map(|n| *n)); for n in nodes_set.iter() { @@ -202,11 +215,11 @@ where T: Debug } #[allow(dead_code)] - fn covering_tree(tree: &CoverTree, nodes: &Vec<&Node>, next_nodes: &Vec<&Node>, i: i8) { + fn covering_tree(tree: &CoverTree, nodes: &Vec<&Node>, next_nodes: &Vec<&Node>, i: i8) { let mut p_selected: Vec<&Node> = Vec::new(); for p in next_nodes { for q in nodes { - if (tree.distance)(&p.data, &q.data) <= tree.base.powf(F::from(i).unwrap()) { + if D::distance(&p.data, &q.data) <= tree.base.powf(F::from(i).unwrap()) { p_selected.push(*p); } } @@ -216,11 +229,11 @@ where T: Debug } #[allow(dead_code)] - fn separation(tree: &CoverTree, nodes: &Vec<&Node>, _: &Vec<&Node>, i: i8) { + fn separation(tree: &CoverTree, nodes: &Vec<&Node>, _: &Vec<&Node>, i: i8) { for p in nodes { for q in nodes { if p != q { - assert!((tree.distance)(&p.data, &q.data) > 
tree.base.powf(F::from(i).unwrap())); + assert!(D::distance(&p.data, &q.data) > tree.base.powf(F::from(i).unwrap())); } } } @@ -228,28 +241,12 @@ where T: Debug } -impl<'a, T, F: FloatExt> KNNAlgorithm for CoverTree<'a, T, F> -where T: Debug -{ - fn find(&self, p: &T, k: usize) -> Vec{ - let mut qi_p_ds = vec!((self.root(), (self.distance)(&p, &self.root().data))); - for i in (self.min_level..self.max_level+1).rev() { - let i_d = self.base.powf(F::from(i).unwrap()); - let mut q_p_ds = self.get_children_dist(&p, &qi_p_ds, i); - let d_p_q = self.min_k_by_distance(&mut q_p_ds, k); - qi_p_ds = q_p_ds.into_iter().filter(|(_, d)| d <= &(d_p_q + i_d)).collect(); - } - qi_p_ds.sort_by(|(_, d1), (_, d2)| d1.partial_cmp(d2).unwrap()); - qi_p_ds[..usize::min(qi_p_ds.len(), k)].iter().map(|(n, _)| n.index.index).collect() - } -} - -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] pub struct NodeId { index: usize, } -#[derive(Debug)] +#[derive(Debug, Serialize, Deserialize)] struct Node { index: NodeId, data: T, @@ -280,13 +277,19 @@ mod tests { use super::*; + struct SimpleDistance{} + + impl Distance for SimpleDistance { + fn distance(a: &i32, b: &i32) -> f64 { + (a - b).abs() as f64 + } + } + #[test] fn cover_tree_test() { let data = vec!(1, 2, 3, 4, 5, 6, 7, 8, 9); - let distance = |a: &i32, b: &i32| -> f64 { - (a - b).abs() as f64 - }; - let mut tree = CoverTree::::new(data, &distance); + + let mut tree = CoverTree::new(data, SimpleDistance{}); for d in vec!(10, 11, 12, 13, 14, 15, 16, 17, 18, 19) { tree.insert(d); } @@ -306,10 +309,8 @@ mod tests { #[test] fn test_invariants(){ let data = vec!(1, 2, 3, 4, 5, 6, 7, 8, 9); - let distance = |a: &i32, b: &i32| -> f64 { - (a - b).abs() as f64 - }; - let tree = CoverTree::::new(data, &distance); + + let tree = CoverTree::new(data, SimpleDistance{}); tree.check_invariant(CoverTree::nesting_invariant); tree.check_invariant(CoverTree::covering_tree); 
tree.check_invariant(CoverTree::separation); diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index f7c4f38..aea64aa 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -1,16 +1,28 @@ -use crate::algorithm::neighbour::KNNAlgorithm; -use crate::algorithm::sort::heap_select::HeapSelect; use std::cmp::{Ordering, PartialOrd}; -use num_traits::Float; +use std::marker::PhantomData; +use serde::{Serialize, Deserialize}; -pub struct LinearKNNSearch<'a, T, F: Float> { - distance: Box F + 'a>, - data: Vec +use crate::math::num::FloatExt; +use crate::math::distance::Distance; +use crate::algorithm::sort::heap_select::HeapSelect; + +#[derive(Serialize, Deserialize, Debug)] +pub struct LinearKNNSearch> { + distance: D, + data: Vec, + f: PhantomData } -impl<'a, T, F: Float> KNNAlgorithm for LinearKNNSearch<'a, T, F> -{ - fn find(&self, from: &T, k: usize) -> Vec { +impl> LinearKNNSearch { + pub fn new(data: Vec, distance: D) -> LinearKNNSearch{ + LinearKNNSearch{ + data: data, + distance: distance, + f: PhantomData + } + } + + pub fn find(&self, from: &T, k: usize) -> Vec { if k < 1 || k > self.data.len() { panic!("k should be >= 1 and <= length(data)"); } @@ -19,14 +31,14 @@ impl<'a, T, F: Float> KNNAlgorithm for LinearKNNSearch<'a, T, F> for _ in 0..k { heap.add(KNNPoint{ - distance: Float::infinity(), + distance: F::infinity(), index: None }); } for i in 0..self.data.len() { - let d = (self.distance)(&from, &self.data[i]); + let d = D::distance(&from, &self.data[i]); let datum = heap.peek_mut(); if d < datum.distance { datum.distance = d; @@ -41,43 +53,34 @@ impl<'a, T, F: Float> KNNAlgorithm for LinearKNNSearch<'a, T, F> } } -impl<'a, T, F: Float> LinearKNNSearch<'a, T, F> { - pub fn new(data: Vec, distance: &'a dyn Fn(&T, &T) -> F) -> LinearKNNSearch{ - LinearKNNSearch{ - data: data, - distance: Box::new(distance) - } - } -} - #[derive(Debug)] -struct KNNPoint { +struct 
KNNPoint { distance: F, index: Option } -impl PartialOrd for KNNPoint { +impl PartialOrd for KNNPoint { fn partial_cmp(&self, other: &Self) -> Option { self.distance.partial_cmp(&other.distance) } } -impl PartialEq for KNNPoint { +impl PartialEq for KNNPoint { fn eq(&self, other: &Self) -> bool { self.distance == other.distance } } -impl Eq for KNNPoint {} +impl Eq for KNNPoint {} #[cfg(test)] mod tests { use super::*; - use crate::math::distance::euclidian; + use crate::math::distance::Distances; struct SimpleDistance{} - impl SimpleDistance { + impl Distance for SimpleDistance { fn distance(a: &i32, b: &i32) -> f64 { (a - b).abs() as f64 } @@ -87,13 +90,13 @@ mod tests { fn knn_find() { let data1 = vec!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); - let algorithm1 = LinearKNNSearch::new(data1, &SimpleDistance::distance); + let algorithm1 = LinearKNNSearch::new(data1, SimpleDistance{}); assert_eq!(vec!(1, 2, 0), algorithm1.find(&2, 3)); let data2 = vec!(vec![1., 1.], vec![2., 2.], vec![3., 3.], vec![4., 4.], vec![5., 5.]); - let algorithm2 = LinearKNNSearch::new(data2, &euclidian::distance); + let algorithm2 = LinearKNNSearch::new(data2, Distances::euclidian()); assert_eq!(vec!(2, 3, 1), algorithm2.find(&vec![3., 3.], 3)); } @@ -116,7 +119,7 @@ mod tests { }; let point_inf = KNNPoint{ - distance: Float::infinity(), + distance: std::f64::INFINITY, index: Some(3) }; diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 0ab7e19..9ca8ae9 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -1,12 +1,3 @@ pub mod cover_tree; pub mod linear_search; -pub mod bbd_tree; - -pub enum KNNAlgorithmName { - CoverTree, - LinearSearch, -} - -pub trait KNNAlgorithm{ - fn find(&self, from: &T, k: usize) -> Vec; -} \ No newline at end of file +pub mod bbd_tree; \ No newline at end of file diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 7000405..2f42681 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ 
-8,7 +8,7 @@ use serde::{Serialize, Deserialize}; use crate::math::num::FloatExt; use crate::linalg::Matrix; -use crate::math::distance::euclidian; +use crate::math::distance::euclidian::*; use crate::algorithm::neighbour::bbd_tree::BBDTree; #[derive(Serialize, Deserialize, Debug)] @@ -130,7 +130,7 @@ impl KMeans{ let mut best_cluster = 0; for j in 0..self.k { - let dist = euclidian::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]); + let dist = Euclidian::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]); if dist < min_dist { min_dist = dist; best_cluster = j; @@ -156,7 +156,7 @@ impl KMeans{ // the distance from each sample to its closest center in scores. for i in 0..n { // compute the distance between this sample and the current center - let dist = euclidian::squared_distance(&data.get_row_as_vec(i), &centroid); + let dist = Euclidian::squared_distance(&data.get_row_as_vec(i), &centroid); if dist < d[i] { d[i] = dist; @@ -183,7 +183,7 @@ impl KMeans{ for i in 0..n { // compute the distance between this sample and the current center - let dist = euclidian::squared_distance(&data.get_row_as_vec(i), &centroid); + let dist = Euclidian::squared_distance(&data.get_row_as_vec(i), &centroid); if dist < d[i] { d[i] = dist; diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 68d884c..7537c4b 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -1,8 +1,11 @@ use std::fmt::Debug; + +use serde::{Serialize, Deserialize}; + use crate::math::num::FloatExt; use crate::linalg::{Matrix}; -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct PCA> { eigenvectors: M, eigenvalues: Vec, @@ -11,6 +14,22 @@ pub struct PCA> { pmu: Vec } +impl> PartialEq for PCA { + fn eq(&self, other: &Self) -> bool { + if self.eigenvectors != other.eigenvectors || + self.eigenvalues.len() != other.eigenvalues.len() { + return false + } else { + for i in 0..self.eigenvalues.len() { + if (self.eigenvalues[i] - other.eigenvalues[i]).abs() >
T::epsilon() { + return false + } + } + return true + } + } +} + #[derive(Debug, Clone)] pub struct PCAParameters { use_correlation_matrix: bool @@ -366,5 +385,37 @@ mod tests { assert!(us_arrests_t.abs().approximate_eq(&expected_projection.abs(), 1e-4)); } + + #[test] + fn serde() { + let iris = DenseMatrix::from_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4]]); + + let pca = PCA::new(&iris, 4, Default::default()); + + let deserialized_pca: PCA> = serde_json::from_str(&serde_json::to_string(&pca).unwrap()).unwrap(); + + assert_eq!(pca, deserialized_pca); + + } } \ No newline at end of file diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 4a0835c..28f6227 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -4,12 +4,13 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; +use serde::{Serialize, Deserialize}; use crate::math::num::FloatExt; use crate::linalg::Matrix; use crate::tree::decision_tree_classifier::{DecisionTreeClassifier, DecisionTreeClassifierParameters, SplitCriterion, which_max}; -#[derive(Debug, Clone)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RandomForestClassifierParameters { pub criterion: SplitCriterion, pub max_depth: Option, @@ -19,13 +20,34 @@ pub struct RandomForestClassifierParameters { pub mtry: Option } -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct RandomForestClassifier { parameters: RandomForestClassifierParameters, trees: 
Vec>, classes: Vec } +impl PartialEq for RandomForestClassifier { + fn eq(&self, other: &Self) -> bool { + if self.classes.len() != other.classes.len() || + self.trees.len() != other.trees.len() { + return false + } else { + for i in 0..self.classes.len() { + if (self.classes[i] - other.classes[i]).abs() > T::epsilon() { + return false + } + } + for i in 0..self.trees.len() { + if self.trees[i] != other.trees[i] { + return false + } + } + true + } + } +} + impl Default for RandomForestClassifierParameters { fn default() -> Self { RandomForestClassifierParameters { @@ -171,4 +193,37 @@ mod tests { } + #[test] + fn serde() { + let x = DenseMatrix::from_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4]]); + let y = vec![0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; + + let forest = RandomForestClassifier::fit(&x, &y, Default::default()); + + let deserialized_forest: RandomForestClassifier = bincode::deserialize(&bincode::serialize(&forest).unwrap()).unwrap(); + + assert_eq!(forest, deserialized_forest); + + } + } \ No newline at end of file diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 1d4dec6..a578358 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -4,12 +4,13 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; +use serde::{Serialize, Deserialize}; use crate::math::num::FloatExt; use crate::linalg::Matrix; use 
crate::tree::decision_tree_regressor::{DecisionTreeRegressor, DecisionTreeRegressorParameters}; -#[derive(Debug, Clone)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RandomForestRegressorParameters { pub max_depth: Option, pub min_samples_leaf: usize, @@ -18,7 +19,7 @@ pub struct RandomForestRegressorParameters { pub mtry: Option } -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct RandomForestRegressor { parameters: RandomForestRegressorParameters, trees: Vec> @@ -36,6 +37,21 @@ impl Default for RandomForestRegressorParameters { } } +impl PartialEq for RandomForestRegressor { + fn eq(&self, other: &Self) -> bool { + if self.trees.len() != other.trees.len() { + return false + } else { + for i in 0..self.trees.len() { + if self.trees[i] != other.trees[i] { + return false + } + } + true + } + } +} + impl RandomForestRegressor { pub fn fit>(x: &M, y: &M::RowVector, parameters: RandomForestRegressorParameters) -> RandomForestRegressor { @@ -180,4 +196,33 @@ mod tests { } + #[test] + fn serde() { + let x = DenseMatrix::from_array(&[ + &[ 234.289, 235.6, 159., 107.608, 1947., 60.323], + &[ 259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[ 258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[ 284.599, 335.1, 165., 110.929, 1950., 61.187], + &[ 328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[ 346.999, 193.2, 359.4, 113.27 , 1952., 63.639], + &[ 365.385, 187., 354.7, 115.094, 1953., 64.989], + &[ 363.112, 357.8, 335., 116.219, 1954., 63.761], + &[ 397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[ 419.18 , 282.2, 285.7, 118.734, 1956., 67.857], + &[ 442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[ 444.546, 468.1, 263.7, 121.95 , 1958., 66.513], + &[ 482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[ 502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[ 518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[ 554.894, 400.7, 282.7, 130.081, 1962., 70.551]]); + let y = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 
104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; + + let forest = RandomForestRegressor::fit(&x, &y, Default::default()); + + let deserialized_forest: RandomForestRegressor = bincode::deserialize(&bincode::serialize(&forest).unwrap()).unwrap(); + + assert_eq!(forest, deserialized_forest); + + } + } \ No newline at end of file diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 2fd7506..a1d5e2c 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -21,7 +21,7 @@ pub struct LinearRegression> { impl> PartialEq for LinearRegression { fn eq(&self, other: &Self) -> bool { self.coefficients == other.coefficients && - self.intercept == other.intercept + (self.intercept - other.intercept).abs() <= T::epsilon() } } diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 5116e2d..bd1d9bc 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -42,10 +42,19 @@ struct BinaryObjectiveFunction<'a, T: FloatExt, M: Matrix> { impl> PartialEq for LogisticRegression { fn eq(&self, other: &Self) -> bool { - self.num_classes == other.num_classes && - self.classes == other.classes && - self.num_attributes == other.num_attributes && - self.weights == other.weights + if self.num_classes != other.num_classes || + self.num_attributes != other.num_attributes || + self.classes.len() != other.classes.len() { + return false + } else { + for i in 0..self.classes.len() { + if (self.classes[i] - other.classes[i]).abs() > T::epsilon(){ + return false + } + } + + return self.weights == other.weights + } } } diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index e962b4c..0a6815b 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -1,20 +1,39 @@ +use serde::{Serialize, Deserialize}; + use crate::math::num::FloatExt; -pub fn distance(x: &Vec, y: &Vec) -> T { - return squared_distance(x, y).sqrt(); +use 
super::Distance; + +#[derive(Serialize, Deserialize, Debug)] +pub struct Euclidian { } -pub fn squared_distance(x: &Vec,y: &Vec) -> T { - if x.len() != y.len() { - panic!("Input vector sizes are different."); +impl Euclidian { + pub fn squared_distance(x: &Vec,y: &Vec) -> T { + if x.len() != y.len() { + panic!("Input vector sizes are different."); + } + + let mut sum = T::zero(); + for i in 0..x.len() { + sum = sum + (x[i] - y[i]).powf(T::two()); + } + + sum } - let mut sum = T::zero(); - for i in 0..x.len() { - sum = sum + (x[i] - y[i]).powf(T::two()); + pub fn distance(x: &Vec, y: &Vec) -> T { + Euclidian::squared_distance(x, y).sqrt() } + +} + +impl Distance, T> for Euclidian { + + fn distance(x: &Vec, y: &Vec) -> T { + Self::distance(x, y) + } - return sum; } @@ -27,7 +46,7 @@ mod tests { let a = vec![1., 2., 3.]; let b = vec![4., 5., 6.]; - let d_arr: f64 = distance(&a, &b); + let d_arr: f64 = Euclidian::distance(&a, &b); assert!((d_arr - 5.19615242).abs() < 1e-8); } diff --git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 987f06e..9e29063 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -1 +1,16 @@ -pub mod euclidian; \ No newline at end of file +pub mod euclidian; + +use crate::math::num::FloatExt; + +pub trait Distance{ + fn distance(a: &T, b: &T) -> F; +} + +pub struct Distances{ +} + +impl Distances { + pub fn euclidian() -> euclidian::Euclidian{ + euclidian::Euclidian {} + } +} \ No newline at end of file diff --git a/src/neighbors/knn.rs b/src/neighbors/knn.rs index 3cb96eb..df7a6dd 100644 --- a/src/neighbors/knn.rs +++ b/src/neighbors/knn.rs @@ -1,26 +1,83 @@ +use serde::{Serialize, Deserialize}; + use crate::math::num::FloatExt; +use crate::math::distance::Distance; use crate::linalg::{Matrix, row_iter}; -use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::algorithm::neighbour::linear_search::LinearKNNSearch; use crate::algorithm::neighbour::cover_tree::CoverTree; -pub struct 
KNNClassifier<'a, T: FloatExt> { +#[derive(Serialize, Deserialize, Debug)] +pub struct KNNClassifier, T>> { classes: Vec, y: Vec, - knn_algorithm: Box> + 'a>, - k: usize, + knn_algorithm: KNNAlgorithmV, + k: usize } -impl<'a, T: FloatExt> KNNClassifier<'a, T> { +pub enum KNNAlgorithmName { + LinearSearch, + CoverTree +} - pub fn fit>(x: &M, y: &M::RowVector, k: usize, distance: &'a dyn Fn(&Vec, &Vec) -> T, algorithm: KNNAlgorithmName) -> KNNClassifier<'a, T> { +#[derive(Serialize, Deserialize, Debug)] +pub enum KNNAlgorithmV, T>> { + LinearSearch(LinearKNNSearch, T, D>), + CoverTree(CoverTree, T, D>) +} + +impl KNNAlgorithmName { + + fn fit, T>>(&self, data: Vec>, distance: D) -> KNNAlgorithmV { + match *self { + KNNAlgorithmName::LinearSearch => KNNAlgorithmV::LinearSearch(LinearKNNSearch::new(data, distance)), + KNNAlgorithmName::CoverTree => KNNAlgorithmV::CoverTree(CoverTree::new(data, distance)), + } + } + +} + +impl, T>> KNNAlgorithmV { + fn find(&self, from: &Vec, k: usize) -> Vec{ + match *self { + KNNAlgorithmV::LinearSearch(ref linear) => linear.find(from, k), + KNNAlgorithmV::CoverTree(ref cover) => cover.find(from, k) + } + } +} + + +impl, T>> PartialEq for KNNClassifier { + fn eq(&self, other: &Self) -> bool { + if self.classes.len() != other.classes.len() || + self.k != other.k || + self.y.len() != other.y.len() { + return false + } else { + for i in 0..self.classes.len() { + if (self.classes[i] - other.classes[i]).abs() > T::epsilon() { + return false + } + } + for i in 0..self.y.len() { + if self.y[i] != other.y[i] { + return false + } + } + true + } + } +} + +impl, T>> KNNClassifier { + + pub fn fit>(x: &M, y: &M::RowVector, k: usize, distance: D, algorithm: KNNAlgorithmName) -> KNNClassifier { let y_m = M::from_row_vector(y.clone()); let (_, y_n) = y_m.shape(); let (x_n, _) = x.shape(); - let data = row_iter(x).collect(); + let data = row_iter(x).collect(); let mut yi: Vec = vec![0; y_n]; let classes = y_m.unique(); @@ -32,14 +89,9 @@ impl<'a, T: 
FloatExt> KNNClassifier<'a, T> { assert!(x_n == y_n, format!("Size of x should equal size of y; |x|=[{}], |y|=[{}]", x_n, y_n)); - assert!(k > 1, format!("k should be > 1, k=[{}]", k)); + assert!(k > 1, format!("k should be > 1, k=[{}]", k)); - let knn_algorithm: Box> + 'a> = match algorithm { - KNNAlgorithmName::CoverTree => Box::new(CoverTree::, T>::new(data, distance)), - KNNAlgorithmName::LinearSearch => Box::new(LinearKNNSearch::, T>::new(data, distance)) - }; - - KNNClassifier{classes:classes, y: yi, k: k, knn_algorithm: knn_algorithm} + KNNClassifier{classes:classes, y: yi, k: k, knn_algorithm: algorithm.fit(data, distance)} } @@ -74,8 +126,8 @@ impl<'a, T: FloatExt> KNNClassifier<'a, T> { #[cfg(test)] mod tests { use super::*; - use crate::math::distance::euclidian; - use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::math::distance::Distances; + use crate::linalg::naive::dense_matrix::DenseMatrix; #[test] fn knn_fit_predict() { @@ -85,10 +137,28 @@ mod tests { &[5., 6.], &[7., 8.], &[9., 10.]]); - let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, 3, &euclidian::distance, KNNAlgorithmName::LinearSearch); + let y = vec![2., 2., 2., 3., 3.]; + let knn = KNNClassifier::fit(&x, &y, 3, Distances::euclidian(), KNNAlgorithmName::LinearSearch); let r = knn.predict(&x); assert_eq!(5, Vec::len(&r)); assert_eq!(y.to_vec(), r); } + + #[test] + fn serde() { + let x = DenseMatrix::from_array(&[ + &[1., 2.], + &[3., 4.], + &[5., 6.], + &[7., 8.], + &[9., 10.]]); + let y = vec![2., 2., 2., 3., 3.]; + + let knn = KNNClassifier::fit(&x, &y, 3, Distances::euclidian(), KNNAlgorithmName::CoverTree); + + let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); + + assert_eq!(knn, deserialized_knn); + + } } \ No newline at end of file diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index a5ff657..a204923 100644 --- a/src/tree/decision_tree_classifier.rs +++ 
b/src/tree/decision_tree_classifier.rs @@ -3,11 +3,13 @@ use std::fmt::Debug; use std::marker::PhantomData; use std::collections::LinkedList; +use serde::{Serialize, Deserialize}; + use crate::math::num::FloatExt; use crate::linalg::Matrix; use crate::algorithm::sort::quick_sort::QuickArgSort; -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct DecisionTreeClassifierParameters { pub criterion: SplitCriterion, pub max_depth: Option, @@ -15,7 +17,7 @@ pub struct DecisionTreeClassifierParameters { pub min_samples_split: usize } -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct DecisionTreeClassifier { nodes: Vec>, parameters: DecisionTreeClassifierParameters, @@ -24,24 +26,62 @@ pub struct DecisionTreeClassifier { depth: u16 } -#[derive(Debug, Clone)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub enum SplitCriterion { Gini, Entropy, ClassificationError } -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct Node { index: usize, output: usize, split_feature: usize, - split_value: T, - split_score: T, + split_value: Option, + split_score: Option, true_child: Option, false_child: Option, } +impl PartialEq for DecisionTreeClassifier { + fn eq(&self, other: &Self) -> bool { + if self.depth != other.depth || + self.num_classes != other.num_classes || + self.nodes.len() != other.nodes.len(){ + return false + } else { + for i in 0..self.classes.len() { + if (self.classes[i] - other.classes[i]).abs() > T::epsilon() { + return false + } + } + for i in 0..self.nodes.len() { + if self.nodes[i] != other.nodes[i] { + return false + } + } + return true + } + } +} + +impl PartialEq for Node { + fn eq(&self, other: &Self) -> bool { + self.output == other.output && + self.split_feature == other.split_feature && + match (self.split_value, other.split_value) { + (Some(a), Some(b)) => (a - b).abs() < T::epsilon(), + (None, None) => true, + _ => false, + } && + match (self.split_score, other.split_score) { + (Some(a), Some(b)) 
=> (a - b).abs() < T::epsilon(), + (None, None) => true, + _ => false, + } + } +} impl Default for DecisionTreeClassifierParameters { fn default() -> Self { @@ -60,8 +100,8 @@ impl Node { index: index, output: output, split_feature: 0, - split_value: T::nan(), - split_score: T::nan(), + split_value: Option::None, + split_score: Option::None, true_child: Option::None, false_child: Option::None } @@ -238,7 +278,7 @@ impl DecisionTreeClassifier { if node.true_child == None && node.false_child == None { result = node.output; } else { - if x.get(row, node.split_feature) <= node.split_value { + if x.get(row, node.split_feature) <= node.split_value.unwrap_or(T::nan()) { queue.push_back(node.true_child.unwrap()); } else { queue.push_back(node.false_child.unwrap()); @@ -299,7 +339,7 @@ impl DecisionTreeClassifier { self.find_best_split(visitor, n, &count, &mut false_count, parent_impurity, variables[j]); } - !self.nodes[visitor.node].split_score.is_nan() + self.nodes[visitor.node].split_score != Option::None } @@ -336,10 +376,10 @@ impl DecisionTreeClassifier { let false_label = which_max(false_count); let gain = parent_impurity - T::from(tc).unwrap() / T::from(n).unwrap() * impurity(&self.parameters.criterion, &true_count, tc) - T::from(fc).unwrap() / T::from(n).unwrap() * impurity(&self.parameters.criterion, &false_count, fc); - if self.nodes[visitor.node].split_score.is_nan() || gain > self.nodes[visitor.node].split_score { + if self.nodes[visitor.node].split_score == Option::None || gain > self.nodes[visitor.node].split_score.unwrap() { self.nodes[visitor.node].split_feature = j; - self.nodes[visitor.node].split_value = (visitor.x.get(*i, j) + prevx) / T::two(); - self.nodes[visitor.node].split_score = gain; + self.nodes[visitor.node].split_value = Option::Some((visitor.x.get(*i, j) + prevx) / T::two()); + self.nodes[visitor.node].split_score = Option::Some(gain); visitor.true_child_output = true_label; visitor.false_child_output = false_label; } @@ -360,7 +400,7 @@ 
impl DecisionTreeClassifier { for i in 0..n { if visitor.samples[i] > 0 { - if visitor.x.get(i, self.nodes[visitor.node].split_feature) <= self.nodes[visitor.node].split_value { + if visitor.x.get(i, self.nodes[visitor.node].split_feature) <= self.nodes[visitor.node].split_value.unwrap_or(T::nan()) { true_samples[i] = visitor.samples[i]; tc += true_samples[i]; visitor.samples[i] = 0; @@ -372,8 +412,8 @@ impl DecisionTreeClassifier { if tc < self.parameters.min_samples_leaf || fc < self.parameters.min_samples_leaf { self.nodes[visitor.node].split_feature = 0; - self.nodes[visitor.node].split_value = T::nan(); - self.nodes[visitor.node].split_score = T::nan(); + self.nodes[visitor.node].split_value = Option::None; + self.nodes[visitor.node].split_score = Option::None; return false; } @@ -477,4 +517,37 @@ mod tests { assert_eq!(y, DecisionTreeClassifier::fit(&x, &y, Default::default()).predict(&x)); } + + #[test] + fn serde() { + let x = DenseMatrix::from_array(&[ + &[1.,1.,1.,0.], + &[1.,1.,1.,0.], + &[1.,1.,1.,1.], + &[1.,1.,0.,0.], + &[1.,1.,0.,1.], + &[1.,0.,1.,0.], + &[1.,0.,1.,0.], + &[1.,0.,1.,1.], + &[1.,0.,0.,0.], + &[1.,0.,0.,1.], + &[0.,1.,1.,0.], + &[0.,1.,1.,0.], + &[0.,1.,1.,1.], + &[0.,1.,0.,0.], + &[0.,1.,0.,1.], + &[0.,0.,1.,0.], + &[0.,0.,1.,0.], + &[0.,0.,1.,1.], + &[0.,0.,0.,0.], + &[0.,0.,0.,1.]]); + let y = vec![1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0.]; + + let tree = DecisionTreeClassifier::fit(&x, &y, Default::default()); + + let deserialized_tree: DecisionTreeClassifier = bincode::deserialize(&bincode::serialize(&tree).unwrap()).unwrap(); + + assert_eq!(tree, deserialized_tree); + + } } \ No newline at end of file diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index a0c574a..62b3bef 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -2,31 +2,33 @@ use std::default::Default; use std::fmt::Debug; use 
std::collections::LinkedList; +use serde::{Serialize, Deserialize}; + use crate::math::num::FloatExt; use crate::linalg::Matrix; use crate::algorithm::sort::quick_sort::QuickArgSort; -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct DecisionTreeRegressorParameters { pub max_depth: Option, pub min_samples_leaf: usize, pub min_samples_split: usize } -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct DecisionTreeRegressor { nodes: Vec>, parameters: DecisionTreeRegressorParameters, depth: u16 } -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct Node { index: usize, output: T, split_feature: usize, - split_value: T, - split_score: T, + split_value: Option, + split_score: Option, true_child: Option, false_child: Option, } @@ -48,14 +50,46 @@ impl Node { index: index, output: output, split_feature: 0, - split_value: T::nan(), - split_score: T::nan(), + split_value: Option::None, + split_score: Option::None, true_child: Option::None, false_child: Option::None } } } +impl PartialEq for Node { + fn eq(&self, other: &Self) -> bool { + (self.output - other.output).abs() < T::epsilon() && + self.split_feature == other.split_feature && + match (self.split_value, other.split_value) { + (Some(a), Some(b)) => (a - b).abs() < T::epsilon(), + (None, None) => true, + _ => false, + } && + match (self.split_score, other.split_score) { + (Some(a), Some(b)) => (a - b).abs() < T::epsilon(), + (None, None) => true, + _ => false, + } + } +} + +impl PartialEq for DecisionTreeRegressor { + fn eq(&self, other: &Self) -> bool { + if self.depth != other.depth || self.nodes.len() != other.nodes.len(){ + return false + } else { + for i in 0..self.nodes.len() { + if self.nodes[i] != other.nodes[i] { + return false + } + } + return true + } + } +} + struct NodeVisitor<'a, T: FloatExt, M: Matrix> { x: &'a M, y: &'a M, @@ -169,7 +203,7 @@ impl DecisionTreeRegressor { if node.true_child == None && node.false_child == None { result = 
node.output; } else { - if x.get(row, node.split_feature) <= node.split_value { + if x.get(row, node.split_feature) <= node.split_value.unwrap_or(T::nan()) { queue.push_back(node.true_child.unwrap()); } else { queue.push_back(node.false_child.unwrap()); @@ -207,7 +241,7 @@ impl DecisionTreeRegressor { self.find_best_split(visitor, n, sum, parent_gain, variables[j]); } - !self.nodes[visitor.node].split_score.is_nan() + self.nodes[visitor.node].split_score != Option::None } @@ -240,10 +274,10 @@ impl DecisionTreeRegressor { let gain = (T::from(true_count).unwrap() * true_mean * true_mean + T::from(false_count).unwrap() * false_mean * false_mean) - parent_gain; - if self.nodes[visitor.node].split_score.is_nan() || gain > self.nodes[visitor.node].split_score { + if self.nodes[visitor.node].split_score == Option::None || gain > self.nodes[visitor.node].split_score.unwrap() { self.nodes[visitor.node].split_feature = j; - self.nodes[visitor.node].split_value = (visitor.x.get(*i, j) + prevx) / T::two(); - self.nodes[visitor.node].split_score = gain; + self.nodes[visitor.node].split_value = Option::Some((visitor.x.get(*i, j) + prevx) / T::two()); + self.nodes[visitor.node].split_score = Option::Some(gain); visitor.true_child_output = true_mean; visitor.false_child_output = false_mean; } @@ -264,7 +298,7 @@ impl DecisionTreeRegressor { for i in 0..n { if visitor.samples[i] > 0 { - if visitor.x.get(i, self.nodes[visitor.node].split_feature) <= self.nodes[visitor.node].split_value { + if visitor.x.get(i, self.nodes[visitor.node].split_feature) <= self.nodes[visitor.node].split_value.unwrap_or(T::nan()) { true_samples[i] = visitor.samples[i]; tc += true_samples[i]; visitor.samples[i] = 0; @@ -276,8 +310,8 @@ impl DecisionTreeRegressor { if tc < self.parameters.min_samples_leaf || fc < self.parameters.min_samples_leaf { self.nodes[visitor.node].split_feature = 0; - self.nodes[visitor.node].split_value = T::nan(); - self.nodes[visitor.node].split_score = T::nan(); + 
self.nodes[visitor.node].split_value = Option::None; + self.nodes[visitor.node].split_score = Option::None; return false; } @@ -357,4 +391,33 @@ mod tests { } + #[test] + fn serde() { + let x = DenseMatrix::from_array(&[ + &[ 234.289, 235.6, 159., 107.608, 1947., 60.323], + &[ 259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[ 258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[ 284.599, 335.1, 165., 110.929, 1950., 61.187], + &[ 328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[ 346.999, 193.2, 359.4, 113.27 , 1952., 63.639], + &[ 365.385, 187., 354.7, 115.094, 1953., 64.989], + &[ 363.112, 357.8, 335., 116.219, 1954., 63.761], + &[ 397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[ 419.18 , 282.2, 285.7, 118.734, 1956., 67.857], + &[ 442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[ 444.546, 468.1, 263.7, 121.95 , 1958., 66.513], + &[ 482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[ 502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[ 518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[ 554.894, 400.7, 282.7, 130.081, 1962., 70.551]]); + let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; + + let tree = DecisionTreeRegressor::fit(&x, &y, Default::default()); + + let deserialized_tree: DecisionTreeRegressor = bincode::deserialize(&bincode::serialize(&tree).unwrap()).unwrap(); + + assert_eq!(tree, deserialized_tree); + + } + } \ No newline at end of file