From c43990e932b97ba65f748b58aed5eb3665de5011 Mon Sep 17 00:00:00 2001 From: Vadim Zaliva Date: Fri, 2 Oct 2020 14:04:01 -0700 Subject: [PATCH 01/79] + DBSCAN and data generator. Improves KNN API --- Cargo.toml | 1 + src/algorithm/neighbour/cover_tree.rs | 65 +++++- src/algorithm/neighbour/linear_search.rs | 45 +++- src/algorithm/neighbour/mod.rs | 60 ++++++ src/cluster/dbscan.rs | 252 +++++++++++++++++++++++ src/cluster/mod.rs | 1 + src/dataset/generator.rs | 129 ++++++++++++ src/dataset/mod.rs | 1 + src/neighbors/knn_classifier.rs | 3 +- src/neighbors/knn_regressor.rs | 3 +- src/neighbors/mod.rs | 49 +---- 11 files changed, 556 insertions(+), 53 deletions(-) create mode 100644 src/cluster/dbscan.rs create mode 100644 src/dataset/generator.rs diff --git a/Cargo.toml b/Cargo.toml index 0a6e32d..20eebf5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ nalgebra = { version = "0.22.0", optional = true } num-traits = "0.2.12" num = "0.3.0" rand = "0.7.3" +rand_distr = "0.3.0" serde = { version = "1.0.115", features = ["derive"] } serde_derive = "1.0.115" diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 70a3d33..da870d2 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -100,7 +100,7 @@ impl> CoverTree /// Find k nearest neighbors of `p` /// * `p` - look for k nearest points to `p` /// * `k` - the number of nearest neighbors to return - pub fn find(&self, p: &T, k: usize) -> Result, Failed> { + pub fn find(&self, p: &T, k: usize) -> Result, Failed> { if k <= 0 { return Err(Failed::because(FailedError::FindFailed, "k should be > 0")); } @@ -164,13 +164,13 @@ impl> CoverTree current_cover_set = next_cover_set; } - let mut neighbors: Vec<(usize, F)> = Vec::new(); + let mut neighbors: Vec<(usize, F, &T)> = Vec::new(); let upper_bound = *heap.peek(); for ds in zero_set { if ds.0 <= upper_bound { let v = self.get_data_value(ds.1.idx); if !self.identical_excluded || v != p 
{ - neighbors.push((ds.1.idx, ds.0)); + neighbors.push((ds.1.idx, ds.0, &v)); } } } @@ -178,6 +178,60 @@ impl> CoverTree Ok(neighbors.into_iter().take(k).collect()) } + /// Find all nearest neighbors within radius `radius` from `p` + /// * `p` - look for k nearest points to `p` + /// * `radius` - radius of the search + pub fn find_radius(&self, p: &T, radius: F) -> Result, Failed> { + if radius <= F::zero() { + return Err(Failed::because( + FailedError::FindFailed, + "radius should be > 0", + )); + } + + let mut neighbors: Vec<(usize, F, &T)> = Vec::new(); + + let mut current_cover_set: Vec<(F, &Node)> = Vec::new(); + let mut zero_set: Vec<(F, &Node)> = Vec::new(); + + let e = self.get_data_value(self.root.idx); + let mut d = self.distance.distance(&e, p); + current_cover_set.push((d, &self.root)); + + while !current_cover_set.is_empty() { + let mut next_cover_set: Vec<(F, &Node)> = Vec::new(); + for par in current_cover_set { + let parent = par.1; + for c in 0..parent.children.len() { + let child = &parent.children[c]; + if c == 0 { + d = par.0; + } else { + d = self.distance.distance(self.get_data_value(child.idx), p); + } + + if d <= radius + child.max_dist { + if !child.children.is_empty() { + next_cover_set.push((d, child)); + } else if d <= radius { + zero_set.push((d, child)); + } + } + } + } + current_cover_set = next_cover_set; + } + + for ds in zero_set { + let v = self.get_data_value(ds.1.idx); + if !self.identical_excluded || v != p { + neighbors.push((ds.1.idx, ds.0, &v)); + } + } + + Ok(neighbors) + } + fn new_leaf(&self, idx: usize) -> Node { Node { idx: idx, @@ -417,6 +471,11 @@ mod tests { knn.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); let knn: Vec = knn.iter().map(|v| v.0).collect(); assert_eq!(vec!(3, 4, 5), knn); + + let mut knn = tree.find_radius(&5, 2.0).unwrap(); + knn.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + let knn: Vec = knn.iter().map(|v| *v.2).collect(); + assert_eq!(vec!(3, 4, 5, 6, 7), knn); } #[test] diff --git 
a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index 3ac1a2b..e89a793 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -26,7 +26,7 @@ use std::cmp::{Ordering, PartialOrd}; use std::marker::PhantomData; use crate::algorithm::sort::heap_select::HeapSelection; -use crate::error::Failed; +use crate::error::{Failed, FailedError}; use crate::math::distance::Distance; use crate::math::num::RealNumber; @@ -53,9 +53,12 @@ impl> LinearKNNSearch { /// Find k nearest neighbors /// * `from` - look for k nearest points to `from` /// * `k` - the number of nearest neighbors to return - pub fn find(&self, from: &T, k: usize) -> Result, Failed> { + pub fn find(&self, from: &T, k: usize) -> Result, Failed> { if k < 1 || k > self.data.len() { - panic!("k should be >= 1 and <= length(data)"); + return Err(Failed::because( + FailedError::FindFailed, + "k should be >= 1 and <= length(data)", + )); } let mut heap = HeapSelection::>::with_capacity(k); @@ -80,9 +83,33 @@ impl> LinearKNNSearch { Ok(heap .get() .into_iter() - .flat_map(|x| x.index.map(|i| (i, x.distance))) + .flat_map(|x| x.index.map(|i| (i, x.distance, &self.data[i]))) .collect()) } + + /// Find all nearest neighbors within radius `radius` from `p` + /// * `p` - look for k nearest points to `p` + /// * `radius` - radius of the search + pub fn find_radius(&self, from: &T, radius: F) -> Result, Failed> { + if radius <= F::zero() { + return Err(Failed::because( + FailedError::FindFailed, + "radius should be > 0", + )); + } + + let mut neighbors: Vec<(usize, F, &T)> = Vec::new(); + + for i in 0..self.data.len() { + let d = self.distance.distance(&from, &self.data[i]); + + if d <= radius { + neighbors.push((i, d, &self.data[i])); + } + } + + Ok(neighbors) + } } #[derive(Debug)] @@ -134,6 +161,16 @@ mod tests { assert_eq!(vec!(0, 1, 2), found_idxs1); + let mut found_idxs1: Vec = algorithm1 + .find_radius(&5, 3.0) + .unwrap() + .iter() + 
.map(|v| *v.2) + .collect(); + found_idxs1.sort(); + + assert_eq!(vec!(2, 3, 4, 5, 6, 7, 8), found_idxs1); + let data2 = vec![ vec![1., 1.], vec![2., 2.], diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 48c8835..0a4f21a 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -29,8 +29,68 @@ //! //! +use crate::algorithm::neighbour::cover_tree::CoverTree; +use crate::algorithm::neighbour::linear_search::LinearKNNSearch; +use crate::error::Failed; +use crate::math::distance::Distance; +use crate::math::num::RealNumber; +use serde::{Deserialize, Serialize}; + pub(crate) mod bbd_tree; /// tree data structure for fast nearest neighbor search pub mod cover_tree; /// very simple algorithm that sequentially checks each element of the list until a match is found or the whole list has been searched. pub mod linear_search; + +/// Both, KNN classifier and regressor benefits from underlying search algorithms that helps to speed up queries. 
+/// `KNNAlgorithmName` maintains a list of supported search algorithms, see [KNN algorithms](../algorithm/neighbour/index.html) +#[derive(Serialize, Deserialize, Debug, Clone)] +pub enum KNNAlgorithmName { + /// Heap Search algorithm, see [`LinearSearch`](../algorithm/neighbour/linear_search/index.html) + LinearSearch, + /// Cover Tree Search algorithm, see [`CoverTree`](../algorithm/neighbour/cover_tree/index.html) + CoverTree, +} + +#[derive(Serialize, Deserialize, Debug)] +pub(crate) enum KNNAlgorithm, T>> { + LinearSearch(LinearKNNSearch, T, D>), + CoverTree(CoverTree, T, D>), +} + +impl KNNAlgorithmName { + pub(crate) fn fit, T>>( + &self, + data: Vec>, + distance: D, + ) -> Result, Failed> { + match *self { + KNNAlgorithmName::LinearSearch => { + LinearKNNSearch::new(data, distance).map(|a| KNNAlgorithm::LinearSearch(a)) + } + KNNAlgorithmName::CoverTree => { + CoverTree::new(data, distance).map(|a| KNNAlgorithm::CoverTree(a)) + } + } + } +} + +impl, T>> KNNAlgorithm { + pub fn find(&self, from: &Vec, k: usize) -> Result)>, Failed> { + match *self { + KNNAlgorithm::LinearSearch(ref linear) => linear.find(from, k), + KNNAlgorithm::CoverTree(ref cover) => cover.find(from, k), + } + } + + pub fn find_radius( + &self, + from: &Vec, + radius: T, + ) -> Result)>, Failed> { + match *self { + KNNAlgorithm::LinearSearch(ref linear) => linear.find_radius(from, radius), + KNNAlgorithm::CoverTree(ref cover) => cover.find_radius(from, radius), + } + } +} diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs new file mode 100644 index 0000000..488a7ac --- /dev/null +++ b/src/cluster/dbscan.rs @@ -0,0 +1,252 @@ +//! # DBSCAN Clustering +//! +//! DBSCAN - Density-Based Spatial Clustering of Applications with Noise. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::cluster::dbscan::*; +//! use smartcore::math::distance::Distances; +//! use smartcore::neighbors::KNNAlgorithmName; +//! 
use smartcore::dataset::generator; +//! +//! // Generate three blobs +//! let blobs = generator::make_blobs(100, 2, 3); +//! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data); +//! // Fit the algorithm and predict cluster labels +//! let labels = DBSCAN::fit(&x, Distances::euclidian(), DBSCANParameters{ +//! min_samples: 5, +//! eps: 3.0, +//! algorithm: KNNAlgorithmName::CoverTree +//! }).and_then(|dbscan| dbscan.predict(&x)); +//! +//! println!("{:?}", labels); +//! ``` +//! +//! ## References: +//! +//! * ["A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", Ester M., Kriegel HP., Sander J., Xu X.](http://faculty.marshall.usc.edu/gareth-james/ISL/) +//! * ["Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and its Applications", Sander J., Ester M., Kriegel HP., Xu X.](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.1629&rep=rep1&type=pdf) + +extern crate rand; + +use std::fmt::Debug; +use std::iter::Sum; + +use serde::{Deserialize, Serialize}; + +use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::error::Failed; +use crate::linalg::{row_iter, Matrix}; +use crate::math::distance::Distance; +use crate::math::num::RealNumber; +use crate::tree::decision_tree_classifier::which_max; + +/// DBSCAN clustering algorithm +#[derive(Serialize, Deserialize, Debug)] +pub struct DBSCAN, T>> { + cluster_labels: Vec, + num_classes: usize, + knn_algorithm: KNNAlgorithm, + eps: T, +} + +#[derive(Debug, Clone)] +/// DBSCAN clustering algorithm parameters +pub struct DBSCANParameters { + /// The number of samples in a neighborhood for a point to be considered as a core point. + pub min_samples: usize, + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. + pub eps: T, + /// KNN algorithm to use.
+ pub algorithm: KNNAlgorithmName, +} + +impl, T>> PartialEq for DBSCAN { + fn eq(&self, other: &Self) -> bool { + self.cluster_labels.len() == other.cluster_labels.len() + && self.num_classes == other.num_classes + && self.eps == other.eps + && self.cluster_labels == other.cluster_labels + } +} + +impl Default for DBSCANParameters { + fn default() -> Self { + DBSCANParameters { + min_samples: 5, + eps: T::half(), + algorithm: KNNAlgorithmName::CoverTree, + } + } +} + +impl, T>> DBSCAN { + /// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features. + /// * `x` - training instances to cluster + /// * `distance` - distance metric used for the neighborhood search + /// * `parameters` - cluster parameters + pub fn fit>( + x: &M, + distance: D, + parameters: DBSCANParameters, + ) -> Result, Failed> { + if parameters.min_samples < 1 { + return Err(Failed::fit(&format!("Invalid minPts"))); + } + + if parameters.eps <= T::zero() { + return Err(Failed::fit(&format!("Invalid radius: "))); + } + + let mut k = 0; + let unassigned = -2; + let outlier = -1; + + let n = x.shape().0; + let mut y = vec![unassigned; n]; + + let algo = parameters.algorithm.fit(row_iter(x).collect(), distance)?; + + for (i, e) in row_iter(x).enumerate() { + if y[i] == unassigned { + let mut neighbors = algo.find_radius(&e, parameters.eps)?; + if neighbors.len() < parameters.min_samples { + y[i] = outlier; + } else { + y[i] = k; + for j in 0..neighbors.len() { + if y[neighbors[j].0] == unassigned { + y[neighbors[j].0] = k; + + let mut secondary_neighbors = + algo.find_radius(neighbors[j].2, parameters.eps)?; + + if secondary_neighbors.len() >= parameters.min_samples { + neighbors.append(&mut secondary_neighbors); + } + } + + if y[neighbors[j].0] == outlier { + y[neighbors[j].0] = k; + } + } + k += 1; + } + } + } + + Ok(DBSCAN { + cluster_labels: y, + num_classes: k as usize, + knn_algorithm: algo, + eps: parameters.eps, + }) + } + + /// Predict clusters for `x` + /// * `x` - matrix with new data to
transform of size _KxM_ , where _K_ is number of new samples and _M_ is number of features. + pub fn predict>(&self, x: &M) -> Result { + let (n, m) = x.shape(); + let mut result = M::zeros(1, n); + let mut row = vec![T::zero(); m]; + + for i in 0..n { + x.copy_row_as_vec(i, &mut row); + let neighbors = self.knn_algorithm.find_radius(&row, self.eps)?; + let mut label = vec![0usize; self.num_classes + 1]; + for neighbor in neighbors { + let yi = self.cluster_labels[neighbor.0]; + if yi < 0 { + label[self.num_classes] += 1; + } else { + label[yi as usize] += 1; + } + } + let class = which_max(&label); + if class != self.num_classes { + result.set(0, i, T::from(class).unwrap()); + } else { + result.set(0, i, -T::one()); + } + } + + Ok(result.to_row_vector()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::math::distance::euclidian::Euclidian; + use crate::math::distance::Distances; + + #[test] + fn fit_predict_dbscan() { + let x = DenseMatrix::from_2d_array(&[ + &[1.0, 2.0], + &[1.1, 2.1], + &[0.9, 1.9], + &[1.2, 1.2], + &[0.8, 1.8], + &[2.0, 1.0], + &[2.1, 1.1], + &[2.2, 1.2], + &[1.9, 0.9], + &[1.8, 0.8], + &[3.0, 5.0], + ]); + + let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0]; + + let dbscan = DBSCAN::fit( + &x, + Distances::euclidian(), + DBSCANParameters { + min_samples: 5, + eps: 1.0, + algorithm: KNNAlgorithmName::CoverTree, + }, + ) + .unwrap(); + + let predicted_labels = dbscan.predict(&x).unwrap(); + + assert_eq!(expected_labels, predicted_labels); + } + + #[test] + fn serde() { + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 
4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + + let dbscan = DBSCAN::fit(&x, Distances::euclidian(), Default::default()).unwrap(); + + let deserialized_dbscan: DBSCAN = + serde_json::from_str(&serde_json::to_string(&dbscan).unwrap()).unwrap(); + + assert_eq!(dbscan, deserialized_dbscan); + } +} diff --git a/src/cluster/mod.rs b/src/cluster/mod.rs index 3201cda..be6ef9f 100644 --- a/src/cluster/mod.rs +++ b/src/cluster/mod.rs @@ -3,5 +3,6 @@ //! Clustering is the type of unsupervised learning where you divide the population or data points into a number of groups such that data points in the same groups //! are more similar to other data points in the same group than those in other groups. In simple words, the aim is to segregate groups with similar traits and assign them into clusters. +pub mod dbscan; /// An iterative clustering algorithm that aims to find local maxima in each iteration. pub mod kmeans; diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs new file mode 100644 index 0000000..fd4f400 --- /dev/null +++ b/src/dataset/generator.rs @@ -0,0 +1,129 @@ +//! # Dataset Generators +//! 
+use rand::distributions::Uniform; +use rand::prelude::*; +use rand_distr::Normal; + +use crate::dataset::Dataset; + +/// Generate `num_centers` clusters of normally distributed points +pub fn make_blobs( + num_samples: usize, + num_features: usize, + num_centers: usize, +) -> Dataset { + let center_box = Uniform::from(-10.0..10.0); + let cluster_std = 1.0; + let mut centers: Vec>> = Vec::with_capacity(num_centers); + + let mut rng = rand::thread_rng(); + for _ in 0..num_centers { + centers.push( + (0..num_features) + .map(|_| Normal::new(center_box.sample(&mut rng), cluster_std).unwrap()) + .collect(), + ); + } + + let mut y: Vec = Vec::with_capacity(num_samples); + let mut x: Vec = Vec::with_capacity(num_samples); + + for i in 0..num_samples { + let label = i % num_centers; + y.push(label as f32); + for j in 0..num_features { + x.push(centers[label][j].sample(&mut rng)); + } + } + + Dataset { + data: x, + target: y, + num_samples: num_samples, + num_features: num_features, + feature_names: (0..num_features).map(|n| n.to_string()).collect(), + target_names: vec!["label".to_string()], + description: "Isotropic Gaussian blobs".to_string(), + } +} + +/// Make a large circle containing a smaller circle in 2d. 
+pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset { + if factor >= 1.0 || factor < 0.0 { + panic!("'factor' has to be between 0 and 1."); + } + + let num_samples_out = num_samples / 2; + let num_samples_in = num_samples - num_samples_out; + + let linspace_out = linspace(0.0, 2.0 * std::f32::consts::PI, num_samples_out); + let linspace_in = linspace(0.0, 2.0 * std::f32::consts::PI, num_samples_in); + + println!("{:?}", linspace_out); + println!("{:?}", linspace_in); + let noise = Normal::new(0.0, noise).unwrap(); + let mut rng = rand::thread_rng(); + + let mut x: Vec = Vec::with_capacity(num_samples * 2); + let mut y: Vec = Vec::with_capacity(num_samples); + + for v in linspace_out { + x.push(v.cos() + noise.sample(&mut rng)); + x.push(v.sin() + noise.sample(&mut rng)); + y.push(0.0); + } + + for v in linspace_in { + x.push(v.cos() * factor + noise.sample(&mut rng)); + x.push(v.sin() * factor + noise.sample(&mut rng)); + y.push(1.0); + } + + Dataset { + data: x, + target: y, + num_samples: num_samples, + num_features: 2, + feature_names: (0..2).map(|n| n.to_string()).collect(), + target_names: vec!["label".to_string()], + description: "Large circle containing a smaller circle in 2d".to_string(), + } +} + +fn linspace(start: f32, stop: f32, num: usize) -> Vec { + let div = num as f32; + let delta = stop - start; + let step = delta / div; + (0..num).map(|v| v as f32 * step).collect() +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_make_blobs() { + let dataset = make_blobs(10, 2, 3); + assert_eq!( + dataset.data.len(), + dataset.num_features * dataset.num_samples + ); + assert_eq!(dataset.target.len(), dataset.num_samples); + assert_eq!(dataset.num_features, 2); + assert_eq!(dataset.num_samples, 10); + } + + #[test] + fn test_make_circles() { + let dataset = make_circles(10, 0.5, 0.05); + println!("{:?}", dataset.as_matrix()); + assert_eq!( + dataset.data.len(), + dataset.num_features * dataset.num_samples + ); + 
assert_eq!(dataset.target.len(), dataset.num_samples); + assert_eq!(dataset.num_features, 2); + assert_eq!(dataset.num_samples, 10); + } +} diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 8d7a4e2..bfcd1c9 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -5,6 +5,7 @@ pub mod boston; pub mod breast_cancer; pub mod diabetes; pub mod digits; +pub mod generator; pub mod iris; use crate::math::num::RealNumber; diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 1f75949..3ad4297 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -34,11 +34,12 @@ use serde::{Deserialize, Serialize}; +use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; use crate::math::distance::Distance; use crate::math::num::RealNumber; -use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName, KNNWeightFunction}; +use crate::neighbors::KNNWeightFunction; /// `KNNClassifier` parameters. Use `Default::default()` for default values. #[derive(Serialize, Deserialize, Debug)] diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 5c979a3..04fbd35 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -36,11 +36,12 @@ //! use serde::{Deserialize, Serialize}; +use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::error::Failed; use crate::linalg::{row_iter, BaseVector, Matrix}; use crate::math::distance::Distance; use crate::math::num::RealNumber; -use crate::neighbors::{KNNAlgorithm, KNNAlgorithmName, KNNWeightFunction}; +use crate::neighbors::KNNWeightFunction; /// `KNNRegressor` parameters. Use `Default::default()` for default values. #[derive(Serialize, Deserialize, Debug)] diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index 8251117..6d542f6 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -32,10 +32,6 @@ //! //! 
-use crate::algorithm::neighbour::cover_tree::CoverTree; -use crate::algorithm::neighbour::linear_search::LinearKNNSearch; -use crate::error::Failed; -use crate::math::distance::Distance; use crate::math::num::RealNumber; use serde::{Deserialize, Serialize}; @@ -44,15 +40,12 @@ pub mod knn_classifier; /// K Nearest Neighbors Regressor pub mod knn_regressor; -/// Both, KNN classifier and regressor benefits from underlying search algorithms that helps to speed up queries. /// `KNNAlgorithmName` maintains a list of supported search algorithms, see [KNN algorithms](../algorithm/neighbour/index.html) -#[derive(Serialize, Deserialize, Debug)] -pub enum KNNAlgorithmName { - /// Heap Search algorithm, see [`LinearSearch`](../algorithm/neighbour/linear_search/index.html) - LinearSearch, - /// Cover Tree Search algorithm, see [`CoverTree`](../algorithm/neighbour/cover_tree/index.html) - CoverTree, -} +#[deprecated( + since = "0.2.0", + note = "please use `smartcore::algorithm::neighbour::KNNAlgorithmName` instead" +)] +pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName; /// Weight function that is used to determine estimated value. 
#[derive(Serialize, Deserialize, Debug)] @@ -63,12 +56,6 @@ pub enum KNNWeightFunction { Distance, } -#[derive(Serialize, Deserialize, Debug)] -enum KNNAlgorithm, T>> { - LinearSearch(LinearKNNSearch, T, D>), - CoverTree(CoverTree, T, D>), -} - impl KNNWeightFunction { fn calc_weights(&self, distances: Vec) -> std::vec::Vec { match *self { @@ -88,29 +75,3 @@ impl KNNWeightFunction { } } } - -impl KNNAlgorithmName { - fn fit, T>>( - &self, - data: Vec>, - distance: D, - ) -> Result, Failed> { - match *self { - KNNAlgorithmName::LinearSearch => { - LinearKNNSearch::new(data, distance).map(|a| KNNAlgorithm::LinearSearch(a)) - } - KNNAlgorithmName::CoverTree => { - CoverTree::new(data, distance).map(|a| KNNAlgorithm::CoverTree(a)) - } - } - } -} - -impl, T>> KNNAlgorithm { - fn find(&self, from: &Vec, k: usize) -> Result, Failed> { - match *self { - KNNAlgorithm::LinearSearch(ref linear) => linear.find(from, k), - KNNAlgorithm::CoverTree(ref cover) => cover.find(from, k), - } - } -} From a2588f6f459b6138fb40aa2b29dd456478e5f29d Mon Sep 17 00:00:00 2001 From: Lorenzo Date: Tue, 13 Oct 2020 10:10:28 +0100 Subject: [PATCH 02/79] KFold cross-validation (#8) * Add documentation and API * Add public keyword * Implement test_indices (debug version) * Return indices as Vec of Vec * Consume vector using drain() * Use shape() to return num of samples * Implement test_masks * Implement KFold.split() * Make trait public * Add test for split * Fix samples in shape() * Implement shuffle * Simplify return values * Use usize for n_splits Co-authored-by: VolodymyrOrlov --- src/model_selection/mod.rs | 232 +++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 1895296..49938cf 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -13,6 +13,8 @@ extern crate rand; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; +use 
rand::seq::SliceRandom; +use rand::thread_rng; use rand::Rng; /// Splits data into 2 disjoint datasets. @@ -81,6 +83,113 @@ pub fn train_test_split>( (x_train, x_test, y_train, y_test) } +/// +/// KFold Cross-Validation +/// +pub trait BaseKFold { + /// Returns integer indices corresponding to test sets + fn test_indices>(&self, x: &M) -> Vec>; + + /// Returns masks corresponding to test sets + fn test_masks>(&self, x: &M) -> Vec>; + + /// Return a tuple containing the training set indices for that split and + /// the testing set indices for that split. + fn split>(&self, x: &M) -> Vec<(Vec, Vec)>; +} + +/// +/// An implementation of KFold +/// +pub struct KFold { + n_splits: usize, // cannot exceed std::usize::MAX + shuffle: bool, + // TODO: to be implemented later + // random_state: i32, +} + +impl Default for KFold { + fn default() -> KFold { + KFold { + n_splits: 3 as usize, + shuffle: true, + } + } +} + +/// +/// Abstract class for all KFold functionalities +/// +impl BaseKFold for KFold { + fn test_indices>(&self, x: &M) -> Vec> { + // number of samples (rows) in the matrix + let n_samples: usize = x.shape().0; + + // initialise indices + let mut indices: Vec = (0..n_samples).collect(); + if self.shuffle == true { + indices.shuffle(&mut thread_rng()); + } + // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. + let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + + // spread the remainder over the first folds, one extra sample each + for i in 0..(n_samples % self.n_splits) { + fold_sizes[i] = fold_sizes[i] + 1; + } + + // generate the right array of arrays for test indices + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + let mut current: usize = 0; + for fold_size in fold_sizes.drain(..)
{ + let stop = current + fold_size; + return_values.push(indices[current..stop].to_vec()); + current = stop + } + + return_values + } + + fn test_masks>(&self, x: &M) -> Vec> { + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + for test_index in self.test_indices(x).drain(..) { + // init mask + let mut test_mask = vec![false; x.shape().0]; + // set mask's indices to true according to test indices + for i in test_index { + test_mask[i] = true; // can be implemented with map() + } + return_values.push(test_mask); + } + return_values + } + + fn split>(&self, x: &M) -> Vec<(Vec, Vec)> { + let n_samples: usize = x.shape().0; + let indices: Vec = (0..n_samples).collect(); + + let mut return_values: Vec<(Vec, Vec)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs + + for test_index in self.test_masks(x).drain(..) { + let train_index = indices + .clone() + .iter() + .enumerate() + .filter(|&(idx, _)| test_index[idx] == false) + .map(|(idx, _)| idx) + .collect::>(); // filter train indices out according to mask + let test_index = indices + .iter() + .enumerate() + .filter(|&(idx, _)| test_index[idx] == true) + .map(|(idx, _)| idx) + .collect::>(); // filter tests indices out according to mask + return_values.push((train_index, test_index)) + } + return_values + } +} + #[cfg(test)] mod tests { @@ -106,4 +215,127 @@ mod tests { assert_eq!(x_train.shape().0, y_train.len()); assert_eq!(x_test.shape().0, y_test.len()); } + + #[test] + fn run_kfold_return_test_indices_simple() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(33, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..11).collect::>()); + assert_eq!(test_indices[1], (11..22).collect::>()); + assert_eq!(test_indices[2], (22..33).collect::>()); + } + + #[test] + fn run_kfold_return_test_indices_odd() { + let k = KFold { + n_splits: 3, + shuffle: false, 
+ }; + let x: DenseMatrix = DenseMatrix::rand(34, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..12).collect::>()); + assert_eq!(test_indices[1], (12..23).collect::>()); + assert_eq!(test_indices[2], (23..34).collect::>()); + } + + #[test] + fn run_kfold_return_test_mask_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let test_masks = k.test_masks(&x); + + for t in &test_masks[0][0..11] { + // TODO: this can be prob done better + assert_eq!(*t, true) + } + for t in &test_masks[0][11..22] { + assert_eq!(*t, false) + } + + for t in &test_masks[1][0..11] { + assert_eq!(*t, false) + } + for t in &test_masks[1][11..22] { + assert_eq!(*t, true) + } + } + + #[test] + fn run_kfold_return_split_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let train_test_splits = k.split(&x); + + assert_eq!(train_test_splits[0].1, (0..11).collect::>()); + assert_eq!(train_test_splits[0].0, (11..22).collect::>()); + assert_eq!(train_test_splits[1].0, (0..11).collect::>()); + assert_eq!(train_test_splits[1].1, (11..22).collect::>()); + } + + #[test] + fn run_kfold_return_split_simple_shuffle() { + let k = KFold { + n_splits: 2, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(23, 100); + let train_test_splits = k.split(&x); + + assert_eq!(train_test_splits[0].1.len(), 12 as usize); + assert_eq!(train_test_splits[0].0.len(), 11 as usize); + assert_eq!(train_test_splits[1].0.len(), 12 as usize); + assert_eq!(train_test_splits[1].1.len(), 11 as usize); + } + + #[test] + fn numpy_parity_test() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), 
(expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test, expected_test); + assert_eq!(train, expected_train); + } + } + + #[test] + fn numpy_parity_test_shuffle() { + let k = KFold { + n_splits: 3, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test.len(), expected_test.len()); + assert_eq!(train.len(), expected_train.len()); + } + } } From 20e58a881788437d47901971cd80b54d14fadc87 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 15 Oct 2020 16:23:26 -0700 Subject: [PATCH 03/79] feat: adds e-SVR --- src/lib.rs | 2 + src/linalg/mod.rs | 10 + src/linalg/naive/dense_matrix.rs | 57 ++++ src/linalg/nalgebra_bindings.rs | 39 +++ src/linalg/ndarray_bindings.rs | 35 +- src/math/num.rs | 15 +- src/svm/mod.rs | 25 ++ src/svm/svr.rs | 538 +++++++++++++++++++++++++++++++ 8 files changed, 719 insertions(+), 2 deletions(-) create mode 100644 src/svm/mod.rs create mode 100644 src/svm/svr.rs diff --git a/src/lib.rs b/src/lib.rs index 4ac8fb3..083b95f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,5 +88,7 @@ pub mod model_selection; /// Supervised neighbors-based learning methods pub mod neighbors; pub(crate) mod optimization; +/// Support Vector Machines +pub mod svm; /// Supervised tree-based learning methods pub mod tree; diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 3937b47..a637bdf 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -85,6 +85,12 @@ pub trait BaseVector: Clone + Debug { /// Create new vector of size `len` where each element is set to `value`. 
fn fill(len: usize, value: T) -> Self; + + /// Vector dot product + fn dot(&self, other: &Self) -> T; + + /// Returns True if matrices are element-wise equal within a tolerance `error`. + fn approximate_eq(&self, other: &Self, error: T) -> bool; } /// Generic matrix type. @@ -110,6 +116,10 @@ pub trait BaseMatrix: Clone + Debug { /// * `row` - row number fn get_row_as_vec(&self, row: usize) -> Vec; + /// Get the `row`'th row + /// * `row` - row number + fn get_row(&self, row: usize) -> Self::RowVector; + /// Copies a vector with elements of the `row`'th row into `result` /// * `row` - row number /// * `result` - receiver for the row diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index ae9d1d2..ab6bf43 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -44,6 +44,32 @@ impl BaseVector for Vec { fn fill(len: usize, value: T) -> Self { vec![value; len] } + + fn dot(&self, other: &Self) -> T { + if self.len() != other.len() { + panic!("A and B should have the same size"); + } + + let mut result = T::zero(); + for i in 0..self.len() { + result = result + self[i] * other[i]; + } + + result + } + + fn approximate_eq(&self, other: &Self, error: T) -> bool { + if self.len() != other.len() { + false + } else { + for i in 0..other.len() { + if (self[i] - other[i]).abs() > error { + return false; + } + } + true + } + } } /// Column-major, dense matrix. See [Simple Dense Matrix](../index.html). 
@@ -371,6 +397,16 @@ impl BaseMatrix for DenseMatrix { self.values[col * self.nrows + row] } + fn get_row(&self, row: usize) -> Self::RowVector { + let mut v = vec![T::zero(); self.ncols]; + + for c in 0..self.ncols { + v[c] = self.get(row, c); + } + + v + } + fn get_row_as_vec(&self, row: usize) -> Vec { let mut result = vec![T::zero(); self.ncols]; for c in 0..self.ncols { @@ -865,6 +901,21 @@ impl BaseMatrix for DenseMatrix { mod tests { use super::*; + #[test] + fn vec_dot() { + let v1 = vec![1., 2., 3.]; + let v2 = vec![4., 5., 6.]; + assert_eq!(32.0, BaseVector::dot(&v1, &v2)); + } + + #[test] + fn vec_approximate_eq() { + let a = vec![1., 2., 3.]; + let b = vec![1. + 1e-5, 2. + 2e-5, 3. + 3e-5]; + assert!(a.approximate_eq(&b, 1e-4)); + assert!(!a.approximate_eq(&b, 1e-5)); + } + #[test] fn from_array() { let vec = [1., 2., 3., 4., 5., 6.]; @@ -939,6 +990,12 @@ mod tests { assert_eq!(result, expected); } + #[test] + fn get_row() { + let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]); + assert_eq!(vec![4., 5., 6.], a.get_row(1)); + } + #[test] fn matmul() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index 5e52d14..badd8c4 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -79,6 +79,20 @@ impl BaseVector for MatrixMN { m.fill(value); m } + + fn dot(&self, other: &Self) -> T { + self.dot(other) + } + + fn approximate_eq(&self, other: &Self, error: T) -> bool { + if self.shape() != other.shape() { + false + } else { + self.iter() + .zip(other.iter()) + .all(|(a, b)| (*a - *b).abs() <= error) + } + } } impl @@ -102,6 +116,10 @@ impl Self::RowVector { + self.row(row).into_owned() + } + fn copy_row_as_vec(&self, row: usize, result: &mut Vec) { let mut r = 0; for e in self.row(row).iter() { @@ -486,6 +504,21 @@ mod tests { assert_eq!(twos, RowDVector::from_vec(vec![2., 2., 2.])); } + #[test] + fn 
vec_dot() { + let v1 = RowDVector::from_vec(vec![1., 2., 3.]); + let v2 = RowDVector::from_vec(vec![4., 5., 6.]); + assert_eq!(32.0, BaseVector::dot(&v1, &v2)); + } + + #[test] + fn vec_approximate_eq() { + let a = RowDVector::from_vec(vec![1., 2., 3.]); + let noise = RowDVector::from_vec(vec![1e-5, 2e-5, 3e-5]); + assert!(a.approximate_eq(&(&noise + &a), 1e-4)); + assert!(!a.approximate_eq(&(&noise + &a), 1e-5)); + } + #[test] fn get_set_dynamic() { let mut m = DMatrix::from_row_slice(2, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); @@ -579,6 +612,12 @@ mod tests { assert_eq!(m.get_col_as_vec(1), vec!(2., 5., 8.)); } + #[test] + fn get_row() { + let a = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]); + assert_eq!(RowDVector::from_vec(vec![4., 5., 6.]), a.get_row(1)); + } + #[test] fn copy_row_col_as_vec() { let m = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]); diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index c6324eb..9cfb6d7 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -57,7 +57,7 @@ use crate::linalg::Matrix; use crate::linalg::{BaseMatrix, BaseVector}; use crate::math::num::RealNumber; -impl BaseVector for ArrayBase, Ix1> { +impl BaseVector for ArrayBase, Ix1> { fn get(&self, i: usize) -> T { self[i] } @@ -84,6 +84,14 @@ impl BaseVector for ArrayBase, Ix1> { fn fill(len: usize, value: T) -> Self { Array::from_elem(len, value) } + + fn dot(&self, other: &Self) -> T { + self.dot(other) + } + + fn approximate_eq(&self, other: &Self, error: T) -> bool { + (self - other).iter().all(|v| v.abs() <= error) + } } impl @@ -109,6 +117,10 @@ impl Self::RowVector { + self.row(row).to_owned() + } + fn copy_row_as_vec(&self, row: usize, result: &mut Vec) { let mut r = 0; for e in self.row(row).iter() { @@ -437,6 +449,21 @@ mod tests { assert_eq!(vec![1., 2., 3.], v.to_vec()); } + #[test] + fn vec_dot() { + let v1 = arr1(&[1., 2., 3.]); + let v2 = 
arr1(&[4., 5., 6.]); + assert_eq!(32.0, BaseVector::dot(&v1, &v2)); + } + + #[test] + fn vec_approximate_eq() { + let a = arr1(&[1., 2., 3.]); + let noise = arr1(&[1e-5, 2e-5, 3e-5]); + assert!(a.approximate_eq(&(&noise + &a), 1e-4)); + assert!(!a.approximate_eq(&(&noise + &a), 1e-5)); + } + #[test] fn from_to_row_vec() { let vec = arr1(&[1., 2., 3.]); @@ -678,6 +705,12 @@ mod tests { assert_eq!(res, vec![4., 5., 6.]); } + #[test] + fn get_row() { + let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); + assert_eq!(arr1(&[4., 5., 6.]), a.get_row(1)); + } + #[test] fn get_col_as_vector() { let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); diff --git a/src/math/num.rs b/src/math/num.rs index 9ffbdff..894e5a3 100644 --- a/src/math/num.rs +++ b/src/math/num.rs @@ -6,10 +6,23 @@ use num_traits::{Float, FromPrimitive}; use rand::prelude::*; use std::fmt::{Debug, Display}; use std::iter::{Product, Sum}; +use std::ops::{AddAssign, DivAssign, MulAssign, SubAssign}; /// Defines real number /// -pub trait RealNumber: Float + FromPrimitive + Debug + Display + Copy + Sum + Product { +pub trait RealNumber: + Float + + FromPrimitive + + Debug + + Display + + Copy + + Sum + + Product + + AddAssign + + SubAssign + + MulAssign + + DivAssign +{ /// Copy sign from `sign` - another real number fn copysign(self, sign: Self) -> Self; diff --git a/src/svm/mod.rs b/src/svm/mod.rs new file mode 100644 index 0000000..404b281 --- /dev/null +++ b/src/svm/mod.rs @@ -0,0 +1,25 @@ +//! # Support Vector Machines +//! 
+ +pub mod svr; + +use serde::{Deserialize, Serialize}; + +use crate::linalg::BaseVector; +use crate::math::num::RealNumber; + +/// Kernel +pub trait Kernel> { + /// Apply kernel function to x_i and x_j + fn apply(&self, x_i: &V, x_j: &V) -> T; +} + +/// Linear Kernel +#[derive(Serialize, Deserialize, Debug)] +pub struct LinearKernel {} + +impl> Kernel for LinearKernel { + fn apply(&self, x_i: &V, x_j: &V) -> T { + x_i.dot(x_j) + } +} diff --git a/src/svm/svr.rs b/src/svm/svr.rs new file mode 100644 index 0000000..371c4ca --- /dev/null +++ b/src/svm/svr.rs @@ -0,0 +1,538 @@ +//! # Epsilon-Support Vector Regression. +//! +//! Example +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::linear::linear_regression::*; +//! use smartcore::svm::*; +//! use smartcore::svm::svr::{SVR, SVRParameters}; +//! +//! // Longley dataset (https://www.statsmodels.org/stable/datasets/generated/longley.html) +//! let x = DenseMatrix::from_2d_array(&[ +//! &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], +//! &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], +//! &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], +//! &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], +//! &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], +//! &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], +//! &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], +//! &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], +//! &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], +//! &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], +//! &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], +//! &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], +//! &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], +//! &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], +//! &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], +//! &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], +//! ]); +//! +//! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, +//! 
100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; +//! +//! let svr = SVR::fit(&x, &y, +//! LinearKernel {}, +//! SVRParameters { +//! eps: 2.0, +//! c: 10.0, +//! tol: 1e-3, +//! }).unwrap(); +//! +//! let y_hat = svr.predict(&x).unwrap(); +//! ``` +use std::cell::{Ref, RefCell}; +use std::fmt::Debug; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use crate::svm::Kernel; + +#[derive(Serialize, Deserialize, Debug)] + +/// SVR Parameters +pub struct SVRParameters { + /// Epsilon in the epsilon-SVR model + pub eps: T, + /// Regularization parameter. + pub c: T, + /// Tolerance for stopping criterion + pub tol: T, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(bound( + serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", + deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", +))] + +/// Epsilon-Support Vector Regression +pub struct SVR, K: Kernel> { + kernel: K, + instances: Vec, + w: Vec, + b: T, +} + +#[derive(Serialize, Deserialize, Debug)] +struct SupportVector> { + index: usize, + x: V, + alpha: [T; 2], + grad: [T; 2], + k: T, +} + +struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { + tol: T, + c: T, + svmin: usize, + svmax: usize, + gmin: T, + gmax: T, + gminindex: usize, + gmaxindex: usize, + tau: T, + sv: Vec>, + kernel: &'a K, +} + +struct Cache { + data: Vec>>>, +} + +impl Default for SVRParameters { + fn default() -> Self { + SVRParameters { + eps: T::from_f64(0.1).unwrap(), + c: T::one(), + tol: T::from_f64(1e-3).unwrap(), + } + } +} + +impl, K: Kernel> SVR { + /// Fits SVR to your data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target values + /// * `kernel` - the kernel function + /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit( + x: &M, + y: &M::RowVector, + kernel: K, + parameters: SVRParameters, + ) -> Result, Failed> { + let (n, _) = x.shape(); + + if n != y.len() { + return Err(Failed::fit(&format!( + "Number of rows of X doesn't match number of rows of Y" + ))); + } + + let optimizer = Optimizer::optimize(x, y, &kernel, &parameters); + + let (support_vectors, weight, b) = optimizer.smo(); + + Ok(SVR { + kernel: kernel, + instances: support_vectors, + w: weight, + b: b, + }) + } + + /// Predict target values from `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn predict(&self, x: &M) -> Result { + let (n, _) = x.shape(); + + let mut y_hat = M::RowVector::zeros(n); + + for i in 0..n { + y_hat.set(i, self.predict_for_row(x.get_row(i))); + } + + Ok(y_hat) + } + + pub(in crate) fn predict_for_row(&self, x: M::RowVector) -> T { + let mut f = self.b; + + for i in 0..self.instances.len() { + f += self.w[i] * self.kernel.apply(&x, &self.instances[i]); + } + + return f; + } +} + +impl, K: Kernel> PartialEq for SVR { + fn eq(&self, other: &Self) -> bool { + if self.b != other.b + || self.w.len() != other.w.len() + || self.instances.len() != other.instances.len() + { + return false; + } else { + for i in 0..self.w.len() { + if (self.w[i] - other.w[i]).abs() > T::epsilon() { + return false; + } + } + for i in 0..self.instances.len() { + if !self.instances[i].approximate_eq(&other.instances[i], T::epsilon()) { + return false; + } + } + return true; + } + } +} + +impl> SupportVector { + fn new>(i: usize, x: V, y: T, eps: T, k: &K) -> SupportVector { + let k_v = k.apply(&x, &x); + SupportVector { + index: i, + x: x, + grad: [eps + y, eps - y], + k: k_v, + alpha: [T::zero(), T::zero()], + } + } +} + +impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, T, M, K> { + fn optimize( + x: &M, + y: &M::RowVector, + kernel: &'a K, + parameters: &SVRParameters, + ) -> Optimizer<'a, T, M, K> { + let (n, _) = x.shape(); + + let mut 
support_vectors: Vec> = Vec::with_capacity(n); + + for i in 0..n { + support_vectors.push(SupportVector::new( + i, + x.get_row(i), + y.get(i), + parameters.eps, + kernel, + )); + } + + Optimizer { + tol: parameters.tol, + c: parameters.c, + svmin: 0, + svmax: 0, + gmin: T::max_value(), + gmax: T::min_value(), + gminindex: 0, + gmaxindex: 0, + tau: T::from_f64(1e-12).unwrap(), + sv: support_vectors, + kernel: kernel, + } + } + + fn minmax(&mut self) { + self.gmin = T::max_value(); + self.gmax = T::min_value(); + + for i in 0..self.sv.len() { + let v = &self.sv[i]; + let g = -v.grad[0]; + let a = v.alpha[0]; + if g < self.gmin && a > T::zero() { + self.gmin = g; + self.gminindex = 0; + self.svmin = i; + } + if g > self.gmax && a < self.c { + self.gmax = g; + self.gmaxindex = 0; + self.svmax = i; + } + + let g = v.grad[1]; + let a = v.alpha[1]; + if g < self.gmin && a < self.c { + self.gmin = g; + self.gminindex = 1; + self.svmin = i; + } + if g > self.gmax && a > T::zero() { + self.gmax = g; + self.gmaxindex = 1; + self.svmax = i; + } + } + } + + fn smo(mut self) -> (Vec, Vec, T) { + let cache: Cache = Cache::new(self.sv.len()); + + self.minmax(); + + while self.gmax - self.gmin > self.tol { + let v1 = self.svmax; + let i = self.gmaxindex; + let old_alpha_i = self.sv[v1].alpha[i]; + + let k1 = cache.get(self.sv[v1].index, || { + self.sv + .iter() + .map(|vi| self.kernel.apply(&self.sv[v1].x, &vi.x)) + .collect() + }); + + let mut v2 = self.svmin; + let mut j = self.gminindex; + let mut old_alpha_j = self.sv[v2].alpha[j]; + + let mut best = T::zero(); + let gi = if i == 0 { + -self.sv[v1].grad[0] + } else { + self.sv[v1].grad[1] + }; + for jj in 0..self.sv.len() { + let v = &self.sv[jj]; + let mut curv = self.sv[v1].k + v.k - T::two() * k1[v.index]; + if curv <= T::zero() { + curv = self.tau; + } + + let mut gj = -v.grad[0]; + if v.alpha[0] > T::zero() && gj < gi { + let gain = -((gi - gj) * (gi - gj)) / curv; + if gain < best { + best = gain; + v2 = jj; + j = 0; + 
old_alpha_j = self.sv[v2].alpha[0]; + } + } + + gj = v.grad[1]; + if v.alpha[1] < self.c && gj < gi { + let gain = -((gi - gj) * (gi - gj)) / curv; + if gain < best { + best = gain; + v2 = jj; + j = 1; + old_alpha_j = self.sv[v2].alpha[1]; + } + } + } + + let k2 = cache.get(self.sv[v2].index, || { + self.sv + .iter() + .map(|vi| self.kernel.apply(&self.sv[v2].x, &vi.x)) + .collect() + }); + + let mut curv = self.sv[v1].k + self.sv[v2].k - T::two() * k1[self.sv[v2].index]; + if curv <= T::zero() { + curv = self.tau; + } + + if i != j { + let delta = (-self.sv[v1].grad[i] - self.sv[v2].grad[j]) / curv; + let diff = self.sv[v1].alpha[i] - self.sv[v2].alpha[j]; + self.sv[v1].alpha[i] += delta; + self.sv[v2].alpha[j] += delta; + + if diff > T::zero() { + if self.sv[v2].alpha[j] < T::zero() { + self.sv[v2].alpha[j] = T::zero(); + self.sv[v1].alpha[i] = diff; + } + } else { + if self.sv[v1].alpha[i] < T::zero() { + self.sv[v1].alpha[i] = T::zero(); + self.sv[v2].alpha[j] = -diff; + } + } + + if diff > T::zero() { + if self.sv[v1].alpha[i] > self.c { + self.sv[v1].alpha[i] = self.c; + self.sv[v2].alpha[j] = self.c - diff; + } + } else { + if self.sv[v2].alpha[j] > self.c { + self.sv[v2].alpha[j] = self.c; + self.sv[v1].alpha[i] = self.c + diff; + } + } + } else { + let delta = (self.sv[v1].grad[i] - self.sv[v2].grad[j]) / curv; + let sum = self.sv[v1].alpha[i] + self.sv[v2].alpha[j]; + self.sv[v1].alpha[i] -= delta; + self.sv[v2].alpha[j] += delta; + + if sum > self.c { + if self.sv[v1].alpha[i] > self.c { + self.sv[v1].alpha[i] = self.c; + self.sv[v2].alpha[j] = sum - self.c; + } + } else { + if self.sv[v2].alpha[j] < T::zero() { + self.sv[v2].alpha[j] = T::zero(); + self.sv[v1].alpha[i] = sum; + } + } + + if sum > self.c { + if self.sv[v2].alpha[j] > self.c { + self.sv[v2].alpha[j] = self.c; + self.sv[v1].alpha[i] = sum - self.c; + } + } else { + if self.sv[v1].alpha[i] < T::zero() { + self.sv[v1].alpha[i] = T::zero(); + self.sv[v2].alpha[j] = sum; + } + } + } + + let 
delta_alpha_i = self.sv[v1].alpha[i] - old_alpha_i; + let delta_alpha_j = self.sv[v2].alpha[j] - old_alpha_j; + + let si = T::two() * T::from_usize(i).unwrap() - T::one(); + let sj = T::two() * T::from_usize(j).unwrap() - T::one(); + for v in self.sv.iter_mut() { + v.grad[0] -= si * k1[v.index] * delta_alpha_i + sj * k2[v.index] * delta_alpha_j; + v.grad[1] += si * k1[v.index] * delta_alpha_i + sj * k2[v.index] * delta_alpha_j; + } + + self.minmax(); + } + + let b = -(self.gmax + self.gmin) / T::two(); + + let mut result: Vec = Vec::new(); + let mut alpha: Vec = Vec::new(); + + for v in self.sv { + if v.alpha[0] != v.alpha[1] { + result.push(v.x); + alpha.push(v.alpha[1] - v.alpha[0]); + } + } + + (result, alpha, b) + } +} + +impl Cache { + fn new(n: usize) -> Cache { + Cache { + data: vec![RefCell::new(None); n], + } + } + + fn get Vec>(&self, i: usize, or: F) -> Ref> { + if self.data[i].borrow().is_none() { + self.data[i].replace(Some(or())); + } + Ref::map(self.data[i].borrow(), |v| v.as_ref().unwrap()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + use crate::metrics::mean_squared_error; + use crate::svm::*; + + #[test] + fn svr_fit_predict() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 
480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y: Vec = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let y_hat = SVR::fit( + &x, + &y, + LinearKernel {}, + SVRParameters { + eps: 2.0, + c: 10.0, + tol: 1e-3, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + println!("{:?}", y_hat); + + assert!(mean_squared_error(&y_hat, &y) < 2.5); + } + + #[test] + fn svr_serde() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y: Vec = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let svr = SVR::fit(&x, &y, LinearKernel {}, Default::default()).unwrap(); + + let deserialized_svr: SVR, LinearKernel> = + serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); + + assert_eq!(svr, deserialized_svr); + } +} From 92dad01810c7407c99460b78b3a145c1dfee8e4d Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Fri, 16 Oct 2020 12:28:30 -0400 Subject: [PATCH 04/79] Allow KNN with k=1 --- src/algorithm/sort/heap_select.rs | 3 +++ 
src/neighbors/knn_regressor.rs | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/algorithm/sort/heap_select.rs b/src/algorithm/sort/heap_select.rs index 063ffc6..ae3ff18 100644 --- a/src/algorithm/sort/heap_select.rs +++ b/src/algorithm/sort/heap_select.rs @@ -41,6 +41,9 @@ impl<'a, T: PartialOrd + Debug> HeapSelection { pub fn heapify(&mut self) { let n = self.heap.len(); + if n <= 1 { + return; + } for i in (0..=(n / 2 - 1)).rev() { self.sift_down(i, n - 1); } diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 04fbd35..0bf283f 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -116,9 +116,9 @@ impl, T>> KNNRegressor { ))); } - if parameters.k <= 1 { + if parameters.k < 1 { return Err(Failed::fit(&format!( - "k should be > 1, k=[{}]", + "k should be > 0, k=[{}]", parameters.k ))); } From 83d28dea62c7a6b70947d1ee79b7a28fbd04cd6e Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 16 Oct 2020 11:56:37 -0700 Subject: [PATCH 05/79] fix: svr, post-review changes --- src/svm/svr.rs | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 371c4ca..29c6b2e 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -41,6 +41,14 @@ //! //! let y_hat = svr.predict(&x).unwrap(); //! ``` +//! +//! ## References: +//! +//! * ["Support Vector Machines" Kowalczyk A., 2017](https://www.svm-tutorial.com/2017/10/support-vector-machines-succinctly-released/) +//! * ["A Fast Algorithm for Training Support Vector Machines", Platt J.C., 1998](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-98-14.pdf) +//! * ["Working Set Selection Using Second Order Information for Training Support Vector Machines", Rong-En Fan et al., 2005](https://www.jmlr.org/papers/volume6/fan05a/fan05a.pdf) +//! 
* ["A tutorial on support vector regression", SMOLA A.J., Scholkopf B., 2003](https://alex.smola.org/papers/2004/SmoSch04.pdf) + use std::cell::{Ref, RefCell}; use std::fmt::Debug; @@ -87,6 +95,7 @@ struct SupportVector> { k: T, } +/// Sequential Minimal Optimization algorithm struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { tol: T, c: T, @@ -135,7 +144,7 @@ impl, K: Kernel> SVR { ))); } - let optimizer = Optimizer::optimize(x, y, &kernel, &parameters); + let optimizer = Optimizer::new(x, y, &kernel, &parameters); let (support_vectors, weight, b) = optimizer.smo(); @@ -209,7 +218,7 @@ impl> SupportVector { } impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, T, M, K> { - fn optimize( + fn new( x: &M, y: &M::RowVector, kernel: &'a K, @@ -244,7 +253,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } } - fn minmax(&mut self) { + fn find_min_max_gradient(&mut self) { self.gmin = T::max_value(); self.gmax = T::min_value(); @@ -278,10 +287,14 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } } + /// Solvs the quadratic programming (QP) problem that arises during the training of support-vector machines (SVM) algorithm. 
+ /// Returns: + /// * support vectors + /// * hyperplane parameters: w and b fn smo(mut self) -> (Vec, Vec, T) { let cache: Cache = Cache::new(self.sv.len()); - self.minmax(); + self.find_min_max_gradient(); while self.gmax - self.gmin > self.tol { let v1 = self.svmax; @@ -417,22 +430,22 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, v.grad[1] += si * k1[v.index] * delta_alpha_i + sj * k2[v.index] * delta_alpha_j; } - self.minmax(); + self.find_min_max_gradient(); } let b = -(self.gmax + self.gmin) / T::two(); - let mut result: Vec = Vec::new(); - let mut alpha: Vec = Vec::new(); + let mut support_vectors: Vec = Vec::new(); + let mut w: Vec = Vec::new(); for v in self.sv { if v.alpha[0] != v.alpha[1] { - result.push(v.x); - alpha.push(v.alpha[1] - v.alpha[0]); + support_vectors.push(v.x); + w.push(v.alpha[1] - v.alpha[0]); } } - (result, alpha, b) + (support_vectors, w, b) } } @@ -497,8 +510,6 @@ mod tests { .and_then(|lr| lr.predict(&x)) .unwrap(); - println!("{:?}", y_hat); - assert!(mean_squared_error(&y_hat, &y) < 2.5); } From 1b9347baa1b55f1ad343a81a3afca3d31ffd30a5 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 21 Oct 2020 19:01:29 -0700 Subject: [PATCH 06/79] feat: adds support vector classifier --- src/svm/mod.rs | 1 + src/svm/svc.rs | 685 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 686 insertions(+) create mode 100644 src/svm/svc.rs diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 404b281..6b31683 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -1,6 +1,7 @@ //! # Support Vector Machines //! +pub mod svc; pub mod svr; use serde::{Deserialize, Serialize}; diff --git a/src/svm/svc.rs b/src/svm/svc.rs new file mode 100644 index 0000000..a3fbb8a --- /dev/null +++ b/src/svm/svc.rs @@ -0,0 +1,685 @@ +//! # Support Vector Classifier. +//! +//! Example +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::linear::linear_regression::*; +//! use smartcore::svm::LinearKernel; +//! 
use smartcore::svm::svc::{SVC, SVCParameters}; +//! +//! // Iris dataset +//! let x = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! let y = vec![ -1., -1., -1., -1., -1., -1., -1., -1., +//! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; +//! +//! let svr = SVC::fit(&x, &y, +//! LinearKernel {}, +//! SVCParameters { +//! epoch: 2, +//! c: 200.0, +//! tol: 1e-3, +//! }).unwrap(); +//! +//! let y_hat = svr.predict(&x).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["Support Vector Machines" Kowalczyk A., 2017](https://www.svm-tutorial.com/2017/10/support-vector-machines-succinctly-released/) +//! * ["Fast Kernel Classifiers with Online and Active Learning", Bordes A., Ertekin S., Weston J., Bottou L., 2005](https://www.jmlr.org/papers/volume6/bordes05a/bordes05a.pdf) + +use std::collections::{HashMap, HashSet}; +use std::fmt::Debug; +use std::marker::PhantomData; + +use rand::seq::SliceRandom; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use crate::svm::Kernel; + +#[derive(Serialize, Deserialize, Debug)] + +/// SVC Parameters +pub struct SVCParameters { + /// Number of epochs + pub epoch: usize, + /// Regularization parameter. 
+ pub c: T, + /// Tolerance for stopping criterion + pub tol: T, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(bound( + serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", + deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", +))] +/// Support Vector Classifier +pub struct SVC, K: Kernel> { + kernel: K, + instances: Vec, + w: Vec, + b: T, +} + +#[derive(Serialize, Deserialize, Debug)] +struct SupportVector> { + index: usize, + x: V, + alpha: T, + grad: T, + cmin: T, + cmax: T, + k: T, +} + +struct Cache<'a, T: RealNumber, M: Matrix, K: Kernel> { + kernel: &'a K, + data: HashMap<(usize, usize), T>, + phantom: PhantomData, +} + +struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { + x: &'a M, + y: &'a M::RowVector, + parameters: &'a SVCParameters, + svmin: usize, + svmax: usize, + gmin: T, + gmax: T, + tau: T, + sv: Vec>, + kernel: &'a K, + recalculate_minmax_grad: bool, +} + +impl Default for SVCParameters { + fn default() -> Self { + SVCParameters { + epoch: 2, + c: T::one(), + tol: T::from_f64(1e-3).unwrap(), + } + } +} + +impl, K: Kernel> SVC { + /// Fits SVC to your data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - class labels + /// * `kernel` - the kernel function + /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit( + x: &M, + y: &M::RowVector, + kernel: K, + parameters: SVCParameters, + ) -> Result, Failed> { + let (n, _) = x.shape(); + + if n != y.len() { + return Err(Failed::fit(&format!( + "Number of rows of X doesn't match number of rows of Y" + ))); + } + + let optimizer = Optimizer::new(x, y, &kernel, &parameters); + + let (support_vectors, weight, b) = optimizer.optimize(); + + Ok(SVC { + kernel: kernel, + instances: support_vectors, + w: weight, + b: b, + }) + } + + /// Predicts estimated class labels from `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn predict(&self, x: &M) -> Result { + let (n, _) = x.shape(); + + let mut y_hat = M::RowVector::zeros(n); + + for i in 0..n { + y_hat.set(i, self.predict_for_row(x.get_row(i))); + } + + Ok(y_hat) + } + + fn predict_for_row(&self, x: M::RowVector) -> T { + let mut f = self.b; + + for i in 0..self.instances.len() { + f += self.w[i] * self.kernel.apply(&x, &self.instances[i]); + } + + if f > T::zero() { + T::one() + } else { + -T::one() + } + } +} + +impl, K: Kernel> PartialEq for SVC { + fn eq(&self, other: &Self) -> bool { + if self.b != other.b + || self.w.len() != other.w.len() + || self.instances.len() != other.instances.len() + { + return false; + } else { + for i in 0..self.w.len() { + if (self.w[i] - other.w[i]).abs() > T::epsilon() { + return false; + } + } + for i in 0..self.instances.len() { + if !self.instances[i].approximate_eq(&other.instances[i], T::epsilon()) { + return false; + } + } + return true; + } + } +} + +impl> SupportVector { + fn new>(i: usize, x: V, y: T, g: T, c: T, k: &K) -> SupportVector { + let k_v = k.apply(&x, &x); + let (cmin, cmax) = if y > T::zero() { + (T::zero(), c) + } else { + (-c, T::zero()) + }; + SupportVector { + index: i, + x: x, + grad: g, + k: k_v, + alpha: T::zero(), + cmin: cmin, + cmax: cmax, + } + } +} + +impl<'a, T: RealNumber, M: Matrix, K: Kernel> Cache<'a, T, M, K> { + fn new(kernel: &'a K) -> Cache<'a, 
T, M, K> { + Cache { + kernel: kernel, + data: HashMap::new(), + phantom: PhantomData, + } + } + + fn get(&mut self, i: &SupportVector, j: &SupportVector) -> T { + let idx_i = i.index; + let idx_j = j.index; + if !self.data.contains_key(&(idx_i, idx_j)) { + let v = self.kernel.apply(&i.x, &j.x); + self.data.insert((idx_i, idx_j), v); + } + *self.data.get(&(idx_i, idx_j)).unwrap() + } + + fn insert(&mut self, key: (usize, usize), value: T) { + self.data.insert(key, value); + } + + fn drop(&mut self, idxs_to_drop: HashSet) { + self.data.retain(|k, _| !idxs_to_drop.contains(&k.0)); + } +} + +impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, T, M, K> { + fn new( + x: &'a M, + y: &'a M::RowVector, + kernel: &'a K, + parameters: &'a SVCParameters, + ) -> Optimizer<'a, T, M, K> { + let (n, _) = x.shape(); + + Optimizer { + x: x, + y: y, + parameters: &parameters, + svmin: 0, + svmax: 0, + gmin: T::max_value(), + gmax: T::min_value(), + tau: T::from_f64(1e-12).unwrap(), + sv: Vec::with_capacity(n), + kernel: kernel, + recalculate_minmax_grad: true, + } + } + + fn optimize(mut self) -> (Vec, Vec, T) { + let (n, _) = self.x.shape(); + + let mut cache = Cache::new(self.kernel); + + self.initialize(&mut cache); + + let tol = self.parameters.tol; + let good_enough = T::from_i32(1000).unwrap(); + + for _ in 0..self.parameters.epoch { + for i in Self::permutate(n) { + self.process(i, self.x.get_row(i), self.y.get(i), &mut cache); + loop { + self.reprocess(tol, &mut cache); + self.find_min_max_gradient(); + if self.gmax - self.gmin < good_enough { + break; + } + } + } + } + + self.finish(&mut cache); + + let mut support_vectors: Vec = Vec::new(); + let mut w: Vec = Vec::new(); + + let b = (self.gmax + self.gmin) / T::two(); + + for v in self.sv { + support_vectors.push(v.x); + w.push(v.alpha); + } + + (support_vectors, w, b) + } + + fn initialize(&mut self, cache: &mut Cache) { + let (n, _) = self.x.shape(); + let few = 5; + let mut cp = 0; + let mut cn = 0; + + for i in 
Self::permutate(n) { + if self.y.get(i) == T::one() && cp < few { + if self.process(i, self.x.get_row(i), self.y.get(i), cache) { + cp += 1; + } + } else if self.y.get(i) == -T::one() && cn < few { + if self.process(i, self.x.get_row(i), self.y.get(i), cache) { + cn += 1; + } + } + + if cp >= few && cn >= few { + break; + } + } + } + + fn process(&mut self, i: usize, x: M::RowVector, y: T, cache: &mut Cache) -> bool { + for j in 0..self.sv.len() { + if self.sv[j].index == i { + return true; + } + } + + let mut g = y; + + let mut cache_values: Vec<((usize, usize), T)> = Vec::new(); + + for v in self.sv.iter() { + let k = self.kernel.apply(&v.x, &x); + cache_values.push(((i, v.index), k)); + g -= v.alpha * k; + } + + self.find_min_max_gradient(); + + if self.gmin < self.gmax { + if (y > T::zero() && g < self.gmin) || (y < T::zero() && g > self.gmax) { + return false; + } + } + + for v in cache_values { + cache.insert(v.0, v.1); + } + + self.sv.insert( + 0, + SupportVector::new(i, x, y, g, self.parameters.c, self.kernel), + ); + + if y > T::zero() { + self.smo(None, Some(0), T::zero(), cache); + } else { + self.smo(Some(0), None, T::zero(), cache); + } + + true + } + + fn reprocess(&mut self, tol: T, cache: &mut Cache) -> bool { + let status = self.smo(None, None, tol, cache); + self.clean(cache); + status + } + + fn finish(&mut self, cache: &mut Cache) { + let mut max_iter = self.sv.len(); + + while self.smo(None, None, self.parameters.tol, cache) && max_iter > 0 { + max_iter -= 1; + } + + self.clean(cache); + } + + fn find_min_max_gradient(&mut self) { + if !self.recalculate_minmax_grad { + return; + } + + self.gmin = T::max_value(); + self.gmax = T::min_value(); + + for i in 0..self.sv.len() { + let v = &self.sv[i]; + let g = v.grad; + let a = v.alpha; + if g < self.gmin && a > v.cmin { + self.gmin = g; + self.svmin = i; + } + if g > self.gmax && a < v.cmax { + self.gmax = g; + self.svmax = i; + } + } + + self.recalculate_minmax_grad = false + } + + fn clean(&mut 
self, cache: &mut Cache) { + self.find_min_max_gradient(); + + let gmax = self.gmax; + let gmin = self.gmin; + + let mut idxs_to_drop: HashSet = HashSet::new(); + + self.sv.retain(|v| { + if v.alpha == T::zero() { + if (v.grad >= gmax && T::zero() >= v.cmax) + || (v.grad <= gmin && T::zero() <= v.cmin) + { + idxs_to_drop.insert(v.index); + return false; + } + }; + true + }); + + cache.drop(idxs_to_drop); + self.recalculate_minmax_grad = true; + } + + fn permutate(n: usize) -> Vec { + let mut rng = rand::thread_rng(); + let mut range: Vec = (0..n).collect(); + range.shuffle(&mut rng); + range + } + + fn smo( + &mut self, + idx_1: Option, + idx_2: Option, + tol: T, + cache: &mut Cache, + ) -> bool { + let mut idx_1 = idx_1; + let mut idx_2 = idx_2; + + let mut k_v_12: Option = None; + + if idx_1.is_none() && idx_2.is_none() { + self.find_min_max_gradient(); + if self.gmax > -self.gmin { + idx_2 = Some(self.svmax); + } else { + idx_1 = Some(self.svmin); + } + } + + if idx_2.is_none() { + let idx_1 = &self.sv[idx_1.unwrap()]; + let km = idx_1.k; + let gm = idx_1.grad; + let mut best = T::zero(); + for i in 0..self.sv.len() { + let v = &self.sv[i]; + let z = v.grad - gm; + let k = cache.get(idx_1, &v); + let mut curv = km + v.k - T::two() * k; + if curv <= T::zero() { + curv = self.tau; + } + let mu = z / curv; + if (mu > T::zero() && v.alpha < v.cmax) || (mu < T::zero() && v.alpha > v.cmin) { + let gain = z * mu; + if gain > best { + best = gain; + idx_2 = Some(i); + k_v_12 = Some(k); + } + } + } + } + + if idx_1.is_none() { + let idx_2 = &self.sv[idx_2.unwrap()]; + let km = idx_2.k; + let gm = idx_2.grad; + let mut best = T::zero(); + for i in 0..self.sv.len() { + let v = &self.sv[i]; + let z = gm - v.grad; + let k = cache.get(idx_2, v); + let mut curv = km + v.k - T::two() * k; + if curv <= T::zero() { + curv = self.tau; + } + + let mu = z / curv; + if (mu > T::zero() && v.alpha > v.cmin) || (mu < T::zero() && v.alpha < v.cmax) { + let gain = z * mu; + if gain > best 
{ + best = gain; + idx_1 = Some(i); + k_v_12 = Some(k); + } + } + } + } + + if idx_1.is_none() || idx_2.is_none() { + return false; + } + + let idx_1 = idx_1.unwrap(); + let idx_2 = idx_2.unwrap(); + + if k_v_12.is_none() { + k_v_12 = Some(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)); + } + + let k_v_12 = k_v_12.unwrap(); + + let mut curv = self.sv[idx_1].k + self.sv[idx_2].k - T::two() * k_v_12; + if curv <= T::zero() { + curv = self.tau; + } + + let mut step = (self.sv[idx_2].grad - self.sv[idx_1].grad) / curv; + + if step >= T::zero() { + let mut ostep = self.sv[idx_1].alpha - self.sv[idx_1].cmin; + if ostep < step { + step = ostep; + } + ostep = self.sv[idx_2].cmax - self.sv[idx_2].alpha; + if ostep < step { + step = ostep; + } + } else { + let mut ostep = self.sv[idx_2].cmin - self.sv[idx_2].alpha; + if ostep > step { + step = ostep; + } + ostep = self.sv[idx_1].alpha - self.sv[idx_1].cmax; + if ostep > step { + step = ostep; + } + } + + self.update(idx_1, idx_2, step, cache); + + return self.gmax - self.gmin > tol; + } + + fn update(&mut self, v1: usize, v2: usize, step: T, cache: &mut Cache) { + self.sv[v1].alpha -= step; + self.sv[v2].alpha += step; + + for i in 0..self.sv.len() { + let k2 = cache.get(&self.sv[v2], &self.sv[i]); + let k1 = cache.get(&self.sv[v1], &self.sv[i]); + self.sv[i].grad -= step * (k2 - k1); + } + + self.recalculate_minmax_grad = true; + self.find_min_max_gradient(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + use crate::metrics::accuracy; + use crate::svm::*; + + #[test] + fn svc_fit_predict() { + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 
4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + + let y: Vec = vec![ + -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let y_hat = SVC::fit( + &x, + &y, + LinearKernel {}, + SVCParameters { + epoch: 2, + c: 200.0, + tol: 1e-3, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(accuracy(&y_hat, &y) >= 0.9); + } + + #[test] + fn svc_serde() { + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + + let y: Vec = vec![ + -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let svr = SVC::fit(&x, &y, LinearKernel {}, Default::default()).unwrap(); + + let deserialized_svr: SVC, LinearKernel> = + serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); + + assert_eq!(svr, deserialized_svr); + } +} From 47abbbe8b63d4e48ef4d36418e03ea033e9f8f72 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 26 Oct 2020 16:00:31 -0700 Subject: [PATCH 07/79] fix: SVS: post-review changes --- src/svm/svc.rs | 100 +++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/src/svm/svc.rs b/src/svm/svc.rs index a3fbb8a..6e79177 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -462,13 +462,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, range } - fn smo( - &mut self, - idx_1: Option, - idx_2: 
Option, - tol: T, - cache: &mut Cache, - ) -> bool { + fn select_pair(&mut self, idx_1: Option, idx_2: Option, cache: &mut Cache) -> Option<(usize, usize, T)> { let mut idx_1 = idx_1; let mut idx_2 = idx_2; @@ -532,51 +526,67 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } } } - } + } if idx_1.is_none() || idx_2.is_none() { - return false; - } - - let idx_1 = idx_1.unwrap(); - let idx_2 = idx_2.unwrap(); - - if k_v_12.is_none() { - k_v_12 = Some(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)); - } - - let k_v_12 = k_v_12.unwrap(); - - let mut curv = self.sv[idx_1].k + self.sv[idx_2].k - T::two() * k_v_12; - if curv <= T::zero() { - curv = self.tau; - } - - let mut step = (self.sv[idx_2].grad - self.sv[idx_1].grad) / curv; - - if step >= T::zero() { - let mut ostep = self.sv[idx_1].alpha - self.sv[idx_1].cmin; - if ostep < step { - step = ostep; - } - ostep = self.sv[idx_2].cmax - self.sv[idx_2].alpha; - if ostep < step { - step = ostep; - } + None } else { - let mut ostep = self.sv[idx_2].cmin - self.sv[idx_2].alpha; - if ostep > step { - step = ostep; - } - ostep = self.sv[idx_1].alpha - self.sv[idx_1].cmax; - if ostep > step { - step = ostep; + + let idx_1 = idx_1.unwrap(); + let idx_2 = idx_2.unwrap(); + + if k_v_12.is_none() { + k_v_12 = Some(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)); } + + Some((idx_1, idx_2, k_v_12.unwrap())) } + } - self.update(idx_1, idx_2, step, cache); + fn smo( + &mut self, + idx_1: Option, + idx_2: Option, + tol: T, + cache: &mut Cache, + ) -> bool { + + match self.select_pair(idx_1, idx_2, cache) { + Some((idx_1, idx_2, k_v_12)) => { + let mut curv = self.sv[idx_1].k + self.sv[idx_2].k - T::two() * k_v_12; + if curv <= T::zero() { + curv = self.tau; + } - return self.gmax - self.gmin > tol; + let mut step = (self.sv[idx_2].grad - self.sv[idx_1].grad) / curv; + + if step >= T::zero() { + let mut ostep = self.sv[idx_1].alpha - self.sv[idx_1].cmin; + if ostep < step { + step = ostep; + } + ostep = 
self.sv[idx_2].cmax - self.sv[idx_2].alpha; + if ostep < step { + step = ostep; + } + } else { + let mut ostep = self.sv[idx_2].cmin - self.sv[idx_2].alpha; + if ostep > step { + step = ostep; + } + ostep = self.sv[idx_1].alpha - self.sv[idx_1].cmax; + if ostep > step { + step = ostep; + } + } + + self.update(idx_1, idx_2, step, cache); + + return self.gmax - self.gmin > tol; + }, + None => false + } + } fn update(&mut self, v1: usize, v2: usize, step: T, cache: &mut Cache) { From aa38fc8b700dd00e05546cc7313570fbb223cf1d Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 26 Oct 2020 16:00:55 -0700 Subject: [PATCH 08/79] fix: SVS: post-review changes --- src/svm/svc.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 6e79177..b0bf8b0 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -462,7 +462,12 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, range } - fn select_pair(&mut self, idx_1: Option, idx_2: Option, cache: &mut Cache) -> Option<(usize, usize, T)> { + fn select_pair( + &mut self, + idx_1: Option, + idx_2: Option, + cache: &mut Cache, + ) -> Option<(usize, usize, T)> { let mut idx_1 = idx_1; let mut idx_2 = idx_2; @@ -526,15 +531,14 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } } } - } + } if idx_1.is_none() || idx_2.is_none() { None } else { - let idx_1 = idx_1.unwrap(); let idx_2 = idx_2.unwrap(); - + if k_v_12.is_none() { k_v_12 = Some(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)); } @@ -550,7 +554,6 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, tol: T, cache: &mut Cache, ) -> bool { - match self.select_pair(idx_1, idx_2, cache) { Some((idx_1, idx_2, k_v_12)) => { let mut curv = self.sv[idx_1].k + self.sv[idx_2].k - T::two() * k_v_12; @@ -583,10 +586,9 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, self.update(idx_1, idx_2, step, cache); return self.gmax - self.gmin > tol; - }, - None => false 
- } - + } + None => false, + } } fn update(&mut self, v1: usize, v2: usize, step: T, cache: &mut Cache) { From bf8d0c081f6d5ddff46b191b71eda4be0221d821 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 26 Oct 2020 16:27:26 -0700 Subject: [PATCH 09/79] fix: SVC: some more post-review refactoring --- src/svm/svc.rs | 136 ++++++++++++++++++++++++------------------------- 1 file changed, 67 insertions(+), 69 deletions(-) diff --git a/src/svm/svc.rs b/src/svm/svc.rs index b0bf8b0..829f729 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -468,83 +468,81 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, idx_2: Option, cache: &mut Cache, ) -> Option<(usize, usize, T)> { - let mut idx_1 = idx_1; - let mut idx_2 = idx_2; - - let mut k_v_12: Option = None; - - if idx_1.is_none() && idx_2.is_none() { - self.find_min_max_gradient(); - if self.gmax > -self.gmin { - idx_2 = Some(self.svmax); - } else { - idx_1 = Some(self.svmin); - } - } - - if idx_2.is_none() { - let idx_1 = &self.sv[idx_1.unwrap()]; - let km = idx_1.k; - let gm = idx_1.grad; - let mut best = T::zero(); - for i in 0..self.sv.len() { - let v = &self.sv[i]; - let z = v.grad - gm; - let k = cache.get(idx_1, &v); - let mut curv = km + v.k - T::two() * k; - if curv <= T::zero() { - curv = self.tau; - } - let mu = z / curv; - if (mu > T::zero() && v.alpha < v.cmax) || (mu < T::zero() && v.alpha > v.cmin) { - let gain = z * mu; - if gain > best { - best = gain; - idx_2 = Some(i); - k_v_12 = Some(k); + + match (idx_1, idx_2) { + (None, None) => { + if self.gmax > -self.gmin { + self.select_pair(None, Some(self.svmax), cache) + } else { + self.select_pair(Some(self.svmin), None, cache) + } + }, + (Some(idx_1), None) => { + let sv1 = &self.sv[idx_1]; + let mut idx_2 = None; + let mut k_v_12 = None; + let km = sv1.k; + let gm = sv1.grad; + let mut best = T::zero(); + for i in 0..self.sv.len() { + let v = &self.sv[i]; + let z = v.grad - gm; + let k = cache.get(sv1, &v); + let mut curv = km + v.k - 
T::two() * k; + if curv <= T::zero() { + curv = self.tau; + } + let mu = z / curv; + if (mu > T::zero() && v.alpha < v.cmax) || (mu < T::zero() && v.alpha > v.cmin) { + let gain = z * mu; + if gain > best { + best = gain; + idx_2 = Some(i); + k_v_12 = Some(k); + } } } - } - } - if idx_1.is_none() { - let idx_2 = &self.sv[idx_2.unwrap()]; - let km = idx_2.k; - let gm = idx_2.grad; - let mut best = T::zero(); - for i in 0..self.sv.len() { - let v = &self.sv[i]; - let z = gm - v.grad; - let k = cache.get(idx_2, v); - let mut curv = km + v.k - T::two() * k; - if curv <= T::zero() { - curv = self.tau; - } + idx_2.map(|idx_2| { + (idx_1, idx_2, k_v_12.unwrap_or(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x))) + }) + }, + (None, Some(idx_2)) => { + let mut idx_1 = None; + let sv2 = &self.sv[idx_2]; + let mut k_v_12 = None; + let km = sv2.k; + let gm = sv2.grad; + let mut best = T::zero(); + for i in 0..self.sv.len() { + let v = &self.sv[i]; + let z = gm - v.grad; + let k = cache.get(sv2, v); + let mut curv = km + v.k - T::two() * k; + if curv <= T::zero() { + curv = self.tau; + } - let mu = z / curv; - if (mu > T::zero() && v.alpha > v.cmin) || (mu < T::zero() && v.alpha < v.cmax) { - let gain = z * mu; - if gain > best { - best = gain; - idx_1 = Some(i); - k_v_12 = Some(k); + let mu = z / curv; + if (mu > T::zero() && v.alpha > v.cmin) || (mu < T::zero() && v.alpha < v.cmax) { + let gain = z * mu; + if gain > best { + best = gain; + idx_1 = Some(i); + k_v_12 = Some(k); + } } } + + idx_1.map(|idx_1| { + (idx_1, idx_2, k_v_12.unwrap_or(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x))) + }) + }, + (Some(idx_1), Some(idx_2)) => { + Some((idx_1, idx_2, self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x))) } } - - if idx_1.is_none() || idx_2.is_none() { - None - } else { - let idx_1 = idx_1.unwrap(); - let idx_2 = idx_2.unwrap(); - - if k_v_12.is_none() { - k_v_12 = Some(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)); - } - - Some((idx_1, idx_2, 
k_v_12.unwrap())) - } + } fn smo( From 1773ed0e6e72519d73a337d3b398cc31e2dcaeaa Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 26 Oct 2020 16:27:54 -0700 Subject: [PATCH 10/79] fix: SVC: some more post-review refactoring --- src/svm/svc.rs | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 829f729..22623b4 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -468,19 +468,18 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, idx_2: Option, cache: &mut Cache, ) -> Option<(usize, usize, T)> { - match (idx_1, idx_2) { (None, None) => { if self.gmax > -self.gmin { self.select_pair(None, Some(self.svmax), cache) } else { self.select_pair(Some(self.svmin), None, cache) - } - }, + } + } (Some(idx_1), None) => { - let sv1 = &self.sv[idx_1]; + let sv1 = &self.sv[idx_1]; let mut idx_2 = None; - let mut k_v_12 = None; + let mut k_v_12 = None; let km = sv1.k; let gm = sv1.grad; let mut best = T::zero(); @@ -493,7 +492,8 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, curv = self.tau; } let mu = z / curv; - if (mu > T::zero() && v.alpha < v.cmax) || (mu < T::zero() && v.alpha > v.cmin) { + if (mu > T::zero() && v.alpha < v.cmax) || (mu < T::zero() && v.alpha > v.cmin) + { let gain = z * mu; if gain > best { best = gain; @@ -503,10 +503,14 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } } - idx_2.map(|idx_2| { - (idx_1, idx_2, k_v_12.unwrap_or(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x))) - }) - }, + idx_2.map(|idx_2| { + ( + idx_1, + idx_2, + k_v_12.unwrap_or(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)), + ) + }) + } (None, Some(idx_2)) => { let mut idx_1 = None; let sv2 = &self.sv[idx_2]; @@ -524,7 +528,8 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } let mu = z / curv; - if (mu > T::zero() && v.alpha > v.cmin) || (mu < T::zero() && v.alpha < v.cmax) { + if (mu > T::zero() && v.alpha > 
v.cmin) || (mu < T::zero() && v.alpha < v.cmax) + { let gain = z * mu; if gain > best { best = gain; @@ -534,15 +539,20 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } } - idx_1.map(|idx_1| { - (idx_1, idx_2, k_v_12.unwrap_or(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x))) + idx_1.map(|idx_1| { + ( + idx_1, + idx_2, + k_v_12.unwrap_or(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)), + ) }) - }, - (Some(idx_1), Some(idx_2)) => { - Some((idx_1, idx_2, self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x))) } + (Some(idx_1), Some(idx_2)) => Some(( + idx_1, + idx_2, + self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x), + )), } - } fn smo( From cf4f658f015405481d5c67e828acf0eca5c95709 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 28 Oct 2020 17:10:17 -0700 Subject: [PATCH 11/79] feat: adds 3 more SVM kernels, linalg refactoring --- src/linalg/mod.rs | 70 +++++++++++++++ src/linalg/naive/dense_matrix.rs | 105 ++++++++++++++++++++++ src/linalg/nalgebra_bindings.rs | 85 ++++++++++++++++++ src/linalg/ndarray_bindings.rs | 81 +++++++++++++++++ src/svm/mod.rs | 147 ++++++++++++++++++++++++++++++- src/svm/svc.rs | 89 +++++++++++++++++-- 6 files changed, 568 insertions(+), 9 deletions(-) diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index a637bdf..29c7a89 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -91,6 +91,76 @@ pub trait BaseVector: Clone + Debug { /// Returns True if matrices are element-wise equal within a tolerance `error`. fn approximate_eq(&self, other: &Self, error: T) -> bool; + + /// Returns [L2 norm] of the vector(https://en.wikipedia.org/wiki/Matrix_norm). + fn norm2(&self) -> T; + + /// Returns [vectors norm](https://en.wikipedia.org/wiki/Matrix_norm) of order `p`. + fn norm(&self, p: T) -> T; + + /// Divide single element of the vector by `x`, write result to original vector. 
+ fn div_element_mut(&mut self, pos: usize, x: T); + + /// Multiply single element of the vector by `x`, write result to original vector. + fn mul_element_mut(&mut self, pos: usize, x: T); + + /// Add single element of the vector to `x`, write result to original vector. + fn add_element_mut(&mut self, pos: usize, x: T); + + /// Subtract `x` from single element of the vector, write result to original vector. + fn sub_element_mut(&mut self, pos: usize, x: T); + + /// Add vectors, element-wise, overriding original vector with result. + fn add_mut(&mut self, other: &Self) -> &Self; + + /// Subtract vectors, element-wise, overriding original vector with result. + fn sub_mut(&mut self, other: &Self) -> &Self; + + /// Multiply vectors, element-wise, overriding original vector with result. + fn mul_mut(&mut self, other: &Self) -> &Self; + + /// Divide vectors, element-wise, overriding original vector with result. + fn div_mut(&mut self, other: &Self) -> &Self; + + /// Add vectors, element-wise + fn add(&self, other: &Self) -> Self { + let mut r = self.clone(); + r.add_mut(other); + r + } + + /// Subtract vectors, element-wise + fn sub(&self, other: &Self) -> Self { + let mut r = self.clone(); + r.sub_mut(other); + r + } + + /// Multiply vectors, element-wise + fn mul(&self, other: &Self) -> Self { + let mut r = self.clone(); + r.mul_mut(other); + r + } + + /// Divide vectors, element-wise + fn div(&self, other: &Self) -> Self { + let mut r = self.clone(); + r.div_mut(other); + r + } + + /// Calculates sum of all elements of the vector. + fn sum(&self) -> T; + + /// Returns unique values from the vector. + /// ``` + /// use smartcore::linalg::naive::dense_matrix::*; + /// let a = vec!(1., 2., 2., -2., -6., -7., 2., 3., 4.); + /// + ///assert_eq!(a.unique(), vec![-7., -6., -2., 1., 2., 3., 4.]); + /// ``` + fn unique(&self) -> Vec; } /// Generic matrix type. 
diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index ab6bf43..cf29061 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -58,6 +58,96 @@ impl BaseVector for Vec { result } + fn norm2(&self) -> T { + let mut norm = T::zero(); + + for xi in self.iter() { + norm = norm + *xi * *xi; + } + + norm.sqrt() + } + + fn norm(&self, p: T) -> T { + if p.is_infinite() && p.is_sign_positive() { + self.iter() + .map(|x| x.abs()) + .fold(T::neg_infinity(), |a, b| a.max(b)) + } else if p.is_infinite() && p.is_sign_negative() { + self.iter() + .map(|x| x.abs()) + .fold(T::infinity(), |a, b| a.min(b)) + } else { + let mut norm = T::zero(); + + for xi in self.iter() { + norm = norm + xi.abs().powf(p); + } + + norm.powf(T::one() / p) + } + } + + fn div_element_mut(&mut self, pos: usize, x: T) { + self[pos] = self[pos] / x; + } + + fn mul_element_mut(&mut self, pos: usize, x: T) { + self[pos] = self[pos] * x; + } + + fn add_element_mut(&mut self, pos: usize, x: T) { + self[pos] = self[pos] + x + } + + fn sub_element_mut(&mut self, pos: usize, x: T) { + self[pos] = self[pos] - x; + } + + fn add_mut(&mut self, other: &Self) -> &Self { + if self.len() != other.len() { + panic!("A and B should have the same shape"); + } + for i in 0..self.len() { + self.add_element_mut(i, other.get(i)); + } + + self + } + + fn sub_mut(&mut self, other: &Self) -> &Self { + if self.len() != other.len() { + panic!("A and B should have the same shape"); + } + for i in 0..self.len() { + self.sub_element_mut(i, other.get(i)); + } + + self + } + + fn mul_mut(&mut self, other: &Self) -> &Self { + if self.len() != other.len() { + panic!("A and B should have the same shape"); + } + for i in 0..self.len() { + self.mul_element_mut(i, other.get(i)); + } + + self + } + + fn div_mut(&mut self, other: &Self) -> &Self { + if self.len() != other.len() { + panic!("A and B should have the same shape"); + } + for i in 0..self.len() { + self.div_element_mut(i, 
other.get(i)); + } + + self + } + fn approximate_eq(&self, other: &Self, error: T) -> bool { if self.len() != other.len() { false @@ -70,6 +160,21 @@ impl BaseVector for Vec { true } } + + fn sum(&self) -> T { + let mut sum = T::zero(); + for i in 0..self.len() { + sum = sum + self[i]; + } + sum + } + + fn unique(&self) -> Vec { + let mut result = self.clone(); + result.sort_by(|a, b| a.partial_cmp(b).unwrap()); + result.dedup(); + result + } } /// Column-major, dense matrix. See [Simple Dense Matrix](../index.html). diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index badd8c4..3596899 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -84,6 +84,76 @@ impl BaseVector for MatrixMN { self.dot(other) } + fn norm2(&self) -> T { + self.iter().map(|x| *x * *x).sum::().sqrt() + } + + fn norm(&self, p: T) -> T { + if p.is_infinite() && p.is_sign_positive() { + self.iter().fold(T::neg_infinity(), |f, &val| { + let v = val.abs(); + if f > v { + f + } else { + v + } + }) + } else if p.is_infinite() && p.is_sign_negative() { + self.iter().fold(T::infinity(), |f, &val| { + let v = val.abs(); + if f < v { + f + } else { + v + } + }) + } else { + let mut norm = T::zero(); + + for xi in self.iter() { + norm = norm + xi.abs().powf(p); + } + + norm.powf(T::one() / p) + } + } + + fn div_element_mut(&mut self, pos: usize, x: T) { + *self.get_mut(pos).unwrap() = *self.get(pos).unwrap() / x; + } + + fn mul_element_mut(&mut self, pos: usize, x: T) { + *self.get_mut(pos).unwrap() = *self.get(pos).unwrap() * x; + } + + fn add_element_mut(&mut self, pos: usize, x: T) { + *self.get_mut(pos).unwrap() = *self.get(pos).unwrap() + x; + } + + fn sub_element_mut(&mut self, pos: usize, x: T) { + *self.get_mut(pos).unwrap() = *self.get(pos).unwrap() - x; + } + + fn add_mut(&mut self, other: &Self) -> &Self { + *self += other; + self + } + + fn sub_mut(&mut self, other: &Self) -> &Self { + *self -= other; + self + } + + fn 
mul_mut(&mut self, other: &Self) -> &Self { + self.component_mul_assign(other); + self + } + + fn div_mut(&mut self, other: &Self) -> &Self { + self.component_div_assign(other); + self + } + fn approximate_eq(&self, other: &Self, error: T) -> bool { if self.shape() != other.shape() { false @@ -93,6 +163,21 @@ impl BaseVector for MatrixMN { .all(|(a, b)| (*a - *b).abs() <= error) } } + + fn sum(&self) -> T { + let mut sum = T::zero(); + for v in self.iter() { + sum += *v; + } + sum + } + + fn unique(&self) -> Vec { + let mut result: Vec = self.iter().map(|v| *v).collect(); + result.sort_by(|a, b| a.partial_cmp(b).unwrap()); + result.dedup(); + result + } } impl diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 9cfb6d7..9f911f5 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -89,9 +89,90 @@ impl BaseVector for ArrayBase, Ix self.dot(other) } + fn norm2(&self) -> T { + self.iter().map(|x| *x * *x).sum::().sqrt() + } + + fn norm(&self, p: T) -> T { + if p.is_infinite() && p.is_sign_positive() { + self.iter().fold(T::neg_infinity(), |f, &val| { + let v = val.abs(); + if f > v { + f + } else { + v + } + }) + } else if p.is_infinite() && p.is_sign_negative() { + self.iter().fold(T::infinity(), |f, &val| { + let v = val.abs(); + if f < v { + f + } else { + v + } + }) + } else { + let mut norm = T::zero(); + + for xi in self.iter() { + norm = norm + xi.abs().powf(p); + } + + norm.powf(T::one() / p) + } + } + + fn div_element_mut(&mut self, pos: usize, x: T) { + self[pos] = self[pos] / x; + } + + fn mul_element_mut(&mut self, pos: usize, x: T) { + self[pos] = self[pos] * x; + } + + fn add_element_mut(&mut self, pos: usize, x: T) { + self[pos] = self[pos] + x; + } + + fn sub_element_mut(&mut self, pos: usize, x: T) { + self[pos] = self[pos] - x; + } + fn approximate_eq(&self, other: &Self, error: T) -> bool { (self - other).iter().all(|v| v.abs() <= error) } + + fn add_mut(&mut self, other: &Self) -> &Self { 
+ *self += other; + self + } + + fn sub_mut(&mut self, other: &Self) -> &Self { + *self -= other; + self + } + + fn mul_mut(&mut self, other: &Self) -> &Self { + *self *= other; + self + } + + fn div_mut(&mut self, other: &Self) -> &Self { + *self /= other; + self + } + + fn sum(&self) -> T { + self.sum() + } + + fn unique(&self) -> Vec { + let mut result = self.clone().into_raw_vec(); + result.sort_by(|a, b| a.partial_cmp(b).unwrap()); + result.dedup(); + result + } } impl diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 6b31683..0605907 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -1,5 +1,7 @@ //! # Support Vector Machines //! +//! +//! pub mod svc; pub mod svr; @@ -9,18 +11,161 @@ use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; -/// Kernel +/// Defines a kernel function pub trait Kernel> { /// Apply kernel function to x_i and x_j fn apply(&self, x_i: &V, x_j: &V) -> T; } +/// Pre-defined kernel functions +pub struct Kernels {} + +impl Kernels { + /// Linear kernel + pub fn linear() -> LinearKernel { + LinearKernel {} + } + + /// Radial basis function kernel (Gaussian) + pub fn rbf(gamma: T) -> RBFKernel { + RBFKernel { gamma: gamma } + } + + /// Polynomial kernel + /// * `degree` - degree of the polynomial + /// * `gamma` - kernel coefficient + /// * `coef0` - independent term in kernel function + pub fn polynomial(degree: T, gamma: T, coef0: T) -> PolynomialKernel { + PolynomialKernel { + degree: degree, + gamma: gamma, + coef0: coef0, + } + } + + /// Polynomial kernel + /// * `degree` - degree of the polynomial + /// * `n_features` - number of features in vector + pub fn polynomial_with_degree( + degree: T, + n_features: usize, + ) -> PolynomialKernel { + let coef0 = T::one(); + let gamma = T::one() / T::from_usize(n_features).unwrap(); + Kernels::polynomial(degree, gamma, coef0) + } + + /// Sigmoid kernel + /// * `gamma` - kernel coefficient + /// * `coef0` - independent term in kernel function + 
pub fn sigmoid(gamma: T, coef0: T) -> SigmoidKernel { + SigmoidKernel { + gamma: gamma, + coef0: coef0, + } + } + + /// Sigmoid kernel + /// * `gamma` - kernel coefficient + pub fn sigmoid_with_gamma(gamma: T) -> SigmoidKernel { + SigmoidKernel { + gamma: gamma, + coef0: T::one(), + } + } +} + /// Linear Kernel #[derive(Serialize, Deserialize, Debug)] pub struct LinearKernel {} +/// Radial basis function (Gaussian) kernel +pub struct RBFKernel { + /// kernel coefficient + pub gamma: T, +} + +/// Polynomial kernel +pub struct PolynomialKernel { + /// degree of the polynomial + pub degree: T, + /// kernel coefficient + pub gamma: T, + /// independent term in kernel function + pub coef0: T, +} + +/// Sigmoid (hyperbolic tangent) kernel +pub struct SigmoidKernel { + /// kernel coefficient + pub gamma: T, + /// independent term in kernel function + pub coef0: T, +} + impl> Kernel for LinearKernel { fn apply(&self, x_i: &V, x_j: &V) -> T { x_i.dot(x_j) } } + +impl> Kernel for RBFKernel { + fn apply(&self, x_i: &V, x_j: &V) -> T { + let v_diff = x_i.sub(x_j); + (-self.gamma * v_diff.mul(&v_diff).sum()).exp() + } +} + +impl> Kernel for PolynomialKernel { + fn apply(&self, x_i: &V, x_j: &V) -> T { + let dot = x_i.dot(x_j); + (self.gamma * dot + self.coef0).powf(self.degree) + } +} + +impl> Kernel for SigmoidKernel { + fn apply(&self, x_i: &V, x_j: &V) -> T { + let dot = x_i.dot(x_j); + (self.gamma * dot + self.coef0).tanh() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn linear_kernel() { + let v1 = vec![1., 2., 3.]; + let v2 = vec![4., 5., 6.]; + + assert_eq!(32f64, Kernels::linear().apply(&v1, &v2)); + } + + #[test] + fn rbf_kernel() { + let v1 = vec![1., 2., 3.]; + let v2 = vec![4., 5., 6.]; + + assert!((0.2265f64 - Kernels::rbf(0.055).apply(&v1, &v2)).abs() < 1e-4); + } + + #[test] + fn polynomial_kernel() { + let v1 = vec![1., 2., 3.]; + let v2 = vec![4., 5., 6.]; + + assert!( + (4913f64 - Kernels::polynomial(3.0, 0.5, 1.0).apply(&v1, &v2)).abs() + 
< std::f64::EPSILON + ); + } + + #[test] + fn sigmoid_kernel() { + let v1 = vec![1., 2., 3.]; + let v2 = vec![4., 5., 6.]; + + assert!((0.3969f64 - Kernels::sigmoid(0.01, 0.1).apply(&v1, &v2)).abs() < 1e-4); + } +} diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 22623b4..46625a9 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -5,7 +5,7 @@ //! ``` //! use smartcore::linalg::naive::dense_matrix::*; //! use smartcore::linear::linear_regression::*; -//! use smartcore::svm::LinearKernel; +//! use smartcore::svm::Kernels; //! use smartcore::svm::svc::{SVC, SVCParameters}; //! //! // Iris dataset @@ -31,11 +31,11 @@ //! &[6.6, 2.9, 4.6, 1.3], //! &[5.2, 2.7, 3.9, 1.4], //! ]); -//! let y = vec![ -1., -1., -1., -1., -1., -1., -1., -1., +//! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0., //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; //! //! let svr = SVC::fit(&x, &y, -//! LinearKernel {}, +//! Kernels::linear(), //! SVCParameters { //! epoch: 2, //! c: 200.0, @@ -83,6 +83,7 @@ pub struct SVCParameters { ))] /// Support Vector Classifier pub struct SVC, K: Kernel> { + classes: Vec, kernel: K, instances: Vec, w: Vec, @@ -150,11 +151,32 @@ impl, K: Kernel> SVC { ))); } - let optimizer = Optimizer::new(x, y, &kernel, ¶meters); + let classes = y.unique(); + + if classes.len() != 2 { + return Err(Failed::fit(&format!( + "Incorrect number of classes {}", classes.len() + ))); + } + + // Make sure class labels are either 1 or -1 + let mut y = y.clone(); + for i in 0..y.len() { + let y_v = y.get(i); + if y_v != -T::one() || y_v != T::one() { + match y_v == classes[0] { + true => y.set(i, -T::one()), + false => y.set(i, T::one()) + } + } + } + + let optimizer = Optimizer::new(x, &y, &kernel, ¶meters); let (support_vectors, weight, b) = optimizer.optimize(); Ok(SVC { + classes: classes, kernel: kernel, instances: support_vectors, w: weight, @@ -170,7 +192,11 @@ impl, K: Kernel> SVC { let mut y_hat = M::RowVector::zeros(n); for i in 0..n { - y_hat.set(i, 
self.predict_for_row(x.get_row(i))); + let cls_idx = match self.predict_for_row(x.get_row(i)) == T::one() { + false => self.classes[0], + true => self.classes[1] + }; + y_hat.set(i, cls_idx); } Ok(y_hat) @@ -647,13 +673,13 @@ mod tests { ]); let y: Vec = vec![ - -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; let y_hat = SVC::fit( &x, &y, - LinearKernel {}, + Kernels::linear(), SVCParameters { epoch: 2, c: 200.0, @@ -663,6 +689,53 @@ mod tests { .and_then(|lr| lr.predict(&x)) .unwrap(); + println!("{:?}", y_hat); + + assert!(accuracy(&y_hat, &y) >= 0.9); + } + + #[test] + fn svc_fit_predict_rbf() { + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + + let y: Vec = vec![ + -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let y_hat = SVC::fit( + &x, + &y, + Kernels::rbf(0.7), + SVCParameters { + epoch: 2, + c: 1.0, + tol: 1e-3, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -695,7 +768,7 @@ mod tests { -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let svr = SVC::fit(&x, &y, LinearKernel {}, Default::default()).unwrap(); + let svr = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); let deserialized_svr: SVC, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); 
From 797dc3c8e08c221f1978a231fd04d2ba07d7f552 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 28 Oct 2020 17:23:40 -0700 Subject: [PATCH 12/79] fix: formatting --- src/svm/mod.rs | 2 +- src/svm/svc.rs | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 0605907..97955e7 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -79,7 +79,7 @@ impl Kernels { #[derive(Serialize, Deserialize, Debug)] pub struct LinearKernel {} -/// Radial basis function (Gaussian) kernel +/// Radial basis function (Gaussian) kernel pub struct RBFKernel { /// kernel coefficient pub gamma: T, diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 46625a9..97347fd 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -155,7 +155,8 @@ impl, K: Kernel> SVC { if classes.len() != 2 { return Err(Failed::fit(&format!( - "Incorrect number of classes {}", classes.len() + "Incorrect number of classes {}", + classes.len() ))); } @@ -166,7 +167,7 @@ impl, K: Kernel> SVC { if y_v != -T::one() || y_v != T::one() { match y_v == classes[0] { true => y.set(i, -T::one()), - false => y.set(i, T::one()) + false => y.set(i, T::one()), } } } @@ -194,7 +195,7 @@ impl, K: Kernel> SVC { for i in 0..n { let cls_idx = match self.predict_for_row(x.get_row(i)) == T::one() { false => self.classes[0], - true => self.classes[1] + true => self.classes[1], }; y_hat.set(i, cls_idx); } @@ -720,7 +721,8 @@ mod tests { ]); let y: Vec = vec![ - -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., ]; let y_hat = SVC::fit( @@ -734,7 +736,7 @@ mod tests { }, ) .and_then(|lr| lr.predict(&x)) - .unwrap(); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } From 3a3f90491494351fc20806d3d0a66698ff99b36e Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 30 Oct 2020 15:08:05 -0700 Subject: [PATCH 13/79] feat: documents SVM, SVR and SVC 
--- src/svm/mod.rs | 20 ++++++++++++++++++++ src/svm/svc.rs | 31 +++++++++++++++++++++++++++++-- src/svm/svr.rs | 27 ++++++++++++++++++++++++--- 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 97955e7..c9aea15 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -1,4 +1,24 @@ //! # Support Vector Machines +//! +//! Support Vector Machines (SVM) is one of the most performant off-the-shelf machine learning algorithms. +//! SVM is based on the [Vapnik–Chervonenkiy theory](https://en.wikipedia.org/wiki/Vapnik%E2%80%93Chervonenkis_theory) that was developed during 1960–1990 by Vladimir Vapnik and Alexey Chervonenkiy. +//! +//! SVM splits data into two sets using a maximal-margin decision boundary, \\(f(x)\\). For regression, the algorithm uses a value of the function \\(f(x)\\) to predict a target value. +//! To classify a new point, algorithm calculates a sign of the decision function to see where the new point is relative to the boundary. +//! +//! SVM is memory efficient since it uses only a subset of training data to find a decision boundary. This subset is called support vectors. +//! +//! In SVM distance between a data point and the support vectors is defined by the kernel function. +//! SmartCore supports multiple kernel functions but you can always define a new kernel function by implementing the `Kernel` trait. Not all functions can be a kernel. +//! Building a new kernel requires a good mathematical understanding of the [Mercer theorem](https://en.wikipedia.org/wiki/Mercer%27s_theorem) +//! that gives necessary and sufficient condition for a function to be a kernel function. +//! +//! Pre-defined kernel functions: +//! +//! * *Linear*, \\( K(x, x') = \langle x, x' \rangle\\) +//! * *Polynomial*, \\( K(x, x') = (\gamma\langle x, x' \rangle + r)^d\\), where \\(d\\) is polynomial degree, \\(\gamma\\) is a kernel coefficient and \\(r\\) is an independent term in the kernel function. +//! 
* *RBF (Gaussian)*, \\( K(x, x') = e^{-\gamma \lVert x - x' \rVert ^2} \\), where \\(\gamma\\) is kernel coefficient +//! * *Sigmoid (hyperbolic tangent)*, \\( K(x, x') = \tanh ( \gamma \langle x, x' \rangle + r ) \\), where \\(\gamma\\) is kernel coefficient and \\(r\\) is an independent term in the kernel function. //! //! //! diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 97347fd..5cce80a 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -1,6 +1,30 @@ //! # Support Vector Classifier. +//! +//! Support Vector Classifier (SVC) is a binary classifier that uses an optimal hyperplane to separate the points in the input variable space by their class. +//! +//! During training, SVC chooses a Maximal-Margin hyperplane that can separate all training instances with the largest margin. +//! The margin is calculated as the perpendicular distance from the boundary to only the closest points. Hence, only these points are relevant in defining +//! the hyperplane and in the construction of the classifier. These points are called the support vectors. +//! +//! While SVC selects a hyperplane with the largest margin it allows some points in the training data to violate the separating boundary. +//! The parameter `C` > 0 gives you control over how SVC will handle violating points. The bigger the value of this parameter the more we penalize the algorithm +//! for incorrectly classified points. In other words, setting this parameter to a small value will result in a classifier that allows for a big number +//! of misclassified samples. Mathematically, SVC optimization problem can be defined as: +//! +//! \\[\underset{w, \zeta}{minimize} \space \space \frac{1}{2} \lVert \vec{w} \rVert^2 + C\sum_{i=1}^m \zeta_i \\] +//! +//! subject to: +//! +//! \\[y_i(\langle\vec{w}, \vec{x}_i \rangle + b) \geq 1 - \zeta_i \\] +//! \\[\zeta_i \geq 0 for \space any \space i = 1, ... , m\\] +//! +//! 
Where \\( m \\) is a number of training samples, \\( y_i \\) is a label value (either 1 or -1) and \\(\langle\vec{w}, \vec{x}_i \rangle + b\\) is a decision boundary. +//! +//! To solve this optimization problem, SmartCore uses an [approximate SVM solver](https://leon.bottou.org/projects/lasvm). +//! The optimizer reaches accuracies similar to that of a real SVM after performing two passes through the training examples. You can choose the number of passes +//! through the data that the algorithm takes by changing the `epoch` parameter of the classifier. //! -//! Example +//! Example: //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; @@ -47,8 +71,11 @@ //! //! ## References: //! -//! * ["Support Vector Machines" Kowalczyk A., 2017](https://www.svm-tutorial.com/2017/10/support-vector-machines-succinctly-released/) +//! * ["Support Vector Machines", Kowalczyk A., 2017](https://www.svm-tutorial.com/2017/10/support-vector-machines-succinctly-released/) //! * ["Fast Kernel Classifiers with Online and Active Learning", Bordes A., Ertekin S., Weston J., Bottou L., 2005](https://www.jmlr.org/papers/volume6/bordes05a/bordes05a.pdf) +//! +//! +//! use std::collections::{HashMap, HashSet}; use std::fmt::Debug; diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 29c6b2e..be5d7b9 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -1,6 +1,24 @@ //! # Epsilon-Support Vector Regression. +//! +//! Support Vector Regression (SVR) is a popular algorithm used for regression that uses the same principle as SVM. +//! +//! Just like [SVC](../svc/index.html) SVR finds optimal decision boundary, \\(f(x)\\) that separates all training instances with the largest margin. +//! Unlike SVC, in \\(\epsilon\\)-SVR regression the goal is to find a function \\(f(x)\\) that has at most \\(\epsilon\\) deviation from the +//! known targets \\(y_i\\) for all the training data. To find this function, we need to find solution to this optimization problem: +//! +//! 
\\[\underset{w, \zeta}{minimize} \space \space \frac{1}{2} \lVert \vec{w} \rVert^2 + C\sum_{i=1}^m \zeta_i \\] +//! +//! subject to: +//! +//! \\[\lvert y_i - \langle\vec{w}, \vec{x}_i \rangle - b \rvert \leq \epsilon + \zeta_i \\] +//! \\[\lvert \langle\vec{w}, \vec{x}_i \rangle + b - y_i \rvert \leq \epsilon + \zeta_i \\] +//! \\[\zeta_i \geq 0 for \space any \space i = 1, ... , m\\] +//! +//! Where \\( m \\) is a number of training samples, \\( y_i \\) is a target value and \\(\langle\vec{w}, \vec{x}_i \rangle + b\\) is a decision boundary. +//! +//! The parameter `C` > 0 determines the trade-off between the flatness of \\(f(x)\\) and the amount up to which deviations larger than \\(\epsilon\\) are tolerated //! -//! Example +//! Example: //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; @@ -44,10 +62,13 @@ //! //! ## References: //! -//! * ["Support Vector Machines" Kowalczyk A., 2017](https://www.svm-tutorial.com/2017/10/support-vector-machines-succinctly-released/) +//! * ["Support Vector Machines", Kowalczyk A., 2017](https://www.svm-tutorial.com/2017/10/support-vector-machines-succinctly-released/) //! * ["A Fast Algorithm for Training Support Vector Machines", Platt J.C., 1998](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-98-14.pdf) //! * ["Working Set Selection Using Second Order Information for Training Support Vector Machines", Rong-En Fan et al., 2005](https://www.jmlr.org/papers/volume6/fan05a/fan05a.pdf) -//! * ["A tutorial on support vector regression", SMOLA A.J., Scholkopf B., 2003](https://alex.smola.org/papers/2004/SmoSch04.pdf) +//! * ["A tutorial on support vector regression", Smola A.J., Scholkopf B., 2003](https://alex.smola.org/papers/2004/SmoSch04.pdf) +//! +//! +//! 
use std::cell::{Ref, RefCell}; use std::fmt::Debug; From 81395bcbb71f0ab5b01e7158c3cc747cad17eb85 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 30 Oct 2020 15:08:22 -0700 Subject: [PATCH 14/79] fix: formatting --- src/svm/mod.rs | 22 +++++++++++----------- src/svm/svc.rs | 36 ++++++++++++++++++------------------ src/svm/svr.rs | 22 +++++++++++----------- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/svm/mod.rs b/src/svm/mod.rs index c9aea15..84a405e 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -1,20 +1,20 @@ //! # Support Vector Machines -//! -//! Support Vector Machines (SVM) is one of the most performant off-the-shelf machine learning algorithms. +//! +//! Support Vector Machines (SVM) is one of the most performant off-the-shelf machine learning algorithms. //! SVM is based on the [Vapnik–Chervonenkiy theory](https://en.wikipedia.org/wiki/Vapnik%E2%80%93Chervonenkis_theory) that was developed during 1960–1990 by Vladimir Vapnik and Alexey Chervonenkiy. -//! -//! SVM splits data into two sets using a maximal-margin decision boundary, \\(f(x)\\). For regression, the algorithm uses a value of the function \\(f(x)\\) to predict a target value. +//! +//! SVM splits data into two sets using a maximal-margin decision boundary, \\(f(x)\\). For regression, the algorithm uses a value of the function \\(f(x)\\) to predict a target value. //! To classify a new point, algorithm calculates a sign of the decision function to see where the new point is relative to the boundary. -//! +//! //! SVM is memory efficient since it uses only a subset of training data to find a decision boundary. This subset is called support vectors. -//! -//! In SVM distance between a data point and the support vectors is defined by the kernel function. -//! SmartCore supports multiple kernel functions but you can always define a new kernel function by implementing the `Kernel` trait. Not all functions can be a kernel. -//! 
Building a new kernel requires a good mathematical understanding of the [Mercer theorem](https://en.wikipedia.org/wiki/Mercer%27s_theorem) +//! +//! In SVM distance between a data point and the support vectors is defined by the kernel function. +//! SmartCore supports multiple kernel functions but you can always define a new kernel function by implementing the `Kernel` trait. Not all functions can be a kernel. +//! Building a new kernel requires a good mathematical understanding of the [Mercer theorem](https://en.wikipedia.org/wiki/Mercer%27s_theorem) //! that gives necessary and sufficient condition for a function to be a kernel function. -//! +//! //! Pre-defined kernel functions: -//! +//! //! * *Linear*, \\( K(x, x') = \langle x, x' \rangle\\) //! * *Polynomial*, \\( K(x, x') = (\gamma\langle x, x' \rangle + r)^d\\), where \\(d\\) is polynomial degree, \\(\gamma\\) is a kernel coefficient and \\(r\\) is an independent term in the kernel function. //! * *RBF (Gaussian)*, \\( K(x, x') = e^{-\gamma \lVert x - x' \rVert ^2} \\), where \\(\gamma\\) is kernel coefficient diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 5cce80a..04b3b7b 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -1,27 +1,27 @@ //! # Support Vector Classifier. -//! +//! //! Support Vector Classifier (SVC) is a binary classifier that uses an optimal hyperplane to separate the points in the input variable space by their class. -//! -//! During training, SVC chooses a Maximal-Margin hyperplane that can separate all training instances with the largest margin. -//! The margin is calculated as the perpendicular distance from the boundary to only the closest points. Hence, only these points are relevant in defining +//! +//! During training, SVC chooses a Maximal-Margin hyperplane that can separate all training instances with the largest margin. +//! The margin is calculated as the perpendicular distance from the boundary to only the closest points. 
Hence, only these points are relevant in defining //! the hyperplane and in the construction of the classifier. These points are called the support vectors. -//! -//! While SVC selects a hyperplane with the largest margin it allows some points in the training data to violate the separating boundary. -//! The parameter `C` > 0 gives you control over how SVC will handle violating points. The bigger the value of this parameter the more we penalize the algorithm -//! for incorrectly classified points. In other words, setting this parameter to a small value will result in a classifier that allows for a big number -//! of misclassified samples. Mathematically, SVC optimization problem can be defined as: -//! +//! +//! While SVC selects a hyperplane with the largest margin it allows some points in the training data to violate the separating boundary. +//! The parameter `C` > 0 gives you control over how SVC will handle violating points. The bigger the value of this parameter the more we penalize the algorithm +//! for incorrectly classified points. In other words, setting this parameter to a small value will result in a classifier that allows for a big number +//! of misclassified samples. Mathematically, SVC optimization problem can be defined as: +//! //! \\[\underset{w, \zeta}{minimize} \space \space \frac{1}{2} \lVert \vec{w} \rVert^2 + C\sum_{i=1}^m \zeta_i \\] -//! +//! //! subject to: -//! +//! //! \\[y_i(\langle\vec{w}, \vec{x}_i \rangle + b) \geq 1 - \zeta_i \\] //! \\[\zeta_i \geq 0 for \space any \space i = 1, ... , m\\] -//! -//! Where \\( m \\) is a number of training samples, \\( y_i \\) is a label value (either 1 or -1) and \\(\langle\vec{w}, \vec{x}_i \rangle + b\\) is a decision boundary. -//! -//! To solve this optimization problem, SmartCore uses an [approximate SVM solver](https://leon.bottou.org/projects/lasvm). -//! The optimizer reaches accuracies similar to that of a real SVM after performing two passes through the training examples. 
You can choose the number of passes +//! +//! Where \\( m \\) is a number of training samples, \\( y_i \\) is a label value (either 1 or -1) and \\(\langle\vec{w}, \vec{x}_i \rangle + b\\) is a decision boundary. +//! +//! To solve this optimization problem, SmartCore uses an [approximate SVM solver](https://leon.bottou.org/projects/lasvm). +//! The optimizer reaches accuracies similar to that of a real SVM after performing two passes through the training examples. You can choose the number of passes //! through the data that the algorithm takes by changing the `epoch` parameter of the classifier. //! //! Example: @@ -73,7 +73,7 @@ //! //! * ["Support Vector Machines", Kowalczyk A., 2017](https://www.svm-tutorial.com/2017/10/support-vector-machines-succinctly-released/) //! * ["Fast Kernel Classifiers with Online and Active Learning", Bordes A., Ertekin S., Weston J., Bottou L., 2005](https://www.jmlr.org/papers/volume6/bordes05a/bordes05a.pdf) -//! +//! //! //! diff --git a/src/svm/svr.rs b/src/svm/svr.rs index be5d7b9..0fcaa30 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -1,21 +1,21 @@ //! # Epsilon-Support Vector Regression. -//! -//! Support Vector Regression (SVR) is a popular algorithm used for regression that uses the same principle as SVM. -//! +//! +//! Support Vector Regression (SVR) is a popular algorithm used for regression that uses the same principle as SVM. +//! //! Just like [SVC](../svc/index.html) SVR finds optimal decision boundary, \\(f(x)\\) that separates all training instances with the largest margin. -//! Unlike SVC, in \\(\epsilon\\)-SVR regression the goal is to find a function \\(f(x)\\) that has at most \\(\epsilon\\) deviation from the +//! Unlike SVC, in \\(\epsilon\\)-SVR regression the goal is to find a function \\(f(x)\\) that has at most \\(\epsilon\\) deviation from the //! known targets \\(y_i\\) for all the training data. To find this function, we need to find solution to this optimization problem: -//! +//! //! 
\\[\underset{w, \zeta}{minimize} \space \space \frac{1}{2} \lVert \vec{w} \rVert^2 + C\sum_{i=1}^m \zeta_i \\] -//! +//! //! subject to: -//! +//! //! \\[\lvert y_i - \langle\vec{w}, \vec{x}_i \rangle - b \rvert \leq \epsilon + \zeta_i \\] //! \\[\lvert \langle\vec{w}, \vec{x}_i \rangle + b - y_i \rvert \leq \epsilon + \zeta_i \\] //! \\[\zeta_i \geq 0 for \space any \space i = 1, ... , m\\] -//! -//! Where \\( m \\) is a number of training samples, \\( y_i \\) is a target value and \\(\langle\vec{w}, \vec{x}_i \rangle + b\\) is a decision boundary. -//! +//! +//! Where \\( m \\) is a number of training samples, \\( y_i \\) is a target value and \\(\langle\vec{w}, \vec{x}_i \rangle + b\\) is a decision boundary. +//! //! The parameter `C` > 0 determines the trade-off between the flatness of \\(f(x)\\) and the amount up to which deviations larger than \\(\epsilon\\) are tolerated //! //! Example: @@ -66,7 +66,7 @@ //! * ["A Fast Algorithm for Training Support Vector Machines", Platt J.C., 1998](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-98-14.pdf) //! * ["Working Set Selection Using Second Order Information for Training Support Vector Machines", Rong-En Fan et al., 2005](https://www.jmlr.org/papers/volume6/fan05a/fan05a.pdf) //! * ["A tutorial on support vector regression", Smola A.J., Scholkopf B., 2003](https://alex.smola.org/papers/2004/SmoSch04.pdf) -//! +//! //! //! 
From a9446c00c2c97d41f6522492b162ff934b830edd Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sat, 31 Oct 2020 14:43:52 -0700 Subject: [PATCH 15/79] fix: fixes a bug in Eq implementation for SVC and SVR --- src/svm/svc.rs | 2 +- src/svm/svr.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 04b3b7b..119b812 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -247,7 +247,7 @@ impl, K: Kernel> SVC { impl, K: Kernel> PartialEq for SVC { fn eq(&self, other: &Self) -> bool { - if self.b != other.b + if (self.b - other.b).abs() > T::epsilon() * T::two() || self.w.len() != other.w.len() || self.instances.len() != other.instances.len() { diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 0fcaa30..61feb80 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -204,7 +204,7 @@ impl, K: Kernel> SVR { impl, K: Kernel> PartialEq for SVR { fn eq(&self, other: &Self) -> bool { - if self.b != other.b + if (self.b - other.b).abs() > T::epsilon() * T::two() || self.w.len() != other.w.len() || self.instances.len() != other.instances.len() { From 6473a6c4aebf5800d973879f0959242a47403c22 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 3 Nov 2020 15:39:43 -0800 Subject: [PATCH 16/79] feat: adds Cholesky matrix decomposition --- src/error/mod.rs | 3 + src/linalg/cholesky.rs | 206 +++++++++++++++++++++++++++++++ src/linalg/mod.rs | 3 + src/linalg/naive/dense_matrix.rs | 3 + src/linalg/nalgebra_bindings.rs | 6 + src/linalg/ndarray_bindings.rs | 6 + 6 files changed, 227 insertions(+) create mode 100644 src/linalg/cholesky.rs diff --git a/src/error/mod.rs b/src/error/mod.rs index 320b991..c411e87 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -24,6 +24,8 @@ pub enum FailedError { FindFailed, /// Can't decompose a matrix DecompositionFailed, + /// Can't solve for x + SolutionFailed, } impl Failed { @@ -87,6 +89,7 @@ impl fmt::Display for FailedError { FailedError::TransformFailed => "Transform failed", 
FailedError::FindFailed => "Find failed", FailedError::DecompositionFailed => "Decomposition failed", + FailedError::SolutionFailed => "Can't find solution", }; write!(f, "{}", failed_err_str) } diff --git a/src/linalg/cholesky.rs b/src/linalg/cholesky.rs new file mode 100644 index 0000000..286e3f2 --- /dev/null +++ b/src/linalg/cholesky.rs @@ -0,0 +1,206 @@ +//! # Cholesky Decomposition +//! +//! every positive definite matrix \\(A \in R^{n \times n}\\) can be factored as +//! +//! \\[A = R^TR\\] +//! +//! where \\(R\\) is upper triangular matrix with positive diagonal elements +//! +//! Example: +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use crate::smartcore::linalg::cholesky::*; +//! +//! let A = DenseMatrix::from_2d_array(&[ +//! &[25., 15., -5.], +//! &[15., 18., 0.], +//! &[-5., 0., 11.] +//! ]); +//! +//! let cholesky = A.cholesky().unwrap(); +//! let lower_triangular: DenseMatrix = cholesky.L(); +//! let upper_triangular: DenseMatrix = cholesky.U(); +//! ``` +//! +//! ## References: +//! * ["No bullshit guide to linear algebra", Ivan Savov, 2016, 7.6 Matrix decompositions](https://minireference.com/) +//! * ["Numerical Recipes: The Art of Scientific Computing", Press W.H., Teukolsky S.A., Vetterling W.T, Flannery B.P, 3rd ed., 2.9 Cholesky Decomposition](http://numerical.recipes/) +//! +//! +//! +#![allow(non_snake_case)] + +use std::fmt::Debug; +use std::marker::PhantomData; + +use crate::error::{Failed, FailedError}; +use crate::linalg::BaseMatrix; +use crate::math::num::RealNumber; + +#[derive(Debug, Clone)] +/// Results of Cholesky decomposition. +pub struct Cholesky> { + R: M, + t: PhantomData +} + +impl> Cholesky { + pub(crate) fn new(R: M) -> Cholesky { + Cholesky { + R: R, + t: PhantomData + } + } + + /// Get lower triangular matrix. 
+ pub fn L(&self) -> M { + let (n, _) = self.R.shape(); + let mut R = M::zeros(n, n); + + for i in 0..n { + for j in 0..n { + if j <= i { + R.set(i, j, self.R.get(i, j)); + } + } + } + R + } + + /// Get upper triangular matrix. + pub fn U(&self) -> M { + let (n, _) = self.R.shape(); + let mut R = M::zeros(n, n); + + for i in 0..n { + for j in 0..n { + if j <= i { + R.set(j, i, self.R.get(i, j)); + } + } + } + R + } + + /// Solves Ax = b + pub(crate) fn solve(&self, mut b: M) -> Result { + + let (bn, m) = b.shape(); + let (rn, _) = self.R.shape(); + + if bn != rn { + return Err(Failed::because(FailedError::SolutionFailed, &format!( + "Can't solve Ax = b for x. Number of rows in b != number of rows in R." + ))); + } + + for k in 0..bn { + for j in 0..m { + for i in 0..k { + b.sub_element_mut(k, j, b.get(i, j) * self.R.get(k, i)); + } + b.div_element_mut(k, j, self.R.get(k, k)); + } + } + + for k in (0..bn).rev() { + for j in 0..m { + for i in k + 1..bn { + b.sub_element_mut(k, j, b.get(i, j) * self.R.get(i, k)); + } + b.div_element_mut(k, j, self.R.get(k, k)); + } + } + Ok(b) + } +} + +/// Trait that implements Cholesky decomposition routine for any matrix. +pub trait CholeskyDecomposableMatrix: BaseMatrix { + /// Compute the Cholesky decomposition of a matrix. + fn cholesky(&self) -> Result, Failed> { + self.clone().cholesky_mut() + } + + /// Compute the Cholesky decomposition of a matrix. The input matrix + /// will be used for factorization. 
+ fn cholesky_mut(mut self) -> Result, Failed> { + let (m, n) = self.shape(); + + if m != n { + return Err(Failed::because(FailedError::DecompositionFailed, &format!( + "Can't do Cholesky decomposition on a non-square matrix" + ))); + } + + for j in 0..n { + let mut d = T::zero(); + for k in 0..j { + let mut s = T::zero(); + for i in 0..k { + s += self.get(k, i) * self.get(j, i); + } + s = (self.get(j, k) - s) / self.get(k, k); + self.set(j, k, s); + d = d + s * s; + } + d = self.get(j, j) - d; + + if d < T::zero() { + return Err(Failed::because(FailedError::DecompositionFailed, &format!( + "The matrix is not positive definite." + ))); + } + + self.set(j, j, d.sqrt()); + } + + Ok(Cholesky::new(self)) + } + + /// Solves Ax = b + fn cholesky_solve_mut(self, b: Self) -> Result { + self.cholesky_mut().and_then(|qr| qr.solve(b)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + + #[test] + fn cholesky_decompose() { + let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); + let l = DenseMatrix::from_2d_array(&[ + &[5.0, 0.0, 0.0], + &[3.0, 3.0, 0.0], + &[-1.0, 1.0, 3.0], + ]); + let u = DenseMatrix::from_2d_array(&[ + &[5.0, 3.0, -1.0], + &[0.0, 3.0, 1.0], + &[0.0, 0.0, 3.0], + ]); + let cholesky = a.cholesky().unwrap(); + + assert!(cholesky.L().abs().approximate_eq(&l.abs(), 1e-4)); + assert!(cholesky.U().abs().approximate_eq(&u.abs(), 1e-4)); + assert!(cholesky.L().matmul(&cholesky.U()).abs().approximate_eq(&a.abs(), 1e-4)); + } + + #[test] + fn cholesky_solve_mut() { + let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); + let b = DenseMatrix::from_2d_array(&[&[40., 51., 28.]]); + let expected = DenseMatrix::from_2d_array(&[ + &[1.0, 2.0, 3.0] + ]); + + let cholesky = a.cholesky().unwrap(); + + assert!(cholesky.solve(b.transpose()).unwrap().transpose().approximate_eq(&expected, 1e-4)); + + } + +} diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs 
index 29c7a89..61749b0 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -49,6 +49,7 @@ pub mod ndarray_bindings; pub mod qr; /// Singular value decomposition. pub mod svd; +pub mod cholesky; use std::fmt::{Debug, Display}; use std::marker::PhantomData; @@ -59,6 +60,7 @@ use evd::EVDDecomposableMatrix; use lu::LUDecomposableMatrix; use qr::QRDecomposableMatrix; use svd::SVDDecomposableMatrix; +use cholesky::CholeskyDecomposableMatrix; /// Column or row vector pub trait BaseVector: Clone + Debug { @@ -507,6 +509,7 @@ pub trait Matrix: + EVDDecomposableMatrix + QRDecomposableMatrix + LUDecomposableMatrix + + CholeskyDecomposableMatrix + PartialEq + Display { diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index cf29061..b5ecd90 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -12,6 +12,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::svd::SVDDecomposableMatrix; +use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::Matrix; pub use crate::linalg::{BaseMatrix, BaseVector}; use crate::math::num::RealNumber; @@ -442,6 +443,8 @@ impl QRDecomposableMatrix for DenseMatrix {} impl LUDecomposableMatrix for DenseMatrix {} +impl CholeskyDecomposableMatrix for DenseMatrix {} + impl Matrix for DenseMatrix {} impl PartialEq for DenseMatrix { diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index 3596899..a400a67 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -46,6 +46,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::svd::SVDDecomposableMatrix; +use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::Matrix as SmartCoreMatrix; use crate::linalg::{BaseMatrix, BaseVector}; use 
crate::math::num::RealNumber; @@ -544,6 +545,11 @@ impl + CholeskyDecomposableMatrix for Matrix> +{ +} + impl SmartCoreMatrix for Matrix> { diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 9f911f5..76749a7 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -53,6 +53,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::svd::SVDDecomposableMatrix; +use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::Matrix; use crate::linalg::{BaseMatrix, BaseVector}; use crate::math::num::RealNumber; @@ -494,6 +495,11 @@ impl + CholeskyDecomposableMatrix for ArrayBase, Ix2> +{ +} + impl Matrix for ArrayBase, Ix2> { From b8fea67fd28cdbd67107fd423f0763b7f5a47722 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 3 Nov 2020 15:49:04 -0800 Subject: [PATCH 17/79] fix: formatting --- src/linalg/cholesky.rs | 90 ++++++++++++++++---------------- src/linalg/mod.rs | 4 +- src/linalg/naive/dense_matrix.rs | 2 +- src/linalg/nalgebra_bindings.rs | 2 +- src/linalg/ndarray_bindings.rs | 2 +- 5 files changed, 50 insertions(+), 50 deletions(-) diff --git a/src/linalg/cholesky.rs b/src/linalg/cholesky.rs index 286e3f2..e55d6bb 100644 --- a/src/linalg/cholesky.rs +++ b/src/linalg/cholesky.rs @@ -1,9 +1,9 @@ //! # Cholesky Decomposition //! -//! every positive definite matrix \\(A \in R^{n \times n}\\) can be factored as +//! every positive definite matrix \\(A \in R^{n \times n}\\) can be factored as //! //! \\[A = R^TR\\] -//! +//! //! where \\(R\\) is upper triangular matrix with positive diagonal elements //! //! Example: @@ -12,8 +12,8 @@ //! use crate::smartcore::linalg::cholesky::*; //! //! let A = DenseMatrix::from_2d_array(&[ -//! &[25., 15., -5.], -//! &[15., 18., 0.], +//! &[25., 15., -5.], +//! &[15., 18., 0.], //! &[-5., 0., 11.] //! ]); //! 
@@ -41,14 +41,14 @@ use crate::math::num::RealNumber; /// Results of Cholesky decomposition. pub struct Cholesky> { R: M, - t: PhantomData + t: PhantomData, } impl> Cholesky { pub(crate) fn new(R: M) -> Cholesky { Cholesky { R: R, - t: PhantomData + t: PhantomData, } } @@ -65,10 +65,10 @@ impl> Cholesky { } } R - } - + } + /// Get upper triangular matrix. - pub fn U(&self) -> M { + pub fn U(&self) -> M { let (n, _) = self.R.shape(); let mut R = M::zeros(n, n); @@ -80,20 +80,20 @@ impl> Cholesky { } } R - } + } /// Solves Ax = b - pub(crate) fn solve(&self, mut b: M) -> Result { - + pub(crate) fn solve(&self, mut b: M) -> Result { let (bn, m) = b.shape(); let (rn, _) = self.R.shape(); if bn != rn { - return Err(Failed::because(FailedError::SolutionFailed, &format!( - "Can't solve Ax = b for x. Number of rows in b != number of rows in R." - ))); + return Err(Failed::because( + FailedError::SolutionFailed, + &format!("Can't solve Ax = b for x. Number of rows in b != number of rows in R."), + )); } - + for k in 0..bn { for j in 0..m { for i in 0..k { @@ -102,7 +102,7 @@ impl> Cholesky { b.div_element_mut(k, j, self.R.get(k, k)); } } - + for k in (0..bn).rev() { for j in 0..m { for i in k + 1..bn { @@ -128,11 +128,12 @@ pub trait CholeskyDecomposableMatrix: BaseMatrix { let (m, n) = self.shape(); if m != n { - return Err(Failed::because(FailedError::DecompositionFailed, &format!( - "Can't do Cholesky decomposition on a non-square matrix" - ))); + return Err(Failed::because( + FailedError::DecompositionFailed, + &format!("Can't do Cholesky decomposition on a non-square matrix"), + )); } - + for j in 0..n { let mut d = T::zero(); for k in 0..j { @@ -147,9 +148,10 @@ pub trait CholeskyDecomposableMatrix: BaseMatrix { d = self.get(j, j) - d; if d < T::zero() { - return Err(Failed::because(FailedError::DecompositionFailed, &format!( - "The matrix is not positive definite." 
- ))); + return Err(Failed::because( + FailedError::DecompositionFailed, + &format!("The matrix is not positive definite."), + )); } self.set(j, j, d.sqrt()); @@ -172,35 +174,33 @@ mod tests { #[test] fn cholesky_decompose() { let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); - let l = DenseMatrix::from_2d_array(&[ - &[5.0, 0.0, 0.0], - &[3.0, 3.0, 0.0], - &[-1.0, 1.0, 3.0], - ]); - let u = DenseMatrix::from_2d_array(&[ - &[5.0, 3.0, -1.0], - &[0.0, 3.0, 1.0], - &[0.0, 0.0, 3.0], - ]); + let l = + DenseMatrix::from_2d_array(&[&[5.0, 0.0, 0.0], &[3.0, 3.0, 0.0], &[-1.0, 1.0, 3.0]]); + let u = + DenseMatrix::from_2d_array(&[&[5.0, 3.0, -1.0], &[0.0, 3.0, 1.0], &[0.0, 0.0, 3.0]]); let cholesky = a.cholesky().unwrap(); - - assert!(cholesky.L().abs().approximate_eq(&l.abs(), 1e-4)); - assert!(cholesky.U().abs().approximate_eq(&u.abs(), 1e-4)); - assert!(cholesky.L().matmul(&cholesky.U()).abs().approximate_eq(&a.abs(), 1e-4)); + + assert!(cholesky.L().abs().approximate_eq(&l.abs(), 1e-4)); + assert!(cholesky.U().abs().approximate_eq(&u.abs(), 1e-4)); + assert!(cholesky + .L() + .matmul(&cholesky.U()) + .abs() + .approximate_eq(&a.abs(), 1e-4)); } #[test] fn cholesky_solve_mut() { let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); let b = DenseMatrix::from_2d_array(&[&[40., 51., 28.]]); - let expected = DenseMatrix::from_2d_array(&[ - &[1.0, 2.0, 3.0] - ]); - + let expected = DenseMatrix::from_2d_array(&[&[1.0, 2.0, 3.0]]); + let cholesky = a.cholesky().unwrap(); - assert!(cholesky.solve(b.transpose()).unwrap().transpose().approximate_eq(&expected, 1e-4)); - + assert!(cholesky + .solve(b.transpose()) + .unwrap() + .transpose() + .approximate_eq(&expected, 1e-4)); } - } diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 61749b0..fb12909 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -33,6 +33,7 @@ //! let u: DenseMatrix = svd.U; //! 
``` +pub mod cholesky; /// The matrix is represented in terms of its eigenvalues and eigenvectors. pub mod evd; /// Factors a matrix as the product of a lower triangular matrix and an upper triangular matrix. @@ -49,18 +50,17 @@ pub mod ndarray_bindings; pub mod qr; /// Singular value decomposition. pub mod svd; -pub mod cholesky; use std::fmt::{Debug, Display}; use std::marker::PhantomData; use std::ops::Range; use crate::math::num::RealNumber; +use cholesky::CholeskyDecomposableMatrix; use evd::EVDDecomposableMatrix; use lu::LUDecomposableMatrix; use qr::QRDecomposableMatrix; use svd::SVDDecomposableMatrix; -use cholesky::CholeskyDecomposableMatrix; /// Column or row vector pub trait BaseVector: Clone + Debug { diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index b5ecd90..d3d6353 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -8,11 +8,11 @@ use serde::de::{Deserializer, MapAccess, SeqAccess, Visitor}; use serde::ser::{SerializeStruct, Serializer}; use serde::{Deserialize, Serialize}; +use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::svd::SVDDecomposableMatrix; -use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::Matrix; pub use crate::linalg::{BaseMatrix, BaseVector}; use crate::math::num::RealNumber; diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index a400a67..e0b885b 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -42,11 +42,11 @@ use std::ops::{AddAssign, DivAssign, MulAssign, Range, SubAssign}; use nalgebra::{DMatrix, Dynamic, Matrix, MatrixMN, RowDVector, Scalar, VecStorage, U1}; +use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use 
crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::svd::SVDDecomposableMatrix; -use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::Matrix as SmartCoreMatrix; use crate::linalg::{BaseMatrix, BaseVector}; use crate::math::num::RealNumber; diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 76749a7..00c9745 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -49,11 +49,11 @@ use std::ops::SubAssign; use ndarray::ScalarOperand; use ndarray::{s, stack, Array, ArrayBase, Axis, Ix1, Ix2, OwnedRepr}; +use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::svd::SVDDecomposableMatrix; -use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::Matrix; use crate::linalg::{BaseMatrix, BaseVector}; use crate::math::num::RealNumber; From ab7f46603c576f7c96feea8a9db09db79d996a54 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 6 Nov 2020 10:48:00 -0800 Subject: [PATCH 18/79] feat: + ridge regression --- src/linalg/mod.rs | 48 +++++ src/linalg/naive/dense_matrix.rs | 3 + src/linalg/nalgebra_bindings.rs | 6 + src/linalg/ndarray_bindings.rs | 6 + src/linalg/stats.rs | 139 +++++++++++++ src/linear/mod.rs | 1 + src/linear/ridge_regression.rs | 323 +++++++++++++++++++++++++++++++ 7 files changed, 526 insertions(+) create mode 100644 src/linalg/stats.rs create mode 100644 src/linear/ridge_regression.rs diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index fb12909..42ed558 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -48,6 +48,7 @@ pub mod nalgebra_bindings; pub mod ndarray_bindings; /// QR factorization that factors a matrix into a product of an orthogonal matrix and an upper triangular matrix. pub mod qr; +pub mod stats; /// Singular value decomposition. 
pub mod svd; @@ -60,6 +61,7 @@ use cholesky::CholeskyDecomposableMatrix; use evd::EVDDecomposableMatrix; use lu::LUDecomposableMatrix; use qr::QRDecomposableMatrix; +use stats::MatrixStats; use svd::SVDDecomposableMatrix; /// Column or row vector @@ -163,6 +165,32 @@ pub trait BaseVector: Clone + Debug { ///assert_eq!(a.unique(), vec![-7., -6., -2., 1., 2., 3., 4.]); /// ``` fn unique(&self) -> Vec; + + /// Compute the arithmetic mean. + fn mean(&self) -> T { + let n = self.len(); + let mut mean = T::zero(); + + for i in 0..n { + mean += self.get(i); + } + mean / T::from_usize(n).unwrap() + } + /// Compute the standard deviation. + fn std(&self) -> T { + let n = self.len(); + + let mut mu = T::zero(); + let mut sum = T::zero(); + let div = T::from_usize(n).unwrap(); + for i in 0..n { + let xi = self.get(i); + mu += xi; + sum += xi * xi; + } + mu /= div; + (sum / div - mu * mu).sqrt() + } } /// Generic matrix type. @@ -510,6 +538,7 @@ pub trait Matrix: + QRDecomposableMatrix + LUDecomposableMatrix + CholeskyDecomposableMatrix + + MatrixStats + PartialEq + Display { @@ -545,3 +574,22 @@ impl<'a, T: RealNumber, M: BaseMatrix> Iterator for RowIter<'a, T, M> { res } } + +#[cfg(test)] +mod tests { + use crate::linalg::BaseVector; + + #[test] + fn mean() { + let m = vec![1., 2., 3.]; + + assert_eq!(m.mean(), 2.0); + } + + #[test] + fn std() { + let m = vec![1., 2., 3.]; + + assert!((m.std() - 0.81f64).abs() < 1e-2); + } +} diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index d3d6353..e34dd91 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -12,6 +12,7 @@ use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; +use crate::linalg::stats::MatrixStats; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix; pub use crate::linalg::{BaseMatrix, 
BaseVector}; @@ -445,6 +446,8 @@ impl LUDecomposableMatrix for DenseMatrix {} impl CholeskyDecomposableMatrix for DenseMatrix {} +impl MatrixStats for DenseMatrix {} + impl Matrix for DenseMatrix {} impl PartialEq for DenseMatrix { diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index e0b885b..ad39057 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -46,6 +46,7 @@ use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; +use crate::linalg::stats::MatrixStats; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix as SmartCoreMatrix; use crate::linalg::{BaseMatrix, BaseVector}; @@ -550,6 +551,11 @@ impl + MatrixStats for Matrix> +{ +} + impl SmartCoreMatrix for Matrix> { diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 00c9745..e8de983 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -53,6 +53,7 @@ use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; +use crate::linalg::stats::MatrixStats; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix; use crate::linalg::{BaseMatrix, BaseVector}; @@ -500,6 +501,11 @@ impl + MatrixStats for ArrayBase, Ix2> +{ +} + impl Matrix for ArrayBase, Ix2> { diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs new file mode 100644 index 0000000..f5db1e9 --- /dev/null +++ b/src/linalg/stats.rs @@ -0,0 +1,139 @@ +//! # Various Statistical Methods +//! +//! + +use crate::linalg::BaseMatrix; +use crate::math::num::RealNumber; + +/// Defines baseline implementations for various statistical functions +pub trait MatrixStats: BaseMatrix { + /// Compute the arithmetic mean along the specified axis. 
+ fn mean(&self, axis: u8) -> Vec { + let (n, m) = match axis { + 0 => { + let (n, m) = self.shape(); + (m, n) + } + _ => self.shape(), + }; + + let mut x: Vec = vec![T::zero(); n]; + + let div = T::from_usize(m).unwrap(); + + for i in 0..n { + for j in 0..m { + x[i] += match axis { + 0 => self.get(j, i), + _ => self.get(i, j), + }; + } + x[i] /= div; + } + + x + } + + /// Compute the standard deviation along the specified axis. + fn std(&self, axis: u8) -> Vec { + let (n, m) = match axis { + 0 => { + let (n, m) = self.shape(); + (m, n) + } + _ => self.shape(), + }; + + let mut x: Vec = vec![T::zero(); n]; + + let div = T::from_usize(m).unwrap(); + + for i in 0..n { + let mut mu = T::zero(); + let mut sum = T::zero(); + for j in 0..m { + let a = match axis { + 0 => self.get(j, i), + _ => self.get(i, j), + }; + mu += a; + sum += a * a; + } + mu /= div; + x[i] = (sum / div - mu * mu).sqrt(); + } + + x + } + + /// standardize values by removing the mean and scaling to unit variance + fn scale_mut(&mut self, mean: &Vec, std: &Vec, axis: u8) { + let (n, m) = match axis { + 0 => { + let (n, m) = self.shape(); + (m, n) + } + _ => self.shape(), + }; + + for i in 0..n { + for j in 0..m { + match axis { + 0 => self.set(j, i, (self.get(j, i) - mean[i]) / std[i]), + _ => self.set(i, j, (self.get(i, j) - mean[i]) / std[i]), + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::linalg::BaseVector; + + #[test] + fn mean() { + let m = DenseMatrix::from_2d_array(&[ + &[1., 2., 3., 1., 2.], + &[4., 5., 6., 3., 4.], + &[7., 8., 9., 5., 6.], + ]); + let expected_0 = vec![4., 5., 6., 3., 4.]; + let expected_1 = vec![1.8, 4.4, 7.]; + + assert_eq!(m.mean(0), expected_0); + assert_eq!(m.mean(1), expected_1); + } + + #[test] + fn std() { + let m = DenseMatrix::from_2d_array(&[ + &[1., 2., 3., 1., 2.], + &[4., 5., 6., 3., 4.], + &[7., 8., 9., 5., 6.], + ]); + let expected_0 = vec![2.44, 2.44, 2.44, 1.63, 1.63]; + 
let expected_1 = vec![0.74, 1.01, 1.41]; + + assert!(m.std(0).approximate_eq(&expected_0, 1e-2)); + assert!(m.std(1).approximate_eq(&expected_1, 1e-2)); + } + + #[test] + fn scale() { + let mut m = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); + let expected_0 = DenseMatrix::from_2d_array(&[&[-1., -1., -1.], &[1., 1., 1.]]); + let expected_1 = DenseMatrix::from_2d_array(&[&[-1.22, 0.0, 1.22], &[-1.22, 0.0, 1.22]]); + + { + let mut m = m.clone(); + m.scale_mut(&m.mean(0), &m.std(0), 0); + assert!(m.approximate_eq(&expected_0, std::f32::EPSILON)); + } + + m.scale_mut(&m.mean(1), &m.std(1), 1); + assert!(m.approximate_eq(&expected_1, 1e-2)); + } +} diff --git a/src/linear/mod.rs b/src/linear/mod.rs index 54bbca0..fef7070 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -22,3 +22,4 @@ pub mod linear_regression; pub mod logistic_regression; +pub mod ridge_regression; diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs new file mode 100644 index 0000000..18df6cb --- /dev/null +++ b/src/linear/ridge_regression.rs @@ -0,0 +1,323 @@ +//! # Ridge Regression +//! +//! [Linear regression](../linear_regression/index.html) is the standard algorithm for predicting a quantitative response \\(y\\) on the basis of a linear combination of explanatory variables \\(X\\) +//! that assumes that there is approximately a linear relationship between \\(X\\) and \\(y\\). +//! Ridge regression is an extension to linear regression that adds L2 regularization term to the loss function during training. +//! This term encourages simpler models that have smaller coefficient values. +//! +//! In ridge regression coefficients \\(\beta_0, \beta_0, ... \beta_n\\) are are estimated by solving +//! +//! \\[\hat{\beta} = (X^TX + \alpha I)^{-1}X^Ty \\] +//! +//! where \\(\alpha \geq 0\\) is a tuning parameter that controls strength of regularization. 
When \\(\alpha = 0\\) the penalty term has no effect, and ridge regression will produce the least squares estimates. +//! However, as \\(\alpha \rightarrow \infty\\), the impact of the shrinkage penalty grows, and the ridge regression coefficient estimates will approach zero. +//! +//! SmartCore uses [SVD](../../linalg/svd/index.html) and [Cholesky](../../linalg/cholesky/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). +//! The Cholesky decomposition is more computationally efficient and more numerically stable than calculating the normal equation directly, +//! but does not work for all data matrices. Unlike the Cholesky decomposition, all matrices have an SVD decomposition. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::linear::ridge_regression::*; +//! +//! // Longley dataset (https://www.statsmodels.org/stable/datasets/generated/longley.html) +//! let x = DenseMatrix::from_2d_array(&[ +//! &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], +//! &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], +//! &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], +//! &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], +//! &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], +//! &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], +//! &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], +//! &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], +//! &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], +//! &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], +//! &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], +//! &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], +//! &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], +//! &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], +//! &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], +//! &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], +//! ]); +//! +//! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, +//! 
100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; +//! +//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters { +//! solver: RidgeRegressionSolverName::Cholesky, +//! alpha: 0.1, +//! normalize: true +//! }).and_then(|lr| lr.predict(&x)).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 6.2. Shrinkage Methods](http://faculty.marshall.usc.edu/gareth-james/ISL/) +//! * ["Numerical Recipes: The Art of Scientific Computing", Press W.H., Teukolsky S.A., Vetterling W.T, Flannery B.P, 3rd ed., Section 15.4 General Linear Least Squares](http://numerical.recipes/) +//! +//! +//! +use std::fmt::Debug; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +#[derive(Serialize, Deserialize, Debug)] +/// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable. +pub enum RidgeRegressionSolverName { + /// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html) + Cholesky, + /// SVD decomposition, see [SVD](../../linalg/svd/index.html) + SVD, +} + +/// Ridge Regression parameters +#[derive(Serialize, Deserialize, Debug)] +pub struct RidgeRegressionParameters { + /// Solver to use for estimation of regression coefficients. + pub solver: RidgeRegressionSolverName, + /// Controls the strength of the penalty to the loss function. + pub alpha: T, + /// If true the regressors X will be normalized before regression + /// by subtracting the mean and dividing by the standard deviation. 
+ pub normalize: bool, +} + +/// Ridge regression +#[derive(Serialize, Deserialize, Debug)] +pub struct RidgeRegression> { + coefficients: M, + intercept: T, + solver: RidgeRegressionSolverName, +} + +impl Default for RidgeRegressionParameters { + fn default() -> Self { + RidgeRegressionParameters { + solver: RidgeRegressionSolverName::Cholesky, + alpha: T::one(), + normalize: true, + } + } +} + +impl> PartialEq for RidgeRegression { + fn eq(&self, other: &Self) -> bool { + self.coefficients == other.coefficients + && (self.intercept - other.intercept).abs() <= T::epsilon() + } +} + +impl> RidgeRegression { + /// Fits ridge regression to your data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target values + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: RidgeRegressionParameters, + ) -> Result, Failed> { + //w = inv(X^t X + alpha*Id) * X.T y + + let (n, p) = x.shape(); + + if n <= p { + return Err(Failed::fit(&format!( + "Number of rows in X should be >= number of columns in X" + ))); + } + + let y_column = M::from_row_vector(y.clone()).transpose(); + + let (w, b) = if parameters.normalize { + let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; + let x_t = scaled_x.transpose(); + let x_t_y = x_t.matmul(&y_column); + let mut x_t_x = x_t.matmul(&scaled_x); + + for i in 0..p { + x_t_x.add_element_mut(i, i, parameters.alpha); + } + + let mut w = match parameters.solver { + RidgeRegressionSolverName::Cholesky => x_t_x.cholesky_solve_mut(x_t_y)?, + RidgeRegressionSolverName::SVD => x_t_x.svd_solve_mut(x_t_y)?, + }; + + for i in 0..p { + w.set(i, 0, w.get(i, 0) / col_std[i]); + } + + let mut b = T::zero(); + + for i in 0..p { + b += w.get(i, 0) * col_mean[i]; + } + + let b = y.mean() - b; + + (w, b) + } else { + let x_t = x.transpose(); + let x_t_y = x_t.matmul(&y_column); + let mut x_t_x = 
x_t.matmul(x); + + for i in 0..p { + x_t_x.add_element_mut(i, i, parameters.alpha); + } + + let w = match parameters.solver { + RidgeRegressionSolverName::Cholesky => x_t_x.cholesky_solve_mut(x_t_y)?, + RidgeRegressionSolverName::SVD => x_t_x.svd_solve_mut(x_t_y)?, + }; + + (w, T::zero()) + }; + + Ok(RidgeRegression { + intercept: b, + coefficients: w, + solver: parameters.solver, + }) + } + + fn rescale_x(x: &M) -> Result<(M, Vec, Vec), Failed> { + let col_mean = x.mean(0); + let col_std = x.std(0); + + for i in 0..col_std.len() { + if (col_std[i] - T::zero()).abs() < T::epsilon() { + return Err(Failed::fit(&format!( + "Cannot rescale constant column {}", + i + ))); + } + } + + let mut scaled_x = x.clone(); + scaled_x.scale_mut(&col_mean, &col_std, 0); + Ok((scaled_x, col_mean, col_std)) + } + + /// Predict target values from `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn predict(&self, x: &M) -> Result { + let (nrows, _) = x.shape(); + let mut y_hat = x.matmul(&self.coefficients); + y_hat.add_mut(&M::fill(nrows, 1, self.intercept)); + Ok(y_hat.transpose().to_row_vector()) + } + + /// Get estimates regression coefficients + pub fn coefficients(&self) -> M { + self.coefficients.clone() + } + + /// Get estimate of intercept + pub fn intercept(&self) -> T { + self.intercept + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + use crate::metrics::mean_absolute_error; + + #[test] + fn ridge_fit_predict() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 
290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y: Vec = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let y_hat_cholesky = RidgeRegression::fit( + &x, + &y, + RidgeRegressionParameters { + solver: RidgeRegressionSolverName::Cholesky, + alpha: 0.1, + normalize: true, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat_cholesky, &y) < 2.0); + + let y_hat_svd = RidgeRegression::fit( + &x, + &y, + RidgeRegressionParameters { + solver: RidgeRegressionSolverName::SVD, + alpha: 0.1, + normalize: false, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat_svd, &y) < 2.0); + } + + #[test] + fn serde() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 
70.551], + ]); + + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let lr = RidgeRegression::fit(&x, &y, Default::default()).unwrap(); + + let deserialized_lr: RidgeRegression> = + serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); + + assert_eq!(lr, deserialized_lr); + } +} From 83048dbe9457ced9e71309682d1b7db6437dd990 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 6 Nov 2020 11:20:43 -0800 Subject: [PATCH 19/79] fix: small doc changes --- src/linalg/mod.rs | 4 ++-- src/linalg/stats.rs | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 42ed558..fe3e197 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -166,7 +166,7 @@ pub trait BaseVector: Clone + Debug { /// ``` fn unique(&self) -> Vec; - /// Compute the arithmetic mean. + /// Computes the arithmetic mean. fn mean(&self) -> T { let n = self.len(); let mut mean = T::zero(); @@ -176,7 +176,7 @@ pub trait BaseVector: Clone + Debug { } mean / T::from_usize(n).unwrap() } - /// Compute the standard deviation. + /// Computes the standard deviation. fn std(&self) -> T { let n = self.len(); diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index f5db1e9..ecb7ceb 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -1,13 +1,14 @@ //! # Various Statistical Methods //! -//! +//! This module provides reference implementations for various statistical functions. +//! Concrete implementations of the `BaseMatrix` trait are free to override these methods for better performance. use crate::linalg::BaseMatrix; use crate::math::num::RealNumber; /// Defines baseline implementations for various statistical functions pub trait MatrixStats: BaseMatrix { - /// Compute the arithmetic mean along the specified axis. + /// Computes the arithmetic mean along the specified axis. 
fn mean(&self, axis: u8) -> Vec { let (n, m) = match axis { 0 => { @@ -34,7 +35,7 @@ pub trait MatrixStats: BaseMatrix { x } - /// Compute the standard deviation along the specified axis. + /// Computes the standard deviation along the specified axis. fn std(&self, axis: u8) -> Vec { let (n, m) = match axis { 0 => { From ba03ef4678345229ad5f04c8986049f6823defcb Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Fri, 6 Nov 2020 19:41:32 -0400 Subject: [PATCH 20/79] Add clippy CI job --- .circleci/config.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4ed3135..dd616af 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,5 +1,11 @@ version: 2.1 +workflows: + version: 2.1 + build: + jobs: + - build + - clippy jobs: build: docker: @@ -24,3 +30,14 @@ jobs: paths: - "~/.cargo" - "./target" + clippy: + docker: + - image: circleci/rust:latest + steps: + - checkout + - run: + name: Install cargo clippy + command: rustup component add clippy + - run: + name: Run cargo clippy + command: cargo clippy From 8281a1620ebbef9e527408f4813319288e69d9fe Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Fri, 6 Nov 2020 20:24:14 -0400 Subject: [PATCH 21/79] Fix clippy errors --- src/algorithm/neighbour/cover_tree.rs | 2 +- src/cluster/kmeans.rs | 2 +- src/dataset/mod.rs | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index da870d2..bbd7254 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -101,7 +101,7 @@ impl> CoverTree /// * `p` - look for k nearest points to `p` /// * `k` - the number of nearest neighbors to return pub fn find(&self, p: &T, k: usize) -> Result, Failed> { - if k <= 0 { + if k == 0 { return Err(Failed::because(FailedError::FindFailed, "k should be > 0")); } diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index eff65aa..278024f 
100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -129,7 +129,7 @@ impl KMeans { return Err(Failed::fit(&format!("invalid number of clusters: {}", k))); } - if parameters.max_iter <= 0 { + if parameters.max_iter == 0 { return Err(Failed::fit(&format!( "invalid maximum number of iterations: {}", parameters.max_iter diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index bfcd1c9..85829fe 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -56,8 +56,8 @@ pub(crate) fn serialize_data( ) -> Result<(), io::Error> { match File::create(filename) { Ok(mut file) => { - file.write(&dataset.num_features.to_le_bytes())?; - file.write(&dataset.num_samples.to_le_bytes())?; + file.write_all(&dataset.num_features.to_le_bytes())?; + file.write_all(&dataset.num_samples.to_le_bytes())?; let x: Vec = dataset .data .iter() From 860056c3bab6a6c8ace5bf45c9938f2a1f8e3bf3 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 19:39:11 -0400 Subject: [PATCH 22/79] Run: cargo clippy --fix -Z unstable-options and cargo fmt --- src/algorithm/neighbour/bbd_tree.rs | 22 +++-- src/algorithm/neighbour/cover_tree.rs | 27 +++--- src/algorithm/neighbour/linear_search.rs | 10 +-- src/algorithm/neighbour/mod.rs | 4 +- src/algorithm/sort/heap_select.rs | 8 +- src/cluster/dbscan.rs | 4 +- src/cluster/kmeans.rs | 18 ++-- src/dataset/boston.rs | 4 +- src/dataset/breast_cancer.rs | 4 +- src/dataset/diabetes.rs | 4 +- src/dataset/digits.rs | 4 +- src/dataset/generator.rs | 6 +- src/dataset/iris.rs | 4 +- src/dataset/mod.rs | 4 +- src/decomposition/pca.rs | 14 +-- src/ensemble/random_forest_classifier.rs | 8 +- src/ensemble/random_forest_regressor.rs | 9 +- src/error/mod.rs | 2 +- src/linalg/cholesky.rs | 14 ++- src/linalg/evd.rs | 90 +++++++++---------- src/linalg/lu.rs | 16 ++-- src/linalg/mod.rs | 2 +- src/linalg/naive/dense_matrix.rs | 80 ++++++++--------- src/linalg/qr.rs | 16 ++-- src/linalg/svd.rs | 40 ++++----- src/linear/linear_regression.rs | 6 +- 
src/linear/logistic_regression.rs | 36 ++++---- src/math/distance/euclidian.rs | 2 +- src/math/distance/mahalanobis.rs | 10 +-- src/math/distance/manhattan.rs | 2 +- src/math/distance/minkowski.rs | 2 +- src/math/distance/mod.rs | 2 +- src/math/num.rs | 20 ++--- src/metrics/auc.rs | 6 +- src/metrics/cluster_helpers.rs | 7 +- src/metrics/mean_absolute_error.rs | 2 +- src/metrics/mean_squared_error.rs | 2 +- src/metrics/mod.rs | 2 +- src/metrics/r2.rs | 8 +- src/model_selection/mod.rs | 8 +- src/neighbors/knn_classifier.rs | 8 +- src/neighbors/knn_regressor.rs | 6 +- .../first_order/gradient_descent.rs | 4 +- src/svm/mod.rs | 15 ++-- src/svm/svc.rs | 62 ++++++------- src/svm/svr.rs | 52 +++++------ src/tree/decision_tree_classifier.rs | 43 +++++---- src/tree/decision_tree_regressor.rs | 43 +++++---- 48 files changed, 367 insertions(+), 395 deletions(-) diff --git a/src/algorithm/neighbour/bbd_tree.rs b/src/algorithm/neighbour/bbd_tree.rs index cc71f54..632da86 100644 --- a/src/algorithm/neighbour/bbd_tree.rs +++ b/src/algorithm/neighbour/bbd_tree.rs @@ -50,8 +50,8 @@ impl BBDTree { } let mut tree = BBDTree { - nodes: nodes, - index: index, + nodes, + index, root: 0, }; @@ -113,7 +113,7 @@ impl BBDTree { } } - if !self.nodes[node].lower.is_none() { + if self.nodes[node].lower.is_some() { let mut new_candidates = vec![0; k]; let mut newk = 0; @@ -152,7 +152,7 @@ impl BBDTree { } for i in 0..d { - sums[closest][i] = sums[closest][i] + self.nodes[node].sum[i]; + sums[closest][i] += self.nodes[node].sum[i]; } counts[closest] += self.nodes[node].count; @@ -184,11 +184,11 @@ impl BBDTree { let mut rhs = T::zero(); for i in 0..d { let diff = test[i] - best[i]; - lhs = lhs + diff * diff; + lhs += diff * diff; if diff > T::zero() { - rhs = rhs + (center[i] + radius[i] - best[i]) * diff; + rhs += (center[i] + radius[i] - best[i]) * diff; } else { - rhs = rhs + (center[i] - radius[i] - best[i]) * diff; + rhs += (center[i] - radius[i] - best[i]) * diff; } } @@ -244,7 +244,7 @@ 
impl BBDTree { if end > begin + 1 { let len = end - begin; for i in 0..d { - node.sum[i] = node.sum[i] * T::from(len).unwrap(); + node.sum[i] *= T::from(len).unwrap(); } } @@ -261,9 +261,7 @@ impl BBDTree { let mut i2_good = data.get(self.index[i2], split_index) >= split_cutoff; if !i1_good && !i2_good { - let temp = self.index[i1]; - self.index[i1] = self.index[i2]; - self.index[i2] = temp; + self.index.swap(i1, i2); i1_good = true; i2_good = true; } @@ -302,7 +300,7 @@ impl BBDTree { let mut scatter = T::zero(); for i in 0..d { let x = (node.sum[i] / T::from(node.count).unwrap()) - center[i]; - scatter = scatter + x * x; + scatter += x * x; } node.cost + T::from(node.count).unwrap() * scatter } diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index bbd7254..e7dbac0 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -51,7 +51,7 @@ impl> PartialEq for CoverTree { return false; } } - return true; + true } } @@ -84,11 +84,11 @@ impl> CoverTree scale: 0, }; let mut tree = CoverTree { - base: base, + base, inv_log_base: F::one() / base.ln(), - distance: distance, - root: root, - data: data, + distance, + root, + data, identical_excluded: false, }; @@ -147,10 +147,11 @@ impl> CoverTree *heap.peek() }; if d <= (upper_bound + child.max_dist) { - if c > 0 && d < upper_bound { - if !self.identical_excluded || self.get_data_value(child.idx) != p { - heap.add(d); - } + if c > 0 + && d < upper_bound + && (!self.identical_excluded || self.get_data_value(child.idx) != p) + { + heap.add(d); } if !child.children.is_empty() { @@ -234,7 +235,7 @@ impl> CoverTree fn new_leaf(&self, idx: usize) -> Node { Node { - idx: idx, + idx, max_dist: F::zero(), parent_dist: F::zero(), children: Vec::new(), @@ -298,7 +299,7 @@ impl> CoverTree idx: p, max_dist: F::zero(), parent_dist: F::zero(), - children: children, + children, scale: 100, } } else { @@ -368,7 +369,7 @@ impl> CoverTree idx: p, max_dist: 
self.max(consumed_set), parent_dist: F::zero(), - children: children, + children, scale: (top_scale - max_scale), } } @@ -442,7 +443,7 @@ impl> CoverTree max = n.dist[n.dist.len() - 1]; } } - return max; + max } } diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index e89a793..d09f2ed 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -44,8 +44,8 @@ impl> LinearKNNSearch { /// * `distance` - distance metric to use for searching. This function should extend [`Distance`](../../../math/distance/index.html) interface. pub fn new(data: Vec, distance: D) -> Result, Failed> { Ok(LinearKNNSearch { - data: data, - distance: distance, + data, + distance, f: PhantomData, }) } @@ -157,7 +157,7 @@ mod tests { .iter() .map(|v| v.0) .collect(); - found_idxs1.sort(); + found_idxs1.sort_unstable(); assert_eq!(vec!(0, 1, 2), found_idxs1); @@ -167,7 +167,7 @@ mod tests { .iter() .map(|v| *v.2) .collect(); - found_idxs1.sort(); + found_idxs1.sort_unstable(); assert_eq!(vec!(2, 3, 4, 5, 6, 7, 8), found_idxs1); @@ -187,7 +187,7 @@ mod tests { .iter() .map(|v| v.0) .collect(); - found_idxs2.sort(); + found_idxs2.sort_unstable(); assert_eq!(vec!(1, 2, 3), found_idxs2); } diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 0a4f21a..7ef1c5c 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -66,10 +66,10 @@ impl KNNAlgorithmName { ) -> Result, Failed> { match *self { KNNAlgorithmName::LinearSearch => { - LinearKNNSearch::new(data, distance).map(|a| KNNAlgorithm::LinearSearch(a)) + LinearKNNSearch::new(data, distance).map(KNNAlgorithm::LinearSearch) } KNNAlgorithmName::CoverTree => { - CoverTree::new(data, distance).map(|a| KNNAlgorithm::CoverTree(a)) + CoverTree::new(data, distance).map(KNNAlgorithm::CoverTree) } } } diff --git a/src/algorithm/sort/heap_select.rs b/src/algorithm/sort/heap_select.rs index ae3ff18..a44b2bb 100644 
--- a/src/algorithm/sort/heap_select.rs +++ b/src/algorithm/sort/heap_select.rs @@ -15,7 +15,7 @@ pub struct HeapSelection { impl<'a, T: PartialOrd + Debug> HeapSelection { pub fn with_capacity(k: usize) -> HeapSelection { HeapSelection { - k: k, + k, n: 0, sorted: false, heap: Vec::new(), @@ -51,7 +51,7 @@ impl<'a, T: PartialOrd + Debug> HeapSelection { pub fn peek(&self) -> &T { if self.sorted { - return &self.heap[0]; + &self.heap[0] } else { &self .heap @@ -62,11 +62,11 @@ impl<'a, T: PartialOrd + Debug> HeapSelection { } pub fn peek_mut(&mut self) -> &mut T { - return &mut self.heap[0]; + &mut self.heap[0] } pub fn get(self) -> Vec { - return self.heap; + self.heap } fn sift_down(&mut self, k: usize, n: usize) { diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 488a7ac..787d8d3 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -93,11 +93,11 @@ impl, T>> DBSCAN { parameters: DBSCANParameters, ) -> Result, Failed> { if parameters.min_samples < 1 { - return Err(Failed::fit(&format!("Invalid minPts"))); + return Err(Failed::fit(&"Invalid minPts".to_string())); } if parameters.eps <= T::zero() { - return Err(Failed::fit(&format!("Invalid radius: "))); + return Err(Failed::fit(&"Invalid radius: ".to_string())); } let mut k = 0; diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 278024f..0da8a72 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -149,13 +149,13 @@ impl KMeans { for i in 0..n { for j in 0..d { - centroids[y[i]][j] = centroids[y[i]][j] + data.get(i, j); + centroids[y[i]][j] += data.get(i, j); } } for i in 0..k { for j in 0..d { - centroids[i][j] = centroids[i][j] / T::from(size[i]).unwrap(); + centroids[i][j] /= T::from(size[i]).unwrap(); } } @@ -178,11 +178,11 @@ impl KMeans { } Ok(KMeans { - k: k, - y: y, - size: size, - distortion: distortion, - centroids: centroids, + k, + y, + size, + distortion, + centroids, }) } @@ -235,13 +235,13 @@ impl KMeans { let mut sum: T = T::zero(); for i in 
d.iter() { - sum = sum + *i; + sum += *i; } let cutoff = T::from(rng.gen::()).unwrap() * sum; let mut cost = T::zero(); let mut index = 0; while index < n { - cost = cost + d[index]; + cost += d[index]; if cost >= cutoff { break; } diff --git a/src/dataset/boston.rs b/src/dataset/boston.rs index 2a0d30e..33f7700 100644 --- a/src/dataset/boston.rs +++ b/src/dataset/boston.rs @@ -38,8 +38,8 @@ pub fn load_dataset() -> Dataset { Dataset { data: x, target: y, - num_samples: num_samples, - num_features: num_features, + num_samples, + num_features, feature_names: vec![ "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", diff --git a/src/dataset/breast_cancer.rs b/src/dataset/breast_cancer.rs index 0a88f31..e469794 100644 --- a/src/dataset/breast_cancer.rs +++ b/src/dataset/breast_cancer.rs @@ -40,8 +40,8 @@ pub fn load_dataset() -> Dataset { Dataset { data: x, target: y, - num_samples: num_samples, - num_features: num_features, + num_samples, + num_features, feature_names: vec![ "mean radius", "mean texture", "mean perimeter", "mean area", "mean smoothness", "mean compactness", "mean concavity", diff --git a/src/dataset/diabetes.rs b/src/dataset/diabetes.rs index 352fd46..2a3e20c 100644 --- a/src/dataset/diabetes.rs +++ b/src/dataset/diabetes.rs @@ -33,8 +33,8 @@ pub fn load_dataset() -> Dataset { Dataset { data: x, target: y, - num_samples: num_samples, - num_features: num_features, + num_samples, + num_features, feature_names: vec![ "Age", "Sex", "BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6", ] diff --git a/src/dataset/digits.rs b/src/dataset/digits.rs index 10068ab..fd643d5 100644 --- a/src/dataset/digits.rs +++ b/src/dataset/digits.rs @@ -23,8 +23,8 @@ pub fn load_dataset() -> Dataset { Dataset { data: x, target: y, - num_samples: num_samples, - num_features: num_features, + num_samples, + num_features, feature_names: vec![ "sepal length (cm)", "sepal width (cm)", diff --git a/src/dataset/generator.rs 
b/src/dataset/generator.rs index fd4f400..2514134 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -39,8 +39,8 @@ pub fn make_blobs( Dataset { data: x, target: y, - num_samples: num_samples, - num_features: num_features, + num_samples, + num_features, feature_names: (0..num_features).map(|n| n.to_string()).collect(), target_names: vec!["label".to_string()], description: "Isotropic Gaussian blobs".to_string(), @@ -82,7 +82,7 @@ pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset Dataset { Dataset { data: x, target: y, - num_samples: num_samples, - num_features: num_features, + num_samples, + num_features, feature_names: vec![ "sepal length (cm)", "sepal width (cm)", diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 85829fe..da790b4 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -61,14 +61,14 @@ pub(crate) fn serialize_data( let x: Vec = dataset .data .iter() - .map(|v| *v) + .copied() .flat_map(|f| f.to_f32_bits().to_le_bytes().to_vec().into_iter()) .collect(); file.write_all(&x)?; let y: Vec = dataset .target .iter() - .map(|v| *v) + .copied() .flat_map(|f| f.to_f32_bits().to_le_bytes().to_vec().into_iter()) .collect(); file.write_all(&y)?; diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index f66ca9b..f25aaad 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -68,14 +68,14 @@ impl> PartialEq for PCA { if self.eigenvectors != other.eigenvectors || self.eigenvalues.len() != other.eigenvalues.len() { - return false; + false } else { for i in 0..self.eigenvalues.len() { if (self.eigenvalues[i] - other.eigenvalues[i]).abs() > T::epsilon() { return false; } } - return true; + true } } } @@ -190,16 +190,16 @@ impl> PCA { let mut pmu = vec![T::zero(); n_components]; for k in 0..n { for i in 0..n_components { - pmu[i] = pmu[i] + projection.get(i, k) * mu[k]; + pmu[i] += projection.get(i, k) * mu[k]; } } Ok(PCA { - eigenvectors: eigenvectors, - eigenvalues: 
eigenvalues, + eigenvectors, + eigenvalues, projection: projection.transpose(), - mu: mu, - pmu: pmu, + mu, + pmu, }) } diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index add6079..0cfebf1 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -89,7 +89,7 @@ pub struct RandomForestClassifier { impl PartialEq for RandomForestClassifier { fn eq(&self, other: &Self) -> bool { if self.classes.len() != other.classes.len() || self.trees.len() != other.trees.len() { - return false; + false } else { for i in 0..self.classes.len() { if (self.classes[i] - other.classes[i]).abs() > T::epsilon() { @@ -164,8 +164,8 @@ impl RandomForestClassifier { } Ok(RandomForestClassifier { - parameters: parameters, - trees: trees, + parameters, + trees, classes, }) } @@ -191,7 +191,7 @@ impl RandomForestClassifier { result[tree.predict_for_row(x, row)] += 1; } - return which_max(&result); + which_max(&result) } fn sample_with_replacement(y: &Vec, num_classes: usize) -> Vec { diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index d25c850..c704a8f 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -95,7 +95,7 @@ impl Default for RandomForestRegressorParameters { impl PartialEq for RandomForestRegressor { fn eq(&self, other: &Self) -> bool { if self.trees.len() != other.trees.len() { - return false; + false } else { for i in 0..self.trees.len() { if self.trees[i] != other.trees[i] { @@ -135,10 +135,7 @@ impl RandomForestRegressor { trees.push(tree); } - Ok(RandomForestRegressor { - parameters: parameters, - trees: trees, - }) + Ok(RandomForestRegressor { parameters, trees }) } /// Predict class for `x` @@ -161,7 +158,7 @@ impl RandomForestRegressor { let mut result = T::zero(); for tree in self.trees.iter() { - result = result + tree.predict_for_row(x, row); + result += tree.predict_for_row(x, row); 
} result / T::from(n_trees).unwrap() diff --git a/src/error/mod.rs b/src/error/mod.rs index c411e87..679f685 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -61,7 +61,7 @@ impl Failed { /// new instance of `err` pub fn because(err: FailedError, msg: &str) -> Self { Failed { - err: err, + err, msg: msg.to_string(), } } diff --git a/src/linalg/cholesky.rs b/src/linalg/cholesky.rs index e55d6bb..724dc8a 100644 --- a/src/linalg/cholesky.rs +++ b/src/linalg/cholesky.rs @@ -46,10 +46,7 @@ pub struct Cholesky> { impl> Cholesky { pub(crate) fn new(R: M) -> Cholesky { - Cholesky { - R: R, - t: PhantomData, - } + Cholesky { R, t: PhantomData } } /// Get lower triangular matrix. @@ -90,7 +87,8 @@ impl> Cholesky { if bn != rn { return Err(Failed::because( FailedError::SolutionFailed, - &format!("Can't solve Ax = b for x. Number of rows in b != number of rows in R."), + &"Can\'t solve Ax = b for x. Number of rows in b != number of rows in R." + .to_string(), )); } @@ -130,7 +128,7 @@ pub trait CholeskyDecomposableMatrix: BaseMatrix { if m != n { return Err(Failed::because( FailedError::DecompositionFailed, - &format!("Can't do Cholesky decomposition on a non-square matrix"), + &"Can\'t do Cholesky decomposition on a non-square matrix".to_string(), )); } @@ -143,14 +141,14 @@ pub trait CholeskyDecomposableMatrix: BaseMatrix { } s = (self.get(j, k) - s) / self.get(k, k); self.set(j, k, s); - d = d + s * s; + d += s * s; } d = self.get(j, j) - d; if d < T::zero() { return Err(Failed::because( FailedError::DecompositionFailed, - &format!("The matrix is not positive definite."), + &"The matrix is not positive definite.".to_string(), )); } diff --git a/src/linalg/evd.rs b/src/linalg/evd.rs index 60602ce..c216696 100644 --- a/src/linalg/evd.rs +++ b/src/linalg/evd.rs @@ -93,7 +93,7 @@ pub trait EVDDecomposableMatrix: BaseMatrix { sort(&mut d, &mut e, &mut V); } - Ok(EVD { V: V, d: d, e: e }) + Ok(EVD { V, d, e }) } } @@ -107,7 +107,7 @@ fn tred2>(V: &mut M, d: &mut Vec, e: 
&mut Vec let mut scale = T::zero(); let mut h = T::zero(); for k in 0..i { - scale = scale + d[k].abs(); + scale += d[k].abs(); } if scale == T::zero() { e[i] = d[i - 1]; @@ -118,8 +118,8 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec } } else { for k in 0..i { - d[k] = d[k] / scale; - h = h + d[k] * d[k]; + d[k] /= scale; + h += d[k] * d[k]; } let mut f = d[i - 1]; let mut g = h.sqrt(); @@ -127,7 +127,7 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec g = -g; } e[i] = scale * g; - h = h - f * g; + h -= f * g; d[i - 1] = f - g; for j in 0..i { e[j] = T::zero(); @@ -138,19 +138,19 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec V.set(j, i, f); g = e[j] + V.get(j, j) * f; for k in j + 1..=i - 1 { - g = g + V.get(k, j) * d[k]; - e[k] = e[k] + V.get(k, j) * f; + g += V.get(k, j) * d[k]; + e[k] += V.get(k, j) * f; } e[j] = g; } f = T::zero(); for j in 0..i { - e[j] = e[j] / h; - f = f + e[j] * d[j]; + e[j] /= h; + f += e[j] * d[j]; } let hh = f / (h + h); for j in 0..i { - e[j] = e[j] - hh * d[j]; + e[j] -= hh * d[j]; } for j in 0..i { f = d[j]; @@ -176,7 +176,7 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec for j in 0..=i { let mut g = T::zero(); for k in 0..=i { - g = g + V.get(k, i + 1) * V.get(k, j); + g += V.get(k, i + 1) * V.get(k, j); } for k in 0..=i { V.sub_element_mut(k, j, g * d[k]); @@ -239,9 +239,9 @@ fn tql2>(V: &mut M, d: &mut Vec, e: &mut Vec< let dl1 = d[l + 1]; let mut h = g - d[l]; for i in l + 2..n { - d[i] = d[i] - h; + d[i] -= h; } - f = f + h; + f += h; p = d[m]; let mut c = T::one(); @@ -278,7 +278,7 @@ fn tql2>(V: &mut M, d: &mut Vec, e: &mut Vec< } } } - d[l] = d[l] + f; + d[l] += f; e[l] = T::zero(); } @@ -321,8 +321,8 @@ fn balance>(A: &mut M) -> Vec { let mut c = T::zero(); for j in 0..n { if j != i { - c = c + A.get(j, i).abs(); - r = r + A.get(i, j).abs(); + c += A.get(j, i).abs(); + r += A.get(i, j).abs(); } } if c != T::zero() && r != T::zero() { @@ -330,18 +330,18 @@ fn balance>(A: &mut M) -> Vec { let mut f = T::one(); let s = c + 
r; while c < g { - f = f * radix; - c = c * sqrdx; + f *= radix; + c *= sqrdx; } g = r * radix; while c > g { - f = f / radix; - c = c / sqrdx; + f /= radix; + c /= sqrdx; } if (c + r) / f < t * s { done = false; g = T::one() / f; - scale[i] = scale[i] * f; + scale[i] *= f; for j in 0..n { A.mul_element_mut(i, j, g); } @@ -353,7 +353,7 @@ fn balance>(A: &mut M) -> Vec { } } - return scale; + scale } fn elmhes>(A: &mut M) -> Vec { @@ -386,7 +386,7 @@ fn elmhes>(A: &mut M) -> Vec { for i in (m + 1)..n { let mut y = A.get(i, m - 1); if y != T::zero() { - y = y / x; + y /= x; A.set(i, m - 1, y); for j in m..n { A.sub_element_mut(i, j, y * A.get(m, j)); @@ -399,7 +399,7 @@ fn elmhes>(A: &mut M) -> Vec { } } - return perm; + perm } fn eltran>(A: &M, V: &mut M, perm: &Vec) { @@ -430,7 +430,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e for i in 0..n { for j in i32::max(i as i32 - 1, 0)..n as i32 { - anorm = anorm + A.get(i, j as usize).abs(); + anorm += A.get(i, j as usize).abs(); } } @@ -467,7 +467,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e p = T::half() * (y - x); q = p * p + w; z = q.abs().sqrt(); - x = x + t; + x += t; A.set(nn, nn, x); A.set(nn - 1, nn - 1, y + t); if q >= T::zero() { @@ -482,8 +482,8 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e p = x / s; q = z / s; r = (p * p + q * q).sqrt(); - p = p / r; - q = q / r; + p /= r; + q /= r; for j in nn - 1..n { z = A.get(nn - 1, j); A.set(nn - 1, j, q * z + p * A.get(nn, j)); @@ -516,7 +516,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e panic!("Too many iterations in hqr"); } if its == 10 || its == 20 { - t = t + x; + t += x; for i in 0..nn + 1 { A.sub_element_mut(i, i, x); } @@ -535,9 +535,9 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e q = A.get(m + 1, m + 1) - z - r - s; r = A.get(m + 2, m + 1); s = p.abs() + q.abs() + r.abs(); - p = p / s; - q = q / s; - r = r / s; + p /= s; + q /= s; + r /= s; if m == l { break; } @@ -565,9 +565,9 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e } x = 
p.abs() + q.abs() + r.abs(); if x != T::zero() { - p = p / x; - q = q / x; - r = r / x; + p /= x; + q /= x; + r /= x; } } let s = (p * p + q * q + r * r).sqrt().copysign(p); @@ -579,16 +579,16 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e } else { A.set(k, k - 1, -s * x); } - p = p + s; + p += s; x = p / s; y = q / s; z = r / s; - q = q / p; - r = r / p; + q /= p; + r /= p; for j in k..n { p = A.get(k, j) + q * A.get(k + 1, j); if k + 1 != nn { - p = p + r * A.get(k + 2, j); + p += r * A.get(k + 2, j); A.sub_element_mut(k + 2, j, p * z); } A.sub_element_mut(k + 1, j, p * y); @@ -603,7 +603,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e for i in 0..mmin + 1 { p = x * A.get(i, k) + y * A.get(i, k + 1); if k + 1 != nn { - p = p + z * A.get(i, k + 2); + p += z * A.get(i, k + 2); A.sub_element_mut(i, k + 2, p * r); } A.sub_element_mut(i, k + 1, p * q); @@ -612,7 +612,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e for i in 0..n { p = x * V.get(i, k) + y * V.get(i, k + 1); if k + 1 != nn { - p = p + z * V.get(i, k + 2); + p += z * V.get(i, k + 2); V.sub_element_mut(i, k + 2, p * r); } V.sub_element_mut(i, k + 1, p * q); @@ -642,7 +642,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e let w = A.get(i, i) - p; r = T::zero(); for j in m..=nn { - r = r + A.get(i, j) * A.get(j, nn); + r += A.get(i, j) * A.get(j, nn); } if e[i] < T::zero() { z = w; @@ -701,8 +701,8 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e let mut ra = T::zero(); let mut sa = T::zero(); for j in m..=nn { - ra = ra + A.get(i, j) * A.get(j, na); - sa = sa + A.get(i, j) * A.get(j, nn); + ra += A.get(i, j) * A.get(j, na); + sa += A.get(i, j) * A.get(j, nn); } if e[i] < T::zero() { z = w; @@ -766,7 +766,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e for i in 0..n { z = T::zero(); for k in 0..=j { - z = z + V.get(i, k) * A.get(k, j); + z += V.get(i, k) * A.get(k, j); } V.set(i, j, z); } diff --git a/src/linalg/lu.rs b/src/linalg/lu.rs index a4cc58d..cbe195f 100644 --- a/src/linalg/lu.rs +++ 
b/src/linalg/lu.rs @@ -63,10 +63,10 @@ impl> LU { } LU { - LU: LU, - pivot: pivot, - pivot_sign: pivot_sign, - singular: singular, + LU, + pivot, + pivot_sign, + singular, phantom: PhantomData, } } @@ -220,10 +220,10 @@ pub trait LUDecomposableMatrix: BaseMatrix { let kmax = usize::min(i, j); let mut s = T::zero(); for k in 0..kmax { - s = s + self.get(i, k) * LUcolj[k]; + s += self.get(i, k) * LUcolj[k]; } - LUcolj[i] = LUcolj[i] - s; + LUcolj[i] -= s; self.set(i, j, LUcolj[i]); } @@ -239,9 +239,7 @@ pub trait LUDecomposableMatrix: BaseMatrix { self.set(p, k, self.get(j, k)); self.set(j, k, t); } - let k = piv[p]; - piv[p] = piv[j]; - piv[j] = k; + piv.swap(p, j); pivsign = -pivsign; } diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index fb12909..09a9687 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -517,7 +517,7 @@ pub trait Matrix: pub(crate) fn row_iter>(m: &M) -> RowIter { RowIter { - m: m, + m, pos: 0, max_pos: m.shape().0, phantom: PhantomData, diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index d3d6353..c1ba650 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -53,7 +53,7 @@ impl BaseVector for Vec { let mut result = T::zero(); for i in 0..self.len() { - result = result + self[i] * other[i]; + result += self[i] * other[i]; } result @@ -63,7 +63,7 @@ impl BaseVector for Vec { let mut norm = T::zero(); for xi in self.iter() { - norm = norm + *xi * *xi; + norm += *xi * *xi; } norm.sqrt() @@ -82,7 +82,7 @@ impl BaseVector for Vec { let mut norm = T::zero(); for xi in self.iter() { - norm = norm + xi.abs().powf(p); + norm += xi.abs().powf(p); } norm.powf(T::one() / p) @@ -90,19 +90,19 @@ impl BaseVector for Vec { } fn div_element_mut(&mut self, pos: usize, x: T) { - self[pos] = self[pos] / x; + self[pos] /= x; } fn mul_element_mut(&mut self, pos: usize, x: T) { - self[pos] = self[pos] * x; + self[pos] *= x; } fn add_element_mut(&mut self, pos: usize, x: T) { - self[pos] = 
self[pos] + x + self[pos] += x } fn sub_element_mut(&mut self, pos: usize, x: T) { - self[pos] = self[pos] - x; + self[pos] -= x; } fn add_mut(&mut self, other: &Self) -> &Self { @@ -165,7 +165,7 @@ impl BaseVector for Vec { fn sum(&self) -> T { let mut sum = T::zero(); for i in 0..self.len() { - sum = sum + self[i]; + sum += self[i]; } sum } @@ -216,15 +216,15 @@ impl DenseMatrix { /// `values` should be in column-major order. pub fn new(nrows: usize, ncols: usize, values: Vec) -> Self { DenseMatrix { - ncols: ncols, - nrows: nrows, - values: values, + ncols, + nrows, + values, } } /// New instance of `DenseMatrix` from 2d array. pub fn from_2d_array(values: &[&[T]]) -> Self { - DenseMatrix::from_2d_vec(&values.into_iter().map(|row| Vec::from(*row)).collect()) + DenseMatrix::from_2d_vec(&values.iter().map(|row| Vec::from(*row)).collect()) } /// New instance of `DenseMatrix` from 2d vector. @@ -235,8 +235,8 @@ impl DenseMatrix { .unwrap_or_else(|| panic!("Cannot create 2d matrix from an empty vector")) .len(); let mut m = DenseMatrix { - ncols: ncols, - nrows: nrows, + ncols, + nrows, values: vec![T::zero(); ncols * nrows], }; for row in 0..nrows { @@ -261,8 +261,8 @@ impl DenseMatrix { /// * `values` - values to initialize the matrix. 
pub fn from_vec(nrows: usize, ncols: usize, values: &Vec) -> DenseMatrix { let mut m = DenseMatrix { - ncols: ncols, - nrows: nrows, + ncols, + nrows, values: vec![T::zero(); ncols * nrows], }; for row in 0..nrows { @@ -285,7 +285,7 @@ impl DenseMatrix { DenseMatrix { ncols: values.len(), nrows: 1, - values: values, + values, } } @@ -301,7 +301,7 @@ impl DenseMatrix { DenseMatrix { ncols: 1, nrows: values.len(), - values: values, + values, } } @@ -412,7 +412,7 @@ impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for De } } - const FIELDS: &'static [&'static str] = &["nrows", "ncols", "values"]; + const FIELDS: &[&str] = &["nrows", "ncols", "values"]; deserializer.deserialize_struct( "DenseMatrix", FIELDS, @@ -562,7 +562,7 @@ impl BaseMatrix for DenseMatrix { matrix.set(i, i, T::one()); } - return matrix; + matrix } fn shape(&self) -> (usize, usize) { @@ -614,7 +614,7 @@ impl BaseMatrix for DenseMatrix { for c in 0..other.ncols { let mut s = T::zero(); for i in 0..inner_d { - s = s + self.get(r, i) * other.get(i, c); + s += self.get(r, i) * other.get(i, c); } result.set(r, c, s); } @@ -633,7 +633,7 @@ impl BaseMatrix for DenseMatrix { let mut result = T::zero(); for i in 0..(self.nrows * self.ncols) { - result = result + self.values[i] * other.values[i]; + result += self.values[i] * other.values[i]; } result @@ -727,19 +727,19 @@ impl BaseMatrix for DenseMatrix { } fn div_element_mut(&mut self, row: usize, col: usize, x: T) { - self.values[col * self.nrows + row] = self.values[col * self.nrows + row] / x; + self.values[col * self.nrows + row] /= x; } fn mul_element_mut(&mut self, row: usize, col: usize, x: T) { - self.values[col * self.nrows + row] = self.values[col * self.nrows + row] * x; + self.values[col * self.nrows + row] *= x; } fn add_element_mut(&mut self, row: usize, col: usize, x: T) { - self.values[col * self.nrows + row] = self.values[col * self.nrows + row] + x + self.values[col * self.nrows + row] += x } fn sub_element_mut(&mut 
self, row: usize, col: usize, x: T) { - self.values[col * self.nrows + row] = self.values[col * self.nrows + row] - x; + self.values[col * self.nrows + row] -= x; } fn transpose(&self) -> Self { @@ -759,9 +759,9 @@ impl BaseMatrix for DenseMatrix { fn rand(nrows: usize, ncols: usize) -> Self { let values: Vec = (0..nrows * ncols).map(|_| T::rand()).collect(); DenseMatrix { - ncols: ncols, - nrows: nrows, - values: values, + ncols, + nrows, + values, } } @@ -769,7 +769,7 @@ impl BaseMatrix for DenseMatrix { let mut norm = T::zero(); for xi in self.values.iter() { - norm = norm + *xi * *xi; + norm += *xi * *xi; } norm.sqrt() @@ -790,7 +790,7 @@ impl BaseMatrix for DenseMatrix { let mut norm = T::zero(); for xi in self.values.iter() { - norm = norm + xi.abs().powf(p); + norm += xi.abs().powf(p); } norm.powf(T::one() / p) @@ -802,12 +802,12 @@ impl BaseMatrix for DenseMatrix { for r in 0..self.nrows { for c in 0..self.ncols { - mean[c] = mean[c] + self.get(r, c); + mean[c] += self.get(r, c); } } for i in 0..mean.len() { - mean[i] = mean[i] / T::from(self.nrows).unwrap(); + mean[i] /= T::from(self.nrows).unwrap(); } mean @@ -815,28 +815,28 @@ impl BaseMatrix for DenseMatrix { fn add_scalar_mut(&mut self, scalar: T) -> &Self { for i in 0..self.values.len() { - self.values[i] = self.values[i] + scalar; + self.values[i] += scalar; } self } fn sub_scalar_mut(&mut self, scalar: T) -> &Self { for i in 0..self.values.len() { - self.values[i] = self.values[i] - scalar; + self.values[i] -= scalar; } self } fn mul_scalar_mut(&mut self, scalar: T) -> &Self { for i in 0..self.values.len() { - self.values[i] = self.values[i] * scalar; + self.values[i] *= scalar; } self } fn div_scalar_mut(&mut self, scalar: T) -> &Self { for i in 0..self.values.len() { - self.values[i] = self.values[i] / scalar; + self.values[i] /= scalar; } self } @@ -902,7 +902,7 @@ impl BaseMatrix for DenseMatrix { fn sum(&self) -> T { let mut sum = T::zero(); for i in 0..self.values.len() { - sum = sum + 
self.values[i]; + sum += self.values[i]; } sum } @@ -934,7 +934,7 @@ impl BaseMatrix for DenseMatrix { for c in 0..self.ncols { let p = (self.get(r, c) - max).exp(); self.set(r, c, p); - z = z + p; + z += p; } } for r in 0..self.nrows { @@ -1058,7 +1058,7 @@ mod tests { DenseMatrix::new(1, 3, vec![1., 2., 3.]) ); assert_eq!( - DenseMatrix::from_row_vector(vec.clone()).to_row_vector(), + DenseMatrix::from_row_vector(vec).to_row_vector(), vec![1., 2., 3.] ); } diff --git a/src/linalg/qr.rs b/src/linalg/qr.rs index e0e5860..c3a7978 100644 --- a/src/linalg/qr.rs +++ b/src/linalg/qr.rs @@ -51,11 +51,7 @@ impl> QR { } } - QR { - QR: QR, - tau: tau, - singular: singular, - } + QR { QR, tau, singular } } /// Get upper triangular matrix. @@ -68,7 +64,7 @@ impl> QR { R.set(i, j, self.QR.get(i, j)); } } - return R; + R } /// Get an orthogonal matrix. @@ -82,7 +78,7 @@ impl> QR { if self.QR.get(k, k) != T::zero() { let mut s = T::zero(); for i in k..m { - s = s + self.QR.get(i, k) * Q.get(i, j); + s += self.QR.get(i, k) * Q.get(i, j); } s = -s / self.QR.get(k, k); for i in k..m { @@ -96,7 +92,7 @@ impl> QR { k -= 1; } } - return Q; + Q } fn solve(&self, mut b: M) -> Result { @@ -118,7 +114,7 @@ impl> QR { for j in 0..b_ncols { let mut s = T::zero(); for i in k..m { - s = s + self.QR.get(i, k) * b.get(i, j); + s += self.QR.get(i, k) * b.get(i, j); } s = -s / self.QR.get(k, k); for i in k..m { @@ -175,7 +171,7 @@ pub trait QRDecomposableMatrix: BaseMatrix { for j in k + 1..n { let mut s = T::zero(); for i in k..m { - s = s + self.get(i, k) * self.get(i, j); + s += self.get(i, k) * self.get(i, j); } s = -s / self.get(k, k); for i in k..m { diff --git a/src/linalg/svd.rs b/src/linalg/svd.rs index 8866ba9..9271f5b 100644 --- a/src/linalg/svd.rs +++ b/src/linalg/svd.rs @@ -106,13 +106,13 @@ pub trait SVDDecomposableMatrix: BaseMatrix { if i < m { for k in i..m { - scale = scale + U.get(k, i).abs(); + scale += U.get(k, i).abs(); } if scale.abs() > T::epsilon() { for k in i..m { 
U.div_element_mut(k, i, scale); - s = s + U.get(k, i) * U.get(k, i); + s += U.get(k, i) * U.get(k, i); } let mut f = U.get(i, i); @@ -122,7 +122,7 @@ pub trait SVDDecomposableMatrix: BaseMatrix { for j in l - 1..n { s = T::zero(); for k in i..m { - s = s + U.get(k, i) * U.get(k, j); + s += U.get(k, i) * U.get(k, j); } f = s / h; for k in i..m { @@ -140,15 +140,15 @@ pub trait SVDDecomposableMatrix: BaseMatrix { let mut s = T::zero(); scale = T::zero(); - if i + 1 <= m && i + 1 != n { + if i < m && i + 1 != n { for k in l - 1..n { - scale = scale + U.get(i, k).abs(); + scale += U.get(i, k).abs(); } if scale.abs() > T::epsilon() { for k in l - 1..n { U.div_element_mut(i, k, scale); - s = s + U.get(i, k) * U.get(i, k); + s += U.get(i, k) * U.get(i, k); } let f = U.get(i, l - 1); @@ -163,7 +163,7 @@ pub trait SVDDecomposableMatrix: BaseMatrix { for j in l - 1..m { s = T::zero(); for k in l - 1..n { - s = s + U.get(j, k) * U.get(i, k); + s += U.get(j, k) * U.get(i, k); } for k in l - 1..n { @@ -189,7 +189,7 @@ pub trait SVDDecomposableMatrix: BaseMatrix { for j in l..n { let mut s = T::zero(); for k in l..n { - s = s + U.get(i, k) * v.get(k, j); + s += U.get(i, k) * v.get(k, j); } for k in l..n { v.add_element_mut(k, j, s * v.get(k, i)); @@ -218,7 +218,7 @@ pub trait SVDDecomposableMatrix: BaseMatrix { for j in l..n { let mut s = T::zero(); for k in l..m { - s = s + U.get(k, i) * U.get(k, j); + s += U.get(k, i) * U.get(k, j); } let f = (s / U.get(i, i)) * g; for k in i..m { @@ -316,7 +316,7 @@ pub trait SVDDecomposableMatrix: BaseMatrix { f = x * c + g * s; g = g * c - x * s; h = y * s; - y = y * c; + y *= c; for jj in 0..n { x = v.get(jj, j); @@ -431,13 +431,13 @@ impl> SVD { let full = s.len() == m.min(n); let tol = T::half() * (T::from(m + n).unwrap() + T::one()).sqrt() * s[0] * T::epsilon(); SVD { - U: U, - V: V, - s: s, - full: full, - m: m, - n: n, - tol: tol, + U, + V, + s, + full, + m, + n, + tol, } } @@ -458,9 +458,9 @@ impl> SVD { let mut r = T::zero(); if 
self.s[j] > self.tol { for i in 0..self.m { - r = r + self.U.get(i, j) * b.get(i, k); + r += self.U.get(i, j) * b.get(i, k); } - r = r / self.s[j]; + r /= self.s[j]; } tmp[j] = r; } @@ -468,7 +468,7 @@ impl> SVD { for j in 0..self.n { let mut r = T::zero(); for jj in 0..self.n { - r = r + self.V.get(j, jj) * tmp[jj]; + r += self.V.get(j, jj) * tmp[jj]; } b.set(j, k, r); } diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 61bb678..d8ff1ff 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -123,9 +123,9 @@ impl> LinearRegression { let (y_nrows, _) = b.shape(); if x_nrows != y_nrows { - return Err(Failed::fit(&format!( - "Number of rows of X doesn't match number of rows of Y" - ))); + return Err(Failed::fit( + &"Number of rows of X doesn\'t match number of rows of Y".to_string(), + )); } let a = x.h_stack(&M::ones(x_nrows, 1)); diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index ec09184..ec90af1 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -82,7 +82,7 @@ trait ObjectiveFunction> { let mut sum = T::zero(); let p = x.shape().1; for i in 0..p { - sum = sum + x.get(m_row, i) * w.get(0, i + v_col); + sum += x.get(m_row, i) * w.get(0, i + v_col); } sum + w.get(0, p + v_col) @@ -101,7 +101,7 @@ impl> PartialEq for LogisticRegression { || self.num_attributes != other.num_attributes || self.classes.len() != other.classes.len() { - return false; + false } else { for i in 0..self.classes.len() { if (self.classes[i] - other.classes[i]).abs() > T::epsilon() { @@ -109,7 +109,7 @@ impl> PartialEq for LogisticRegression { } } - return self.weights == other.weights; + self.weights == other.weights } } } @@ -123,7 +123,7 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction for i in 0..n { let wx = BinaryObjectiveFunction::partial_dot(w_bias, self.x, 0, i); - f = f + (wx.ln_1pe() - (T::from(self.y[i]).unwrap()) * wx); + f += 
wx.ln_1pe() - (T::from(self.y[i]).unwrap()) * wx; } f @@ -169,7 +169,7 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction ); } prob.softmax_mut(); - f = f - prob.get(0, self.y[i]).ln(); + f -= prob.get(0, self.y[i]).ln(); } f @@ -215,9 +215,9 @@ impl> LogisticRegression { let (_, y_nrows) = y_m.shape(); if x_nrows != y_nrows { - return Err(Failed::fit(&format!( - "Number of rows of X doesn't match number of rows of Y" - ))); + return Err(Failed::fit( + &"Number of rows of X doesn\'t match number of rows of Y".to_string(), + )); } let classes = y_m.unique(); @@ -240,7 +240,7 @@ impl> LogisticRegression { let x0 = M::zeros(1, num_attributes + 1); let objective = BinaryObjectiveFunction { - x: x, + x, y: yi, phantom: PhantomData, }; @@ -249,17 +249,17 @@ impl> LogisticRegression { Ok(LogisticRegression { weights: result.x, - classes: classes, - num_attributes: num_attributes, + classes, + num_attributes, num_classes: k, }) } else { let x0 = M::zeros(1, (num_attributes + 1) * k); let objective = MultiClassObjectiveFunction { - x: x, + x, y: yi, - k: k, + k, phantom: PhantomData, }; @@ -268,9 +268,9 @@ impl> LogisticRegression { let weights = result.x.reshape(k, num_attributes + 1); Ok(LogisticRegression { - weights: weights, - classes: classes, - num_attributes: num_attributes, + weights, + classes, + num_attributes, num_classes: k, }) } @@ -362,7 +362,7 @@ mod tests { let objective = MultiClassObjectiveFunction { x: &x, - y: y, + y, k: 3, phantom: PhantomData, }; @@ -411,7 +411,7 @@ mod tests { let objective = BinaryObjectiveFunction { x: &x, - y: y, + y, phantom: PhantomData, }; diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 4ec0ad0..31503bd 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -38,7 +38,7 @@ impl Euclidian { let mut sum = T::zero(); for i in 0..x.len() { let d = x[i] - y[i]; - sum = sum + d * d; + sum += d * d; } sum diff --git a/src/math/distance/mahalanobis.rs 
b/src/math/distance/mahalanobis.rs index 6c205e5..fd320c3 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -68,8 +68,8 @@ impl> Mahalanobis { let sigma = data.cov(); let sigmaInv = sigma.lu().and_then(|lu| lu.inverse()).unwrap(); Mahalanobis { - sigma: sigma, - sigmaInv: sigmaInv, + sigma, + sigmaInv, t: PhantomData, } } @@ -80,8 +80,8 @@ impl> Mahalanobis { let sigma = cov.clone(); let sigmaInv = sigma.lu().and_then(|lu| lu.inverse()).unwrap(); Mahalanobis { - sigma: sigma, - sigmaInv: sigmaInv, + sigma, + sigmaInv, t: PhantomData, } } @@ -118,7 +118,7 @@ impl> Distance, T> for Mahalanobis { let mut s = T::zero(); for j in 0..n { for i in 0..n { - s = s + self.sigmaInv.get(i, j) * z[i] * z[j]; + s += self.sigmaInv.get(i, j) * z[i] * z[j]; } } diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 9b46a0c..66125a5 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -35,7 +35,7 @@ impl Distance, T> for Manhattan { let mut dist = T::zero(); for i in 0..x.len() { - dist = dist + (x[i] - y[i]).abs(); + dist += (x[i] - y[i]).abs(); } dist diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index 667e0db..b7c5691 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -48,7 +48,7 @@ impl Distance, T> for Minkowski { for i in 0..x.len() { let d = (x[i] - y[i]).abs(); - dist = dist + d.powf(p_t); + dist += d.powf(p_t); } dist.powf(T::one() / p_t) diff --git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 0532e86..1219ec6 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -45,7 +45,7 @@ impl Distances { /// Minkowski distance, see [`Minkowski`](minkowski/index.html) /// * `p` - function order. 
Should be >= 1 pub fn minkowski(p: u16) -> minkowski::Minkowski { - minkowski::Minkowski { p: p } + minkowski::Minkowski { p } } /// Manhattan distance, see [`Manhattan`](manhattan/index.html) diff --git a/src/math/num.rs b/src/math/num.rs index 894e5a3..490623c 100644 --- a/src/math/num.rs +++ b/src/math/num.rs @@ -57,19 +57,19 @@ impl RealNumber for f64 { fn ln_1pe(self) -> f64 { if self > 15. { - return self; + self } else { - return self.exp().ln_1p(); + self.exp().ln_1p() } } fn sigmoid(self) -> f64 { if self < -40. { - return 0.; + 0. } else if self > 40. { - return 1.; + 1. } else { - return 1. / (1. + f64::exp(-self)); + 1. / (1. + f64::exp(-self)) } } @@ -98,19 +98,19 @@ impl RealNumber for f32 { fn ln_1pe(self) -> f32 { if self > 15. { - return self; + self } else { - return self.exp().ln_1p(); + self.exp().ln_1p() } } fn sigmoid(self) -> f32 { if self < -40. { - return 0.; + 0. } else if self > 40. { - return 1.; + 1. } else { - return 1. / (1. + f32::exp(-self)); + 1. / (1. + f32::exp(-self)) } } diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index 99e6cbd..571dd49 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -42,9 +42,9 @@ impl AUC { for i in 0..n { if y_true.get(i) == T::zero() { - neg = neg + T::one(); + neg += T::one(); } else if y_true.get(i) == T::one() { - pos = pos + T::one(); + pos += T::one(); } else { panic!( "AUC is only for binary classification. 
Invalid label: {}", @@ -79,7 +79,7 @@ impl AUC { let mut auc = T::zero(); for i in 0..n { if y_true.get(label_idx[i]) == T::one() { - auc = auc + rank[i]; + auc += rank[i]; } } diff --git a/src/metrics/cluster_helpers.rs b/src/metrics/cluster_helpers.rs index 76cd643..dd5bbb3 100644 --- a/src/metrics/cluster_helpers.rs +++ b/src/metrics/cluster_helpers.rs @@ -37,7 +37,7 @@ pub fn entropy(data: &Vec) -> Option { for &c in bincounts.values() { if c > 0 { let pi = T::from_usize(c).unwrap(); - entropy = entropy - (pi / sum) * (pi.ln() - sum.ln()); + entropy -= (pi / sum) * (pi.ln() - sum.ln()); } } @@ -89,9 +89,8 @@ pub fn mutual_info_score(contingency: &Vec>) -> T { let mut result = T::zero(); for i in 0..log_outer.len() { - result = result - + ((contingency_nm[i] * (log_contingency_nm[i] - contingency_sum_ln)) - + contingency_nm[i] * log_outer[i]) + result += (contingency_nm[i] * (log_contingency_nm[i] - contingency_sum_ln)) + + contingency_nm[i] * log_outer[i] } result.max(T::zero()) diff --git a/src/metrics/mean_absolute_error.rs b/src/metrics/mean_absolute_error.rs index 3e5099e..a069335 100644 --- a/src/metrics/mean_absolute_error.rs +++ b/src/metrics/mean_absolute_error.rs @@ -43,7 +43,7 @@ impl MeanAbsoluteError { let n = y_true.len(); let mut ras = T::zero(); for i in 0..n { - ras = ras + (y_true.get(i) - y_pred.get(i)).abs(); + ras += (y_true.get(i) - y_pred.get(i)).abs(); } ras / T::from_usize(n).unwrap() diff --git a/src/metrics/mean_squared_error.rs b/src/metrics/mean_squared_error.rs index 816cc70..137c8e6 100644 --- a/src/metrics/mean_squared_error.rs +++ b/src/metrics/mean_squared_error.rs @@ -43,7 +43,7 @@ impl MeanSquareError { let n = y_true.len(); let mut rss = T::zero(); for i in 0..n { - rss = rss + (y_true.get(i) - y_pred.get(i)).square(); + rss += (y_true.get(i) - y_pred.get(i)).square(); } rss / T::from_usize(n).unwrap() diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 4fe199b..f49300d 100644 --- a/src/metrics/mod.rs +++ 
b/src/metrics/mod.rs @@ -101,7 +101,7 @@ impl ClassificationMetrics { /// F1 score, also known as balanced F-score or F-measure, see [F1](f1/index.html). pub fn f1(beta: T) -> f1::F1 { - f1::F1 { beta: beta } + f1::F1 { beta } } /// Area Under the Receiver Operating Characteristic Curve (ROC AUC), see [AUC](auc/index.html). diff --git a/src/metrics/r2.rs b/src/metrics/r2.rs index e689c6f..cbcf7e4 100644 --- a/src/metrics/r2.rs +++ b/src/metrics/r2.rs @@ -45,10 +45,10 @@ impl R2 { let mut mean = T::zero(); for i in 0..n { - mean = mean + y_true.get(i); + mean += y_true.get(i); } - mean = mean / T::from_usize(n).unwrap(); + mean /= T::from_usize(n).unwrap(); let mut ss_tot = T::zero(); let mut ss_res = T::zero(); @@ -56,8 +56,8 @@ impl R2 { for i in 0..n { let y_i = y_true.get(i); let f_i = y_pred.get(i); - ss_tot = ss_tot + (y_i - mean).square(); - ss_res = ss_res + (y_i - f_i).square(); + ss_tot += (y_i - mean).square(); + ss_res += (y_i - f_i).square(); } T::one() - (ss_res / ss_tot) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 49938cf..c53451d 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -127,7 +127,7 @@ impl BaseKFold for KFold { // initialise indices let mut indices: Vec = (0..n_samples).collect(); - if self.shuffle == true { + if self.shuffle { indices.shuffle(&mut thread_rng()); } // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. 
@@ -135,7 +135,7 @@ impl BaseKFold for KFold { // increment by one if odd for i in 0..(n_samples % self.n_splits) { - fold_sizes[i] = fold_sizes[i] + 1; + fold_sizes[i] += 1; } // generate the right array of arrays for test indices @@ -175,13 +175,13 @@ impl BaseKFold for KFold { .clone() .iter() .enumerate() - .filter(|&(idx, _)| test_index[idx] == false) + .filter(|&(idx, _)| !test_index[idx]) .map(|(idx, _)| idx) .collect::>(); // filter train indices out according to mask let test_index = indices .iter() .enumerate() - .filter(|&(idx, _)| test_index[idx] == true) + .filter(|&(idx, _)| test_index[idx]) .map(|(idx, _)| idx) .collect::>(); // filter tests indices out according to mask return_values.push((train_index, test_index)) diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 3ad4297..135594a 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -78,7 +78,7 @@ impl, T>> PartialEq for KNNClassifier { || self.k != other.k || self.y.len() != other.y.len() { - return false; + false } else { for i in 0..self.classes.len() { if (self.classes[i] - other.classes[i]).abs() > T::epsilon() { @@ -139,7 +139,7 @@ impl, T>> KNNClassifier { } Ok(KNNClassifier { - classes: classes, + classes, y: yi, k: parameters.k, knn_algorithm: parameters.algorithm.fit(data, distance)?, @@ -166,13 +166,13 @@ impl, T>> KNNClassifier { let weights = self .weight .calc_weights(search_result.iter().map(|v| v.1).collect()); - let w_sum = weights.iter().map(|w| *w).sum(); + let w_sum = weights.iter().copied().sum(); let mut c = vec![T::zero(); self.classes.len()]; let mut max_c = T::zero(); let mut max_i = 0; for (r, w) in search_result.iter().zip(weights.iter()) { - c[self.y[r.0]] = c[self.y[r.0]] + (*w / w_sum); + c[self.y[r.0]] += *w / w_sum; if c[self.y[r.0]] > max_c { max_c = c[self.y[r.0]]; max_i = self.y[r.0]; diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 0bf283f..b7c0f2d 100644 --- 
a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -76,7 +76,7 @@ impl Default for KNNRegressorParameters { impl, T>> PartialEq for KNNRegressor { fn eq(&self, other: &Self) -> bool { if self.k != other.k || self.y.len() != other.y.len() { - return false; + false } else { for i in 0..self.y.len() { if (self.y[i] - other.y[i]).abs() > T::epsilon() { @@ -151,10 +151,10 @@ impl, T>> KNNRegressor { let weights = self .weight .calc_weights(search_result.iter().map(|v| v.1).collect()); - let w_sum = weights.iter().map(|w| *w).sum(); + let w_sum = weights.iter().copied().sum(); for (r, w) in search_result.iter().zip(weights.iter()) { - result = result + self.y[r.0] * (*w / w_sum); + result += self.y[r.0] * (*w / w_sum); } Ok(result) diff --git a/src/optimization/first_order/gradient_descent.rs b/src/optimization/first_order/gradient_descent.rs index c860084..9cc78ec 100644 --- a/src/optimization/first_order/gradient_descent.rs +++ b/src/optimization/first_order/gradient_descent.rs @@ -74,8 +74,8 @@ impl FirstOrderOptimizer for GradientDescent { let f_x = f(&x); OptimizerResult { - x: x, - f_x: f_x, + x, + f_x, iterations: iter, } } diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 84a405e..1f563c1 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -48,7 +48,7 @@ impl Kernels { /// Radial basis function kernel (Gaussian) pub fn rbf(gamma: T) -> RBFKernel { - RBFKernel { gamma: gamma } + RBFKernel { gamma } } /// Polynomial kernel @@ -57,9 +57,9 @@ impl Kernels { /// * `coef0` - independent term in kernel function pub fn polynomial(degree: T, gamma: T, coef0: T) -> PolynomialKernel { PolynomialKernel { - degree: degree, - gamma: gamma, - coef0: coef0, + degree, + gamma, + coef0, } } @@ -79,17 +79,14 @@ impl Kernels { /// * `gamma` - kernel coefficient /// * `coef0` - independent term in kernel function pub fn sigmoid(gamma: T, coef0: T) -> SigmoidKernel { - SigmoidKernel { - gamma: gamma, - coef0: coef0, - } + SigmoidKernel { gamma, coef0 } } /// 
Sigmoid kernel /// * `gamma` - kernel coefficient pub fn sigmoid_with_gamma(gamma: T) -> SigmoidKernel { SigmoidKernel { - gamma: gamma, + gamma, coef0: T::one(), } } diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 119b812..bac6e4e 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -173,9 +173,9 @@ impl, K: Kernel> SVC { let (n, _) = x.shape(); if n != y.len() { - return Err(Failed::fit(&format!( - "Number of rows of X doesn't match number of rows of Y" - ))); + return Err(Failed::fit( + &"Number of rows of X doesn\'t match number of rows of Y".to_string(), + )); } let classes = y.unique(); @@ -204,11 +204,11 @@ impl, K: Kernel> SVC { let (support_vectors, weight, b) = optimizer.optimize(); Ok(SVC { - classes: classes, - kernel: kernel, + classes, + kernel, instances: support_vectors, w: weight, - b: b, + b, }) } @@ -251,7 +251,7 @@ impl, K: Kernel> PartialEq for SVC< || self.w.len() != other.w.len() || self.instances.len() != other.instances.len() { - return false; + false } else { for i in 0..self.w.len() { if (self.w[i] - other.w[i]).abs() > T::epsilon() { @@ -263,7 +263,7 @@ impl, K: Kernel> PartialEq for SVC< return false; } } - return true; + true } } } @@ -278,12 +278,12 @@ impl> SupportVector { }; SupportVector { index: i, - x: x, + x, grad: g, k: k_v, alpha: T::zero(), - cmin: cmin, - cmax: cmax, + cmin, + cmax, } } } @@ -291,7 +291,7 @@ impl> SupportVector { impl<'a, T: RealNumber, M: Matrix, K: Kernel> Cache<'a, T, M, K> { fn new(kernel: &'a K) -> Cache<'a, T, M, K> { Cache { - kernel: kernel, + kernel, data: HashMap::new(), phantom: PhantomData, } @@ -326,8 +326,8 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, let (n, _) = x.shape(); Optimizer { - x: x, - y: y, + x, + y, parameters: ¶meters, svmin: 0, svmax: 0, @@ -335,7 +335,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, gmax: T::min_value(), tau: T::from_f64(1e-12).unwrap(), sv: Vec::with_capacity(n), - kernel: kernel, + kernel, recalculate_minmax_grad: true, } } 
@@ -389,10 +389,11 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, if self.process(i, self.x.get_row(i), self.y.get(i), cache) { cp += 1; } - } else if self.y.get(i) == -T::one() && cn < few { - if self.process(i, self.x.get_row(i), self.y.get(i), cache) { - cn += 1; - } + } else if self.y.get(i) == -T::one() + && cn < few + && self.process(i, self.x.get_row(i), self.y.get(i), cache) + { + cn += 1; } if cp >= few && cn >= few { @@ -420,10 +421,10 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, self.find_min_max_gradient(); - if self.gmin < self.gmax { - if (y > T::zero() && g < self.gmin) || (y < T::zero() && g > self.gmax) { - return false; - } + if self.gmin < self.gmax + && ((y > T::zero() && g < self.gmin) || (y < T::zero() && g > self.gmax)) + { + return false; } for v in cache_values { @@ -494,13 +495,12 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, let mut idxs_to_drop: HashSet = HashSet::new(); self.sv.retain(|v| { - if v.alpha == T::zero() { - if (v.grad >= gmax && T::zero() >= v.cmax) - || (v.grad <= gmin && T::zero() <= v.cmin) - { - idxs_to_drop.insert(v.index); - return false; - } + if v.alpha == T::zero() + && ((v.grad >= gmax && T::zero() >= v.cmax) + || (v.grad <= gmin && T::zero() <= v.cmin)) + { + idxs_to_drop.insert(v.index); + return false; }; true }); @@ -647,7 +647,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, self.update(idx_1, idx_2, step, cache); - return self.gmax - self.gmin > tol; + self.gmax - self.gmin > tol } None => false, } diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 61feb80..36f308a 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -160,9 +160,9 @@ impl, K: Kernel> SVR { let (n, _) = x.shape(); if n != y.len() { - return Err(Failed::fit(&format!( - "Number of rows of X doesn't match number of rows of Y" - ))); + return Err(Failed::fit( + &"Number of rows of X doesn\'t match number of rows of Y".to_string(), + )); } let optimizer = Optimizer::new(x, y, 
&kernel, ¶meters); @@ -170,10 +170,10 @@ impl, K: Kernel> SVR { let (support_vectors, weight, b) = optimizer.smo(); Ok(SVR { - kernel: kernel, + kernel, instances: support_vectors, w: weight, - b: b, + b, }) } @@ -198,7 +198,7 @@ impl, K: Kernel> SVR { f += self.w[i] * self.kernel.apply(&x, &self.instances[i]); } - return f; + f } } @@ -208,7 +208,7 @@ impl, K: Kernel> PartialEq for SVR< || self.w.len() != other.w.len() || self.instances.len() != other.instances.len() { - return false; + false } else { for i in 0..self.w.len() { if (self.w[i] - other.w[i]).abs() > T::epsilon() { @@ -220,7 +220,7 @@ impl, K: Kernel> PartialEq for SVR< return false; } } - return true; + true } } } @@ -230,7 +230,7 @@ impl> SupportVector { let k_v = k.apply(&x, &x); SupportVector { index: i, - x: x, + x, grad: [eps + y, eps - y], k: k_v, alpha: [T::zero(), T::zero()], @@ -270,7 +270,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, gmaxindex: 0, tau: T::from_f64(1e-12).unwrap(), sv: support_vectors, - kernel: kernel, + kernel, } } @@ -392,11 +392,9 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, self.sv[v2].alpha[j] = T::zero(); self.sv[v1].alpha[i] = diff; } - } else { - if self.sv[v1].alpha[i] < T::zero() { - self.sv[v1].alpha[i] = T::zero(); - self.sv[v2].alpha[j] = -diff; - } + } else if self.sv[v1].alpha[i] < T::zero() { + self.sv[v1].alpha[i] = T::zero(); + self.sv[v2].alpha[j] = -diff; } if diff > T::zero() { @@ -404,11 +402,9 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, self.sv[v1].alpha[i] = self.c; self.sv[v2].alpha[j] = self.c - diff; } - } else { - if self.sv[v2].alpha[j] > self.c { - self.sv[v2].alpha[j] = self.c; - self.sv[v1].alpha[i] = self.c + diff; - } + } else if self.sv[v2].alpha[j] > self.c { + self.sv[v2].alpha[j] = self.c; + self.sv[v1].alpha[i] = self.c + diff; } } else { let delta = (self.sv[v1].grad[i] - self.sv[v2].grad[j]) / curv; @@ -421,11 +417,9 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, 
self.sv[v1].alpha[i] = self.c; self.sv[v2].alpha[j] = sum - self.c; } - } else { - if self.sv[v2].alpha[j] < T::zero() { - self.sv[v2].alpha[j] = T::zero(); - self.sv[v1].alpha[i] = sum; - } + } else if self.sv[v2].alpha[j] < T::zero() { + self.sv[v2].alpha[j] = T::zero(); + self.sv[v1].alpha[i] = sum; } if sum > self.c { @@ -433,11 +427,9 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, self.sv[v2].alpha[j] = self.c; self.sv[v1].alpha[i] = sum - self.c; } - } else { - if self.sv[v1].alpha[i] < T::zero() { - self.sv[v1].alpha[i] = T::zero(); - self.sv[v2].alpha[j] = sum; - } + } else if self.sv[v1].alpha[i] < T::zero() { + self.sv[v1].alpha[i] = T::zero(); + self.sv[v2].alpha[j] = sum; } } diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 25704e6..b30fb2d 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -126,7 +126,7 @@ impl PartialEq for DecisionTreeClassifier { || self.num_classes != other.num_classes || self.nodes.len() != other.nodes.len() { - return false; + false } else { for i in 0..self.classes.len() { if (self.classes[i] - other.classes[i]).abs() > T::epsilon() { @@ -138,7 +138,7 @@ impl PartialEq for DecisionTreeClassifier { return false; } } - return true; + true } } } @@ -174,8 +174,8 @@ impl Default for DecisionTreeClassifierParameters { impl Node { fn new(index: usize, output: usize) -> Self { Node { - index: index, - output: output, + index, + output, split_feature: 0, split_value: Option::None, split_score: Option::None, @@ -206,7 +206,7 @@ fn impurity(criterion: &SplitCriterion, count: &Vec, n: us for i in 0..count.len() { if count[i] > 0 { let p = T::from(count[i]).unwrap() / T::from(n).unwrap(); - impurity = impurity - p * p; + impurity -= p * p; } } } @@ -215,7 +215,7 @@ fn impurity(criterion: &SplitCriterion, count: &Vec, n: us for i in 0..count.len() { if count[i] > 0 { let p = T::from(count[i]).unwrap() / T::from(n).unwrap(); - impurity = 
impurity - p * p.log2(); + impurity -= p * p.log2(); } } } @@ -229,7 +229,7 @@ fn impurity(criterion: &SplitCriterion, count: &Vec, n: us } } - return impurity; + impurity } impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { @@ -242,14 +242,14 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { level: u16, ) -> Self { NodeVisitor { - x: x, - y: y, + x, + y, node: node_id, - samples: samples, - order: order, + samples, + order, true_child_output: 0, false_child_output: 0, - level: level, + level, phantom: PhantomData, } } @@ -266,7 +266,7 @@ pub(in crate) fn which_max(x: &Vec) -> usize { } } - return which; + which } impl DecisionTreeClassifier { @@ -325,10 +325,10 @@ impl DecisionTreeClassifier { } let mut tree = DecisionTreeClassifier { - nodes: nodes, - parameters: parameters, + nodes, + parameters, num_classes: k, - classes: classes, + classes, depth: 0, }; @@ -376,19 +376,18 @@ impl DecisionTreeClassifier { let node = &self.nodes[node_id]; if node.true_child == None && node.false_child == None { result = node.output; + } else if x.get(row, node.split_feature) <= node.split_value.unwrap_or(T::nan()) + { + queue.push_back(node.true_child.unwrap()); } else { - if x.get(row, node.split_feature) <= node.split_value.unwrap_or(T::nan()) { - queue.push_back(node.true_child.unwrap()); - } else { - queue.push_back(node.false_child.unwrap()); - } + queue.push_back(node.false_child.unwrap()); } } None => break, }; } - return result; + result } fn find_best_cutoff>( diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 0f88d4d..0d6da54 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -113,8 +113,8 @@ impl Default for DecisionTreeRegressorParameters { impl Node { fn new(index: usize, output: T) -> Self { Node { - index: index, - output: output, + index, + output, split_feature: 0, split_value: Option::None, split_score: Option::None, @@ -144,14 +144,14 @@ impl PartialEq for 
Node { impl PartialEq for DecisionTreeRegressor { fn eq(&self, other: &Self) -> bool { if self.depth != other.depth || self.nodes.len() != other.nodes.len() { - return false; + false } else { for i in 0..self.nodes.len() { if self.nodes[i] != other.nodes[i] { return false; } } - return true; + true } } } @@ -177,14 +177,14 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { level: u16, ) -> Self { NodeVisitor { - x: x, - y: y, + x, + y, node: node_id, - samples: samples, - order: order, + samples, + order, true_child_output: T::zero(), false_child_output: T::zero(), - level: level, + level, } } } @@ -221,7 +221,7 @@ impl DecisionTreeRegressor { let mut sum = T::zero(); for i in 0..y_ncols { n += samples[i]; - sum = sum + T::from(samples[i]).unwrap() * y_m.get(0, i); + sum += T::from(samples[i]).unwrap() * y_m.get(0, i); } let root = Node::new(0, sum / T::from(n).unwrap()); @@ -233,8 +233,8 @@ impl DecisionTreeRegressor { } let mut tree = DecisionTreeRegressor { - nodes: nodes, - parameters: parameters, + nodes, + parameters, depth: 0, }; @@ -282,19 +282,18 @@ impl DecisionTreeRegressor { let node = &self.nodes[node_id]; if node.true_child == None && node.false_child == None { result = node.output; + } else if x.get(row, node.split_feature) <= node.split_value.unwrap_or(T::nan()) + { + queue.push_back(node.true_child.unwrap()); } else { - if x.get(row, node.split_feature) <= node.split_value.unwrap_or(T::nan()) { - queue.push_back(node.true_child.unwrap()); - } else { - queue.push_back(node.false_child.unwrap()); - } + queue.push_back(node.false_child.unwrap()); } } None => break, }; } - return result; + result } fn find_best_cutoff>( @@ -348,8 +347,7 @@ impl DecisionTreeRegressor { if prevx.is_nan() || visitor.x.get(*i, j) == prevx { prevx = visitor.x.get(*i, j); true_count += visitor.samples[*i]; - true_sum = - true_sum + T::from(visitor.samples[*i]).unwrap() * visitor.y.get(0, *i); + true_sum += T::from(visitor.samples[*i]).unwrap() * visitor.y.get(0, 
*i); continue; } @@ -360,8 +358,7 @@ impl DecisionTreeRegressor { { prevx = visitor.x.get(*i, j); true_count += visitor.samples[*i]; - true_sum = - true_sum + T::from(visitor.samples[*i]).unwrap() * visitor.y.get(0, *i); + true_sum += T::from(visitor.samples[*i]).unwrap() * visitor.y.get(0, *i); continue; } @@ -384,7 +381,7 @@ impl DecisionTreeRegressor { } prevx = visitor.x.get(*i, j); - true_sum = true_sum + T::from(visitor.samples[*i]).unwrap() * visitor.y.get(0, *i); + true_sum += T::from(visitor.samples[*i]).unwrap() * visitor.y.get(0, *i); true_count += visitor.samples[*i]; } } From ea5de9758a2d367cdefab2da4f8f8332787b937d Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 19:46:37 -0400 Subject: [PATCH 23/79] Add -Drust-2018-idioms to clippy --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index dd616af..0f118da 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -40,4 +40,4 @@ jobs: command: rustup component add clippy - run: name: Run cargo clippy - command: cargo clippy + command: cargo clippy -- -Drust-2018-idioms From 54886ebd728d58ae9fc5bbebf21b0b7a594bcf4a Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 20:24:08 -0400 Subject: [PATCH 24/79] Fix rust-2018-idioms warnings --- src/cluster/dbscan.rs | 2 -- src/cluster/kmeans.rs | 2 -- src/ensemble/random_forest_classifier.rs | 2 -- src/ensemble/random_forest_regressor.rs | 1 - src/error/mod.rs | 4 ++-- src/linalg/mod.rs | 2 +- src/linalg/naive/dense_matrix.rs | 5 ++--- src/model_selection/mod.rs | 1 - src/optimization/first_order/gradient_descent.rs | 4 ++-- src/optimization/first_order/lbfgs.rs | 10 +++++----- src/optimization/first_order/mod.rs | 4 ++-- src/svm/svc.rs | 16 ++++++++-------- src/svm/svr.rs | 2 +- src/tree/decision_tree_classifier.rs | 6 +++--- src/tree/decision_tree_regressor.rs | 6 +++--- 15 files changed, 29 insertions(+), 38 deletions(-) diff --git 
a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 787d8d3..e595028 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -29,8 +29,6 @@ //! * ["A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", Ester M., Kriegel HP., Sander J., Xu X.](http://faculty.marshall.usc.edu/gareth-james/ISL/) //! * ["Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and its Applications", Sander J., Ester M., Kriegel HP., Xu X.](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.1629&rep=rep1&type=pdf) -extern crate rand; - use std::fmt::Debug; use std::iter::Sum; diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 0da8a72..26a4038 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -52,8 +52,6 @@ //! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 10.3.1 K-Means Clustering](http://faculty.marshall.usc.edu/gareth-james/ISL/) //! * ["k-means++: The Advantages of Careful Seeding", Arthur D., Vassilvitskii S.](http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf) -extern crate rand; - use rand::Rng; use std::fmt::Debug; use std::iter::Sum; diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 0cfebf1..e1d462a 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -45,8 +45,6 @@ //! //! //! -extern crate rand; - use std::default::Default; use std::fmt::Debug; diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index c704a8f..36fa096 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -42,7 +42,6 @@ //! //! //! 
-extern crate rand; use std::default::Default; use std::fmt::Debug; diff --git a/src/error/mod.rs b/src/error/mod.rs index 679f685..1615290 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -82,7 +82,7 @@ impl PartialEq for Failed { } impl fmt::Display for FailedError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let failed_err_str = match self { FailedError::FitFailed => "Fit failed", FailedError::PredictFailed => "Predict failed", @@ -96,7 +96,7 @@ impl fmt::Display for FailedError { } impl fmt::Display for Failed { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}: {}", self.err, self.msg) } } diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 09a9687..fc9d6c9 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -515,7 +515,7 @@ pub trait Matrix: { } -pub(crate) fn row_iter>(m: &M) -> RowIter { +pub(crate) fn row_iter>(m: &M) -> RowIter<'_, F, M> { RowIter { m, pos: 0, diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index c1ba650..aff0fa2 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1,4 +1,3 @@ -extern crate num; use std::fmt; use std::fmt::Debug; use std::marker::PhantomData; @@ -197,7 +196,7 @@ pub struct DenseMatrixIterator<'a, T: RealNumber> { } impl fmt::Display for DenseMatrix { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut rows: Vec> = Vec::new(); for r in 0..self.nrows { rows.push( @@ -356,7 +355,7 @@ impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for De impl<'a, T: RealNumber + fmt::Debug + Deserialize<'a>> Visitor<'a> for DenseMatrixVisitor { type Value = DenseMatrix; - fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> 
fmt::Result { formatter.write_str("struct DenseMatrix") } diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index c53451d..d4908f6 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -8,7 +8,6 @@ //! your data. //! //! In SmartCore you can split your data into training and test datasets using `train_test_split` function. -extern crate rand; use crate::linalg::BaseVector; use crate::linalg::Matrix; diff --git a/src/optimization/first_order/gradient_descent.rs b/src/optimization/first_order/gradient_descent.rs index 9cc78ec..d57896f 100644 --- a/src/optimization/first_order/gradient_descent.rs +++ b/src/optimization/first_order/gradient_descent.rs @@ -25,8 +25,8 @@ impl Default for GradientDescent { impl FirstOrderOptimizer for GradientDescent { fn optimize<'a, X: Matrix, LS: LineSearchMethod>( &self, - f: &'a F, - df: &'a DF, + f: &'a F<'_, T, X>, + df: &'a DF<'_, X>, x0: &X, ls: &'a LS, ) -> OptimizerResult { diff --git a/src/optimization/first_order/lbfgs.rs b/src/optimization/first_order/lbfgs.rs index b63f617..5dedfe6 100644 --- a/src/optimization/first_order/lbfgs.rs +++ b/src/optimization/first_order/lbfgs.rs @@ -100,8 +100,8 @@ impl LBFGS { fn update_state<'a, X: Matrix, LS: LineSearchMethod>( &self, - f: &'a F, - df: &'a DF, + f: &'a F<'_, T, X>, + df: &'a DF<'_, X>, ls: &'a LS, state: &mut LBFGSState, ) { @@ -162,7 +162,7 @@ impl LBFGS { g_converged || x_converged || state.counter_f_tol > self.successive_f_tol } - fn update_hessian<'a, X: Matrix>(&self, _: &'a DF, state: &mut LBFGSState) { + fn update_hessian<'a, X: Matrix>(&self, _: &'a DF<'_, X>, state: &mut LBFGSState) { state.dg = state.x_df.sub(&state.x_df_prev); let rho_iteration = T::one() / state.dx.dot(&state.dg); if !rho_iteration.is_infinite() { @@ -198,8 +198,8 @@ struct LBFGSState> { impl FirstOrderOptimizer for LBFGS { fn optimize<'a, X: Matrix, LS: LineSearchMethod>( &self, - f: &F, - df: &'a DF, + f: &F<'_, T, X>, + df: &'a DF<'_, X>, x0: &X, ls: 
&'a LS, ) -> OptimizerResult { diff --git a/src/optimization/first_order/mod.rs b/src/optimization/first_order/mod.rs index d1c628f..f2e476f 100644 --- a/src/optimization/first_order/mod.rs +++ b/src/optimization/first_order/mod.rs @@ -12,8 +12,8 @@ use crate::optimization::{DF, F}; pub trait FirstOrderOptimizer { fn optimize<'a, X: Matrix, LS: LineSearchMethod>( &self, - f: &F, - df: &'a DF, + f: &F<'_, T, X>, + df: &'a DF<'_, X>, x0: &X, ls: &'a LS, ) -> OptimizerResult; diff --git a/src/svm/svc.rs b/src/svm/svc.rs index bac6e4e..62a9e01 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -378,7 +378,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, (support_vectors, w, b) } - fn initialize(&mut self, cache: &mut Cache) { + fn initialize(&mut self, cache: &mut Cache<'_, T, M, K>) { let (n, _) = self.x.shape(); let few = 5; let mut cp = 0; @@ -402,7 +402,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } } - fn process(&mut self, i: usize, x: M::RowVector, y: T, cache: &mut Cache) -> bool { + fn process(&mut self, i: usize, x: M::RowVector, y: T, cache: &mut Cache<'_, T, M, K>) -> bool { for j in 0..self.sv.len() { if self.sv[j].index == i { return true; @@ -445,13 +445,13 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, true } - fn reprocess(&mut self, tol: T, cache: &mut Cache) -> bool { + fn reprocess(&mut self, tol: T, cache: &mut Cache<'_, T, M, K>) -> bool { let status = self.smo(None, None, tol, cache); self.clean(cache); status } - fn finish(&mut self, cache: &mut Cache) { + fn finish(&mut self, cache: &mut Cache<'_, T, M, K>) { let mut max_iter = self.sv.len(); while self.smo(None, None, self.parameters.tol, cache) && max_iter > 0 { @@ -486,7 +486,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, self.recalculate_minmax_grad = false } - fn clean(&mut self, cache: &mut Cache) { + fn clean(&mut self, cache: &mut Cache<'_, T, M, K>) { self.find_min_max_gradient(); let gmax = self.gmax; @@ -520,7 
+520,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, &mut self, idx_1: Option, idx_2: Option, - cache: &mut Cache, + cache: &mut Cache<'_, T, M, K>, ) -> Option<(usize, usize, T)> { match (idx_1, idx_2) { (None, None) => { @@ -614,7 +614,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, idx_1: Option, idx_2: Option, tol: T, - cache: &mut Cache, + cache: &mut Cache<'_, T, M, K>, ) -> bool { match self.select_pair(idx_1, idx_2, cache) { Some((idx_1, idx_2, k_v_12)) => { @@ -653,7 +653,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, } } - fn update(&mut self, v1: usize, v2: usize, step: T, cache: &mut Cache) { + fn update(&mut self, v1: usize, v2: usize, step: T, cache: &mut Cache<'_, T, M, K>) { self.sv[v1].alpha -= step; self.sv[v2].alpha += step; diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 36f308a..5d007d7 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -469,7 +469,7 @@ impl Cache { } } - fn get Vec>(&self, i: usize, or: F) -> Ref> { + fn get Vec>(&self, i: usize, or: F) -> Ref<'_, Vec> { if self.data[i].borrow().is_none() { self.data[i].replace(Some(or())); } diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index b30fb2d..353c1bd 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -334,7 +334,7 @@ impl DecisionTreeClassifier { let mut visitor = NodeVisitor::::new(0, samples, &order, &x, &yi, 1); - let mut visitor_queue: LinkedList> = LinkedList::new(); + let mut visitor_queue: LinkedList> = LinkedList::new(); if tree.find_best_cutoff(&mut visitor, mtry) { visitor_queue.push_back(visitor); @@ -392,7 +392,7 @@ impl DecisionTreeClassifier { fn find_best_cutoff>( &mut self, - visitor: &mut NodeVisitor, + visitor: &mut NodeVisitor<'_, T, M>, mtry: usize, ) -> bool { let (n_rows, n_attr) = visitor.x.shape(); @@ -455,7 +455,7 @@ impl DecisionTreeClassifier { fn find_best_split>( &mut self, - visitor: &mut NodeVisitor, + visitor: 
&mut NodeVisitor<'_, T, M>, n: usize, count: &Vec, false_count: &mut Vec, diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 0d6da54..39f3eb8 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -240,7 +240,7 @@ impl DecisionTreeRegressor { let mut visitor = NodeVisitor::::new(0, samples, &order, &x, &y_m, 1); - let mut visitor_queue: LinkedList> = LinkedList::new(); + let mut visitor_queue: LinkedList> = LinkedList::new(); if tree.find_best_cutoff(&mut visitor, mtry) { visitor_queue.push_back(visitor); @@ -298,7 +298,7 @@ impl DecisionTreeRegressor { fn find_best_cutoff>( &mut self, - visitor: &mut NodeVisitor, + visitor: &mut NodeVisitor<'_, T, M>, mtry: usize, ) -> bool { let (_, n_attr) = visitor.x.shape(); @@ -332,7 +332,7 @@ impl DecisionTreeRegressor { fn find_best_split>( &mut self, - visitor: &mut NodeVisitor, + visitor: &mut NodeVisitor<'_, T, M>, n: usize, sum: T, parent_gain: T, From 8a2da00665df708b883765edce7529717c55f831 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 20:58:47 -0400 Subject: [PATCH 25/79] Fail in case of clippy warning --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0f118da..069c56d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -40,4 +40,4 @@ jobs: command: rustup component add clippy - run: name: Run cargo clippy - command: cargo clippy -- -Drust-2018-idioms + command: cargo clippy -- -Drust-2018-idioms -Dwarnings From 4d75af67033f81eaf005b8319f044d48ea439d60 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 20:59:27 -0400 Subject: [PATCH 26/79] Allow temporarily the warnings that are currently failing --- src/lib.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 083b95f..687becf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,6 +64,22 @@ //! 
let y_hat = knn.predict(&x).unwrap(); //! ``` +#![allow( + clippy::or_fun_call, + clippy::needless_range_loop, + clippy::ptr_arg, + clippy::len_without_is_empty, + clippy::extra_unused_lifetimes, + clippy::map_entry, + clippy::comparison_chain, + clippy::type_complexity, + clippy::needless_lifetimes, + clippy::too_many_arguments, + clippy::unnecessary_mut_passed, + clippy::let_and_return, + clippy::many_single_char_names, + clippy::tabs_in_doc_comments +)] /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; /// Algorithms for clustering of unlabeled data From 43584e14e57ed131104c9dd71c61c3fd9f78fc27 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 23:15:50 -0400 Subject: [PATCH 27/79] Fix clippy::or_fun_call --- src/ensemble/random_forest_classifier.rs | 6 +++--- src/lib.rs | 1 - src/metrics/cluster_hcv.rs | 4 ++-- src/svm/svc.rs | 8 ++++++-- src/tree/decision_tree_classifier.rs | 5 +++-- src/tree/decision_tree_regressor.rs | 5 +++-- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index e1d462a..011b0ba 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -137,13 +137,13 @@ impl RandomForestClassifier { yi[i] = classes.iter().position(|c| yc == *c).unwrap(); } - let mtry = parameters.m.unwrap_or( + let mtry = parameters.m.unwrap_or_else(|| { (T::from(num_attributes).unwrap()) .sqrt() .floor() .to_usize() - .unwrap(), - ); + .unwrap() + }); let classes = y_m.unique(); let k = classes.len(); diff --git a/src/lib.rs b/src/lib.rs index 687becf..2142c8b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,7 +65,6 @@ //! 
``` #![allow( - clippy::or_fun_call, clippy::needless_range_loop, clippy::ptr_arg, clippy::len_without_is_empty, diff --git a/src/metrics/cluster_hcv.rs b/src/metrics/cluster_hcv.rs index bdefc8d..29a9db2 100644 --- a/src/metrics/cluster_hcv.rs +++ b/src/metrics/cluster_hcv.rs @@ -24,8 +24,8 @@ impl HCVScore { let contingency = contingency_matrix(&labels_true, &labels_pred); let mi: T = mutual_info_score(&contingency); - let homogeneity = entropy_c.map(|e| mi / e).unwrap_or(T::one()); - let completeness = entropy_k.map(|e| mi / e).unwrap_or(T::one()); + let homogeneity = entropy_c.map(|e| mi / e).unwrap_or_else(T::one); + let completeness = entropy_k.map(|e| mi / e).unwrap_or_else(T::one); let v_measure_score = if homogeneity + completeness == T::zero() { T::zero() diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 62a9e01..f2d518b 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -561,7 +561,9 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, ( idx_1, idx_2, - k_v_12.unwrap_or(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)), + k_v_12.unwrap_or_else(|| { + self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x) + }), ) }) } @@ -597,7 +599,9 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, ( idx_1, idx_2, - k_v_12.unwrap_or(self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x)), + k_v_12.unwrap_or_else(|| { + self.kernel.apply(&self.sv[idx_1].x, &self.sv[idx_2].x) + }), ) }) } diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 353c1bd..9fe1b1a 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -376,7 +376,8 @@ impl DecisionTreeClassifier { let node = &self.nodes[node_id]; if node.true_child == None && node.false_child == None { result = node.output; - } else if x.get(row, node.split_feature) <= node.split_value.unwrap_or(T::nan()) + } else if x.get(row, node.split_feature) + <= node.split_value.unwrap_or_else(T::nan) { 
queue.push_back(node.true_child.unwrap()); } else { @@ -529,7 +530,7 @@ impl DecisionTreeClassifier { for i in 0..n { if visitor.samples[i] > 0 { if visitor.x.get(i, self.nodes[visitor.node].split_feature) - <= self.nodes[visitor.node].split_value.unwrap_or(T::nan()) + <= self.nodes[visitor.node].split_value.unwrap_or_else(T::nan) { true_samples[i] = visitor.samples[i]; tc += true_samples[i]; diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 39f3eb8..c30c9e2 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -282,7 +282,8 @@ impl DecisionTreeRegressor { let node = &self.nodes[node_id]; if node.true_child == None && node.false_child == None { result = node.output; - } else if x.get(row, node.split_feature) <= node.split_value.unwrap_or(T::nan()) + } else if x.get(row, node.split_feature) + <= node.split_value.unwrap_or_else(T::nan) { queue.push_back(node.true_child.unwrap()); } else { @@ -401,7 +402,7 @@ impl DecisionTreeRegressor { for i in 0..n { if visitor.samples[i] > 0 { if visitor.x.get(i, self.nodes[visitor.node].split_feature) - <= self.nodes[visitor.node].split_value.unwrap_or(T::nan()) + <= self.nodes[visitor.node].split_value.unwrap_or_else(T::nan) { true_samples[i] = visitor.samples[i]; tc += true_samples[i]; From 513d916580f72ca87418fb6a62bc2ef3898a89b2 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 23:20:22 -0400 Subject: [PATCH 28/79] Fix clippy::tabs_in_doc_comments --- src/lib.rs | 1 - src/math/distance/mod.rs | 2 +- src/neighbors/mod.rs | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2142c8b..85aa3b8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,7 +77,6 @@ clippy::unnecessary_mut_passed, clippy::let_and_return, clippy::many_single_char_names, - clippy::tabs_in_doc_comments )] /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; diff --git 
a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 1219ec6..696b5ff 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -4,7 +4,7 @@ //! Formally, the distance can be any metric measure that is defined as \\( d(x, y) \geq 0\\) and follows three conditions: //! 1. \\( d(x, y) = 0 \\) if and only \\( x = y \\), positive definiteness //! 1. \\( d(x, y) = d(y, x) \\), symmetry -//! 1. \\( d(x, y) \leq d(x, z) + d(z, y) \\), subadditivity or triangle inequality +//! 1. \\( d(x, y) \leq d(x, z) + d(z, y) \\), subadditivity or triangle inequality //! //! for all \\(x, y, z \in Z \\) //! diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index 6d542f6..be1ad4d 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -10,7 +10,7 @@ //! and follows three conditions: //! 1. \\( d(x, y) = 0 \\) if and only \\( x = y \\), positive definiteness //! 1. \\( d(x, y) = d(y, x) \\), symmetry -//! 1. \\( d(x, y) \leq d(x, z) + d(z, y) \\), subadditivity or triangle inequality +//! 1. \\( d(x, y) \leq d(x, z) + d(z, y) \\), subadditivity or triangle inequality //! //! for all \\(x, y, z \in Z \\) //! 
From b780e0c289080526a940c1d06fbe337cd81ec8b4 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 23:22:18 -0400 Subject: [PATCH 29/79] Fix clippy::unnecessary_mut_passed --- src/algorithm/neighbour/bbd_tree.rs | 4 ++-- src/lib.rs | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/algorithm/neighbour/bbd_tree.rs b/src/algorithm/neighbour/bbd_tree.rs index 632da86..85e6628 100644 --- a/src/algorithm/neighbour/bbd_tree.rs +++ b/src/algorithm/neighbour/bbd_tree.rs @@ -134,7 +134,7 @@ impl BBDTree { return self.filter( self.nodes[node].lower.unwrap(), centroids, - &mut new_candidates, + &new_candidates, newk, sums, counts, @@ -142,7 +142,7 @@ impl BBDTree { ) + self.filter( self.nodes[node].upper.unwrap(), centroids, - &mut new_candidates, + &new_candidates, newk, sums, counts, diff --git a/src/lib.rs b/src/lib.rs index 85aa3b8..80da506 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,7 +74,6 @@ clippy::type_complexity, clippy::needless_lifetimes, clippy::too_many_arguments, - clippy::unnecessary_mut_passed, clippy::let_and_return, clippy::many_single_char_names, )] From dd2864abe78426554d4b3217b01140139fc2bb6e Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 23:23:55 -0400 Subject: [PATCH 30/79] Fix clippy::extra_unused_lifetimes --- src/lib.rs | 3 +-- src/optimization/line_search.rs | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 80da506..b0bf26c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -68,14 +68,13 @@ clippy::needless_range_loop, clippy::ptr_arg, clippy::len_without_is_empty, - clippy::extra_unused_lifetimes, clippy::map_entry, clippy::comparison_chain, clippy::type_complexity, clippy::needless_lifetimes, clippy::too_many_arguments, clippy::let_and_return, - clippy::many_single_char_names, + clippy::many_single_char_names )] /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; diff --git 
a/src/optimization/line_search.rs b/src/optimization/line_search.rs index 3481c87..e6a3b80 100644 --- a/src/optimization/line_search.rs +++ b/src/optimization/line_search.rs @@ -2,7 +2,7 @@ use crate::optimization::FunctionOrder; use num_traits::Float; pub trait LineSearchMethod { - fn search<'a>( + fn search( &self, f: &(dyn Fn(T) -> T), df: &(dyn Fn(T) -> T), From 0c35adf76aaba8b56745d2bac964e2ffd73d59f3 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 23:26:22 -0400 Subject: [PATCH 31/79] Fix clippy::let_and_return --- src/lib.rs | 1 - src/linalg/naive/dense_matrix.rs | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b0bf26c..0df22b0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,7 +73,6 @@ clippy::type_complexity, clippy::needless_lifetimes, clippy::too_many_arguments, - clippy::let_and_return, clippy::many_single_char_names )] /// Various algorithms and helper methods that are used elsewhere in SmartCore diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index aff0fa2..02bb8b6 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -29,8 +29,7 @@ impl BaseVector for Vec { } fn to_vec(&self) -> Vec { - let v = self.clone(); - v + self.clone() } fn zeros(len: usize) -> Self { From 3c1969bdf508eef642a97aa9c97e5d899fcf4225 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sun, 8 Nov 2020 23:30:08 -0400 Subject: [PATCH 32/79] Fix clippy::needless_lifetimes --- src/lib.rs | 1 - src/linalg/naive/dense_matrix.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0df22b0..c85596e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,7 +71,6 @@ clippy::map_entry, clippy::comparison_chain, clippy::type_complexity, - clippy::needless_lifetimes, clippy::too_many_arguments, clippy::many_single_char_names )] diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 02bb8b6..7ba28bf 
100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -305,7 +305,7 @@ impl DenseMatrix { /// Creates new column vector (_1xN_ matrix) from a vector. /// * `values` - values to initialize the matrix. - pub fn iter<'a>(&'a self) -> DenseMatrixIterator<'a, T> { + pub fn iter(&self) -> DenseMatrixIterator<'_, T> { DenseMatrixIterator { cur_c: 0, cur_r: 0, From 5e887634db987137f3691a563b7a932855f3508b Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Mon, 9 Nov 2020 00:02:22 -0400 Subject: [PATCH 33/79] Fix clippy::comparison_chain --- src/lib.rs | 1 - src/linalg/lu.rs | 11 +++-- src/linear/logistic_regression.rs | 69 ++++++++++++++++--------------- 3 files changed, 41 insertions(+), 40 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c85596e..8c97bf7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -69,7 +69,6 @@ clippy::ptr_arg, clippy::len_without_is_empty, clippy::map_entry, - clippy::comparison_chain, clippy::type_complexity, clippy::too_many_arguments, clippy::many_single_char_names diff --git a/src/linalg/lu.rs b/src/linalg/lu.rs index cbe195f..bfc7fff 100644 --- a/src/linalg/lu.rs +++ b/src/linalg/lu.rs @@ -33,6 +33,7 @@ //! #![allow(non_snake_case)] +use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; @@ -78,12 +79,10 @@ impl> LU { for i in 0..n_rows { for j in 0..n_cols { - if i > j { - L.set(i, j, self.LU.get(i, j)); - } else if i == j { - L.set(i, j, T::one()); - } else { - L.set(i, j, T::zero()); + match i.cmp(&j) { + Ordering::Greater => L.set(i, j, self.LU.get(i, j)), + Ordering::Equal => L.set(i, j, T::one()), + Ordering::Less => L.set(i, j, T::zero()), } } } diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index ec90af1..796caed 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -52,6 +52,7 @@ //! //! //! 
+use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; @@ -231,48 +232,50 @@ impl> LogisticRegression { yi[i] = classes.iter().position(|c| yc == *c).unwrap(); } - if k < 2 { - Err(Failed::fit(&format!( + match k.cmp(&2) { + Ordering::Less => Err(Failed::fit(&format!( "incorrect number of classes: {}. Should be >= 2.", k - ))) - } else if k == 2 { - let x0 = M::zeros(1, num_attributes + 1); + ))), + Ordering::Greater => { + let x0 = M::zeros(1, (num_attributes + 1) * k); - let objective = BinaryObjectiveFunction { - x, - y: yi, - phantom: PhantomData, - }; + let objective = MultiClassObjectiveFunction { + x, + y: yi, + k, + phantom: PhantomData, + }; - let result = LogisticRegression::minimize(x0, objective); + let result = LogisticRegression::minimize(x0, objective); - Ok(LogisticRegression { - weights: result.x, - classes, - num_attributes, - num_classes: k, - }) - } else { - let x0 = M::zeros(1, (num_attributes + 1) * k); + let weights = result.x.reshape(k, num_attributes + 1); - let objective = MultiClassObjectiveFunction { - x, - y: yi, - k, - phantom: PhantomData, - }; + Ok(LogisticRegression { + weights, + classes, + num_attributes, + num_classes: k, + }) + } + Ordering::Equal => { + let x0 = M::zeros(1, num_attributes + 1); - let result = LogisticRegression::minimize(x0, objective); + let objective = BinaryObjectiveFunction { + x, + y: yi, + phantom: PhantomData, + }; - let weights = result.x.reshape(k, num_attributes + 1); + let result = LogisticRegression::minimize(x0, objective); - Ok(LogisticRegression { - weights, - classes, - num_attributes, - num_classes: k, - }) + Ok(LogisticRegression { + weights: result.x, + classes, + num_attributes, + num_classes: k, + }) + } } } From 3d4d5f64f6ebcd9adf037442778639a7b6cbd00c Mon Sep 17 00:00:00 2001 From: morenol Date: Mon, 9 Nov 2020 15:54:27 -0400 Subject: [PATCH 34/79] feat: add Naive Bayes and CategoricalNB (#15) * feat: Implement Naive Bayes classifier * Implement CategoricalNB --- 
src/lib.rs | 2 + src/naive_bayes/categorical.rs | 232 +++++++++++++++++++++++++++++++++ src/naive_bayes/mod.rs | 69 ++++++++++ 3 files changed, 303 insertions(+) create mode 100644 src/naive_bayes/categorical.rs create mode 100644 src/naive_bayes/mod.rs diff --git a/src/lib.rs b/src/lib.rs index 083b95f..966d5ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -85,6 +85,8 @@ pub mod math; /// Functions for assessing prediction error. pub mod metrics; pub mod model_selection; +/// Supervised learning algorithms based on applying the Bayes theorem with the independence assumptions between predictors +pub mod naive_bayes; /// Supervised neighbors-based learning methods pub mod neighbors; pub(crate) mod optimization; diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs new file mode 100644 index 0000000..f948aeb --- /dev/null +++ b/src/naive_bayes/categorical.rs @@ -0,0 +1,232 @@ +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; +use serde::{Deserialize, Serialize}; + +/// Naive Bayes classifier for categorical features +struct CategoricalNBDistribution { + class_labels: Vec, + class_probabilities: Vec, + coef: Vec>>, + feature_categories: Vec>, +} + +impl> NBDistribution for CategoricalNBDistribution { + fn prior(&self, class_index: usize) -> T { + if class_index >= self.class_labels.len() { + T::zero() + } else { + self.class_probabilities[class_index] + } + } + + fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T { + if class_index < self.class_labels.len() { + let mut prob = T::one(); + for feature in 0..j.len() { + let value = j.get(feature); + match self.feature_categories[feature] + .iter() + .position(|&t| t == value) + { + Some(_i) => prob *= self.coef[class_index][feature][_i], + None => return T::zero(), + } + } + prob + } else { + T::zero() + } + } + + fn classes(&self) -> &Vec { + 
&self.class_labels + } +} + +impl CategoricalNBDistribution { + /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data. + /// * `y` - vector with target values (classes) of length N. + /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn fit>(x: &M, y: &M::RowVector, alpha: T) -> Result { + if alpha < T::zero() { + return Err(Failed::fit(&format!( + "alpha should be >= 0, alpha=[{}]", + alpha + ))); + } + + let (n_samples, n_features) = x.shape(); + let y_samples = y.len(); + if y_samples != n_samples { + return Err(Failed::fit(&format!( + "Size of x should equal size of y; |x|=[{}], |y|=[{}]", + n_samples, y_samples + ))); + } + + if n_samples == 0 { + return Err(Failed::fit(&format!( + "Size of x and y should greater than 0; |x|=[{}]", + n_samples + ))); + } + + let mut y_sorted = y.to_vec(); + y_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let mut class_labels = Vec::with_capacity(y.len()); + class_labels.push(y_sorted[0]); + let mut classes_count = Vec::with_capacity(y.len()); + let mut current_count = T::one(); + for idx in 1..y_samples { + if y_sorted[idx] == y_sorted[idx - 1] { + current_count += T::one(); + } else { + classes_count.push(current_count); + class_labels.push(y_sorted[idx]); + current_count = T::one() + } + classes_count.push(current_count); + } + + let mut feature_categories: Vec> = Vec::with_capacity(n_features); + + for feature in 0..n_features { + let feature_types = x.get_col_as_vec(feature).unique(); + feature_categories.push(feature_types); + } + let mut coef: Vec>> = Vec::with_capacity(class_labels.len()); + for (label, label_count) in class_labels.iter().zip(classes_count.iter()) { + let mut coef_i: Vec> = Vec::with_capacity(n_features); + for (feature_index, feature_options) in + feature_categories.iter().enumerate().take(n_features) + { + let col = x + .get_col_as_vec(feature_index) + .iter() + .enumerate() + 
.filter(|(i, _j)| y.get(*i) == *label) + .map(|(_, j)| *j) + .collect::>(); + let mut feat_count: Vec = Vec::with_capacity(feature_options.len()); + for k in feature_options.iter() { + let feat_k_count = col.iter().filter(|&v| v == k).count(); + feat_count.push(feat_k_count); + } + + let coef_i_j = feat_count + .iter() + .map(|c| { + (T::from(*c).unwrap() + alpha) + / (T::from(*label_count).unwrap() + + T::from(feature_options.len()).unwrap() * alpha) + }) + .collect::>(); + coef_i.push(coef_i_j); + } + coef.push(coef_i); + } + let class_probabilities = classes_count + .into_iter() + .map(|count| count / T::from(n_samples).unwrap()) + .collect::>(); + + Ok(Self { + class_labels, + class_probabilities, + coef, + feature_categories, + }) + } +} + +/// `CategoricalNB` parameters. Use `Default::default()` for default values. +#[derive(Serialize, Deserialize, Debug)] +pub struct CategoricalNBParameters { + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub alpha: T, +} + +impl CategoricalNBParameters { + /// Create CategoricalNBParameters with specific paramaters. + pub fn new(alpha: T) -> Result { + if alpha > T::zero() { + Ok(Self { alpha }) + } else { + Err(Failed::fit(&format!( + "alpha should be >= 0, alpha=[{}]", + alpha + ))) + } + } +} +impl Default for CategoricalNBParameters { + fn default() -> Self { + Self { alpha: T::one() } + } +} + +/// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data. +pub struct CategoricalNB> { + inner: BaseNaiveBayes>, +} + +impl> CategoricalNB { + /// Fits CategoricalNB with given data + /// * `x` - training data of size NxM where N is the number of samples and M is the number of + /// features. + /// * `y` - vector with target values (classes) of length N. 
+ /// * `parameters` - additional parameters like alpha for smoothing + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: CategoricalNBParameters, + ) -> Result { + let alpha = parameters.alpha; + let distribution = CategoricalNBDistribution::fit(x, y, alpha)?; + let inner = BaseNaiveBayes::fit(distribution)?; + Ok(Self { inner }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. + pub fn predict(&self, x: &M) -> Result { + self.inner.predict(x) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn run_base_naive_bayes() { + let x = DenseMatrix::from_2d_array(&[ + &[0., 2., 1., 0.], + &[0., 2., 1., 1.], + &[1., 2., 1., 0.], + &[2., 1., 1., 0.], + &[2., 0., 0., 0.], + &[2., 0., 0., 1.], + &[1., 0., 0., 1.], + &[0., 1., 1., 0.], + &[0., 0., 0., 0.], + &[2., 1., 0., 0.], + &[0., 1., 0., 1.], + &[1., 1., 1., 1.], + &[1., 2., 0., 0.], + &[2., 1., 1., 1.], + ]); + let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; + + let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); + let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]); + let y_hat = cnb.predict(&x_test).unwrap(); + assert_eq!(y_hat, vec![0., 1.]); + } +} diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs new file mode 100644 index 0000000..e9ab792 --- /dev/null +++ b/src/naive_bayes/mod.rs @@ -0,0 +1,69 @@ +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use std::marker::PhantomData; + +/// Distribution used in the Naive Bayes classifier. +pub(crate) trait NBDistribution> { + /// Prior of class at the given index. 
+ fn prior(&self, class_index: usize) -> T; + + /// Conditional probability of sample j given class in the specified index. + fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T; + + /// Possible classes of the distribution. + fn classes(&self) -> &Vec; +} + +/// Base struct for the Naive Bayes classifier. +pub(crate) struct BaseNaiveBayes, D: NBDistribution> { + distribution: D, + _phantom_t: PhantomData, + _phantom_m: PhantomData, +} + +impl, D: NBDistribution> BaseNaiveBayes { + /// Fits NB classifier to a given NBdistribution. + /// * `distribution` - NBDistribution of the training data + pub fn fit(distribution: D) -> Result { + Ok(Self { + distribution, + _phantom_t: PhantomData, + _phantom_m: PhantomData, + }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. + pub fn predict(&self, x: &M) -> Result { + let y_classes = self.distribution.classes(); + let (rows, _) = x.shape(); + let predictions = (0..rows) + .map(|row_index| { + let row = x.get_row(row_index); + let (prediction, _probability) = y_classes + .iter() + .enumerate() + .map(|(class_index, class)| { + ( + class, + self.distribution.conditional_probability(class_index, &row) + * self.distribution.prior(class_index), + ) + }) + .max_by(|(_, p1), (_, p2)| p1.partial_cmp(p2).unwrap()) + .unwrap(); + *prediction + }) + .collect::>(); + let mut y_hat = M::RowVector::zeros(rows); + for (i, prediction) in predictions.iter().enumerate().take(rows) { + y_hat.set(i, *prediction); + } + Ok(y_hat) + } +} +mod categorical; +pub use categorical::{CategoricalNB, CategoricalNBParameters}; From c756496b710a962f632061b2e9c153488aeaefb7 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Mon, 9 Nov 2020 00:21:02 -0400 Subject: [PATCH 35/79] Fix clippy::len_without_is_empty --- src/lib.rs | 1 - src/linalg/mod.rs | 5 +++++ 2 
files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 8c97bf7..4e87301 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -67,7 +67,6 @@ #![allow( clippy::needless_range_loop, clippy::ptr_arg, - clippy::len_without_is_empty, clippy::map_entry, clippy::type_complexity, clippy::too_many_arguments, diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index fc9d6c9..896d718 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -76,6 +76,11 @@ pub trait BaseVector: Clone + Debug { /// Get number of elevemnt in the vector fn len(&self) -> usize; + /// Returns true if the vector is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + /// Return a vector with the elements of the one-dimensional array. fn to_vec(&self) -> Vec; From d620f225ee167dff86cf85601cee918b4c2ff5d7 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 10 Nov 2020 00:20:26 -0400 Subject: [PATCH 36/79] Fix new warnings after rustup update --- src/dataset/generator.rs | 2 +- src/linear/logistic_regression.rs | 6 ++++-- src/model_selection/mod.rs | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index 2514134..e0b2939 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -49,7 +49,7 @@ pub fn make_blobs( /// Make a large circle containing a smaller circle in 2d. 
pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset { - if factor >= 1.0 || factor < 0.0 { + if !(0.0..1.0).contains(&factor) { panic!("'factor' has to be between 0 and 1."); } diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 796caed..022942c 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -328,8 +328,10 @@ impl> LogisticRegression { let df = |g: &mut M, w: &M| objective.df(g, w); - let mut ls: Backtracking = Default::default(); - ls.order = FunctionOrder::THIRD; + let ls: Backtracking = Backtracking { + order: FunctionOrder::THIRD, + ..Default::default() + }; let optimizer: LBFGS = Default::default(); optimizer.optimize(&f, &df, &x0, &ls) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index d4908f6..ddcd9d4 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -110,7 +110,7 @@ pub struct KFold { impl Default for KFold { fn default() -> KFold { KFold { - n_splits: 3 as usize, + n_splits: 3_usize, shuffle: true, } } From 18df9c758ced915d67134395b00b62daa5f0f596 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 10 Nov 2020 00:36:54 -0400 Subject: [PATCH 37/79] Fix clippy::map_entry --- src/lib.rs | 1 - src/svm/svc.rs | 11 ++++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4e87301..97c953e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -67,7 +67,6 @@ #![allow( clippy::needless_range_loop, clippy::ptr_arg, - clippy::map_entry, clippy::type_complexity, clippy::too_many_arguments, clippy::many_single_char_names diff --git a/src/svm/svc.rs b/src/svm/svc.rs index f2d518b..4fd70df 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -300,11 +300,12 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Cache<'a, T, M fn get(&mut self, i: &SupportVector, j: &SupportVector) -> T { let idx_i = i.index; let idx_j = j.index; - if !self.data.contains_key(&(idx_i, idx_j)) { - let v = 
self.kernel.apply(&i.x, &j.x); - self.data.insert((idx_i, idx_j), v); - } - *self.data.get(&(idx_i, idx_j)).unwrap() + #[allow(clippy::or_fun_call)] + let entry = self + .data + .entry((idx_i, idx_j)) + .or_insert(self.kernel.apply(&i.x, &j.x)); + *entry } fn insert(&mut self, key: (usize, usize), value: T) { From 126b306681382a42f647bc101ce9ef0ed00822e2 Mon Sep 17 00:00:00 2001 From: morenol Date: Tue, 10 Nov 2020 20:50:41 -0400 Subject: [PATCH 38/79] Update .circleci/config.yml Co-authored-by: VolodymyrOrlov --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 069c56d..17da167 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -40,4 +40,4 @@ jobs: command: rustup component add clippy - run: name: Run cargo clippy - command: cargo clippy -- -Drust-2018-idioms -Dwarnings + command: cargo clippy --all-features -- -Drust-2018-idioms -Dwarnings From 85d2ecd1c97c00d53d5e06b69f115d158ee4e40d Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 10 Nov 2020 21:10:21 -0400 Subject: [PATCH 39/79] Fix clippy errors after --all-features was enabled --- src/linalg/nalgebra_bindings.rs | 26 +++++++++++--------------- src/linalg/ndarray_bindings.rs | 32 ++++++++++++++------------------ src/model_selection/mod.rs | 8 ++++---- 3 files changed, 29 insertions(+), 37 deletions(-) diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index e0b885b..4c8120d 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -64,7 +64,7 @@ impl BaseVector for MatrixMN { } fn to_vec(&self) -> Vec { - self.row(0).iter().map(|v| *v).collect() + self.row(0).iter().copied().collect() } fn zeros(len: usize) -> Self { @@ -112,7 +112,7 @@ impl BaseVector for MatrixMN { let mut norm = T::zero(); for xi in self.iter() { - norm = norm + xi.abs().powf(p); + norm += xi.abs().powf(p); } norm.powf(T::one() / p) @@ -174,7 +174,7 @@ impl BaseVector for MatrixMN 
{ } fn unique(&self) -> Vec { - let mut result: Vec = self.iter().map(|v| *v).collect(); + let mut result: Vec = self.iter().copied().collect(); result.sort_by(|a, b| a.partial_cmp(b).unwrap()); result.dedup(); result @@ -199,7 +199,7 @@ impl Vec { - self.row(row).iter().map(|v| *v).collect() + self.row(row).iter().copied().collect() } fn get_row(&self, row: usize) -> Self::RowVector { @@ -207,22 +207,18 @@ impl) { - let mut r = 0; - for e in self.row(row).iter() { + for (r, e) in self.row(row).iter().enumerate() { result[r] = *e; - r += 1; } } fn get_col_as_vec(&self, col: usize) -> Vec { - self.column(col).iter().map(|v| *v).collect() + self.column(col).iter().copied().collect() } fn copy_col_as_vec(&self, col: usize, result: &mut Vec) { - let mut r = 0; - for e in self.column(col).iter() { - result[r] = *e; - r += 1; + for (c, e) in self.column(col).iter().enumerate() { + result[c] = *e; } } @@ -368,7 +364,7 @@ impl Vec { - let mut result: Vec = self.iter().map(|v| *v).collect(); + let mut result: Vec = self.iter().copied().collect(); result.sort_by(|a, b| a.partial_cmp(b).unwrap()); result.dedup(); result diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 00c9745..958123a 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -117,7 +117,7 @@ impl BaseVector for ArrayBase, Ix let mut norm = T::zero(); for xi in self.iter() { - norm = norm + xi.abs().powf(p); + norm += xi.abs().powf(p); } norm.powf(T::one() / p) @@ -125,19 +125,19 @@ impl BaseVector for ArrayBase, Ix } fn div_element_mut(&mut self, pos: usize, x: T) { - self[pos] = self[pos] / x; + self[pos] /= x; } fn mul_element_mut(&mut self, pos: usize, x: T) { - self[pos] = self[pos] * x; + self[pos] *= x; } fn add_element_mut(&mut self, pos: usize, x: T) { - self[pos] = self[pos] + x; + self[pos] += x; } fn sub_element_mut(&mut self, pos: usize, x: T) { - self[pos] = self[pos] - x; + self[pos] -= x; } fn approximate_eq(&self, other: &Self, error: 
T) -> bool { @@ -204,10 +204,8 @@ impl) { - let mut r = 0; - for e in self.row(row).iter() { + for (r, e) in self.row(row).iter().enumerate() { result[r] = *e; - r += 1; } } @@ -216,10 +214,8 @@ impl) { - let mut r = 0; - for e in self.column(col).iter() { - result[r] = *e; - r += 1; + for (c, e) in self.column(col).iter().enumerate() { + result[c] = *e; } } @@ -347,7 +343,7 @@ impl = DenseMatrix::rand(23, 100); let train_test_splits = k.split(&x); - assert_eq!(train_test_splits[0].1.len(), 12 as usize); - assert_eq!(train_test_splits[0].0.len(), 11 as usize); - assert_eq!(train_test_splits[1].0.len(), 12 as usize); - assert_eq!(train_test_splits[1].1.len(), 11 as usize); + assert_eq!(train_test_splits[0].1.len(), 12_usize); + assert_eq!(train_test_splits[0].0.len(), 11_usize); + assert_eq!(train_test_splits[1].0.len(), 12_usize); + assert_eq!(train_test_splits[1].1.len(), 11_usize); } #[test] From f46d3ba94c5a0bf3b938c8adf4923cf53bd80c62 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 10 Nov 2020 21:12:48 -0400 Subject: [PATCH 40/79] Address feedback --- src/lib.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 97c953e..49e106f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,10 @@ +#![allow( + clippy::needless_range_loop, + clippy::ptr_arg, + clippy::type_complexity, + clippy::too_many_arguments, + clippy::many_single_char_names +)] #![warn(missing_docs)] #![warn(missing_doc_code_examples)] @@ -64,13 +71,6 @@ //! let y_hat = knn.predict(&x).unwrap(); //! 
``` -#![allow( - clippy::needless_range_loop, - clippy::ptr_arg, - clippy::type_complexity, - clippy::too_many_arguments, - clippy::many_single_char_names -)] /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; /// Algorithms for clustering of unlabeled data From ca3a3a101c9209552db558cc20226b511d9636d6 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 11 Nov 2020 12:00:58 -0800 Subject: [PATCH 41/79] fix: ridge regression, post-review changes --- src/linalg/mod.rs | 25 +++++++++++++++---------- src/linalg/stats.rs | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index fe3e197..41ec415 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -168,16 +168,10 @@ pub trait BaseVector: Clone + Debug { /// Computes the arithmetic mean. fn mean(&self) -> T { - let n = self.len(); - let mut mean = T::zero(); - - for i in 0..n { - mean += self.get(i); - } - mean / T::from_usize(n).unwrap() + self.sum() / T::from_usize(self.len()).unwrap() } - /// Computes the standard deviation. - fn std(&self) -> T { + /// Computes variance. + fn var(&self) -> T { let n = self.len(); let mut mu = T::zero(); @@ -189,7 +183,11 @@ pub trait BaseVector: Clone + Debug { sum += xi * xi; } mu /= div; - (sum / div - mu * mu).sqrt() + sum / div - mu * mu + } + /// Computes the standard deviation. + fn std(&self) -> T { + self.var().sqrt() } } @@ -592,4 +590,11 @@ mod tests { assert!((m.std() - 0.81f64).abs() < 1e-2); } + + #[test] + fn var() { + let m = vec![1., 2., 3., 4.]; + + assert!((m.var() - 1.25f64).abs() < std::f64::EPSILON); + } } diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index ecb7ceb..fc339e0 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -35,8 +35,8 @@ pub trait MatrixStats: BaseMatrix { x } - /// Computes the standard deviation along the specified axis. 
- fn std(&self, axis: u8) -> Vec { + /// Computes variance along the specified axis. + fn var(&self, axis: u8) -> Vec { let (n, m) = match axis { 0 => { let (n, m) = self.shape(); @@ -61,7 +61,24 @@ pub trait MatrixStats: BaseMatrix { sum += a * a; } mu /= div; - x[i] = (sum / div - mu * mu).sqrt(); + x[i] = sum / div - mu * mu; + } + + x + } + + /// Computes the standard deviation along the specified axis. + fn std(&self, axis: u8) -> Vec { + + let mut x = self.var(axis); + + let n = match axis { + 0 => self.shape().1, + _ => self.shape().0, + }; + + for i in 0..n { + x[i] = x[i].sqrt(); } x @@ -122,6 +139,19 @@ mod tests { assert!(m.std(1).approximate_eq(&expected_1, 1e-2)); } + #[test] + fn var() { + let m = DenseMatrix::from_2d_array(&[ + &[1., 2., 3., 4.], + &[5., 6., 7., 8.] + ]); + let expected_0 = vec![4., 4., 4., 4.]; + let expected_1 = vec![1.25, 1.25]; + + assert!(m.var(0).approximate_eq(&expected_0, std::f64::EPSILON)); + assert!(m.var(1).approximate_eq(&expected_1, std::f64::EPSILON)); + } + #[test] fn scale() { let mut m = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); From 7a4fe114d8eb8d3d39e9985e5d56510e9bb4cad7 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 11 Nov 2020 12:01:57 -0800 Subject: [PATCH 42/79] fix: ridge regression, formatting --- src/linalg/stats.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index fc339e0..ac7a1bc 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -69,7 +69,6 @@ pub trait MatrixStats: BaseMatrix { /// Computes the standard deviation along the specified axis. fn std(&self, axis: u8) -> Vec { - let mut x = self.var(axis); let n = match axis { @@ -77,7 +76,7 @@ pub trait MatrixStats: BaseMatrix { _ => self.shape().0, }; - for i in 0..n { + for i in 0..n { x[i] = x[i].sqrt(); } @@ -141,16 +140,13 @@ mod tests { #[test] fn var() { - let m = DenseMatrix::from_2d_array(&[ - &[1., 2., 3., 4.], - &[5., 6., 7., 8.] 
- ]); + let m = DenseMatrix::from_2d_array(&[&[1., 2., 3., 4.], &[5., 6., 7., 8.]]); let expected_0 = vec![4., 4., 4., 4.]; let expected_1 = vec![1.25, 1.25]; assert!(m.var(0).approximate_eq(&expected_0, std::f64::EPSILON)); assert!(m.var(1).approximate_eq(&expected_1, std::f64::EPSILON)); - } + } #[test] fn scale() { From c42fccdc228a17cfff98d4dfd91be7f16a67ab39 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 11 Nov 2020 15:59:04 -0800 Subject: [PATCH 43/79] fix: ridge regression, code refactoring --- src/linear/linear_regression.rs | 4 +- src/linear/logistic_regression.rs | 75 ++++++++++++++++++++++--------- src/linear/ridge_regression.rs | 8 +++- 3 files changed, 63 insertions(+), 24 deletions(-) diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 61bb678..5de5007 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -154,8 +154,8 @@ impl> LinearRegression { } /// Get estimates regression coefficients - pub fn coefficients(&self) -> M { - self.coefficients.clone() + pub fn coefficients(&self) -> &M { + &self.coefficients } /// Get estimate of intercept diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index ec09184..116d700 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -68,7 +68,8 @@ use crate::optimization::FunctionOrder; /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] pub struct LogisticRegression> { - weights: M, + coefficients: M, + intercept: M, classes: Vec, num_attributes: usize, num_classes: usize, @@ -109,7 +110,7 @@ impl> PartialEq for LogisticRegression { } } - return self.weights == other.weights; + return self.coefficients == other.coefficients && self.intercept == other.intercept; } } } @@ -246,9 +247,11 @@ impl> LogisticRegression { }; let result = LogisticRegression::minimize(x0, objective); + let weights = result.x; Ok(LogisticRegression { - weights: result.x, + coefficients: 
weights.slice(0..1, 0..num_attributes), + intercept: weights.slice(0..1, num_attributes..num_attributes + 1), classes: classes, num_attributes: num_attributes, num_classes: k, @@ -268,7 +271,8 @@ impl> LogisticRegression { let weights = result.x.reshape(k, num_attributes + 1); Ok(LogisticRegression { - weights: weights, + coefficients: weights.slice(0..k, 0..num_attributes), + intercept: weights.slice(0..k, num_attributes..num_attributes + 1), classes: classes, num_attributes: num_attributes, num_classes: k, @@ -283,21 +287,26 @@ impl> LogisticRegression { let mut result = M::zeros(1, n); if self.num_classes == 2 { let (nrows, _) = x.shape(); - let x_and_bias = x.h_stack(&M::ones(nrows, 1)); - let y_hat: Vec = x_and_bias - .matmul(&self.weights.transpose()) - .get_col_as_vec(0); + let y_hat: Vec = x.matmul(&self.coefficients.transpose()).get_col_as_vec(0); + let intercept = self.intercept.get(0, 0); for i in 0..n { result.set( 0, i, - self.classes[if y_hat[i].sigmoid() > T::half() { 1 } else { 0 }], + self.classes[if (y_hat[i] + intercept).sigmoid() > T::half() { + 1 + } else { + 0 + }], ); } } else { - let (nrows, _) = x.shape(); - let x_and_bias = x.h_stack(&M::ones(nrows, 1)); - let y_hat = x_and_bias.matmul(&self.weights.transpose()); + let mut y_hat = x.matmul(&self.coefficients.transpose()); + for r in 0..n { + for c in 0..self.num_classes { + y_hat.set(r, c, y_hat.get(r, c) + self.intercept.get(c, 0)); + } + } let class_idxs = y_hat.argmax(); for i in 0..n { result.set(0, i, self.classes[class_idxs[i]]); @@ -307,17 +316,13 @@ impl> LogisticRegression { } /// Get estimates regression coefficients - pub fn coefficients(&self) -> M { - self.weights - .slice(0..self.num_classes, 0..self.num_attributes) + pub fn coefficients(&self) -> &M { + &self.coefficients } /// Get estimate of intercept - pub fn intercept(&self) -> M { - self.weights.slice( - 0..self.num_classes, - self.num_attributes..self.num_attributes + 1, - ) + pub fn intercept(&self) -> &M { + 
&self.intercept } fn minimize(x0: M, objective: impl ObjectiveFunction) -> OptimizerResult { @@ -336,7 +341,9 @@ impl> LogisticRegression { #[cfg(test)] mod tests { use super::*; + use crate::dataset::generator::make_blobs; use crate::linalg::naive::dense_matrix::*; + use crate::metrics::accuracy; #[test] fn multiclass_objective_f() { @@ -466,6 +473,34 @@ mod tests { ); } + #[test] + fn lr_fit_predict_multiclass() { + let blobs = make_blobs(15, 4, 3); + + let x = DenseMatrix::from_vec(15, 4, &blobs.data); + let y = blobs.target; + + let lr = LogisticRegression::fit(&x, &y).unwrap(); + + let y_hat = lr.predict(&x).unwrap(); + + assert!(accuracy(&y_hat, &y) > 0.9); + } + + #[test] + fn lr_fit_predict_binary() { + let blobs = make_blobs(20, 4, 2); + + let x = DenseMatrix::from_vec(20, 4, &blobs.data); + let y = blobs.target; + + let lr = LogisticRegression::fit(&x, &y).unwrap(); + + let y_hat = lr.predict(&x).unwrap(); + + assert!(accuracy(&y_hat, &y) > 0.9); + } + #[test] fn serde() { let x = DenseMatrix::from_2d_array(&[ diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 18df6cb..a718e2a 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -134,6 +134,10 @@ impl> RidgeRegression { ))); } + if y.len() != n { + return Err(Failed::fit(&format!("Number of rows in X should = len(y)"))); + } + let y_column = M::from_row_vector(y.clone()).transpose(); let (w, b) = if parameters.normalize { @@ -216,8 +220,8 @@ impl> RidgeRegression { } /// Get estimates regression coefficients - pub fn coefficients(&self) -> M { - self.coefficients.clone() + pub fn coefficients(&self) -> &M { + &self.coefficients } /// Get estimate of intercept From cc26555bfd4df2c9b96e194c0739af5e015d8458 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 11 Nov 2020 16:10:37 -0800 Subject: [PATCH 44/79] fix: fixes suggested by Clippy --- src/linear/logistic_regression.rs | 76 ++++++++++++++++--------------- 1 file changed, 39 
insertions(+), 37 deletions(-) diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 116d700..2df9b87 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -52,6 +52,7 @@ //! //! //! +use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; @@ -232,51 +233,53 @@ impl> LogisticRegression { yi[i] = classes.iter().position(|c| yc == *c).unwrap(); } - if k < 2 { - Err(Failed::fit(&format!( + match k.cmp(&2) { + Ordering::Less => Err(Failed::fit(&format!( "incorrect number of classes: {}. Should be >= 2.", k - ))) - } else if k == 2 { - let x0 = M::zeros(1, num_attributes + 1); + ))), + Ordering::Equal => { + let x0 = M::zeros(1, num_attributes + 1); - let objective = BinaryObjectiveFunction { - x: x, - y: yi, - phantom: PhantomData, - }; + let objective = BinaryObjectiveFunction { + x: x, + y: yi, + phantom: PhantomData, + }; - let result = LogisticRegression::minimize(x0, objective); - let weights = result.x; + let result = LogisticRegression::minimize(x0, objective); + let weights = result.x; - Ok(LogisticRegression { - coefficients: weights.slice(0..1, 0..num_attributes), - intercept: weights.slice(0..1, num_attributes..num_attributes + 1), - classes: classes, - num_attributes: num_attributes, - num_classes: k, - }) - } else { - let x0 = M::zeros(1, (num_attributes + 1) * k); + Ok(LogisticRegression { + coefficients: weights.slice(0..1, 0..num_attributes), + intercept: weights.slice(0..1, num_attributes..num_attributes + 1), + classes: classes, + num_attributes: num_attributes, + num_classes: k, + }) + } + Ordering::Greater => { + let x0 = M::zeros(1, (num_attributes + 1) * k); - let objective = MultiClassObjectiveFunction { - x: x, - y: yi, - k: k, - phantom: PhantomData, - }; + let objective = MultiClassObjectiveFunction { + x: x, + y: yi, + k: k, + phantom: PhantomData, + }; - let result = LogisticRegression::minimize(x0, objective); + let result = 
LogisticRegression::minimize(x0, objective); - let weights = result.x.reshape(k, num_attributes + 1); + let weights = result.x.reshape(k, num_attributes + 1); - Ok(LogisticRegression { - coefficients: weights.slice(0..k, 0..num_attributes), - intercept: weights.slice(0..k, num_attributes..num_attributes + 1), - classes: classes, - num_attributes: num_attributes, - num_classes: k, - }) + Ok(LogisticRegression { + coefficients: weights.slice(0..k, 0..num_attributes), + intercept: weights.slice(0..k, num_attributes..num_attributes + 1), + classes: classes, + num_attributes: num_attributes, + num_classes: k, + }) + } } } @@ -286,7 +289,6 @@ impl> LogisticRegression { let n = x.shape().0; let mut result = M::zeros(1, n); if self.num_classes == 2 { - let (nrows, _) = x.shape(); let y_hat: Vec = x.matmul(&self.coefficients.transpose()).get_col_as_vec(0); let intercept = self.intercept.get(0, 0); for i in 0..n { From f0371673a43642314ae55ebb0006b8dd2163625e Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 11 Nov 2020 17:23:49 -0800 Subject: [PATCH 45/79] fix: changes recommended by Clippy --- src/linear/logistic_regression.rs | 18 +++++++++--------- src/linear/ridge_regression.rs | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index addede7..4b52529 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -110,8 +110,8 @@ impl> PartialEq for LogisticRegression { return false; } } - - return self.coefficients == other.coefficients && self.intercept == other.intercept; + + self.coefficients == other.coefficients && self.intercept == other.intercept } } } @@ -242,7 +242,7 @@ impl> LogisticRegression { let x0 = M::zeros(1, num_attributes + 1); let objective = BinaryObjectiveFunction { - x: x, + x, y: yi, phantom: PhantomData, }; @@ -254,8 +254,8 @@ impl> LogisticRegression { Ok(LogisticRegression { coefficients: weights.slice(0..1, 
0..num_attributes), intercept: weights.slice(0..1, num_attributes..num_attributes + 1), - classes: classes, - num_attributes: num_attributes, + classes, + num_attributes, num_classes: k, }) } @@ -263,9 +263,9 @@ impl> LogisticRegression { let x0 = M::zeros(1, (num_attributes + 1) * k); let objective = MultiClassObjectiveFunction { - x: x, + x, y: yi, - k: k, + k, phantom: PhantomData, }; @@ -275,8 +275,8 @@ impl> LogisticRegression { Ok(LogisticRegression { coefficients: weights.slice(0..k, 0..num_attributes), intercept: weights.slice(0..k, num_attributes..num_attributes + 1), - classes: classes, - num_attributes: num_attributes, + classes, + num_attributes, num_classes: k, }) } diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index a718e2a..beac40b 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -129,13 +129,13 @@ impl> RidgeRegression { let (n, p) = x.shape(); if n <= p { - return Err(Failed::fit(&format!( + return Err(Failed::fit( "Number of rows in X should be >= number of columns in X" - ))); + )); } if y.len() != n { - return Err(Failed::fit(&format!("Number of rows in X should = len(y)"))); + return Err(Failed::fit("Number of rows in X should = len(y)")); } let y_column = M::from_row_vector(y.clone()).transpose(); From 830a0d919421a58ecb76a218df14d3f91bfa9b35 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 11 Nov 2020 17:26:49 -0800 Subject: [PATCH 46/79] fix: formatting --- src/linear/ridge_regression.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index beac40b..bb03c54 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -130,7 +130,7 @@ impl> RidgeRegression { if n <= p { return Err(Failed::fit( - "Number of rows in X should be >= number of columns in X" + "Number of rows in X should be >= number of columns in X", )); } From 900078cb04079da7802d9603215b214903743240 
Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 11 Nov 2020 20:53:50 -0400 Subject: [PATCH 47/79] Implement abstract method to convert a slice to a BaseVector, Implement RealNumberVector over BaseVector instead of over Vec --- src/linalg/mod.rs | 15 +++++++++++++++ src/math/vector.rs | 11 ++++++----- src/naive_bayes/mod.rs | 5 +---- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 46f09c9..4fb259f 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -83,6 +83,21 @@ pub trait BaseVector: Clone + Debug { self.len() == 0 } + /// Create a new vector from a &[T] + /// ``` + /// use smartcore::linalg::naive::dense_matrix::*; + /// let slice: &[f64] = &[0., 0.5, 2., 3., 4.]; + /// let a: Vec = BaseVector::from_slice(slice); + /// assert_eq!(a, vec![0., 0.5, 2., 3., 4.]); + /// ``` + fn from_slice(f: &[T]) -> Self { + let mut v = Self::zeros(f.len()); + for (i, elem) in f.iter().enumerate() { + v.set(i, *elem); + } + v + } + /// Return a vector with the elements of the one-dimensional array. 
fn to_vec(&self) -> Vec; diff --git a/src/math/vector.rs b/src/math/vector.rs index accfed6..14e1925 100644 --- a/src/math/vector.rs +++ b/src/math/vector.rs @@ -1,13 +1,14 @@ use crate::math::num::RealNumber; use std::collections::HashMap; +use crate::linalg::BaseVector; pub trait RealNumberVector { fn unique(&self) -> (Vec, Vec); } -impl RealNumberVector for Vec { +impl> RealNumberVector for V { fn unique(&self) -> (Vec, Vec) { - let mut unique = self.clone(); + let mut unique = self.to_vec(); unique.sort_by(|a, b| a.partial_cmp(b).unwrap()); unique.dedup(); @@ -17,8 +18,8 @@ impl RealNumberVector for Vec { } let mut unique_index = Vec::with_capacity(self.len()); - for e in self { - unique_index.push(index[&e.to_i64().unwrap()]); + for idx in 0..self.len() { + unique_index.push(index[&self.get(idx).to_i64().unwrap()]); } (unique, unique_index) @@ -27,7 +28,7 @@ impl RealNumberVector for Vec { #[cfg(test)] mod tests { - use super::*; + use super::RealNumberVector; #[test] fn unique() { diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index e9ab792..ffc3e2e 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -58,10 +58,7 @@ impl, D: NBDistribution> BaseNaiveBayes>(); - let mut y_hat = M::RowVector::zeros(rows); - for (i, prediction) in predictions.iter().enumerate().take(rows) { - y_hat.set(i, *prediction); - } + let y_hat = M::RowVector::from_slice(&predictions); Ok(y_hat) } } From 49487bccd3e6df126d156ab9a1587da8b4524983 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 11 Nov 2020 22:08:57 -0400 Subject: [PATCH 48/79] Rename trait function --- src/math/vector.rs | 10 +++++----- src/metrics/cluster_helpers.rs | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/math/vector.rs b/src/math/vector.rs index 14e1925..62cf63b 100644 --- a/src/math/vector.rs +++ b/src/math/vector.rs @@ -3,11 +3,11 @@ use std::collections::HashMap; use crate::linalg::BaseVector; pub trait RealNumberVector { - fn unique(&self) -> (Vec, 
Vec); + fn unique_with_indices(&self) -> (Vec, Vec); } impl> RealNumberVector for V { - fn unique(&self) -> (Vec, Vec) { + fn unique_with_indices(&self) -> (Vec, Vec) { let mut unique = self.to_vec(); unique.sort_by(|a, b| a.partial_cmp(b).unwrap()); unique.dedup(); @@ -28,14 +28,14 @@ impl> RealNumberVector for V { #[cfg(test)] mod tests { - use super::RealNumberVector; + use super::*; #[test] - fn unique() { + fn unique_with_indices() { let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0]; assert_eq!( (vec!(0.0, 1.0, 2.0, 4.0), vec!(0, 0, 1, 1, 2, 0, 3)), - v1.unique() + v1.unique_with_indices() ); } } diff --git a/src/metrics/cluster_helpers.rs b/src/metrics/cluster_helpers.rs index dd5bbb3..8d1e17e 100644 --- a/src/metrics/cluster_helpers.rs +++ b/src/metrics/cluster_helpers.rs @@ -7,8 +7,8 @@ pub fn contingency_matrix( labels_true: &Vec, labels_pred: &Vec, ) -> Vec> { - let (classes, class_idx) = labels_true.unique(); - let (clusters, cluster_idx) = labels_pred.unique(); + let (classes, class_idx) = labels_true.unique_with_indices(); + let (clusters, cluster_idx) = labels_pred.unique_with_indices(); let mut contingency_matrix = Vec::with_capacity(classes.len()); From 6587ac032b6eb732437594bf99d318fbe8241ccd Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 11 Nov 2020 22:23:56 -0400 Subject: [PATCH 49/79] Rename to from_array --- src/linalg/mod.rs | 8 ++++---- src/naive_bayes/mod.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 4fb259f..c560b78 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -86,11 +86,11 @@ pub trait BaseVector: Clone + Debug { /// Create a new vector from a &[T] /// ``` /// use smartcore::linalg::naive::dense_matrix::*; - /// let slice: &[f64] = &[0., 0.5, 2., 3., 4.]; - /// let a: Vec = BaseVector::from_slice(slice); - /// assert_eq!(a, vec![0., 0.5, 2., 3., 4.]); + /// let a: [f64; 5] = [0., 0.5, 2., 3., 4.]; + /// let v: Vec = BaseVector::from_array(&a); + /// 
assert_eq!(v, vec![0., 0.5, 2., 3., 4.]); /// ``` - fn from_slice(f: &[T]) -> Self { + fn from_array(f: &[T]) -> Self { let mut v = Self::zeros(f.len()); for (i, elem) in f.iter().enumerate() { v.set(i, *elem); diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index ffc3e2e..f93d3bf 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -58,7 +58,7 @@ impl, D: NBDistribution> BaseNaiveBayes>(); - let y_hat = M::RowVector::from_slice(&predictions); + let y_hat = M::RowVector::from_array(&predictions); Ok(y_hat) } } From 72e9f8293f8557db12f3a8f2450e5642d35fcffb Mon Sep 17 00:00:00 2001 From: morenol Date: Mon, 16 Nov 2020 23:56:50 -0400 Subject: [PATCH 50/79] Use log likelihood to make calculations more stable (#28) * Use log likelihood to make calculations more stable * Fix problem with class_count in categoricalnb * Use a similar approach to the one used in scikitlearn to define which are the possible categories of each feature. --- src/naive_bayes/categorical.rs | 134 +++++++++++++++++++++------------ src/naive_bayes/mod.rs | 10 ++- 2 files changed, 93 insertions(+), 51 deletions(-) diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index f948aeb..ae6eb0c 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -6,11 +6,11 @@ use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features +#[derive(Debug)] struct CategoricalNBDistribution { class_labels: Vec, - class_probabilities: Vec, - coef: Vec>>, - feature_categories: Vec>, + class_priors: Vec, + coefficients: Vec>>, } impl> NBDistribution for CategoricalNBDistribution { @@ -18,24 +18,22 @@ impl> NBDistribution for CategoricalNBDistribu if class_index >= self.class_labels.len() { T::zero() } else { - self.class_probabilities[class_index] + self.class_priors[class_index] } } - fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T 
{ + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { if class_index < self.class_labels.len() { - let mut prob = T::one(); + let mut likelihood = T::zero(); for feature in 0..j.len() { - let value = j.get(feature); - match self.feature_categories[feature] - .iter() - .position(|&t| t == value) - { - Some(_i) => prob *= self.coef[class_index][feature][_i], - None => return T::zero(), + let value = j.get(feature).floor().to_usize().unwrap(); + if self.coefficients[class_index][feature].len() > value { + likelihood += self.coefficients[class_index][feature][value]; + } else { + return T::zero(); } } - prob + likelihood } else { T::zero() } @@ -74,31 +72,45 @@ impl CategoricalNBDistribution { n_samples ))); } + let y: Vec = y + .to_vec() + .iter() + .map(|y_i| y_i.floor().to_usize().unwrap()) + .collect(); - let mut y_sorted = y.to_vec(); - y_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); - let mut class_labels = Vec::with_capacity(y.len()); - class_labels.push(y_sorted[0]); - let mut classes_count = Vec::with_capacity(y.len()); - let mut current_count = T::one(); - for idx in 1..y_samples { - if y_sorted[idx] == y_sorted[idx - 1] { - current_count += T::one(); - } else { - classes_count.push(current_count); - class_labels.push(y_sorted[idx]); - current_count = T::one() - } - classes_count.push(current_count); + let y_max = y + .iter() + .max() + .ok_or_else(|| Failed::fit(&"Failed to get the labels of y.".to_string()))?; + + let class_labels: Vec = (0..*y_max + 1) + .map(|label| T::from(label).unwrap()) + .collect(); + let mut classes_count: Vec = vec![T::zero(); class_labels.len()]; + for elem in y.iter() { + classes_count[*elem] += T::one(); } let mut feature_categories: Vec> = Vec::with_capacity(n_features); - for feature in 0..n_features { - let feature_types = x.get_col_as_vec(feature).unique(); + let feature_max = x + .get_col_as_vec(feature) + .iter() + .map(|f_i| f_i.floor().to_usize().unwrap()) + .max() + .ok_or_else(|| { + 
Failed::fit(&format!( + "Failed to get the categories for feature = {}", + feature + )) + })?; + let feature_types = (0..feature_max + 1) + .map(|feat| T::from(feat).unwrap()) + .collect(); feature_categories.push(feature_types); } - let mut coef: Vec>> = Vec::with_capacity(class_labels.len()); + + let mut coefficients: Vec>> = Vec::with_capacity(class_labels.len()); for (label, label_count) in class_labels.iter().zip(classes_count.iter()) { let mut coef_i: Vec> = Vec::with_capacity(n_features); for (feature_index, feature_options) in @@ -108,37 +120,36 @@ impl CategoricalNBDistribution { .get_col_as_vec(feature_index) .iter() .enumerate() - .filter(|(i, _j)| y.get(*i) == *label) + .filter(|(i, _j)| T::from(y[*i]).unwrap() == *label) .map(|(_, j)| *j) .collect::>(); - let mut feat_count: Vec = Vec::with_capacity(feature_options.len()); - for k in feature_options.iter() { - let feat_k_count = col.iter().filter(|&v| v == k).count(); - feat_count.push(feat_k_count); + let mut feat_count: Vec = vec![T::zero(); feature_options.len()]; + for row in col.iter() { + let index = row.floor().to_usize().unwrap(); + feat_count[index] += T::one(); } - let coef_i_j = feat_count .iter() .map(|c| { - (T::from(*c).unwrap() + alpha) - / (T::from(*label_count).unwrap() - + T::from(feature_options.len()).unwrap() * alpha) + ((*c + alpha) + / (*label_count + T::from(feature_options.len()).unwrap() * alpha)) + .ln() }) .collect::>(); coef_i.push(coef_i_j); } - coef.push(coef_i); + coefficients.push(coef_i); } - let class_probabilities = classes_count + + let class_priors = classes_count .into_iter() .map(|count| count / T::from(n_samples).unwrap()) .collect::>(); Ok(Self { class_labels, - class_probabilities, - coef, - feature_categories, + class_priors, + coefficients, }) } } @@ -170,6 +181,7 @@ impl Default for CategoricalNBParameters { } /// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data. 
+#[derive(Debug)] pub struct CategoricalNB> { inner: BaseNaiveBayes>, } @@ -205,7 +217,7 @@ mod tests { use crate::linalg::naive::dense_matrix::DenseMatrix; #[test] - fn run_base_naive_bayes() { + fn run_categorical_naive_bayes() { let x = DenseMatrix::from_2d_array(&[ &[0., 2., 1., 0.], &[0., 2., 1., 1.], @@ -229,4 +241,32 @@ mod tests { let y_hat = cnb.predict(&x_test).unwrap(); assert_eq!(y_hat, vec![0., 1.]); } + + #[test] + fn run_categorical_naive_bayes2() { + let x = DenseMatrix::from_2d_array(&[ + &[3., 4., 0., 1.], + &[3., 0., 0., 1.], + &[4., 4., 1., 2.], + &[4., 2., 4., 3.], + &[4., 2., 4., 2.], + &[4., 1., 1., 0.], + &[1., 1., 1., 1.], + &[0., 4., 1., 0.], + &[0., 3., 2., 1.], + &[0., 3., 1., 1.], + &[3., 4., 0., 1.], + &[3., 4., 2., 4.], + &[0., 3., 1., 2.], + &[0., 4., 1., 2.], + ]); + let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; + + let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); + let y_hat = cnb.predict(&x).unwrap(); + assert_eq!( + y_hat, + vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1.] + ); + } } diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index f93d3bf..8a9920e 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -2,6 +2,7 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; +use serde::{Deserialize, Serialize}; use std::marker::PhantomData; /// Distribution used in the Naive Bayes classifier. @@ -9,14 +10,15 @@ pub(crate) trait NBDistribution> { /// Prior of class at the given index. fn prior(&self, class_index: usize) -> T; - /// Conditional probability of sample j given class in the specified index. - fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T; + /// Logarithm of conditional probability of sample j given class in the specified index. + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T; /// Possible classes of the distribution. 
fn classes(&self) -> &Vec; } /// Base struct for the Naive Bayes classifier. +#[derive(Serialize, Deserialize, Debug, PartialEq)] pub(crate) struct BaseNaiveBayes, D: NBDistribution> { distribution: D, _phantom_t: PhantomData, @@ -49,8 +51,8 @@ impl, D: NBDistribution> BaseNaiveBayes Date: Thu, 19 Nov 2020 14:19:22 -0400 Subject: [PATCH 51/79] Implement GaussianNB (#27) * feat: Add GaussianNB --- src/naive_bayes/gaussian.rs | 257 ++++++++++++++++++++++++++++++++++++ src/naive_bayes/mod.rs | 2 + 2 files changed, 259 insertions(+) create mode 100644 src/naive_bayes/gaussian.rs diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs new file mode 100644 index 0000000..8e7e37c --- /dev/null +++ b/src/naive_bayes/gaussian.rs @@ -0,0 +1,257 @@ +use crate::error::Failed; +use crate::linalg::row_iter; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use crate::math::vector::RealNumberVector; +use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; +use serde::{Deserialize, Serialize}; + +/// Naive Bayes classifier for categorical features +#[derive(Serialize, Deserialize, Debug, PartialEq)] +struct GaussianNBDistribution { + /// class labels known to the classifier + class_labels: Vec, + /// probability of each class. 
+ class_priors: Vec, + /// variance of each feature per class + sigma: Vec>, + /// mean of each feature per class + theta: Vec>, +} + +impl> NBDistribution for GaussianNBDistribution { + fn prior(&self, class_index: usize) -> T { + if class_index >= self.class_labels.len() { + T::zero() + } else { + self.class_priors[class_index] + } + } + + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { + if class_index < self.class_labels.len() { + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + let mean = self.theta[class_index][feature]; + let variance = self.sigma[class_index][feature]; + likelihood += self.calculate_log_probability(value, mean, variance); + } + likelihood + } else { + T::zero() + } + } + + fn classes(&self) -> &Vec { + &self.class_labels + } +} + +/// `GaussianNB` parameters. Use `Default::default()` for default values. +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct GaussianNBParameters { + /// Prior probabilities of the classes. If specified, the priors are not adjusted according to the data. + pub priors: Option>, +} + +impl GaussianNBParameters { + /// Create GaussianNBParameters with specific parameters. + pub fn new(priors: Option>) -> Self { + Self { priors } + } +} + +impl GaussianNBDistribution { + /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data. + /// * `y` - vector with target values (classes) of length N. + /// * `priors` - Optional vector with prior probabilities of the classes. If not defined, + /// priors are adjusted according to the data. 
+ pub fn fit>( + x: &M, + y: &M::RowVector, + priors: Option>, + ) -> Result { + let (n_samples, n_features) = x.shape(); + let y_samples = y.len(); + if y_samples != n_samples { + return Err(Failed::fit(&format!( + "Size of x should equal size of y; |x|=[{}], |y|=[{}]", + n_samples, y_samples + ))); + } + + if n_samples == 0 { + return Err(Failed::fit(&format!( + "Size of x and y should greater than 0; |x|=[{}]", + n_samples + ))); + } + let y = y.to_vec(); + let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); + + let mut class_count = vec![T::zero(); class_labels.len()]; + + let mut subdataset: Vec>> = vec![vec![]; class_labels.len()]; + + for (row, class_index) in row_iter(x).zip(indices.iter()) { + class_count[*class_index] += T::one(); + subdataset[*class_index].push(row); + } + + let class_priors = if let Some(class_priors) = priors { + if class_priors.len() != class_labels.len() { + return Err(Failed::fit( + "Size of priors provided does not match the number of classes of the data.", + )); + } + class_priors + } else { + class_count + .into_iter() + .map(|c| c / T::from(n_samples).unwrap()) + .collect() + }; + + let subdataset: Vec = subdataset + .into_iter() + .map(|v| { + let mut m = M::zeros(v.len(), n_features); + for row in 0..v.len() { + for col in 0..n_features { + m.set(row, col, v[row][col]); + } + } + m + }) + .collect(); + + let (sigma, theta): (Vec>, Vec>) = subdataset + .iter() + .map(|data| (data.var(0), data.mean(0))) + .unzip(); + + Ok(Self { + class_labels, + class_priors, + sigma, + theta, + }) + } + + /// Calculate probability of x equals to a value of a Gaussian distribution given its mean and its + /// variance. 
+ fn calculate_log_probability(&self, value: T, mean: T, variance: T) -> T { + let pi = T::from(std::f64::consts::PI).unwrap(); + -((value - mean).powf(T::two()) / (T::two() * variance)) + - (T::two() * pi).ln() / T::two() + - (variance).ln() / T::two() + } +} + +/// GaussianNB implements the Gaussian naive Bayes algorithm, modelling each feature as normally distributed within each class. +#[derive(Serialize, Deserialize, Debug, PartialEq)] +pub struct GaussianNB> { + inner: BaseNaiveBayes>, +} + +impl> GaussianNB { + /// Fits GaussianNB with given data + /// * `x` - training data of size NxM where N is the number of samples and M is the number of + /// features. + /// * `y` - vector with target values (classes) of length N. + /// * `parameters` - additional parameters like class priors. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: GaussianNBParameters, + ) -> Result { + let distribution = GaussianNBDistribution::fit(x, y, parameters.priors)?; + let inner = BaseNaiveBayes::fit(distribution)?; + Ok(Self { inner }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. 
+ pub fn predict(&self, x: &M) -> Result { + self.inner.predict(x) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn run_gaussian_naive_bayes() { + let x = DenseMatrix::from_2d_array(&[ + &[-1., -1.], + &[-2., -1.], + &[-3., -2.], + &[1., 1.], + &[2., 1.], + &[3., 2.], + ]); + let y = vec![1., 1., 1., 2., 2., 2.]; + + let gnb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); + let y_hat = gnb.predict(&x).unwrap(); + assert_eq!(y_hat, y); + assert_eq!( + gnb.inner.distribution.sigma, + &[ + &[0.666666666666667, 0.22222222222222232], + &[0.666666666666667, 0.22222222222222232] + ] + ); + + assert_eq!(gnb.inner.distribution.class_priors, &[0.5, 0.5]); + + assert_eq!( + gnb.inner.distribution.theta, + &[&[-2., -1.3333333333333333], &[2., 1.3333333333333333]] + ); + } + + #[test] + fn run_gaussian_naive_bayes_with_priors() { + let x = DenseMatrix::from_2d_array(&[ + &[-1., -1.], + &[-2., -1.], + &[-3., -2.], + &[1., 1.], + &[2., 1.], + &[3., 2.], + ]); + let y = vec![1., 1., 1., 2., 2., 2.]; + + let priors = vec![0.3, 0.7]; + let parameters = GaussianNBParameters::new(Some(priors.clone())); + let gnb = GaussianNB::fit(&x, &y, parameters).unwrap(); + + assert_eq!(gnb.inner.distribution.class_priors, priors); + } + + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[-1., -1.], + &[-2., -1.], + &[-3., -2.], + &[1., 1.], + &[2., 1.], + &[3., 2.], + ]); + let y = vec![1., 1., 1., 2., 2., 2.]; + + let gnb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); + let deserialized_gnb: GaussianNB> = + serde_json::from_str(&serde_json::to_string(&gnb).unwrap()).unwrap(); + + assert_eq!(gnb, deserialized_gnb); + } +} diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 8a9920e..0268da6 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -65,4 +65,6 @@ impl, D: NBDistribution> BaseNaiveBayes Date: Thu, 19 Nov 2020 16:07:10 -0400 Subject: [PATCH 52/79] 
Add serde to CategoricalNB (#30) * Add serde to CategoricalNB * Implement PartialEq for CategoricalNBDistribution --- src/naive_bayes/categorical.rs | 60 ++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index ae6eb0c..d32c34d 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -6,13 +6,41 @@ use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug)] struct CategoricalNBDistribution { class_labels: Vec, class_priors: Vec, coefficients: Vec>>, } +impl PartialEq for CategoricalNBDistribution { + fn eq(&self, other: &Self) -> bool { + if self.class_labels == other.class_labels && self.class_priors == other.class_priors { + if self.coefficients.len() != other.coefficients.len() { + return false; + } + for (a, b) in self.coefficients.iter().zip(other.coefficients.iter()) { + if a.len() != b.len() { + return false; + } + for (a_i, b_i) in a.iter().zip(b.iter()) { + if a_i.len() != b_i.len() { + return false; + } + for (a_i_j, b_i_j) in a_i.iter().zip(b_i.iter()) { + if (*a_i_j - *b_i_j).abs() > T::epsilon() { + return false; + } + } + } + } + true + } else { + false + } + } +} + impl> NBDistribution for CategoricalNBDistribution { fn prior(&self, class_index: usize) -> T { if class_index >= self.class_labels.len() { @@ -181,7 +209,7 @@ impl Default for CategoricalNBParameters { } /// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data. -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug, PartialEq)] pub struct CategoricalNB> { inner: BaseNaiveBayes>, } @@ -269,4 +297,32 @@ mod tests { vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1.] 
); } + + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[3., 4., 0., 1.], + &[3., 0., 0., 1.], + &[4., 4., 1., 2.], + &[4., 2., 4., 3.], + &[4., 2., 4., 2.], + &[4., 1., 1., 0.], + &[1., 1., 1., 1.], + &[0., 4., 1., 0.], + &[0., 3., 2., 1.], + &[0., 3., 1., 1.], + &[3., 4., 0., 1.], + &[3., 4., 2., 4.], + &[0., 3., 1., 2.], + &[0., 4., 1., 2.], + ]); + + let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; + let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); + + let deserialized_cnb: CategoricalNB> = + serde_json::from_str(&serde_json::to_string(&cnb).unwrap()).unwrap(); + + assert_eq!(cnb, deserialized_cnb); + } } From 583284e66f3e27c03a393f630bee0b677f05b706 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 24 Nov 2020 19:12:53 -0800 Subject: [PATCH 53/79] feat: adds LASSO --- src/linalg/high_order.rs | 28 ++ src/linalg/mod.rs | 63 ++++ src/linalg/naive/dense_matrix.rs | 60 +++- src/linalg/nalgebra_bindings.rs | 6 + src/linalg/ndarray_bindings.rs | 6 + src/linear/bg_solver.rs | 146 +++++++++ src/linear/lasso.rs | 509 ++++++++++++++++++++++++++++++ src/linear/logistic_regression.rs | 2 +- src/linear/mod.rs | 2 + 9 files changed, 819 insertions(+), 3 deletions(-) create mode 100644 src/linalg/high_order.rs create mode 100644 src/linear/bg_solver.rs create mode 100644 src/linear/lasso.rs diff --git a/src/linalg/high_order.rs b/src/linalg/high_order.rs new file mode 100644 index 0000000..359c4a1 --- /dev/null +++ b/src/linalg/high_order.rs @@ -0,0 +1,28 @@ +//! In this module you will find composite of matrix operations that are used elsewhere +//! for improved efficiency. + +use crate::linalg::BaseMatrix; +use crate::math::num::RealNumber; + +/// High order matrix operations. 
+pub trait HighOrderOperations: BaseMatrix { + /// Y = AB + /// ``` + /// use smartcore::linalg::naive::dense_matrix::*; + /// use smartcore::linalg::high_order::HighOrderOperations; + /// + /// let a = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.]]); + /// let b = DenseMatrix::from_2d_array(&[&[5., 6.], &[7., 8.], &[9., 10.]]); + /// let expected = DenseMatrix::from_2d_array(&[&[71., 80.], &[92., 104.]]); + /// + /// assert_eq!(a.ab(true, &b, false), expected); + /// ``` + fn ab(&self, a_transpose: bool, b: &Self, b_transpose: bool) -> Self { + match (a_transpose, b_transpose) { + (true, true) => self.transpose().matmul(&b.transpose()), + (false, true) => self.matmul(&b.transpose()), + (true, false) => self.transpose().matmul(b), + (false, false) => self.matmul(b), + } + } +} diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index c560b78..1be2e75 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -36,6 +36,7 @@ pub mod cholesky; /// The matrix is represented in terms of its eigenvalues and eigenvectors. pub mod evd; +pub mod high_order; /// Factors a matrix as the product of a lower triangular matrix and an upper triangular matrix. pub mod lu; /// Dense matrix with column-major order that wraps [Vec](https://doc.rust-lang.org/std/vec/struct.Vec.html). @@ -59,6 +60,7 @@ use std::ops::Range; use crate::math::num::RealNumber; use cholesky::CholeskyDecomposableMatrix; use evd::EVDDecomposableMatrix; +use high_order::HighOrderOperations; use lu::LUDecomposableMatrix; use qr::QRDecomposableMatrix; use stats::MatrixStats; @@ -134,6 +136,66 @@ pub trait BaseVector: Clone + Debug { /// Subtract `x` from single element of the vector, write result to original vector. 
fn sub_element_mut(&mut self, pos: usize, x: T); + /// Subtract scalar `x` from every element, in place + fn sub_scalar_mut(&mut self, x: T) -> &Self { + for i in 0..self.len() { + self.set(i, self.get(i) - x); + } + self + } + + /// Add scalar `x` to every element, in place + fn add_scalar_mut(&mut self, x: T) -> &Self { + for i in 0..self.len() { + self.set(i, self.get(i) + x); + } + self + } + + /// Multiply every element by scalar `x`, in place + fn mul_scalar_mut(&mut self, x: T) -> &Self { + for i in 0..self.len() { + self.set(i, self.get(i) * x); + } + self + } + + /// Divide every element by scalar `x`, in place + fn div_scalar_mut(&mut self, x: T) -> &Self { + for i in 0..self.len() { + self.set(i, self.get(i) / x); + } + self + } + + /// Add scalar `x` to every element, returning a new vector + fn add_scalar(&self, x: T) -> Self { + let mut r = self.clone(); + r.add_scalar_mut(x); + r + } + + /// Subtract scalar `x` from every element, returning a new vector + fn sub_scalar(&self, x: T) -> Self { + let mut r = self.clone(); + r.sub_scalar_mut(x); + r + } + + /// Multiply every element by scalar `x`, returning a new vector + fn mul_scalar(&self, x: T) -> Self { + let mut r = self.clone(); + r.mul_scalar_mut(x); + r + } + + /// Divide every element by scalar `x`, returning a new vector + fn div_scalar(&self, x: T) -> Self { + let mut r = self.clone(); + r.div_scalar_mut(x); + r + } + /// Add vectors, element-wise, overriding original vector with result. 
fn add_mut(&mut self, other: &Self) -> &Self; @@ -557,6 +619,7 @@ pub trait Matrix: + LUDecomposableMatrix + CholeskyDecomposableMatrix + MatrixStats + + HighOrderOperations + PartialEq + Display { diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 7486329..f4c8a97 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize}; use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; +use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::stats::MatrixStats; @@ -444,6 +445,38 @@ impl LUDecomposableMatrix for DenseMatrix {} impl CholeskyDecomposableMatrix for DenseMatrix {} +impl HighOrderOperations for DenseMatrix { + fn ab(&self, a_transpose: bool, b: &Self, b_transpose: bool) -> Self { + if !a_transpose && !b_transpose { + self.matmul(b) + } else { + let (d1, d2, d3, d4) = match (a_transpose, b_transpose) { + (true, false) => (self.nrows, self.ncols, b.ncols, b.nrows), + (false, true) => (self.ncols, self.nrows, b.nrows, b.ncols), + _ => (self.nrows, self.ncols, b.nrows, b.ncols), + }; + if d1 != d4 { + panic!("Can not multiply {}x{} by {}x{} matrices", d2, d1, d4, d3); + } + let mut result = Self::zeros(d2, d3); + for r in 0..d2 { + for c in 0..d3 { + let mut s = T::zero(); + for i in 0..d1 { + match (a_transpose, b_transpose) { + (true, false) => s += self.get(i, r) * b.get(i, c), + (false, true) => s += self.get(r, i) * b.get(c, i), + _ => s += self.get(i, r) * b.get(c, i), + } + } + result.set(r, c, s); + } + } + result + } + } +} + impl MatrixStats for DenseMatrix {} impl Matrix for DenseMatrix {} @@ -625,8 +658,8 @@ impl BaseMatrix for DenseMatrix { } fn dot(&self, other: &Self) -> T { - if self.nrows != 1 && other.nrows != 1 { - panic!("A and B should both be 1-dimentional vectors."); + if 
(self.nrows != 1 && other.nrows != 1) && (self.ncols != 1 && other.ncols != 1) { + panic!("A and B should both be either a row or a column vector."); } if self.nrows * self.ncols != other.nrows * other.ncols { panic!("A and B should have the same size"); @@ -1114,6 +1147,29 @@ mod tests { assert_eq!(result, expected); } + #[test] + fn ab() { + let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); + let b = DenseMatrix::from_2d_array(&[&[5., 6.], &[7., 8.], &[9., 10.]]); + let c = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.]]); + assert_eq!( + a.ab(false, &b, false), + DenseMatrix::from_2d_array(&[&[46., 52.], &[109., 124.]]) + ); + assert_eq!( + c.ab(true, &b, false), + DenseMatrix::from_2d_array(&[&[71., 80.], &[92., 104.]]) + ); + assert_eq!( + b.ab(false, &c, true), + DenseMatrix::from_2d_array(&[&[17., 39., 61.], &[23., 53., 83.,], &[29., 67., 105.]]) + ); + assert_eq!( + a.ab(true, &b, true), + DenseMatrix::from_2d_array(&[&[29., 39., 49.], &[40., 54., 68.,], &[51., 69., 87.]]) + ); + } + #[test] fn dot() { let a = DenseMatrix::from_array(1, 3, &[1., 2., 3.]); diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index 8ddfdb6..8f504c6 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -44,6 +44,7 @@ use nalgebra::{DMatrix, Dynamic, Matrix, MatrixMN, RowDVector, Scalar, VecStorag use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; +use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::stats::MatrixStats; @@ -552,6 +553,11 @@ impl + HighOrderOperations for Matrix> +{ +} + impl SmartCoreMatrix for Matrix> { diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index b5058ab..9945c5f 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -51,6 +51,7 @@ use 
ndarray::{s, stack, Array, ArrayBase, Axis, Ix1, Ix2, OwnedRepr}; use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; +use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; use crate::linalg::stats::MatrixStats; @@ -502,6 +503,11 @@ impl + HighOrderOperations for ArrayBase, Ix2> +{ +} + impl Matrix for ArrayBase, Ix2> { diff --git a/src/linear/bg_solver.rs b/src/linear/bg_solver.rs new file mode 100644 index 0000000..b299623 --- /dev/null +++ b/src/linear/bg_solver.rs @@ -0,0 +1,146 @@ +//! This is a generic solver for Ax = b type of equation +//! +//! for more information take a look at [this Wikipedia article](https://en.wikipedia.org/wiki/Biconjugate_gradient_method) +//! and [this paper](https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf) +use crate::error::Failed; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +pub trait BiconjugateGradientSolver> { + fn solve_mut(&self, a: &M, b: &M, x: &mut M, tol: T, max_iter: usize) -> Result { + if tol <= T::zero() { + return Err(Failed::fit("tolerance shoud be > 0")); + } + + if max_iter == 0 { + return Err(Failed::fit("maximum number of iterations should be > 0")); + } + + let (n, _) = b.shape(); + + let mut r = M::zeros(n, 1); + let mut rr = M::zeros(n, 1); + let mut z = M::zeros(n, 1); + let mut zz = M::zeros(n, 1); + + self.mat_vec_mul(a, x, &mut r); + + for j in 0..n { + r.set(j, 0, b.get(j, 0) - r.get(j, 0)); + rr.set(j, 0, r.get(j, 0)); + } + + let bnrm = b.norm(T::two()); + self.solve_preconditioner(a, &r, &mut z); + + let mut p = M::zeros(n, 1); + let mut pp = M::zeros(n, 1); + let mut bkden = T::zero(); + let mut err = T::zero(); + + for iter in 1..max_iter { + let mut bknum = T::zero(); + + self.solve_preconditioner(a, &rr, &mut zz); + for j in 0..n { + bknum += z.get(j, 0) * rr.get(j, 0); + } + if iter == 1 { + for j in 0..n { + 
p.set(j, 0, z.get(j, 0)); + pp.set(j, 0, zz.get(j, 0)); + } + } else { + let bk = bknum / bkden; + for j in 0..n { + p.set(j, 0, bk * p.get(j, 0) + z.get(j, 0)); + pp.set(j, 0, bk * pp.get(j, 0) + zz.get(j, 0)); + } + } + bkden = bknum; + self.mat_vec_mul(a, &p, &mut z); + let mut akden = T::zero(); + for j in 0..n { + akden += z.get(j, 0) * pp.get(j, 0); + } + let ak = bknum / akden; + self.mat_t_vec_mul(a, &pp, &mut zz); + for j in 0..n { + x.set(j, 0, x.get(j, 0) + ak * p.get(j, 0)); + r.set(j, 0, r.get(j, 0) - ak * z.get(j, 0)); + rr.set(j, 0, rr.get(j, 0) - ak * zz.get(j, 0)); + } + self.solve_preconditioner(a, &r, &mut z); + err = r.norm(T::two()) / bnrm; + + if err <= tol { + break; + } + } + + Ok(err) + } + + fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { + let diag = Self::diag(a); + let n = diag.len(); + + for i in 0..n { + if diag[i] != T::zero() { + x.set(i, 0, b.get(i, 0) / diag[i]); + } else { + x.set(i, 0, b.get(i, 0)); + } + } + } + + // y = Ax + fn mat_vec_mul(&self, a: &M, x: &M, y: &mut M) { + y.copy_from(&a.matmul(x)); + } + + // y = Atx + fn mat_t_vec_mul(&self, a: &M, x: &M, y: &mut M) { + y.copy_from(&a.ab(true, x, false)); + } + + fn diag(a: &M) -> Vec { + let (nrows, ncols) = a.shape(); + let n = nrows.min(ncols); + + let mut d = Vec::with_capacity(n); + for i in 0..n { + d.push(a.get(i, i)); + } + + d + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + + pub struct BGSolver {} + + impl> BiconjugateGradientSolver for BGSolver {} + + #[test] + fn bg_solver() { + let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); + let b = DenseMatrix::from_2d_array(&[&[40., 51., 28.]]); + let expected = DenseMatrix::from_2d_array(&[&[1.0, 2.0, 3.0]]); + + let mut x = DenseMatrix::zeros(3, 1); + + let solver = BGSolver {}; + + let err: f64 = solver + .solve_mut(&a, &b.transpose(), &mut x, 1e-6, 6) + .unwrap(); + + assert!(x.transpose().approximate_eq(&expected, 
1e-4)); + assert!((err - 0.0).abs() < 1e-4); + } +} diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs new file mode 100644 index 0000000..306b1aa --- /dev/null +++ b/src/linear/lasso.rs @@ -0,0 +1,509 @@ +//! # Lasso +//! +//! [Linear regression](../linear_regression/index.html) is the standard algorithm for predicting a quantitative response \\(y\\) on the basis of a linear combination of explanatory variables \\(X\\) +//! that assumes that there is approximately a linear relationship between \\(X\\) and \\(y\\). +//! Lasso is an extension to linear regression that adds an L1 regularization term to the loss function during training. +//! +//! Similar to [ridge regression](../ridge_regression/index.html), the lasso shrinks the coefficient estimates towards zero. However, in the case of the lasso, the l1 penalty has the effect of +//! forcing some of the coefficient estimates to be exactly equal to zero when the tuning parameter \\(\alpha\\) is sufficiently large. +//! +//! Lasso coefficient estimates solve the problem: +//! +//! \\[\underset{\beta}{minimize} \space \space \sum_{i=1}^n \left( y_i - \beta_0 - \sum_{j=1}^p \beta_jx_{ij} \right)^2 + \alpha \sum_{j=1}^p \lVert \beta_j \rVert_1\\] +//! +//! This problem is solved with an interior-point method that is comparable to coordinate descent in solving large problems with modest accuracy, +//! but is able to solve them with high accuracy with relatively small additional computational cost. +//! +//! ## References: +//! +//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 6.2. Shrinkage Methods](http://faculty.marshall.usc.edu/gareth-james/ISL/) +//! * ["An Interior-Point Method for Large-Scale l1-Regularized Least Squares", K. Koh, M. Lustig, S. Boyd, D. Gorinevsky](https://web.stanford.edu/~boyd/papers/pdf/l1_ls.pdf) +//! * [Simple Matlab Solver for l1-regularized Least Squares Problems](https://web.stanford.edu/~boyd/l1_ls/) +//! +//! 
+use std::fmt::Debug; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::linear::bg_solver::BiconjugateGradientSolver; +use crate::math::num::RealNumber; + +/// Lasso regression parameters +#[derive(Serialize, Deserialize, Debug)] +pub struct LassoParameters { + /// Controls the strength of the penalty to the loss function. + pub alpha: T, + /// If true the regressors X will be normalized before regression + /// by subtracting the mean and dividing by the standard deviation. + pub normalize: bool, + /// The tolerance for the optimization + pub tol: T, + /// The maximum number of iterations + pub max_iter: usize, +} + +#[derive(Serialize, Deserialize, Debug)] +/// Lasso regressor +pub struct Lasso> { + coefficients: M, + intercept: T, +} + +struct InteriorPointOptimizer> { + ata: M, + d1: Vec, + d2: Vec, + prb: Vec, + prs: Vec, +} + +impl Default for LassoParameters { + fn default() -> Self { + LassoParameters { + alpha: T::one(), + normalize: true, + tol: T::from_f64(1e-4).unwrap(), + max_iter: 1000, + } + } +} + +impl> PartialEq for Lasso { + fn eq(&self, other: &Self) -> bool { + self.coefficients == other.coefficients + && (self.intercept - other.intercept).abs() <= T::epsilon() + } +} + +impl> Lasso { + /// Fits Lasso regression to your data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target values + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit( + x: &M, + y: &M::RowVector, + parameters: LassoParameters, + ) -> Result, Failed> { + let (n, p) = x.shape(); + + if n <= p { + return Err(Failed::fit( + "Number of rows in X should be >= number of columns in X", + )); + } + + if parameters.alpha < T::zero() { + return Err(Failed::fit("alpha should be >= 0")); + } + + if parameters.tol <= T::zero() { + return Err(Failed::fit("tol should be > 0")); + } + + if parameters.max_iter == 0 { + return Err(Failed::fit("max_iter should be > 0")); + } + + if y.len() != n { + return Err(Failed::fit("Number of rows in X should = len(y)")); + } + + let (w, b) = if parameters.normalize { + let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; + + let mut optimizer = InteriorPointOptimizer::new(&scaled_x, p); + + let mut w = optimizer.optimize(&scaled_x, y, &parameters)?; + + for j in 0..p { + w.set(j, 0, w.get(j, 0) / col_std[j]); + } + + let mut b = T::zero(); + + for i in 0..p { + b += w.get(i, 0) * col_mean[i]; + } + + b = y.mean() - b; + (w, b) + } else { + let mut optimizer = InteriorPointOptimizer::new(x, p); + + let w = optimizer.optimize(x, y, &parameters)?; + + (w, y.mean()) + }; + + Ok(Lasso { + intercept: b, + coefficients: w, + }) + } + + /// Predict target values from `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. 
+ pub fn predict(&self, x: &M) -> Result { + let (nrows, _) = x.shape(); + let mut y_hat = x.matmul(&self.coefficients); + y_hat.add_mut(&M::fill(nrows, 1, self.intercept)); + Ok(y_hat.transpose().to_row_vector()) + } + + /// Get estimates regression coefficients + pub fn coefficients(&self) -> &M { + &self.coefficients + } + + /// Get estimate of intercept + pub fn intercept(&self) -> T { + self.intercept + } + + fn rescale_x(x: &M) -> Result<(M, Vec, Vec), Failed> { + let col_mean = x.mean(0); + let col_std = x.std(0); + + for i in 0..col_std.len() { + if (col_std[i] - T::zero()).abs() < T::epsilon() { + return Err(Failed::fit(&format!( + "Cannot rescale constant column {}", + i + ))); + } + } + + let mut scaled_x = x.clone(); + scaled_x.scale_mut(&col_mean, &col_std, 0); + Ok((scaled_x, col_mean, col_std)) + } +} + +impl> InteriorPointOptimizer { + fn new(a: &M, n: usize) -> InteriorPointOptimizer { + InteriorPointOptimizer { + ata: a.ab(true, a, false), + d1: vec![T::zero(); n], + d2: vec![T::zero(); n], + prb: vec![T::zero(); n], + prs: vec![T::zero(); n], + } + } + + fn optimize( + &mut self, + x: &M, + y: &M::RowVector, + parameters: &LassoParameters, + ) -> Result { + let (n, p) = x.shape(); + let p_f64 = T::from_usize(p).unwrap(); + + //parameters + let pcgmaxi = 5000; + let min_pcgtol = T::from_f64(0.1).unwrap(); + let eta = T::from_f64(1E-3).unwrap(); + let alpha = T::from_f64(0.01).unwrap(); + let beta = T::from_f64(0.5).unwrap(); + let gamma = T::from_f64(-0.25).unwrap(); + let mu = T::two(); + + let y = M::from_row_vector(y.sub_scalar(y.mean())).transpose(); + + let mut max_ls_iter = 100; + let mut pitr = 0; + let mut w = M::zeros(p, 1); + let mut neww = w.clone(); + let mut u = M::ones(p, 1); + let mut newu = u.clone(); + + let mut f = M::fill(p, 2, -T::one()); + let mut newf = f.clone(); + + let mut q1 = vec![T::zero(); p]; + let mut q2 = vec![T::zero(); p]; + + let mut dx = M::zeros(p, 1); + let mut du = M::zeros(p, 1); + let mut dxu = M::zeros(2 * 
p, 1); + let mut grad = M::zeros(2 * p, 1); + + let mut nu = M::zeros(n, 1); + let mut dobj = T::zero(); + let mut s = T::infinity(); + let mut t = T::one() + .max(T::one() / parameters.alpha) + .min(T::two() * p_f64 / T::from(1e-3).unwrap()); + + for ntiter in 0..parameters.max_iter { + let mut z = x.matmul(&w); + + for i in 0..n { + z.set(i, 0, z.get(i, 0) - y.get(i, 0)); + nu.set(i, 0, T::two() * z.get(i, 0)); + } + + // CALCULATE DUALITY GAP + let xnu = x.ab(true, &nu, false); + let max_xnu = xnu.norm(T::infinity()); + if max_xnu > parameters.alpha { + let lnu = parameters.alpha / max_xnu; + nu.mul_scalar_mut(lnu); + } + + let pobj = z.dot(&z) + parameters.alpha * w.norm(T::one()); + dobj = dobj.max(gamma * nu.dot(&nu) - nu.dot(&y)); + + let gap = pobj - dobj; + + // STOPPING CRITERION + if gap / dobj < parameters.tol { + break; + } + + // UPDATE t + if s >= T::half() { + t = t.max((T::two() * p_f64 * mu / gap).min(mu * t)); + } + + // CALCULATE NEWTON STEP + for i in 0..p { + let q1i = T::one() / (u.get(i, 0) + w.get(i, 0)); + let q2i = T::one() / (u.get(i, 0) - w.get(i, 0)); + q1[i] = q1i; + q2[i] = q2i; + self.d1[i] = (q1i * q1i + q2i * q2i) / t; + self.d2[i] = (q1i * q1i - q2i * q2i) / t; + } + + let mut gradphi = x.ab(true, &z, false); + + for i in 0..p { + let g1 = T::two() * gradphi.get(i, 0) - (q1[i] - q2[i]) / t; + let g2 = parameters.alpha - (q1[i] + q2[i]) / t; + gradphi.set(i, 0, g1); + grad.set(i, 0, -g1); + grad.set(i + p, 0, -g2); + } + + for i in 0..p { + self.prb[i] = T::two() + self.d1[i]; + self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i] * self.d2[i]; + } + + let normg = grad.norm2(); + let mut pcgtol = min_pcgtol.min(eta * gap / T::one().min(normg)); + if ntiter != 0 && pitr == 0 { + pcgtol *= min_pcgtol; + } + + let error = self.solve_mut(x, &grad, &mut dxu, pcgtol, pcgmaxi)?; + if error > pcgtol { + pitr = pcgmaxi; + } + + for i in 0..p { + dx.set(i, 0, dxu.get(i, 0)); + du.set(i, 0, dxu.get(i + p, 0)); + } + + // BACKTRACKING LINE 
SEARCH + let phi = z.dot(&z) + parameters.alpha * u.sum() - Self::sumlogneg(&f) / t; + s = T::one(); + let gdx = grad.dot(&dxu); + + let lsiter = 0; + while lsiter < max_ls_iter { + for i in 0..p { + neww.set(i, 0, w.get(i, 0) + s * dx.get(i, 0)); + newu.set(i, 0, u.get(i, 0) + s * du.get(i, 0)); + newf.set(i, 0, neww.get(i, 0) - newu.get(i, 0)); + newf.set(i, 1, -neww.get(i, 0) - newu.get(i, 0)); + } + + if newf.max() < T::zero() { + let mut newz = x.matmul(&neww); + for i in 0..n { + newz.set(i, 0, newz.get(i, 0) - y.get(i, 0)); + } + + let newphi = newz.dot(&newz) + parameters.alpha * newu.sum() + - Self::sumlogneg(&newf) / t; + if newphi - phi <= alpha * s * gdx { + break; + } + } + s = beta * s; + max_ls_iter += 1; + } + + if lsiter == max_ls_iter { + return Err(Failed::fit( + "Exceeded maximum number of iteration for interior point optimizer", + )); + } + + w.copy_from(&neww); + u.copy_from(&newu); + f.copy_from(&newf); + } + + Ok(w) + } + + fn sumlogneg(f: &M) -> T { + let (n, _) = f.shape(); + let mut sum = T::zero(); + for i in 0..n { + sum += (-f.get(i, 0)).ln(); + sum += (-f.get(i, 1)).ln(); + } + sum + } +} + +impl<'a, T: RealNumber, M: Matrix> BiconjugateGradientSolver + for InteriorPointOptimizer +{ + fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { + let (_, p) = a.shape(); + + for i in 0..p { + x.set( + i, + 0, + (self.d1[i] * b.get(i, 0) - self.d2[i] * b.get(i + p, 0)) / self.prs[i], + ); + x.set( + i + p, + 0, + (-self.d2[i] * b.get(i, 0) + self.prb[i] * b.get(i + p, 0)) / self.prs[i], + ); + } + } + + fn mat_vec_mul(&self, _: &M, x: &M, y: &mut M) { + let (_, p) = self.ata.shape(); + let atax = self.ata.matmul(&x.slice(0..p, 0..1)); + + for i in 0..p { + y.set( + i, + 0, + T::two() * atax.get(i, 0) + self.d1[i] * x.get(i, 0) + self.d2[i] * x.get(i + p, 0), + ); + y.set( + i + p, + 0, + self.d2[i] * x.get(i, 0) + self.d1[i] * x.get(i + p, 0), + ); + } + } + + fn mat_t_vec_mul(&self, a: &M, x: &M, y: &mut M) { + self.mat_vec_mul(a, x, y); 
+ } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + use crate::metrics::mean_absolute_error; + + #[test] + fn lasso_fit_predict() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y: Vec = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let y_hat = Lasso::fit( + &x, + &y, + LassoParameters { + alpha: 0.1, + normalize: false, + tol: 1e-4, + max_iter: 1000, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat, &y) < 2.0); + + let y_hat = Lasso::fit( + &x, + &y, + LassoParameters { + alpha: 0.1, + normalize: false, + tol: 1e-4, + max_iter: 1000, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat, &y) < 2.0); + } + + #[test] + fn serde() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + 
&[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let lr = Lasso::fit(&x, &y, Default::default()).unwrap(); + + let deserialized_lr: Lasso> = + serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); + + assert_eq!(lr, deserialized_lr); + } +} diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 4b52529..a3674b3 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -289,7 +289,7 @@ impl> LogisticRegression { let n = x.shape().0; let mut result = M::zeros(1, n); if self.num_classes == 2 { - let y_hat: Vec = x.matmul(&self.coefficients.transpose()).get_col_as_vec(0); + let y_hat: Vec = x.ab(false, &self.coefficients, true).get_col_as_vec(0); let intercept = self.intercept.get(0, 0); for i in 0..n { result.set( diff --git a/src/linear/mod.rs b/src/linear/mod.rs index fef7070..edaea4f 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -20,6 +20,8 @@ //! //! 
+pub(crate) mod bg_solver; +pub mod lasso; pub mod linear_regression; pub mod logistic_regression; pub mod ridge_regression; From f9056f716ad1296c335d816b5c3e15c1823dd174 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 24 Nov 2020 19:21:27 -0800 Subject: [PATCH 54/79] lasso: minor change in unit test --- src/linear/lasso.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 306b1aa..965c1c4 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -447,7 +447,7 @@ mod tests { &y, LassoParameters { alpha: 0.1, - normalize: false, + normalize: true, tol: 1e-4, max_iter: 1000, }, From 89a5136191522a2882ffa3f8a10bda92161024b5 Mon Sep 17 00:00:00 2001 From: morenol Date: Wed, 25 Nov 2020 14:39:02 -0400 Subject: [PATCH 55/79] Change implementation of to_row_vector for nalgebra (#34) * Add failing test * Change implementation of to_row_vector for nalgebra --- Cargo.toml | 4 ++-- src/linalg/naive/dense_matrix.rs | 6 ++++++ src/linalg/nalgebra_bindings.rs | 11 +++++++++-- src/linalg/ndarray_bindings.rs | 6 ++++++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 20eebf5..6e15f88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ datasets = [] [dependencies] ndarray = { version = "0.13", optional = true } -nalgebra = { version = "0.22.0", optional = true } +nalgebra = { version = "0.23.0", optional = true } num-traits = "0.2.12" num = "0.3.0" rand = "0.7.3" @@ -35,4 +35,4 @@ bincode = "1.3.1" [[bench]] name = "distance" -harness = false \ No newline at end of file +harness = false diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 7486329..9279c3c 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1064,6 +1064,12 @@ mod tests { ); } + #[test] + fn col_matrix_to_row_vector() { + let m: DenseMatrix = BaseMatrix::zeros(10, 1); + assert_eq!(m.to_row_vector().len(), 10) + } + 
#[test] fn iter() { let vec = vec![1., 2., 3., 4., 5., 6.]; diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index 8ddfdb6..da2ec05 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -185,14 +185,15 @@ impl BaseVector for MatrixMN { impl BaseMatrix for Matrix> { - type RowVector = MatrixMN; + type RowVector = RowDVector; fn from_row_vector(vec: Self::RowVector) -> Self { Matrix::from_rows(&[vec]) } fn to_row_vector(self) -> Self::RowVector { - self.row(0).into_owned() + let (nrows, ncols) = self.shape(); + self.reshape_generic(U1, Dynamic::new(nrows * ncols)) } fn get(&self, row: usize, col: usize) -> T { @@ -697,6 +698,12 @@ mod tests { assert_eq!(m.to_row_vector(), expected); } + #[test] + fn col_matrix_to_row_vector() { + let m: DMatrix = BaseMatrix::zeros(10, 1); + assert_eq!(m.to_row_vector().len(), 10) + } + #[test] fn get_row_col_as_vec() { let m = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]); diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index b5058ab..308e355 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -563,6 +563,12 @@ mod tests { ); } + #[test] + fn col_matrix_to_row_vector() { + let m: Array2 = BaseMatrix::zeros(10, 1); + assert_eq!(m.to_row_vector().len(), 10) + } + #[test] fn add_mut() { let mut a1 = arr2(&[[1., 2., 3.], [4., 5., 6.]]); From 67e582987792166ca15a8e0303968e78b5160626 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 25 Nov 2020 12:23:04 -0800 Subject: [PATCH 56/79] simplifies generic matrix.ab implementation --- src/linalg/high_order.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linalg/high_order.rs b/src/linalg/high_order.rs index 359c4a1..493c737 100644 --- a/src/linalg/high_order.rs +++ b/src/linalg/high_order.rs @@ -19,7 +19,7 @@ pub trait HighOrderOperations: BaseMatrix { /// ``` fn ab(&self, a_transpose: bool, b: &Self, 
b_transpose: bool) -> Self { match (a_transpose, b_transpose) { - (true, true) => self.transpose().matmul(&b.transpose()), + (true, true) => b.matmul(self).transpose(), (false, true) => self.matmul(&b.transpose()), (true, false) => self.transpose().matmul(b), (false, false) => self.matmul(b), From 4720a3a4ebf0137f68d87e0b34e83192d539e8bf Mon Sep 17 00:00:00 2001 From: morenol Date: Thu, 3 Dec 2020 09:51:33 -0400 Subject: [PATCH 57/79] MultinomialNB (#32) feat: add MultinomialNB --- src/naive_bayes/mod.rs | 3 + src/naive_bayes/multinomial.rs | 278 +++++++++++++++++++++++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 src/naive_bayes/multinomial.rs diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 0268da6..8b63aaa 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -66,5 +66,8 @@ impl, D: NBDistribution> BaseNaiveBayes { + /// class labels known to the classifier + class_labels: Vec, + class_priors: Vec, + feature_prob: Vec>, +} + +impl> NBDistribution for MultinomialNBDistribution { + fn prior(&self, class_index: usize) -> T { + self.class_priors[class_index] + } + + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + likelihood += value * self.feature_prob[class_index][feature].ln(); + } + likelihood + } + + fn classes(&self) -> &Vec { + &self.class_labels + } +} + +/// `MultinomialNB` parameters. Use `Default::default()` for default values. +#[derive(Serialize, Deserialize, Debug)] +pub struct MultinomialNBParameters { + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub alpha: T, + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub priors: Option>, +} + +impl MultinomialNBParameters { + /// Create MultinomialNBParameters with specific paramaters. 
+ pub fn new(alpha: T, priors: Option>) -> Self { + Self { alpha, priors } + } +} + +impl Default for MultinomialNBParameters { + fn default() -> Self { + Self { + alpha: T::one(), + priors: None, + } + } +} + +impl MultinomialNBDistribution { + /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data. + /// * `y` - vector with target values (classes) of length N. + /// * `priors` - Optional vector with prior probabilities of the classes. If not defined, + /// priors are adjusted according to the data. + /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter. + pub fn fit>( + x: &M, + y: &M::RowVector, + alpha: T, + priors: Option>, + ) -> Result { + let (n_samples, n_features) = x.shape(); + let y_samples = y.len(); + if y_samples != n_samples { + return Err(Failed::fit(&format!( + "Size of x should equal size of y; |x|=[{}], |y|=[{}]", + n_samples, y_samples + ))); + } + + if n_samples == 0 { + return Err(Failed::fit(&format!( + "Size of x and y should greater than 0; |x|=[{}]", + n_samples + ))); + } + if alpha < T::zero() { + return Err(Failed::fit(&format!( + "Alpha should be greater than 0; |alpha|=[{}]", + alpha + ))); + } + + let y = y.to_vec(); + + let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); + let mut class_count = vec![T::zero(); class_labels.len()]; + + for class_index in indices.iter() { + class_count[*class_index] += T::one(); + } + + let class_priors = if let Some(class_priors) = priors { + if class_priors.len() != class_labels.len() { + return Err(Failed::fit( + "Size of priors provided does not match the number of classes of the data.", + )); + } + class_priors + } else { + class_count + .iter() + .map(|&c| c / T::from(n_samples).unwrap()) + .collect() + }; + + let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + + for (row, class_index) in row_iter(x).zip(indices) { + for idx in 0..n_features { + 
feature_in_class_counter[class_index][idx] += row[idx]; + } + } + + let feature_prob = feature_in_class_counter + .iter() + .map(|feature_count| { + let n_c = feature_count.sum(); + feature_count + .iter() + .map(|&count| (count + alpha) / (n_c + alpha * T::from(n_features).unwrap())) + .collect() + }) + .collect(); + + Ok(Self { + class_labels, + class_priors, + feature_prob, + }) + } +} + +/// MultinomialNB implements the categorical naive Bayes algorithm for categorically distributed data. +#[derive(Serialize, Deserialize, Debug, PartialEq)] +pub struct MultinomialNB> { + inner: BaseNaiveBayes>, +} + +impl> MultinomialNB { + /// Fits MultinomialNB with given data + /// * `x` - training data of size NxM where N is the number of samples and M is the number of + /// features. + /// * `y` - vector with target values (classes) of length N. + /// * `parameters` - additional parameters like class priors, alpha for smoothing and + /// binarizing threshold. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: MultinomialNBParameters, + ) -> Result { + let distribution = + MultinomialNBDistribution::fit(x, y, parameters.alpha, parameters.priors)?; + let inner = BaseNaiveBayes::fit(distribution)?; + Ok(Self { inner }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. 
+ pub fn predict(&self, x: &M) -> Result { + self.inner.predict(x) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn run_multinomial_naive_bayes() { + // Tests that MultinomialNB when alpha=1.0 gives the same values as + // those given for the toy example in Manning, Raghavan, and + // Schuetze's "Introduction to Information Retrieval" book: + // https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html + + // Training data points are: + // Chinese Beijing Chinese (class: China) + // Chinese Chinese Shanghai (class: China) + // Chinese Macao (class: China) + // Tokyo Japan Chinese (class: Japan) + let x = DenseMatrix::::from_2d_array(&[ + &[1., 2., 0., 0., 0., 0.], + &[0., 2., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + + assert_eq!(mnb.inner.distribution.class_priors, &[0.75, 0.25]); + assert_eq!( + mnb.inner.distribution.feature_prob, + &[ + &[1. / 7., 3. / 7., 1. / 14., 1. / 7., 1. / 7., 1. / 14.], + &[1. / 9., 2. / 9.0, 2. / 9.0, 1. / 9.0, 1. / 9.0, 2. 
/ 9.0] + ] + ); + + // Testing data point is: + // Chinese Chinese Chinese Tokyo Japan + let x_test = DenseMatrix::::from_2d_array(&[&[0., 3., 1., 0., 0., 1.]]); + let y_hat = mnb.predict(&x_test).unwrap(); + + assert_eq!(y_hat, &[0.]); + } + + #[test] + fn multinomial_nb_scikit_parity() { + let x = DenseMatrix::::from_2d_array(&[ + &[2., 4., 0., 0., 2., 1., 2., 4., 2., 0.], + &[3., 4., 0., 2., 1., 0., 1., 4., 0., 3.], + &[1., 4., 2., 4., 1., 0., 1., 2., 3., 2.], + &[0., 3., 3., 4., 1., 0., 3., 1., 1., 1.], + &[0., 2., 1., 4., 3., 4., 1., 2., 3., 1.], + &[3., 2., 4., 1., 3., 0., 2., 4., 0., 2.], + &[3., 1., 3., 0., 2., 0., 4., 4., 3., 4.], + &[2., 2., 2., 0., 1., 1., 2., 1., 0., 1.], + &[3., 3., 2., 2., 0., 2., 3., 2., 2., 3.], + &[4., 3., 4., 4., 4., 2., 2., 0., 1., 4.], + &[3., 4., 2., 2., 1., 4., 4., 4., 1., 3.], + &[3., 0., 1., 4., 4., 0., 0., 3., 2., 4.], + &[2., 0., 3., 3., 1., 2., 0., 2., 4., 1.], + &[2., 4., 0., 4., 2., 4., 1., 3., 1., 4.], + &[0., 2., 2., 3., 4., 0., 4., 4., 4., 4.], + ]); + let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.]; + let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + + let y_hat = nb.predict(&x).unwrap(); + + assert!(nb + .inner + .distribution + .class_priors + .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); + assert!(nb.inner.distribution.feature_prob[1].approximate_eq( + &vec!(0.07, 0.12, 0.07, 0.15, 0.07, 0.09, 0.08, 0.10, 0.08, 0.11), + 1e-1 + )); + assert!(y_hat.approximate_eq( + &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 2.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 2.0), + 1e-5 + )); + } + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[1., 1., 0., 0., 0., 0.], + &[0., 1., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + + let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + let deserialized_mnb: MultinomialNB> = + serde_json::from_str(&serde_json::to_string(&mnb).unwrap()).unwrap(); + + 
assert_eq!(mnb, deserialized_mnb); + } +} From f0b348dd6ee45ecbcba26ec783b5e092862844af Mon Sep 17 00:00:00 2001 From: morenol Date: Fri, 4 Dec 2020 20:45:40 -0400 Subject: [PATCH 58/79] feat: BernoulliNB (#31) * feat: BernoulliNB * Move preprocessing to a trait in linalg/stats.rs --- src/linalg/mod.rs | 3 +- src/linalg/naive/dense_matrix.rs | 3 +- src/linalg/nalgebra_bindings.rs | 7 +- src/linalg/ndarray_bindings.rs | 7 +- src/linalg/stats.rs | 41 ++++ src/naive_bayes/bernoulli.rs | 308 +++++++++++++++++++++++++++++++ src/naive_bayes/mod.rs | 2 + 7 files changed, 367 insertions(+), 4 deletions(-) create mode 100644 src/naive_bayes/bernoulli.rs diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 1be2e75..d3fb635 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -63,7 +63,7 @@ use evd::EVDDecomposableMatrix; use high_order::HighOrderOperations; use lu::LUDecomposableMatrix; use qr::QRDecomposableMatrix; -use stats::MatrixStats; +use stats::{MatrixPreprocessing, MatrixStats}; use svd::SVDDecomposableMatrix; /// Column or row vector @@ -619,6 +619,7 @@ pub trait Matrix: + LUDecomposableMatrix + CholeskyDecomposableMatrix + MatrixStats + + MatrixPreprocessing + HighOrderOperations + PartialEq + Display diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 89abe20..14e5e62 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -12,7 +12,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; -use crate::linalg::stats::MatrixStats; +use crate::linalg::stats::{MatrixPreprocessing, MatrixStats}; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix; pub use crate::linalg::{BaseMatrix, BaseVector}; @@ -478,6 +478,7 @@ impl HighOrderOperations for DenseMatrix { } impl MatrixStats for DenseMatrix {} +impl MatrixPreprocessing for DenseMatrix {} 
impl Matrix for DenseMatrix {} diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index e108831..ad2d4a2 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -47,7 +47,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; -use crate::linalg::stats::MatrixStats; +use crate::linalg::stats::{MatrixPreprocessing, MatrixStats}; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix as SmartCoreMatrix; use crate::linalg::{BaseMatrix, BaseVector}; @@ -554,6 +554,11 @@ impl + MatrixPreprocessing for Matrix> +{ +} + impl HighOrderOperations for Matrix> { diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index e50bdcd..3f0478f 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -54,7 +54,7 @@ use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::high_order::HighOrderOperations; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; -use crate::linalg::stats::MatrixStats; +use crate::linalg::stats::{MatrixPreprocessing, MatrixStats}; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix; use crate::linalg::{BaseMatrix, BaseVector}; @@ -503,6 +503,11 @@ impl + MatrixPreprocessing for ArrayBase, Ix2> +{ +} + impl HighOrderOperations for ArrayBase, Ix2> { diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index ac7a1bc..fff87c3 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -104,6 +104,47 @@ pub trait MatrixStats: BaseMatrix { } } +/// Defines baseline implementations for various matrix processing functions +pub trait MatrixPreprocessing: BaseMatrix { + /// Each element of the matrix greater than the threshold becomes 1, while values less than or equal to the threshold become 0 + /// ``` + /// use 
smartcore::linalg::naive::dense_matrix::*; + /// use crate::smartcore::linalg::stats::MatrixPreprocessing; + /// let mut a = DenseMatrix::from_array(2, 3, &[0., 2., 3., -5., -6., -7.]); + /// let expected = DenseMatrix::from_array(2, 3, &[0., 1., 1., 0., 0., 0.]); + /// a.binarize_mut(0.); + /// + /// assert_eq!(a, expected); + /// ``` + + fn binarize_mut(&mut self, threshold: T) { + let (nrows, ncols) = self.shape(); + for row in 0..nrows { + for col in 0..ncols { + if self.get(row, col) > threshold { + self.set(row, col, T::one()); + } else { + self.set(row, col, T::zero()); + } + } + } + } + /// Returns new matrix where elements are binarized according to a given threshold. + /// ``` + /// use smartcore::linalg::naive::dense_matrix::*; + /// use crate::smartcore::linalg::stats::MatrixPreprocessing; + /// let a = DenseMatrix::from_array(2, 3, &[0., 2., 3., -5., -6., -7.]); + /// let expected = DenseMatrix::from_array(2, 3, &[0., 1., 1., 0., 0., 0.]); + /// + /// assert_eq!(a.binarize(0.), expected); + /// ``` + fn binarize(&self, threshold: T) -> Self { + let mut m = self.clone(); + m.binarize_mut(threshold); + m + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs new file mode 100644 index 0000000..535b5ee --- /dev/null +++ b/src/naive_bayes/bernoulli.rs @@ -0,0 +1,308 @@ +use crate::error::Failed; +use crate::linalg::row_iter; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use crate::math::vector::RealNumberVector; +use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; + +use serde::{Deserialize, Serialize}; + +/// Naive Bayes classifier for Bearnoulli features +#[derive(Serialize, Deserialize, Debug, PartialEq)] +struct BernoulliNBDistribution { + /// class labels known to the classifier + class_labels: Vec, + class_priors: Vec, + feature_prob: Vec>, +} + +impl> NBDistribution for BernoulliNBDistribution { + fn prior(&self, class_index: 
usize) -> T { + self.class_priors[class_index] + } + + fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + if value == T::one() { + likelihood += self.feature_prob[class_index][feature].ln(); + } else { + likelihood += (T::one() - self.feature_prob[class_index][feature]).ln(); + } + } + likelihood + } + + fn classes(&self) -> &Vec { + &self.class_labels + } +} + +/// `BernoulliNB` parameters. Use `Default::default()` for default values. +#[derive(Serialize, Deserialize, Debug)] +pub struct BernoulliNBParameters { + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub alpha: T, + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub priors: Option>, + /// Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors. + pub binarize: Option, +} + +impl BernoulliNBParameters { + /// Create BernoulliNBParameters with specific paramaters. + pub fn new(alpha: T, priors: Option>, binarize: Option) -> Self { + Self { + alpha, + priors, + binarize, + } + } +} + +impl Default for BernoulliNBParameters { + fn default() -> Self { + Self { + alpha: T::one(), + priors: None, + binarize: Some(T::zero()), + } + } +} + +impl BernoulliNBDistribution { + /// Fits the distribution to a NxM matrix where N is number of samples and M is number of features. + /// * `x` - training data. + /// * `y` - vector with target values (classes) of length N. + /// * `priors` - Optional vector with prior probabilities of the classes. If not defined, + /// priors are adjusted according to the data. + /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter. + /// * `binarize` - Threshold for binarizing. 
+ pub fn fit>( + x: &M, + y: &M::RowVector, + alpha: T, + priors: Option>, + ) -> Result { + let (n_samples, n_features) = x.shape(); + let y_samples = y.len(); + if y_samples != n_samples { + return Err(Failed::fit(&format!( + "Size of x should equal size of y; |x|=[{}], |y|=[{}]", + n_samples, y_samples + ))); + } + + if n_samples == 0 { + return Err(Failed::fit(&format!( + "Size of x and y should greater than 0; |x|=[{}]", + n_samples + ))); + } + if alpha < T::zero() { + return Err(Failed::fit(&format!( + "Alpha should be greater than 0; |alpha|=[{}]", + alpha + ))); + } + + let y = y.to_vec(); + + let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); + let mut class_count = vec![T::zero(); class_labels.len()]; + + for class_index in indices.iter() { + class_count[*class_index] += T::one(); + } + + let class_priors = if let Some(class_priors) = priors { + if class_priors.len() != class_labels.len() { + return Err(Failed::fit( + "Size of priors provided does not match the number of classes of the data.", + )); + } + class_priors + } else { + class_count + .iter() + .map(|&c| c / T::from(n_samples).unwrap()) + .collect() + }; + + let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + + for (row, class_index) in row_iter(x).zip(indices) { + for idx in 0..n_features { + feature_in_class_counter[class_index][idx] += row[idx]; + } + } + + let feature_prob = feature_in_class_counter + .iter() + .enumerate() + .map(|(class_index, feature_count)| { + feature_count + .iter() + .map(|&count| (count + alpha) / (class_count[class_index] + alpha * T::two())) + .collect() + }) + .collect(); + + Ok(Self { + class_labels, + class_priors, + feature_prob, + }) + } +} + +/// BernoulliNB implements the categorical naive Bayes algorithm for categorically distributed data. 
+#[derive(Serialize, Deserialize, Debug, PartialEq)] +pub struct BernoulliNB> { + inner: BaseNaiveBayes>, + binarize: Option, +} + +impl> BernoulliNB { + /// Fits BernoulliNB with given data + /// * `x` - training data of size NxM where N is the number of samples and M is the number of + /// features. + /// * `y` - vector with target values (classes) of length N. + /// * `parameters` - additional parameters like class priors, alpha for smoothing and + /// binarizing threshold. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: BernoulliNBParameters, + ) -> Result { + let distribution = if let Some(threshold) = parameters.binarize { + BernoulliNBDistribution::fit( + &(x.binarize(threshold)), + y, + parameters.alpha, + parameters.priors, + )? + } else { + BernoulliNBDistribution::fit(x, y, parameters.alpha, parameters.priors)? + }; + + let inner = BaseNaiveBayes::fit(distribution)?; + Ok(Self { + inner, + binarize: parameters.binarize, + }) + } + + /// Estimates the class labels for the provided data. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// Returns a vector of size N with class estimates. 
+ pub fn predict(&self, x: &M) -> Result { + if let Some(threshold) = self.binarize { + self.inner.predict(&(x.binarize(threshold))) + } else { + self.inner.predict(x) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + + #[test] + fn run_bernoulli_naive_bayes() { + // Tests that BernoulliNB when alpha=1.0 gives the same values as + // those given for the toy example in Manning, Raghavan, and + // Schuetze's "Introduction to Information Retrieval" book: + // https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html + + // Training data points are: + // Chinese Beijing Chinese (class: China) + // Chinese Chinese Shanghai (class: China) + // Chinese Macao (class: China) + // Tokyo Japan Chinese (class: Japan) + let x = DenseMatrix::::from_2d_array(&[ + &[1., 1., 0., 0., 0., 0.], + &[0., 1., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + let bnb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); + + assert_eq!(bnb.inner.distribution.class_priors, &[0.75, 0.25]); + assert_eq!( + bnb.inner.distribution.feature_prob, + &[ + &[0.4, 0.8, 0.2, 0.4, 0.4, 0.2], + &[1. / 3.0, 2. / 3.0, 2. / 3.0, 1. / 3.0, 1. / 3.0, 2. 
/ 3.0] + ] + ); + + // Testing data point is: + // Chinese Chinese Chinese Tokyo Japan + let x_test = DenseMatrix::::from_2d_array(&[&[0., 1., 1., 0., 0., 1.]]); + let y_hat = bnb.predict(&x_test).unwrap(); + + assert_eq!(y_hat, &[1.]); + } + + #[test] + fn bernoulli_nb_scikit_parity() { + let x = DenseMatrix::::from_2d_array(&[ + &[2., 4., 0., 0., 2., 1., 2., 4., 2., 0.], + &[3., 4., 0., 2., 1., 0., 1., 4., 0., 3.], + &[1., 4., 2., 4., 1., 0., 1., 2., 3., 2.], + &[0., 3., 3., 4., 1., 0., 3., 1., 1., 1.], + &[0., 2., 1., 4., 3., 4., 1., 2., 3., 1.], + &[3., 2., 4., 1., 3., 0., 2., 4., 0., 2.], + &[3., 1., 3., 0., 2., 0., 4., 4., 3., 4.], + &[2., 2., 2., 0., 1., 1., 2., 1., 0., 1.], + &[3., 3., 2., 2., 0., 2., 3., 2., 2., 3.], + &[4., 3., 4., 4., 4., 2., 2., 0., 1., 4.], + &[3., 4., 2., 2., 1., 4., 4., 4., 1., 3.], + &[3., 0., 1., 4., 4., 0., 0., 3., 2., 4.], + &[2., 0., 3., 3., 1., 2., 0., 2., 4., 1.], + &[2., 4., 0., 4., 2., 4., 1., 3., 1., 4.], + &[0., 2., 2., 3., 4., 0., 4., 4., 4., 4.], + ]); + let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.]; + let bnb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); + + let y_hat = bnb.predict(&x).unwrap(); + + assert!(bnb + .inner + .distribution + .class_priors + .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); + assert!(bnb.inner.distribution.feature_prob[1].approximate_eq( + &vec!(0.8, 0.8, 0.8, 0.4, 0.8, 0.6, 0.8, 0.6, 0.6, 0.8), + 1e-1 + )); + assert!(y_hat.approximate_eq( + &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), + 1e-5 + )); + } + + #[test] + fn serde() { + let x = DenseMatrix::::from_2d_array(&[ + &[1., 1., 0., 0., 0., 0.], + &[0., 1., 0., 0., 1., 0.], + &[0., 1., 0., 1., 0., 0.], + &[0., 1., 1., 0., 0., 1.], + ]); + let y = vec![0., 0., 0., 1.]; + + let bnb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); + let deserialized_bnb: BernoulliNB> = + serde_json::from_str(&serde_json::to_string(&bnb).unwrap()).unwrap(); + + assert_eq!(bnb, 
deserialized_bnb); + } +} diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 8b63aaa..508b976 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -64,10 +64,12 @@ impl, D: NBDistribution> BaseNaiveBayes Date: Fri, 4 Dec 2020 20:46:36 -0400 Subject: [PATCH 59/79] Add benches for GNB (#33) * Add benches for GNB * use [black_box](https://github.com/bheisler/criterion.rs/blob/master/book/src/faq.md#when-should-i-use-criterionblack_box) --- Cargo.toml | 5 +++ benches/naive_bayes.rs | 73 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 benches/naive_bayes.rs diff --git a/Cargo.toml b/Cargo.toml index 6e15f88..1503957 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,3 +36,8 @@ bincode = "1.3.1" [[bench]] name = "distance" harness = false + +[[bench]] +name = "naive_bayes" +harness = false +required-features = ["ndarray-bindings", "nalgebra-bindings"] diff --git a/benches/naive_bayes.rs b/benches/naive_bayes.rs new file mode 100644 index 0000000..2a4595b --- /dev/null +++ b/benches/naive_bayes.rs @@ -0,0 +1,73 @@ +use criterion::BenchmarkId; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +use nalgebra::DMatrix; +use ndarray::Array2; +use smartcore::linalg::naive::dense_matrix::DenseMatrix; +use smartcore::linalg::BaseMatrix; +use smartcore::linalg::BaseVector; +use smartcore::naive_bayes::GaussianNB; + +pub fn gaussian_naive_bayes_fit_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("GaussianNB::fit"); + + for n_samples in [100_usize, 1000_usize, 10000_usize].iter() { + for n_features in [10_usize, 100_usize, 1000_usize].iter() { + let x = DenseMatrix::::rand(*n_samples, *n_features); + let y: Vec = (0..*n_samples) + .map(|i| (i % *n_samples / 5_usize) as f64) + .collect::>(); + group.bench_with_input( + BenchmarkId::from_parameter(format!( + "n_samples: {}, n_features: {}", + n_samples, n_features + )), + n_samples, + |b, _| { + b.iter(|| { + 
GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap(); + }) + }, + ); + } + } + group.finish(); +} + +pub fn gaussian_naive_matrix_datastructure(c: &mut Criterion) { + let mut group = c.benchmark_group("GaussianNB"); + let classes = (0..10000).map(|i| (i % 25) as f64).collect::>(); + + group.bench_function("DenseMatrix", |b| { + let x = DenseMatrix::::rand(10000, 500); + let y = as BaseMatrix>::RowVector::from_array(&classes); + + b.iter(|| { + GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap(); + }) + }); + + group.bench_function("ndarray", |b| { + let x = Array2::::rand(10000, 500); + let y = as BaseMatrix>::RowVector::from_array(&classes); + + b.iter(|| { + GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap(); + }) + }); + + group.bench_function("ndalgebra", |b| { + let x = DMatrix::::rand(10000, 500); + let y = as BaseMatrix>::RowVector::from_array(&classes); + + b.iter(|| { + GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap(); + }) + }); +} +criterion_group!( + benches, + gaussian_naive_bayes_fit_benchmark, + gaussian_naive_matrix_datastructure +); +criterion_main!(benches); From 53351b2eceff2e5f256ad7f2352ee0a9a2ef7c6f Mon Sep 17 00:00:00 2001 From: morenol Date: Fri, 11 Dec 2020 16:52:39 -0400 Subject: [PATCH 60/79] fix needless-range and clippy::ptr_arg warnings. 
(#36) * Fix needless for loop range * Do not ignore clippy::ptr_arg --- src/algorithm/neighbour/bbd_tree.rs | 33 ++++++------ src/algorithm/neighbour/cover_tree.rs | 2 +- src/algorithm/neighbour/mod.rs | 1 + src/decomposition/pca.rs | 22 ++++---- src/ensemble/random_forest_classifier.rs | 14 ++--- src/lib.rs | 2 - src/linalg/evd.rs | 68 ++++++++++++------------ src/linalg/lu.rs | 13 ++--- src/linalg/naive/dense_matrix.rs | 45 ++++++++-------- src/linalg/qr.rs | 8 +-- src/linalg/stats.rs | 16 +++--- src/linalg/svd.rs | 32 +++++------ src/linear/bg_solver.rs | 6 +-- src/linear/lasso.rs | 12 ++--- src/linear/logistic_regression.rs | 12 ++--- src/linear/ridge_regression.rs | 12 ++--- src/math/distance/euclidian.rs | 2 +- src/metrics/auc.rs | 4 +- src/metrics/cluster_helpers.rs | 9 ++-- src/model_selection/mod.rs | 12 ++--- src/naive_bayes/bernoulli.rs | 4 +- src/naive_bayes/gaussian.rs | 6 +-- src/naive_bayes/multinomial.rs | 4 +- src/neighbors/knn_classifier.rs | 4 +- src/optimization/line_search.rs | 2 +- src/tree/decision_tree_classifier.rs | 57 ++++++++++---------- src/tree/decision_tree_regressor.rs | 25 ++++----- 27 files changed, 208 insertions(+), 219 deletions(-) diff --git a/src/algorithm/neighbour/bbd_tree.rs b/src/algorithm/neighbour/bbd_tree.rs index 85e6628..0d11fc6 100644 --- a/src/algorithm/neighbour/bbd_tree.rs +++ b/src/algorithm/neighbour/bbd_tree.rs @@ -44,10 +44,7 @@ impl BBDTree { let (n, _) = data.shape(); - let mut index = vec![0; n]; - for i in 0..n { - index[i] = i; - } + let index = (0..n).collect::>(); let mut tree = BBDTree { nodes, @@ -64,7 +61,7 @@ impl BBDTree { pub(in crate) fn clustering( &self, - centroids: &Vec>, + centroids: &[Vec], sums: &mut Vec>, counts: &mut Vec, membership: &mut Vec, @@ -92,8 +89,8 @@ impl BBDTree { fn filter( &self, node: usize, - centroids: &Vec>, - candidates: &Vec, + centroids: &[Vec], + candidates: &[usize], k: usize, sums: &mut Vec>, counts: &mut Vec, @@ -117,15 +114,15 @@ impl BBDTree { let mut 
new_candidates = vec![0; k]; let mut newk = 0; - for i in 0..k { + for candidate in candidates.iter().take(k) { if !BBDTree::prune( &self.nodes[node].center, &self.nodes[node].radius, centroids, closest, - candidates[i], + *candidate, ) { - new_candidates[newk] = candidates[i]; + new_candidates[newk] = *candidate; newk += 1; } } @@ -166,9 +163,9 @@ impl BBDTree { } fn prune( - center: &Vec, - radius: &Vec, - centroids: &Vec>, + center: &[T], + radius: &[T], + centroids: &[Vec], best_index: usize, test_index: usize, ) -> bool { @@ -285,8 +282,8 @@ impl BBDTree { } let mut mean = vec![T::zero(); d]; - for i in 0..d { - mean[i] = node.sum[i] / T::from(node.count).unwrap(); + for (i, mean_i) in mean.iter_mut().enumerate().take(d) { + *mean_i = node.sum[i] / T::from(node.count).unwrap(); } node.cost = BBDTree::node_cost(&self.nodes[node.lower.unwrap()], &mean) @@ -295,11 +292,11 @@ impl BBDTree { self.add_node(node) } - fn node_cost(node: &BBDTreeNode, center: &Vec) -> T { + fn node_cost(node: &BBDTreeNode, center: &[T]) -> T { let d = center.len(); let mut scatter = T::zero(); - for i in 0..d { - let x = (node.sum[i] / T::from(node.count).unwrap()) - center[i]; + for (i, center_i) in center.iter().enumerate().take(d) { + let x = (node.sum[i] / T::from(node.count).unwrap()) - *center_i; scatter += x * x; } node.cost + T::from(node.count).unwrap() * scatter diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index e7dbac0..2fe7792 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -436,7 +436,7 @@ impl> CoverTree } } - fn max(&self, distance_set: &Vec>) -> F { + fn max(&self, distance_set: &[DistanceSet]) -> F { let mut max = F::zero(); for n in distance_set { if max < n.dist[n.dist.len() - 1] { diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 7ef1c5c..bf9e669 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -1,3 +1,4 
@@ +#![allow(clippy::ptr_arg)] //! # Nearest Neighbors Search Algorithms and Data Structures //! //! Nearest neighbor search is a basic computational tool that is particularly relevant to machine learning, diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index f25aaad..9f5bd39 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -112,9 +112,9 @@ impl> PCA { let mut x = data.clone(); - for c in 0..n { + for (c, mu_c) in mu.iter().enumerate().take(n) { for r in 0..m { - x.sub_element_mut(r, c, mu[c]); + x.sub_element_mut(r, c, *mu_c); } } @@ -124,8 +124,8 @@ impl> PCA { if m > n && !parameters.use_correlation_matrix { let svd = x.svd()?; eigenvalues = svd.s; - for i in 0..eigenvalues.len() { - eigenvalues[i] = eigenvalues[i] * eigenvalues[i]; + for eigenvalue in &mut eigenvalues { + *eigenvalue = *eigenvalue * (*eigenvalue); } eigenvectors = svd.V; @@ -149,8 +149,8 @@ impl> PCA { if parameters.use_correlation_matrix { let mut sd = vec![T::zero(); n]; - for i in 0..n { - sd[i] = cov.get(i, i).sqrt(); + for (i, sd_i) in sd.iter_mut().enumerate().take(n) { + *sd_i = cov.get(i, i).sqrt(); } for i in 0..n { @@ -166,9 +166,9 @@ impl> PCA { eigenvectors = evd.V; - for i in 0..n { + for (i, sd_i) in sd.iter().enumerate().take(n) { for j in 0..n { - eigenvectors.div_element_mut(i, j, sd[i]); + eigenvectors.div_element_mut(i, j, *sd_i); } } } else { @@ -188,9 +188,9 @@ impl> PCA { } let mut pmu = vec![T::zero(); n_components]; - for k in 0..n { - for i in 0..n_components { - pmu[i] += projection.get(i, k) * mu[k]; + for (k, mu_k) in mu.iter().enumerate().take(n) { + for (i, pmu_i) in pmu.iter_mut().enumerate().take(n_components) { + *pmu_i += projection.get(i, k) * (*mu_k); } } diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 011b0ba..7229d92 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -132,9 +132,9 @@ impl RandomForestClassifier 
{ let mut yi: Vec = vec![0; y_ncols]; let classes = y_m.unique(); - for i in 0..y_ncols { + for (i, yi_i) in yi.iter_mut().enumerate().take(y_ncols) { let yc = y_m.get(0, i); - yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } let mtry = parameters.m.unwrap_or_else(|| { @@ -192,22 +192,22 @@ impl RandomForestClassifier { which_max(&result) } - fn sample_with_replacement(y: &Vec, num_classes: usize) -> Vec { + fn sample_with_replacement(y: &[usize], num_classes: usize) -> Vec { let mut rng = rand::thread_rng(); let class_weight = vec![1.; num_classes]; let nrows = y.len(); let mut samples = vec![0; nrows]; - for l in 0..num_classes { + for (l, class_weight_l) in class_weight.iter().enumerate().take(num_classes) { let mut n_samples = 0; let mut index: Vec = Vec::new(); - for i in 0..nrows { - if y[i] == l { + for (i, y_i) in y.iter().enumerate().take(nrows) { + if *y_i == l { index.push(i); n_samples += 1; } } - let size = ((n_samples as f64) / class_weight[l]) as usize; + let size = ((n_samples as f64) / *class_weight_l) as usize; for _ in 0..size { let xi: usize = rng.gen_range(0, n_samples); samples[index[xi]] += 1; diff --git a/src/lib.rs b/src/lib.rs index ada7925..9290c86 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,4 @@ #![allow( - clippy::needless_range_loop, - clippy::ptr_arg, clippy::type_complexity, clippy::too_many_arguments, clippy::many_single_char_names diff --git a/src/linalg/evd.rs b/src/linalg/evd.rs index c216696..4c1b6c3 100644 --- a/src/linalg/evd.rs +++ b/src/linalg/evd.rs @@ -99,27 +99,27 @@ pub trait EVDDecomposableMatrix: BaseMatrix { fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec) { let (n, _) = V.shape(); - for i in 0..n { - d[i] = V.get(n - 1, i); + for (i, d_i) in d.iter_mut().enumerate().take(n) { + *d_i = V.get(n - 1, i); } for i in (1..n).rev() { let mut scale = T::zero(); let mut h = T::zero(); - for k in 0..i { - scale += d[k].abs(); + for d_k in d.iter().take(i) 
{ + scale += d_k.abs(); } if scale == T::zero() { e[i] = d[i - 1]; - for j in 0..i { - d[j] = V.get(i - 1, j); + for (j, d_j) in d.iter_mut().enumerate().take(i) { + *d_j = V.get(i - 1, j); V.set(i, j, T::zero()); V.set(j, i, T::zero()); } } else { - for k in 0..i { - d[k] /= scale; - h += d[k] * d[k]; + for d_k in d.iter_mut().take(i) { + *d_k /= scale; + h += (*d_k) * (*d_k); } let mut f = d[i - 1]; let mut g = h.sqrt(); @@ -129,8 +129,8 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec e[i] = scale * g; h -= f * g; d[i - 1] = f - g; - for j in 0..i { - e[j] = T::zero(); + for e_j in e.iter_mut().take(i) { + *e_j = T::zero(); } for j in 0..i { @@ -170,16 +170,16 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec V.set(i, i, T::one()); let h = d[i + 1]; if h != T::zero() { - for k in 0..=i { - d[k] = V.get(k, i + 1) / h; + for (k, d_k) in d.iter_mut().enumerate().take(i + 1) { + *d_k = V.get(k, i + 1) / h; } for j in 0..=i { let mut g = T::zero(); for k in 0..=i { g += V.get(k, i + 1) * V.get(k, j); } - for k in 0..=i { - V.sub_element_mut(k, j, g * d[k]); + for (k, d_k) in d.iter().enumerate().take(i + 1) { + V.sub_element_mut(k, j, g * (*d_k)); } } } @@ -187,8 +187,8 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec V.set(k, i + 1, T::zero()); } } - for j in 0..n { - d[j] = V.get(n - 1, j); + for (j, d_j) in d.iter_mut().enumerate().take(n) { + *d_j = V.get(n - 1, j); V.set(n - 1, j, T::zero()); } V.set(n - 1, n - 1, T::one()); @@ -238,8 +238,8 @@ fn tql2>(V: &mut M, d: &mut Vec, e: &mut Vec< d[l + 1] = e[l] * (p + r); let dl1 = d[l + 1]; let mut h = g - d[l]; - for i in l + 2..n { - d[i] -= h; + for d_i in d.iter_mut().take(n).skip(l + 2) { + *d_i -= h; } f += h; @@ -285,10 +285,10 @@ fn tql2>(V: &mut M, d: &mut Vec, e: &mut Vec< for i in 0..n - 1 { let mut k = i; let mut p = d[i]; - for j in i + 1..n { - if d[j] > p { + for (j, d_j) in d.iter().enumerate().take(n).skip(i + 1) { + if *d_j > p { k = j; - p = d[j]; + p = *d_j; } } if k != i { @@ -316,7 +316,7 @@ fn 
balance>(A: &mut M) -> Vec { let mut done = false; while !done { done = true; - for i in 0..n { + for (i, scale_i) in scale.iter_mut().enumerate().take(n) { let mut r = T::zero(); let mut c = T::zero(); for j in 0..n { @@ -341,7 +341,7 @@ fn balance>(A: &mut M) -> Vec { if (c + r) / f < t * s { done = false; g = T::one() / f; - scale[i] *= f; + *scale_i *= f; for j in 0..n { A.mul_element_mut(i, j, g); } @@ -360,7 +360,7 @@ fn elmhes>(A: &mut M) -> Vec { let (n, _) = A.shape(); let mut perm = vec![0; n]; - for m in 1..n - 1 { + for (m, perm_m) in perm.iter_mut().enumerate().take(n - 1).skip(1) { let mut x = T::zero(); let mut i = m; for j in m..n { @@ -369,7 +369,7 @@ fn elmhes>(A: &mut M) -> Vec { i = j; } } - perm[m] = i; + *perm_m = i; if i != m { for j in (m - 1)..n { let swap = A.get(i, j); @@ -402,7 +402,7 @@ fn elmhes>(A: &mut M) -> Vec { perm } -fn eltran>(A: &M, V: &mut M, perm: &Vec) { +fn eltran>(A: &M, V: &mut M, perm: &[usize]) { let (n, _) = A.shape(); for mp in (1..n - 1).rev() { for k in mp + 1..n { @@ -774,11 +774,11 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e } } -fn balbak>(V: &mut M, scale: &Vec) { +fn balbak>(V: &mut M, scale: &[T]) { let (n, _) = V.shape(); - for i in 0..n { + for (i, scale_i) in scale.iter().enumerate().take(n) { for j in 0..n { - V.mul_element_mut(i, j, scale[i]); + V.mul_element_mut(i, j, *scale_i); } } } @@ -789,8 +789,8 @@ fn sort>(d: &mut Vec, e: &mut Vec, V: &mut for j in 1..n { let real = d[j]; let img = e[j]; - for k in 0..n { - temp[k] = V.get(k, j); + for (k, temp_k) in temp.iter_mut().enumerate().take(n) { + *temp_k = V.get(k, j); } let mut i = j as i32 - 1; while i >= 0 { @@ -806,8 +806,8 @@ fn sort>(d: &mut Vec, e: &mut Vec, V: &mut } d[i as usize + 1] = real; e[i as usize + 1] = img; - for k in 0..n { - V.set(k, i as usize + 1, temp[k]); + for (k, temp_k) in temp.iter().enumerate().take(n) { + V.set(k, i as usize + 1, *temp_k); } } } diff --git a/src/linalg/lu.rs b/src/linalg/lu.rs index bfc7fff..6daed69 
100644 --- a/src/linalg/lu.rs +++ b/src/linalg/lu.rs @@ -202,24 +202,21 @@ pub trait LUDecomposableMatrix: BaseMatrix { fn lu_mut(mut self) -> Result, Failed> { let (m, n) = self.shape(); - let mut piv = vec![0; m]; - for i in 0..m { - piv[i] = i; - } + let mut piv = (0..m).collect::>(); let mut pivsign = 1; let mut LUcolj = vec![T::zero(); m]; for j in 0..n { - for i in 0..m { - LUcolj[i] = self.get(i, j); + for (i, LUcolj_i) in LUcolj.iter_mut().enumerate().take(m) { + *LUcolj_i = self.get(i, j); } for i in 0..m { let kmax = usize::min(i, j); let mut s = T::zero(); - for k in 0..kmax { - s += self.get(i, k) * LUcolj[k]; + for (k, LUcolj_k) in LUcolj.iter().enumerate().take(kmax) { + s += self.get(i, k) * (*LUcolj_k); } LUcolj[i] -= s; diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 14e5e62..8c822d2 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1,3 +1,4 @@ +#![allow(clippy::ptr_arg)] use std::fmt; use std::fmt::Debug; use std::marker::PhantomData; @@ -164,8 +165,8 @@ impl BaseVector for Vec { fn sum(&self) -> T { let mut sum = T::zero(); - for i in 0..self.len() { - sum += self[i]; + for self_i in self.iter() { + sum += *self_i; } sum } @@ -239,9 +240,9 @@ impl DenseMatrix { nrows, values: vec![T::zero(); ncols * nrows], }; - for row in 0..nrows { - for col in 0..ncols { - m.set(row, col, values[row][col]); + for (row_index, row) in values.iter().enumerate().take(nrows) { + for (col_index, value) in row.iter().enumerate().take(ncols) { + m.set(row_index, col_index, *value); } } m @@ -259,7 +260,7 @@ impl DenseMatrix { /// * `nrows` - number of rows in new matrix. /// * `ncols` - number of columns in new matrix. /// * `values` - values to initialize the matrix. 
- pub fn from_vec(nrows: usize, ncols: usize, values: &Vec) -> DenseMatrix { + pub fn from_vec(nrows: usize, ncols: usize, values: &[T]) -> DenseMatrix { let mut m = DenseMatrix { ncols, nrows, @@ -543,8 +544,8 @@ impl BaseMatrix for DenseMatrix { fn get_row(&self, row: usize) -> Self::RowVector { let mut v = vec![T::zero(); self.ncols]; - for c in 0..self.ncols { - v[c] = self.get(row, c); + for (c, v_c) in v.iter_mut().enumerate().take(self.ncols) { + *v_c = self.get(row, c); } v @@ -552,29 +553,29 @@ impl BaseMatrix for DenseMatrix { fn get_row_as_vec(&self, row: usize) -> Vec { let mut result = vec![T::zero(); self.ncols]; - for c in 0..self.ncols { - result[c] = self.get(row, c); + for (c, result_c) in result.iter_mut().enumerate().take(self.ncols) { + *result_c = self.get(row, c); } result } fn copy_row_as_vec(&self, row: usize, result: &mut Vec) { - for c in 0..self.ncols { - result[c] = self.get(row, c); + for (c, result_c) in result.iter_mut().enumerate().take(self.ncols) { + *result_c = self.get(row, c); } } fn get_col_as_vec(&self, col: usize) -> Vec { let mut result = vec![T::zero(); self.nrows]; - for r in 0..self.nrows { - result[r] = self.get(r, col); + for (r, result_r) in result.iter_mut().enumerate().take(self.nrows) { + *result_r = self.get(r, col); } result } fn copy_col_as_vec(&self, col: usize, result: &mut Vec) { - for r in 0..self.nrows { - result[r] = self.get(r, col); + for (r, result_r) in result.iter_mut().enumerate().take(self.nrows) { + *result_r = self.get(r, col); } } @@ -836,13 +837,13 @@ impl BaseMatrix for DenseMatrix { let mut mean = vec![T::zero(); self.ncols]; for r in 0..self.nrows { - for c in 0..self.ncols { - mean[c] += self.get(r, c); + for (c, mean_c) in mean.iter_mut().enumerate().take(self.ncols) { + *mean_c += self.get(r, c); } } - for i in 0..mean.len() { - mean[i] /= T::from(self.nrows).unwrap(); + for mean_i in mean.iter_mut() { + *mean_i /= T::from(self.nrows).unwrap(); } mean @@ -989,7 +990,7 @@ impl BaseMatrix 
for DenseMatrix { fn argmax(&self) -> Vec { let mut res = vec![0usize; self.nrows]; - for r in 0..self.nrows { + for (r, res_r) in res.iter_mut().enumerate().take(self.nrows) { let mut max = T::neg_infinity(); let mut max_pos = 0usize; for c in 0..self.ncols { @@ -999,7 +1000,7 @@ impl BaseMatrix for DenseMatrix { max_pos = c; } } - res[r] = max_pos; + *res_r = max_pos; } res diff --git a/src/linalg/qr.rs b/src/linalg/qr.rs index c3a7978..a06a01f 100644 --- a/src/linalg/qr.rs +++ b/src/linalg/qr.rs @@ -44,8 +44,8 @@ pub struct QR> { impl> QR { pub(crate) fn new(QR: M, tau: Vec) -> QR { let mut singular = false; - for j in 0..tau.len() { - if tau[j] == T::zero() { + for tau_elem in tau.iter() { + if *tau_elem == T::zero() { singular = true; break; } @@ -153,7 +153,7 @@ pub trait QRDecomposableMatrix: BaseMatrix { let mut r_diagonal: Vec = vec![T::zero(); n]; - for k in 0..n { + for (k, r_diagonal_k) in r_diagonal.iter_mut().enumerate().take(n) { let mut nrm = T::zero(); for i in k..m { nrm = nrm.hypot(self.get(i, k)); @@ -179,7 +179,7 @@ pub trait QRDecomposableMatrix: BaseMatrix { } } } - r_diagonal[k] = -nrm; + *r_diagonal_k = -nrm; } Ok(QR::new(self, r_diagonal)) diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index fff87c3..45a17af 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -22,14 +22,14 @@ pub trait MatrixStats: BaseMatrix { let div = T::from_usize(m).unwrap(); - for i in 0..n { + for (i, x_i) in x.iter_mut().enumerate().take(n) { for j in 0..m { - x[i] += match axis { + *x_i += match axis { 0 => self.get(j, i), _ => self.get(i, j), }; } - x[i] /= div; + *x_i /= div; } x @@ -49,7 +49,7 @@ pub trait MatrixStats: BaseMatrix { let div = T::from_usize(m).unwrap(); - for i in 0..n { + for (i, x_i) in x.iter_mut().enumerate().take(n) { let mut mu = T::zero(); let mut sum = T::zero(); for j in 0..m { @@ -61,7 +61,7 @@ pub trait MatrixStats: BaseMatrix { sum += a * a; } mu /= div; - x[i] = sum / div - mu * mu; + *x_i = sum / div - mu * mu; } x 
@@ -76,15 +76,15 @@ pub trait MatrixStats: BaseMatrix { _ => self.shape().0, }; - for i in 0..n { - x[i] = x[i].sqrt(); + for x_i in x.iter_mut().take(n) { + *x_i = x_i.sqrt(); } x } /// standardize values by removing the mean and scaling to unit variance - fn scale_mut(&mut self, mean: &Vec, std: &Vec, axis: u8) { + fn scale_mut(&mut self, mean: &[T], std: &[T], axis: u8) { let (n, m) = match axis { 0 => { let (n, m) = self.shape(); diff --git a/src/linalg/svd.rs b/src/linalg/svd.rs index 9271f5b..e370453 100644 --- a/src/linalg/svd.rs +++ b/src/linalg/svd.rs @@ -156,8 +156,8 @@ pub trait SVDDecomposableMatrix: BaseMatrix { let h = f * g - s; U.set(i, l - 1, f - g); - for k in l - 1..n { - rv1[k] = U.get(i, k) / h; + for (k, rv1_k) in rv1.iter_mut().enumerate().take(n).skip(l - 1) { + *rv1_k = U.get(i, k) / h; } for j in l - 1..m { @@ -166,8 +166,8 @@ pub trait SVDDecomposableMatrix: BaseMatrix { s += U.get(j, k) * U.get(i, k); } - for k in l - 1..n { - U.add_element_mut(j, k, s * rv1[k]); + for (k, rv1_k) in rv1.iter().enumerate().take(n).skip(l - 1) { + U.add_element_mut(j, k, s * (*rv1_k)); } } @@ -365,11 +365,11 @@ pub trait SVDDecomposableMatrix: BaseMatrix { inc /= 3; for i in inc..n { let sw = w[i]; - for k in 0..m { - su[k] = U.get(k, i); + for (k, su_k) in su.iter_mut().enumerate().take(m) { + *su_k = U.get(k, i); } - for k in 0..n { - sv[k] = v.get(k, i); + for (k, sv_k) in sv.iter_mut().enumerate().take(n) { + *sv_k = v.get(k, i); } let mut j = i; while w[j - inc] < sw { @@ -386,11 +386,11 @@ pub trait SVDDecomposableMatrix: BaseMatrix { } } w[j] = sw; - for k in 0..m { - U.set(k, j, su[k]); + for (k, su_k) in su.iter().enumerate().take(m) { + U.set(k, j, *su_k); } - for k in 0..n { - v.set(k, j, sv[k]); + for (k, sv_k) in sv.iter().enumerate().take(n) { + v.set(k, j, *sv_k); } } if inc <= 1 { @@ -454,7 +454,7 @@ impl> SVD { for k in 0..p { let mut tmp = vec![T::zero(); self.n]; - for j in 0..self.n { + for (j, tmp_j) in 
tmp.iter_mut().enumerate().take(self.n) { let mut r = T::zero(); if self.s[j] > self.tol { for i in 0..self.m { @@ -462,13 +462,13 @@ impl> SVD { } r /= self.s[j]; } - tmp[j] = r; + *tmp_j = r; } for j in 0..self.n { let mut r = T::zero(); - for jj in 0..self.n { - r += self.V.get(j, jj) * tmp[jj]; + for (jj, tmp_jj) in tmp.iter().enumerate().take(self.n) { + r += self.V.get(j, jj) * (*tmp_jj); } b.set(j, k, r); } diff --git a/src/linear/bg_solver.rs b/src/linear/bg_solver.rs index b299623..46ef13d 100644 --- a/src/linear/bg_solver.rs +++ b/src/linear/bg_solver.rs @@ -85,9 +85,9 @@ pub trait BiconjugateGradientSolver> { let diag = Self::diag(a); let n = diag.len(); - for i in 0..n { - if diag[i] != T::zero() { - x.set(i, 0, b.get(i, 0) / diag[i]); + for (i, diag_i) in diag.iter().enumerate().take(n) { + if *diag_i != T::zero() { + x.set(i, 0, b.get(i, 0) / *diag_i); } else { x.set(i, 0, b.get(i, 0)); } diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 965c1c4..490694c 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -120,14 +120,14 @@ impl> Lasso { let mut w = optimizer.optimize(&scaled_x, y, ¶meters)?; - for j in 0..p { - w.set(j, 0, w.get(j, 0) / col_std[j]); + for (j, col_std_j) in col_std.iter().enumerate().take(p) { + w.set(j, 0, w.get(j, 0) / *col_std_j); } let mut b = T::zero(); - for i in 0..p { - b += w.get(i, 0) * col_mean[i]; + for (i, col_mean_i) in col_mean.iter().enumerate().take(p) { + b += w.get(i, 0) * *col_mean_i; } b = y.mean() - b; @@ -169,8 +169,8 @@ impl> Lasso { let col_mean = x.mean(0); let col_std = x.std(0); - for i in 0..col_std.len() { - if (col_std[i] - T::zero()).abs() < T::epsilon() { + for (i, col_std_i) in col_std.iter().enumerate() { + if (*col_std_i - T::zero()).abs() < T::epsilon() { return Err(Failed::fit(&format!( "Cannot rescale constant column {}", i diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index a3674b3..7b7cab6 100644 --- a/src/linear/logistic_regression.rs 
+++ b/src/linear/logistic_regression.rs @@ -228,9 +228,9 @@ impl> LogisticRegression { let mut yi: Vec = vec![0; y_nrows]; - for i in 0..y_nrows { + for (i, yi_i) in yi.iter_mut().enumerate().take(y_nrows) { let yc = y_m.get(0, i); - yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } match k.cmp(&2) { @@ -291,11 +291,11 @@ impl> LogisticRegression { if self.num_classes == 2 { let y_hat: Vec = x.ab(false, &self.coefficients, true).get_col_as_vec(0); let intercept = self.intercept.get(0, 0); - for i in 0..n { + for (i, y_hat_i) in y_hat.iter().enumerate().take(n) { result.set( 0, i, - self.classes[if (y_hat[i] + intercept).sigmoid() > T::half() { + self.classes[if (*y_hat_i + intercept).sigmoid() > T::half() { 1 } else { 0 @@ -310,8 +310,8 @@ impl> LogisticRegression { } } let class_idxs = y_hat.argmax(); - for i in 0..n { - result.set(0, i, self.classes[class_idxs[i]]); + for (i, class_i) in class_idxs.iter().enumerate().take(n) { + result.set(0, i, self.classes[*class_i]); } } Ok(result.to_row_vector()) diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index bb03c54..98bc639 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -155,14 +155,14 @@ impl> RidgeRegression { RidgeRegressionSolverName::SVD => x_t_x.svd_solve_mut(x_t_y)?, }; - for i in 0..p { - w.set(i, 0, w.get(i, 0) / col_std[i]); + for (i, col_std_i) in col_std.iter().enumerate().take(p) { + w.set(i, 0, w.get(i, 0) / *col_std_i); } let mut b = T::zero(); - for i in 0..p { - b += w.get(i, 0) * col_mean[i]; + for (i, col_mean_i) in col_mean.iter().enumerate().take(p) { + b += w.get(i, 0) * *col_mean_i; } let b = y.mean() - b; @@ -196,8 +196,8 @@ impl> RidgeRegression { let col_mean = x.mean(0); let col_std = x.std(0); - for i in 0..col_std.len() { - if (col_std[i] - T::zero()).abs() < T::epsilon() { + for (i, col_std_i) in col_std.iter().enumerate() { + if (*col_std_i - 
T::zero()).abs() < T::epsilon() { return Err(Failed::fit(&format!( "Cannot rescale constant column {}", i diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 31503bd..e292f9c 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -30,7 +30,7 @@ pub struct Euclidian {} impl Euclidian { #[inline] - pub(crate) fn squared_distance(x: &Vec, y: &Vec) -> T { + pub(crate) fn squared_distance(x: &[T], y: &[T]) -> T { if x.len() != y.len() { panic!("Input vector sizes are different."); } diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index 571dd49..0f8d56a 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -68,8 +68,8 @@ impl AUC { j += 1; } let r = T::from_usize(i + 1 + j).unwrap() / T::two(); - for k in i..j { - rank[k] = r; + for rank_k in rank.iter_mut().take(j).skip(i) { + *rank_k = r; } i = j - 1; } diff --git a/src/metrics/cluster_helpers.rs b/src/metrics/cluster_helpers.rs index 8d1e17e..a8fa7e5 100644 --- a/src/metrics/cluster_helpers.rs +++ b/src/metrics/cluster_helpers.rs @@ -1,3 +1,4 @@ +#![allow(clippy::ptr_arg)] use std::collections::HashMap; use crate::math::num::RealNumber; @@ -23,7 +24,7 @@ pub fn contingency_matrix( contingency_matrix } -pub fn entropy(data: &Vec) -> Option { +pub fn entropy(data: &[T]) -> Option { let mut bincounts = HashMap::with_capacity(data.len()); for e in data.iter() { @@ -44,17 +45,17 @@ pub fn entropy(data: &Vec) -> Option { Some(entropy) } -pub fn mutual_info_score(contingency: &Vec>) -> T { +pub fn mutual_info_score(contingency: &[Vec]) -> T { let mut contingency_sum = 0; let mut pi = vec![0; contingency.len()]; let mut pj = vec![0; contingency[0].len()]; let (mut nzx, mut nzy, mut nz_val) = (Vec::new(), Vec::new(), Vec::new()); for r in 0..contingency.len() { - for c in 0..contingency[0].len() { + for (c, pj_c) in pj.iter_mut().enumerate().take(contingency[0].len()) { contingency_sum += contingency[r][c]; pi[r] += contingency[r][c]; - pj[c] += 
contingency[r][c]; + *pj_c += contingency[r][c]; if contingency[r][c] > 0 { nzx.push(r); nzy.push(c); diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index b066b30..bc0f9b8 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -44,10 +44,10 @@ pub fn train_test_split>( let mut n_test = 0; let mut index = vec![false; n]; - for i in 0..n { + for index_i in index.iter_mut().take(n) { let p_test: f32 = rng.gen(); if p_test <= test_size { - index[i] = true; + *index_i = true; n_test += 1; } } @@ -62,8 +62,8 @@ pub fn train_test_split>( let mut r_train = 0; let mut r_test = 0; - for r in 0..n { - if index[r] { + for (r, index_r) in index.iter().enumerate().take(n) { + if *index_r { //sample belongs to test for c in 0..m { x_test.set(r_test, c, x.get(r, c)); @@ -133,8 +133,8 @@ impl BaseKFold for KFold { let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; // increment by one if odd - for i in 0..(n_samples % self.n_splits) { - fold_sizes[i] += 1; + for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { + *fold_size += 1; } // generate the right array of arrays for test indices diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 535b5ee..057b447 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -134,8 +134,8 @@ impl BernoulliNBDistribution { let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices) { - for idx in 0..n_features { - feature_in_class_counter[class_index][idx] += row[idx]; + for (idx, row_i) in row.iter().enumerate().take(n_features) { + feature_in_class_counter[class_index][idx] += *row_i; } } diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 8e7e37c..af5732d 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -119,9 +119,9 @@ impl GaussianNBDistribution { .into_iter() .map(|v| { let mut m = 
M::zeros(v.len(), n_features); - for row in 0..v.len() { - for col in 0..n_features { - m.set(row, col, v[row][col]); + for (row_i, v_i) in v.iter().enumerate() { + for (col_j, v_i_j) in v_i.iter().enumerate().take(n_features) { + m.set(row_i, col_j, *v_i_j); } } m diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index a70fd2d..be8a7da 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -122,8 +122,8 @@ impl MultinomialNBDistribution { let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices) { - for idx in 0..n_features { - feature_in_class_counter[class_index][idx] += row[idx]; + for (idx, row_i) in row.iter().enumerate().take(n_features) { + feature_in_class_counter[class_index][idx] += *row_i; } } diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 135594a..f940211 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -119,9 +119,9 @@ impl, T>> KNNClassifier { let mut yi: Vec = vec![0; y_n]; let classes = y_m.unique(); - for i in 0..y_n { + for (i, yi_i) in yi.iter_mut().enumerate().take(y_n) { let yc = y_m.get(0, i); - yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } if x_n != y_n { diff --git a/src/optimization/line_search.rs b/src/optimization/line_search.rs index e6a3b80..99457c9 100644 --- a/src/optimization/line_search.rs +++ b/src/optimization/line_search.rs @@ -41,7 +41,7 @@ impl Default for Backtracking { } impl LineSearchMethod for Backtracking { - fn search<'a>( + fn search( &self, f: &(dyn Fn(T) -> T), _: &(dyn Fn(T) -> T), diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 9fe1b1a..371bc4e 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -187,42 +187,42 @@ impl Node { struct 
NodeVisitor<'a, T: RealNumber, M: Matrix> { x: &'a M, - y: &'a Vec, + y: &'a [usize], node: usize, samples: Vec, - order: &'a Vec>, + order: &'a [Vec], true_child_output: usize, false_child_output: usize, level: u16, phantom: PhantomData<&'a T>, } -fn impurity(criterion: &SplitCriterion, count: &Vec, n: usize) -> T { +fn impurity(criterion: &SplitCriterion, count: &[usize], n: usize) -> T { let mut impurity = T::zero(); match criterion { SplitCriterion::Gini => { impurity = T::one(); - for i in 0..count.len() { - if count[i] > 0 { - let p = T::from(count[i]).unwrap() / T::from(n).unwrap(); + for count_i in count.iter() { + if *count_i > 0 { + let p = T::from(*count_i).unwrap() / T::from(n).unwrap(); impurity -= p * p; } } } SplitCriterion::Entropy => { - for i in 0..count.len() { - if count[i] > 0 { - let p = T::from(count[i]).unwrap() / T::from(n).unwrap(); + for count_i in count.iter() { + if *count_i > 0 { + let p = T::from(*count_i).unwrap() / T::from(n).unwrap(); impurity -= p * p.log2(); } } } SplitCriterion::ClassificationError => { - for i in 0..count.len() { - if count[i] > 0 { - impurity = impurity.max(T::from(count[i]).unwrap() / T::from(n).unwrap()); + for count_i in count.iter() { + if *count_i > 0 { + impurity = impurity.max(T::from(*count_i).unwrap() / T::from(n).unwrap()); } } impurity = (T::one() - impurity).abs(); @@ -236,9 +236,9 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { fn new( node_id: usize, samples: Vec, - order: &'a Vec>, + order: &'a [Vec], x: &'a M, - y: &'a Vec, + y: &'a [usize], level: u16, ) -> Self { NodeVisitor { @@ -255,13 +255,13 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } -pub(in crate) fn which_max(x: &Vec) -> usize { +pub(in crate) fn which_max(x: &[usize]) -> usize { let mut m = x[0]; let mut which = 0; - for i in 1..x.len() { - if x[i] > m { - m = x[i]; + for (i, x_i) in x.iter().enumerate().skip(1) { + if *x_i > m { + m = *x_i; which = i; } } @@ -304,9 +304,9 @@ impl 
DecisionTreeClassifier { let mut yi: Vec = vec![0; y_ncols]; - for i in 0..y_ncols { + for (i, yi_i) in yi.iter_mut().enumerate().take(y_ncols) { let yc = y_m.get(0, i); - yi[i] = classes.iter().position(|c| yc == *c).unwrap(); + *yi_i = classes.iter().position(|c| yc == *c).unwrap(); } let mut nodes: Vec> = Vec::new(); @@ -431,23 +431,20 @@ impl DecisionTreeClassifier { let parent_impurity = impurity(&self.parameters.criterion, &count, n); - let mut variables = vec![0; n_attr]; - for i in 0..n_attr { - variables[i] = i; - } + let mut variables = (0..n_attr).collect::>(); if mtry < n_attr { variables.shuffle(&mut rand::thread_rng()); } - for j in 0..mtry { + for variable in variables.iter().take(mtry) { self.find_best_split( visitor, n, &count, &mut false_count, parent_impurity, - variables[j], + *variable, ); } @@ -458,7 +455,7 @@ impl DecisionTreeClassifier { &mut self, visitor: &mut NodeVisitor<'_, T, M>, n: usize, - count: &Vec, + count: &[usize], false_count: &mut Vec, parent_impurity: T, j: usize, @@ -527,13 +524,13 @@ impl DecisionTreeClassifier { let mut fc = 0; let mut true_samples: Vec = vec![0; n]; - for i in 0..n { + for (i, true_sample) in true_samples.iter_mut().enumerate().take(n) { if visitor.samples[i] > 0 { if visitor.x.get(i, self.nodes[visitor.node].split_feature) <= self.nodes[visitor.node].split_value.unwrap_or_else(T::nan) { - true_samples[i] = visitor.samples[i]; - tc += true_samples[i]; + *true_sample = visitor.samples[i]; + tc += *true_sample; visitor.samples[i] = 0; } else { fc += visitor.samples[i]; diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index c30c9e2..5e80b4c 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -161,7 +161,7 @@ struct NodeVisitor<'a, T: RealNumber, M: Matrix> { y: &'a M, node: usize, samples: Vec, - order: &'a Vec>, + order: &'a [Vec], true_child_output: T, false_child_output: T, level: u16, @@ -171,7 +171,7 @@ impl<'a, T: 
RealNumber, M: Matrix> NodeVisitor<'a, T, M> { fn new( node_id: usize, samples: Vec, - order: &'a Vec>, + order: &'a [Vec], x: &'a M, y: &'a M, level: u16, @@ -219,9 +219,9 @@ impl DecisionTreeRegressor { let mut n = 0; let mut sum = T::zero(); - for i in 0..y_ncols { - n += samples[i]; - sum += T::from(samples[i]).unwrap() * y_m.get(0, i); + for (i, sample_i) in samples.iter().enumerate().take(y_ncols) { + n += *sample_i; + sum += T::from(*sample_i).unwrap() * y_m.get(0, i); } let root = Node::new(0, sum / T::from(n).unwrap()); @@ -312,10 +312,7 @@ impl DecisionTreeRegressor { let sum = self.nodes[visitor.node].output * T::from(n).unwrap(); - let mut variables = vec![0; n_attr]; - for i in 0..n_attr { - variables[i] = i; - } + let mut variables = (0..n_attr).collect::>(); if mtry < n_attr { variables.shuffle(&mut rand::thread_rng()); @@ -324,8 +321,8 @@ impl DecisionTreeRegressor { let parent_gain = T::from(n).unwrap() * self.nodes[visitor.node].output * self.nodes[visitor.node].output; - for j in 0..mtry { - self.find_best_split(visitor, n, sum, parent_gain, variables[j]); + for variable in variables.iter().take(mtry) { + self.find_best_split(visitor, n, sum, parent_gain, *variable); } self.nodes[visitor.node].split_score != Option::None @@ -399,13 +396,13 @@ impl DecisionTreeRegressor { let mut fc = 0; let mut true_samples: Vec = vec![0; n]; - for i in 0..n { + for (i, true_sample) in true_samples.iter_mut().enumerate().take(n) { if visitor.samples[i] > 0 { if visitor.x.get(i, self.nodes[visitor.node].split_feature) <= self.nodes[visitor.node].split_value.unwrap_or_else(T::nan) { - true_samples[i] = visitor.samples[i]; - tc += true_samples[i]; + *true_sample = visitor.samples[i]; + tc += *true_sample; visitor.samples[i] = 0; } else { fc += visitor.samples[i]; From 78673b597fb02d825b882a650eac881c4c7dc916 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 11 Dec 2020 18:55:07 -0800 Subject: [PATCH 61/79] feat: adds elastic net --- src/linalg/mod.rs | 3 + 
src/linalg/naive/dense_matrix.rs | 14 ++ src/linalg/nalgebra_bindings.rs | 14 ++ src/linalg/ndarray_bindings.rs | 14 ++ src/linear/elasticnet.rs | 335 +++++++++++++++++++++++++++++++ src/linear/lasso.rs | 247 +---------------------- src/linear/lasso_optimizer.rs | 255 +++++++++++++++++++++++ src/linear/mod.rs | 2 + 8 files changed, 647 insertions(+), 237 deletions(-) create mode 100644 src/linear/elasticnet.rs create mode 100644 src/linear/lasso_optimizer.rs diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index d3fb635..c768cbf 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -271,6 +271,9 @@ pub trait BaseVector: Clone + Debug { fn std(&self) -> T { self.var().sqrt() } + + /// Copies content of `other` vector. + fn copy_from(&mut self, other: &Self); } /// Generic matrix type. diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 14e5e62..fd049ed 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -176,6 +176,20 @@ impl BaseVector for Vec { result.dedup(); result } + + fn copy_from(&mut self, other: &Self) { + if self.len() != other.len() { + panic!( + "Can't copy vector of length {} into a vector of length {}.", + self.len(), + other.len() + ); + } + + for i in 0..self.len() { + self[i] = other[i]; + } + } } /// Column-major, dense matrix. See [Simple Dense Matrix](../index.html). 
diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index ad2d4a2..b976fbd 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -181,6 +181,10 @@ impl BaseVector for MatrixMN { result.dedup(); result } + + fn copy_from(&mut self, other: &Self) { + Matrix::copy_from(self, other); + } } impl @@ -575,6 +579,16 @@ mod tests { use crate::linear::linear_regression::*; use nalgebra::{DMatrix, Matrix2x3, RowDVector}; + #[test] + fn vec_copy_from() { + let mut v1 = RowDVector::from_vec(vec![1., 2., 3.]); + let mut v2 = RowDVector::from_vec(vec![4., 5., 6.]); + v1.copy_from(&v2); + assert_eq!(v2, v1); + v2[0] = 10.0; + assert_ne!(v2, v1); + } + #[test] fn vec_len() { let v = RowDVector::from_vec(vec![1., 2., 3.]); diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 3f0478f..eb50f01 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -176,6 +176,10 @@ impl BaseVector for ArrayBase, Ix result.dedup(); result } + + fn copy_from(&mut self, other: &Self) { + self.assign(&other); + } } impl @@ -537,6 +541,16 @@ mod tests { assert_eq!(5., BaseVector::get(&result, 1)); } + #[test] + fn vec_copy_from() { + let mut v1 = arr1(&[1., 2., 3.]); + let mut v2 = arr1(&[4., 5., 6.]); + v1.copy_from(&v2); + assert_eq!(v1, v2); + v2[0] = 10.0; + assert_ne!(v1, v2); + } + #[test] fn vec_len() { let v = arr1(&[1., 2., 3.]); diff --git a/src/linear/elasticnet.rs b/src/linear/elasticnet.rs new file mode 100644 index 0000000..7b6acb1 --- /dev/null +++ b/src/linear/elasticnet.rs @@ -0,0 +1,335 @@ +//! # Elastic Net +//! +//! +//! ## References: +//! +//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 6.2. Shrinkage Methods](http://faculty.marshall.usc.edu/gareth-james/ISL/) +//! 
* ["Regularization and variable selection via the elastic net", Hui Zou and Trevor Hastie](https://web.stanford.edu/~hastie/Papers/B67.2%20(2005)%20301-320%20Zou%20&%20Hastie.pdf) +//! +//! +//! +use std::fmt::Debug; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +use crate::linear::lasso_optimizer::InteriorPointOptimizer; + +/// Ridge Regression parameters +#[derive(Serialize, Deserialize, Debug)] +pub struct ElasticNetParameters { + pub alpha: T, + pub l1_ratio: T, + pub normalize: bool, + pub tol: T, + pub max_iter: usize, +} + +/// Ridge regression +#[derive(Serialize, Deserialize, Debug)] +pub struct ElasticNet> { + coefficients: M, + intercept: T, +} + +impl Default for ElasticNetParameters { + fn default() -> Self { + ElasticNetParameters { + alpha: T::one(), + l1_ratio: T::half(), + normalize: true, + tol: T::from_f64(1e-4).unwrap(), + max_iter: 1000, + } + } +} + +impl> PartialEq for ElasticNet { + fn eq(&self, other: &Self) -> bool { + self.coefficients == other.coefficients + && (self.intercept - other.intercept).abs() <= T::epsilon() + } +} + +impl> ElasticNet { + /// Fits ridge regression to your data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target values + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit( + x: &M, + y: &M::RowVector, + parameters: ElasticNetParameters, + ) -> Result, Failed> { + let (n, p) = x.shape(); + + if y.len() != n { + return Err(Failed::fit("Number of rows in X should = len(y)")); + } + + let n_float = T::from_usize(n).unwrap(); + + let l1_reg = parameters.alpha * parameters.l1_ratio * n_float; + let l2_reg = parameters.alpha * (T::one() - parameters.l1_ratio) * n_float; + + let y_mean = y.mean(); + + let (w, b) = if parameters.normalize { + let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; + + let (x, y, gamma) = Self::augment_X_and_y(&scaled_x, y, l2_reg); + + let mut optimizer = InteriorPointOptimizer::new(&x, p); + + let mut w = + optimizer.optimize(&x, &y, l1_reg * gamma, parameters.max_iter, parameters.tol)?; + + for i in 0..p { + w.set(i, 0, gamma * w.get(i, 0) / col_std[i]); + } + + let mut b = T::zero(); + + for i in 0..p { + b += w.get(i, 0) * col_mean[i]; + } + + b = y_mean - b; + + (w, b) + } else { + let (x, y, gamma) = Self::augment_X_and_y(x, y, l2_reg); + + let mut optimizer = InteriorPointOptimizer::new(&x, p); + + let mut w = + optimizer.optimize(&x, &y, l1_reg * gamma, parameters.max_iter, parameters.tol)?; + + for i in 0..p { + w.set(i, 0, gamma * w.get(i, 0)); + } + + (w, y_mean) + }; + + Ok(ElasticNet { + intercept: b, + coefficients: w, + }) + } + + /// Predict target values from `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. 
+ pub fn predict(&self, x: &M) -> Result { + let (nrows, _) = x.shape(); + let mut y_hat = x.matmul(&self.coefficients); + y_hat.add_mut(&M::fill(nrows, 1, self.intercept)); + Ok(y_hat.transpose().to_row_vector()) + } + + /// Get estimates regression coefficients + pub fn coefficients(&self) -> &M { + &self.coefficients + } + + /// Get estimate of intercept + pub fn intercept(&self) -> T { + self.intercept + } + + fn rescale_x(x: &M) -> Result<(M, Vec, Vec), Failed> { + let col_mean = x.mean(0); + let col_std = x.std(0); + + for i in 0..col_std.len() { + if (col_std[i] - T::zero()).abs() < T::epsilon() { + return Err(Failed::fit(&format!( + "Cannot rescale constant column {}", + i + ))); + } + } + + let mut scaled_x = x.clone(); + scaled_x.scale_mut(&col_mean, &col_std, 0); + Ok((scaled_x, col_mean, col_std)) + } + + fn augment_X_and_y(x: &M, y: &M::RowVector, l2_reg: T) -> (M, M::RowVector, T) { + let (n, p) = x.shape(); + + let gamma = T::one() / (T::one() + l2_reg).sqrt(); + let padding = gamma * l2_reg.sqrt(); + + let mut y2 = M::RowVector::zeros(n + p); + for i in 0..y.len() { + y2.set(i, y.get(i)); + } + + let mut x2 = M::zeros(n + p, p); + + for j in 0..p { + for i in 0..n { + x2.set(i, j, gamma * x.get(i, j)); + } + + x2.set(j + n, j, padding); + } + + (x2, y2, gamma) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + use crate::metrics::mean_absolute_error; + + #[test] + fn elasticnet_longley() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 
282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y: Vec = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let y_hat = ElasticNet::fit( + &x, + &y, + ElasticNetParameters { + alpha: 1.0, + l1_ratio: 0.5, + normalize: false, + tol: 1e-4, + max_iter: 1000, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat, &y) < 30.0); + } + + #[test] + fn elasticnet_fit_predict1() { + let x = DenseMatrix::from_2d_array(&[ + &[0.0, 1931.0, 1.2232755825400514], + &[1.0, 1933.0, 1.1379726120972395], + &[2.0, 1920.0, 1.4366265120543429], + &[3.0, 1918.0, 1.206005737827858], + &[4.0, 1934.0, 1.436613542400669], + &[5.0, 1918.0, 1.1594588621640636], + &[6.0, 1933.0, 1.19809994745985], + &[7.0, 1918.0, 1.3396363871645678], + &[8.0, 1931.0, 1.2535342096493207], + &[9.0, 1933.0, 1.3101281563456293], + &[10.0, 1922.0, 1.3585833349920762], + &[11.0, 1930.0, 1.4830786699709897], + &[12.0, 1916.0, 1.4919891143094546], + &[13.0, 1915.0, 1.259655137451551], + &[14.0, 1932.0, 1.3979191428724789], + &[15.0, 1917.0, 1.3686634746782371], + &[16.0, 1932.0, 1.381658454569724], + &[17.0, 1918.0, 1.4054969025700674], + &[18.0, 1929.0, 1.3271699396384906], + &[19.0, 1915.0, 1.1373332337674806], + ]); + + let y: Vec = vec![ + 1.48, 2.72, 4.52, 5.72, 5.25, 4.07, 3.75, 4.75, 6.77, 4.72, 6.78, 6.79, 8.3, 7.42, + 10.2, 7.92, 7.62, 8.06, 9.06, 9.29, + ]; + + let l1_model = ElasticNet::fit( + &x, + &y, + ElasticNetParameters { + alpha: 1.0, + l1_ratio: 1.0, + normalize: true, + tol: 1e-4, + max_iter: 1000, + }, + ) + .unwrap(); + + let l2_model = ElasticNet::fit( + &x, + &y, + 
ElasticNetParameters { + alpha: 1.0, + l1_ratio: 0.0, + normalize: true, + tol: 1e-4, + max_iter: 1000, + }, + ) + .unwrap(); + + let mae_l1 = mean_absolute_error(&l1_model.predict(&x).unwrap(), &y); + let mae_l2 = mean_absolute_error(&l2_model.predict(&x).unwrap(), &y); + + assert!(mae_l1 < 2.0); + assert!(mae_l2 < 2.0); + + assert!(l1_model.coefficients().get(0, 0) > l1_model.coefficients().get(1, 0)); + assert!(l1_model.coefficients().get(0, 0) > l1_model.coefficients().get(2, 0)); + } + + #[test] + fn serde() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let lr = ElasticNet::fit(&x, &y, Default::default()).unwrap(); + + let deserialized_lr: ElasticNet> = + serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); + + assert_eq!(lr, deserialized_lr); + } +} diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 965c1c4..b2c81d1 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -29,7 +29,7 @@ use serde::{Deserialize, Serialize}; use crate::error::Failed; use 
crate::linalg::BaseVector; use crate::linalg::Matrix; -use crate::linear::bg_solver::BiconjugateGradientSolver; +use crate::linear::lasso_optimizer::InteriorPointOptimizer; use crate::math::num::RealNumber; /// Lasso regression parameters @@ -53,14 +53,6 @@ pub struct Lasso> { intercept: T, } -struct InteriorPointOptimizer> { - ata: M, - d1: Vec, - d2: Vec, - prb: Vec, - prs: Vec, -} - impl Default for LassoParameters { fn default() -> Self { LassoParameters { @@ -118,7 +110,13 @@ impl> Lasso { let mut optimizer = InteriorPointOptimizer::new(&scaled_x, p); - let mut w = optimizer.optimize(&scaled_x, y, ¶meters)?; + let mut w = optimizer.optimize( + &scaled_x, + y, + parameters.alpha, + parameters.max_iter, + parameters.tol, + )?; for j in 0..p { w.set(j, 0, w.get(j, 0) / col_std[j]); @@ -135,7 +133,8 @@ impl> Lasso { } else { let mut optimizer = InteriorPointOptimizer::new(x, p); - let w = optimizer.optimize(x, y, ¶meters)?; + let w = + optimizer.optimize(x, y, parameters.alpha, parameters.max_iter, parameters.tol)?; (w, y.mean()) }; @@ -184,232 +183,6 @@ impl> Lasso { } } -impl> InteriorPointOptimizer { - fn new(a: &M, n: usize) -> InteriorPointOptimizer { - InteriorPointOptimizer { - ata: a.ab(true, a, false), - d1: vec![T::zero(); n], - d2: vec![T::zero(); n], - prb: vec![T::zero(); n], - prs: vec![T::zero(); n], - } - } - - fn optimize( - &mut self, - x: &M, - y: &M::RowVector, - parameters: &LassoParameters, - ) -> Result { - let (n, p) = x.shape(); - let p_f64 = T::from_usize(p).unwrap(); - - //parameters - let pcgmaxi = 5000; - let min_pcgtol = T::from_f64(0.1).unwrap(); - let eta = T::from_f64(1E-3).unwrap(); - let alpha = T::from_f64(0.01).unwrap(); - let beta = T::from_f64(0.5).unwrap(); - let gamma = T::from_f64(-0.25).unwrap(); - let mu = T::two(); - - let y = M::from_row_vector(y.sub_scalar(y.mean())).transpose(); - - let mut max_ls_iter = 100; - let mut pitr = 0; - let mut w = M::zeros(p, 1); - let mut neww = w.clone(); - let mut u = M::ones(p, 1); - 
let mut newu = u.clone(); - - let mut f = M::fill(p, 2, -T::one()); - let mut newf = f.clone(); - - let mut q1 = vec![T::zero(); p]; - let mut q2 = vec![T::zero(); p]; - - let mut dx = M::zeros(p, 1); - let mut du = M::zeros(p, 1); - let mut dxu = M::zeros(2 * p, 1); - let mut grad = M::zeros(2 * p, 1); - - let mut nu = M::zeros(n, 1); - let mut dobj = T::zero(); - let mut s = T::infinity(); - let mut t = T::one() - .max(T::one() / parameters.alpha) - .min(T::two() * p_f64 / T::from(1e-3).unwrap()); - - for ntiter in 0..parameters.max_iter { - let mut z = x.matmul(&w); - - for i in 0..n { - z.set(i, 0, z.get(i, 0) - y.get(i, 0)); - nu.set(i, 0, T::two() * z.get(i, 0)); - } - - // CALCULATE DUALITY GAP - let xnu = x.ab(true, &nu, false); - let max_xnu = xnu.norm(T::infinity()); - if max_xnu > parameters.alpha { - let lnu = parameters.alpha / max_xnu; - nu.mul_scalar_mut(lnu); - } - - let pobj = z.dot(&z) + parameters.alpha * w.norm(T::one()); - dobj = dobj.max(gamma * nu.dot(&nu) - nu.dot(&y)); - - let gap = pobj - dobj; - - // STOPPING CRITERION - if gap / dobj < parameters.tol { - break; - } - - // UPDATE t - if s >= T::half() { - t = t.max((T::two() * p_f64 * mu / gap).min(mu * t)); - } - - // CALCULATE NEWTON STEP - for i in 0..p { - let q1i = T::one() / (u.get(i, 0) + w.get(i, 0)); - let q2i = T::one() / (u.get(i, 0) - w.get(i, 0)); - q1[i] = q1i; - q2[i] = q2i; - self.d1[i] = (q1i * q1i + q2i * q2i) / t; - self.d2[i] = (q1i * q1i - q2i * q2i) / t; - } - - let mut gradphi = x.ab(true, &z, false); - - for i in 0..p { - let g1 = T::two() * gradphi.get(i, 0) - (q1[i] - q2[i]) / t; - let g2 = parameters.alpha - (q1[i] + q2[i]) / t; - gradphi.set(i, 0, g1); - grad.set(i, 0, -g1); - grad.set(i + p, 0, -g2); - } - - for i in 0..p { - self.prb[i] = T::two() + self.d1[i]; - self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i] * self.d2[i]; - } - - let normg = grad.norm2(); - let mut pcgtol = min_pcgtol.min(eta * gap / T::one().min(normg)); - if ntiter != 0 && pitr == 0 { 
- pcgtol *= min_pcgtol; - } - - let error = self.solve_mut(x, &grad, &mut dxu, pcgtol, pcgmaxi)?; - if error > pcgtol { - pitr = pcgmaxi; - } - - for i in 0..p { - dx.set(i, 0, dxu.get(i, 0)); - du.set(i, 0, dxu.get(i + p, 0)); - } - - // BACKTRACKING LINE SEARCH - let phi = z.dot(&z) + parameters.alpha * u.sum() - Self::sumlogneg(&f) / t; - s = T::one(); - let gdx = grad.dot(&dxu); - - let lsiter = 0; - while lsiter < max_ls_iter { - for i in 0..p { - neww.set(i, 0, w.get(i, 0) + s * dx.get(i, 0)); - newu.set(i, 0, u.get(i, 0) + s * du.get(i, 0)); - newf.set(i, 0, neww.get(i, 0) - newu.get(i, 0)); - newf.set(i, 1, -neww.get(i, 0) - newu.get(i, 0)); - } - - if newf.max() < T::zero() { - let mut newz = x.matmul(&neww); - for i in 0..n { - newz.set(i, 0, newz.get(i, 0) - y.get(i, 0)); - } - - let newphi = newz.dot(&newz) + parameters.alpha * newu.sum() - - Self::sumlogneg(&newf) / t; - if newphi - phi <= alpha * s * gdx { - break; - } - } - s = beta * s; - max_ls_iter += 1; - } - - if lsiter == max_ls_iter { - return Err(Failed::fit( - "Exceeded maximum number of iteration for interior point optimizer", - )); - } - - w.copy_from(&neww); - u.copy_from(&newu); - f.copy_from(&newf); - } - - Ok(w) - } - - fn sumlogneg(f: &M) -> T { - let (n, _) = f.shape(); - let mut sum = T::zero(); - for i in 0..n { - sum += (-f.get(i, 0)).ln(); - sum += (-f.get(i, 1)).ln(); - } - sum - } -} - -impl<'a, T: RealNumber, M: Matrix> BiconjugateGradientSolver - for InteriorPointOptimizer -{ - fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { - let (_, p) = a.shape(); - - for i in 0..p { - x.set( - i, - 0, - (self.d1[i] * b.get(i, 0) - self.d2[i] * b.get(i + p, 0)) / self.prs[i], - ); - x.set( - i + p, - 0, - (-self.d2[i] * b.get(i, 0) + self.prb[i] * b.get(i + p, 0)) / self.prs[i], - ); - } - } - - fn mat_vec_mul(&self, _: &M, x: &M, y: &mut M) { - let (_, p) = self.ata.shape(); - let atax = self.ata.matmul(&x.slice(0..p, 0..1)); - - for i in 0..p { - y.set( - i, - 0, - T::two() * 
atax.get(i, 0) + self.d1[i] * x.get(i, 0) + self.d2[i] * x.get(i + p, 0), - ); - y.set( - i + p, - 0, - self.d2[i] * x.get(i, 0) + self.d1[i] * x.get(i + p, 0), - ); - } - } - - fn mat_t_vec_mul(&self, a: &M, x: &M, y: &mut M) { - self.mat_vec_mul(a, x, y); - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/src/linear/lasso_optimizer.rs b/src/linear/lasso_optimizer.rs new file mode 100644 index 0000000..4f5011f --- /dev/null +++ b/src/linear/lasso_optimizer.rs @@ -0,0 +1,255 @@ +//! An Interior-Point Method for Large-Scale l1-Regularized Least Squares +//! +//! This is a specialized interior-point method for solving large-scale 1-regularized LSPs that uses the +//! preconditioned conjugate gradients algorithm to compute the search direction. +//! +//! The interior-point method can solve large sparse problems, with a million variables and observations, in a few tens of minutes on a PC. +//! It can efficiently solve large dense problems, that arise in sparse signal recovery with orthogonal transforms, by exploiting fast algorithms for these transforms. +//! +//! ## References: +//! * ["An Interior-Point Method for Large-Scale l1-Regularized Least Squares", K. Koh, M. Lustig, S. Boyd, D. Gorinevsky](https://web.stanford.edu/~boyd/papers/pdf/l1_ls.pdf) +//! * [Simple Matlab Solver for l1-regularized Least Squares Problems](https://web.stanford.edu/~boyd/l1_ls/) +//! 
+ +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::linear::bg_solver::BiconjugateGradientSolver; +use crate::math::num::RealNumber; + +pub struct InteriorPointOptimizer> { + ata: M, + d1: Vec, + d2: Vec, + prb: Vec, + prs: Vec, +} + +impl> InteriorPointOptimizer { + pub fn new(a: &M, n: usize) -> InteriorPointOptimizer { + InteriorPointOptimizer { + ata: a.ab(true, a, false), + d1: vec![T::zero(); n], + d2: vec![T::zero(); n], + prb: vec![T::zero(); n], + prs: vec![T::zero(); n], + } + } + + pub fn optimize( + &mut self, + x: &M, + y: &M::RowVector, + lambda: T, + max_iter: usize, + tol: T, + ) -> Result { + let (n, p) = x.shape(); + let p_f64 = T::from_usize(p).unwrap(); + + let lambda = lambda.max(T::epsilon()); + + //parameters + let pcgmaxi = 5000; + let min_pcgtol = T::from_f64(0.1).unwrap(); + let eta = T::from_f64(1E-3).unwrap(); + let alpha = T::from_f64(0.01).unwrap(); + let beta = T::from_f64(0.5).unwrap(); + let gamma = T::from_f64(-0.25).unwrap(); + let mu = T::two(); + + let y = M::from_row_vector(y.sub_scalar(y.mean())).transpose(); + + let mut max_ls_iter = 100; + let mut pitr = 0; + let mut w = M::zeros(p, 1); + let mut neww = w.clone(); + let mut u = M::ones(p, 1); + let mut newu = u.clone(); + + let mut f = M::fill(p, 2, -T::one()); + let mut newf = f.clone(); + + let mut q1 = vec![T::zero(); p]; + let mut q2 = vec![T::zero(); p]; + + let mut dx = M::zeros(p, 1); + let mut du = M::zeros(p, 1); + let mut dxu = M::zeros(2 * p, 1); + let mut grad = M::zeros(2 * p, 1); + + let mut nu = M::zeros(n, 1); + let mut dobj = T::zero(); + let mut s = T::infinity(); + let mut t = T::one() + .max(T::one() / lambda) + .min(T::two() * p_f64 / T::from(1e-3).unwrap()); + + for ntiter in 0..max_iter { + let mut z = x.matmul(&w); + + for i in 0..n { + z.set(i, 0, z.get(i, 0) - y.get(i, 0)); + nu.set(i, 0, T::two() * z.get(i, 0)); + } + + // CALCULATE DUALITY GAP + let xnu = x.ab(true, &nu, false); + let max_xnu = 
xnu.norm(T::infinity()); + if max_xnu > lambda { + let lnu = lambda / max_xnu; + nu.mul_scalar_mut(lnu); + } + + let pobj = z.dot(&z) + lambda * w.norm(T::one()); + dobj = dobj.max(gamma * nu.dot(&nu) - nu.dot(&y)); + + let gap = pobj - dobj; + + // STOPPING CRITERION + if gap / dobj < tol { + break; + } + + // UPDATE t + if s >= T::half() { + t = t.max((T::two() * p_f64 * mu / gap).min(mu * t)); + } + + // CALCULATE NEWTON STEP + for i in 0..p { + let q1i = T::one() / (u.get(i, 0) + w.get(i, 0)); + let q2i = T::one() / (u.get(i, 0) - w.get(i, 0)); + q1[i] = q1i; + q2[i] = q2i; + self.d1[i] = (q1i * q1i + q2i * q2i) / t; + self.d2[i] = (q1i * q1i - q2i * q2i) / t; + } + + let mut gradphi = x.ab(true, &z, false); + + for i in 0..p { + let g1 = T::two() * gradphi.get(i, 0) - (q1[i] - q2[i]) / t; + let g2 = lambda - (q1[i] + q2[i]) / t; + gradphi.set(i, 0, g1); + grad.set(i, 0, -g1); + grad.set(i + p, 0, -g2); + } + + for i in 0..p { + self.prb[i] = T::two() + self.d1[i]; + self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i] * self.d2[i]; + } + + let normg = grad.norm2(); + let mut pcgtol = min_pcgtol.min(eta * gap / T::one().min(normg)); + if ntiter != 0 && pitr == 0 { + pcgtol *= min_pcgtol; + } + + let error = self.solve_mut(x, &grad, &mut dxu, pcgtol, pcgmaxi)?; + if error > pcgtol { + pitr = pcgmaxi; + } + + for i in 0..p { + dx.set(i, 0, dxu.get(i, 0)); + du.set(i, 0, dxu.get(i + p, 0)); + } + + // BACKTRACKING LINE SEARCH + let phi = z.dot(&z) + lambda * u.sum() - Self::sumlogneg(&f) / t; + s = T::one(); + let gdx = grad.dot(&dxu); + + let lsiter = 0; + while lsiter < max_ls_iter { + for i in 0..p { + neww.set(i, 0, w.get(i, 0) + s * dx.get(i, 0)); + newu.set(i, 0, u.get(i, 0) + s * du.get(i, 0)); + newf.set(i, 0, neww.get(i, 0) - newu.get(i, 0)); + newf.set(i, 1, -neww.get(i, 0) - newu.get(i, 0)); + } + + if newf.max() < T::zero() { + let mut newz = x.matmul(&neww); + for i in 0..n { + newz.set(i, 0, newz.get(i, 0) - y.get(i, 0)); + } + + let newphi = 
newz.dot(&newz) + lambda * newu.sum() - Self::sumlogneg(&newf) / t; + if newphi - phi <= alpha * s * gdx { + break; + } + } + s = beta * s; + max_ls_iter += 1; + } + + if lsiter == max_ls_iter { + return Err(Failed::fit( + "Exceeded maximum number of iteration for interior point optimizer", + )); + } + + w.copy_from(&neww); + u.copy_from(&newu); + f.copy_from(&newf); + } + + Ok(w) + } + + fn sumlogneg(f: &M) -> T { + let (n, _) = f.shape(); + let mut sum = T::zero(); + for i in 0..n { + sum += (-f.get(i, 0)).ln(); + sum += (-f.get(i, 1)).ln(); + } + sum + } +} + +impl<'a, T: RealNumber, M: Matrix> BiconjugateGradientSolver + for InteriorPointOptimizer +{ + fn solve_preconditioner(&self, a: &M, b: &M, x: &mut M) { + let (_, p) = a.shape(); + + for i in 0..p { + x.set( + i, + 0, + (self.d1[i] * b.get(i, 0) - self.d2[i] * b.get(i + p, 0)) / self.prs[i], + ); + x.set( + i + p, + 0, + (-self.d2[i] * b.get(i, 0) + self.prb[i] * b.get(i + p, 0)) / self.prs[i], + ); + } + } + + fn mat_vec_mul(&self, _: &M, x: &M, y: &mut M) { + let (_, p) = self.ata.shape(); + let atax = self.ata.matmul(&x.slice(0..p, 0..1)); + + for i in 0..p { + y.set( + i, + 0, + T::two() * atax.get(i, 0) + self.d1[i] * x.get(i, 0) + self.d2[i] * x.get(i + p, 0), + ); + y.set( + i + p, + 0, + self.d2[i] * x.get(i, 0) + self.d1[i] * x.get(i + p, 0), + ); + } + } + + fn mat_t_vec_mul(&self, a: &M, x: &M, y: &mut M) { + self.mat_vec_mul(a, x, y); + } +} diff --git a/src/linear/mod.rs b/src/linear/mod.rs index edaea4f..8c056e8 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -21,7 +21,9 @@ //! 
pub(crate) mod bg_solver; +pub mod elasticnet; pub mod lasso; +pub(crate) mod lasso_optimizer; pub mod linear_regression; pub mod logistic_regression; pub mod ridge_regression; From cceb2f046d112094dd149985e1d482da40b1b194 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sun, 13 Dec 2020 13:35:14 -0800 Subject: [PATCH 62/79] feat: lasso documentation --- src/linalg/naive/dense_matrix.rs | 24 ++++++-- src/linear/{elasticnet.rs => elastic_net.rs} | 65 ++++++++++++++++++-- src/linear/lasso.rs | 29 +++------ src/linear/mod.rs | 2 +- 4 files changed, 86 insertions(+), 34 deletions(-) rename src/linear/{elasticnet.rs => elastic_net.rs} (74%) diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 400366d..a0b7bdb 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -187,9 +187,7 @@ impl BaseVector for Vec { ); } - for i in 0..self.len() { - self[i] = other[i]; - } + self[..].clone_from_slice(&other[..]); } } @@ -929,9 +927,7 @@ impl BaseMatrix for DenseMatrix { ); } - for i in 0..self.values.len() { - self.values[i] = other.values[i]; - } + self.values[..].clone_from_slice(&other.values[..]); } fn abs_mut(&mut self) -> &Self { @@ -1066,6 +1062,14 @@ mod tests { assert_eq!(32.0, BaseVector::dot(&v1, &v2)); } + #[test] + fn vec_copy_from() { + let mut v1 = vec![1., 2., 3.]; + let v2 = vec![4., 5., 6.]; + v1.copy_from(&v2); + assert_eq!(v1, v2); + } + #[test] fn vec_approximate_eq() { let a = vec![1., 2., 3.]; @@ -1199,6 +1203,14 @@ mod tests { assert_eq!(a.dot(&b), 32.); } + #[test] + fn copy_from() { + let mut a = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.]]); + let b = DenseMatrix::from_2d_array(&[&[7., 8.], &[9., 10.], &[11., 12.]]); + a.copy_from(&b); + assert_eq!(a, b); + } + #[test] fn slice() { let m = DenseMatrix::from_2d_array(&[ diff --git a/src/linear/elasticnet.rs b/src/linear/elastic_net.rs similarity index 74% rename from src/linear/elasticnet.rs rename to 
src/linear/elastic_net.rs index 7b6acb1..c01f3c7 100644 --- a/src/linear/elasticnet.rs +++ b/src/linear/elastic_net.rs @@ -1,5 +1,51 @@ +#![allow(clippy::needless_range_loop)] //! # Elastic Net //! +//! Elastic net is an extension of [linear regression](../linear_regression/index.html) that adds regularization penalties to the loss function during training. +//! Just like in ordinary linear regression you assume a linear relationship between input variables and the target variable. +//! Unlike linear regression elastic net adds regularization penalties to the loss function during training. +//! In particular, the elastic net coefficient estimates \\(\beta\\) are the values that minimize +//! +//! \\[L(\alpha, \beta) = \vert \boldsymbol{y} - \boldsymbol{X}\beta\vert^2 + \lambda_1 \vert \beta \vert^2 + \lambda_2 \vert \beta \vert_1\\] +//! +//! where \\(\lambda_1 = \\alpha l_{1r}\\), \\(\lambda_2 = \\alpha (1 - l_{1r})\\) and \\(l_{1r}\\) is the l1 ratio, elastic net mixing parameter. +//! +//! In essense, elastic net combines both the [L1](../lasso/index.html) and [L2](../ridge_regression/index.html) penalties during training, +//! which can result in better performance than a model with either one or the other penalty on some problems. +//! The elastic net is particularly useful when the number of predictors (p) is much bigger than the number of observations (n). +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::linear::elastic_net::*; +//! +//! // Longley dataset (https://www.statsmodels.org/stable/datasets/generated/longley.html) +//! let x = DenseMatrix::from_2d_array(&[ +//! &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], +//! &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], +//! &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], +//! &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], +//! &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], +//! &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], +//! 
&[365.385, 187.0, 354.7, 115.094, 1953., 64.989], +//! &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], +//! &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], +//! &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], +//! &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], +//! &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], +//! &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], +//! &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], +//! &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], +//! &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], +//! ]); +//! +//! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, +//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; +//! +//! let y_hat = ElasticNet::fit(&x, &y, Default::default()). +//! and_then(|lr| lr.predict(&x)).unwrap(); +//! ``` //! //! ## References: //! @@ -19,17 +65,24 @@ use crate::math::num::RealNumber; use crate::linear::lasso_optimizer::InteriorPointOptimizer; -/// Ridge Regression parameters +/// Elastic net parameters #[derive(Serialize, Deserialize, Debug)] pub struct ElasticNetParameters { + /// Regularization parameter. pub alpha: T, + /// The elastic net mixing parameter, with 0 <= l1_ratio <= 1. + /// For l1_ratio = 0 the penalty is an L2 penalty. + /// For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. pub l1_ratio: T, + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. pub normalize: bool, + /// The tolerance for the optimization pub tol: T, + /// The maximum number of iterations pub max_iter: usize, } -/// Ridge regression +/// Elastic net #[derive(Serialize, Deserialize, Debug)] pub struct ElasticNet> { coefficients: M, @@ -56,7 +109,7 @@ impl> PartialEq for ElasticNet { } impl> ElasticNet { - /// Fits ridge regression to your data. + /// Fits elastic net regression to your data. 
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target values /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. @@ -81,7 +134,7 @@ impl> ElasticNet { let (w, b) = if parameters.normalize { let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; - let (x, y, gamma) = Self::augment_X_and_y(&scaled_x, y, l2_reg); + let (x, y, gamma) = Self::augment_x_and_y(&scaled_x, y, l2_reg); let mut optimizer = InteriorPointOptimizer::new(&x, p); @@ -102,7 +155,7 @@ impl> ElasticNet { (w, b) } else { - let (x, y, gamma) = Self::augment_X_and_y(x, y, l2_reg); + let (x, y, gamma) = Self::augment_x_and_y(x, y, l2_reg); let mut optimizer = InteriorPointOptimizer::new(&x, p); @@ -159,7 +212,7 @@ impl> ElasticNet { Ok((scaled_x, col_mean, col_std)) } - fn augment_X_and_y(x: &M, y: &M::RowVector, l2_reg: T) -> (M, M::RowVector, T) { + fn augment_x_and_y(x: &M, y: &M::RowVector, l2_reg: T) -> (M, M::RowVector, T) { let (n, p) = x.shape(); let gamma = T::one() / (T::one() + l2_reg).sqrt(); diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index bb9e69c..7395bdc 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -105,18 +105,15 @@ impl> Lasso { return Err(Failed::fit("Number of rows in X should = len(y)")); } + let l1_reg = parameters.alpha * T::from_usize(n).unwrap(); + let (w, b) = if parameters.normalize { let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; let mut optimizer = InteriorPointOptimizer::new(&scaled_x, p); - let mut w = optimizer.optimize( - &scaled_x, - y, - parameters.alpha, - parameters.max_iter, - parameters.tol, - )?; + let mut w = + optimizer.optimize(&scaled_x, y, l1_reg, parameters.max_iter, parameters.tol)?; for (j, col_std_j) in col_std.iter().enumerate().take(p) { w.set(j, 0, w.get(j, 0) / *col_std_j); @@ -133,8 +130,7 @@ impl> Lasso { } else { let mut optimizer = InteriorPointOptimizer::new(x, p); - let w = - optimizer.optimize(x, y, 
parameters.alpha, parameters.max_iter, parameters.tol)?; + let w = optimizer.optimize(x, y, l1_reg, parameters.max_iter, parameters.tol)?; (w, y.mean()) }; @@ -215,18 +211,9 @@ mod tests { 114.2, 115.7, 116.9, ]; - let y_hat = Lasso::fit( - &x, - &y, - LassoParameters { - alpha: 0.1, - normalize: true, - tol: 1e-4, - max_iter: 1000, - }, - ) - .and_then(|lr| lr.predict(&x)) - .unwrap(); + let y_hat = Lasso::fit(&x, &y, Default::default()) + .and_then(|lr| lr.predict(&x)) + .unwrap(); assert!(mean_absolute_error(&y_hat, &y) < 2.0); diff --git a/src/linear/mod.rs b/src/linear/mod.rs index 8c056e8..3824d36 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -21,7 +21,7 @@ //! pub(crate) mod bg_solver; -pub mod elasticnet; +pub mod elastic_net; pub mod lasso; pub(crate) mod lasso_optimizer; pub mod linear_regression; From 74a7c45c75313045c0547ef5271e604c5995fb89 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 14 Dec 2020 14:59:02 -0800 Subject: [PATCH 63/79] feat: adds SVD --- src/decomposition/mod.rs | 1 + src/decomposition/pca.rs | 28 +++++ src/decomposition/svd.rs | 235 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+) create mode 100644 src/decomposition/svd.rs diff --git a/src/decomposition/mod.rs b/src/decomposition/mod.rs index a01c114..1460bd6 100644 --- a/src/decomposition/mod.rs +++ b/src/decomposition/mod.rs @@ -13,3 +13,4 @@ /// PCA is a popular approach for deriving a low-dimensional set of features from a large set of variables. 
pub mod pca; +pub mod svd; diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 9f5bd39..7d80f88 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -108,6 +108,13 @@ impl> PCA { ) -> Result, Failed> { let (m, n) = data.shape(); + if n_components > n { + return Err(Failed::fit(&format!( + "Number of components, n_components should be <= number of attributes ({})", + n + ))); + } + let mu = data.column_mean(); let mut x = data.clone(); @@ -224,6 +231,11 @@ impl> PCA { } Ok(x_transformed) } + + /// Get a projection matrix + pub fn components(&self) -> &M { + &self.projection + } } #[cfg(test)] @@ -286,6 +298,22 @@ mod tests { ]) } + #[test] + fn pca_components() { + let us_arrests = us_arrests_data(); + + let expected = DenseMatrix::from_2d_array(&[ + &[0.0417, 0.0448], + &[0.9952, 0.0588], + &[0.0463, 0.9769], + &[0.0752, 0.2007], + ]); + + let pca = PCA::fit(&us_arrests, 2, Default::default()).unwrap(); + + assert!(expected.approximate_eq(&pca.components().abs(), 0.4)); + } + #[test] fn decompose_covariance() { let us_arrests = us_arrests_data(); diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs new file mode 100644 index 0000000..fbaf042 --- /dev/null +++ b/src/decomposition/svd.rs @@ -0,0 +1,235 @@ +//! # Dimensionality reduction using SVD +//! +//! Similar to [`PCA`](../pca/index.html), SVD is a technique that can be used to reduce the number of input variables _p_ to a smaller number _k_, while preserving +//! the most important structure or relationships between the variables observed in the data. +//! +//! Contrary to PCA, SVD does not center the data before computing the singular value decomposition. +//! +//! Example: +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::decomposition::svd::*; +//! +//! // Iris data +//! let iris = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! 
&[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! +//! let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); // Reduce number of features to 2 +//! +//! let iris_reduced = svd.transform(&iris).unwrap(); +//! +//! ``` +//! +//! +//! +use std::fmt::Debug; +use std::marker::PhantomData; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +/// SVD +#[derive(Serialize, Deserialize, Debug)] +pub struct SVD> { + components: M, + phantom: PhantomData, +} + +impl> PartialEq for SVD { + fn eq(&self, other: &Self) -> bool { + self.components + .approximate_eq(&other.components, T::from_f64(1e-8).unwrap()) + } +} + +#[derive(Debug, Clone)] +/// SVD parameters +pub struct SVDParameters {} + +impl Default for SVDParameters { + fn default() -> Self { + SVDParameters {} + } +} + +impl> SVD { + /// Fits SVD to your data. + /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `n_components` - number of components to keep. + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit(x: &M, n_components: usize, _: SVDParameters) -> Result, Failed> { + let (_, p) = x.shape(); + + if n_components >= p { + return Err(Failed::fit(&format!( + "Number of components, n_components should be < number of attributes ({})", + p + ))); + } + + let svd = x.svd()?; + + let components = svd.V.slice(0..p, 0..n_components); + + Ok(SVD { + components, + phantom: PhantomData, + }) + } + + /// Run dimensionality reduction for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn transform(&self, x: &M) -> Result { + let (n, p) = x.shape(); + let (p_c, k) = self.components.shape(); + if p_c != p { + return Err(Failed::transform(&format!( + "Can not transform a {}x{} matrix into {}x{} matrix, incorrect input dimentions", + n, p, n, k + ))); + } + + Ok(x.matmul(&self.components)) + } + + /// Get a projection matrix + pub fn components(&self) -> &M { + &self.components + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + + #[test] + fn svd_decompose() { + // https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/USArrests.html + let x = DenseMatrix::from_2d_array(&[ + &[13.2, 236.0, 58.0, 21.2], + &[10.0, 263.0, 48.0, 44.5], + &[8.1, 294.0, 80.0, 31.0], + &[8.8, 190.0, 50.0, 19.5], + &[9.0, 276.0, 91.0, 40.6], + &[7.9, 204.0, 78.0, 38.7], + &[3.3, 110.0, 77.0, 11.1], + &[5.9, 238.0, 72.0, 15.8], + &[15.4, 335.0, 80.0, 31.9], + &[17.4, 211.0, 60.0, 25.8], + &[5.3, 46.0, 83.0, 20.2], + &[2.6, 120.0, 54.0, 14.2], + &[10.4, 249.0, 83.0, 24.0], + &[7.2, 113.0, 65.0, 21.0], + &[2.2, 56.0, 57.0, 11.3], + &[6.0, 115.0, 66.0, 18.0], + &[9.7, 109.0, 52.0, 16.3], + &[15.4, 249.0, 66.0, 22.2], + &[2.1, 83.0, 51.0, 7.8], + &[11.3, 300.0, 67.0, 27.8], + &[4.4, 149.0, 85.0, 16.3], + &[12.1, 255.0, 74.0, 35.1], + &[2.7, 72.0, 66.0, 14.9], + &[16.1, 259.0, 44.0, 17.1], + &[9.0, 178.0, 70.0, 28.2], + &[6.0, 109.0, 53.0, 16.4], + &[4.3, 102.0, 62.0, 16.5], + &[12.2, 252.0, 
81.0, 46.0], + &[2.1, 57.0, 56.0, 9.5], + &[7.4, 159.0, 89.0, 18.8], + &[11.4, 285.0, 70.0, 32.1], + &[11.1, 254.0, 86.0, 26.1], + &[13.0, 337.0, 45.0, 16.1], + &[0.8, 45.0, 44.0, 7.3], + &[7.3, 120.0, 75.0, 21.4], + &[6.6, 151.0, 68.0, 20.0], + &[4.9, 159.0, 67.0, 29.3], + &[6.3, 106.0, 72.0, 14.9], + &[3.4, 174.0, 87.0, 8.3], + &[14.4, 279.0, 48.0, 22.5], + &[3.8, 86.0, 45.0, 12.8], + &[13.2, 188.0, 59.0, 26.9], + &[12.7, 201.0, 80.0, 25.5], + &[3.2, 120.0, 80.0, 22.9], + &[2.2, 48.0, 32.0, 11.2], + &[8.5, 156.0, 63.0, 20.7], + &[4.0, 145.0, 73.0, 26.2], + &[5.7, 81.0, 39.0, 9.3], + &[2.6, 53.0, 66.0, 10.8], + &[6.8, 161.0, 60.0, 15.6], + ]); + + let expected = DenseMatrix::from_2d_array(&[ + &[243.54655757, -18.76673788], + &[268.36802004, -33.79304302], + &[305.93972467, -15.39087376], + &[197.28420365, -11.66808306], + &[293.43187394, 1.91163633], + ]); + let svd = SVD::fit(&x, 2, Default::default()).unwrap(); + + let x_transformed = svd.transform(&x).unwrap(); + + assert_eq!(svd.components.shape(), (x.shape().1, 2)); + + assert!(x_transformed + .slice(0..5, 0..2) + .approximate_eq(&expected, 1e-4)); + } + + #[test] + fn serde() { + let iris = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + + let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); + + let deserialized_svd: SVD> = + serde_json::from_str(&serde_json::to_string(&svd).unwrap()).unwrap(); + + assert_eq!(svd, deserialized_svd); + } +} From d39b04e549fdf077825ee77205773bc7d044373b Mon Sep 17 00:00:00 2001 From: 
Volodymyr Orlov Date: Mon, 14 Dec 2020 15:03:10 -0800 Subject: [PATCH 64/79] fix: fmt --- src/decomposition/svd.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index fbaf042..eea1969 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -2,7 +2,7 @@ //! //! Similar to [`PCA`](../pca/index.html), SVD is a technique that can be used to reduce the number of input variables _p_ to a smaller number _k_, while preserving //! the most important structure or relationships between the variables observed in the data. -//! +//! //! Contrary to PCA, SVD does not center the data before computing the singular value decomposition. //! //! Example: From 505f495445e1c51fc66ea670cc59744ba57fc49d Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 16 Dec 2020 00:20:07 -0400 Subject: [PATCH 65/79] fix: Update ndarray version --- Cargo.toml | 2 +- src/linalg/ndarray_bindings.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1503957..32d8695 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ nalgebra-bindings = ["nalgebra"] datasets = [] [dependencies] -ndarray = { version = "0.13", optional = true } +ndarray = { version = "0.14", optional = true } nalgebra = { version = "0.23.0", optional = true } num-traits = "0.2.12" num = "0.3.0" diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 3f0478f..b80fac8 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -47,7 +47,7 @@ use std::ops::Range; use std::ops::SubAssign; use ndarray::ScalarOperand; -use ndarray::{s, stack, Array, ArrayBase, Axis, Ix1, Ix2, OwnedRepr}; +use ndarray::{concatenate, s, Array, ArrayBase, Axis, Ix1, Ix2, OwnedRepr}; use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; @@ -246,11 +246,11 @@ impl Self { - stack(Axis(1), &[self.view(), 
other.view()]).unwrap() + concatenate(Axis(1), &[self.view(), other.view()]).unwrap() } fn v_stack(&self, other: &Self) -> Self { - stack(Axis(0), &[self.view(), other.view()]).unwrap() + concatenate(Axis(0), &[self.view(), other.view()]).unwrap() } fn matmul(&self, other: &Self) -> Self { From f76a1d142007a15bcc0e272acba4b29976dcda4c Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 17 Dec 2020 13:01:45 -0800 Subject: [PATCH 66/79] feat: makes smartcore::error:FailedError non-exhaustive --- src/error/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/error/mod.rs b/src/error/mod.rs index 1615290..2409889 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -12,6 +12,7 @@ pub struct Failed { } /// Type of error +#[non_exhaustive] #[derive(Copy, Clone, Debug, Serialize, Deserialize)] pub enum FailedError { /// Can't fit algorithm to data From 5a185479a7edd93574f835ca23e9c2acf9420747 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 17 Dec 2020 19:00:11 -0800 Subject: [PATCH 67/79] feat: NB documentation --- src/naive_bayes/bernoulli.rs | 35 ++++++++++++++++++++++++ src/naive_bayes/categorical.rs | 32 ++++++++++++++++++++++ src/naive_bayes/gaussian.rs | 24 ++++++++++++++++ src/naive_bayes/mod.rs | 50 ++++++++++++++++++++++++++++------ src/naive_bayes/multinomial.rs | 35 ++++++++++++++++++++++++ src/svm/svc.rs | 1 - 6 files changed, 167 insertions(+), 10 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 057b447..c478d58 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -1,3 +1,38 @@ +//! # Bernoulli Naive Bayes +//! +//! Bernoulli Naive Bayes classifier is a variant of [Naive Bayes](../index.html) for the data that is distributed according to multivariate Bernoulli distribution. +//! It is used for discrete data with binary features. One example of a binary feature is a word that occurs in the text or not. +//! +//! Example: +//! +//! ``` +//! 
use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::naive_bayes::bernoulli::BernoulliNB; +//! +//! // Training data points are: +//! // Chinese Beijing Chinese (class: China) +//! // Chinese Chinese Shanghai (class: China) +//! // Chinese Macao (class: China) +//! // Tokyo Japan Chinese (class: Japan) +//! let x = DenseMatrix::::from_2d_array(&[ +//! &[1., 1., 0., 0., 0., 0.], +//! &[0., 1., 0., 0., 1., 0.], +//! &[0., 1., 0., 1., 0., 0.], +//! &[0., 1., 1., 0., 0., 1.], +//! ]); +//! let y = vec![0., 0., 0., 1.]; +//! +//! let nb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); +//! +//! // Testing data point is: +//! // Chinese Chinese Chinese Tokyo Japan +//! let x_test = DenseMatrix::::from_2d_array(&[&[0., 1., 1., 0., 0., 1.]]); +//! let y_hat = nb.predict(&x_test).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index d32c34d..d6b24a2 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -1,3 +1,35 @@ +//! # Categorical Naive Bayes +//! +//! Categorical Naive Bayes is a variant of [Naive Bayes](../index.html) for the categorically distributed data. +//! It assumes that each feature has its own categorical distribution. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::naive_bayes::categorical::CategoricalNB; +//! +//! let x = DenseMatrix::from_2d_array(&[ +//! &[3., 4., 0., 1.], +//! &[3., 0., 0., 1.], +//! &[4., 4., 1., 2.], +//! &[4., 2., 4., 3.], +//! &[4., 2., 4., 2.], +//! &[4., 1., 1., 0.], +//! &[1., 1., 1., 1.], +//! &[0., 4., 1., 0.], +//! &[0., 3., 2., 1.], +//! &[0., 3., 1., 1.], +//! &[3., 4., 0., 1.], +//! 
&[3., 4., 2., 4.], +//! &[0., 3., 1., 2.], +//! &[0., 4., 1., 2.], +//! ]); +//! let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; +//! +//! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); +//! let y_hat = nb.predict(&x).unwrap(); +//! ``` use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index af5732d..fc11b49 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -1,3 +1,27 @@ +//! # Gaussian Naive Bayes +//! +//! Gaussian Naive Bayes is a variant of [Naive Bayes](../index.html) for the data that follows Gaussian distribution and +//! it supports continuous valued features conforming to a normal distribution. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::naive_bayes::gaussian::GaussianNB; +//! +//! let x = DenseMatrix::from_2d_array(&[ +//! &[-1., -1.], +//! &[-2., -1.], +//! &[-3., -2.], +//! &[ 1., 1.], +//! &[ 2., 1.], +//! &[ 3., 2.], +//! ]); +//! let y = vec![1., 1., 1., 2., 2., 2.]; +//! +//! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); +//! let y_hat = nb.predict(&x).unwrap(); +//! ``` use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 508b976..7ab8b85 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -1,3 +1,40 @@ +//! # Naive Bayes +//! +//! Naive Bayes (NB) is a simple but powerful machine learning algorithm. +//! Naive Bayes classifier is based on Bayes’ Theorem with an assumption of conditional independence +//! between every pair of features given the value of the class variable. +//! +//! Bayes’ theorem can be written as +//! +//! \\[ P(y | X) = \frac{P(y)P(X| y)}{P(X)} \\] +//! +//! where +//! +//! * \\(X = (x_1,...x_n)\\) represents the predictors. +//! 
* \\(P(y | X)\\) is the probability of class _y_ given the data X +//! * \\(P(X| y)\\) is the probability of data X given the class _y_. +//! * \\(P(y)\\) is the probability of class y. This is called the prior probability of y. +//! * \\(P(X)\\) is the probability of the data (regardless of the class value). +//! +//! The naive conditional independence assumption lets us rewrite this equation as +//! +//! \\[ P(y | x_1,...x_n) = \frac{P(y)\prod_{i=1}^nP(x_i|y)}{P(x_1,...x_n)} \\] +//! +//! +//! The denominator can be removed since \\(P(x_1,...x_n)\\) is constant for all the entries in the dataset. +//! +//! \\[ P(y | x_1,...x_n) \propto P(y)\prod_{i=1}^nP(x_i|y) \\] +//! +//! To find class y from predictors X we use this equation +//! +//! \\[ y = \underset{y}{argmax} P(y)\prod_{i=1}^nP(x_i|y) \\] +//! +//! ## References: +//! +//! * ["Machine Learning: A Probabilistic Perspective", Kevin P. Murphy, 2012, Chapter 3 ](https://mitpress.mit.edu/books/machine-learning-1) +//! +//! +//! use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -64,12 +101,7 @@ impl, D: NBDistribution> BaseNaiveBayes::from_2d_array(&[ +//! &[1., 2., 0., 0., 0., 0.], +//! &[0., 2., 0., 0., 1., 0.], +//! &[0., 1., 0., 1., 0., 0.], +//! &[0., 1., 1., 0., 0., 1.], +//! ]); +//! let y = vec![0., 0., 0., 1.]; +//! let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); +//! +//! // Testing data point is: +//! // Chinese Chinese Chinese Tokyo Japan +//! let x_test = DenseMatrix::::from_2d_array(&[&[0., 3., 1., 0., 0., 1.]]); +//! let y_hat = nb.predict(&x_test).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["Introduction to Information Retrieval", Manning C. 
D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 4fd70df..9e166d5 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -28,7 +28,6 @@ //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; -//! use smartcore::linear::linear_regression::*; //! use smartcore::svm::Kernels; //! use smartcore::svm::svc::{SVC, SVCParameters}; //! From 8ca13a76d699f577b9746376f878cf8d23ec59e1 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 17 Dec 2020 19:11:47 -0800 Subject: [PATCH 68/79] fix: criterion --- benches/naive_bayes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/naive_bayes.rs b/benches/naive_bayes.rs index 2a4595b..ba8cb6f 100644 --- a/benches/naive_bayes.rs +++ b/benches/naive_bayes.rs @@ -6,7 +6,7 @@ use ndarray::Array2; use smartcore::linalg::naive::dense_matrix::DenseMatrix; use smartcore::linalg::BaseMatrix; use smartcore::linalg::BaseVector; -use smartcore::naive_bayes::GaussianNB; +use smartcore::naive_bayes::gaussian::GaussianNB; pub fn gaussian_naive_bayes_fit_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("GaussianNB::fit"); From c9eb94ba939cbf9a9987a8bd1332568c9b49a0b5 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 16 Dec 2020 20:11:09 -0400 Subject: [PATCH 69/79] Derive clone for NB Parameters --- src/naive_bayes/bernoulli.rs | 2 +- src/naive_bayes/categorical.rs | 2 +- src/naive_bayes/gaussian.rs | 2 +- src/naive_bayes/multinomial.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index c478d58..dd34ae9 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -76,7 +76,7 @@ impl> NBDistribution for BernoulliNBDistributi } /// `BernoulliNB` parameters. 
Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct BernoulliNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index d6b24a2..c4626ef 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -215,7 +215,7 @@ impl CategoricalNBDistribution { } /// `CategoricalNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct CategoricalNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index fc11b49..c5c1fb2 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -74,7 +74,7 @@ impl> NBDistribution for GaussianNBDistributio } /// `GaussianNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Default)] +#[derive(Serialize, Deserialize, Debug, Default, Clone)] pub struct GaussianNBParameters { /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data pub priors: Option>, diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 0fb7aa4..c9ac86b 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -72,7 +72,7 @@ impl> NBDistribution for MultinomialNBDistribu } /// `MultinomialNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct MultinomialNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). 
pub alpha: T, From a2be9e117f96e173c9aaaa0be0e6f79cdac719ac Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 15:41:53 -0800 Subject: [PATCH 70/79] feat: + cross_validate, trait Predictor, refactoring --- src/algorithm/neighbour/cover_tree.rs | 3 +- src/algorithm/neighbour/linear_search.rs | 2 + src/base.rs | 10 + src/ensemble/random_forest_classifier.rs | 9 +- src/ensemble/random_forest_regressor.rs | 7 + src/lib.rs | 3 +- src/linalg/mod.rs | 72 ++++ src/linalg/ndarray_bindings.rs | 4 +- src/linear/elastic_net.rs | 9 +- src/linear/lasso.rs | 9 +- src/linear/linear_regression.rs | 11 +- src/linear/logistic_regression.rs | 34 +- src/linear/ridge_regression.rs | 11 +- src/math/distance/euclidian.rs | 2 +- src/math/distance/hamming.rs | 2 +- src/math/distance/mahalanobis.rs | 2 +- src/math/distance/manhattan.rs | 2 +- src/math/distance/minkowski.rs | 2 +- src/math/distance/mod.rs | 2 +- src/metrics/mod.rs | 2 +- src/model_selection/kfold.rs | 286 ++++++++++++++ src/model_selection/mod.rs | 473 ++++++++++++----------- src/naive_bayes/bernoulli.rs | 7 + src/naive_bayes/categorical.rs | 7 + src/naive_bayes/gaussian.rs | 7 + src/naive_bayes/multinomial.rs | 7 + src/neighbors/knn_classifier.rs | 79 +++- src/neighbors/knn_regressor.rs | 80 +++- src/neighbors/mod.rs | 2 +- src/svm/mod.rs | 5 +- src/svm/svc.rs | 94 +++-- src/svm/svr.rs | 83 ++-- src/tree/decision_tree_classifier.rs | 9 +- src/tree/decision_tree_regressor.rs | 9 +- 34 files changed, 977 insertions(+), 369 deletions(-) create mode 100644 src/base.rs create mode 100644 src/model_selection/kfold.rs diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 2fe7792..d271ed6 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -6,6 +6,7 @@ //! use smartcore::algorithm::neighbour::cover_tree::*; //! use smartcore::math::distance::Distance; //! +//! #[derive(Clone)] //! 
struct SimpleDistance {} // Our distance function //! //! impl Distance for SimpleDistance { @@ -453,7 +454,7 @@ mod tests { use super::*; use crate::math::distance::Distances; - #[derive(Debug, Serialize, Deserialize)] + #[derive(Debug, Serialize, Deserialize, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index d09f2ed..45fbd6f 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -5,6 +5,7 @@ //! use smartcore::algorithm::neighbour::linear_search::*; //! use smartcore::math::distance::Distance; //! +//! #[derive(Clone)] //! struct SimpleDistance {} // Our distance function //! //! impl Distance for SimpleDistance { @@ -137,6 +138,7 @@ mod tests { use super::*; use crate::math::distance::Distances; + #[derive(Debug, Serialize, Deserialize, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/base.rs b/src/base.rs new file mode 100644 index 0000000..a2d4468 --- /dev/null +++ b/src/base.rs @@ -0,0 +1,10 @@ +//! # Common Interfaces and methods +//! +//! This module consolidates interfaces and uniform basic API that is used elsewhere in the code. + +use crate::error::Failed; + +/// Implements method predict that offers a way to estimate target value from new data +pub trait Predictor { + fn predict(&self, x: &X) -> Result; +} diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 7229d92..a742d90 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -9,7 +9,7 @@ //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; -//! use smartcore::ensemble::random_forest_classifier::*; +//! use smartcore::ensemble::random_forest_classifier::RandomForestClassifier; //! //! // Iris dataset //! 
let x = DenseMatrix::from_2d_array(&[ @@ -51,6 +51,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -117,6 +118,12 @@ impl Default for RandomForestClassifierParameters { } } +impl> Predictor for RandomForestClassifier { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl RandomForestClassifier { /// Build a forest of trees from the training set. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 36fa096..52b39f9 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -49,6 +49,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -106,6 +107,12 @@ impl PartialEq for RandomForestRegressor { } } +impl> Predictor for RandomForestRegressor { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl RandomForestRegressor { /// Build a forest of trees from the training set. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/lib.rs b/src/lib.rs index 9290c86..a1608c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,7 +63,7 @@ //! let y = vec![2., 2., 2., 3., 3.]; //! //! // Train classifier -//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); //! //! // Predict classes //! 
let y_hat = knn.predict(&x).unwrap(); @@ -71,6 +71,7 @@ /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; +pub(crate) mod base; /// Algorithms for clustering of unlabeled data pub mod cluster; /// Various datasets diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index c768cbf..5b49942 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -274,6 +274,19 @@ pub trait BaseVector: Clone + Debug { /// Copies content of `other` vector. fn copy_from(&mut self, other: &Self); + + /// Take elements from an array. + fn take(&self, index: &[usize]) -> Self { + let n = index.len(); + + let mut result = Self::zeros(n); + + for i in 0..n { + result.set(i, self.get(index[i])); + } + + result + } } /// Generic matrix type. @@ -611,6 +624,32 @@ pub trait BaseMatrix: Clone + Debug { /// Calculates the covariance matrix fn cov(&self) -> Self; + + /// Take elements from an array along an axis. + fn take(&self, index: &[usize], axis: u8) -> Self { + let (n, p) = self.shape(); + + let k = match axis { + 0 => p, + _ => n, + }; + + let mut result = match axis { + 0 => Self::zeros(index.len(), p), + _ => Self::zeros(n, index.len()), + }; + + for i in 0..index.len() { + for j in 0..k { + match axis { + 0 => result.set(i, j, self.get(index[i], j)), + _ => result.set(j, i, self.get(j, index[i])), + }; + } + } + + result + } } /// Generic matrix with additional mixins like various factorization methods. 
@@ -662,6 +701,8 @@ impl<'a, T: RealNumber, M: BaseMatrix> Iterator for RowIter<'a, T, M> { #[cfg(test)] mod tests { + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::linalg::BaseMatrix; use crate::linalg::BaseVector; #[test] @@ -684,4 +725,35 @@ mod tests { assert!((m.var() - 1.25f64).abs() < std::f64::EPSILON); } + + #[test] + fn vec_take() { + let m = vec![1., 2., 3., 4., 5.]; + + assert_eq!(m.take(&vec!(0, 0, 4, 4)), vec![1., 1., 5., 5.]); + } + + #[test] + fn take() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 2.0], + &[3.0, 4.0], + &[5.0, 6.0], + &[7.0, 8.0], + &[9.0, 10.0], + ]); + + let expected_0 = DenseMatrix::from_2d_array(&[&[3.0, 4.0], &[3.0, 4.0], &[7.0, 8.0]]); + + let expected_1 = DenseMatrix::from_2d_array(&[ + &[2.0, 1.0], + &[4.0, 3.0], + &[6.0, 5.0], + &[8.0, 7.0], + &[10.0, 9.0], + ]); + + assert_eq!(m.take(&vec!(1, 1, 3), 0), expected_0); + assert_eq!(m.take(&vec!(1, 0), 1), expected_1); + } } diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 085fd5d..6ed40c8 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -36,7 +36,7 @@ //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. //! ]); //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = lr.predict(&x).unwrap(); //! 
``` use std::iter::Sum; @@ -917,7 +917,7 @@ mod tests { 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]); - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index c01f3c7..b386290 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -58,6 +58,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -66,7 +67,7 @@ use crate::math::num::RealNumber; use crate::linear::lasso_optimizer::InteriorPointOptimizer; /// Elastic net parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct ElasticNetParameters { /// Regularization parameter. pub alpha: T, @@ -108,6 +109,12 @@ impl> PartialEq for ElasticNet { } } +impl> Predictor for ElasticNet { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> ElasticNet { /// Fits elastic net regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 7395bdc..0dab3e5 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -26,6 +26,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -33,7 +34,7 @@ use crate::linear::lasso_optimizer::InteriorPointOptimizer; use crate::math::num::RealNumber; /// Lasso regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LassoParameters { /// Controls the strength of the penalty to the loss function. 
pub alpha: T, @@ -71,6 +72,12 @@ impl> PartialEq for Lasso { } } +impl> Predictor for Lasso { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> Lasso { /// Fits Lasso regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index d01b817..c7bd872 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -64,11 +64,12 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Approach to use for estimation of regression coefficients. QR is more efficient but SVD is more stable. pub enum LinearRegressionSolverName { /// QR decomposition, see [QR](../../linalg/qr/index.html) @@ -78,7 +79,7 @@ pub enum LinearRegressionSolverName { } /// Linear Regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LinearRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: LinearRegressionSolverName, @@ -107,6 +108,12 @@ impl> PartialEq for LinearRegression { } } +impl> Predictor for LinearRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> LinearRegression { /// Fits Linear Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 7b7cab6..b85bbe8 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -40,7 +40,7 @@ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! 
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! ``` @@ -58,6 +58,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -66,6 +67,11 @@ use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult}; use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; +/// Logistic Regression parameters +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct LogisticRegressionParameters { +} + /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] pub struct LogisticRegression> { @@ -97,6 +103,13 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { phantom: PhantomData<&'a T>, } +impl Default for LogisticRegressionParameters { + fn default() -> Self { + LogisticRegressionParameters { + } + } +} + impl> PartialEq for LogisticRegression { fn eq(&self, other: &Self) -> bool { if self.num_classes != other.num_classes @@ -207,11 +220,18 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction } } +impl> Predictor for LogisticRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> LogisticRegression { /// Fits Logistic Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values - pub fn fit(x: &M, y: &M::RowVector) -> Result, Failed> { + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); @@ -461,7 +481,7 @@ mod tests { ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); assert_eq!(lr.coefficients().shape(), (3, 2)); assert_eq!(lr.intercept().shape(), (3, 1)); @@ -484,7 +504,7 @@ mod tests { let x = DenseMatrix::from_vec(15, 4, &blobs.data); let y = blobs.target; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -498,7 +518,7 @@ mod tests { let x = DenseMatrix::from_vec(20, 4, &blobs.data); let y = blobs.target; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -526,7 +546,7 @@ mod tests { ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let deserialized_lr: LogisticRegression> = serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); @@ -562,7 +582,7 @@ mod tests { 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 98bc639..2b5a898 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -63,12 +63,13 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use 
crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable. pub enum RidgeRegressionSolverName { /// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html) @@ -78,7 +79,7 @@ pub enum RidgeRegressionSolverName { } /// Ridge Regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RidgeRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: RidgeRegressionSolverName, @@ -114,6 +115,12 @@ impl> PartialEq for RidgeRegression { } } +impl> Predictor for RidgeRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> RidgeRegression { /// Fits ridge regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index e292f9c..9034727 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -25,7 +25,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space. 
-#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Euclidian {} impl Euclidian { diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index 4028259..129fe16 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -26,7 +26,7 @@ use crate::math::num::RealNumber; use super::Distance; /// While comparing two integer-valued vectors of equal length, Hamming distance is the number of bit positions in which the two bits are different -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Hamming {} impl Distance, F> for Hamming { diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index fd320c3..84aa947 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -52,7 +52,7 @@ use super::Distance; use crate::linalg::Matrix; /// Mahalanobis distance. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Mahalanobis> { /// covariance matrix of the dataset pub sigma: M, diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 66125a5..9a69184 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -24,7 +24,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Manhattan distance -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Manhattan {} impl Distance, T> for Manhattan { diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index b7c5691..c5dd85d 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -28,7 +28,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Defines the Minkowski distance of order `p` -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Minkowski { /// order, integer pub p: u16, diff 
--git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 696b5ff..9bfbd6b 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -28,7 +28,7 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; /// Distance metric, a function that calculates distance between two points -pub trait Distance { +pub trait Distance: Clone { /// Calculates distance between _a_ and _b_ fn distance(&self, a: &T, b: &T) -> F; } diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index f49300d..42b3994 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -42,7 +42,7 @@ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs new file mode 100644 index 0000000..0fbe224 --- /dev/null +++ b/src/model_selection/kfold.rs @@ -0,0 +1,286 @@ +//! # KFold +//! +//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), +//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. +//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. +//! Underfitting is bad because the model is undertrained and does not fit the training data well. +//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for +//! your data. +//! +//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. 
+ +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use rand::seq::SliceRandom; +use rand::thread_rng; + +/// An interface for the K-Folds cross-validator +pub trait BaseKFold { + /// An iterator over indices that split data into training and test set. + type Output: Iterator, Vec)>; + /// Return a tuple containing the training set indices for that split and + /// the testing set indices for that split. + fn split>(&self, x: &M) -> Self::Output; + /// Returns the number of splits + fn n_splits(&self) -> usize; +} + +/// K-Folds cross-validator +pub struct KFold { + /// Number of folds. Must be at least 2. + pub n_splits: usize, // cannot exceed std::usize::MAX + /// Whether to shuffle the data before splitting into batches + pub shuffle: bool, +} + +impl KFold { + fn test_indices>(&self, x: &M) -> Vec> { + // number of samples (rows) in the matrix + let n_samples: usize = x.shape().0; + + // initialise indices + let mut indices: Vec = (0..n_samples).collect(); + if self.shuffle { + indices.shuffle(&mut thread_rng()); + } + // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. + let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + + // increment by one if odd + for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { + *fold_size += 1; + } + + // generate the right array of arrays for test indices + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + let mut current: usize = 0; + for fold_size in fold_sizes.drain(..) { + let stop = current + fold_size; + return_values.push(indices[current..stop].to_vec()); + current = stop + } + + return_values + } + + fn test_masks>(&self, x: &M) -> Vec> { + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + for test_index in self.test_indices(x).drain(..) 
{ + // init mask + let mut test_mask = vec![false; x.shape().0]; + // set mask's indices to true according to test indices + for i in test_index { + test_mask[i] = true; // can be implemented with map() + } + return_values.push(test_mask); + } + return_values + } +} + +impl Default for KFold { + fn default() -> KFold { + KFold { + n_splits: 3, + shuffle: true, + } + } +} + +impl KFold { + /// Number of folds. Must be at least 2. + pub fn with_n_splits(mut self, n_splits: usize) -> Self { + self.n_splits = n_splits; + self + } + /// Whether to shuffle the data before splitting into batches + pub fn with_shuffle(mut self, shuffle: bool) -> Self { + self.shuffle = shuffle; + self + } +} + +/// An iterator over indices that split data into training and test set. +pub struct BaseKFoldIter { + indices: Vec, + test_indices: Vec>, +} + +impl Iterator for BaseKFoldIter { + type Item = (Vec, Vec); + + fn next(&mut self) -> Option<(Vec, Vec)> { + self.test_indices.pop().map(|test_index| { + let train_index = self + .indices + .iter() + .enumerate() + .filter(|&(idx, _)| !test_index[idx]) + .map(|(idx, _)| idx) + .collect::>(); // filter train indices out according to mask + let test_index = self + .indices + .iter() + .enumerate() + .filter(|&(idx, _)| test_index[idx]) + .map(|(idx, _)| idx) + .collect::>(); // filter tests indices out according to mask + + (train_index, test_index) + }) + } +} + +/// Abstract class for all KFold functionalities +impl BaseKFold for KFold { + type Output = BaseKFoldIter; + + fn n_splits(&self) -> usize { + self.n_splits + } + + fn split>(&self, x: &M) -> Self::Output { + if self.n_splits < 2 { + panic!("Number of splits is too small: {}", self.n_splits); + } + let n_samples: usize = x.shape().0; + let indices: Vec = (0..n_samples).collect(); + let mut test_indices = self.test_masks(x); + test_indices.reverse(); + + BaseKFoldIter { + indices, + test_indices, + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use 
crate::linalg::naive::dense_matrix::*; + + #[test] + fn run_kfold_return_test_indices_simple() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(33, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..11).collect::>()); + assert_eq!(test_indices[1], (11..22).collect::>()); + assert_eq!(test_indices[2], (22..33).collect::>()); + } + + #[test] + fn run_kfold_return_test_indices_odd() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(34, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..12).collect::>()); + assert_eq!(test_indices[1], (12..23).collect::>()); + assert_eq!(test_indices[2], (23..34).collect::>()); + } + + #[test] + fn run_kfold_return_test_mask_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let test_masks = k.test_masks(&x); + + for t in &test_masks[0][0..11] { + // TODO: this can be prob done better + assert_eq!(*t, true) + } + for t in &test_masks[0][11..22] { + assert_eq!(*t, false) + } + + for t in &test_masks[1][0..11] { + assert_eq!(*t, false) + } + for t in &test_masks[1][11..22] { + assert_eq!(*t, true) + } + } + + #[test] + fn run_kfold_return_split_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let train_test_splits: Vec<(Vec, Vec)> = k.split(&x).collect(); + + assert_eq!(train_test_splits[0].1, (0..11).collect::>()); + assert_eq!(train_test_splits[0].0, (11..22).collect::>()); + assert_eq!(train_test_splits[1].0, (0..11).collect::>()); + assert_eq!(train_test_splits[1].1, (11..22).collect::>()); + } + + #[test] + fn run_kfold_return_split_simple_shuffle() { + let k = KFold { + n_splits: 2, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(23, 100); + let train_test_splits: Vec<(Vec, Vec)> = k.split(&x).collect(); + + 
assert_eq!(train_test_splits[0].1.len(), 12_usize); + assert_eq!(train_test_splits[0].0.len(), 11_usize); + assert_eq!(train_test_splits[1].0.len(), 12_usize); + assert_eq!(train_test_splits[1].1.len(), 11_usize); + } + + #[test] + fn numpy_parity_test() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test, expected_test); + assert_eq!(train, expected_train); + } + } + + #[test] + fn numpy_parity_test_shuffle() { + let k = KFold { + n_splits: 3, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test.len(), expected_test.len()); + assert_eq!(train.len(), expected_train.len()); + } + } +} diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index bc0f9b8..64527b3 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -9,21 +9,27 @@ //! //! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +use crate::base::Predictor; +use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; +use crate::model_selection::kfold::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -use rand::Rng; + +pub mod kfold; /// Splits data into 2 disjoint datasets. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. 
/// * `y` - target values, should be of size _M_ /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. +/// * `shuffle`, - whether or not to shuffle the data before splitting pub fn train_test_split>( x: &M, y: &M::RowVector, test_size: f32, + shuffle: bool, ) -> (M, M, M::RowVector, M::RowVector) { if x.shape().0 != y.len() { panic!( @@ -38,155 +44,80 @@ pub fn train_test_split>( } let n = y.len(); - let m = x.shape().1; - let mut rng = rand::thread_rng(); - let mut n_test = 0; - let mut index = vec![false; n]; + let n_test = ((n as f32) * test_size) as usize; - for index_i in index.iter_mut().take(n) { - let p_test: f32 = rng.gen(); - if p_test <= test_size { - *index_i = true; - n_test += 1; - } + if n_test < 1 { + panic!("number of sample is too small {}", n); } - let n_train = n - n_test; + let mut indices: Vec = (0..n).collect(); - let mut x_train = M::zeros(n_train, m); - let mut x_test = M::zeros(n_test, m); - let mut y_train = M::RowVector::zeros(n_train); - let mut y_test = M::RowVector::zeros(n_test); - - let mut r_train = 0; - let mut r_test = 0; - - for (r, index_r) in index.iter().enumerate().take(n) { - if *index_r { - //sample belongs to test - for c in 0..m { - x_test.set(r_test, c, x.get(r, c)); - y_test.set(r_test, y.get(r)); - } - r_test += 1; - } else { - for c in 0..m { - x_train.set(r_train, c, x.get(r, c)); - y_train.set(r_train, y.get(r)); - } - r_train += 1; - } + if shuffle { + indices.shuffle(&mut thread_rng()); } + let x_train = x.take(&indices[n_test..n], 0); + let x_test = x.take(&indices[0..n_test], 0); + let y_train = y.take(&indices[n_test..n]); + let y_test = y.take(&indices[0..n_test]); + (x_train, x_test, y_train, y_test) } -/// -/// KFold Cross-Validation -/// -pub trait BaseKFold { - /// Returns integer indices corresponding to test sets - fn test_indices>(&self, x: &M) -> Vec>; - - /// Returns masksk corresponding to test sets - fn test_masks>(&self, x: &M) -> Vec>; - - /// Return a tuple 
containing the the training set indices for that split and - /// the testing set indices for that split. - fn split>(&self, x: &M) -> Vec<(Vec, Vec)>; +#[derive(Clone, Debug)] +pub struct CrossValidationResult { + pub test_score: Vec, + pub train_score: Vec, } -/// -/// An implementation of KFold -/// -pub struct KFold { - n_splits: usize, // cannot exceed std::usize::MAX - shuffle: bool, - // TODO: to be implemented later - // random_state: i32, -} +impl CrossValidationResult { + pub fn mean_test_score(&self) -> T { + self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() + } -impl Default for KFold { - fn default() -> KFold { - KFold { - n_splits: 3_usize, - shuffle: true, - } + pub fn mean_train_score(&self) -> T { + self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() } } -/// -/// Abstract class for all KFold functionalities -/// -impl BaseKFold for KFold { - fn test_indices>(&self, x: &M) -> Vec> { - // number of samples (rows) in the matrix - let n_samples: usize = x.shape().0; +pub fn cross_validate( + fit_estimator: F, + x: &M, + y: &M::RowVector, + parameters: H, + cv: K, + score: S, +) -> Result, Failed> +where + T: RealNumber, + M: Matrix, + H: Clone, + E: Predictor, + K: BaseKFold, + F: Fn(&M, &M::RowVector, H) -> Result, + S: Fn(&M::RowVector, &M::RowVector) -> T, +{ + let k = cv.n_splits(); + let mut test_score = Vec::with_capacity(k); + let mut train_score = Vec::with_capacity(k); - // initialise indices - let mut indices: Vec = (0..n_samples).collect(); - if self.shuffle { - indices.shuffle(&mut thread_rng()); - } - // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. 
- let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + for (train_idx, test_idx) in cv.split(x) { + let train_x = x.take(&train_idx, 0); + let train_y = y.take(&train_idx); + let test_x = x.take(&test_idx, 0); + let test_y = y.take(&test_idx); - // increment by one if odd - for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { - *fold_size += 1; - } + let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; - // generate the right array of arrays for test indices - let mut return_values: Vec> = Vec::with_capacity(self.n_splits); - let mut current: usize = 0; - for fold_size in fold_sizes.drain(..) { - let stop = current + fold_size; - return_values.push(indices[current..stop].to_vec()); - current = stop - } - - return_values + train_score.push(score(&train_y, &estimator.predict(&train_x)?)); + test_score.push(score(&test_y, &estimator.predict(&test_x)?)); } - fn test_masks>(&self, x: &M) -> Vec> { - let mut return_values: Vec> = Vec::with_capacity(self.n_splits); - for test_index in self.test_indices(x).drain(..) { - // init mask - let mut test_mask = vec![false; x.shape().0]; - // set mask's indices to true according to test indices - for i in test_index { - test_mask[i] = true; // can be implemented with map() - } - return_values.push(test_mask); - } - return_values - } - - fn split>(&self, x: &M) -> Vec<(Vec, Vec)> { - let n_samples: usize = x.shape().0; - let indices: Vec = (0..n_samples).collect(); - - let mut return_values: Vec<(Vec, Vec)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs - - for test_index in self.test_masks(x).drain(..) 
{ - let train_index = indices - .clone() - .iter() - .enumerate() - .filter(|&(idx, _)| !test_index[idx]) - .map(|(idx, _)| idx) - .collect::>(); // filter train indices out according to mask - let test_index = indices - .iter() - .enumerate() - .filter(|&(idx, _)| test_index[idx]) - .map(|(idx, _)| idx) - .collect::>(); // filter tests indices out according to mask - return_values.push((train_index, test_index)) - } - return_values - } + Ok(CrossValidationResult { + test_score, + train_score, + }) } #[cfg(test)] @@ -194,14 +125,17 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + use crate::metrics::{accuracy, mean_absolute_error}; + use crate::model_selection::kfold::KFold; + use crate::neighbors::knn_regressor::KNNRegressor; #[test] fn run_train_test_split() { - let n = 100; - let x: DenseMatrix = DenseMatrix::rand(100, 3); - let y = vec![0f64; 100]; + let n = 123; + let x: DenseMatrix = DenseMatrix::rand(n, 3); + let y = vec![0f64; n]; - let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2); + let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true); assert!( x_train.shape().0 > (n as f64 * 0.65) as usize @@ -215,126 +149,195 @@ mod tests { assert_eq!(x_test.shape().0, y_test.len()); } - #[test] - fn run_kfold_return_test_indices_simple() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(33, 100); - let test_indices = k.test_indices(&x); + #[derive(Clone)] + struct NoParameters {} - assert_eq!(test_indices[0], (0..11).collect::>()); - assert_eq!(test_indices[1], (11..22).collect::>()); - assert_eq!(test_indices[2], (22..33).collect::>()); + #[test] + fn test_cross_validate_biased() { + struct BiasedEstimator {} + + impl BiasedEstimator { + fn fit>( + _: &M, + _: &M::RowVector, + _: NoParameters, + ) -> Result { + Ok(BiasedEstimator {}) + } + } + + impl> Predictor for BiasedEstimator { + fn predict(&self, x: &M) -> Result { + let (n, _) = x.shape(); + 
Ok(M::RowVector::zeros(n)) + } + } + + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let cv = KFold { + n_splits: 5, + ..KFold::default() + }; + + let results = + cross_validate(BiasedEstimator::fit, &x, &y, NoParameters {}, cv, &accuracy).unwrap(); + + assert_eq!(0.4, results.mean_test_score()); + assert_eq!(0.4, results.mean_train_score()); } #[test] - fn run_kfold_return_test_indices_odd() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(34, 100); - let test_indices = k.test_indices(&x); + fn test_cross_validate_knn() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + 
&[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; - assert_eq!(test_indices[0], (0..12).collect::>()); - assert_eq!(test_indices[1], (12..23).collect::>()); - assert_eq!(test_indices[2], (23..34).collect::>()); + let cv = KFold { + n_splits: 5, + ..KFold::default() + }; + + let results = cross_validate( + KNNRegressor::fit, + &x, + &y, + Default::default(), + cv, + &mean_absolute_error, + ) + .unwrap(); + + assert!(results.mean_test_score() < 15.0); + assert!(results.mean_train_score() < results.mean_test_score()); } + use crate::tree::decision_tree_regressor::*; + #[test] - fn run_kfold_return_test_mask_simple() { - let k = KFold { - n_splits: 2, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(22, 100); - let test_masks = k.test_masks(&x); + fn test_some_regressor() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; - for t in &test_masks[0][0..11] { - // TODO: this can be prob 
done better - assert_eq!(*t, true) - } - for t in &test_masks[0][11..22] { - assert_eq!(*t, false) - } + let cv = KFold::default().with_n_splits(2); - for t in &test_masks[1][0..11] { - assert_eq!(*t, false) - } - for t in &test_masks[1][11..22] { - assert_eq!(*t, true) - } + let results = cross_validate( + DecisionTreeRegressor::fit, + &x, + &y, + Default::default(), + cv, + &mean_absolute_error, + ) + .unwrap(); + + println!("{}", results.mean_test_score()); + println!("{}", results.mean_train_score()); } - #[test] - fn run_kfold_return_split_simple() { - let k = KFold { - n_splits: 2, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(22, 100); - let train_test_splits = k.split(&x); - - assert_eq!(train_test_splits[0].1, (0..11).collect::>()); - assert_eq!(train_test_splits[0].0, (11..22).collect::>()); - assert_eq!(train_test_splits[1].0, (0..11).collect::>()); - assert_eq!(train_test_splits[1].1, (11..22).collect::>()); - } + use crate::tree::decision_tree_classifier::*; #[test] - fn run_kfold_return_split_simple_shuffle() { - let k = KFold { + fn test_some_classifier() { + + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let cv = KFold { n_splits: 2, ..KFold::default() }; - let x: DenseMatrix = DenseMatrix::rand(23, 100); - let train_test_splits = k.split(&x); - assert_eq!(train_test_splits[0].1.len(), 12_usize); - 
assert_eq!(train_test_splits[0].0.len(), 11_usize); - assert_eq!(train_test_splits[1].0.len(), 12_usize); - assert_eq!(train_test_splits[1].1.len(), 11_usize); - } + let results = + cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); - #[test] - fn numpy_parity_test() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(10, 4); - let expected: Vec<(Vec, Vec)> = vec![ - (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), - (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), - (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), - ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { - assert_eq!(test, expected_test); - assert_eq!(train, expected_train); - } - } - - #[test] - fn numpy_parity_test_shuffle() { - let k = KFold { - n_splits: 3, - ..KFold::default() - }; - let x: DenseMatrix = DenseMatrix::rand(10, 4); - let expected: Vec<(Vec, Vec)> = vec![ - (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), - (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), - (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), - ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { - assert_eq!(test.len(), expected_test.len()); - assert_eq!(train.len(), expected_train.len()); - } + println!("{}", results.mean_test_score()); + println!("{}", results.mean_train_score()); } } diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index dd34ae9..fe299f3 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -33,6 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. 
D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -200,6 +201,12 @@ pub struct BernoulliNB> { binarize: Option, } +impl> Predictor for BernoulliNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> BernoulliNB { /// Fits BernoulliNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index c4626ef..ce526ce 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -30,6 +30,7 @@ //! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! ``` +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -246,6 +247,12 @@ pub struct CategoricalNB> { inner: BaseNaiveBayes>, } +impl> Predictor for CategoricalNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> CategoricalNB { /// Fits CategoricalNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index c5c1fb2..01dacd7 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -22,6 +22,7 @@ //! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! 
``` +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -181,6 +182,12 @@ pub struct GaussianNB> { inner: BaseNaiveBayes>, } +impl> Predictor for GaussianNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> GaussianNB { /// Fits GaussianNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index c9ac86b..84d3fd1 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -33,6 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -187,6 +188,12 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } +impl> Predictor for MultinomialNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> MultinomialNB { /// Fits MultinomialNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index f940211..8b4db1b 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -25,31 +25,40 @@ //! &[9., 10.]]); //! let y = vec![2., 2., 2., 3., 3.]; //your class labels //! -//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); //! ``` //! //! variable `y_hat` will hold a vector with estimates of class labels //! 
+use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNClassifier` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] -pub struct KNNClassifierParameters { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct KNNClassifierParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub distance: D, /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, + /// this parameter is not used + t: PhantomData, } /// K Nearest Neighbors Classifier @@ -62,12 +71,39 @@ pub struct KNNClassifier, T>> { k: usize, } -impl Default for KNNClassifierParameters { +impl, T>> KNNClassifierParameters { + /// number of training samples to consider when estimating class for new point. Default value is 3. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } + /// a function that defines a distance between each pair of point in training data. 
+ /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance(mut self, distance: D) -> Self { + self.distance = distance; + self + } + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self { + self.weight = weight; + self + } +} + +impl Default for KNNClassifierParameters { fn default() -> Self { KNNClassifierParameters { + distance: Distances::euclidian(), algorithm: KNNAlgorithmName::CoverTree, weight: KNNWeightFunction::Uniform, k: 3, + t: PhantomData, } } } @@ -95,19 +131,23 @@ impl, T>> PartialEq for KNNClassifier { } } +impl, D: Distance, T>> Predictor + for KNNClassifier +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> KNNClassifier { /// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features. /// * `x` - training data - /// * `y` - vector with target values (classes) of length N - /// * `distance` - a function that defines a distance between each pair of point in training data. - /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. - /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. 
+ /// * `y` - vector with target values (classes) of length N /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, - distance: D, - parameters: KNNClassifierParameters, + parameters: KNNClassifierParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -142,7 +182,7 @@ impl, T>> KNNClassifier { classes, y: yi, k: parameters.k, - knn_algorithm: parameters.algorithm.fit(data, distance)?, + knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?, weight: parameters.weight, }) } @@ -187,14 +227,13 @@ impl, T>> KNNClassifier { mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::math::distance::Distances; #[test] fn knn_fit_predict() { let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); assert_eq!(y.to_vec(), y_hat); @@ -207,12 +246,10 @@ mod tests { let knn = KNNClassifier::fit( &x, &y, - Distances::euclidian(), - KNNClassifierParameters { - k: 5, - algorithm: KNNAlgorithmName::LinearSearch, - weight: KNNWeightFunction::Distance, - }, + KNNClassifierParameters::default() + .with_k(5) + .with_algorithm(KNNAlgorithmName::LinearSearch) + .with_weight(KNNWeightFunction::Distance), ) .unwrap(); let y_hat = knn.predict(&DenseMatrix::from_2d_array(&[&[4.1]])).unwrap(); @@ -225,7 +262,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); let deserialized_knn = 
bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index b7c0f2d..a97fdea 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -27,31 +27,41 @@ //! &[5., 5.]]); //! let y = vec![1., 2., 3., 4., 5.]; //your target values //! -//! let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); //! ``` //! //! variable `y_hat` will hold predicted value //! //! +use std::marker::PhantomData; + use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::{row_iter, BaseVector, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNRegressor` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] -pub struct KNNRegressorParameters { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct KNNRegressorParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + distance: D, /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. 
pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, + /// this parameter is not used + t: PhantomData, } /// K Nearest Neighbors Regressor @@ -63,12 +73,39 @@ pub struct KNNRegressor, T>> { k: usize, } -impl Default for KNNRegressorParameters { +impl, T>> KNNRegressorParameters { + /// number of training samples to consider when estimating class for new point. Default value is 3. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance(mut self, distance: D) -> Self { + self.distance = distance; + self + } + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self { + self.weight = weight; + self + } +} + +impl Default for KNNRegressorParameters { fn default() -> Self { KNNRegressorParameters { + distance: Distances::euclidian(), algorithm: KNNAlgorithmName::CoverTree, weight: KNNWeightFunction::Uniform, k: 3, + t: PhantomData, } } } @@ -88,19 +125,23 @@ impl, T>> PartialEq for KNNRegressor { } } +impl, D: Distance, T>> Predictor + for KNNRegressor +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> KNNRegressor { /// Fits KNN regressor to a NxM matrix where N is number of samples and M is number of features. 
/// * `x` - training data - /// * `y` - vector with real values - /// * `distance` - a function that defines a distance between each pair of point in training data. - /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. - /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + /// * `y` - vector with real values /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, - distance: D, - parameters: KNNRegressorParameters, + parameters: KNNRegressorParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -126,7 +167,7 @@ impl, T>> KNNRegressor { Ok(KNNRegressor { y: y.to_vec(), k: parameters.k, - knn_algorithm: parameters.algorithm.fit(data, distance)?, + knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?, weight: parameters.weight, }) } @@ -176,12 +217,11 @@ mod tests { let knn = KNNRegressor::fit( &x, &y, - Distances::euclidian(), - KNNRegressorParameters { - k: 3, - algorithm: KNNAlgorithmName::LinearSearch, - weight: KNNWeightFunction::Distance, - }, + KNNRegressorParameters::default() + .with_k(3) + .with_distance(Distances::euclidian()) + .with_algorithm(KNNAlgorithmName::LinearSearch) + .with_weight(KNNWeightFunction::Distance), ) .unwrap(); let y_hat = knn.predict(&x).unwrap(); @@ -197,7 +237,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y: Vec = vec![1., 2., 3., 4., 5.]; let y_exp = vec![2., 2., 3., 4., 4.]; - let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); for i in 0..y_hat.len() { @@ -211,7 +251,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![1., 2., 3., 4., 5.]; - let knn = 
KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index be1ad4d..85ea6b8 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -48,7 +48,7 @@ pub mod knn_regressor; pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName; /// Weight function that is used to determine estimated value. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub enum KNNWeightFunction { /// All k nearest points are weighted equally Uniform, diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 1f563c1..1e013d2 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -93,16 +93,18 @@ impl Kernels { } /// Linear Kernel -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LinearKernel {} /// Radial basis function (Gaussian) kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RBFKernel { /// kernel coefficient pub gamma: T, } /// Polynomial kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct PolynomialKernel { /// degree of the polynomial pub degree: T, @@ -113,6 +115,7 @@ pub struct PolynomialKernel { } /// Sigmoid (hyperbolic tangent) kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct SigmoidKernel { /// kernel coefficient pub gamma: T, diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9e166d5..cbe97f7 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -57,13 +57,7 @@ //! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0., //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; //! -//! let svr = SVC::fit(&x, &y, -//! Kernels::linear(), -//! SVCParameters { -//! epoch: 2, -//! c: 200.0, -//! tol: 1e-3, -//! }).unwrap(); +//! let svr = SVC::fit(&x, &y, SVCParameters::default().with_c(200.0)).unwrap(); //! 
//! let y_hat = svr.predict(&x).unwrap(); //! ``` @@ -84,22 +78,26 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::svm::Kernel; - -#[derive(Serialize, Deserialize, Debug)] +use crate::svm::{Kernel, Kernels, LinearKernel}; +#[derive(Serialize, Deserialize, Debug, Clone)] /// SVC Parameters -pub struct SVCParameters { - /// Number of epochs +pub struct SVCParameters, K: Kernel> { + /// Number of epochs. pub epoch: usize, /// Regularization parameter. pub c: T, - /// Tolerance for stopping criterion + /// Tolerance for stopping criterion. pub tol: T, + /// The kernel function. + pub kernel: K, + /// Unused parameter. + m: PhantomData, } #[derive(Serialize, Deserialize, Debug)] @@ -136,7 +134,7 @@ struct Cache<'a, T: RealNumber, M: Matrix, K: Kernel> { struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { x: &'a M, y: &'a M::RowVector, - parameters: &'a SVCParameters, + parameters: &'a SVCParameters, svmin: usize, svmax: usize, gmin: T, @@ -147,27 +145,61 @@ struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { recalculate_minmax_grad: bool, } -impl Default for SVCParameters { +impl, K: Kernel> SVCParameters { + /// Number of epochs. + pub fn with_epoch(mut self, epoch: usize) -> Self { + self.epoch = epoch; + self + } + /// Regularization parameter. + pub fn with_c(mut self, c: T) -> Self { + self.c = c; + self + } + /// Tolerance for stopping criterion. + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The kernel function. 
+ pub fn with_kernel>(&self, kernel: KK) -> SVCParameters { + SVCParameters { + epoch: self.epoch, + c: self.c, + tol: self.tol, + kernel: kernel, + m: PhantomData + } + } +} + +impl> Default for SVCParameters { fn default() -> Self { SVCParameters { epoch: 2, c: T::one(), tol: T::from_f64(1e-3).unwrap(), + kernel: Kernels::linear(), + m: PhantomData } } } +impl, K: Kernel> Predictor for SVC { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, K: Kernel> SVC { /// Fits SVC to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - class labels - /// * `kernel` - the kernel function /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. pub fn fit( x: &M, y: &M::RowVector, - kernel: K, - parameters: SVCParameters, + parameters: SVCParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -198,13 +230,13 @@ impl, K: Kernel> SVC { } } - let optimizer = Optimizer::new(x, &y, &kernel, ¶meters); + let optimizer = Optimizer::new(x, &y, ¶meters.kernel, ¶meters); let (support_vectors, weight, b) = optimizer.optimize(); Ok(SVC { classes, - kernel, + kernel: parameters.kernel, instances: support_vectors, w: weight, b, @@ -321,7 +353,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, x: &'a M, y: &'a M::RowVector, kernel: &'a K, - parameters: &'a SVCParameters, + parameters: &'a SVCParameters, ) -> Optimizer<'a, T, M, K> { let (n, _) = x.shape(); @@ -711,17 +743,10 @@ mod tests { let y_hat = SVC::fit( &x, &y, - Kernels::linear(), - SVCParameters { - epoch: 2, - c: 200.0, - tol: 1e-3, - }, + SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), ) .and_then(|lr| lr.predict(&x)) - .unwrap(); - - println!("{:?}", y_hat); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -759,12 +784,7 @@ mod tests { let y_hat = SVC::fit( &x, &y, - Kernels::rbf(0.7), - SVCParameters { - epoch: 2, - c: 1.0, - tol: 1e-3, - }, + 
SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), ) .and_then(|lr| lr.predict(&x)) .unwrap(); @@ -801,7 +821,7 @@ mod tests { -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let svr = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); + let svr = SVC::fit(&x, &y, Default::default()).unwrap(); let deserialized_svr: SVC, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 5d007d7..25c7ff6 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -49,13 +49,7 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let svr = SVR::fit(&x, &y, -//! LinearKernel {}, -//! SVRParameters { -//! eps: 2.0, -//! c: 10.0, -//! tol: 1e-3, -//! }).unwrap(); +//! let svr = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)).unwrap(); //! //! let y_hat = svr.predict(&x).unwrap(); //! ``` @@ -72,25 +66,30 @@ use std::cell::{Ref, RefCell}; use std::fmt::Debug; +use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::svm::Kernel; - -#[derive(Serialize, Deserialize, Debug)] +use crate::svm::{Kernel, Kernels, LinearKernel}; +#[derive(Serialize, Deserialize, Debug, Clone)] /// SVR Parameters -pub struct SVRParameters { - /// Epsilon in the epsilon-SVR model +pub struct SVRParameters, K: Kernel> { + /// Epsilon in the epsilon-SVR model. pub eps: T, /// Regularization parameter. pub c: T, - /// Tolerance for stopping criterion + /// Tolerance for stopping criterion. pub tol: T, + /// The kernel function. + pub kernel: K, + /// Unused parameter. 
+ m: PhantomData, } #[derive(Serialize, Deserialize, Debug)] @@ -135,16 +134,52 @@ struct Cache { data: Vec>>>, } -impl Default for SVRParameters { +impl, K: Kernel> SVRParameters { + /// Epsilon in the epsilon-SVR model. + pub fn with_eps(mut self, eps: T) -> Self { + self.eps = eps; + self + } + /// Regularization parameter. + pub fn with_c(mut self, c: T) -> Self { + self.c = c; + self + } + /// Tolerance for stopping criterion. + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The kernel function. + pub fn with_kernel>(&self, kernel: KK) -> SVRParameters { + SVRParameters { + eps: self.eps, + c: self.c, + tol: self.tol, + kernel: kernel, + m: PhantomData + } + } +} + +impl> Default for SVRParameters { fn default() -> Self { SVRParameters { eps: T::from_f64(0.1).unwrap(), c: T::one(), tol: T::from_f64(1e-3).unwrap(), + kernel: Kernels::linear(), + m: PhantomData } } } +impl, K: Kernel> Predictor for SVR { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, K: Kernel> SVR { /// Fits SVR to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. @@ -153,9 +188,8 @@ impl, K: Kernel> SVR { /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. 
pub fn fit( x: &M, - y: &M::RowVector, - kernel: K, - parameters: SVRParameters, + y: &M::RowVector, + parameters: SVRParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -165,12 +199,12 @@ impl, K: Kernel> SVR { )); } - let optimizer = Optimizer::new(x, y, &kernel, ¶meters); + let optimizer = Optimizer::new(x, y, ¶meters.kernel, ¶meters); let (support_vectors, weight, b) = optimizer.smo(); Ok(SVR { - kernel, + kernel: parameters.kernel, instances: support_vectors, w: weight, b, @@ -243,7 +277,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, x: &M, y: &M::RowVector, kernel: &'a K, - parameters: &SVRParameters, + parameters: &SVRParameters, ) -> Optimizer<'a, T, M, K> { let (n, _) = x.shape(); @@ -513,12 +547,7 @@ mod tests { let y_hat = SVR::fit( &x, &y, - LinearKernel {}, - SVRParameters { - eps: 2.0, - c: 10.0, - tol: 1e-3, - }, + SVRParameters::default().with_eps(2.0).with_c(10.0), ) .and_then(|lr| lr.predict(&x)) .unwrap(); @@ -552,7 +581,7 @@ mod tests { 114.2, 115.7, 116.9, ]; - let svr = SVR::fit(&x, &y, LinearKernel {}, Default::default()).unwrap(); + let svr = SVR::fit(&x, &y, Default::default()).unwrap(); let deserialized_svr: SVR, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 371bc4e..1845d5e 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -71,11 +71,12 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Parameters of Decision Tree pub struct DecisionTreeClassifierParameters { /// Split criteria to use when building a tree. 
@@ -269,6 +270,12 @@ pub(in crate) fn which_max(x: &[usize]) -> usize { which } +impl> Predictor for DecisionTreeClassifier { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl DecisionTreeClassifier { /// Build a decision tree classifier from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 5e80b4c..492f0a1 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -66,11 +66,12 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Parameters of Regression Tree pub struct DecisionTreeRegressorParameters { /// The maximum depth of the tree. @@ -189,6 +190,12 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } +impl> Predictor for DecisionTreeRegressor { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl DecisionTreeRegressor { /// Build a decision tree regressor from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. 
From 9b221979da51f9a26c693f5f5300599939416df6 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 16:35:28 -0800 Subject: [PATCH 71/79] fix: clippy, documentation and formatting --- src/linalg/mod.rs | 10 +++---- src/linear/logistic_regression.rs | 12 +++++---- src/model_selection/kfold.rs | 29 +++++---------------- src/model_selection/mod.rs | 43 ++++++++++++++++++++++++++----- src/naive_bayes/multinomial.rs | 2 +- src/svm/svc.rs | 20 +++++++++----- src/svm/svr.rs | 26 +++++++++---------- 7 files changed, 80 insertions(+), 62 deletions(-) diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 5b49942..264815b 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -281,8 +281,8 @@ pub trait BaseVector: Clone + Debug { let mut result = Self::zeros(n); - for i in 0..n { - result.set(i, self.get(index[i])); + for (i, idx) in index.iter().enumerate() { + result.set(i, self.get(*idx)); } result @@ -639,11 +639,11 @@ pub trait BaseMatrix: Clone + Debug { _ => Self::zeros(n, index.len()), }; - for i in 0..index.len() { + for (i, idx) in index.iter().enumerate() { for j in 0..k { match axis { - 0 => result.set(i, j, self.get(index[i], j)), - _ => result.set(j, i, self.get(j, index[i])), + 0 => result.set(i, j, self.get(*idx, j)), + _ => result.set(j, i, self.get(j, *idx)), }; } } diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index b85bbe8..ffb845c 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -69,8 +69,7 @@ use crate::optimization::FunctionOrder; /// Logistic Regression parameters #[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LogisticRegressionParameters { -} +pub struct LogisticRegressionParameters {} /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] @@ -105,8 +104,7 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { impl Default for LogisticRegressionParameters { fn default() -> Self { - LogisticRegressionParameters { - } + 
LogisticRegressionParameters {} } } @@ -231,7 +229,11 @@ impl> LogisticRegression { /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. - pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result, Failed> { + pub fn fit( + x: &M, + y: &M::RowVector, + _parameters: LogisticRegressionParameters, + ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs index 0fbe224..63827c4 100644 --- a/src/model_selection/kfold.rs +++ b/src/model_selection/kfold.rs @@ -1,30 +1,13 @@ //! # KFold //! -//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), -//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. -//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. -//! Underfitted is bad because the model is undetrained and does not fit the training data well. -//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for -//! your data. -//! -//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +//! Defines k-fold cross validator. use crate::linalg::Matrix; use crate::math::num::RealNumber; +use crate::model_selection::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -/// An interface for the K-Folds cross-validator -pub trait BaseKFold { - /// An iterator over indices that split data into training and test set. 
- type Output: Iterator, Vec)>; - /// Return a tuple containing the the training set indices for that split and - /// the testing set indices for that split. - fn split>(&self, x: &M) -> Self::Output; - /// Returns the number of splits - fn n_splits(&self) -> usize; -} - /// K-Folds cross-validator pub struct KFold { /// Number of folds. Must be at least 2. @@ -101,12 +84,12 @@ impl KFold { } /// An iterator over indices that split data into training and test set. -pub struct BaseKFoldIter { +pub struct KFoldIter { indices: Vec, test_indices: Vec>, } -impl Iterator for BaseKFoldIter { +impl Iterator for KFoldIter { type Item = (Vec, Vec); fn next(&mut self) -> Option<(Vec, Vec)> { @@ -133,7 +116,7 @@ impl Iterator for BaseKFoldIter { /// Abstract class for all KFold functionalities impl BaseKFold for KFold { - type Output = BaseKFoldIter; + type Output = KFoldIter; fn n_splits(&self) -> usize { self.n_splits @@ -148,7 +131,7 @@ impl BaseKFold for KFold { let mut test_indices = self.test_masks(x); test_indices.reverse(); - BaseKFoldIter { + KFoldIter { indices, test_indices, } diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 64527b3..0aabb97 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -14,15 +14,27 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::model_selection::kfold::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -pub mod kfold; +pub(crate) mod kfold; + +pub use kfold::{KFold, KFoldIter}; + +/// An interface for the K-Folds cross-validator +pub trait BaseKFold { + /// An iterator over indices that split data into training and test set. + type Output: Iterator, Vec)>; + /// Return a tuple containing the the training set indices for that split and + /// the testing set indices for that split. 
+ fn split>(&self, x: &M) -> Self::Output; + /// Returns the number of splits + fn n_splits(&self) -> usize; +} /// Splits data into 2 disjoint datasets. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. -/// * `y` - target values, should be of size _M_ +/// * `y` - target values, should be of size _N_ /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. /// * `shuffle`, - whether or not to shuffle the data before splitting pub fn train_test_split>( @@ -65,22 +77,33 @@ pub fn train_test_split>( (x_train, x_test, y_train, y_test) } +/// Cross validation results. #[derive(Clone, Debug)] pub struct CrossValidationResult { + /// Vector with test scores on each cv split pub test_score: Vec, + /// Vector with training scores on each cv split pub train_score: Vec, } impl CrossValidationResult { + /// Average test score pub fn mean_test_score(&self) -> T { self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() } - + /// Average training score pub fn mean_train_score(&self) -> T { self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() } } +/// Evaluate an estimator by cross-validation using given metric. +/// * `fit_estimator` - a `fit` function of an estimator +/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. +/// * `y` - target values, should be of size _N_ +/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters. 
+/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html) +/// * `score` - a metric to use for evaluation, see [metrics](../metrics/index.html) pub fn cross_validate( fit_estimator: F, x: &M, @@ -302,7 +325,6 @@ mod tests { #[test] fn test_some_classifier() { - let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2], @@ -334,8 +356,15 @@ mod tests { ..KFold::default() }; - let results = - cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); + let results = cross_validate( + DecisionTreeClassifier::fit, + &x, + &y, + Default::default(), + cv, + &accuracy, + ) + .unwrap(); println!("{}", results.mean_test_score()); println!("{}", results.mean_train_score()); diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 84d3fd1..849b8db 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -188,7 +188,7 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } -impl> Predictor for MultinomialNB { +impl> Predictor for MultinomialNB { fn predict(&self, x: &M) -> Result { self.predict(x) } diff --git a/src/svm/svc.rs b/src/svm/svc.rs index cbe97f7..aee4d3f 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -167,8 +167,8 @@ impl, K: Kernel> SVCParameters> Default for SVCParameters c: T::one(), tol: T::from_f64(1e-3).unwrap(), kernel: Kernels::linear(), - m: PhantomData + m: PhantomData, } } } -impl, K: Kernel> Predictor for SVC { +impl, K: Kernel> Predictor + for SVC +{ fn predict(&self, x: &M) -> Result { self.predict(x) } @@ -743,10 +745,12 @@ mod tests { let y_hat = SVC::fit( &x, &y, - SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), + SVCParameters::default() + .with_c(200.0) + .with_kernel(Kernels::linear()), ) .and_then(|lr| lr.predict(&x)) - .unwrap(); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -784,7 +788,9 @@ mod tests { let y_hat = SVC::fit( &x, &y, 
- SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), + SVCParameters::default() + .with_c(1.0) + .with_kernel(Kernels::rbf(0.7)), ) .and_then(|lr| lr.predict(&x)) .unwrap(); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 25c7ff6..295ad78 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -134,7 +134,7 @@ struct Cache { data: Vec>>>, } -impl, K: Kernel> SVRParameters { +impl, K: Kernel> SVRParameters { /// Epsilon in the epsilon-SVR model. pub fn with_eps(mut self, eps: T) -> Self { self.eps = eps; @@ -153,11 +153,11 @@ impl, K: Kernel> SVRParameters>(&self, kernel: KK) -> SVRParameters { SVRParameters { - eps: self.eps, + eps: self.eps, c: self.c, tol: self.tol, - kernel: kernel, - m: PhantomData + kernel, + m: PhantomData, } } } @@ -169,12 +169,14 @@ impl> Default for SVRParameters c: T::one(), tol: T::from_f64(1e-3).unwrap(), kernel: Kernels::linear(), - m: PhantomData + m: PhantomData, } } } -impl, K: Kernel> Predictor for SVR { +impl, K: Kernel> Predictor + for SVR +{ fn predict(&self, x: &M) -> Result { self.predict(x) } @@ -188,7 +190,7 @@ impl, K: Kernel> SVR { /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. 
pub fn fit( x: &M, - y: &M::RowVector, + y: &M::RowVector, parameters: SVRParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -544,13 +546,9 @@ mod tests { 114.2, 115.7, 116.9, ]; - let y_hat = SVR::fit( - &x, - &y, - SVRParameters::default().with_eps(2.0).with_c(10.0), - ) - .and_then(|lr| lr.predict(&x)) - .unwrap(); + let y_hat = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)) + .and_then(|lr| lr.predict(&x)) + .unwrap(); assert!(mean_squared_error(&y_hat, &y) < 2.5); } From f685f575e068080b64d660ebe34261f3556ffee7 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 17:42:18 -0800 Subject: [PATCH 72/79] feat: + cross_val_predict --- src/model_selection/mod.rs | 105 +++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 0aabb97..7178da8 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -125,7 +125,7 @@ where let mut test_score = Vec::with_capacity(k); let mut train_score = Vec::with_capacity(k); - for (test_idx, train_idx) in cv.split(x) { + for (train_idx, test_idx) in cv.split(x) { let train_x = x.take(&train_idx, 0); let train_y = y.take(&train_idx); let test_x = x.take(&test_idx, 0); @@ -143,6 +143,46 @@ where }) } +/// Generate cross-validated estimates for each input data point. +/// The data is split according to the cv parameter. Each sample belongs to exactly one test set, and its prediction is computed with an estimator fitted on the corresponding training set. +/// * `fit_estimator` - a `fit` function of an estimator +/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. +/// * `y` - target values, should be of size _N_ +/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters. 
+/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html) +pub fn cross_val_predict( + fit_estimator: F, + x: &M, + y: &M::RowVector, + parameters: H, + cv: K +) -> Result +where + T: RealNumber, + M: Matrix, + H: Clone, + E: Predictor, + K: BaseKFold, + F: Fn(&M, &M::RowVector, H) -> Result +{ + let mut y_hat = M::RowVector::zeros(y.len()); + + for (train_idx, test_idx) in cv.split(x) { + let train_x = x.take(&train_idx, 0); + let train_y = y.take(&train_idx); + let test_x = x.take(&test_idx, 0); + + let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; + + let y_test_hat = estimator.predict(&test_x)?; + for (i, &idx) in test_idx.iter().enumerate() { + y_hat.set(idx, y_test_hat.get(i)); + } + } + + Ok(y_hat) +} + #[cfg(test)] mod tests { @@ -278,10 +318,8 @@ mod tests { assert!(results.mean_train_score() < results.mean_test_score()); } - use crate::tree::decision_tree_regressor::*; - #[test] - fn test_some_regressor() { + fn test_cross_val_predict_knn() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159., 107.608, 1947., 60.323], &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], @@ -305,68 +343,21 @@ mod tests { 114.2, 115.7, 116.9, ]; - let cv = KFold::default().with_n_splits(2); - - let results = cross_validate( - DecisionTreeRegressor::fit, - &x, - &y, - Default::default(), - cv, - &mean_absolute_error, - ) - .unwrap(); - - println!("{}", results.mean_test_score()); - println!("{}", results.mean_train_score()); - } - - use crate::tree::decision_tree_classifier::*; - - #[test] - fn test_some_classifier() { - let x = DenseMatrix::from_2d_array(&[ - &[5.1, 3.5, 1.4, 0.2], - &[4.9, 3.0, 1.4, 0.2], - &[4.7, 3.2, 1.3, 0.2], - &[4.6, 3.1, 1.5, 0.2], - &[5.0, 3.6, 1.4, 0.2], - &[5.4, 3.9, 1.7, 0.4], - &[4.6, 3.4, 1.4, 0.3], - &[5.0, 3.4, 1.5, 0.2], - &[4.4, 2.9, 1.4, 0.2], - &[4.9, 3.1, 1.5, 0.1], - &[7.0, 3.2, 4.7, 1.4], - &[6.4, 3.2, 4.5, 1.5], - &[6.9, 3.1, 4.9, 1.5], - &[5.5, 
2.3, 4.0, 1.3], - &[6.5, 2.8, 4.6, 1.5], - &[5.7, 2.8, 4.5, 1.3], - &[6.3, 3.3, 4.7, 1.6], - &[4.9, 2.4, 3.3, 1.0], - &[6.6, 2.9, 4.6, 1.3], - &[5.2, 2.7, 3.9, 1.4], - ]); - let y = vec![ - 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - ]; - let cv = KFold { n_splits: 2, ..KFold::default() }; - let results = cross_validate( - DecisionTreeClassifier::fit, + let y_hat = cross_val_predict( + KNNRegressor::fit, &x, &y, Default::default(), - cv, - &accuracy, + cv ) - .unwrap(); + .unwrap(); - println!("{}", results.mean_test_score()); - println!("{}", results.mean_train_score()); + assert!(mean_absolute_error(&y, &y_hat) < 10.0); } + } From 74f0d9e6fb574196cd84bc7d82169ad8a96cb910 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 17:44:44 -0800 Subject: [PATCH 73/79] fix: formatting --- src/model_selection/mod.rs | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 7178da8..7776354 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -155,7 +155,7 @@ pub fn cross_val_predict( x: &M, y: &M::RowVector, parameters: H, - cv: K + cv: K, ) -> Result where T: RealNumber, @@ -163,14 +163,14 @@ where H: Clone, E: Predictor, K: BaseKFold, - F: Fn(&M, &M::RowVector, H) -> Result -{ - let mut y_hat = M::RowVector::zeros(y.len()); - + F: Fn(&M, &M::RowVector, H) -> Result, +{ + let mut y_hat = M::RowVector::zeros(y.len()); + for (train_idx, test_idx) in cv.split(x) { let train_x = x.take(&train_idx, 0); let train_y = y.take(&train_idx); - let test_x = x.take(&test_idx, 0); + let test_x = x.take(&test_idx, 0); let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; @@ -348,16 +348,8 @@ mod tests { ..KFold::default() }; - let y_hat = cross_val_predict( - KNNRegressor::fit, - &x, - &y, - Default::default(), - cv - ) - .unwrap(); + let y_hat = cross_val_predict(KNNRegressor::fit, &x, &y, 
Default::default(), cv).unwrap(); assert!(mean_absolute_error(&y, &y_hat) < 10.0); } - } From dd341f4a12a8638f2f5538bc2fa68b5d2ca779de Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 23 Dec 2020 12:29:39 -0800 Subject: [PATCH 74/79] feat: + builders for algorithm parameters --- src/cluster/dbscan.rs | 22 ++++++++++++++-- src/cluster/kmeans.rs | 8 ++++++ src/decomposition/pca.rs | 9 +++++++ src/ensemble/random_forest_classifier.rs | 33 ++++++++++++++++++++++++ src/ensemble/random_forest_regressor.rs | 28 ++++++++++++++++++++ src/linear/elastic_net.rs | 30 +++++++++++++++++++++ src/linear/lasso.rs | 23 +++++++++++++++++ src/linear/linear_regression.rs | 8 ++++++ src/linear/ridge_regression.rs | 18 +++++++++++++ src/naive_bayes/bernoulli.rs | 15 +++++++++++ src/naive_bayes/categorical.rs | 6 +++++ src/naive_bayes/gaussian.rs | 5 ++++ src/naive_bayes/multinomial.rs | 10 +++++++ src/neighbors/knn_classifier.rs | 14 +++++++--- src/neighbors/knn_regressor.rs | 14 +++++++--- src/tree/decision_tree_classifier.rs | 23 +++++++++++++++++ src/tree/decision_tree_regressor.rs | 18 +++++++++++++ 17 files changed, 276 insertions(+), 8 deletions(-) diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index e595028..ac095f6 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -53,14 +53,32 @@ pub struct DBSCAN, T>> { #[derive(Debug, Clone)] /// DBSCAN clustering algorithm parameters pub struct DBSCANParameters { - /// Maximum number of iterations of the k-means algorithm for a single run. + /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. pub min_samples: usize, - /// The number of samples in a neighborhood for a point to be considered as a core point. + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. pub eps: T, /// KNN algorithm to use. 
pub algorithm: KNNAlgorithmName, } +impl DBSCANParameters { + /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. + pub fn with_min_samples(mut self, min_samples: usize) -> Self { + self.min_samples = min_samples; + self + } + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. + pub fn with_eps(mut self, eps: T) -> Self { + self.eps = eps; + self + } + /// KNN algorithm to use. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } +} + impl, T>> PartialEq for DBSCAN { fn eq(&self, other: &Self) -> bool { self.cluster_labels.len() == other.cluster_labels.len() diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 26a4038..bc5d673 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -105,6 +105,14 @@ pub struct KMeansParameters { pub max_iter: usize, } +impl KMeansParameters { + /// Maximum number of iterations of the k-means algorithm for a single run. + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for KMeansParameters { fn default() -> Self { KMeansParameters { max_iter: 100 } diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 7d80f88..68220e3 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -88,6 +88,15 @@ pub struct PCAParameters { pub use_correlation_matrix: bool, } +impl PCAParameters { + /// By default, covariance matrix is used to compute principal components. + /// Enable this flag if you want to use correlation matrix instead. 
+ pub fn with_use_correlation_matrix(mut self, use_correlation_matrix: bool) -> Self { + self.use_correlation_matrix = use_correlation_matrix; + self + } +} + impl Default for PCAParameters { fn default() -> Self { PCAParameters { diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index a742d90..9f1ba72 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -85,6 +85,39 @@ pub struct RandomForestClassifier { classes: Vec, } +impl RandomForestClassifierParameters { + /// Split criteria to use when building a tree. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self { + self.criterion = criterion; + self + } + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } + /// The number of trees in the forest. + pub fn with_n_trees(mut self, n_trees: u16) -> Self { + self.n_trees = n_trees; + self + } + /// Number of random sample of predictors to use as split candidates. 
+ pub fn with_m(mut self, m: usize) -> Self { + self.m = Some(m); + self + } +} + impl PartialEq for RandomForestClassifier { fn eq(&self, other: &Self) -> bool { if self.classes.len() != other.classes.len() || self.trees.len() != other.trees.len() { diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 52b39f9..6aa89d0 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -80,6 +80,34 @@ pub struct RandomForestRegressor { trees: Vec>, } +impl RandomForestRegressorParameters { + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } + /// The number of trees in the forest. + pub fn with_n_trees(mut self, n_trees: usize) -> Self { + self.n_trees = n_trees; + self + } + /// Number of random sample of predictors to use as split candidates. 
+ pub fn with_m(mut self, m: usize) -> Self { + self.m = Some(m); + self + } +} + impl Default for RandomForestRegressorParameters { fn default() -> Self { RandomForestRegressorParameters { diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index b386290..1ab933a 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -90,6 +90,36 @@ pub struct ElasticNet> { intercept: T, } +impl ElasticNetParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// The elastic net mixing parameter, with 0 <= l1_ratio <= 1. + /// For l1_ratio = 0 the penalty is an L2 penalty. + /// For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + pub fn with_l1_ratio(mut self, l1_ratio: T) -> Self { + self.l1_ratio = l1_ratio; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. + pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } + /// The tolerance for the optimization + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The maximum number of iterations + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for ElasticNetParameters { fn default() -> Self { ElasticNetParameters { diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 0dab3e5..e16a316 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -54,6 +54,29 @@ pub struct Lasso> { intercept: T, } +impl LassoParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. 
+ pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } + /// The tolerance for the optimization + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The maximum number of iterations + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for LassoParameters { fn default() -> Self { LassoParameters { diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index c7bd872..0ebad34 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -93,6 +93,14 @@ pub struct LinearRegression> { solver: LinearRegressionSolverName, } +impl LinearRegressionParameters { + /// Solver to use for estimation of regression coefficients. + pub fn with_solver(mut self, solver: LinearRegressionSolverName) -> Self { + self.solver = solver; + self + } +} + impl Default for LinearRegressionParameters { fn default() -> Self { LinearRegressionParameters { diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 2b5a898..5c14313 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -98,6 +98,24 @@ pub struct RidgeRegression> { solver: RidgeRegressionSolverName, } +impl RidgeRegressionParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Solver to use for estimation of regression coefficients. + pub fn with_solver(mut self, solver: RidgeRegressionSolverName) -> Self { + self.solver = solver; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. 
+ pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } +} + impl Default for RidgeRegressionParameters { fn default() -> Self { RidgeRegressionParameters { diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index fe299f3..db98efc 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -96,6 +96,21 @@ impl BernoulliNBParameters { binarize, } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } + /// Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors. + pub fn with_binarize(mut self, binarize: T) -> Self { + self.binarize = Some(binarize); + self + } } impl Default for BernoulliNBParameters { diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index ce526ce..ea81eb5 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -234,7 +234,13 @@ impl CategoricalNBParameters { ))) } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } } + impl Default for CategoricalNBParameters { fn default() -> Self { Self { alpha: T::one() } diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 01dacd7..f1fc812 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -86,6 +86,11 @@ impl GaussianNBParameters { pub fn new(priors: Option>) -> Self { Self { priors } } + /// Prior probabilities of the classes. 
If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } } impl GaussianNBDistribution { diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 849b8db..50d2ee2 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -86,6 +86,16 @@ impl MultinomialNBParameters { pub fn new(alpha: T, priors: Option>) -> Self { Self { alpha, priors } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } } impl Default for MultinomialNBParameters { diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 8b4db1b..6668539 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -80,9 +80,17 @@ impl, T>> KNNClassifierParameters { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. - pub fn with_distance(mut self, distance: D) -> Self { - self.distance = distance; - self + pub fn with_distance, T>>( + self, + distance: DD, + ) -> KNNClassifierParameters { + KNNClassifierParameters { + distance, + algorithm: self.algorithm, + weight: self.weight, + k: self.k, + t: PhantomData, + } } /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. 
pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index a97fdea..80971e5 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -82,9 +82,17 @@ impl, T>> KNNRegressorParameters { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. - pub fn with_distance(mut self, distance: D) -> Self { - self.distance = distance; - self + pub fn with_distance, T>>( + self, + distance: DD, + ) -> KNNRegressorParameters { + KNNRegressorParameters { + distance, + algorithm: self.algorithm, + weight: self.weight, + k: self.k, + t: PhantomData, + } } /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 1845d5e..50a855b 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -161,6 +161,29 @@ impl PartialEq for Node { } } +impl DecisionTreeClassifierParameters { + /// Split criteria to use when building a tree. + pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self { + self.criterion = criterion; + self + } + /// The maximum depth of the tree. + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. 
+ pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } +} + impl Default for DecisionTreeClassifierParameters { fn default() -> Self { DecisionTreeClassifierParameters { diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 492f0a1..806e680 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -101,6 +101,24 @@ struct Node { false_child: Option, } +impl DecisionTreeRegressorParameters { + /// The maximum depth of the tree. + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } +} + impl Default for DecisionTreeRegressorParameters { fn default() -> Self { DecisionTreeRegressorParameters { From 32ae63a577b3a84bcca2dc7472f830b00290f085 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 23 Dec 2020 12:38:10 -0800 Subject: [PATCH 75/79] feat: documentation adjusted to new builder --- src/cluster/dbscan.rs | 8 +++----- src/linear/linear_regression.rs | 6 +++--- src/linear/ridge_regression.rs | 7 ++----- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index ac095f6..c572ccc 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -15,11 +15,9 @@ //! let blobs = generator::make_blobs(100, 2, 3); //! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data); //! // Fit the algorithm and predict cluster labels -//! 
let labels = DBSCAN::fit(&x, Distances::euclidian(), DBSCANParameters{ -//! min_samples: 5, -//! eps: 3.0, -//! algorithm: KNNAlgorithmName::CoverTree -//! }).and_then(|dbscan| dbscan.predict(&x)); +//! let labels = DBSCAN::fit(&x, Distances::euclidian(), +//! DBSCANParameters::default().with_eps(3.0)). +//! and_then(|dbscan| dbscan.predict(&x)); //! //! println!("{:?}", labels); //! ``` diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 0ebad34..1855673 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -45,9 +45,9 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let lr = LinearRegression::fit(&x, &y, LinearRegressionParameters { -//! solver: LinearRegressionSolverName::QR, // or SVD -//! }).unwrap(); +//! let lr = LinearRegression::fit(&x, &y, +//! LinearRegressionParameters::default(). +//! with_solver(LinearRegressionSolverName::QR)).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! ``` diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 5c14313..f29898d 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -45,11 +45,8 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters { -//! solver: RidgeRegressionSolverName::Cholesky, -//! alpha: 0.1, -//! normalize: true -//! }).and_then(|lr| lr.predict(&x)).unwrap(); +//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters::default().with_alpha(0.1)). +//! and_then(|lr| lr.predict(&x)).unwrap(); //! ``` //! //! 
## References: From d22be7d6ae44c1fddc412fde9ca434070ae890b5 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 24 Dec 2020 13:47:09 -0800 Subject: [PATCH 76/79] fix: post-review changes --- src/naive_bayes/bernoulli.rs | 8 -------- src/naive_bayes/categorical.rs | 11 ----------- src/naive_bayes/gaussian.rs | 6 +----- src/naive_bayes/multinomial.rs | 4 ---- 4 files changed, 1 insertion(+), 28 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index db98efc..c6cbfa8 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -88,14 +88,6 @@ pub struct BernoulliNBParameters { } impl BernoulliNBParameters { - /// Create BernoulliNBParameters with specific paramaters. - pub fn new(alpha: T, priors: Option>, binarize: Option) -> Self { - Self { - alpha, - priors, - binarize, - } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha; diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index ea81eb5..667a270 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -223,17 +223,6 @@ pub struct CategoricalNBParameters { } impl CategoricalNBParameters { - /// Create CategoricalNBParameters with specific paramaters. - pub fn new(alpha: T) -> Result { - if alpha > T::zero() { - Ok(Self { alpha }) - } else { - Err(Failed::fit(&format!( - "alpha should be >= 0, alpha=[{}]", - alpha - ))) - } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha; diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index f1fc812..bc96420 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -82,10 +82,6 @@ pub struct GaussianNBParameters { } impl GaussianNBParameters { - /// Create GaussianNBParameters with specific paramaters. 
- pub fn new(priors: Option>) -> Self { - Self { priors } - } /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data pub fn with_priors(mut self, priors: Vec) -> Self { self.priors = Some(priors); @@ -266,7 +262,7 @@ mod tests { let y = vec![1., 1., 1., 2., 2., 2.]; let priors = vec![0.3, 0.7]; - let parameters = GaussianNBParameters::new(Some(priors.clone())); + let parameters = GaussianNBParameters::default().with_priors(priors.clone()); let gnb = GaussianNB::fit(&x, &y, parameters).unwrap(); assert_eq!(gnb.inner.distribution.class_priors, priors); diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 50d2ee2..237b606 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -82,10 +82,6 @@ pub struct MultinomialNBParameters { } impl MultinomialNBParameters { - /// Create MultinomialNBParameters with specific paramaters. - pub fn new(alpha: T, priors: Option>) -> Self { - Self { alpha, priors } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). 
pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha; From 810a5c429b9df1aa383e1eaf607f7c4c1e0b7a3f Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 24 Dec 2020 18:36:23 -0800 Subject: [PATCH 77/79] feat: consolidates API --- src/api.rs | 43 +++++++++++++++ src/base.rs | 10 ---- src/cluster/dbscan.rs | 66 ++++++++++++++++-------- src/cluster/kmeans.rs | 65 +++++++++++++++-------- src/decomposition/pca.rs | 52 ++++++++++++------- src/decomposition/svd.rs | 40 +++++++++++--- src/ensemble/random_forest_classifier.rs | 15 +++++- src/ensemble/random_forest_regressor.rs | 15 +++++- src/lib.rs | 2 +- src/linear/elastic_net.rs | 10 +++- src/linear/lasso.rs | 10 +++- src/linear/linear_regression.rs | 14 ++++- src/linear/logistic_regression.rs | 14 ++++- src/linear/ridge_regression.rs | 14 ++++- src/model_selection/mod.rs | 2 +- src/naive_bayes/bernoulli.rs | 10 +++- src/naive_bayes/categorical.rs | 14 ++++- src/naive_bayes/gaussian.rs | 10 +++- src/naive_bayes/multinomial.rs | 14 ++++- src/neighbors/knn_classifier.rs | 14 ++++- src/neighbors/knn_regressor.rs | 14 ++++- src/svm/svc.rs | 10 +++- src/svm/svr.rs | 10 +++- src/tree/decision_tree_classifier.rs | 15 +++++- src/tree/decision_tree_regressor.rs | 15 +++++- 25 files changed, 400 insertions(+), 98 deletions(-) create mode 100644 src/api.rs delete mode 100644 src/base.rs diff --git a/src/api.rs b/src/api.rs new file mode 100644 index 0000000..c598e12 --- /dev/null +++ b/src/api.rs @@ -0,0 +1,43 @@ +//! # Common Interfaces and API +//! +//! This module provides interfaces and uniform API with simple conventions +//! that are used in other modules for supervised and unsupervised learning. + +use crate::error::Failed; + +/// An estimator for unsupervised learning, that provides method `fit` to learn from data +pub trait UnsupervisedEstimator { + /// Fit a model to a training dataset, estimate model's parameters. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. 
+ /// * `parameters` - hyperparameters of an algorithm + fn fit(x: &X, parameters: P) -> Result + where + Self: Sized, + P: Clone; +} + +/// An estimator for supervised learning, , that provides method `fit` to learn from data and training values +pub trait SupervisedEstimator { + /// Fit a model to a training dataset, estimate model's parameters. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target training values of size _N_. + /// * `parameters` - hyperparameters of an algorithm + fn fit(x: &X, y: &Y, parameters: P) -> Result + where + Self: Sized, + P: Clone; +} + +/// Implements method predict that estimates target value from new data +pub trait Predictor { + /// Estimate target values from new data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + fn predict(&self, x: &X) -> Result; +} + +/// Implements method transform that filters or modifies input data +pub trait Transformer { + /// Transform data by modifying or filtering it + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + fn transform(&self, x: &X) -> Result; +} diff --git a/src/base.rs b/src/base.rs deleted file mode 100644 index a2d4468..0000000 --- a/src/base.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! # Common Interfaces and methods -//! -//! This module consolidates interfaces and uniform basic API that is used elsewhere in the code. - -use crate::error::Failed; - -/// Implements method predict that offers a way to estimate target value from new data -pub trait Predictor { - fn predict(&self, x: &X) -> Result; -} diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index c572ccc..9aed2f0 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -15,8 +15,7 @@ //! let blobs = generator::make_blobs(100, 2, 3); //! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data); //! // Fit the algorithm and predict cluster labels -//! 
let labels = DBSCAN::fit(&x, Distances::euclidian(), -//! DBSCANParameters::default().with_eps(3.0)). +//! let labels = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(3.0)). //! and_then(|dbscan| dbscan.predict(&x)); //! //! println!("{:?}", labels); @@ -33,9 +32,11 @@ use std::iter::Sum; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::api::{Predictor, UnsupervisedEstimator}; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::tree::decision_tree_classifier::which_max; @@ -50,7 +51,11 @@ pub struct DBSCAN, T>> { #[derive(Debug, Clone)] /// DBSCAN clustering algorithm parameters -pub struct DBSCANParameters { +pub struct DBSCANParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub distance: D, /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. pub min_samples: usize, /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. @@ -59,7 +64,18 @@ pub struct DBSCANParameters { pub algorithm: KNNAlgorithmName, } -impl DBSCANParameters { +impl, T>> DBSCANParameters { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. 
+ pub fn with_distance, T>>(self, distance: DD) -> DBSCANParameters { + DBSCANParameters { + distance, + min_samples: self.min_samples, + eps: self.eps, + algorithm: self.algorithm, + } + } /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. pub fn with_min_samples(mut self, min_samples: usize) -> Self { self.min_samples = min_samples; @@ -86,9 +102,10 @@ impl, T>> PartialEq for DBSCAN { } } -impl Default for DBSCANParameters { +impl Default for DBSCANParameters { fn default() -> Self { DBSCANParameters { + distance: Distances::euclidian(), min_samples: 5, eps: T::half(), algorithm: KNNAlgorithmName::CoverTree, @@ -96,6 +113,22 @@ impl Default for DBSCANParameters { } } +impl, D: Distance, T>> + UnsupervisedEstimator> for DBSCAN +{ + fn fit(x: &M, parameters: DBSCANParameters) -> Result { + DBSCAN::fit(x, parameters) + } +} + +impl, D: Distance, T>> Predictor + for DBSCAN +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> DBSCAN { /// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features. 
/// * `data` - training instances to cluster @@ -103,8 +136,7 @@ impl, T>> DBSCAN { /// * `parameters` - cluster parameters pub fn fit>( x: &M, - distance: D, - parameters: DBSCANParameters, + parameters: DBSCANParameters, ) -> Result, Failed> { if parameters.min_samples < 1 { return Err(Failed::fit(&"Invalid minPts".to_string())); @@ -121,7 +153,9 @@ impl, T>> DBSCAN { let n = x.shape().0; let mut y = vec![unassigned; n]; - let algo = parameters.algorithm.fit(row_iter(x).collect(), distance)?; + let algo = parameters + .algorithm + .fit(row_iter(x).collect(), parameters.distance)?; for (i, e) in row_iter(x).enumerate() { if y[i] == unassigned { @@ -195,7 +229,6 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::math::distance::euclidian::Euclidian; - use crate::math::distance::Distances; #[test] fn fit_predict_dbscan() { @@ -215,16 +248,7 @@ mod tests { let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0]; - let dbscan = DBSCAN::fit( - &x, - Distances::euclidian(), - DBSCANParameters { - min_samples: 5, - eps: 1.0, - algorithm: KNNAlgorithmName::CoverTree, - }, - ) - .unwrap(); + let dbscan = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(1.0)).unwrap(); let predicted_labels = dbscan.predict(&x).unwrap(); @@ -256,7 +280,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let dbscan = DBSCAN::fit(&x, Distances::euclidian(), Default::default()).unwrap(); + let dbscan = DBSCAN::fit(&x, Default::default()).unwrap(); let deserialized_dbscan: DBSCAN = serde_json::from_str(&serde_json::to_string(&dbscan).unwrap()).unwrap(); diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index bc5d673..44ce1e6 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -43,7 +43,7 @@ //! &[5.2, 2.7, 3.9, 1.4], //! ]); //! -//! let kmeans = KMeans::fit(&x, 2, Default::default()).unwrap(); // Fit to data, 2 clusters +//! 
let kmeans = KMeans::fit(&x, KMeansParameters::default().with_k(2)).unwrap(); // Fit to data, 2 clusters //! let y_hat = kmeans.predict(&x).unwrap(); // use the same points for prediction //! ``` //! @@ -59,6 +59,7 @@ use std::iter::Sum; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::bbd_tree::BBDTree; +use crate::api::{Predictor, UnsupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::distance::euclidian::*; @@ -101,11 +102,18 @@ impl PartialEq for KMeans { #[derive(Debug, Clone)] /// K-Means clustering algorithm parameters pub struct KMeansParameters { + /// Number of clusters. + pub k: usize, /// Maximum number of iterations of the k-means algorithm for a single run. pub max_iter: usize, } impl KMeansParameters { + /// Number of clusters. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } /// Maximum number of iterations of the k-means algorithm for a single run. pub fn with_max_iter(mut self, max_iter: usize) -> Self { self.max_iter = max_iter; @@ -115,24 +123,37 @@ impl KMeansParameters { impl Default for KMeansParameters { fn default() -> Self { - KMeansParameters { max_iter: 100 } + KMeansParameters { + k: 2, + max_iter: 100, + } + } +} + +impl> UnsupervisedEstimator for KMeans { + fn fit(x: &M, parameters: KMeansParameters) -> Result { + KMeans::fit(x, parameters) + } +} + +impl> Predictor for KMeans { + fn predict(&self, x: &M) -> Result { + self.predict(x) } } impl KMeans { /// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features. 
- /// * `data` - training instances to cluster - /// * `k` - number of clusters + /// * `data` - training instances to cluster /// * `parameters` - cluster parameters - pub fn fit>( - data: &M, - k: usize, - parameters: KMeansParameters, - ) -> Result, Failed> { + pub fn fit>(data: &M, parameters: KMeansParameters) -> Result, Failed> { let bbd = BBDTree::new(data); - if k < 2 { - return Err(Failed::fit(&format!("invalid number of clusters: {}", k))); + if parameters.k < 2 { + return Err(Failed::fit(&format!( + "invalid number of clusters: {}", + parameters.k + ))); } if parameters.max_iter == 0 { @@ -145,9 +166,9 @@ impl KMeans { let (n, d) = data.shape(); let mut distortion = T::max_value(); - let mut y = KMeans::kmeans_plus_plus(data, k); - let mut size = vec![0; k]; - let mut centroids = vec![vec![T::zero(); d]; k]; + let mut y = KMeans::kmeans_plus_plus(data, parameters.k); + let mut size = vec![0; parameters.k]; + let mut centroids = vec![vec![T::zero(); d]; parameters.k]; for i in 0..n { size[y[i]] += 1; @@ -159,16 +180,16 @@ impl KMeans { } } - for i in 0..k { + for i in 0..parameters.k { for j in 0..d { centroids[i][j] /= T::from(size[i]).unwrap(); } } - let mut sums = vec![vec![T::zero(); d]; k]; + let mut sums = vec![vec![T::zero(); d]; parameters.k]; for _ in 1..=parameters.max_iter { let dist = bbd.clustering(¢roids, &mut sums, &mut size, &mut y); - for i in 0..k { + for i in 0..parameters.k { if size[i] > 0 { for j in 0..d { centroids[i][j] = T::from(sums[i][j]).unwrap() / T::from(size[i]).unwrap(); @@ -184,7 +205,7 @@ impl KMeans { } Ok(KMeans { - k, + k: parameters.k, y, size, distortion, @@ -280,10 +301,10 @@ mod tests { fn invalid_k() { let x = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); - assert!(KMeans::fit(&x, 0, Default::default()).is_err()); + assert!(KMeans::fit(&x, KMeansParameters::default().with_k(0)).is_err()); assert_eq!( "Fit failed: invalid number of clusters: 1", - KMeans::fit(&x, 1, Default::default()) + 
KMeans::fit(&x, KMeansParameters::default().with_k(1)) .unwrap_err() .to_string() ); @@ -314,7 +335,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let kmeans = KMeans::fit(&x, 2, Default::default()).unwrap(); + let kmeans = KMeans::fit(&x, Default::default()).unwrap(); let y = kmeans.predict(&x).unwrap(); @@ -348,7 +369,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let kmeans = KMeans::fit(&x, 2, Default::default()).unwrap(); + let kmeans = KMeans::fit(&x, Default::default()).unwrap(); let deserialized_kmeans: KMeans = serde_json::from_str(&serde_json::to_string(&kmeans).unwrap()).unwrap(); diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 68220e3..189e6de 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -37,7 +37,7 @@ //! &[5.2, 2.7, 3.9, 1.4], //! ]); //! -//! let pca = PCA::fit(&iris, 2, Default::default()).unwrap(); // Reduce number of features to 2 +//! let pca = PCA::fit(&iris, PCAParameters::default().with_n_components(2)).unwrap(); // Reduce number of features to 2 //! //! let iris_reduced = pca.transform(&iris).unwrap(); //! @@ -49,6 +49,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -83,12 +84,19 @@ impl> PartialEq for PCA { #[derive(Debug, Clone)] /// PCA parameters pub struct PCAParameters { + /// Number of components to keep. + pub n_components: usize, /// By default, covariance matrix is used to compute principal components. /// Enable this flag if you want to use correlation matrix instead. pub use_correlation_matrix: bool, } impl PCAParameters { + /// Number of components to keep. + pub fn with_n_components(mut self, n_components: usize) -> Self { + self.n_components = n_components; + self + } /// By default, covariance matrix is used to compute principal components. /// Enable this flag if you want to use correlation matrix instead. 
pub fn with_use_correlation_matrix(mut self, use_correlation_matrix: bool) -> Self { @@ -100,24 +108,33 @@ impl PCAParameters { impl Default for PCAParameters { fn default() -> Self { PCAParameters { + n_components: 2, use_correlation_matrix: false, } } } +impl> UnsupervisedEstimator for PCA { + fn fit(x: &M, parameters: PCAParameters) -> Result { + PCA::fit(x, parameters) + } +} + +impl> Transformer for PCA { + fn transform(&self, x: &M) -> Result { + self.transform(x) + } +} + impl> PCA { /// Fits PCA to your data. /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `n_components` - number of components to keep. /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. - pub fn fit( - data: &M, - n_components: usize, - parameters: PCAParameters, - ) -> Result, Failed> { + pub fn fit(data: &M, parameters: PCAParameters) -> Result, Failed> { let (m, n) = data.shape(); - if n_components > n { + if parameters.n_components > n { return Err(Failed::fit(&format!( "Number of components, n_components should be <= number of attributes ({})", n @@ -196,16 +213,16 @@ impl> PCA { } } - let mut projection = M::zeros(n_components, n); + let mut projection = M::zeros(parameters.n_components, n); for i in 0..n { - for j in 0..n_components { + for j in 0..parameters.n_components { projection.set(j, i, eigenvectors.get(i, j)); } } - let mut pmu = vec![T::zero(); n_components]; + let mut pmu = vec![T::zero(); parameters.n_components]; for (k, mu_k) in mu.iter().enumerate().take(n) { - for (i, pmu_i) in pmu.iter_mut().enumerate().take(n_components) { + for (i, pmu_i) in pmu.iter_mut().enumerate().take(parameters.n_components) { *pmu_i += projection.get(i, k) * (*mu_k); } } @@ -318,7 +335,7 @@ mod tests { &[0.0752, 0.2007], ]); - let pca = PCA::fit(&us_arrests, 2, Default::default()).unwrap(); + let pca = PCA::fit(&us_arrests, Default::default()).unwrap(); 
assert!(expected.approximate_eq(&pca.components().abs(), 0.4)); } @@ -414,7 +431,7 @@ mod tests { 302.04806302399646, ]; - let pca = PCA::fit(&us_arrests, 4, Default::default()).unwrap(); + let pca = PCA::fit(&us_arrests, PCAParameters::default().with_n_components(4)).unwrap(); assert!(pca .eigenvectors @@ -525,10 +542,9 @@ mod tests { let pca = PCA::fit( &us_arrests, - 4, - PCAParameters { - use_correlation_matrix: true, - }, + PCAParameters::default() + .with_n_components(4) + .with_use_correlation_matrix(true), ) .unwrap(); @@ -573,7 +589,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let pca = PCA::fit(&iris, 4, Default::default()).unwrap(); + let pca = PCA::fit(&iris, Default::default()).unwrap(); let deserialized_pca: PCA> = serde_json::from_str(&serde_json::to_string(&pca).unwrap()).unwrap(); diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index eea1969..d404ca7 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -34,7 +34,7 @@ //! &[5.2, 2.7, 3.9, 1.4], //! ]); //! -//! let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); // Reduce number of features to 2 +//! let svd = SVD::fit(&iris, SVDParameters::default().with_n_components(2)).unwrap(); // Reduce number of features to 2 //! //! let iris_reduced = svd.transform(&iris).unwrap(); //! @@ -47,6 +47,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -67,11 +68,34 @@ impl> PartialEq for SVD { #[derive(Debug, Clone)] /// SVD parameters -pub struct SVDParameters {} +pub struct SVDParameters { + /// Number of components to keep. + pub n_components: usize, +} impl Default for SVDParameters { fn default() -> Self { - SVDParameters {} + SVDParameters { n_components: 2 } + } +} + +impl SVDParameters { + /// Number of components to keep. 
+ pub fn with_n_components(mut self, n_components: usize) -> Self { + self.n_components = n_components; + self + } +} + +impl> UnsupervisedEstimator for SVD { + fn fit(x: &M, parameters: SVDParameters) -> Result { + SVD::fit(x, parameters) + } +} + +impl> Transformer for SVD { + fn transform(&self, x: &M) -> Result { + self.transform(x) } } @@ -80,10 +104,10 @@ impl> SVD { /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `n_components` - number of components to keep. /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. - pub fn fit(x: &M, n_components: usize, _: SVDParameters) -> Result, Failed> { + pub fn fit(x: &M, parameters: SVDParameters) -> Result, Failed> { let (_, p) = x.shape(); - if n_components >= p { + if parameters.n_components >= p { return Err(Failed::fit(&format!( "Number of components, n_components should be < number of attributes ({})", p @@ -92,7 +116,7 @@ impl> SVD { let svd = x.svd()?; - let components = svd.V.slice(0..p, 0..n_components); + let components = svd.V.slice(0..p, 0..parameters.n_components); Ok(SVD { components, @@ -189,7 +213,7 @@ mod tests { &[197.28420365, -11.66808306], &[293.43187394, 1.91163633], ]); - let svd = SVD::fit(&x, 2, Default::default()).unwrap(); + let svd = SVD::fit(&x, Default::default()).unwrap(); let x_transformed = svd.transform(&x).unwrap(); @@ -225,7 +249,7 @@ mod tests { &[5.2, 2.7, 3.9, 1.4], ]); - let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); + let svd = SVD::fit(&iris, Default::default()).unwrap(); let deserialized_svd: SVD> = serde_json::from_str(&serde_json::to_string(&svd).unwrap()).unwrap(); diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 9f1ba72..49c4239 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -51,7 +51,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, 
Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -151,6 +151,19 @@ impl Default for RandomForestClassifierParameters { } } +impl> + SupervisedEstimator + for RandomForestClassifier +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: RandomForestClassifierParameters, + ) -> Result { + RandomForestClassifier::fit(x, y, parameters) + } +} + impl> Predictor for RandomForestClassifier { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 6aa89d0..fdeb9fc 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -49,7 +49,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -135,6 +135,19 @@ impl PartialEq for RandomForestRegressor { } } +impl> + SupervisedEstimator + for RandomForestRegressor +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: RandomForestRegressorParameters, + ) -> Result { + RandomForestRegressor::fit(x, y, parameters) + } +} + impl> Predictor for RandomForestRegressor { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/lib.rs b/src/lib.rs index a1608c3..297fcc4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,7 +71,7 @@ /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; -pub(crate) mod base; +pub mod api; /// Algorithms for clustering of unlabeled data pub mod cluster; /// Various datasets diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index 1ab933a..2833ff1 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -58,7 +58,7 @@ use std::fmt::Debug; use serde::{Deserialize, 
Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -139,6 +139,14 @@ impl> PartialEq for ElasticNet { } } +impl> SupervisedEstimator> + for ElasticNet +{ + fn fit(x: &M, y: &M::RowVector, parameters: ElasticNetParameters) -> Result { + ElasticNet::fit(x, y, parameters) + } +} + impl> Predictor for ElasticNet { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index e16a316..b99ecff 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -26,7 +26,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -95,6 +95,14 @@ impl> PartialEq for Lasso { } } +impl> SupervisedEstimator> + for Lasso +{ + fn fit(x: &M, y: &M::RowVector, parameters: LassoParameters) -> Result { + Lasso::fit(x, y, parameters) + } +} + impl> Predictor for Lasso { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 1855673..2ef03c1 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -64,7 +64,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -116,6 +116,18 @@ impl> PartialEq for LinearRegression { } } +impl> SupervisedEstimator + for LinearRegression +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: LinearRegressionParameters, + ) -> Result { + LinearRegression::fit(x, y, parameters) + } +} + impl> Predictor for LinearRegression { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/linear/logistic_regression.rs 
b/src/linear/logistic_regression.rs index ffb845c..a71ac45 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -58,7 +58,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -218,6 +218,18 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction } } +impl> SupervisedEstimator + for LogisticRegression +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: LogisticRegressionParameters, + ) -> Result { + LogisticRegression::fit(x, y, parameters) + } +} + impl> Predictor for LogisticRegression { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index f29898d..e9ed1ff 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -60,7 +60,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -130,6 +130,18 @@ impl> PartialEq for RidgeRegression { } } +impl> SupervisedEstimator> + for RidgeRegression +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: RidgeRegressionParameters, + ) -> Result { + RidgeRegression::fit(x, y, parameters) + } +} + impl> Predictor for RidgeRegression { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 7776354..18dfa35 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -9,7 +9,7 @@ //! //! In SmartCore you can split your data into training and test datasets using `train_test_split` function. 
-use crate::base::Predictor; +use crate::api::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index c6cbfa8..388646f 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -33,7 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -208,6 +208,14 @@ pub struct BernoulliNB> { binarize: Option, } +impl> SupervisedEstimator> + for BernoulliNB +{ + fn fit(x: &M, y: &M::RowVector, parameters: BernoulliNBParameters) -> Result { + BernoulliNB::fit(x, y, parameters) + } +} + impl> Predictor for BernoulliNB { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index 667a270..c6f28bd 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -30,7 +30,7 @@ //! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! 
``` -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -242,6 +242,18 @@ pub struct CategoricalNB> { inner: BaseNaiveBayes>, } +impl> SupervisedEstimator> + for CategoricalNB +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: CategoricalNBParameters, + ) -> Result { + CategoricalNB::fit(x, y, parameters) + } +} + impl> Predictor for CategoricalNB { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index bc96420..2ac9892 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -22,7 +22,7 @@ //! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! ``` -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -183,6 +183,14 @@ pub struct GaussianNB> { inner: BaseNaiveBayes>, } +impl> SupervisedEstimator> + for GaussianNB +{ + fn fit(x: &M, y: &M::RowVector, parameters: GaussianNBParameters) -> Result { + GaussianNB::fit(x, y, parameters) + } +} + impl> Predictor for GaussianNB { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 237b606..4cae1f3 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -33,7 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. 
D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -194,6 +194,18 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } +impl> SupervisedEstimator> + for MultinomialNB +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: MultinomialNBParameters, + ) -> Result { + MultinomialNB::fit(x, y, parameters) + } +} + impl> Predictor for MultinomialNB { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 6668539..97dd748 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -36,7 +36,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; use crate::math::distance::euclidian::Euclidian; @@ -139,6 +139,18 @@ impl, T>> PartialEq for KNNClassifier { } } +impl, D: Distance, T>> + SupervisedEstimator> for KNNClassifier +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: KNNClassifierParameters, + ) -> Result { + KNNClassifier::fit(x, y, parameters) + } +} + impl, D: Distance, T>> Predictor for KNNClassifier { diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 80971e5..4e73103 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -39,7 +39,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::{row_iter, BaseVector, Matrix}; use 
crate::math::distance::euclidian::Euclidian; @@ -133,6 +133,18 @@ impl, T>> PartialEq for KNNRegressor { } } +impl, D: Distance, T>> + SupervisedEstimator> for KNNRegressor +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: KNNRegressorParameters, + ) -> Result { + KNNRegressor::fit(x, y, parameters) + } +} + impl, D: Distance, T>> Predictor for KNNRegressor { diff --git a/src/svm/svc.rs b/src/svm/svc.rs index aee4d3f..095d555 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -78,7 +78,7 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -185,6 +185,14 @@ impl> Default for SVCParameters } } +impl, K: Kernel> + SupervisedEstimator> for SVC +{ + fn fit(x: &M, y: &M::RowVector, parameters: SVCParameters) -> Result { + SVC::fit(x, y, parameters) + } +} + impl, K: Kernel> Predictor for SVC { diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 295ad78..9eb6046 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -70,7 +70,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -174,6 +174,14 @@ impl> Default for SVRParameters } } +impl, K: Kernel> + SupervisedEstimator> for SVR +{ + fn fit(x: &M, y: &M::RowVector, parameters: SVRParameters) -> Result { + SVR::fit(x, y, parameters) + } +} + impl, K: Kernel> Predictor for SVR { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 50a855b..3a92c54 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -71,7 +71,7 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; -use crate::base::Predictor; +use 
crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -293,6 +293,19 @@ pub(in crate) fn which_max(x: &[usize]) -> usize { which } +impl> + SupervisedEstimator + for DecisionTreeClassifier +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: DecisionTreeClassifierParameters, + ) -> Result { + DecisionTreeClassifier::fit(x, y, parameters) + } +} + impl> Predictor for DecisionTreeClassifier { fn predict(&self, x: &M) -> Result { self.predict(x) diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 806e680..06ee507 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -66,7 +66,7 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; -use crate::base::Predictor; +use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -208,6 +208,19 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } +impl> + SupervisedEstimator + for DecisionTreeRegressor +{ + fn fit( + x: &M, + y: &M::RowVector, + parameters: DecisionTreeRegressorParameters, + ) -> Result { + DecisionTreeRegressor::fit(x, y, parameters) + } +} + impl> Predictor for DecisionTreeRegressor { fn predict(&self, x: &M) -> Result { self.predict(x) From 9475d500dbe08d6b7c98ba68ac2bf4ce47c2fe31 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sun, 27 Dec 2020 18:39:37 -0800 Subject: [PATCH 78/79] feat: version change + api documentation updated --- Cargo.toml | 2 +- src/cluster/dbscan.rs | 16 +++++- src/lib.rs | 23 ++++----- src/model_selection/mod.rs | 103 +++++++++++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 21 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 32d8695..5e21aef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "smartcore" description = "The 
most advanced machine learning library in rust." homepage = "https://smartcorelib.org" -version = "0.1.0" +version = "0.2.0" authors = ["SmartCore Developers"] edition = "2018" license = "Apache-2.0" diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 9aed2f0..7d641cd 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -1,6 +1,20 @@ //! # DBSCAN Clustering //! -//! DBSCAN - Density-Based Spatial Clustering of Applications with Noise. +//! DBSCAN stands for density-based spatial clustering of applications with noise. This algorithm is good for arbitrarily shaped clusters and clusters with noise. +//! The main idea behind DBSCAN is that a point belongs to a cluster if it is close to many points from that cluster. There are two key parameters of DBSCAN: +//! +//! * `eps`, the maximum distance that specifies a neighborhood. Two points are considered to be neighbors if the distance between them is less than or equal to `eps`. +//! * `min_samples`, minimum number of data points that defines a cluster. +//! +//! Based on these two parameters, points are classified as core point, border point, or outlier: +//! +//! * A point is a core point if there are at least `min_samples` points, including the point itself, in its vicinity. +//! * A point is a border point if it is reachable from a core point and there are fewer than `min_samples` points within its surrounding area. +//! * All points not reachable from any core point are outliers or noise points. +//! +//! The algorithm starts by picking an arbitrary point in the dataset. +//! If there are at least `min_samples` points within a radius of `eps` to the point then we consider all these points to be part of the same cluster. +//! The clusters are then expanded by recursively repeating the neighborhood calculation for each neighboring point. //! //! Example: //!
diff --git a/src/lib.rs b/src/lib.rs index 297fcc4..d962894 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,16 +10,11 @@ //! //! Welcome to SmartCore, the most advanced machine learning library in Rust! //! -//! In SmartCore you will find implementation of these ML algorithms: -//! * __Regression__: Linear Regression (OLS), Decision Tree Regressor, Random Forest Regressor, K Nearest Neighbors -//! * __Classification__: Logistic Regressor, Decision Tree Classifier, Random Forest Classifier, Supervised Nearest Neighbors (KNN) -//! * __Clustering__: K-Means -//! * __Matrix Decomposition__: PCA, LU, QR, SVD, EVD -//! * __Distance Metrics__: Euclidian, Minkowski, Manhattan, Hamming, Mahalanobis -//! * __Evaluation Metrics__: Accuracy, AUC, Recall, Precision, F1, Mean Absolute Error, Mean Squared Error, R2 +//! SmartCore features various classification, regression and clustering algorithms including support vector machines, random forests, k-means and DBSCAN, +//! as well as tools for model selection and model evaluation. //! -//! Most of algorithms implemented in SmartCore operate on n-dimentional arrays. While you can use Rust vectors with all functions defined in this library -//! we do recommend to go with one of the popular linear algebra libraries available in Rust. At this moment we support these packages: +//! SmartCore is well integrated with a wide variety of libraries that provide support for large, multi-dimensional arrays and matrices. At this moment, +//! all of SmartCore's algorithms work with ordinary Rust vectors, as well as matrices and vectors defined in these packages: //! * [ndarray](https://docs.rs/ndarray) //! * [nalgebra](https://docs.rs/nalgebra/) //! @@ -28,21 +23,21 @@ //! To start using SmartCore simply add the following to your Cargo.toml file: //! ```ignore //! [dependencies] -//! smartcore = "0.1.0" +//! smartcore = "0.2.0" //! ``` //! -//! All ML algorithms in SmartCore are grouped into these generic categories: +//!
All machine learning algorithms in SmartCore are grouped into these broad categories: //! * [Clustering](cluster/index.html), unsupervised clustering of unlabeled data. //! * [Martix Decomposition](decomposition/index.html), various methods for matrix decomposition. //! * [Linear Models](linear/index.html), regression and classification methods where output is assumed to have linear relation to explanatory variables //! * [Ensemble Models](ensemble/index.html), variety of regression and classification ensemble models //! * [Tree-based Models](tree/index.html), classification and regression trees //! * [Nearest Neighbors](neighbors/index.html), K Nearest Neighbors for classification and regression +//! * [Naive Bayes](naive_bayes/index.html), statistical classification technique based on Bayes Theorem +//! * [SVM](svm/index.html), support vector machines //! -//! Each category is assigned to a separate module. //! -//! For example, KNN classifier is defined in [smartcore::neighbors::knn_classifier](neighbors/knn_classifier/index.html). To train and run it using standard Rust vectors you will -//! run this code: +//! For example, you can use this code to fit a [K Nearest Neighbors classifier](neighbors/knn_classifier/index.html) to a dataset that is defined as standard Rust vector: //! //! ``` //! // DenseMatrix defenition diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 18dfa35..0058367 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -1,13 +1,106 @@ //! # Model Selection methods //! -//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), -//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. +//! In statistics and machine learning we usually split our data into two sets: one for training and the other one for testing. +//! 
We fit our model to the training data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. //! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. //! Underfitted is bad because the model is undetrained and does not fit the training data well. -//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for -//! your data. +//! Splitting data into multiple subsets helps us to find the right combination of hyperparameters, estimate model performance and choose the right model for +//! the data. //! -//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +//! In SmartCore a random split into training and test sets can be quickly computed with the [train_test_split](./fn.train_test_split.html) helper function. +//! +//! ``` +//! use crate::smartcore::linalg::BaseMatrix; +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::model_selection::train_test_split; +//! +//! //Iris data +//! let x = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! let y: Vec = vec![ +//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., +//! ]; +//! +//! 
let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true); +//! +//! println!("X train: {:?}, y train: {}, X test: {:?}, y test: {}", +//! x_train.shape(), y_train.len(), x_test.shape(), y_test.len()); +//! ``` +//! +//! When we partition the available data into two disjoint sets, we drastically reduce the number of samples that can be used for training. +//! +//! One way to solve this problem is to use k-fold cross-validation. With k-fold validation, the dataset is split into k disjoint sets. +//! A model is trained using k - 1 of the folds, and the resulting model is validated on the remaining portion of the data. +//! +//! The simplest way to run cross-validation is to use the [cross_validate](./fn.cross_validate.html) helper function on your estimator and the dataset. +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::model_selection::{KFold, cross_validate}; +//! use smartcore::metrics::accuracy; +//! use smartcore::linear::logistic_regression::LogisticRegression; +//! +//! //Iris data +//! let x = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! let y: Vec = vec![ +//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., +//! ]; +//! +//! let cv = KFold::default().with_n_splits(3); +//! +//! let results = cross_validate(LogisticRegression::fit, //estimator +//! &x, &y, //data +//! Default::default(), //hyperparameters +//!
cv, //cross validation split +//! &accuracy).unwrap(); //metric +//! +//! println!("Training accuracy: {}, test accuracy: {}", +//! results.mean_train_score(), results.mean_test_score()); +//! ``` +//! +//! The function [cross_val_predict](./fn.cross_val_predict.html) has a similar interface to `cross_validate`, +//! but instead of test error it calculates predictions for all samples in the test set. use crate::api::Predictor; use crate::error::Failed; From bb9a05b9930e686c7a6691ab27696eae35f5ee34 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sat, 2 Jan 2021 18:08:40 -0800 Subject: [PATCH 79/79] fix: fixes a bug in DBSCAN, removes println's --- src/cluster/dbscan.rs | 65 ++++++++++++++++++++++++++++------------ src/dataset/generator.rs | 3 -- src/decomposition/svd.rs | 3 +- 3 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 7d641cd..c793039 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -161,39 +161,60 @@ impl, T>> DBSCAN { } let mut k = 0; - let unassigned = -2; + let queued = -2; let outlier = -1; + let undefined = -3; let n = x.shape().0; - let mut y = vec![unassigned; n]; + let mut y = vec![undefined; n]; let algo = parameters .algorithm .fit(row_iter(x).collect(), parameters.distance)?; for (i, e) in row_iter(x).enumerate() { - if y[i] == unassigned { + if y[i] == undefined { let mut neighbors = algo.find_radius(&e, parameters.eps)?; if neighbors.len() < parameters.min_samples { y[i] = outlier; } else { y[i] = k; + for j in 0..neighbors.len() { - if y[neighbors[j].0] == unassigned { - y[neighbors[j].0] = k; - - let mut secondary_neighbors = - algo.find_radius(neighbors[j].2, parameters.eps)?; - - if secondary_neighbors.len() >= parameters.min_samples { - neighbors.append(&mut secondary_neighbors); - } - } - - if y[neighbors[j].0] == outlier { - y[neighbors[j].0] = k; + if y[neighbors[j].0] == undefined { + y[neighbors[j].0] = queued; } } + + while !neighbors.is_empty()
{ + let neighbor = neighbors.pop().unwrap(); + let index = neighbor.0; + + if y[index] == outlier { + y[index] = k; + } + + if y[index] == undefined || y[index] == queued { + y[index] = k; + + let secondary_neighbors = + algo.find_radius(neighbor.2, parameters.eps)?; + + if secondary_neighbors.len() >= parameters.min_samples { + for j in 0..secondary_neighbors.len() { + let label = y[secondary_neighbors[j].0]; + if label == undefined { + y[secondary_neighbors[j].0] = queued; + } + + if label == undefined || label == outlier { + neighbors.push(secondary_neighbors[j]); + } + } + } + } + } + k += 1; } } @@ -250,19 +271,25 @@ mod tests { &[1.0, 2.0], &[1.1, 2.1], &[0.9, 1.9], - &[1.2, 1.2], + &[1.2, 2.2], &[0.8, 1.8], &[2.0, 1.0], &[2.1, 1.1], - &[2.2, 1.2], &[1.9, 0.9], + &[2.2, 1.2], &[1.8, 0.8], &[3.0, 5.0], ]); let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0]; - let dbscan = DBSCAN::fit(&x, DBSCANParameters::default().with_eps(1.0)).unwrap(); + let dbscan = DBSCAN::fit( + &x, + DBSCANParameters::default() + .with_eps(0.5) + .with_min_samples(2), + ) + .unwrap(); let predicted_labels = dbscan.predict(&x).unwrap(); diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index e0b2939..28a2224 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -59,8 +59,6 @@ pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset