From 0e8166386cfd694666ed3c5eb699cf8168d536dd Mon Sep 17 00:00:00 2001 From: Alex <1221721+atcol@users.noreply.github.com> Date: Tue, 5 Jan 2021 16:57:14 +0000 Subject: [PATCH 01/81] Fix Matrix typo in documentation --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index d962894..7d2b089 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,7 +28,7 @@ //! //! All machine learning algorithms in SmartCore are grouped into these broad categories: //! * [Clustering](cluster/index.html), unsupervised clustering of unlabeled data. -//! * [Martix Decomposition](decomposition/index.html), various methods for matrix decomposition. +//! * [Matrix Decomposition](decomposition/index.html), various methods for matrix decomposition. //! * [Linear Models](linear/index.html), regression and classification methods where output is assumed to have linear relation to explanatory variables //! * [Ensemble Models](ensemble/index.html), variety of regression and classification ensemble models //! * [Tree-based Models](tree/index.html), classification and regression trees From eb769493e78702aaf5d3b6a1210fde447440525e Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 5 Jan 2021 16:13:39 -0400 Subject: [PATCH 02/81] Add coverage check (#57) * Add coverage check --- .circleci/config.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 17da167..a931ff5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,6 +6,8 @@ workflows: jobs: - build - clippy + - coverage + jobs: build: docker: @@ -41,3 +43,17 @@ jobs: - run: name: Run cargo clippy command: cargo clippy --all-features -- -Drust-2018-idioms -Dwarnings + + coverage: + machine: true + steps: + - checkout + - run: + name: Generate report + command: > + docker run --security-opt seccomp=unconfined -v $PWD:/volume + xd009642/tarpaulin:latest-nightly cargo tarpaulin -v --ciserver circle-ci + --out Lcov --all-features -- --test-threads 1 + - run: + name: Upload + command: bash <(curl -s https://codecov.io/bash) -Z -f From e0d46f430be0f7016a4816665fabaa6b9318a6fd Mon Sep 17 00:00:00 2001 From: Ben Cross Date: Sun, 17 Jan 2021 21:35:03 +0000 Subject: [PATCH 03/81] feat: Make SerDe optional --- Cargo.toml | 3 +-- src/algorithm/neighbour/cover_tree.rs | 10 ++++++---- src/algorithm/neighbour/linear_search.rs | 5 +++-- src/algorithm/neighbour/mod.rs | 8 +++++--- src/cluster/dbscan.rs | 5 +++-- src/cluster/kmeans.rs | 5 +++-- src/decomposition/pca.rs | 5 +++-- src/decomposition/svd.rs | 5 +++-- src/ensemble/random_forest_classifier.rs | 8 +++++--- src/ensemble/random_forest_regressor.rs | 8 +++++--- src/error/mod.rs | 8 +++++--- src/linalg/naive/dense_matrix.rs | 7 ++++++- src/linear/elastic_net.rs | 8 +++++--- src/linear/lasso.rs | 8 +++++--- src/linear/linear_regression.rs | 11 +++++++---- src/linear/logistic_regression.rs | 8 +++++--- src/linear/ridge_regression.rs | 11 +++++++---- src/math/distance/euclidian.rs | 5 +++-- src/math/distance/hamming.rs | 5 +++-- src/math/distance/mahalanobis.rs | 5 +++-- src/math/distance/manhattan.rs | 5 +++-- src/math/distance/minkowski.rs | 5 +++-- src/metrics/accuracy.rs | 5 +++-- src/metrics/auc.rs | 5 +++-- src/metrics/cluster_hcv.rs | 5 +++-- src/metrics/f1.rs | 5 +++-- src/metrics/mean_absolute_error.rs | 5 +++-- src/metrics/mean_squared_error.rs | 5 +++-- src/metrics/precision.rs | 5 +++-- src/metrics/r2.rs | 5 +++-- src/metrics/recall.rs | 5 +++-- src/naive_bayes/bernoulli.rs | 11 +++++++---- 
src/naive_bayes/categorical.rs | 11 +++++++---- src/naive_bayes/gaussian.rs | 11 +++++++---- src/naive_bayes/mod.rs | 5 +++-- src/naive_bayes/multinomial.rs | 11 +++++++---- src/neighbors/knn_classifier.rs | 8 +++++--- src/neighbors/knn_regressor.rs | 8 +++++--- src/neighbors/mod.rs | 5 +++-- src/svm/mod.rs | 14 +++++++++----- src/svm/svc.rs | 15 +++++++++------ src/svm/svr.rs | 15 +++++++++------ src/tree/decision_tree_classifier.rs | 14 +++++++++----- src/tree/decision_tree_regressor.rs | 11 +++++++---- 44 files changed, 206 insertions(+), 126 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5e21aef..d941735 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,8 +25,7 @@ num-traits = "0.2.12" num = "0.3.0" rand = "0.7.3" rand_distr = "0.3.0" -serde = { version = "1.0.115", features = ["derive"] } -serde_derive = "1.0.115" +serde = { version = "1.0.115", features = ["derive"], optional = true } [dev-dependencies] criterion = "0.3" diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index d271ed6..553dc99 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -24,7 +24,7 @@ //! ``` use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::sort::heap_select::HeapSelection; use crate::error::{Failed, FailedError}; @@ -32,7 +32,8 @@ use crate::math::distance::Distance; use crate::math::num::RealNumber; /// Implements Cover Tree algorithm -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct CoverTree> { base: F, inv_log_base: F, @@ -56,7 +57,8 @@ impl> PartialEq for CoverTree { } } -#[derive(Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct Node { idx: usize, max_dist: F, @@ -65,7 +67,7 @@ struct Node { scale: i64, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug)] struct DistanceSet { idx: usize, dist: Vec, diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index 45fbd6f..d82e575 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -22,7 +22,7 @@ //! //! ``` -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::marker::PhantomData; @@ -32,7 +32,8 @@ use crate::math::distance::Distance; use crate::math::num::RealNumber; /// Implements Linear Search algorithm, see [KNN algorithms](../index.html) -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct LinearKNNSearch> { distance: D, data: Vec, diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index bf9e669..9e432bd 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -35,7 +35,7 @@ use crate::algorithm::neighbour::linear_search::LinearKNNSearch; use crate::error::Failed; use crate::math::distance::Distance; use crate::math::num::RealNumber; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; pub(crate) mod bbd_tree; /// tree data structure for fast nearest neighbor search @@ -45,7 +45,8 @@ pub mod linear_search; /// Both, KNN classifier and regressor benefits from underlying search algorithms that helps to speed up queries. 
/// `KNNAlgorithmName` maintains a list of supported search algorithms, see [KNN algorithms](../algorithm/neighbour/index.html) -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub enum KNNAlgorithmName { /// Heap Search algorithm, see [`LinearSearch`](../algorithm/neighbour/linear_search/index.html) LinearSearch, @@ -53,7 +54,8 @@ pub enum KNNAlgorithmName { CoverTree, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub(crate) enum KNNAlgorithm, T>> { LinearSearch(LinearKNNSearch, T, D>), CoverTree(CoverTree, T, D>), diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index c793039..a117982 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -43,7 +43,7 @@ use std::fmt::Debug; use std::iter::Sum; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, UnsupervisedEstimator}; @@ -55,7 +55,8 @@ use crate::math::num::RealNumber; use crate::tree::decision_tree_classifier::which_max; /// DBSCAN clustering algorithm -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct DBSCAN, T>> { cluster_labels: Vec, num_classes: usize, diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 44ce1e6..78c9105 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -56,7 +56,7 @@ use rand::Rng; use std::fmt::Debug; use std::iter::Sum; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::bbd_tree::BBDTree; use crate::api::{Predictor, UnsupervisedEstimator}; @@ -66,7 +66,8 @@ use crate::math::distance::euclidian::*; use crate::math::num::RealNumber; /// K-Means clustering algorithm -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct KMeans { k: usize, y: Vec, diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 189e6de..626f268 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -47,7 +47,7 @@ //! 
use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; @@ -55,7 +55,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; /// Principal components analysis algorithm -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct PCA> { eigenvectors: M, eigenvalues: Vec, diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index 595e93c..7dc48dc 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -46,7 +46,7 @@ use std::fmt::Debug; use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; @@ -54,7 +54,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; /// SVD -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct SVD> { components: M, phantom: PhantomData, diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 49c4239..74f210c 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -49,7 +49,7 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -61,7 +61,8 @@ use crate::tree::decision_tree_classifier::{ /// Parameters of the Random Forest algorithm. /// Some parameters here are passed directly into base estimator. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct RandomForestClassifierParameters { /// Split criteria to use when building a tree. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) pub criterion: SplitCriterion, @@ -78,7 +79,8 @@ pub struct RandomForestClassifierParameters { } /// Random Forest Classifier -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct RandomForestClassifier { parameters: RandomForestClassifierParameters, trees: Vec>, diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index fdeb9fc..74a1b59 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -47,7 +47,7 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -57,7 +57,8 @@ use crate::tree::decision_tree_regressor::{ DecisionTreeRegressor, DecisionTreeRegressorParameters, }; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Parameters of the Random Forest Regressor /// Some parameters here are passed directly into base estimator. 
pub struct RandomForestRegressorParameters { @@ -74,7 +75,8 @@ pub struct RandomForestRegressorParameters { } /// Random Forest Regressor -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct RandomForestRegressor { parameters: RandomForestRegressorParameters, trees: Vec>, diff --git a/src/error/mod.rs b/src/error/mod.rs index 2409889..9a9bb8b 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -2,10 +2,11 @@ use std::error::Error; use std::fmt; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Generic error to be raised when something goes wrong. -#[derive(Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct Failed { err: FailedError, msg: String, @@ -13,7 +14,8 @@ pub struct Failed { /// Type of error #[non_exhaustive] -#[derive(Copy, Clone, Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Copy, Clone, Debug)] pub enum FailedError { /// Can't fit algorithm to data FitFailed = 1, diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index a0b7bdb..9816a28 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1,11 +1,14 @@ #![allow(clippy::ptr_arg)] use std::fmt; use std::fmt::Debug; -use std::marker::PhantomData; +#[cfg(feature = "serde")] use std::marker::PhantomData; use std::ops::Range; +#[cfg(feature = "serde")] use serde::de::{Deserializer, MapAccess, SeqAccess, Visitor}; +#[cfg(feature = "serde")] use serde::ser::{SerializeStruct, Serializer}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::cholesky::CholeskyDecomposableMatrix; @@ -349,6 +352,7 @@ impl<'a, T: RealNumber> Iterator for DenseMatrixIterator<'a, T> { } } +#[cfg(feature = "serde")] impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for DenseMatrix { fn deserialize(deserializer: D) -> Result where @@ -434,6 +438,7 @@ impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for De } } +#[cfg(feature = "serde")] impl Serialize for DenseMatrix { fn serialize(&self, serializer: S) -> Result where diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index 2833ff1..7e7a29a 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -56,7 +56,7 @@ //! use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -67,7 +67,8 @@ use crate::math::num::RealNumber; use crate::linear::lasso_optimizer::InteriorPointOptimizer; /// Elastic net parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct ElasticNetParameters { /// Regularization parameter. pub alpha: T, @@ -84,7 +85,8 @@ pub struct ElasticNetParameters { } /// Elastic net -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct ElasticNet> { coefficients: M, intercept: T, diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index b99ecff..8f46bbc 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -24,7 +24,7 @@ //! 
use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -34,7 +34,8 @@ use crate::linear::lasso_optimizer::InteriorPointOptimizer; use crate::math::num::RealNumber; /// Lasso regression parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct LassoParameters { /// Controls the strength of the penalty to the loss function. pub alpha: T, @@ -47,7 +48,8 @@ pub struct LassoParameters { pub max_iter: usize, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] /// Lasso regressor pub struct Lasso> { coefficients: M, diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 2ef03c1..6d24312 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -62,14 +62,15 @@ //! use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Approach to use for estimation of regression coefficients. QR is more efficient but SVD is more stable. pub enum LinearRegressionSolverName { /// QR decomposition, see [QR](../../linalg/qr/index.html) @@ -79,14 +80,16 @@ pub enum LinearRegressionSolverName { } /// Linear Regression parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct LinearRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: LinearRegressionSolverName, } /// Linear Regression -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct LinearRegression> { coefficients: M, intercept: T, diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index a71ac45..cdf78d1 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -56,7 +56,7 @@ use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -68,11 +68,13 @@ use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; /// Logistic Regression parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct LogisticRegressionParameters {} /// Logistic Regression -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct LogisticRegression> { coefficients: M, intercept: M, diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index e9ed1ff..5afa2f9 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -58,7 +58,7 @@ //! 
use std::fmt::Debug; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -66,7 +66,8 @@ use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable. pub enum RidgeRegressionSolverName { /// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html) @@ -76,7 +77,8 @@ pub enum RidgeRegressionSolverName { } /// Ridge Regression parameters -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct RidgeRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: RidgeRegressionSolverName, @@ -88,7 +90,8 @@ pub struct RidgeRegressionParameters { } /// Ridge regression -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct RidgeRegression> { coefficients: M, intercept: T, diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 9034727..6385f6e 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -18,14 +18,15 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; /// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Euclidian {} impl Euclidian { diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index 129fe16..bdd8e14 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -19,14 +19,15 @@ //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; /// While comparing two integer-valued vectors of equal length, Hamming distance is the number of bit positions in which the two bits are different -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Hamming {} impl Distance, F> for Hamming { diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index 84aa947..9f47894 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -44,7 +44,7 @@ use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; @@ -52,7 +52,8 @@ use super::Distance; use crate::linalg::Matrix; /// Mahalanobis distance. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Mahalanobis> { /// covariance matrix of the dataset pub sigma: M, diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 9a69184..758763b 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -17,14 +17,15 @@ //! 
``` //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; /// Manhattan distance -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Manhattan {} impl Distance, T> for Manhattan { diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index c5dd85d..e953571 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -21,14 +21,15 @@ //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; /// Defines the Minkowski distance of order `p` -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct Minkowski { /// order, integer pub p: u16, diff --git a/src/metrics/accuracy.rs b/src/metrics/accuracy.rs index ef7028f..c5a129b 100644 --- a/src/metrics/accuracy.rs +++ b/src/metrics/accuracy.rs @@ -16,13 +16,14 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Accuracy metric. -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct Accuracy {} impl Accuracy { diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index 0f8d56a..f352ca7 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -20,14 +20,15 @@ //! * ["The ROC-AUC and the Mann-Whitney U-test", Haupt, J.](https://johaupt.github.io/roc-auc/model%20evaluation/Area_under_ROC_curve.html) #![allow(non_snake_case)] -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Area Under the Receiver Operating Characteristic Curve (ROC AUC) -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct AUC {} impl AUC { diff --git a/src/metrics/cluster_hcv.rs b/src/metrics/cluster_hcv.rs index 29a9db2..40e5173 100644 --- a/src/metrics/cluster_hcv.rs +++ b/src/metrics/cluster_hcv.rs @@ -1,10 +1,11 @@ -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; use crate::metrics::cluster_helpers::*; -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] /// Homogeneity, completeness and V-Measure scores. pub struct HCVScore {} diff --git a/src/metrics/f1.rs b/src/metrics/f1.rs index 5c8537c..29f989e 100644 --- a/src/metrics/f1.rs +++ b/src/metrics/f1.rs @@ -18,7 +18,7 @@ //! //! //! 
-use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; @@ -26,7 +26,8 @@ use crate::metrics::precision::Precision; use crate::metrics::recall::Recall; /// F-measure -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct F1 { /// a positive real factor pub beta: T, diff --git a/src/metrics/mean_absolute_error.rs b/src/metrics/mean_absolute_error.rs index a069335..1049589 100644 --- a/src/metrics/mean_absolute_error.rs +++ b/src/metrics/mean_absolute_error.rs @@ -18,12 +18,13 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] /// Mean Absolute Error pub struct MeanAbsoluteError {} diff --git a/src/metrics/mean_squared_error.rs b/src/metrics/mean_squared_error.rs index 137c8e6..3bcb7e1 100644 --- a/src/metrics/mean_squared_error.rs +++ b/src/metrics/mean_squared_error.rs @@ -18,12 +18,13 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] /// Mean Squared Error pub struct MeanSquareError {} diff --git a/src/metrics/precision.rs b/src/metrics/precision.rs index 3524e7f..806c119 100644 --- a/src/metrics/precision.rs +++ b/src/metrics/precision.rs @@ -18,13 +18,14 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Precision metric. -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct Precision {} impl Precision { diff --git a/src/metrics/r2.rs b/src/metrics/r2.rs index cbcf7e4..0d661b7 100644 --- a/src/metrics/r2.rs +++ b/src/metrics/r2.rs @@ -18,13 +18,14 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Coefficient of Determination (R2) -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct R2 {} impl R2 { diff --git a/src/metrics/recall.rs b/src/metrics/recall.rs index 4d2be95..22f5402 100644 --- a/src/metrics/recall.rs +++ b/src/metrics/recall.rs @@ -18,13 +18,14 @@ //! //! //! -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; /// Recall metric. 
-#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct Recall {} impl Recall { diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 388646f..7233b83 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -42,10 +42,11 @@ use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Bernoulli features -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] struct BernoulliNBDistribution { /// class labels known to the classifier class_labels: Vec, @@ -77,7 +78,8 @@ impl> NBDistribution for BernoulliNBDistributi } /// `BernoulliNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct BernoulliNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, @@ -202,7 +204,8 @@ impl BernoulliNBDistribution { } /// BernoulliNB implements the categorical naive Bayes algorithm for categorically distributed data. -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub struct BernoulliNB> { inner: BaseNaiveBayes>, binarize: Option, diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index c6f28bd..c6b66c6 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -36,10 +36,11 @@ use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct CategoricalNBDistribution { class_labels: Vec, class_priors: Vec, @@ -216,7 +217,8 @@ impl CategoricalNBDistribution { } /// `CategoricalNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct CategoricalNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, @@ -237,7 +239,8 @@ impl Default for CategoricalNBParameters { } } /// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data.
-#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub struct CategoricalNB> { inner: BaseNaiveBayes>, } diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 2ac9892..6ba78bb 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -30,10 +30,11 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] struct GaussianNBDistribution { /// class labels known to the classifier class_labels: Vec, @@ -75,7 +76,8 @@ impl> NBDistribution for GaussianNBDistributio } /// `GaussianNB` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Default, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Default, Clone)] pub struct GaussianNBParameters { /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data pub priors: Option>, @@ -178,7 +180,8 @@ impl GaussianNBDistribution { } /// GaussianNB implements the categorical naive Bayes algorithm for categorically distributed data. -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub struct GaussianNB> { inner: BaseNaiveBayes>, } diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 7ab8b85..9a24466 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -39,7 +39,7 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use std::marker::PhantomData; /// Distribution used in the Naive Bayes classifier. @@ -55,7 +55,8 @@ pub(crate) trait NBDistribution> { } /// Base struct for the Naive Bayes classifier. -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub(crate) struct BaseNaiveBayes, D: NBDistribution> { distribution: D, _phantom_t: PhantomData, diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 4cae1f3..23382a1 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -42,10 +42,11 @@ use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Multinomial features -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] struct MultinomialNBDistribution { /// class labels known to the classifier class_labels: Vec, @@ -73,7 +74,8 @@ impl> NBDistribution for MultinomialNBDistribu } /// `MultinomialNB` parameters. Use `Default::default()` for default values. 
-#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct MultinomialNBParameters { /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub alpha: T, @@ -189,7 +191,8 @@ impl MultinomialNBDistribution { } /// MultinomialNB implements the categorical naive Bayes algorithm for categorically distributed data. -#[derive(Serialize, Deserialize, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, PartialEq)] pub struct MultinomialNB> { inner: BaseNaiveBayes>, } diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 97dd748..0f75220 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -33,7 +33,7 @@ //! use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, SupervisedEstimator}; @@ -45,7 +45,8 @@ use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNClassifier` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct KNNClassifierParameters, T>> { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. @@ -62,7 +63,8 @@ pub struct KNNClassifierParameters, T>> { } /// K Nearest Neighbors Classifier -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct KNNClassifier, T>> { classes: Vec, y: Vec, diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 4e73103..86bfd85 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -36,7 +36,7 @@ //! use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, SupervisedEstimator}; @@ -48,7 +48,8 @@ use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNRegressor` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct KNNRegressorParameters, T>> { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. @@ -65,7 +66,8 @@ pub struct KNNRegressorParameters, T>> { } /// K Nearest Neighbors Regressor -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct KNNRegressor, T>> { y: Vec, knn_algorithm: KNNAlgorithm, diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index 85ea6b8..6beb75e 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -33,7 +33,7 @@ //! 
use crate::math::num::RealNumber; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// K Nearest Neighbors Classifier pub mod knn_classifier; @@ -48,7 +48,8 @@ pub mod knn_regressor; pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName; /// Weight function that is used to determine estimated value. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub enum KNNWeightFunction { /// All k nearest points are weighted equally Uniform, diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 1e013d2..abe8071 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -26,7 +26,7 @@ pub mod svc; pub mod svr; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; @@ -93,18 +93,21 @@ impl Kernels { } /// Linear Kernel -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct LinearKernel {} /// Radial basis function (Gaussian) kernel -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct RBFKernel { /// kernel coefficient pub gamma: T, } /// Polynomial kernel -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct PolynomialKernel { /// degree of the polynomial pub degree: T, @@ -115,7 +118,8 @@ pub struct PolynomialKernel { } /// Sigmoid (hyperbolic tangent) kernel -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub struct SigmoidKernel { /// kernel coefficient pub gamma: T, diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 095d555..0582cdc 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -76,7 +76,7 @@ use std::marker::PhantomData; use rand::seq::SliceRandom; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -85,7 +85,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::svm::{Kernel, Kernels, LinearKernel}; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// SVC Parameters pub struct SVCParameters, K: Kernel> { /// Number of epochs. 
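The hunk that follows goes one step further than the simple derive gating seen so far: `SVC` (and later `SVR`) is generic, and its serde derives carry an explicit `#[serde(bound(...))]` attribute. That helper attribute is only recognized while the `Serialize`/`Deserialize` derives are present, so it has to move behind `cfg_attr` as well, or builds without the `serde` feature would fail on an unknown attribute. A minimal sketch of the combined pattern, with a hypothetical `Wrapper<T>` standing in for the real `SVC` signature:

```rust
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

// Derive the serde traits only when the `serde` feature is enabled.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
// The `serde(bound(...))` helper attribute must be gated the same way:
// without the derive it is an unknown attribute and compilation fails.
#[cfg_attr(
    feature = "serde",
    serde(bound(
        serialize = "T: Serialize",
        deserialize = "T: Deserialize<'de>"
    ))
)]
#[derive(Debug)]
pub struct Wrapper<T> {
    value: T,
}
```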
@@ -100,11 +101,12 @@ pub struct SVCParameters, K: Kernel m: PhantomData, } -#[derive(Serialize, Deserialize, Debug)] -#[serde(bound( +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +#[cfg_attr(feature = "serde", serde(bound( serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", -))] +)))] /// Support Vector Classifier pub struct SVC, K: Kernel> { classes: Vec, @@ -114,7 +116,8 @@ pub struct SVC, K: Kernel> { b: T, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct SupportVector> { index: usize, x: V, diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 9eb6046..4d61b97 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -68,7 +68,7 @@ use std::cell::{Ref, RefCell}; use std::fmt::Debug; use std::marker::PhantomData; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -77,7 +77,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::svm::{Kernel, Kernels, LinearKernel}; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// SVR Parameters pub struct SVRParameters, K: Kernel> { /// Epsilon in the epsilon-SVR model. @@ -92,11 +93,12 @@ pub struct SVRParameters, K: Kernel m: PhantomData, } -#[derive(Serialize, Deserialize, Debug)] -#[serde(bound( +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +#[cfg_attr(feature = "serde", serde(bound( serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", -))] +)))] /// Epsilon-Support Vector Regression pub struct SVR, K: Kernel> { @@ -106,7 +108,8 @@ pub struct SVR, K: Kernel> { b: T, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct SupportVector> { index: usize, x: V, diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 3a92c54..b014152 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -68,7 +68,7 @@ use std::fmt::Debug; use std::marker::PhantomData; use rand::seq::SliceRandom; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::api::{Predictor, SupervisedEstimator}; @@ -76,7 +76,8 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Parameters of Decision Tree pub struct DecisionTreeClassifierParameters { /// Split criteria to use when building a tree. @@ -90,7 +91,8 @@ pub struct DecisionTreeClassifierParameters { } /// Decision Tree -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct DecisionTreeClassifier { nodes: Vec>, parameters: DecisionTreeClassifierParameters, @@ -100,7 +102,8 @@ pub struct DecisionTreeClassifier { } /// The function to measure the quality of a split. 
-#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] pub enum SplitCriterion { /// [Gini index](../decision_tree_classifier/index.html) Gini, @@ -110,7 +113,8 @@ pub enum SplitCriterion { ClassificationError, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct Node { index: usize, output: usize, diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 06ee507..ef8c52c 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -63,7 +63,7 @@ use std::default::Default; use std::fmt::Debug; use rand::seq::SliceRandom; -use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::api::{Predictor, SupervisedEstimator}; @@ -71,7 +71,8 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] /// Parameters of Regression Tree pub struct DecisionTreeRegressorParameters { /// The maximum depth of the tree. @@ -83,14 +84,16 @@ pub struct DecisionTreeRegressorParameters { } /// Regression Tree -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] pub struct DecisionTreeRegressor { nodes: Vec>, parameters: DecisionTreeRegressorParameters, depth: u16, } -#[derive(Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] struct Node { index: usize, output: T, From 762986b271c141b112ecb5d855834de76923ae3c Mon Sep 17 00:00:00 2001 From: Ben Cross Date: Sun, 17 Jan 2021 21:37:30 +0000 Subject: [PATCH 04/81] Cargo format --- src/algorithm/neighbour/cover_tree.rs | 3 ++- src/algorithm/neighbour/linear_search.rs | 3 ++- src/algorithm/neighbour/mod.rs | 3 ++- src/cluster/dbscan.rs | 3 ++- src/cluster/kmeans.rs | 3 ++- src/decomposition/pca.rs | 3 ++- src/decomposition/svd.rs | 3 ++- src/ensemble/random_forest_classifier.rs | 3 ++- src/ensemble/random_forest_regressor.rs | 3 ++- src/error/mod.rs | 3 ++- src/linalg/naive/dense_matrix.rs | 3 ++- src/linear/elastic_net.rs | 3 ++- src/linear/lasso.rs | 3 ++- src/linear/linear_regression.rs | 3 ++- src/linear/logistic_regression.rs | 3 ++- src/linear/ridge_regression.rs | 3 ++- src/math/distance/euclidian.rs | 3 ++- src/math/distance/hamming.rs | 3 ++- src/math/distance/mahalanobis.rs | 3 ++- src/math/distance/manhattan.rs | 3 ++- src/math/distance/minkowski.rs | 3 ++- src/metrics/accuracy.rs | 3 ++- src/metrics/auc.rs | 3 ++- src/metrics/cluster_hcv.rs | 3 ++- src/metrics/f1.rs | 3 ++- src/metrics/mean_absolute_error.rs | 3 ++- src/metrics/mean_squared_error.rs | 3 ++- src/metrics/precision.rs | 3 ++- src/metrics/r2.rs | 3 ++- src/metrics/recall.rs | 3 ++- src/naive_bayes/bernoulli.rs | 3 ++- src/naive_bayes/categorical.rs | 3 ++- src/naive_bayes/gaussian.rs | 3 ++- src/naive_bayes/mod.rs | 3 ++- src/naive_bayes/multinomial.rs | 3 ++- src/neighbors/knn_classifier.rs | 3 ++- src/neighbors/knn_regressor.rs | 3 ++- src/neighbors/mod.rs | 3 ++- src/svm/mod.rs | 3 ++- src/svm/svc.rs | 14 +++++++++----- src/svm/svr.rs | 14 +++++++++----- src/tree/decision_tree_classifier.rs | 3 ++- src/tree/decision_tree_regressor.rs | 3 ++- 43 files changed, 
100 insertions(+), 51 deletions(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 553dc99..96a3389 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -24,7 +24,8 @@ //! ``` use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::sort::heap_select::HeapSelection; use crate::error::{Failed, FailedError}; diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index d82e575..f89e751 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -22,7 +22,8 @@ //! //! ``` -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::marker::PhantomData; diff --git a/src/algorithm/neighbour/mod.rs b/src/algorithm/neighbour/mod.rs index 9e432bd..321ec01 100644 --- a/src/algorithm/neighbour/mod.rs +++ b/src/algorithm/neighbour/mod.rs @@ -35,7 +35,8 @@ use crate::algorithm::neighbour::linear_search::LinearKNNSearch; use crate::error::Failed; use crate::math::distance::Distance; use crate::math::num::RealNumber; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; pub(crate) mod bbd_tree; /// tree data structure for fast nearest neighbor search diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index a117982..73d686d 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -43,7 +43,8 @@ use std::fmt::Debug; use std::iter::Sum; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, UnsupervisedEstimator}; diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 78c9105..a454b1f 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -56,7 +56,8 @@ use rand::Rng; use std::fmt::Debug; use std::iter::Sum; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::bbd_tree::BBDTree; use crate::api::{Predictor, UnsupervisedEstimator}; diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 626f268..e3212e3 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -47,7 +47,8 @@ //! 
use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index 7dc48dc..5524e29 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -46,7 +46,8 @@ use std::fmt::Debug; use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Transformer, UnsupervisedEstimator}; use crate::error::Failed; diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 74f210c..62e83b5 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -49,7 +49,8 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 74a1b59..18c2f69 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -47,7 +47,8 @@ use std::default::Default; use std::fmt::Debug; use rand::Rng; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/error/mod.rs b/src/error/mod.rs index 9a9bb8b..4e84f6e 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -2,7 +2,8 @@ use std::error::Error; use std::fmt; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Generic error to be raised when something goes wrong. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 9816a28..1a9b3a6 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1,7 +1,8 @@ #![allow(clippy::ptr_arg)] use std::fmt; use std::fmt::Debug; -#[cfg(feature = "serde")] use std::marker::PhantomData; +#[cfg(feature = "serde")] +use std::marker::PhantomData; use std::ops::Range; #[cfg(feature = "serde")] diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index 7e7a29a..f4a4326 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -56,7 +56,8 @@ //! use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 8f46bbc..17712b1 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -24,7 +24,8 @@ //! use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 6d24312..290a2db 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -62,7 +62,8 @@ //! 
use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index cdf78d1..45777be 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -56,7 +56,8 @@ use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 5afa2f9..4e1ebad 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -58,7 +58,8 @@ //! use std::fmt::Debug; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 6385f6e..b06d7d1 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index bdd8e14..d23b57f 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -19,7 +19,8 @@ //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index 9f47894..7ff86e9 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -44,7 +44,8 @@ use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 758763b..3162178 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -17,7 +17,8 @@ //! ``` //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index e953571..1e97ea8 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -21,7 +21,8 @@ //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; diff --git a/src/metrics/accuracy.rs b/src/metrics/accuracy.rs index c5a129b..6912a4c 100644 --- a/src/metrics/accuracy.rs +++ b/src/metrics/accuracy.rs @@ -16,7 +16,8 @@ //! //! //! 
-#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index f352ca7..508295b 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -20,7 +20,8 @@ //! * ["The ROC-AUC and the Mann-Whitney U-test", Haupt, J.](https://johaupt.github.io/roc-auc/model%20evaluation/Area_under_ROC_curve.html) #![allow(non_snake_case)] -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::linalg::BaseVector; diff --git a/src/metrics/cluster_hcv.rs b/src/metrics/cluster_hcv.rs index 40e5173..d881bdc 100644 --- a/src/metrics/cluster_hcv.rs +++ b/src/metrics/cluster_hcv.rs @@ -1,4 +1,5 @@ -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/f1.rs b/src/metrics/f1.rs index 29f989e..d957d9b 100644 --- a/src/metrics/f1.rs +++ b/src/metrics/f1.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/mean_absolute_error.rs b/src/metrics/mean_absolute_error.rs index 1049589..db3039f 100644 --- a/src/metrics/mean_absolute_error.rs +++ b/src/metrics/mean_absolute_error.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/mean_squared_error.rs b/src/metrics/mean_squared_error.rs index 3bcb7e1..3003e5d 100644 --- a/src/metrics/mean_squared_error.rs +++ b/src/metrics/mean_squared_error.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/precision.rs b/src/metrics/precision.rs index 806c119..2bd0dcf 100644 --- a/src/metrics/precision.rs +++ b/src/metrics/precision.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/r2.rs b/src/metrics/r2.rs index 0d661b7..c710ef5 100644 --- a/src/metrics/r2.rs +++ b/src/metrics/r2.rs @@ -18,7 +18,8 @@ //! //! //! -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/metrics/recall.rs b/src/metrics/recall.rs index 22f5402..d1fad56 100644 --- a/src/metrics/recall.rs +++ b/src/metrics/recall.rs @@ -18,7 +18,8 @@ //! //! //! 
-#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 7233b83..cdbfa80 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -42,7 +42,8 @@ use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Bearnoulli features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index c6b66c6..dc8587a 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -36,7 +36,8 @@ use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 6ba78bb..c27c396 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -30,7 +30,8 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for categorical features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 9a24466..f7c8da6 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -39,7 +39,8 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use std::marker::PhantomData; /// Distribution used in the Naive Bayes classifier. diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 23382a1..fa91020 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -42,7 +42,8 @@ use crate::math::num::RealNumber; use crate::math::vector::RealNumberVector; use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Multinomial features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 0f75220..839eea3 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -33,7 +33,8 @@ //! 
use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, SupervisedEstimator}; diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 86bfd85..1edf86a 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -36,7 +36,8 @@ //! use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; use crate::api::{Predictor, SupervisedEstimator}; diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index 6beb75e..86b1e46 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -33,7 +33,8 @@ //! use crate::math::num::RealNumber; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; /// K Nearest Neighbors Classifier pub mod knn_classifier; diff --git a/src/svm/mod.rs b/src/svm/mod.rs index abe8071..068f773 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -26,7 +26,8 @@ pub mod svc; pub mod svr; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::linalg::BaseVector; use crate::math::num::RealNumber; diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 0582cdc..9d77812 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -76,7 +76,8 @@ use std::marker::PhantomData; use rand::seq::SliceRandom; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -103,10 +104,13 @@ pub struct SVCParameters, K: Kernel #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] -#[cfg_attr(feature = "serde", serde(bound( - serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", - deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", -)))] +#[cfg_attr( + feature = "serde", + serde(bound( + serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", + deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", + )) +)] /// Support Vector Classifier pub struct SVC, K: Kernel> { classes: Vec, diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 4d61b97..cbb1ea5 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -68,7 +68,8 @@ use std::cell::{Ref, RefCell}; use std::fmt::Debug; use std::marker::PhantomData; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; use crate::error::Failed; @@ -95,10 +96,13 @@ pub struct SVRParameters, K: Kernel #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] -#[cfg_attr(feature = "serde", serde(bound( - serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", - deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", -)))] +#[cfg_attr( + feature = "serde", + serde(bound( + serialize = "M::RowVector: Serialize, K: Serialize, T: Serialize", + deserialize = "M::RowVector: Deserialize<'de>, K: Deserialize<'de>, T: Deserialize<'de>", + )) +)] /// Epsilon-Support Vector 
Regression pub struct SVR, K: Kernel> { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index b014152..7575a5a 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -68,7 +68,8 @@ use std::fmt::Debug; use std::marker::PhantomData; use rand::seq::SliceRandom; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::api::{Predictor, SupervisedEstimator}; diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index ef8c52c..d1292db 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -63,7 +63,8 @@ use std::default::Default; use std::fmt::Debug; use rand::seq::SliceRandom; -#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; use crate::api::{Predictor, SupervisedEstimator}; From f1cf8a6f0845f48e16f342f3e56b1bdb93ae2d2a Mon Sep 17 00:00:00 2001 From: Ben Cross Date: Mon, 18 Jan 2021 10:32:35 +0000 Subject: [PATCH 05/81] Added serde feature flags to tests --- src/algorithm/neighbour/cover_tree.rs | 4 +++- src/algorithm/neighbour/linear_search.rs | 3 ++- src/cluster/dbscan.rs | 2 ++ src/cluster/kmeans.rs | 1 + src/decomposition/pca.rs | 1 + src/decomposition/svd.rs | 1 + src/ensemble/random_forest_classifier.rs | 1 + src/ensemble/random_forest_regressor.rs | 1 + src/linalg/naive/dense_matrix.rs | 2 ++ src/linear/elastic_net.rs | 1 + src/linear/lasso.rs | 1 + src/linear/linear_regression.rs | 1 + src/linear/logistic_regression.rs | 1 + src/linear/ridge_regression.rs | 1 + src/naive_bayes/bernoulli.rs | 1 + src/naive_bayes/categorical.rs | 1 + src/naive_bayes/gaussian.rs | 1 + src/naive_bayes/multinomial.rs | 1 + src/neighbors/knn_classifier.rs | 1 + src/neighbors/knn_regressor.rs | 1 + src/svm/svc.rs | 2 ++ src/svm/svr.rs | 2 ++ src/tree/decision_tree_classifier.rs | 1 + src/tree/decision_tree_regressor.rs | 1 + 24 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 96a3389..9c5c806 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -457,7 +457,8 @@ mod tests { use super::*; use crate::math::distance::Distances; - #[derive(Debug, Serialize, Deserialize, Clone)] + #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] + #[derive(Debug, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { @@ -503,6 +504,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index f89e751..b4a3c89 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -140,7 +140,8 @@ mod tests { use super::*; use crate::math::distance::Distances; - #[derive(Debug, Serialize, Deserialize, Clone)] + #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] + #[derive(Debug, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 73d686d..d7a706a 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -265,6 +265,7 @@ impl, T>> DBSCAN { mod tests { use super::*; use 
crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg(feature = "serde")] use crate::math::distance::euclidian::Euclidian; #[test] @@ -299,6 +300,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index a454b1f..6be52a5 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -347,6 +347,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index e3212e3..de258dc 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -567,6 +567,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let iris = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index 5524e29..6f5a1bd 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -228,6 +228,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let iris = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 62e83b5..4127627 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -325,6 +325,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 18c2f69..02eef99 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -274,6 +274,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159., 107.608, 1947., 60.323], diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 1a9b3a6..4faa77d 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1312,6 +1312,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn to_from_json() { let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3], &[0.7, 0.3, 0.8]]); let deserialized_a: DenseMatrix = @@ -1320,6 +1321,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn to_from_bincode() { let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3], &[0.7, 0.3, 0.8]]); let deserialized_a: DenseMatrix = diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index f4a4326..479ae2a 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -401,6 +401,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 17712b1..8c59a4f 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -275,6 +275,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 290a2db..2734a78 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -251,6 +251,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], 
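The feature-gated tests being added in this patch all protect the same serialization contract. A minimal sketch of that round trip, assuming the crate is built with the optional `serde` feature (for example `smartcore = { features = ["serde"] }` in a consumer's Cargo.toml) and that `serde_json` is available as a dev-dependency, which the `to_from_json` test earlier in this patch already relies on:

```rust
use smartcore::linalg::naive::dense_matrix::DenseMatrix;

fn main() {
    // With the `serde` feature enabled, DenseMatrix (and fitted models)
    // derive Serialize/Deserialize and can round-trip through any serde format.
    let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3]]);

    // Same pattern as the gated tests: serialize, parse back, compare.
    let json = serde_json::to_string(&a).unwrap();
    let b: DenseMatrix<f64> = serde_json::from_str(&json).unwrap();
    assert_eq!(a, b);
}
```

Gating both the derives and these tests behind the same feature keeps the default build free of the serde dependency while still exercising serialization in CI, which is why the pipeline switches to `--all-features` two patches later.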
diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 45777be..cbdef77 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -543,6 +543,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[1., -5.], diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 4e1ebad..787c338 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -330,6 +330,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index cdbfa80..6a7d0b4 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -351,6 +351,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::::from_2d_array(&[ &[1., 1., 0., 0., 0., 0.], diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index dc8587a..2161528 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -349,6 +349,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::::from_2d_array(&[ &[3., 4., 0., 1.], diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index c27c396..28c4785 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -281,6 +281,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::::from_2d_array(&[ &[-1., -1.], diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index fa91020..06ee071 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -324,6 +324,7 @@ mod tests { )); } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::::from_2d_array(&[ &[1., 1., 0., 0., 0., 0.], diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 839eea3..ba6693e 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -280,6 +280,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index 1edf86a..ed52496 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -269,6 +269,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9d77812..3101425 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -726,6 +726,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; use crate::metrics::accuracy; + #[cfg(feature = "serde")] use crate::svm::*; #[test] @@ -814,6 +815,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn svc_serde() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/svm/svr.rs b/src/svm/svr.rs index cbb1ea5..b160cca 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -533,6 +533,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; use crate::metrics::mean_squared_error; + #[cfg(feature = "serde")] use crate::svm::*; #[test] @@ -569,6 +570,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn svr_serde() { let x = DenseMatrix::from_2d_array(&[ 
&[234.289, 235.6, 159.0, 107.608, 1947., 60.323], diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 7575a5a..ba79d52 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -745,6 +745,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[1., 1., 1., 0.], diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index d1292db..307d357 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -581,6 +581,7 @@ mod tests { } #[test] + #[cfg(feature = "serde")] fn serde() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159., 107.608, 1947., 60.323], From fd00bc3780a5d4e289d6689179ebb80798d74e77 Mon Sep 17 00:00:00 2001 From: Ben Cross Date: Mon, 18 Jan 2021 20:50:49 +0000 Subject: [PATCH 06/81] Run the pipeline with --all-features enabled --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a931ff5..6cdd0e4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,10 +23,10 @@ jobs: command: cargo fmt -- --check - run: name: Stable Build - command: cargo build --features "nalgebra-bindings ndarray-bindings" + command: cargo build --all-features - run: name: Test - command: cargo test --features "nalgebra-bindings ndarray-bindings" + command: cargo test --all-features - save_cache: key: project-cache paths: From bd5fbb63b155af9400e690d625a703fee9ff08f6 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 20 Jan 2021 16:55:58 -0800 Subject: [PATCH 07/81] feat: adds a new parameter to the logistic regression: solver --- src/linear/logistic_regression.rs | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index cbdef77..a23c15a 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -68,10 +68,21 @@ use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult}; use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone)] +/// Solver options for Logistic regression. Right now only LBFGS solver is supported. +pub enum LogisticRegressionSolverName { + /// Limited-memory Broyden–Fletcher–Goldfarb–Shanno method, see [LBFGS paper](http://users.iems.northwestern.edu/~nocedal/lbfgsb.html) + LBFGS, +} + /// Logistic Regression parameters #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] -pub struct LogisticRegressionParameters {} +pub struct LogisticRegressionParameters { + /// Solver to use for estimation of regression coefficients. + pub solver: LogisticRegressionSolverName, +} /// Logistic Regression #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -105,9 +116,19 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { phantom: PhantomData<&'a T>, } +impl LogisticRegressionParameters { + /// Solver to use for estimation of regression coefficients. 
+ pub fn with_solver(mut self, solver: LogisticRegressionSolverName) -> Self { + self.solver = solver; + self + } +} + impl Default for LogisticRegressionParameters { fn default() -> Self { - LogisticRegressionParameters {} + LogisticRegressionParameters { + solver: LogisticRegressionSolverName::LBFGS, + } } } From 40a92ee4dbaeb6a485555e67d2c864a2a42e3b5c Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 21 Jan 2021 14:37:34 -0800 Subject: [PATCH 08/81] feat: adds l2 regularization penalty to the Logistic Regression --- src/linear/logistic_regression.rs | 134 ++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 16 deletions(-) diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index a23c15a..2a12c19 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -54,7 +54,6 @@ //! use std::cmp::Ordering; use std::fmt::Debug; -use std::marker::PhantomData; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -79,9 +78,11 @@ pub enum LogisticRegressionSolverName { /// Logistic Regression parameters #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] -pub struct LogisticRegressionParameters { +pub struct LogisticRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: LogisticRegressionSolverName, + /// Regularization parameter. + pub alpha: T, } /// Logistic Regression @@ -113,21 +114,27 @@ trait ObjectiveFunction> { struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { x: &'a M, y: Vec, - phantom: PhantomData<&'a T>, + alpha: T, } -impl LogisticRegressionParameters { +impl LogisticRegressionParameters { /// Solver to use for estimation of regression coefficients. pub fn with_solver(mut self, solver: LogisticRegressionSolverName) -> Self { self.solver = solver; self } + /// Regularization parameter. 
+ pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } } -impl Default for LogisticRegressionParameters { +impl Default for LogisticRegressionParameters { fn default() -> Self { LogisticRegressionParameters { solver: LogisticRegressionSolverName::LBFGS, + alpha: T::zero(), } } } @@ -156,13 +163,22 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction { fn f(&self, w_bias: &M) -> T { let mut f = T::zero(); - let (n, _) = self.x.shape(); + let (n, p) = self.x.shape(); for i in 0..n { let wx = BinaryObjectiveFunction::partial_dot(w_bias, self.x, 0, i); f += wx.ln_1pe() - (T::from(self.y[i]).unwrap()) * wx; } + if self.alpha > T::zero() { + let mut w_squared = T::zero(); + for i in 0..p { + let w = w_bias.get(0, i); + w_squared += w * w; + } + f += T::half() * self.alpha * w_squared; + } + f } @@ -180,6 +196,13 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction } g.set(0, p, g.get(0, p) - dyi); } + + if self.alpha > T::zero() { + for i in 0..p { + let w = w_bias.get(0, i); + g.set(0, i, g.get(0, i) + self.alpha * w); + } + } } } @@ -187,7 +210,7 @@ struct MultiClassObjectiveFunction<'a, T: RealNumber, M: Matrix> { x: &'a M, y: Vec, k: usize, - phantom: PhantomData<&'a T>, + alpha: T, } impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction @@ -209,6 +232,17 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction f -= prob.get(0, self.y[i]).ln(); } + if self.alpha > T::zero() { + let mut w_squared = T::zero(); + for i in 0..self.k { + for j in 0..p { + let wi = w_bias.get(0, i * (p + 1) + j); + w_squared += wi * wi; + } + } + f += T::half() * self.alpha * w_squared; + } + f } @@ -239,16 +273,27 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction g.set(0, j * (p + 1) + p, g.get(0, j * (p + 1) + p) - yi); } } + + if self.alpha > T::zero() { + for i in 0..self.k { + for j in 0..p { + let pos = i * (p + 1); + let wi = w.get(0, pos + j); + g.set(0, pos + j, g.get(0, pos + j) + self.alpha * wi); + } + } + } } } -impl> SupervisedEstimator +impl> + SupervisedEstimator> for LogisticRegression { fn fit( x: &M, y: &M::RowVector, - parameters: LogisticRegressionParameters, + parameters: LogisticRegressionParameters, ) -> Result { LogisticRegression::fit(x, y, parameters) } @@ -268,7 +313,7 @@ impl> LogisticRegression { pub fn fit( x: &M, y: &M::RowVector, - _parameters: LogisticRegressionParameters, + parameters: LogisticRegressionParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); @@ -302,7 +347,7 @@ impl> LogisticRegression { let objective = BinaryObjectiveFunction { x, y: yi, - phantom: PhantomData, + alpha: parameters.alpha, }; let result = LogisticRegression::minimize(x0, objective); @@ -324,7 +369,7 @@ impl> LogisticRegression { x, y: yi, k, - phantom: PhantomData, + alpha: parameters.alpha, }; let result = LogisticRegression::minimize(x0, objective); @@ -431,9 +476,9 @@ mod tests { let objective = MultiClassObjectiveFunction { x: &x, - y, + y: y.clone(), k: 3, - phantom: PhantomData, + alpha: 0.0, }; let mut g: DenseMatrix = DenseMatrix::zeros(1, 9); @@ -454,6 +499,24 @@ mod tests { ])); assert!((f - 408.0052230582765).abs() < std::f64::EPSILON); + + let objective_reg = MultiClassObjectiveFunction { + x: &x, + y: y.clone(), + k: 3, + alpha: 1.0, + }; + + let f = objective_reg.f(&DenseMatrix::row_vector_from_array(&[ + 1., 2., 3., 4., 5., 6., 7., 8., 9., + ])); + assert!((f - 487.5052).abs() < 1e-4); + + objective_reg.df( + &mut g, + &DenseMatrix::row_vector_from_array(&[1., 2., 3., 4., 5., 6., 7., 
8., 9.]), + ); + assert!((g.get(0, 0).abs() - 32.0).abs() < 1e-4); } #[test] @@ -480,8 +543,8 @@ mod tests { let objective = BinaryObjectiveFunction { x: &x, - y, - phantom: PhantomData, + y: y.clone(), + alpha: 0.0, }; let mut g: DenseMatrix = DenseMatrix::zeros(1, 3); @@ -496,6 +559,20 @@ mod tests { let f = objective.f(&DenseMatrix::row_vector_from_array(&[1., 2., 3.])); assert!((f - 59.76994756647412).abs() < std::f64::EPSILON); + + let objective_reg = BinaryObjectiveFunction { + x: &x, + y: y.clone(), + alpha: 1.0, + }; + + let f = objective_reg.f(&DenseMatrix::row_vector_from_array(&[1., 2., 3.])); + assert!((f - 62.2699).abs() < 1e-4); + + objective_reg.df(&mut g, &DenseMatrix::row_vector_from_array(&[1., 2., 3.])); + assert!((g.get(0, 0) - 27.0511).abs() < 1e-4); + assert!((g.get(0, 1) - 12.239).abs() < 1e-4); + assert!((g.get(0, 2) - 3.8693).abs() < 1e-4); } #[test] @@ -547,6 +624,15 @@ mod tests { let y_hat = lr.predict(&x).unwrap(); assert!(accuracy(&y_hat, &y) > 0.9); + + let lr_reg = LogisticRegression::fit( + &x, + &y, + LogisticRegressionParameters::default().with_alpha(10.0), + ) + .unwrap(); + + assert!(lr_reg.coefficients().abs().sum() < lr.coefficients().abs().sum()); } #[test] @@ -561,6 +647,15 @@ mod tests { let y_hat = lr.predict(&x).unwrap(); assert!(accuracy(&y_hat, &y) > 0.9); + + let lr_reg = LogisticRegression::fit( + &x, + &y, + LogisticRegressionParameters::default().with_alpha(10.0), + ) + .unwrap(); + + assert!(lr_reg.coefficients().abs().sum() < lr.coefficients().abs().sum()); } #[test] @@ -622,6 +717,12 @@ mod tests { ]; let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); + let lr_reg = LogisticRegression::fit( + &x, + &y, + LogisticRegressionParameters::default().with_alpha(1.0), + ) + .unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -632,5 +733,6 @@ mod tests { .sum(); assert!(error <= 1.0); + assert!(lr_reg.coefficients().abs().sum() < lr.coefficients().abs().sum()); } } From 991631876eb0bd55b6acf4fdecd85181b985de63 Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 25 Jan 2021 23:33:48 -0800 Subject: [PATCH 09/81] build one-hot encoder --- src/lib.rs | 2 + src/preprocessing/mod.rs | 1 + src/preprocessing/target_encoders.rs | 209 +++++++++++++++++++++++++++ 3 files changed, 212 insertions(+) create mode 100644 src/preprocessing/mod.rs create mode 100644 src/preprocessing/target_encoders.rs diff --git a/src/lib.rs b/src/lib.rs index 7d2b089..c5802d2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -95,3 +95,5 @@ pub(crate) mod optimization; pub mod svm; /// Supervised tree-based learning methods pub mod tree; +/// Preprocessing utilities +pub mod preprocessing; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs new file mode 100644 index 0000000..e4b5190 --- /dev/null +++ b/src/preprocessing/mod.rs @@ -0,0 +1 @@ +pub mod target_encoders; \ No newline at end of file diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs new file mode 100644 index 0000000..1894361 --- /dev/null +++ b/src/preprocessing/target_encoders.rs @@ -0,0 +1,209 @@ + +#![allow(clippy::ptr_arg)] +//! # Encode categorical features as a one-hot or multi-class numeric array. +//! + +use std::hash::Hash; +use std::collections::HashMap; + +use crate::math::num::RealNumber; +use crate::error::Failed; + + +/// Turn a collection of label types into a one-hot vectors. 
+/// This struct encodes single class per example
+pub struct OneHotEncoder<T> {
+    label_to_idx: HashMap<T, usize>,
+    labels: Vec<T>,
+    num_classes: usize
+
+}
+
+enum LabelDefinition<T> {
+    LabelToClsNumMap(HashMap<T, usize>),
+    PositionalLabel(Vec<T>),
+}
+
+/// Create a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector)
+pub fn make_one_hot<T: RealNumber>(label_idx: usize, num_labels: usize) -> Vec<T> {
+    let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap());
+    (0..num_labels).map(|idx| if idx == label_idx {pos.clone()} else {neg.clone()}).collect()
+
+}
+
+impl<'a, T: Hash + Eq + Clone> OneHotEncoder<T>
+{
+
+    /// Fit an encoder to a label list
+    ///
+    /// Label numbers will be assigned in the order they are encountered
+    /// Example:
+    /// ```
+    /// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4];
+    /// let enc = OneHotEncoder::::fit(&fake_labels[0..]);
+    /// let oh_vec = enc.transform_one(&1); // notice that 1 is actually a zero-th positional label
+    /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]);
+    /// ```
+    pub fn fit(labels: &[T]) -> Self {
+
+        let mut label_map: HashMap<T, usize> = HashMap::new();
+        let mut class_num = 0usize;
+        let mut unique_lables: Vec<T> = Vec::new();
+
+        for l in labels
+        {
+            if !label_map.contains_key(&l) {
+                label_map.insert(l.clone(), class_num);
+                unique_lables.push(l.clone());
+                class_num += 1;
+            }
+        }
+        Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables}
+    }
+
+
+    /// Build an encoder from a predefined (label -> class number) map
+    ///
+    /// Definition example:
+    /// ```
+    /// let fake_label_map: HashMap<&str, u32> = vec![("background",0), ("dog", 1), ("cat", 2)]
+    ///     .into_iter()
+    ///     .collect();
+    /// let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map);
+    /// ```
+    pub fn from_label_map(labels: HashMap<T, usize>) -> Self {
+        Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels))
+    }
+    /// Build an encoder from a predefined positional label-class num vector
+    ///
+    /// Definition example:
+    /// ```
+    /// let fake_label_pos = vec!["background","dog", "cat"];
+    /// let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos);
+    /// ```
+    pub fn from_positional_label_vec(labels: Vec<T>) -> Self {
+        Self::from_label_def(LabelDefinition::PositionalLabel(labels))
+    }
+
+    /// Transform a slice of label types into one-hot vectors
+    /// None is returned if unknown label is encountered
+    pub fn transform<U: RealNumber>(&self, labels: &[T]) -> Vec<Option<Vec<U>>> {
+        labels
+            .into_iter()
+            .map(|l| self.transform_one(l))
+            .collect()
+    }
+
+    /// Transform a single label type into a one-hot vector
+    pub fn transform_one<U: RealNumber>(&self, label: &T) -> Option<Vec<U>> {
+        match self.label_to_idx.get(label) {
+            None => None,
+            Some(&idx) => Some(make_one_hot(idx, self.num_classes))
+        }
+    }
+
+    /// Invert one-hot vector, back to the label
+    ///```
+    /// let lab = enc.invert_one(res)?; // e.g. res = [0,1,0,0...]
"dog" == class 1 + /// assert_eq!(lab, "dog") + /// ``` + pub fn invert_one(&self, one_hot: Vec) -> Result { + let pos = U::from_f64(1f64).unwrap(); + + let s: Vec = one_hot + .into_iter() + .enumerate() + .filter_map(|(idx, v)| if v == pos {Some(idx)} else {None}) + .collect(); + + if s.len() == 1 { + let idx = s[0]; + return Ok(self.labels[idx].clone()) + } + let pos_entries = format!("Expected a single positive entry, {} entires found", s.len()); + Err(Failed::transform(&pos_entries[..])) + } + + + fn from_label_def(labels: LabelDefinition) -> Self { + + let (label_map, class_num, unique_lables) = match labels { + LabelDefinition::LabelToClsNumMap(h) => { + let mut _unique_lab: Vec<(T, usize)> = h.iter().map(|(k,v)| (k.clone(), v.clone())).collect(); + _unique_lab.sort_by(|a,b| a.1.cmp(&b.1)); + let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); + (h, unique_lab.len(), unique_lab) + }, + LabelDefinition::PositionalLabel(unique_lab) => { + let h: HashMap = unique_lab.iter().enumerate().map(|(v, k)| (k.clone(),v)).collect(); + (h, unique_lab.len(), unique_lab) + } + }; + Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} + + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_labels() { + let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; + let enc = OneHotEncoder::::fit(&fake_labels[0..]); + let oh_vec = match enc.transform_one(&1) { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![1f64,0f64,0f64,0f64,0f64]; + assert_eq!(oh_vec, res); + } + + + fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str>{ + let fake_label_pos = vec!["background","dog", "cat"]; + let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + enc + } + + #[test] + fn label_map_and_vec() { + let fake_label_map: HashMap<&str, usize> = vec![("background",0), ("dog", 1), ("cat", 2)].into_iter().collect(); + let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![0f64, 1f64,0f64]; + assert_eq!(oh_vec, res); + } + + #[test] + fn positional_labels_vec() { + let enc = build_fake_str_enc(); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v + }; + let res: Vec = vec![0f64, 1f64,0f64]; + assert_eq!(oh_vec, res); + } + + #[test] + fn invert_label_test() { + let enc = build_fake_str_enc(); + let res: Vec = vec![0f64, 1f64,0f64]; + let lab = enc.invert_one(res).unwrap(); + assert_eq!(lab, "dog"); + + if let Err(e) = enc.invert_one(vec![0.0, 0.0,0.0]) { + let pos_entries = format!("Expected a single positive entry, 0 entires found"); + assert_eq!(e, Failed::transform(&pos_entries[..])); + }; + } + + + +} \ No newline at end of file From dbca6d43cede008cd6be5cb8c60e210c6f25994f Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 25 Jan 2021 23:55:43 -0800 Subject: [PATCH 10/81] fmt fix --- src/lib.rs | 4 +- src/preprocessing/mod.rs | 2 +- src/preprocessing/target_encoders.rs | 148 ++++++++++++++------------- 3 files changed, 79 insertions(+), 75 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c5802d2..6e6205f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -91,9 +91,9 @@ pub mod naive_bayes; /// Supervised neighbors-based learning methods pub mod neighbors; pub(crate) mod optimization; +/// Preprocessing utilities +pub mod preprocessing; /// Support Vector Machines pub mod svm; /// Supervised tree-based learning methods pub mod tree; -/// 
Preprocessing utilities -pub mod preprocessing; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index e4b5190..c70f7dc 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1 +1 @@ -pub mod target_encoders; \ No newline at end of file +pub mod target_encoders; diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 1894361..81cbdbd 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -1,22 +1,18 @@ - #![allow(clippy::ptr_arg)] //! # Encode categorical features as a one-hot or multi-class numeric array. -//! +//! -use std::hash::Hash; -use std::collections::HashMap; - -use crate::math::num::RealNumber; use crate::error::Failed; - +use crate::math::num::RealNumber; +use std::collections::HashMap; +use std::hash::Hash; /// Turn a collection of label types into a one-hot vectors. /// This struct encodes single class per exmample pub struct OneHotEncoder { label_to_idx: HashMap, labels: Vec, - num_classes: usize - + num_classes: usize, } enum LabelDefinition { @@ -27,13 +23,18 @@ enum LabelDefinition { /// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); - (0..num_labels).map(|idx| if idx == label_idx {pos.clone()} else {neg.clone()}).collect() - + (0..num_labels) + .map(|idx| { + if idx == label_idx { + pos.clone() + } else { + neg.clone() + } + }) + .collect() } -impl<'a, T: Hash + Eq + Clone> OneHotEncoder -{ - +impl<'a, T: Hash + Eq + Clone> OneHotEncoder { /// Fit an encoder to a lable list /// /// Label numbers will be assigned in the order they are encountered @@ -45,23 +46,24 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]); /// ``` pub fn fit(labels: &[T]) -> Self { - let mut label_map: HashMap = HashMap::new(); let mut class_num = 0usize; let mut unique_lables: Vec = Vec::new(); - for l in labels - { + for l in labels { if !label_map.contains_key(&l) { label_map.insert(l.clone(), class_num); unique_lables.push(l.clone()); class_num += 1; } } - Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} + Self { + label_to_idx: label_map, + num_classes: class_num, + labels: unique_lables, + } } - /// Build an encoder from a predefined (label -> class number) map /// /// Definition example: @@ -84,21 +86,18 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder pub fn from_positional_label_vec(labels: Vec) -> Self { Self::from_label_def(LabelDefinition::PositionalLabel(labels)) } - + /// Transform a slice of label types into one-hot vectors - /// None is returned if unknown label is encountered + /// None is returned if unknown label is encountered pub fn transform(&self, labels: &[T]) -> Vec>> { - labels - .into_iter() - .map(|l| self.transform_one(l)) - .collect() + labels.into_iter().map(|l| self.transform_one(l)).collect() } /// Transform a single label type into a one-hot vector pub fn transform_one(&self, label: &T) -> Option> { match self.label_to_idx.get(label) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_classes)) + Some(&idx) => Some(make_one_hot(idx, self.num_classes)), } } @@ -111,99 +110,104 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot - .into_iter() - .enumerate() - .filter_map(|(idx, v)| if v == pos {Some(idx)} else {None}) - 
.collect(); - + .into_iter() + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + if s.len() == 1 { let idx = s[0]; - return Ok(self.labels[idx].clone()) + return Ok(self.labels[idx].clone()); } - let pos_entries = format!("Expected a single positive entry, {} entires found", s.len()); + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); Err(Failed::transform(&pos_entries[..])) } - fn from_label_def(labels: LabelDefinition) -> Self { - let (label_map, class_num, unique_lables) = match labels { LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(T, usize)> = h.iter().map(|(k,v)| (k.clone(), v.clone())).collect(); - _unique_lab.sort_by(|a,b| a.1.cmp(&b.1)); + let mut _unique_lab: Vec<(T, usize)> = + h.iter().map(|(k, v)| (k.clone(), v.clone())).collect(); + _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); (h, unique_lab.len(), unique_lab) - }, + } LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab.iter().enumerate().map(|(v, k)| (k.clone(),v)).collect(); + let h: HashMap = unique_lab + .iter() + .enumerate() + .map(|(v, k)| (k.clone(), v)) + .collect(); (h, unique_lab.len(), unique_lab) } }; - Self {label_to_idx: label_map, num_classes: class_num, labels:unique_lables} - + Self { + label_to_idx: label_map, + num_classes: class_num, + labels: unique_lables, + } } } - #[cfg(test)] mod tests { use super::*; #[test] fn from_labels() { - let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; + let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let enc = OneHotEncoder::::fit(&fake_labels[0..]); let oh_vec = match enc.transform_one(&1) { None => panic!("Wrong labels"), - Some(v) => v + Some(v) => v, }; - let res: Vec = vec![1f64,0f64,0f64,0f64,0f64]; + let res: Vec = vec![1f64, 0f64, 0f64, 0f64, 0f64]; assert_eq!(oh_vec, res); } - - fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str>{ - let fake_label_pos = vec!["background","dog", "cat"]; + fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { + let fake_label_pos = vec!["background", "dog", "cat"]; let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); enc } #[test] fn label_map_and_vec() { - let fake_label_map: HashMap<&str, usize> = vec![("background",0), ("dog", 1), ("cat", 2)].into_iter().collect(); - let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - let oh_vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), - Some(v) => v - }; - let res: Vec = vec![0f64, 1f64,0f64]; - assert_eq!(oh_vec, res); - } - + let fake_label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + .into_iter() + .collect(); + let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v, + }; + let res: Vec = vec![0f64, 1f64, 0f64]; + assert_eq!(oh_vec, res); + } + #[test] fn positional_labels_vec() { - let enc = build_fake_str_enc(); - let oh_vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), - Some(v) => v - }; - let res: Vec = vec![0f64, 1f64,0f64]; - assert_eq!(oh_vec, res); + let enc = build_fake_str_enc(); + let oh_vec = match enc.transform_one(&"dog") { + None => panic!("Wrong labels"), + Some(v) => v, + }; + let res: Vec = vec![0.0, 1.0, 0.0]; + assert_eq!(oh_vec, res); } #[test] fn invert_label_test() { let enc = build_fake_str_enc(); - let res: 
Vec<f64> = vec![0f64, 1f64,0f64];
+        let res: Vec<f64> = vec![0.0, 1.0, 0.0];
        let lab = enc.invert_one(res).unwrap();
        assert_eq!(lab, "dog");
-
-        if let Err(e) = enc.invert_one(vec![0.0, 0.0,0.0]) {
+        if let Err(e) = enc.invert_one(vec![0.0, 0.0, 0.0]) {
            let pos_entries = format!("Expected a single positive entry, 0 entires found");
            assert_eq!(e, Failed::transform(&pos_entries[..]));
        };
    }
-
-
-
-} \ No newline at end of file
+}
From 139bbae4564347cc8b44403c89baad14647ff37f Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 00:01:20 -0800 Subject: [PATCH 11/81] clippy fixes --- src/preprocessing/target_encoders.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/preprocessing/target_encoders.rs index 81cbdbd..c282a4d 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -26,9 +26,9 @@ pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec OneHotEncoder {
    /// Transform a slice of label types into one-hot vectors
    /// None is returned if unknown label is encountered
    pub fn transform<U: RealNumber>(&self, labels: &[T]) -> Vec<Option<Vec<U>>> {
-        labels.into_iter().map(|l| self.transform_one(l)).collect()
+        labels.iter().map(|l| self.transform_one(l)).collect()
    }

    /// Transform a single label type into a one-hot vector
    pub fn transform_one<U: RealNumber>(&self, label: &T) -> Option<Vec<U>> {
        match self.label_to_idx.get(label) {
@@ -130,7 +130,7 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder<T> {
        let (label_map, class_num, unique_lables) = match labels {
            LabelDefinition::LabelToClsNumMap(h) => {
                let mut _unique_lab: Vec<(T, usize)> =
-                    h.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
+                    h.iter().map(|(k, v)| (k.clone(), *v)).collect();
                _unique_lab.sort_by(|a, b| a.1.cmp(&b.1));
                let unique_lab: Vec<T> = _unique_lab.into_iter().map(|a| a.0).collect();
                (h, unique_lab.len(), unique_lab)
From 0df797cbae484e50c751910c9c726956ae1a2848 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 00:04:15 -0800 Subject: [PATCH 12/81] fmt fix --- src/preprocessing/target_encoders.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/preprocessing/target_encoders.rs index c282a4d..44a5c05 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -24,13 +24,7 @@ enum LabelDefinition<T> {
pub fn make_one_hot<T: RealNumber>(label_idx: usize, num_labels: usize) -> Vec<T> {
    let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap());
    (0..num_labels)
-        .map(|idx| {
-            if idx == label_idx {
-                pos
-            } else {
-                neg
-            }
-        })
+        .map(|idx| if idx == label_idx { pos } else { neg })
        .collect()
}
From 7daf536aebff1c1d73118bd7d9dfc3bf70cc6b41 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 09:15:24 -0800 Subject: [PATCH 13/81] fixed docs --- src/preprocessing/target_encoders.rs | 125 ++++++++++++++++----------- 1 file changed, 76 insertions(+), 49 deletions(-)

diff --git a/src/preprocessing/target_encoders.rs index 44a5c05..76f4c92 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -7,11 +7,47 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash;

-/// Turn a collection of label types into a one-hot vectors.
+/// Turn a collection of `LabelType`s into one-hot vectors.
 /// This struct encodes single class per example
-pub struct OneHotEncoder {
-    label_to_idx: HashMap,
-    labels: Vec,
+///
+/// You can fit a label enumeration by passing a collection of labels.
+/// Label numbers will be assigned in the order they are encountered +/// +/// Example: +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// +/// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; +/// let enc = OneHotEncoder::::fit(&fake_labels[..]); +/// let oh_vec: Vec = enc.transform_one(&1).unwrap(); +/// // notice that 1 is actually a zero-th positional label +/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); +/// ``` +/// +/// You can also pass a predefined label enumeration such as a hashmap `HashMap` or a vector `Vec` +/// +/// +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// +/// let label_map: HashMap<&str, usize> = +/// vec![("cat", 2), ("background",0), ("dog", 1)] +/// .into_iter() +/// .collect(); +/// let label_vec = vec!["background", "dog", "cat"]; +/// +/// let enc_lv = OneHotEncoder::<&str>::from_positional_label_vec(label_vec); +/// let enc_lm = OneHotEncoder::<&str>::from_label_map(label_map); +/// +/// // ["background", "dog", "cat"] +/// println!("{:?}", enc_lv.get_labels()); +/// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) +/// ``` +pub struct OneHotEncoder { + label_to_idx: HashMap, + labels: Vec, num_classes: usize, } @@ -28,21 +64,12 @@ pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec OneHotEncoder { +impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { /// Fit an encoder to a lable list - /// - /// Label numbers will be assigned in the order they are encountered - /// Example: - /// ``` - /// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; - /// let enc = OneHotEncoder::::fit(&fake_labels[0..]); - /// let oh_vec = enc.transform_one(&1); // notice that 1 is actually a zero-th positional label - /// assert_eq!(oh_vec, vec![1f64,0f64,0f64,0f64,0f64]); - /// ``` - pub fn fit(labels: &[T]) -> Self { - let mut label_map: HashMap = HashMap::new(); + pub fn fit(labels: &[LabelType]) -> Self { + let mut label_map: HashMap = HashMap::new(); let mut class_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + let mut unique_lables: Vec = Vec::new(); for l in labels { if !label_map.contains_key(&l) { @@ -59,48 +86,35 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder { } /// Build an encoder from a predefined (label -> class number) map - /// - /// Definition example: - /// ``` - /// let fake_label_map: HashMap<&str, u32> = vec![("background",0), ("dog", 1), ("cat", 2)] - /// .into_iter() - /// .collect(); - /// let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - /// ``` - pub fn from_label_map(labels: HashMap) -> Self { + pub fn from_label_map(labels: HashMap) -> Self { Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels)) } /// Build an encoder from a predefined positional label-class num vector - /// - /// Definition example: - /// ``` - /// let fake_label_pos = vec!["background","dog", "cat"]; - /// let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); - /// ``` - pub fn from_positional_label_vec(labels: Vec) -> Self { + pub fn from_positional_label_vec(labels: Vec) -> Self { Self::from_label_def(LabelDefinition::PositionalLabel(labels)) } /// Transform a slice of label types into one-hot vectors /// None is returned if unknown label is encountered - pub fn transform(&self, labels: &[T]) -> Vec>> { + pub fn transform(&self, labels: &[LabelType]) -> Vec>> { labels.iter().map(|l| self.transform_one(l)).collect() } /// Transform a single 
label type into a one-hot vector - pub fn transform_one(&self, label: &T) -> Option> { + pub fn transform_one(&self, label: &LabelType) -> Option> { match self.label_to_idx.get(label) { None => None, Some(&idx) => Some(make_one_hot(idx, self.num_classes)), } } + /// Get labels ordered by encoder's label enumeration + pub fn get_labels(&self) -> &Vec { + &self.labels + } + /// Invert one-hot vector, back to the label - ///``` - /// let lab = enc.invert_one(res)?; // e.g. res = [0,1,0,0...] "dog" == class 1 - /// assert_eq!(lab, "dog") - /// ``` - pub fn invert_one(&self, one_hot: Vec) -> Result { + pub fn invert_one(&self, one_hot: Vec) -> Result { let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot @@ -120,17 +134,17 @@ impl<'a, T: Hash + Eq + Clone> OneHotEncoder { Err(Failed::transform(&pos_entries[..])) } - fn from_label_def(labels: LabelDefinition) -> Self { + fn from_label_def(labels: LabelDefinition) -> Self { let (label_map, class_num, unique_lables) = match labels { LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(T, usize)> = + let mut _unique_lab: Vec<(LabelType, usize)> = h.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); - let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); + let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); (h, unique_lab.len(), unique_lab) } LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab + let h: HashMap = unique_lab .iter() .enumerate() .map(|(v, k)| (k.clone(), v)) @@ -154,7 +168,7 @@ mod tests { fn from_labels() { let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let enc = OneHotEncoder::::fit(&fake_labels[0..]); - let oh_vec = match enc.transform_one(&1) { + let oh_vec: Vec = match enc.transform_one(&1) { None => panic!("Wrong labels"), Some(v) => v, }; @@ -170,11 +184,11 @@ mod tests { #[test] fn label_map_and_vec() { - let fake_label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + let label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = OneHotEncoder::<&str>::from_label_map(fake_label_map); - let oh_vec = match enc.transform_one(&"dog") { + let enc = OneHotEncoder::<&str>::from_label_map(label_map); + let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong labels"), Some(v) => v, }; @@ -185,7 +199,7 @@ mod tests { #[test] fn positional_labels_vec() { let enc = build_fake_str_enc(); - let oh_vec = match enc.transform_one(&"dog") { + let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong labels"), Some(v) => v, }; @@ -204,4 +218,17 @@ mod tests { assert_eq!(e, Failed::transform(&pos_entries[..])); }; } + + #[test] + fn test_many_labels() { + let enc = build_fake_str_enc(); + let res: Vec>> = enc.transform(&["dog", "cat", "fish", "background"]); + let v = vec![ + Some(vec![0.0, 1.0, 0.0]), + Some(vec![0.0, 0.0, 1.0]), + None, + Some(vec![1.0, 0.0, 0.0]), + ]; + assert_eq!(res, v) + } } From 9833a2f8514bea27e3913bdf144d00637751ec61 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 26 Jan 2021 10:03:33 -0800 Subject: [PATCH 14/81] codecov-fix --- src/preprocessing/target_encoders.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/target_encoders.rs index 76f4c92..56a97ed 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -1,6 +1,5 @@ #![allow(clippy::ptr_arg)] //! 
# Encode categorical features as a one-hot or multi-class numeric array.
-//!

use crate::error::Failed;
use crate::math::num::RealNumber;
use std::collections::HashMap;
use std::hash::Hash;

From 244a72444520cc6ac832779a44538fc93f6b68e3 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:03:13 -0800 Subject: [PATCH 15/81] Generic make_one_hot. Current implementation returns BaseVector of RealNumber --- src/preprocessing/target_encoders.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/preprocessing/target_encoders.rs index 56a97ed..3f2592b 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -6,7 +6,13 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash;

-/// Turn a collection of `LabelType`s into one-hot vectors.
+/// Make a one-hot encoded vector from a categorical variable
+pub fn make_one_hot<T: RealNumber, V: BaseVector<T>>(label_idx: usize, num_labels: usize) -> V {
+    let pos = T::from_f64(1f64).unwrap();
+    let mut z = V::zeros(num_labels);
+    z.set(label_idx, pos);
+    z
+}
 /// This struct encodes single class per example
 ///
 /// You can fit a label enumeration by passing a collection of labels.
From 19088b682a52b81e Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:06:43 -0800 Subject: [PATCH 16/81] remove LabelDefinition, looks like unnecessary abstraction for now --- src/preprocessing/target_encoders.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/preprocessing/target_encoders.rs index 3f2592b..ff9fa6e 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -91,12 +91,31 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder<LabelType> {
    }

    /// Build an encoder from a predefined (label -> class number) map
-    pub fn from_label_map(labels: HashMap) -> Self {
-        Self::from_label_def(LabelDefinition::LabelToClsNumMap(labels))
+    pub fn from_label_map(category_map: HashMap<CategoryType, usize>) -> Self {
+        let mut _unique_cat: Vec<(CategoryType, usize)> =
+            category_map.iter().map(|(k, v)| (k.clone(), *v)).collect();
+        _unique_cat.sort_by(|a, b| a.1.cmp(&b.1));
+        let categories: Vec<CategoryType> = _unique_cat.into_iter().map(|a| a.0).collect();
+        Self {
+            num_categories: categories.len(),
+            categories,
+            category_map,
 }
+    }
+
    /// Build an encoder from a predefined positional label-class num vector
-    pub fn from_positional_label_vec(labels: Vec) -> Self {
-        Self::from_label_def(LabelDefinition::PositionalLabel(labels))
+    pub fn from_positional_label_vec(categories: Vec<CategoryType>) -> Self {
+        // Self::from_label_def(LabelDefinition::PositionalLabel(categories))
+        let category_map: HashMap<CategoryType, usize> = categories
+            .iter()
+            .enumerate()
+            .map(|(v, k)| (k.clone(), v))
+            .collect();
+        Self {
+            num_categories: categories.len(),
+            category_map,
+            categories,
+        }
    }

    /// Transform a slice of label types into one-hot vectors
From 6109fc5211d0ebba410e66ec8b824992e775c1d5 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 12:13:45 -0800 Subject: [PATCH 17/81] Renaming fit/transform for API compatibility. Also rename label to category. --- src/preprocessing/target_encoders.rs | 172 +++++++++++----------------- 1 file changed, 70 insertions(+), 102 deletions(-)

diff --git a/src/preprocessing/target_encoders.rs index ff9fa6e..a929ab6 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/target_encoders.rs @@ -2,96 +2,86 @@ //!
# Encode categorical features as a one-hot or multi-class numeric array. use crate::error::Failed; +use crate::linalg::BaseVector; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable -pub fn make_one_hot>(label_idx: usize, num_labels: usize) -> V { +pub fn make_one_hot>(category_idx: usize, num_categories: usize) -> V { let pos = T::from_f64(1f64).unwrap(); - let mut z = V::zeros(num_labels); - z.set(label_idx, pos); + let mut z = V::zeros(num_categories); + z.set(category_idx, pos); z } + +/// Turn a collection of `CategoryType`s into a one-hot vectors. /// This struct encodes single class per exmample /// -/// You can fit a label enumeration by passing a collection of labels. -/// Label numbers will be assigned in the order they are encountered +/// You can fit_to_series a category enumeration by passing a collection of categories. +/// category numbers will be assigned in the order they are encountered /// /// Example: /// ``` /// use std::collections::HashMap; /// use smartcore::preprocessing::target_encoders::OneHotEncoder; /// -/// let fake_labels: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; -/// let enc = OneHotEncoder::::fit(&fake_labels[..]); +/// let fake_categories: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; +/// let enc = OneHotEncoder::::fit_to_series(&fake_categories[..]); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); -/// // notice that 1 is actually a zero-th positional label +/// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); /// ``` /// -/// You can also pass a predefined label enumeration such as a hashmap `HashMap` or a vector `Vec` +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` /// /// /// ``` /// use std::collections::HashMap; /// use smartcore::preprocessing::target_encoders::OneHotEncoder; /// -/// let label_map: HashMap<&str, usize> = +/// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] /// .into_iter() /// .collect(); -/// let label_vec = vec!["background", "dog", "cat"]; +/// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = OneHotEncoder::<&str>::from_positional_label_vec(label_vec); -/// let enc_lm = OneHotEncoder::<&str>::from_label_map(label_map); +/// let enc_lv = OneHotEncoder::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = OneHotEncoder::<&str>::from_category_map(category_map); /// /// // ["background", "dog", "cat"] -/// println!("{:?}", enc_lv.get_labels()); +/// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` -pub struct OneHotEncoder { - label_to_idx: HashMap, - labels: Vec, - num_classes: usize, +pub struct OneHotEncoder { + category_map: HashMap, + categories: Vec, + num_categories: usize, } -enum LabelDefinition { - LabelToClsNumMap(HashMap), - PositionalLabel(Vec), -} - -/// Crearte a vector of size num_labels with zeros everywhere and 1 at label_idx (one-hot vector) -pub fn make_one_hot(label_idx: usize, num_labels: usize) -> Vec { - let (pos, neg) = (T::from_f64(1f64).unwrap(), T::from_f64(0f64).unwrap()); - (0..num_labels) - .map(|idx| if idx == label_idx { pos } else { neg }) - .collect() -} - -impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { +impl OneHotEncoder { /// Fit an encoder to a lable list - pub fn fit(labels: &[LabelType]) -> Self { - let mut 
label_map: HashMap = HashMap::new(); - let mut class_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + pub fn fit_to_series(categories: &[CategoryType]) -> Self { + let mut category_map: HashMap = HashMap::new(); + let mut category_num = 0usize; + let mut unique_lables: Vec = Vec::new(); - for l in labels { - if !label_map.contains_key(&l) { - label_map.insert(l.clone(), class_num); + for l in categories { + if !category_map.contains_key(&l) { + category_map.insert(l.clone(), category_num); unique_lables.push(l.clone()); - class_num += 1; + category_num += 1; } } Self { - label_to_idx: label_map, - num_classes: class_num, - labels: unique_lables, + category_map: category_map, + num_categories: category_num, + categories: unique_lables, } } - /// Build an encoder from a predefined (label -> class number) map - pub fn from_label_map(category_map: HashMap) -> Self { + /// Build an encoder from a predefined (category -> class number) map + pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(CategoryType, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); @@ -100,12 +90,11 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { num_categories: categories.len(), categories, category_map, - } + } } - /// Build an encoder from a predefined positional label-class num vector - pub fn from_positional_label_vec(categories: Vec) -> Self { - // Self::from_label_def(LabelDefinition::PositionalLabel(categories)) + /// Build an encoder from a predefined positional category-class num vector + pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories .iter() .enumerate() @@ -118,27 +107,30 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { } } - /// Transform a slice of label types into one-hot vectors - /// None is returned if unknown label is encountered - pub fn transform(&self, labels: &[LabelType]) -> Vec>> { - labels.iter().map(|l| self.transform_one(l)).collect() + /// Transform a slice of category types into one-hot vectors + /// None is returned if unknown category is encountered + pub fn transfrom_series( + &self, + categories: &[CategoryType], + ) -> Vec>> { + categories.iter().map(|l| self.transform_one(l)).collect() } - /// Transform a single label type into a one-hot vector - pub fn transform_one(&self, label: &LabelType) -> Option> { - match self.label_to_idx.get(label) { + /// Transform a single category type into a one-hot vector + pub fn transform_one(&self, category: &CategoryType) -> Option> { + match self.category_map.get(category) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_classes)), + Some(&idx) => Some(make_one_hot(idx, self.num_categories)), } } - /// Get labels ordered by encoder's label enumeration - pub fn get_labels(&self) -> &Vec { - &self.labels + /// Get categories ordered by encoder's category enumeration + pub fn get_categories(&self) -> &Vec { + &self.categories } - /// Invert one-hot vector, back to the label - pub fn invert_one(&self, one_hot: Vec) -> Result { + /// Invert one-hot vector, back to the category + pub fn invert_one(&self, one_hot: Vec) -> Result { let pos = U::from_f64(1f64).unwrap(); let s: Vec = one_hot @@ -149,7 +141,7 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { if s.len() == 1 { let idx = s[0]; - return Ok(self.labels[idx].clone()); + return Ok(self.categories[idx].clone()); } let pos_entries = format!( "Expected a single positive entry, {} entires found", @@ 
-157,31 +149,6 @@ impl<'a, LabelType: Hash + Eq + Clone> OneHotEncoder { ); Err(Failed::transform(&pos_entries[..])) } - - fn from_label_def(labels: LabelDefinition) -> Self { - let (label_map, class_num, unique_lables) = match labels { - LabelDefinition::LabelToClsNumMap(h) => { - let mut _unique_lab: Vec<(LabelType, usize)> = - h.iter().map(|(k, v)| (k.clone(), *v)).collect(); - _unique_lab.sort_by(|a, b| a.1.cmp(&b.1)); - let unique_lab: Vec = _unique_lab.into_iter().map(|a| a.0).collect(); - (h, unique_lab.len(), unique_lab) - } - LabelDefinition::PositionalLabel(unique_lab) => { - let h: HashMap = unique_lab - .iter() - .enumerate() - .map(|(v, k)| (k.clone(), v)) - .collect(); - (h, unique_lab.len(), unique_lab) - } - }; - Self { - label_to_idx: label_map, - num_classes: class_num, - labels: unique_lables, - } - } } #[cfg(test)] @@ -189,11 +156,11 @@ mod tests { use super::*; #[test] - fn from_labels() { - let fake_labels: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; - let enc = OneHotEncoder::::fit(&fake_labels[0..]); + fn from_categories() { + let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; + let enc = OneHotEncoder::::fit_to_series(&fake_categories[0..]); let oh_vec: Vec = match enc.transform_one(&1) { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![1f64, 0f64, 0f64, 0f64, 0f64]; @@ -201,19 +168,19 @@ mod tests { } fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> { - let fake_label_pos = vec!["background", "dog", "cat"]; - let enc = OneHotEncoder::<&str>::from_positional_label_vec(fake_label_pos); + let fake_category_pos = vec!["background", "dog", "cat"]; + let enc = OneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); enc } #[test] - fn label_map_and_vec() { - let label_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] + fn category_map_and_vec() { + let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = OneHotEncoder::<&str>::from_label_map(label_map); + let enc = OneHotEncoder::<&str>::from_category_map(category_map); let oh_vec: Vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![0f64, 1f64, 0f64]; @@ -221,10 +188,10 @@ mod tests { } #[test] - fn positional_labels_vec() { + fn positional_categories_vec() { let enc = build_fake_str_enc(); let oh_vec: Vec = match enc.transform_one(&"dog") { - None => panic!("Wrong labels"), + None => panic!("Wrong categories"), Some(v) => v, }; let res: Vec = vec![0.0, 1.0, 0.0]; @@ -244,9 +211,10 @@ mod tests { } #[test] - fn test_many_labels() { + fn test_many_categorys() { let enc = build_fake_str_enc(); - let res: Vec>> = enc.transform(&["dog", "cat", "fish", "background"]); + let res: Vec>> = + enc.transfrom_series(&["dog", "cat", "fish", "background"]); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 408b97d8aaa56ce72375f934f8cc56721962ee5b Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 19:31:14 -0800 Subject: [PATCH 18/81] Rename series encoder and move to separate module file --- src/preprocessing/mod.rs | 3 +- .../{target_encoders.rs => series_encoder.rs} | 50 +++++++++++-------- 2 files changed, 32 insertions(+), 21 deletions(-) rename src/preprocessing/{target_encoders.rs => series_encoder.rs} (80%) diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index c70f7dc..4534c6d 100644 --- 
a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1 +1,2 @@ -pub mod target_encoders; +pub mod categorical_encoders; +pub mod series_encoder; \ No newline at end of file diff --git a/src/preprocessing/target_encoders.rs b/src/preprocessing/series_encoder.rs similarity index 80% rename from src/preprocessing/target_encoders.rs rename to src/preprocessing/series_encoder.rs index a929ab6..132d160 100644 --- a/src/preprocessing/target_encoders.rs +++ b/src/preprocessing/series_encoder.rs @@ -1,14 +1,17 @@ #![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot or multi-class numeric array. +//! # Encode categorical features as a one-hot numeric array. use crate::error::Failed; -use crate::linalg::BaseVector; +use crate::linalg::{BaseVector, Matrix}; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable -pub fn make_one_hot>(category_idx: usize, num_categories: usize) -> V { +pub fn make_one_hot>( + category_idx: usize, + num_categories: usize, +) -> V { let pos = T::from_f64(1f64).unwrap(); let mut z = V::zeros(num_categories); z.set(category_idx, pos); @@ -18,16 +21,17 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// Turn a collection of `CategoryType`s into a one-hot vectors. /// This struct encodes single class per exmample /// -/// You can fit_to_series a category enumeration by passing a collection of categories. +/// You can fit_to_iter a category enumeration by passing an iterator of categories. /// category numbers will be assigned in the order they are encountered /// /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; /// -/// let fake_categories: Vec = vec![1,2,3,4,5,3,5,3,1,2,4]; -/// let enc = OneHotEncoder::::fit_to_series(&fake_categories[..]); +/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; +/// let it = fake_categories.iter().map(|&a| a); +/// let enc = SeriesOneHotEncoder::::fit_to_iter(it); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); /// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); @@ -38,7 +42,7 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::target_encoders::OneHotEncoder; +/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -46,22 +50,22 @@ pub fn make_one_hot>(category_idx: usize, num_ca /// .collect(); /// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = OneHotEncoder::<&str>::from_positional_category_vec(category_vec); -/// let enc_lm = OneHotEncoder::<&str>::from_category_map(category_map); +/// let enc_lv = SeriesOneHotEncoder::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = SeriesOneHotEncoder::<&str>::from_category_map(category_map); /// /// // ["background", "dog", "cat"] /// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` -pub struct OneHotEncoder { +pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, - num_categories: usize, + pub num_categories: usize, } -impl OneHotEncoder { +impl SeriesOneHotEncoder { /// Fit an encoder to a 
lable list
-    pub fn fit_to_series(categories: &[CategoryType]) -> Self {
+    pub fn fit_to_iter(categories: impl Iterator) -> Self {
         let mut category_map: HashMap = HashMap::new();
         let mut category_num = 0usize;
         let mut unique_lables: Vec = Vec::new();
@@ -74,7 +78,7 @@ impl OneHotEncoder {
             }
         }
         Self {
-            category_map: category_map,
+            category_map,
             num_categories: category_num,
             categories: unique_lables,
         }
@@ -107,15 +111,20 @@ impl OneHotEncoder {
         }
     }

+
+    pub fn transform_iter(&self, cat_it: impl Iterator)-> Vec>> {
+        cat_it.map(|l| self.transform_one(l)).collect()
+    }
     /// Transform a slice of category types into one-hot vectors
     /// None is returned if unknown category is encountered
     pub fn transfrom_series(
         &self,
         categories: &[CategoryType],
     ) -> Vec>> {
-        categories.iter().map(|l| self.transform_one(l)).collect()
+        self.transform_iter(categories.iter())
     }

+
     /// Transform a single category type into a one-hot vector
     pub fn transform_one(&self, category: &CategoryType) -> Option> {
         match self.category_map.get(category) {
@@ -158,7 +167,8 @@ mod tests {
     #[test]
     fn from_categories() {
         let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4];
-        let enc = OneHotEncoder::::fit_to_series(&fake_categories[0..]);
+        let it = fake_categories.iter().map(|&a| a);
+        let enc = SeriesOneHotEncoder::::fit_to_iter(it);
         let oh_vec: Vec = match enc.transform_one(&1) {
             None => panic!("Wrong categories"),
             Some(v) => v,
@@ -167,9 +177,9 @@ mod tests {
         assert_eq!(oh_vec, res);
     }

-    fn build_fake_str_enc<'a>() -> OneHotEncoder<&'a str> {
+    fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> {
         let fake_category_pos = vec!["background", "dog", "cat"];
-        let enc = OneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos);
+        let enc = SeriesOneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos);
         enc
     }

@@ -178,7 +188,7 @@ mod tests {
         let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)]
             .into_iter()
             .collect();
-        let enc = OneHotEncoder::<&str>::from_category_map(category_map);
+        let enc = SeriesOneHotEncoder::<&str>::from_category_map(category_map);
         let oh_vec: Vec = match enc.transform_one(&"dog") {
             None => panic!("Wrong categories"),
             Some(v) => v,

From 5c400f40d258c989659daefab030efcb24cec823 Mon Sep 17 00:00:00 2001
From: gaxler
Date: Wed, 27 Jan 2021 19:36:38 -0800
Subject: [PATCH 19/81] Scaffold for turning floats to hashable and fitting
 to columns

---
 src/preprocessing/categorical_encoders.rs | 27 +++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 src/preprocessing/categorical_encoders.rs

diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs
new file mode 100644
index 0000000..828eeef
--- /dev/null
+++ b/src/preprocessing/categorical_encoders.rs
@@ -0,0 +1,27 @@
+#![allow(clippy::ptr_arg)]
+//! # Encode categorical features as a one-hot numeric array.
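The scaffold this patch creates keys categories off a float's bit pattern, since `f32`/`f64` implement neither `Hash` nor `Eq`. The idea can be checked in isolation; below is a minimal standalone sketch using the standard library's `f32::to_bits` in place of the crate's `to_f32_bits` helper (the `hashable` function here is illustrative, not part of the patch):

```rust
use std::collections::HashMap;

// Reinterpret the float's raw bits as an integer key. Equal category
// values share the same bit pattern, so hashing and equality stay exact.
fn hashable(v: f32) -> u32 {
    v.to_bits()
}

fn main() {
    let mut counts: HashMap<u32, usize> = HashMap::new();
    for v in [1.0f32, 2.0, 1.0, 3.0, 2.0, 1.0].iter() {
        *counts.entry(hashable(*v)).or_insert(0) += 1;
    }
    assert_eq!(counts[&hashable(1.0)], 3); // three samples of category 1.0
    assert_eq!(counts.len(), 3); // three distinct categories
}
```

NaN and negative zero would need extra care under this scheme, which is one reason a later patch in the series switches to rounding values into an integer `CategoricalFloat` instead.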
+ +use crate::error::Failed; +use crate::linalg::{BaseVector, Matrix}; +use crate::math::num::RealNumber; + +use crate::preprocessing::series_encoder::SeriesOneHotEncoder; + +pub type HashableReal = u32; + +fn hashable_num(v: &T) -> HashableReal { + // gaxler: If first 32 bits are the same, assume numbers are the same for the categorical coercion + v.to_f32_bits() +} + +#[derive(Debug, Clone)] +pub struct OneHotEncoderParams { + pub categorical_param_idxs: Option>, + pub infer_categorical: bool, +} +/// Encode Categorical variavbles of data matrix to one-hot +pub struct OneHotEncoder { + series_encoders: Vec>, + categorical_param_idxs: Vec, +} + From f91b1f99425789b6d11c10941b079b4cd7150f5c Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 27 Jan 2021 19:37:54 -0800 Subject: [PATCH 20/81] fit SeriesOneHotEncoders to predefined columns --- src/preprocessing/categorical_encoders.rs | 42 +++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 828eeef..012f364 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -25,3 +25,45 @@ pub struct OneHotEncoder { categorical_param_idxs: Vec, } +impl> OneHotEncoder { + /// PlaceHolder + + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result { + match (params.categorical_param_idxs, params.infer_categorical) { + (None, false) => Err(Failed::fit( + "Must pass categorical series ids or infer flag", + )), + + (Some(idxs), true) => Err(Failed::fit( + "Ambigous parameters, got both infer and categroy ids", + )), + + (Some(idxs), false) => Ok(Self { + series_encoders: Self::build_series_encoders::(data, &idxs[..]), + categorical_param_idxs: idxs, + }), + + (None, true) => { + todo!("implement categorical auto-inference") + } + } + } + + fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { + let (nrows, _) = data.shape(); + // let mut res: Vec> = Vec::with_capacity(idxs.len()); + let mut tmp_col: Vec = Vec::with_capacity(nrows); + + let res: Vec> = idxs + .iter() + .map(|&idx| { + data.copy_col_as_vec(idx, &mut tmp_col); + let hashable_col = tmp_col.iter().map(|v| hashable_num::(v)); + SeriesOneHotEncoder::fit_to_iter(hashable_col) + }) + .collect(); + res + } + + +} \ No newline at end of file From 3480e728af5ec16edadc8ec63946e76970eaf2d2 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 16:04:41 -0800 Subject: [PATCH 21/81] Documentation updates --- src/preprocessing/categorical_encoders.rs | 26 ++++++++++++++-- src/preprocessing/mod.rs | 5 ++- src/preprocessing/series_encoder.rs | 37 +++++++++++++++-------- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 012f364..0436787 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -1,5 +1,27 @@ -#![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot numeric array. +//! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies +//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents +//! +//! ### Usage Example +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; +//! let data = DenseMatrix::from_2d_array(&[ +//! &[1.5, 1.0, 1.5, 3.0], +//! 
&[1.5, 2.0, 1.5, 4.0], +//! &[1.5, 1.0, 1.5, 5.0], +//! &[1.5, 2.0, 1.5, 6.0], +//! ]); +//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); +//! // Infer number of categories from data and return a reusable encoder +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! // Transform categorical to one-hot encoded (can transform similar) +//! let oh_data = encoder.transform(&data).unwrap(); +//! // Produces the following: +//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] +//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] +//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] +//! ``` use crate::error::Failed; use crate::linalg::{BaseVector, Matrix}; diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 4534c6d..c07b982 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,2 +1,5 @@ +/// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents pub mod categorical_encoders; -pub mod series_encoder; \ No newline at end of file +mod data_traits; +/// Encode a series (column, array) of categorical variables as one-hot vectors +pub mod series_encoder; diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 132d160..321f049 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -1,13 +1,21 @@ #![allow(clippy::ptr_arg)] -//! # Encode categorical features as a one-hot numeric array. +//! # Series Encoder +//! Encode a series of categorical features as a one-hot numeric array. use crate::error::Failed; -use crate::linalg::{BaseVector, Matrix}; +use crate::linalg::BaseVector; use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; /// Make a one-hot encoded vector from a categorical variable +/// +/// Example: +/// ``` +/// use smartcore::preprocessing::series_encoder::make_one_hot; +/// let one_hot: Vec = make_one_hot(2, 3); +/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); +/// ``` pub fn make_one_hot>( category_idx: usize, num_categories: usize, @@ -18,7 +26,7 @@ pub fn make_one_hot>( z } -/// Turn a collection of `CategoryType`s into a one-hot vectors. +/// Turn a collection of Hashable objects into a one-hot vectors. /// This struct encodes single class per exmample /// /// You can fit_to_iter a category enumeration by passing an iterator of categories. 
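Alongside the doctest updates above, it helps to see the encode/decode round trip in one place. A short sketch against the `SeriesOneHotEncoder` API as shown in these diffs (treat the exact paths and signatures as illustrative, since this series keeps renaming them):

```rust
use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder;

fn main() {
    // Positions define the category numbers: background = 0, dog = 1, cat = 2.
    let enc = SeriesOneHotEncoder::<&str>::from_positional_category_vec(vec![
        "background", "dog", "cat",
    ]);

    // Encode, then decode back to the original category.
    let oh: Vec<f64> = enc.transform_one(&"dog").unwrap();
    assert_eq!(oh, vec![0.0, 1.0, 0.0]);
    assert_eq!(enc.invert_one(oh).unwrap(), "dog");

    // A vector without exactly one positive entry is rejected.
    assert!(enc.invert_one(vec![0.0, 1.0, 1.0]).is_err());
}
```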
@@ -27,7 +35,7 @@ pub fn make_one_hot>( /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; /// /// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; /// let it = fake_categories.iter().map(|&a| a); @@ -42,7 +50,7 @@ pub fn make_one_hot>( /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::categorical_encoders::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -60,10 +68,11 @@ pub fn make_one_hot>( pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, + /// Number of categories for categorical variable pub num_categories: usize, } -impl SeriesOneHotEncoder { +impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder { /// Fit an encoder to a lable list pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); @@ -111,20 +120,24 @@ impl SeriesOneHotEncoder { } } - - pub fn transform_iter(&self, cat_it: impl Iterator)-> Vec>> { - cat_it.map(|l| self.transform_one(l)).collect() + /// Take an iterator as a series to transform + pub fn transform_iter( + &self, + cat_it: impl Iterator, + ) -> Vec>> { + cat_it.map(|l| self.transform_one(&l)).collect() } + /// Transform a slice of category types into one-hot vectors /// None is returned if unknown category is encountered pub fn transfrom_series( &self, - categories: &[CategoryType], + categories: &'a [CategoryType], ) -> Vec>> { - self.transform_iter(categories.iter()) + let v = categories.iter().map(|a| a.clone()); + self.transform_iter(v) } - /// Transform a single category type into a one-hot vector pub fn transform_one(&self, category: &CategoryType) -> Option> { match self.category_map.get(category) { From 3dc8a4283298d6622a6a0c74cd008339d6b8e9c4 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 16:05:45 -0800 Subject: [PATCH 22/81] Adapt column numbers to the new columns introduced by categorical variables. --- src/preprocessing/categorical_encoders.rs | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 0436787..31d3500 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -41,6 +41,40 @@ pub struct OneHotEncoderParams { pub categorical_param_idxs: Option>, pub infer_categorical: bool, } +/// Calculate the offset to parameters to due introduction of one-hot encoding +fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { + // This functions uses iterators and returns a vector. 
+ // In case we get a huge amount of paramenters this might be a problem + // todo: Change this such that it will return an iterator + + let cat_idx = encoded_idxs.iter().copied().chain((num_params..).take(1)); + + // Offset is constant between two categorical values, here we calculate the number of steps + // that remain constant + let repeats = cat_idx.scan(0, |a, v| { + let im = v + 1 - *a; + *a = v; + Some(im) + }); + + // Calculate the offset to parameter idx due to newly intorduced one-hot vectors + let offset_ = cat_sizes.iter().scan(0, |a, &v| { + *a = *a + v - 1; + Some(*a) + }); + let offset = (0..1).chain(offset_); + + let new_param_idxs: Vec = (0..num_params) + .zip( + repeats + .zip(offset) + .map(|(r, o)| iter::repeat(o).take(r)) + .flatten(), + ) + .map(|(idx, ofst)| idx + ofst) + .collect(); + new_param_idxs +} /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { series_encoders: Vec>, From dd39433ff8ddea5445e3b1ca27db2474c002885d Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 18:48:23 -0800 Subject: [PATCH 23/81] Categorizable trait defines logic of turning floats into hashable categorical variables. Since we only support RealNumbers for now, the idea is to treat round numbers as ordinal (or nominal if user chooses to ignore order) categories. --- src/preprocessing/data_traits.rs | 43 ++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 src/preprocessing/data_traits.rs diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs new file mode 100644 index 0000000..04b534e --- /dev/null +++ b/src/preprocessing/data_traits.rs @@ -0,0 +1,43 @@ +//! Traits to indicate that float variables can be viewed as categorical +//! This module assumes + +pub type CategoricalFloat = u16; + +// pub struct CategoricalFloat(u16); + +pub trait Categorizable { + type A; + + fn to_category(self) -> CategoricalFloat; + + fn is_valid(self) -> bool; + +} + +impl Categorizable for f32 { + + type A = CategoricalFloat; + + fn to_category(self) -> CategoricalFloat { + self as CategoricalFloat + } + + fn is_valid(self) -> bool { + let a = self.to_category(); + a as f32 == self + } +} + +impl Categorizable for f64 { + + type A = CategoricalFloat; + + fn to_category(self) ->CategoricalFloat { + self as CategoricalFloat + } + + fn is_valid(self) -> bool { + let a = self.to_category(); + a as f64 == self + } +} \ No newline at end of file From cd5611079caae782f148397a0ebad465aea6faef Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:29:33 -0800 Subject: [PATCH 24/81] Fit OneHotEncoder --- src/preprocessing/categorical_encoders.rs | 56 ++++++++++++++++++----- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 31d3500..794c1d6 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -75,32 +75,66 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) .collect(); new_param_idxs } +fn validate_col_is_categorical(data: &Vec) -> bool { + for v in data { + if !v.is_valid() { return false} + } + true +} /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { - series_encoders: Vec>, - categorical_param_idxs: Vec, + series_encoders: Vec>, + col_idx_categorical: Vec, } -impl> OneHotEncoder { +impl OneHotEncoder { /// PlaceHolder - pub fn fit(data: &M, params: OneHotEncoderParams) -> Result { - 
match (params.categorical_param_idxs, params.infer_categorical) { + pub fn fit>( + data: &M, + params: OneHotEncoderParams, + ) -> Result { + match (params.col_idx_categorical, params.infer_categorical) { (None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", )), - (Some(idxs), true) => Err(Failed::fit( + (Some(_idxs), true) => Err(Failed::fit( "Ambigous parameters, got both infer and categroy ids", )), - (Some(idxs), false) => Ok(Self { - series_encoders: Self::build_series_encoders::(data, &idxs[..]), - categorical_param_idxs: idxs, - }), + (Some(mut idxs), false) => { + // make sure categories have same order as data columns + idxs.sort(); + + let (nrows, _) = data.shape(); + + // col buffer to avoid allocations + let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); + + let mut res: Vec> = Vec::with_capacity(idxs.len()); + + for &idx in &idxs { + data.copy_col_as_vec(idx, &mut col_buf); + if !validate_col_is_categorical(&col_buf) { + let msg = format!("Column {} of data matrix containts non categorizable (integer) values", idx); + return Err(Failed::fit(&msg[..])) + } + let hashable_col = col_buf.iter().map(|v| v.to_category()); + res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); + } + + Ok(Self { + series_encoders: res, //Self::build_series_encoders::(data, &idxs[..]), + col_idx_categorical: idxs, + }) + } (None, true) => { - todo!("implement categorical auto-inference") + todo!("Auto-Inference for Categorical Variables not yet implemented") + } + } + } } } } From fd6b2e801479f709870921f192153c6abeeab53d Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:29:58 -0800 Subject: [PATCH 25/81] Transform matrix --- src/preprocessing/categorical_encoders.rs | 42 +++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs index 794c1d6..585f13a 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoders.rs @@ -135,9 +135,51 @@ impl OneHotEncoder { } } } + + /// Transform categorical variables to one-hot encoded and return a new matrix + pub fn transform>(&self, x: &M) -> Option { + let (nrows, p) = x.shape(); + let additional_params: Vec = self + .series_encoders + .iter() + .map(|enc| enc.num_categories) + .collect(); + + let new_param_num: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); + let mut res = M::zeros(nrows, new_param_num); + // copy old data in x to their new location + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); } } + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { + let cidx = new_col_idx[old_cidx]; + let col_iter = (0..nrows).map(|r| res.get(r, cidx).to_category()); + let sencoder = &self.series_encoders[pidx]; + let oh_series: Vec>> = sencoder.transform_iter(col_iter); + + for (row, oh_vec) in oh_series.iter().enumerate() { + match oh_vec { + None => { + // Bad value in a series causes in to be invalid + // todo: proper error handling, so user can know where the bad value is + return None; + } + Some(v) => { + // copy one hot vectors to their place in the data matrix; + for (col_ofst, &val) in v.iter().enumerate() { + res.set(row, cidx + col_ofst, val); + } } + } + } + } + Some(res) + } +} fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { let 
(nrows, _) = data.shape();

From c987d39d439462e5abc12cf34276d8735afb1145 Mon Sep 17 00:00:00 2001
From: gaxler
Date: Sat, 30 Jan 2021 19:31:09 -0800
Subject: [PATCH 26/81] tests + force Categorizable to be RealNumber

---
 src/preprocessing/categorical_encoders.rs | 138 +++++++++++++++++-----
 src/preprocessing/data_traits.rs          |   4 +-
 2 files changed, 114 insertions(+), 28 deletions(-)

diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoders.rs
index 585f13a..063aa5c 100644
--- a/src/preprocessing/categorical_encoders.rs
+++ b/src/preprocessing/categorical_encoders.rs
@@ -1,6 +1,8 @@
 //! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies
 //! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents
 //!
+//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [SeriesOneHotEncoder](../series_encoder/struct.SeriesOneHotEncoder.html)
+//!
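Before the tests below, the column-offset bookkeeping that `find_new_idxs` implements (and that `transform` relies on) can be re-derived with a plain loop. `naive_new_idxs` is a hypothetical loop-based equivalent written for this note, not the library function; each categorical column of size v keeps its block's start index and pushes every later column right by v - 1:

```rust
fn naive_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec<usize> {
    let mut offset = 0; // columns added so far by expanded one-hot blocks
    let mut out = Vec::with_capacity(num_params);
    for p in 0..num_params {
        out.push(p + offset); // a categorical column keeps its block's start index
        if let Some(k) = cat_idxs.iter().position(|&i| i == p) {
            offset += cat_sizes[k] - 1; // a size-v category widens the matrix by v - 1
        }
    }
    out
}

fn main() {
    // mirrors the unit test in this series: [0, 1, 2] -> [0, 1, 4]
    assert_eq!(naive_new_idxs(3, &[3], &[1]), vec![0, 1, 4]);
    // the doc example: 4 columns, categorical at 1 (2 cats) and 3 (4 cats)
    assert_eq!(naive_new_idxs(4, &[2, 4], &[1, 3]), vec![0, 1, 3, 4]);
}
```

The second assertion matches the usage example shown earlier: four input columns with categorical columns 1 and 3 expand to eight output columns, with the second one-hot block starting at index 4.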
@@ -75,12 +85,14 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) .collect(); new_param_idxs } + fn validate_col_is_categorical(data: &Vec) -> bool { for v in data { if !v.is_valid() { return false} } true } + /// Encode Categorical variavbles of data matrix to one-hot pub struct OneHotEncoder { series_encoders: Vec>, @@ -167,13 +179,13 @@ impl OneHotEncoder { // Bad value in a series causes in to be invalid // todo: proper error handling, so user can know where the bad value is return None; - } + } Some(v) => { // copy one hot vectors to their place in the data matrix; for (col_ofst, &val) in v.iter().enumerate() { res.set(row, cidx + col_ofst, val); - } - } + } + } } } } @@ -181,21 +193,93 @@ impl OneHotEncoder { } } - fn build_series_encoders(data: &M, idxs: &[usize]) -> Vec> { - let (nrows, _) = data.shape(); - // let mut res: Vec> = Vec::with_capacity(idxs.len()); - let mut tmp_col: Vec = Vec::with_capacity(nrows); +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::preprocessing::series_encoder::SeriesOneHotEncoder; - let res: Vec> = idxs - .iter() - .map(|&idx| { - data.copy_col_as_vec(idx, &mut tmp_col); - let hashable_col = tmp_col.iter().map(|v| hashable_num::(v)); - SeriesOneHotEncoder::fit_to_iter(hashable_col) - }) - .collect(); - res + #[test] + fn adjust_idxs() { + assert_eq!(find_new_idxs(0, &[], &[]), Vec::new()); + // [0,1,2] -> [0, 1, 1, 1, 2] + assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); } + fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { + let orig = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); -} \ No newline at end of file + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { + // Categorical first and last + let orig = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 1.5, 3.0], + &[1.5, 2.0, 1.5, 4.0], + &[1.5, 1.0, 1.5, 5.0], + &[1.5, 2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + #[test] + fn hash_encode_f64_series() { + let series = vec![3.0, 1.0, 2.0, 1.0]; + let hashable_series: Vec = + series.iter().map(|v| v.to_category()).collect(); + let enc = SeriesOneHotEncoder::from_positional_category_vec(hashable_series); + let inv = enc.invert_one(vec![0.0, 0.0, 1.0]); + let orig_val: f64 = inv.unwrap().into(); + assert_eq!(orig_val, 2.0); + } + #[test] + fn test_fit() { + let (X, _) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + assert_eq!(oh_enc.series_encoders.len(), 2); + + let num_cat: Vec = oh_enc + .series_encoders + .iter() + .map(|a| a.num_categories) + .collect(); + assert_eq!(num_cat, vec![2, 4]); + } + + #[test] + fn matrix_transform_test() { + let (X, expectedX) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let nm = oh_enc.transform(&X).unwrap(); + assert_eq!(nm, expectedX); + + let (X, expectedX) = 
build_cat_first_and_last(); + let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); + let oh_enc = OneHotEncoder::fit(&X, params).unwrap(); + let nm = oh_enc.transform(&X).unwrap(); + assert_eq!(nm, expectedX); + } +} diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs index 04b534e..16924bb 100644 --- a/src/preprocessing/data_traits.rs +++ b/src/preprocessing/data_traits.rs @@ -1,11 +1,13 @@ //! Traits to indicate that float variables can be viewed as categorical //! This module assumes +use crate::math::num::RealNumber; + pub type CategoricalFloat = u16; // pub struct CategoricalFloat(u16); -pub trait Categorizable { +pub trait Categorizable: RealNumber { type A; fn to_category(self) -> CategoricalFloat; From 2f03c1d6d74834d5bad990a5fd9c7cd7962fa351 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:54:42 -0800 Subject: [PATCH 27/81] module name change --- ...cal_encoders.rs => categorical_encoder.rs} | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) rename src/preprocessing/{categorical_encoders.rs => categorical_encoder.rs} (89%) diff --git a/src/preprocessing/categorical_encoders.rs b/src/preprocessing/categorical_encoder.rs similarity index 89% rename from src/preprocessing/categorical_encoders.rs rename to src/preprocessing/categorical_encoder.rs index 063aa5c..22cd052 100644 --- a/src/preprocessing/categorical_encoders.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -38,7 +38,7 @@ pub struct OneHotEncoderParams { /// Column number that contain categorical variable pub col_idx_categorical: Option>, /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables - pub infer_categorical: bool, + infer_categorical: bool, } impl OneHotEncoderParams { @@ -86,14 +86,17 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) new_param_idxs } -fn validate_col_is_categorical(data: &Vec) -> bool { +fn validate_col_is_categorical(data: &[T]) -> bool { for v in data { - if !v.is_valid() { return false} + if !v.is_valid() { + return false; + } } true } /// Encode Categorical variavbles of data matrix to one-hot +#[derive(Debug, Clone)] pub struct OneHotEncoder { series_encoders: Vec>, col_idx_categorical: Vec, @@ -102,7 +105,7 @@ pub struct OneHotEncoder { impl OneHotEncoder { /// PlaceHolder - pub fn fit>( + pub fn fit>( data: &M, params: OneHotEncoderParams, ) -> Result { @@ -117,20 +120,24 @@ impl OneHotEncoder { (Some(mut idxs), false) => { // make sure categories have same order as data columns - idxs.sort(); + idxs.sort_unstable(); let (nrows, _) = data.shape(); // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - - let mut res: Vec> = Vec::with_capacity(idxs.len()); - + + let mut res: Vec> = + Vec::with_capacity(idxs.len()); + for &idx in &idxs { data.copy_col_as_vec(idx, &mut col_buf); if !validate_col_is_categorical(&col_buf) { - let msg = format!("Column {} of data matrix containts non categorizable (integer) values", idx); - return Err(Failed::fit(&msg[..])) + let msg = format!( + "Column {} of data matrix containts non categorizable (integer) values", + idx + ); + return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| v.to_category()); res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); @@ -149,7 +156,7 @@ impl OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Option { + pub fn 
transform>(&self, x: &M) -> Option { let (nrows, p) = x.shape(); let additional_params: Vec = self .series_encoders @@ -201,7 +208,7 @@ mod tests { #[test] fn adjust_idxs() { - assert_eq!(find_new_idxs(0, &[], &[]), Vec::new()); + assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); // [0,1,2] -> [0, 1, 1, 1, 2] assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); } @@ -282,4 +289,22 @@ mod tests { let nm = oh_enc.transform(&X).unwrap(); assert_eq!(nm, expectedX); } + + #[test] + fn fail_on_bad_category() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let params = OneHotEncoderParams::from_cat_idx(&[1]); + match OneHotEncoder::fit(&m, params) { + Err(_) => { + assert!(true); + } + _ => assert!(false), + } + } } From ca0816db97d7fa1426b98c5b97b548a8a89d2b12 Mon Sep 17 00:00:00 2001 From: gaxler Date: Sat, 30 Jan 2021 19:55:04 -0800 Subject: [PATCH 28/81] Clippy fixes --- src/preprocessing/data_traits.rs | 14 ++++++-------- src/preprocessing/mod.rs | 2 +- src/preprocessing/series_encoder.rs | 3 ++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/preprocessing/data_traits.rs b/src/preprocessing/data_traits.rs index 16924bb..38d9e3e 100644 --- a/src/preprocessing/data_traits.rs +++ b/src/preprocessing/data_traits.rs @@ -1,11 +1,12 @@ //! Traits to indicate that float variables can be viewed as categorical -//! This module assumes +//! This module assumes use crate::math::num::RealNumber; pub type CategoricalFloat = u16; // pub struct CategoricalFloat(u16); +const ERROR_MARGIN: f64 = 0.001; pub trait Categorizable: RealNumber { type A; @@ -13,11 +14,9 @@ pub trait Categorizable: RealNumber { fn to_category(self) -> CategoricalFloat; fn is_valid(self) -> bool; - } impl Categorizable for f32 { - type A = CategoricalFloat; fn to_category(self) -> CategoricalFloat { @@ -26,20 +25,19 @@ impl Categorizable for f32 { fn is_valid(self) -> bool { let a = self.to_category(); - a as f32 == self + (a as f32 - self).abs() < (ERROR_MARGIN as f32) } } impl Categorizable for f64 { - type A = CategoricalFloat; - fn to_category(self) ->CategoricalFloat { + fn to_category(self) -> CategoricalFloat { self as CategoricalFloat } fn is_valid(self) -> bool { let a = self.to_category(); - a as f64 == self + (a as f64 - self).abs() < ERROR_MARGIN } -} \ No newline at end of file +} diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index c07b982..4a1abf3 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,5 +1,5 @@ /// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents -pub mod categorical_encoders; +pub mod categorical_encoder; mod data_traits; /// Encode a series (column, array) of categorical variables as one-hot vectors pub mod series_encoder; diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 321f049..438d678 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -65,6 +65,7 @@ pub fn make_one_hot>( /// println!("{:?}", enc_lv.get_categories()); /// assert_eq!(enc_lv.transform_one::(&"dog"), enc_lm.transform_one::(&"dog")) /// ``` +#[derive(Debug, Clone)] pub struct SeriesOneHotEncoder { category_map: HashMap, categories: Vec, @@ -134,7 +135,7 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder &self, categories: &'a [CategoryType], ) -> Vec>> { - let v = categories.iter().map(|a| a.clone()); + let v = 
categories.iter().cloned();
         self.transform_iter(v)
     }

From 863be5ef756518f8d213266f195a4c06b403d5fd Mon Sep 17 00:00:00 2001
From: gaxler
Date: Sat, 30 Jan 2021 20:09:52 -0800
Subject: [PATCH 29/81] style fixes

---
 src/preprocessing/categorical_encoder.rs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs
index 22cd052..b05a344 100644
--- a/src/preprocessing/categorical_encoder.rs
+++ b/src/preprocessing/categorical_encoder.rs
@@ -262,9 +262,9 @@ mod tests {
     }
     #[test]
     fn test_fit() {
-        let (X, _) = build_fake_matrix();
+        let (x, _) = build_fake_matrix();
         let params = OneHotEncoderParams::from_cat_idx(&[1, 3]);
-        let oh_enc = OneHotEncoder::fit(&X, params).unwrap();
+        let oh_enc = OneHotEncoder::fit(&x, params).unwrap();
         assert_eq!(oh_enc.series_encoders.len(), 2);

         let num_cat: Vec = oh_enc
@@ -277,17 +277,17 @@ mod tests {

     #[test]
     fn matrix_transform_test() {
-        let (X, expectedX) = build_fake_matrix();
+        let (x, expected_x) = build_fake_matrix();
         let params = OneHotEncoderParams::from_cat_idx(&[1, 3]);
-        let oh_enc = OneHotEncoder::fit(&X, params).unwrap();
-        let nm = oh_enc.transform(&X).unwrap();
-        assert_eq!(nm, expectedX);
+        let oh_enc = OneHotEncoder::fit(&x, params).unwrap();
+        let nm = oh_enc.transform(&x).unwrap();
+        assert_eq!(nm, expected_x);

-        let (X, expectedX) = build_cat_first_and_last();
+        let (x, expected_x) = build_cat_first_and_last();
         let params = OneHotEncoderParams::from_cat_idx(&[0, 2]);
-        let oh_enc = OneHotEncoder::fit(&X, params).unwrap();
-        let nm = oh_enc.transform(&X).unwrap();
-        assert_eq!(nm, expected_x);
+        let oh_enc = OneHotEncoder::fit(&x, params).unwrap();
+        let nm = oh_enc.transform(&x).unwrap();
+        assert_eq!(nm, expected_x);
     }

     #[test]

From f4b5936dcfde9c3e82c4098016c2555a4e6210e2 Mon Sep 17 00:00:00 2001
From: gaxler
Date: Sat, 30 Jan 2021 20:18:52 -0800
Subject: [PATCH 30/81] fmt

---
 src/preprocessing/categorical_encoder.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs
index b05a344..706670b 100644
--- a/src/preprocessing/categorical_encoder.rs
+++ b/src/preprocessing/categorical_encoder.rs
@@ -277,17 +277,17 @@ mod tests {

     #[test]
     fn matrix_transform_test() {
-        let (x, expected_x) = build_fake_matrix();
+        let (x, expected_x) = build_fake_matrix();
         let params = OneHotEncoderParams::from_cat_idx(&[1, 3]);
         let oh_enc = OneHotEncoder::fit(&x, params).unwrap();
         let nm = oh_enc.transform(&x).unwrap();
-        assert_eq!(nm, expected_x);
+        assert_eq!(nm, expected_x);

-        let (x, expected_x) = build_cat_first_and_last();
+        let (x, expected_x) = build_cat_first_and_last();
         let params = OneHotEncoderParams::from_cat_idx(&[0, 2]);
         let oh_enc = OneHotEncoder::fit(&x, params).unwrap();
         let nm = oh_enc.transform(&x).unwrap();
-        assert_eq!(nm, expected_x);
+        assert_eq!(nm, expected_x);
     }

From a882741e1273e7e0d2742f48f84920ae759aadaf Mon Sep 17 00:00:00 2001
From: gaxler
Date: Mon, 1 Feb 2021 11:20:03 -0800
Subject: [PATCH 31/81] If transform fails - fail before copying the whole
 matrix (changed the order of copying: first do the categorical, then copy
 the rest)

---
 src/preprocessing/categorical_encoder.rs | 46 ++++++++++++++++++------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs
index 706670b..7e71119 100644
---
a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -156,7 +156,7 @@ impl OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Option { + pub fn transform>(&self, x: &M) -> Result { let (nrows, p) = x.shape(); let additional_params: Vec = self .series_encoders @@ -164,28 +164,24 @@ impl OneHotEncoder { .map(|enc| enc.num_categories) .collect(); - let new_param_num: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + // Eac category of size v adds v-1 params + let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); - let mut res = M::zeros(nrows, new_param_num); - // copy old data in x to their new location - for (old_p, &new_p) in new_col_idx.iter().enumerate() { - for r in 0..nrows { - let val = x.get(r, old_p); - res.set(r, new_p, val); - } - } + let mut res = M::zeros(nrows, expandws_p); + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { let cidx = new_col_idx[old_cidx]; - let col_iter = (0..nrows).map(|r| res.get(r, cidx).to_category()); + let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); let sencoder = &self.series_encoders[pidx]; let oh_series: Vec>> = sencoder.transform_iter(col_iter); for (row, oh_vec) in oh_series.iter().enumerate() { match oh_vec { None => { - // Bad value in a series causes in to be invalid - // todo: proper error handling, so user can know where the bad value is - return None; + // Since we support T types, bad value in a series causes in to be invalid + let msg = format!("At least one value in column {} doesn't conform to category definition", old_cidx); + return Err(Failed::transform(&msg[..])); } Some(v) => { // copy one hot vectors to their place in the data matrix; @@ -196,7 +192,27 @@ impl OneHotEncoder { } } } - Some(res) + + // copy old data in x to their new location while skipping catergorical vars (already treated) + let mut skip_idx_iter = self.col_idx_categorical.iter(); + let mut cur_skip = skip_idx_iter.next(); + + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + // if found treated varible, skip it + if let Some(&v) = cur_skip { + if v == old_p { + cur_skip = skip_idx_iter.next(); + continue; + } + } + + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); + } + } + + Ok(res) } } From 03b9f76e9f9a18910cd59c5859b21571e05bb559 Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 1 Feb 2021 11:24:20 -0800 Subject: [PATCH 32/81] Doc+Naming Improvement --- src/preprocessing/categorical_encoder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 7e71119..7a0f5d9 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -52,12 +52,12 @@ impl OneHotEncoderParams { } /// Calculate the offset to parameters to due introduction of one-hot encoding -fn find_new_idxs(num_params: usize, cat_sizes: &[usize], encoded_idxs: &[usize]) -> Vec { +fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec { // This functions uses iterators and returns a vector. 
// In case we get a huge amount of paramenters this might be a problem // todo: Change this such that it will return an iterator - let cat_idx = encoded_idxs.iter().copied().chain((num_params..).take(1)); + let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); // Offset is constant between two categorical values, here we calculate the number of steps // that remain constant @@ -103,8 +103,8 @@ pub struct OneHotEncoder { } impl OneHotEncoder { - /// PlaceHolder - + + /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, From 228b54baf7d04715c1e170af2be506a99caf044e Mon Sep 17 00:00:00 2001 From: gaxler Date: Mon, 1 Feb 2021 11:24:50 -0800 Subject: [PATCH 33/81] fmt --- src/preprocessing/categorical_encoder.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 7a0f5d9..e3e8ce9 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -103,8 +103,7 @@ pub struct OneHotEncoder { } impl OneHotEncoder { - - /// Create an encoder instance with categories infered from data matrix + /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, From 19ff6df84cd3d55f7accd44b2986289691059fa8 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 17:40:58 -0800 Subject: [PATCH 34/81] Separate mapper object --- src/preprocessing/series_encoder.rs | 67 +++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 438d678..4e9625e 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,6 +8,73 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; +#[derive(Debug, Clone)] +pub struct CategoryMapper { + category_map: HashMap, + categories: Vec, + num_categories: usize, +} + +impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { + fn fit_to_iter(categories: impl Iterator) -> Self { + let mut category_map: HashMap = HashMap::new(); + let mut category_num = 0usize; + let mut unique_lables: Vec = Vec::new(); + + for l in categories { + if !category_map.contains_key(&l) { + category_map.insert(l.clone(), category_num); + unique_lables.push(l.clone()); + category_num += 1; + } + } + Self { + category_map, + num_categories: category_num, + categories: unique_lables, + } + } + + fn from_category_map(category_map: HashMap) -> Self { + let mut _unique_cat: Vec<(CategoryType, usize)> = + category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); + _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); + let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); + Self { + num_categories: categories.len(), + categories, + category_map, + } + } + + fn from_positional_category_vec(categories: Vec) -> Self { + let category_map: HashMap = categories + .iter() + .enumerate() + .map(|(v, k)| (k.clone(), v)) + .collect(); + Self { + num_categories: categories.len(), + category_map, + categories, + } + } + + /// Get label num of a category + fn get_num(&self, category: &CategoryType) -> Option<&usize> { + self.category_map.get(category) + } + + /// Return category corresponding to label num + fn get_cat(&self, num: usize) -> &CategoryType { + &self.categories[num] + } + + fn get_categories(&self) -> &[CategoryType] { + &self.categories[..] 
+ } +} + /// Make a one-hot encoded vector from a categorical variable /// /// Example: From d31145b4fe24e0718aef3b0b9371e9e2834b31ce Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:19:36 -0800 Subject: [PATCH 35/81] Define common series encoder behavior --- src/preprocessing/series_encoder.rs | 146 +++++++++++++--------------- 1 file changed, 70 insertions(+), 76 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 4e9625e..4e9ddf9 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -75,6 +75,50 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } } +/// Defines common behavior for series encoders(e.g. OneHot, Ordinal) +pub trait SeriesEncoder: + where + CategoryType:Hash + Eq + Clone +{ + /// Fit an encoder to a lable list + fn fit_to_iter(categories: impl Iterator) -> Self; + + /// Number of categories for categorical variable + fn num_categories(&self) -> usize; + + /// Build an encoder from a predefined (category -> class number) map + fn from_category_map(category_map: HashMap) -> Self; + + /// Build an encoder from a predefined positional category-class num vector + fn from_positional_category_vec(categories: Vec) -> Self; + + /// Transform a single category type into a one-hot vector + fn transform_one>(&self, category: &CategoryType) -> Option; + + /// Invert one-hot vector, back to the category + fn invert_one>(&self, one_hot: V) -> Result; + + /// Get categories ordered by encoder's category enumeration + fn get_categories(&self) -> &[CategoryType]; + + /// Take an iterator as a series to transform + fn transform_iter>( + &self, + cat_it: impl Iterator, + ) -> Vec> { + cat_it.map(|l| self.transform_one(&l)).collect() + } + + /// Transform a slice of category types into one-hot vectors + /// None is returned if unknown category is encountered + fn transfrom_series>( + &self, + categories: &[CategoryType], + ) -> Vec> { + let v = categories.iter().cloned(); + self.transform_iter(v) + } +} /// Make a one-hot encoded vector from a categorical variable /// /// Example: @@ -134,104 +178,47 @@ pub fn make_one_hot>( /// ``` #[derive(Debug, Clone)] pub struct SeriesOneHotEncoder { - category_map: HashMap, - categories: Vec, - /// Number of categories for categorical variable - pub num_categories: usize, + mapper: CategoryMapper, } -impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder { - /// Fit an encoder to a lable list - pub fn fit_to_iter(categories: impl Iterator) -> Self { - let mut category_map: HashMap = HashMap::new(); - let mut category_num = 0usize; - let mut unique_lables: Vec = Vec::new(); +impl SeriesEncoder for SeriesOneHotEncoder { - for l in categories { - if !category_map.contains_key(&l) { - category_map.insert(l.clone(), category_num); - unique_lables.push(l.clone()); - category_num += 1; + fn fit_to_iter(categories: impl Iterator) -> Self { + Self {mapper:CategoryMapper::fit_to_iter(categories)} } - } - Self { - category_map, - num_categories: category_num, - categories: unique_lables, - } - } /// Build an encoder from a predefined (category -> class number) map - pub fn from_category_map(category_map: HashMap) -> Self { - let mut _unique_cat: Vec<(CategoryType, usize)> = - category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); - _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); - let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); - Self { - num_categories: categories.len(), - categories, - category_map, + fn 
from_category_map(category_map: HashMap) -> Self { + Self {mapper: CategoryMapper::from_category_map(category_map)} } - } /// Build an encoder from a predefined positional category-class num vector - pub fn from_positional_category_vec(categories: Vec) -> Self { - let category_map: HashMap = categories - .iter() - .enumerate() - .map(|(v, k)| (k.clone(), v)) - .collect(); - Self { - num_categories: categories.len(), - category_map, - categories, + fn from_positional_category_vec(categories: Vec) -> Self { + Self {mapper:CategoryMapper::from_positional_category_vec(categories)} } + + fn num_categories(&self) -> usize { + self.mapper.num_categories } - /// Take an iterator as a series to transform - pub fn transform_iter( - &self, - cat_it: impl Iterator, - ) -> Vec>> { - cat_it.map(|l| self.transform_one(&l)).collect() + fn get_categories(&self) -> &[CategoryType] { + self.mapper.get_categories() } - /// Transform a slice of category types into one-hot vectors - /// None is returned if unknown category is encountered - pub fn transfrom_series( - &self, - categories: &'a [CategoryType], - ) -> Vec>> { - let v = categories.iter().cloned(); - self.transform_iter(v) - } - - /// Transform a single category type into a one-hot vector - pub fn transform_one(&self, category: &CategoryType) -> Option> { - match self.category_map.get(category) { - None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_categories)), - } - } - - /// Get categories ordered by encoder's category enumeration - pub fn get_categories(&self) -> &Vec { - &self.categories - } - - /// Invert one-hot vector, back to the category - pub fn invert_one(&self, one_hot: Vec) -> Result { + fn invert_one>(&self, one_hot: V) -> Result + { let pos = U::from_f64(1f64).unwrap(); + + let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); - let s: Vec = one_hot - .into_iter() + let s: Vec = oh_it .enumerate() .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) .collect(); if s.len() == 1 { let idx = s[0]; - return Ok(self.categories[idx].clone()); + return Ok(self.mapper.get_cat(idx).clone()); } let pos_entries = format!( "Expected a single positive entry, {} entires found", @@ -239,6 +226,13 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> SeriesOneHotEncoder ); Err(Failed::transform(&pos_entries[..])) } + + fn transform_one>(&self, category: &CategoryType) -> Option { + match self.mapper.get_num(category) { + None => None, + Some(&idx) => Some(make_one_hot(idx, self.num_categories())), + } + } } #[cfg(test)] From 237b1160b17308252b6040d4c5ca07880079051c Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:20:27 -0800 Subject: [PATCH 36/81] doc update --- src/preprocessing/series_encoder.rs | 64 ++++++++++++++++------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 4e9ddf9..9d7e259 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,6 +8,7 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; +/// Bi-directional map category <-> label num. 
#[derive(Debug, Clone)] pub struct CategoryMapper { category_map: HashMap, @@ -16,7 +17,9 @@ pub struct CategoryMapper { } impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { - fn fit_to_iter(categories: impl Iterator) -> Self { + + /// Fit an encoder to a lable iterator + pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); let mut category_num = 0usize; let mut unique_lables: Vec = Vec::new(); @@ -34,8 +37,9 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { categories: unique_lables, } } - - fn from_category_map(category_map: HashMap) -> Self { + + /// Build an encoder from a predefined (category -> class number) map + pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(CategoryType, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); @@ -46,8 +50,9 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { category_map, } } - - fn from_positional_category_vec(categories: Vec) -> Self { + + /// Build an encoder from a predefined positional category-class num vector + pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories .iter() .enumerate() @@ -61,16 +66,17 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Get label num of a category - fn get_num(&self, category: &CategoryType) -> Option<&usize> { + pub fn get_num(&self, category: &CategoryType) -> Option<&usize> { self.category_map.get(category) } /// Return category corresponding to label num - fn get_cat(&self, num: usize) -> &CategoryType { + pub fn get_cat(&self, num: usize) -> &CategoryType { &self.categories[num] } - fn get_categories(&self) -> &[CategoryType] { + /// List all categories (position = category number) + pub fn get_categories(&self) -> &[CategoryType] { &self.categories[..] 
} } @@ -80,14 +86,14 @@ pub trait SeriesEncoder: where CategoryType:Hash + Eq + Clone { - /// Fit an encoder to a lable list + /// Fit an encoder to a lable iterator fn fit_to_iter(categories: impl Iterator) -> Self; /// Number of categories for categorical variable fn num_categories(&self) -> usize; /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self; + fn from_category_map(category_map: HashMap) -> Self; /// Build an encoder from a predefined positional category-class num vector fn from_positional_category_vec(categories: Vec) -> Self; @@ -119,6 +125,7 @@ pub trait SeriesEncoder: self.transform_iter(v) } } + /// Make a one-hot encoded vector from a categorical variable /// /// Example: @@ -182,20 +189,20 @@ pub struct SeriesOneHotEncoder { } impl SeriesEncoder for SeriesOneHotEncoder { - + fn fit_to_iter(categories: impl Iterator) -> Self { Self {mapper:CategoryMapper::fit_to_iter(categories)} - } + } /// Build an encoder from a predefined (category -> class number) map fn from_category_map(category_map: HashMap) -> Self { Self {mapper: CategoryMapper::from_category_map(category_map)} - } + } /// Build an encoder from a predefined positional category-class num vector fn from_positional_category_vec(categories: Vec) -> Self { Self {mapper:CategoryMapper::from_positional_category_vec(categories)} - } + } fn num_categories(&self) -> usize { self.mapper.num_categories @@ -207,25 +214,25 @@ impl SeriesEncoder for SeriesOneH fn invert_one>(&self, one_hot: V) -> Result { - let pos = U::from_f64(1f64).unwrap(); + let pos = U::from_f64(1f64).unwrap(); let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); - + let s: Vec = oh_it - .enumerate() - .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) - .collect(); - - if s.len() == 1 { - let idx = s[0]; + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + + if s.len() == 1 { + let idx = s[0]; return Ok(self.mapper.get_cat(idx).clone()); + } + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); + Err(Failed::transform(&pos_entries[..])) } - let pos_entries = format!( - "Expected a single positive entry, {} entires found", - s.len() - ); - Err(Failed::transform(&pos_entries[..])) - } fn transform_one>(&self, category: &CategoryType) -> Option { match self.mapper.get_num(category) { @@ -233,6 +240,7 @@ impl SeriesEncoder for SeriesOneH Some(&idx) => Some(make_one_hot(idx, self.num_categories())), } } + } #[cfg(test)] From ef06f45638ec42540d74f41ffd2171f2d97e793f Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 2 Feb 2021 18:21:06 -0800 Subject: [PATCH 37/81] Switch to use SeriesEncoder trait --- src/preprocessing/categorical_encoder.rs | 35 ++++++++++++++---------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index e3e8ce9..75cbf2b 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -6,7 +6,7 @@ //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; +//! use smartcore::preprocessing::categorical_encoder::{OneHotEnc, OneHotEncoderParams}; //! let data = DenseMatrix::from_2d_array(&[ //! &[1.5, 1.0, 1.5, 3.0], //! &[1.5, 2.0, 1.5, 4.0], @@ -15,7 +15,7 @@ //! ]); //! 
let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); //! // Infer number of categories from data and return a reusable encoder -//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); +//! let encoder = OneHotEnc::fit(&data, encoder_params).unwrap(); //! // Transform categorical to one-hot encoded (can transform similar) //! let oh_data = encoder.transform(&data).unwrap(); //! // Produces the following: @@ -30,7 +30,7 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::SeriesOneHotEncoder; +use crate::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; /// OneHotEncoder Parameters #[derive(Debug, Clone)] @@ -97,17 +97,17 @@ fn validate_col_is_categorical(data: &[T]) -> bool { /// Encode Categorical variavbles of data matrix to one-hot #[derive(Debug, Clone)] -pub struct OneHotEncoder { - series_encoders: Vec>, +pub struct OneHotEncoder { + series_encoders: Vec, col_idx_categorical: Vec, } -impl OneHotEncoder { +impl> OneHotEncoder { /// Create an encoder instance with categories infered from data matrix pub fn fit>( data: &M, params: OneHotEncoderParams, - ) -> Result { + ) -> Result, Failed> { match (params.col_idx_categorical, params.infer_categorical) { (None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", @@ -126,7 +126,7 @@ impl OneHotEncoder { // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - let mut res: Vec> = + let mut res: Vec = Vec::with_capacity(idxs.len()); for &idx in &idxs { @@ -139,7 +139,7 @@ impl OneHotEncoder { return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| v.to_category()); - res.push(SeriesOneHotEncoder::fit_to_iter(hashable_col)); + res.push(E::fit_to_iter(hashable_col)); } Ok(Self { @@ -160,7 +160,7 @@ impl OneHotEncoder { let additional_params: Vec = self .series_encoders .iter() - .map(|enc| enc.num_categories) + .map(|enc| enc.num_categories()) .collect(); // Eac category of size v adds v-1 params @@ -215,12 +215,17 @@ impl OneHotEncoder { } } +/// Convinince type for common use +pub type OneHotEnc = OneHotEncoder>; + + #[cfg(test)] mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::preprocessing::series_encoder::SeriesOneHotEncoder; + #[test] fn adjust_idxs() { assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); @@ -279,13 +284,13 @@ mod tests { fn test_fit() { let (x, _) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); assert_eq!(oh_enc.series_encoders.len(), 2); let num_cat: Vec = oh_enc .series_encoders .iter() - .map(|a| a.num_categories) + .map(|a| a.num_categories()) .collect(); assert_eq!(num_cat, vec![2, 4]); } @@ -294,13 +299,13 @@ mod tests { fn matrix_transform_test() { let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let oh_enc = OneHotEnc::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); 
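         // `nm` should now match `expected_x`, the one-hot layout defined in build_cat_first_and_last()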
assert_eq!(nm, expected_x); } @@ -315,7 +320,7 @@ mod tests { ]); let params = OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEncoder::fit(&m, params) { + match OneHotEnc::fit(&m, params) { Err(_) => { assert!(true); } From 700d320724c8dad09cdd31e3d73e5cc4d91c33ce Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 10:45:25 -0800 Subject: [PATCH 38/81] simplify SeriesEncoder trait --- src/preprocessing/series_encoder.rs | 134 ++++++++++++++-------------- 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 9d7e259..6975c0d 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -10,19 +10,22 @@ use std::hash::Hash; /// Bi-directional map category <-> label num. #[derive(Debug, Clone)] -pub struct CategoryMapper { - category_map: HashMap, - categories: Vec, +pub struct CategoryMapper { + category_map: HashMap, + categories: Vec, num_categories: usize, } -impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { +impl<'a, C> CategoryMapper +where + C: 'a + Hash + Eq + Clone +{ /// Fit an encoder to a lable iterator - pub fn fit_to_iter(categories: impl Iterator) -> Self { - let mut category_map: HashMap = HashMap::new(); + pub fn fit_to_iter(categories: impl Iterator) -> Self { + let mut category_map: HashMap = HashMap::new(); let mut category_num = 0usize; - let mut unique_lables: Vec = Vec::new(); + let mut unique_lables: Vec = Vec::new(); for l in categories { if !category_map.contains_key(&l) { @@ -39,11 +42,11 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Build an encoder from a predefined (category -> class number) map - pub fn from_category_map(category_map: HashMap) -> Self { - let mut _unique_cat: Vec<(CategoryType, usize)> = + pub fn from_category_map(category_map: HashMap) -> Self { + let mut _unique_cat: Vec<(C, usize)> = category_map.iter().map(|(k, v)| (k.clone(), *v)).collect(); _unique_cat.sort_by(|a, b| a.1.cmp(&b.1)); - let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); + let categories: Vec = _unique_cat.into_iter().map(|a| a.0).collect(); Self { num_categories: categories.len(), categories, @@ -52,8 +55,8 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Build an encoder from a predefined positional category-class num vector - pub fn from_positional_category_vec(categories: Vec) -> Self { - let category_map: HashMap = categories + pub fn from_positional_category_vec(categories: Vec) -> Self { + let category_map: HashMap = categories .iter() .enumerate() .map(|(v, k)| (k.clone(), v)) @@ -66,64 +69,49 @@ impl<'a, CategoryType: 'a + Hash + Eq + Clone> CategoryMapper { } /// Get label num of a category - pub fn get_num(&self, category: &CategoryType) -> Option<&usize> { + pub fn get_num(&self, category: &C) -> Option<&usize> { self.category_map.get(category) } /// Return category corresponding to label num - pub fn get_cat(&self, num: usize) -> &CategoryType { + pub fn get_cat(&self, num: usize) -> &C { &self.categories[num] } /// List all categories (position = category number) - pub fn get_categories(&self) -> &[CategoryType] { + pub fn get_categories(&self) -> &[C] { &self.categories[..] } } /// Defines common behavior for series encoders(e.g. 
OneHot, Ordinal) -pub trait SeriesEncoder: +pub trait SeriesEncoder: where - CategoryType:Hash + Eq + Clone + C: Hash + Eq + Clone { /// Fit an encoder to a lable iterator - fn fit_to_iter(categories: impl Iterator) -> Self; + fn fit_to_iter(categories: impl Iterator) -> Self; /// Number of categories for categorical variable fn num_categories(&self) -> usize; - /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self; - - /// Build an encoder from a predefined positional category-class num vector - fn from_positional_category_vec(categories: Vec) -> Self; - /// Transform a single category type into a one-hot vector - fn transform_one>(&self, category: &CategoryType) -> Option; + fn transform_one>(&self, category: &C) -> Option; /// Invert one-hot vector, back to the category - fn invert_one>(&self, one_hot: V) -> Result; + fn invert_one>(&self, one_hot: V) -> Result; /// Get categories ordered by encoder's category enumeration - fn get_categories(&self) -> &[CategoryType]; + fn get_categories(&self) -> &[C]; /// Take an iterator as a series to transform + /// None is returned if unknown category is encountered fn transform_iter>( &self, - cat_it: impl Iterator, + cat_it: impl Iterator, ) -> Vec> { cat_it.map(|l| self.transform_one(&l)).collect() } - - /// Transform a slice of category types into one-hot vectors - /// None is returned if unknown category is encountered - fn transfrom_series>( - &self, - categories: &[CategoryType], - ) -> Vec> { - let v = categories.iter().cloned(); - self.transform_iter(v) - } } /// Make a one-hot encoded vector from a categorical variable @@ -153,22 +141,22 @@ pub fn make_one_hot>( /// Example: /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; /// /// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; /// let it = fake_categories.iter().map(|&a| a); -/// let enc = SeriesOneHotEncoder::::fit_to_iter(it); +/// let enc: SeriesOneHotEncoder:: = SeriesEncoder::fit_to_iter(it); /// let oh_vec: Vec = enc.transform_one(&1).unwrap(); /// // notice that 1 is actually a zero-th positional category /// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); /// ``` /// -/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` /// /// /// ``` /// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::SeriesOneHotEncoder; +/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder, CategoryMapper}; /// /// let category_map: HashMap<&str, usize> = /// vec![("cat", 2), ("background",0), ("dog", 1)] @@ -176,43 +164,53 @@ pub fn make_one_hot>( /// .collect(); /// let category_vec = vec!["background", "dog", "cat"]; /// -/// let enc_lv = SeriesOneHotEncoder::<&str>::from_positional_category_vec(category_vec); -/// let enc_lm = SeriesOneHotEncoder::<&str>::from_category_map(category_map); +/// let enc_lv = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_positional_category_vec(category_vec)); +/// let enc_lm = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_category_map(category_map)); /// /// // ["background", "dog", "cat"] /// println!("{:?}", enc_lv.get_categories()); -/// assert_eq!(enc_lv.transform_one::(&"dog"), 
enc_lm.transform_one::(&"dog")) +/// let lv: Vec = enc_lv.transform_one(&"dog").unwrap(); +/// let lm: Vec = enc_lm.transform_one(&"dog").unwrap(); +/// assert_eq!(lv, lm); /// ``` #[derive(Debug, Clone)] -pub struct SeriesOneHotEncoder { - mapper: CategoryMapper, +pub struct SeriesOneHotEncoder { + mapper: CategoryMapper, } -impl SeriesEncoder for SeriesOneHotEncoder { +impl SeriesOneHotEncoder +where + C: Hash + Eq + Clone +{ + /// Create SeriesEncoder form existing mapper + pub fn new(mapper: CategoryMapper) -> Self { + Self {mapper} + } +} + +impl SeriesEncoder for SeriesOneHotEncoder +where + C: Hash + Eq + Clone +{ - fn fit_to_iter(categories: impl Iterator) -> Self { + + fn fit_to_iter(categories: impl Iterator) -> Self { Self {mapper:CategoryMapper::fit_to_iter(categories)} } - /// Build an encoder from a predefined (category -> class number) map - fn from_category_map(category_map: HashMap) -> Self { - Self {mapper: CategoryMapper::from_category_map(category_map)} - } - - /// Build an encoder from a predefined positional category-class num vector - fn from_positional_category_vec(categories: Vec) -> Self { - Self {mapper:CategoryMapper::from_positional_category_vec(categories)} - } - fn num_categories(&self) -> usize { self.mapper.num_categories } - fn get_categories(&self) -> &[CategoryType] { + fn get_categories(&self) -> &[C] { self.mapper.get_categories() } - fn invert_one>(&self, one_hot: V) -> Result + fn invert_one(&self, one_hot: V) -> Result + where + U: RealNumber, + V: BaseVector + { let pos = U::from_f64(1f64).unwrap(); @@ -234,7 +232,11 @@ impl SeriesEncoder for SeriesOneH Err(Failed::transform(&pos_entries[..])) } - fn transform_one>(&self, category: &CategoryType) -> Option { + fn transform_one(&self, category: &C) -> Option + where + U: RealNumber, + V: BaseVector + { match self.mapper.get_num(category) { None => None, Some(&idx) => Some(make_one_hot(idx, self.num_categories())), @@ -262,7 +264,7 @@ mod tests { fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> { let fake_category_pos = vec!["background", "dog", "cat"]; - let enc = SeriesOneHotEncoder::<&str>::from_positional_category_vec(fake_category_pos); + let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_positional_category_vec(fake_category_pos)); enc } @@ -271,7 +273,7 @@ mod tests { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = SeriesOneHotEncoder::<&str>::from_category_map(category_map); + let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_category_map(category_map)); let oh_vec: Vec = match enc.transform_one(&"dog") { None => panic!("Wrong categories"), Some(v) => v, @@ -306,8 +308,8 @@ mod tests { #[test] fn test_many_categorys() { let enc = build_fake_str_enc(); - let res: Vec>> = - enc.transfrom_series(&["dog", "cat", "fish", "background"]); + let cat_it = ["dog", "cat", "fish", "background"].iter().cloned(); + let res: Vec>> = enc.transform_iter(cat_it); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 3cc20fd400682356ac0dfe1dfeb1206172983123 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:39:26 -0800 Subject: [PATCH 39/81] Move all functionality to CategoryMapper (one-hot and ordinal). 
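
After this change CategoryMapper owns the complete category <-> number
mapping, including the one-hot helpers that previously lived on the
encoder types. A rough usage sketch of the consolidated API (all names
as introduced by this patch; it assumes the `Vec<f64>: BaseVector<f64>`
impl that the doc-tests in this file already rely on):

    use smartcore::preprocessing::series_encoder::CategoryMapper;

    let mapper = CategoryMapper::<&str>::from_positional_category_vec(
        vec!["background", "dog", "cat"],
    );
    // category number lookup: "dog" was registered at position 1
    assert_eq!(mapper.get_num(&"dog"), Some(&1));
    // one-hot encoding and its inverse
    let oh: Vec<f64> = mapper.get_one_hot(&"dog").unwrap(); // [0.0, 1.0, 0.0]
    assert_eq!(mapper.invert_one_hot(oh).unwrap(), "dog");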
--- src/preprocessing/series_encoder.rs | 181 +++++++++------------------- 1 file changed, 58 insertions(+), 123 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 6975c0d..cdbae16 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -8,7 +8,48 @@ use crate::math::num::RealNumber; use std::collections::HashMap; use std::hash::Hash; -/// Bi-directional map category <-> label num. +/// ## Bi-directional map category <-> label num. +/// Turn Hashable objects into a one-hot vectors or ordinal values. +/// This struct encodes single class per exmample +/// +/// You can fit_to_iter a category enumeration by passing an iterator of categories. +/// category numbers will be assigned in the order they are encountered +/// +/// Example: +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::series_encoder::CategoryMapper; +/// +/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; +/// let it = fake_categories.iter().map(|&a| a); +/// let enc = CategoryMapper::::fit_to_iter(it); +/// let oh_vec: Vec = enc.get_one_hot(&1).unwrap(); +/// // notice that 1 is actually a zero-th positional category +/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); +/// ``` +/// +/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` +/// +/// +/// ``` +/// use std::collections::HashMap; +/// use smartcore::preprocessing::series_encoder::CategoryMapper; +/// +/// let category_map: HashMap<&str, usize> = +/// vec![("cat", 2), ("background",0), ("dog", 1)] +/// .into_iter() +/// .collect(); +/// let category_vec = vec!["background", "dog", "cat"]; +/// +/// let enc_lv = CategoryMapper::<&str>::from_positional_category_vec(category_vec); +/// let enc_lm = CategoryMapper::<&str>::from_category_map(category_map); +/// +/// // ["background", "dog", "cat"] +/// println!("{:?}", enc_lv.get_categories()); +/// let lv: Vec = enc_lv.get_one_hot(&"dog").unwrap(); +/// let lm: Vec = enc_lm.get_one_hot(&"dog").unwrap(); +/// assert_eq!(lv, lm); +/// ``` #[derive(Debug, Clone)] pub struct CategoryMapper { category_map: HashMap, @@ -16,10 +57,14 @@ pub struct CategoryMapper { num_categories: usize, } -impl<'a, C> CategoryMapper +impl CategoryMapper where - C: 'a + Hash + Eq + Clone + C: Hash + Eq + Clone, { + /// Get the number of categories in the mapper + pub fn num_categories(&self) -> usize { + self.num_categories + } /// Fit an encoder to a lable iterator pub fn fit_to_iter(categories: impl Iterator) -> Self { @@ -82,131 +127,21 @@ where pub fn get_categories(&self) -> &[C] { &self.categories[..] } -} -/// Defines common behavior for series encoders(e.g. 
OneHot, Ordinal) -pub trait SeriesEncoder: + /// Get one-hot encoding of the category + pub fn get_one_hot(&self, category: &C) -> Option where - C: Hash + Eq + Clone + U: RealNumber, + V: BaseVector, { - /// Fit an encoder to a lable iterator - fn fit_to_iter(categories: impl Iterator) -> Self; - - /// Number of categories for categorical variable - fn num_categories(&self) -> usize; - - /// Transform a single category type into a one-hot vector - fn transform_one>(&self, category: &C) -> Option; + match self.get_num(category) { + None => None, + Some(&idx) => Some(make_one_hot::(idx, self.num_categories)), + } +} /// Invert one-hot vector, back to the category - fn invert_one>(&self, one_hot: V) -> Result; - - /// Get categories ordered by encoder's category enumeration - fn get_categories(&self) -> &[C]; - - /// Take an iterator as a series to transform - /// None is returned if unknown category is encountered - fn transform_iter>( - &self, - cat_it: impl Iterator, - ) -> Vec> { - cat_it.map(|l| self.transform_one(&l)).collect() - } -} - -/// Make a one-hot encoded vector from a categorical variable -/// -/// Example: -/// ``` -/// use smartcore::preprocessing::series_encoder::make_one_hot; -/// let one_hot: Vec = make_one_hot(2, 3); -/// assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); -/// ``` -pub fn make_one_hot>( - category_idx: usize, - num_categories: usize, -) -> V { - let pos = T::from_f64(1f64).unwrap(); - let mut z = V::zeros(num_categories); - z.set(category_idx, pos); - z -} - -/// Turn a collection of Hashable objects into a one-hot vectors. -/// This struct encodes single class per exmample -/// -/// You can fit_to_iter a category enumeration by passing an iterator of categories. -/// category numbers will be assigned in the order they are encountered -/// -/// Example: -/// ``` -/// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; -/// -/// let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; -/// let it = fake_categories.iter().map(|&a| a); -/// let enc: SeriesOneHotEncoder:: = SeriesEncoder::fit_to_iter(it); -/// let oh_vec: Vec = enc.transform_one(&1).unwrap(); -/// // notice that 1 is actually a zero-th positional category -/// assert_eq!(oh_vec, vec![1.0, 0.0, 0.0, 0.0, 0.0]); -/// ``` -/// -/// You can also pass a predefined category enumeration such as a hashmap `HashMap` or a vector `Vec` -/// -/// -/// ``` -/// use std::collections::HashMap; -/// use smartcore::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder, CategoryMapper}; -/// -/// let category_map: HashMap<&str, usize> = -/// vec![("cat", 2), ("background",0), ("dog", 1)] -/// .into_iter() -/// .collect(); -/// let category_vec = vec!["background", "dog", "cat"]; -/// -/// let enc_lv = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_positional_category_vec(category_vec)); -/// let enc_lm = SeriesOneHotEncoder::<&str>::new(CategoryMapper::from_category_map(category_map)); -/// -/// // ["background", "dog", "cat"] -/// println!("{:?}", enc_lv.get_categories()); -/// let lv: Vec = enc_lv.transform_one(&"dog").unwrap(); -/// let lm: Vec = enc_lm.transform_one(&"dog").unwrap(); -/// assert_eq!(lv, lm); -/// ``` -#[derive(Debug, Clone)] -pub struct SeriesOneHotEncoder { - mapper: CategoryMapper, -} - -impl SeriesOneHotEncoder -where - C: Hash + Eq + Clone -{ - /// Create SeriesEncoder form existing mapper - pub fn new(mapper: CategoryMapper) -> Self { - Self {mapper} - } -} - -impl SeriesEncoder for 
SeriesOneHotEncoder -where - C: Hash + Eq + Clone -{ - - - fn fit_to_iter(categories: impl Iterator) -> Self { - Self {mapper:CategoryMapper::fit_to_iter(categories)} - } - - fn num_categories(&self) -> usize { - self.mapper.num_categories - } - - fn get_categories(&self) -> &[C] { - self.mapper.get_categories() - } - - fn invert_one(&self, one_hot: V) -> Result + pub fn invert_one_hot(&self, one_hot: V) -> Result where U: RealNumber, V: BaseVector From 374dfeceb906262a2797967cfa02514b5ca2d48d Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:41:25 -0800 Subject: [PATCH 40/81] No more SeriesEncoders. --- src/preprocessing/series_encoder.rs | 104 +++++++++++++++++----------- 1 file changed, 63 insertions(+), 41 deletions(-) diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index cdbae16..e24eca1 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -65,7 +65,7 @@ where pub fn num_categories(&self) -> usize { self.num_categories } - + /// Fit an encoder to a lable iterator pub fn fit_to_iter(categories: impl Iterator) -> Self { let mut category_map: HashMap = HashMap::new(); @@ -85,7 +85,7 @@ where categories: unique_lables, } } - + /// Build an encoder from a predefined (category -> class number) map pub fn from_category_map(category_map: HashMap) -> Self { let mut _unique_cat: Vec<(C, usize)> = @@ -98,7 +98,7 @@ where category_map, } } - + /// Build an encoder from a predefined positional category-class num vector pub fn from_positional_category_vec(categories: Vec) -> Self { let category_map: HashMap = categories @@ -130,54 +130,71 @@ where /// Get one-hot encoding of the category pub fn get_one_hot(&self, category: &C) -> Option - where + where U: RealNumber, V: BaseVector, -{ + { match self.get_num(category) { None => None, Some(&idx) => Some(make_one_hot::(idx, self.num_categories)), + } } -} /// Invert one-hot vector, back to the category pub fn invert_one_hot(&self, one_hot: V) -> Result where U: RealNumber, - V: BaseVector + V: BaseVector, + { + let pos = U::one(); - { - let pos = U::from_f64(1f64).unwrap(); - - let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); - - let s: Vec = oh_it - .enumerate() - .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) - .collect(); - - if s.len() == 1 { - let idx = s[0]; - return Ok(self.mapper.get_cat(idx).clone()); - } - let pos_entries = format!( - "Expected a single positive entry, {} entires found", - s.len() - ); - Err(Failed::transform(&pos_entries[..])) + let oh_it = (0..one_hot.len()).map(|idx| one_hot.get(idx)); + + let s: Vec = oh_it + .enumerate() + .filter_map(|(idx, v)| if v == pos { Some(idx) } else { None }) + .collect(); + + if s.len() == 1 { + let idx = s[0]; + return Ok(self.get_cat(idx).clone()); } + let pos_entries = format!( + "Expected a single positive entry, {} entires found", + s.len() + ); + Err(Failed::transform(&pos_entries[..])) + } - fn transform_one(&self, category: &C) -> Option + /// Get ordinal encoding of the catergory + pub fn get_ordinal(&self, category: &C) -> Option where U: RealNumber, - V: BaseVector { - match self.mapper.get_num(category) { + match self.get_num(category) { None => None, - Some(&idx) => Some(make_one_hot(idx, self.num_categories())), + Some(&idx) => U::from_usize(idx), } } - +} + +/// Make a one-hot encoded vector from a categorical variable +/// +/// Example: +/// ``` +/// use smartcore::preprocessing::series_encoder::make_one_hot; +/// let one_hot: Vec = make_one_hot(2, 3); +/// 
assert_eq!(one_hot, vec![0.0, 0.0, 1.0]); +/// ``` +pub fn make_one_hot(category_idx: usize, num_categories: usize) -> V +where + T: RealNumber, + V: BaseVector, +{ + let pos = T::one(); + let mut z = V::zeros(num_categories); + z.set(category_idx, pos); + z } #[cfg(test)] @@ -188,8 +205,8 @@ mod tests { fn from_categories() { let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; let it = fake_categories.iter().map(|&a| a); - let enc = SeriesOneHotEncoder::::fit_to_iter(it); - let oh_vec: Vec = match enc.transform_one(&1) { + let enc = CategoryMapper::::fit_to_iter(it); + let oh_vec: Vec = match enc.get_one_hot(&1) { None => panic!("Wrong categories"), Some(v) => v, }; @@ -197,19 +214,24 @@ mod tests { assert_eq!(oh_vec, res); } - fn build_fake_str_enc<'a>() -> SeriesOneHotEncoder<&'a str> { + fn build_fake_str_enc<'a>() -> CategoryMapper<&'a str> { let fake_category_pos = vec!["background", "dog", "cat"]; - let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_positional_category_vec(fake_category_pos)); + let enc = CategoryMapper::<&str>::from_positional_category_vec(fake_category_pos); enc } + #[test] + fn ordinal_encoding() { + let enc = build_fake_str_enc(); + assert_eq!(1f64, enc.get_ordinal::(&"dog").unwrap()) + } #[test] fn category_map_and_vec() { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] .into_iter() .collect(); - let enc = SeriesOneHotEncoder::<&str>::new( CategoryMapper::from_category_map(category_map)); - let oh_vec: Vec = match enc.transform_one(&"dog") { + let enc = CategoryMapper::<&str>::from_category_map(category_map); + let oh_vec: Vec = match enc.get_one_hot(&"dog") { None => panic!("Wrong categories"), Some(v) => v, }; @@ -220,7 +242,7 @@ mod tests { #[test] fn positional_categories_vec() { let enc = build_fake_str_enc(); - let oh_vec: Vec = match enc.transform_one(&"dog") { + let oh_vec: Vec = match enc.get_one_hot(&"dog") { None => panic!("Wrong categories"), Some(v) => v, }; @@ -232,9 +254,9 @@ mod tests { fn invert_label_test() { let enc = build_fake_str_enc(); let res: Vec = vec![0.0, 1.0, 0.0]; - let lab = enc.invert_one(res).unwrap(); + let lab = enc.invert_one_hot(res).unwrap(); assert_eq!(lab, "dog"); - if let Err(e) = enc.invert_one(vec![0.0, 0.0, 0.0]) { + if let Err(e) = enc.invert_one_hot(vec![0.0, 0.0, 0.0]) { let pos_entries = format!("Expected a single positive entry, 0 entires found"); assert_eq!(e, Failed::transform(&pos_entries[..])); }; @@ -244,7 +266,7 @@ mod tests { fn test_many_categorys() { let enc = build_fake_str_enc(); let cat_it = ["dog", "cat", "fish", "background"].iter().cloned(); - let res: Vec>> = enc.transform_iter(cat_it); + let res: Vec>> = cat_it.map(|v| enc.get_one_hot(&v)).collect(); let v = vec![ Some(vec![0.0, 1.0, 0.0]), Some(vec![0.0, 0.0, 1.0]), From 828df4e338c0a44a38ad2004f3bae349322d1c94 Mon Sep 17 00:00:00 2001 From: gaxler Date: Wed, 3 Feb 2021 13:42:27 -0800 Subject: [PATCH 41/81] Use CategoryMapper to transform an iterator. No more passing iterator to SeriesEncoders --- src/preprocessing/categorical_encoder.rs | 67 ++++++++++++------------ 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs index 75cbf2b..18e569a 100644 --- a/src/preprocessing/categorical_encoder.rs +++ b/src/preprocessing/categorical_encoder.rs @@ -1,12 +1,12 @@ //! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies //! 
Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents //! -//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [SeriesOneHotEncoder](../series_encoder/struct.SeriesOneHotEncoder.html) +//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html) //! //! ### Usage Example //! ``` //! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEnc, OneHotEncoderParams}; +//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; //! let data = DenseMatrix::from_2d_array(&[ //! &[1.5, 1.0, 1.5, 3.0], //! &[1.5, 2.0, 1.5, 4.0], @@ -15,7 +15,7 @@ //! ]); //! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); //! // Infer number of categories from data and return a reusable encoder -//! let encoder = OneHotEnc::fit(&data, encoder_params).unwrap(); +//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); //! // Transform categorical to one-hot encoded (can transform similar) //! let oh_data = encoder.transform(&data).unwrap(); //! // Produces the following: @@ -30,7 +30,7 @@ use crate::error::Failed; use crate::linalg::Matrix; use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::{SeriesOneHotEncoder, SeriesEncoder}; +use crate::preprocessing::series_encoder::CategoryMapper; /// OneHotEncoder Parameters #[derive(Debug, Clone)] @@ -97,17 +97,18 @@ fn validate_col_is_categorical(data: &[T]) -> bool { /// Encode Categorical variavbles of data matrix to one-hot #[derive(Debug, Clone)] -pub struct OneHotEncoder { - series_encoders: Vec, +pub struct OneHotEncoder { + category_mappers: Vec>, col_idx_categorical: Vec, } -impl> OneHotEncoder { +impl OneHotEncoder { /// Create an encoder instance with categories infered from data matrix - pub fn fit>( - data: &M, - params: OneHotEncoderParams, - ) -> Result, Failed> { + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result + where + T: Categorizable, + M: Matrix, + { match (params.col_idx_categorical, params.infer_categorical) { (None, false) => Err(Failed::fit( "Must pass categorical series ids or infer flag", @@ -126,8 +127,7 @@ impl> OneHotEncoder { // col buffer to avoid allocations let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - let mut res: Vec = - Vec::with_capacity(idxs.len()); + let mut res: Vec> = Vec::with_capacity(idxs.len()); for &idx in &idxs { data.copy_col_as_vec(idx, &mut col_buf); @@ -139,11 +139,11 @@ impl> OneHotEncoder { return Err(Failed::fit(&msg[..])); } let hashable_col = col_buf.iter().map(|v| v.to_category()); - res.push(E::fit_to_iter(hashable_col)); + res.push(CategoryMapper::fit_to_iter(hashable_col)); } Ok(Self { - series_encoders: res, //Self::build_series_encoders::(data, &idxs[..]), + category_mappers: res, col_idx_categorical: idxs, }) } @@ -155,10 +155,14 @@ impl> OneHotEncoder { } /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform>(&self, x: &M) -> Result { + pub fn transform(&self, x: &M) -> Result + where + T: Categorizable, + M: Matrix, + { let (nrows, p) = x.shape(); let additional_params: Vec = self - .series_encoders + .category_mappers .iter() .map(|enc| enc.num_categories()) .collect(); @@ -172,10 +176,10 @@ impl> OneHotEncoder { for 
(pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { let cidx = new_col_idx[old_cidx]; let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); - let sencoder = &self.series_encoders[pidx]; - let oh_series: Vec>> = sencoder.transform_iter(col_iter); + let sencoder = &self.category_mappers[pidx]; + let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); - for (row, oh_vec) in oh_series.iter().enumerate() { + for (row, oh_vec) in oh_series.enumerate() { match oh_vec { None => { // Since we support T types, bad value in a series causes in to be invalid @@ -215,16 +219,11 @@ impl> OneHotEncoder { } } -/// Convinince type for common use -pub type OneHotEnc = OneHotEncoder>; - - #[cfg(test)] mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::preprocessing::series_encoder::SeriesOneHotEncoder; - + use crate::preprocessing::series_encoder::CategoryMapper; #[test] fn adjust_idxs() { @@ -275,8 +274,8 @@ mod tests { let series = vec![3.0, 1.0, 2.0, 1.0]; let hashable_series: Vec = series.iter().map(|v| v.to_category()).collect(); - let enc = SeriesOneHotEncoder::from_positional_category_vec(hashable_series); - let inv = enc.invert_one(vec![0.0, 0.0, 1.0]); + let enc = CategoryMapper::from_positional_category_vec(hashable_series); + let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); let orig_val: f64 = inv.unwrap().into(); assert_eq!(orig_val, 2.0); } @@ -284,11 +283,11 @@ mod tests { fn test_fit() { let (x, _) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); - assert_eq!(oh_enc.series_encoders.len(), 2); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + assert_eq!(oh_enc.category_mappers.len(), 2); let num_cat: Vec = oh_enc - .series_encoders + .category_mappers .iter() .map(|a| a.num_categories()) .collect(); @@ -299,13 +298,13 @@ mod tests { fn matrix_transform_test() { let (x, expected_x) = build_fake_matrix(); let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); let (x, expected_x) = build_cat_first_and_last(); let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEnc::fit(&x, params).unwrap(); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); let nm = oh_enc.transform(&x).unwrap(); assert_eq!(nm, expected_x); } @@ -320,7 +319,7 @@ mod tests { ]); let params = OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEnc::fit(&m, params) { + match OneHotEncoder::fit(&m, params) { Err(_) => { assert!(true); } From af6ec2d402c1d3d6aca1881f7c80301487a94cab Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 9 Feb 2021 22:01:34 -0800 Subject: [PATCH 42/81] rename categorical --- src/preprocessing/categorical.rs | 329 +++++++++++++++++++++++++++++++ src/preprocessing/mod.rs | 2 +- 2 files changed, 330 insertions(+), 1 deletion(-) create mode 100644 src/preprocessing/categorical.rs diff --git a/src/preprocessing/categorical.rs b/src/preprocessing/categorical.rs new file mode 100644 index 0000000..8571e74 --- /dev/null +++ b/src/preprocessing/categorical.rs @@ -0,0 +1,329 @@ +//! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies +//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents +//! +//! 
Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html)
+//!
+//! ### Usage Example
+//! ```
+//! use smartcore::linalg::naive::dense_matrix::DenseMatrix;
+//! use smartcore::preprocessing::categorical::{OneHotEncoder, OneHotEncoderParams};
+//! let data = DenseMatrix::from_2d_array(&[
+//!     &[1.5, 1.0, 1.5, 3.0],
+//!     &[1.5, 2.0, 1.5, 4.0],
+//!     &[1.5, 1.0, 1.5, 5.0],
+//!     &[1.5, 2.0, 1.5, 6.0],
+//! ]);
+//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]);
+//! // Infer number of categories from data and return a reusable encoder
+//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap();
+//! // Transform categorical columns to one-hot encoded (the fitted encoder can transform similar data)
+//! let oh_data = encoder.transform(&data).unwrap();
+//! // Produces the following:
+//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0]
+//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0]
+//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0]
+//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0]
+//! ```
+use std::iter;
+
+use crate::error::Failed;
+use crate::linalg::Matrix;
+
+use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable};
+use crate::preprocessing::series_encoder::CategoryMapper;
+
+/// OneHotEncoder Parameters
+#[derive(Debug, Clone)]
+pub struct OneHotEncoderParams {
+    /// Column numbers that contain categorical variables
+    pub col_idx_categorical: Option<Vec<usize>>,
+    /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables
+    infer_categorical: bool,
+}
+
+impl OneHotEncoderParams {
+    /// Generate parameters from categorical variable column numbers
+    pub fn from_cat_idx(categorical_params: &[usize]) -> Self {
+        Self {
+            col_idx_categorical: Some(categorical_params.to_vec()),
+            infer_categorical: false,
+        }
+    }
+}
+
+/// Calculate the offset of parameter indices due to the introduction of one-hot encoding
+fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec<usize> {
+    // This function uses iterators and returns a vector.
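+    // Worked example (mirrored by the `adjust_idxs` test below): one categorical
+    // column at index 1 with 3 categories shifts every later column right by 2,
+    // so find_new_idxs(3, &[3], &[1]) == vec![0, 1, 4].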
+ // In case we get a huge amount of paramenters this might be a problem + // todo: Change this such that it will return an iterator + + let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); + + // Offset is constant between two categorical values, here we calculate the number of steps + // that remain constant + let repeats = cat_idx.scan(0, |a, v| { + let im = v + 1 - *a; + *a = v; + Some(im) + }); + + // Calculate the offset to parameter idx due to newly intorduced one-hot vectors + let offset_ = cat_sizes.iter().scan(0, |a, &v| { + *a = *a + v - 1; + Some(*a) + }); + let offset = (0..1).chain(offset_); + + let new_param_idxs: Vec = (0..num_params) + .zip( + repeats + .zip(offset) + .map(|(r, o)| iter::repeat(o).take(r)) + .flatten(), + ) + .map(|(idx, ofst)| idx + ofst) + .collect(); + new_param_idxs +} + +fn validate_col_is_categorical(data: &[T]) -> bool { + for v in data { + if !v.is_valid() { + return false; + } + } + true +} + +/// Encode Categorical variavbles of data matrix to one-hot +#[derive(Debug, Clone)] +pub struct OneHotEncoder { + category_mappers: Vec>, + col_idx_categorical: Vec, +} + +impl OneHotEncoder { + /// Create an encoder instance with categories infered from data matrix + pub fn fit(data: &M, params: OneHotEncoderParams) -> Result + where + T: Categorizable, + M: Matrix, + { + match (params.col_idx_categorical, params.infer_categorical) { + (None, false) => Err(Failed::fit( + "Must pass categorical series ids or infer flag", + )), + + (Some(_idxs), true) => Err(Failed::fit( + "Ambigous parameters, got both infer and categroy ids", + )), + + (Some(mut idxs), false) => { + // make sure categories have same order as data columns + idxs.sort_unstable(); + + let (nrows, _) = data.shape(); + + // col buffer to avoid allocations + let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); + + let mut res: Vec> = Vec::with_capacity(idxs.len()); + + for &idx in &idxs { + data.copy_col_as_vec(idx, &mut col_buf); + if !validate_col_is_categorical(&col_buf) { + let msg = format!( + "Column {} of data matrix containts non categorizable (integer) values", + idx + ); + return Err(Failed::fit(&msg[..])); + } + let hashable_col = col_buf.iter().map(|v| v.to_category()); + res.push(CategoryMapper::fit_to_iter(hashable_col)); + } + + Ok(Self { + category_mappers: res, + col_idx_categorical: idxs, + }) + } + + (None, true) => { + todo!("Auto-Inference for Categorical Variables not yet implemented") + } + } + } + + /// Transform categorical variables to one-hot encoded and return a new matrix + pub fn transform(&self, x: &M) -> Result + where + T: Categorizable, + M: Matrix, + { + let (nrows, p) = x.shape(); + let additional_params: Vec = self + .category_mappers + .iter() + .map(|enc| enc.num_categories()) + .collect(); + + // Eac category of size v adds v-1 params + let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); + + let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); + let mut res = M::zeros(nrows, expandws_p); + + for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { + let cidx = new_col_idx[old_cidx]; + let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); + let sencoder = &self.category_mappers[pidx]; + let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); + + for (row, oh_vec) in oh_series.enumerate() { + match oh_vec { + None => { + // Since we support T types, bad value in a series causes in to be invalid + let msg = format!("At 
least one value in column {} doesn't conform to category definition", old_cidx); + return Err(Failed::transform(&msg[..])); + } + Some(v) => { + // copy one hot vectors to their place in the data matrix; + for (col_ofst, &val) in v.iter().enumerate() { + res.set(row, cidx + col_ofst, val); + } + } + } + } + } + + // copy old data in x to their new location while skipping catergorical vars (already treated) + let mut skip_idx_iter = self.col_idx_categorical.iter(); + let mut cur_skip = skip_idx_iter.next(); + + for (old_p, &new_p) in new_col_idx.iter().enumerate() { + // if found treated varible, skip it + if let Some(&v) = cur_skip { + if v == old_p { + cur_skip = skip_idx_iter.next(); + continue; + } + } + + for r in 0..nrows { + let val = x.get(r, old_p); + res.set(r, new_p, val); + } + } + + Ok(res) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::preprocessing::series_encoder::CategoryMapper; + + #[test] + fn adjust_idxs() { + assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); + // [0,1,2] -> [0, 1, 1, 1, 2] + assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); + } + + fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { + let orig = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { + // Categorical first and last + let orig = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 1.5, 3.0], + &[1.5, 2.0, 1.5, 4.0], + &[1.5, 1.0, 1.5, 5.0], + &[1.5, 2.0, 1.5, 6.0], + ]); + + let oh_enc = DenseMatrix::from_2d_array(&[ + &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], + &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], + &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], + ]); + + (orig, oh_enc) + } + + #[test] + fn hash_encode_f64_series() { + let series = vec![3.0, 1.0, 2.0, 1.0]; + let hashable_series: Vec = + series.iter().map(|v| v.to_category()).collect(); + let enc = CategoryMapper::from_positional_category_vec(hashable_series); + let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); + let orig_val: f64 = inv.unwrap().into(); + assert_eq!(orig_val, 2.0); + } + #[test] + fn test_fit() { + let (x, _) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + assert_eq!(oh_enc.category_mappers.len(), 2); + + let num_cat: Vec = oh_enc + .category_mappers + .iter() + .map(|a| a.num_categories()) + .collect(); + assert_eq!(num_cat, vec![2, 4]); + } + + #[test] + fn matrix_transform_test() { + let (x, expected_x) = build_fake_matrix(); + let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); + + let (x, expected_x) = build_cat_first_and_last(); + let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); + let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); + let nm = oh_enc.transform(&x).unwrap(); + assert_eq!(nm, expected_x); + } + + #[test] + fn fail_on_bad_category() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 1.5, 3.0], + &[2.0, 1.5, 4.0], + &[1.0, 1.5, 5.0], + &[2.0, 1.5, 6.0], + ]); + + let params = 
OneHotEncoderParams::from_cat_idx(&[1]); + match OneHotEncoder::fit(&m, params) { + Err(_) => { + assert!(true); + } + _ => assert!(false), + } + } +} diff --git a/src/preprocessing/mod.rs b/src/preprocessing/mod.rs index 4a1abf3..32a0cfa 100644 --- a/src/preprocessing/mod.rs +++ b/src/preprocessing/mod.rs @@ -1,5 +1,5 @@ /// Transform a data matrix by replaceing all categorical variables with their one-hot vector equivalents -pub mod categorical_encoder; +pub mod categorical; mod data_traits; /// Encode a series (column, array) of categorical variables as one-hot vectors pub mod series_encoder; From 6b5bed60928fb2fdd304eca03ff31c0612573164 Mon Sep 17 00:00:00 2001 From: gaxler Date: Tue, 9 Feb 2021 22:01:59 -0800 Subject: [PATCH 43/81] remove old --- src/preprocessing/categorical_encoder.rs | 329 ----------------------- 1 file changed, 329 deletions(-) delete mode 100644 src/preprocessing/categorical_encoder.rs diff --git a/src/preprocessing/categorical_encoder.rs b/src/preprocessing/categorical_encoder.rs deleted file mode 100644 index 18e569a..0000000 --- a/src/preprocessing/categorical_encoder.rs +++ /dev/null @@ -1,329 +0,0 @@ -//! # One-hot Encoding For [RealNumber](../../math/num/trait.RealNumber.html) Matricies -//! Transform a data [Matrix](../../linalg/trait.BaseMatrix.html) by replacing all categorical variables with their one-hot equivalents -//! -//! Internally OneHotEncoder treats every categorical column as a series and transforms it using [CategoryMapper](../series_encoder/struct.CategoryMapper.html) -//! -//! ### Usage Example -//! ``` -//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; -//! use smartcore::preprocessing::categorical_encoder::{OneHotEncoder, OneHotEncoderParams}; -//! let data = DenseMatrix::from_2d_array(&[ -//! &[1.5, 1.0, 1.5, 3.0], -//! &[1.5, 2.0, 1.5, 4.0], -//! &[1.5, 1.0, 1.5, 5.0], -//! &[1.5, 2.0, 1.5, 6.0], -//! ]); -//! let encoder_params = OneHotEncoderParams::from_cat_idx(&[1, 3]); -//! // Infer number of categories from data and return a reusable encoder -//! let encoder = OneHotEncoder::fit(&data, encoder_params).unwrap(); -//! // Transform categorical to one-hot encoded (can transform similar) -//! let oh_data = encoder.transform(&data).unwrap(); -//! // Produces the following: -//! // &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0] -//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0] -//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0] -//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0] -//! ``` -use std::iter; - -use crate::error::Failed; -use crate::linalg::Matrix; - -use crate::preprocessing::data_traits::{CategoricalFloat, Categorizable}; -use crate::preprocessing::series_encoder::CategoryMapper; - -/// OneHotEncoder Parameters -#[derive(Debug, Clone)] -pub struct OneHotEncoderParams { - /// Column number that contain categorical variable - pub col_idx_categorical: Option>, - /// (Currently not implemented) Try and infer which of the matrix columns are categorical variables - infer_categorical: bool, -} - -impl OneHotEncoderParams { - /// Generate parameters from categorical variable column numbers - pub fn from_cat_idx(categorical_params: &[usize]) -> Self { - Self { - col_idx_categorical: Some(categorical_params.to_vec()), - infer_categorical: false, - } - } -} - -/// Calculate the offset to parameters to due introduction of one-hot encoding -fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> Vec { - // This functions uses iterators and returns a vector. 
- // In case we get a huge amount of paramenters this might be a problem - // todo: Change this such that it will return an iterator - - let cat_idx = cat_idxs.iter().copied().chain((num_params..).take(1)); - - // Offset is constant between two categorical values, here we calculate the number of steps - // that remain constant - let repeats = cat_idx.scan(0, |a, v| { - let im = v + 1 - *a; - *a = v; - Some(im) - }); - - // Calculate the offset to parameter idx due to newly intorduced one-hot vectors - let offset_ = cat_sizes.iter().scan(0, |a, &v| { - *a = *a + v - 1; - Some(*a) - }); - let offset = (0..1).chain(offset_); - - let new_param_idxs: Vec = (0..num_params) - .zip( - repeats - .zip(offset) - .map(|(r, o)| iter::repeat(o).take(r)) - .flatten(), - ) - .map(|(idx, ofst)| idx + ofst) - .collect(); - new_param_idxs -} - -fn validate_col_is_categorical(data: &[T]) -> bool { - for v in data { - if !v.is_valid() { - return false; - } - } - true -} - -/// Encode Categorical variavbles of data matrix to one-hot -#[derive(Debug, Clone)] -pub struct OneHotEncoder { - category_mappers: Vec>, - col_idx_categorical: Vec, -} - -impl OneHotEncoder { - /// Create an encoder instance with categories infered from data matrix - pub fn fit(data: &M, params: OneHotEncoderParams) -> Result - where - T: Categorizable, - M: Matrix, - { - match (params.col_idx_categorical, params.infer_categorical) { - (None, false) => Err(Failed::fit( - "Must pass categorical series ids or infer flag", - )), - - (Some(_idxs), true) => Err(Failed::fit( - "Ambigous parameters, got both infer and categroy ids", - )), - - (Some(mut idxs), false) => { - // make sure categories have same order as data columns - idxs.sort_unstable(); - - let (nrows, _) = data.shape(); - - // col buffer to avoid allocations - let mut col_buf: Vec = iter::repeat(T::zero()).take(nrows).collect(); - - let mut res: Vec> = Vec::with_capacity(idxs.len()); - - for &idx in &idxs { - data.copy_col_as_vec(idx, &mut col_buf); - if !validate_col_is_categorical(&col_buf) { - let msg = format!( - "Column {} of data matrix containts non categorizable (integer) values", - idx - ); - return Err(Failed::fit(&msg[..])); - } - let hashable_col = col_buf.iter().map(|v| v.to_category()); - res.push(CategoryMapper::fit_to_iter(hashable_col)); - } - - Ok(Self { - category_mappers: res, - col_idx_categorical: idxs, - }) - } - - (None, true) => { - todo!("Auto-Inference for Categorical Variables not yet implemented") - } - } - } - - /// Transform categorical variables to one-hot encoded and return a new matrix - pub fn transform(&self, x: &M) -> Result - where - T: Categorizable, - M: Matrix, - { - let (nrows, p) = x.shape(); - let additional_params: Vec = self - .category_mappers - .iter() - .map(|enc| enc.num_categories()) - .collect(); - - // Eac category of size v adds v-1 params - let expandws_p: usize = p + additional_params.iter().fold(0, |cs, &v| cs + v - 1); - - let new_col_idx = find_new_idxs(p, &additional_params[..], &self.col_idx_categorical[..]); - let mut res = M::zeros(nrows, expandws_p); - - for (pidx, &old_cidx) in self.col_idx_categorical.iter().enumerate() { - let cidx = new_col_idx[old_cidx]; - let col_iter = (0..nrows).map(|r| x.get(r, old_cidx).to_category()); - let sencoder = &self.category_mappers[pidx]; - let oh_series = col_iter.map(|c| sencoder.get_one_hot::>(&c)); - - for (row, oh_vec) in oh_series.enumerate() { - match oh_vec { - None => { - // Since we support T types, bad value in a series causes in to be invalid - let msg = format!("At 
least one value in column {} doesn't conform to category definition", old_cidx); - return Err(Failed::transform(&msg[..])); - } - Some(v) => { - // copy one hot vectors to their place in the data matrix; - for (col_ofst, &val) in v.iter().enumerate() { - res.set(row, cidx + col_ofst, val); - } - } - } - } - } - - // copy old data in x to their new location while skipping catergorical vars (already treated) - let mut skip_idx_iter = self.col_idx_categorical.iter(); - let mut cur_skip = skip_idx_iter.next(); - - for (old_p, &new_p) in new_col_idx.iter().enumerate() { - // if found treated varible, skip it - if let Some(&v) = cur_skip { - if v == old_p { - cur_skip = skip_idx_iter.next(); - continue; - } - } - - for r in 0..nrows { - let val = x.get(r, old_p); - res.set(r, new_p, val); - } - } - - Ok(res) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::preprocessing::series_encoder::CategoryMapper; - - #[test] - fn adjust_idxs() { - assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); - // [0,1,2] -> [0, 1, 1, 1, 2] - assert_eq!(find_new_idxs(3, &[3], &[1]), vec![0, 1, 4]); - } - - fn build_cat_first_and_last() -> (DenseMatrix, DenseMatrix) { - let orig = DenseMatrix::from_2d_array(&[ - &[1.0, 1.5, 3.0], - &[2.0, 1.5, 4.0], - &[1.0, 1.5, 5.0], - &[2.0, 1.5, 6.0], - ]); - - let oh_enc = DenseMatrix::from_2d_array(&[ - &[1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], - &[0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], - &[1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], - &[0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], - ]); - - (orig, oh_enc) - } - - fn build_fake_matrix() -> (DenseMatrix, DenseMatrix) { - // Categorical first and last - let orig = DenseMatrix::from_2d_array(&[ - &[1.5, 1.0, 1.5, 3.0], - &[1.5, 2.0, 1.5, 4.0], - &[1.5, 1.0, 1.5, 5.0], - &[1.5, 2.0, 1.5, 6.0], - ]); - - let oh_enc = DenseMatrix::from_2d_array(&[ - &[1.5, 1.0, 0.0, 1.5, 1.0, 0.0, 0.0, 0.0], - &[1.5, 0.0, 1.0, 1.5, 0.0, 1.0, 0.0, 0.0], - &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0], - &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0], - ]); - - (orig, oh_enc) - } - - #[test] - fn hash_encode_f64_series() { - let series = vec![3.0, 1.0, 2.0, 1.0]; - let hashable_series: Vec = - series.iter().map(|v| v.to_category()).collect(); - let enc = CategoryMapper::from_positional_category_vec(hashable_series); - let inv = enc.invert_one_hot(vec![0.0, 0.0, 1.0]); - let orig_val: f64 = inv.unwrap().into(); - assert_eq!(orig_val, 2.0); - } - #[test] - fn test_fit() { - let (x, _) = build_fake_matrix(); - let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - assert_eq!(oh_enc.category_mappers.len(), 2); - - let num_cat: Vec = oh_enc - .category_mappers - .iter() - .map(|a| a.num_categories()) - .collect(); - assert_eq!(num_cat, vec![2, 4]); - } - - #[test] - fn matrix_transform_test() { - let (x, expected_x) = build_fake_matrix(); - let params = OneHotEncoderParams::from_cat_idx(&[1, 3]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); - - let (x, expected_x) = build_cat_first_and_last(); - let params = OneHotEncoderParams::from_cat_idx(&[0, 2]); - let oh_enc = OneHotEncoder::fit(&x, params).unwrap(); - let nm = oh_enc.transform(&x).unwrap(); - assert_eq!(nm, expected_x); - } - - #[test] - fn fail_on_bad_category() { - let m = DenseMatrix::from_2d_array(&[ - &[1.0, 1.5, 3.0], - &[2.0, 1.5, 4.0], - &[1.0, 1.5, 5.0], - &[2.0, 1.5, 6.0], - ]); - - let params = 
OneHotEncoderParams::from_cat_idx(&[1]); - match OneHotEncoder::fit(&m, params) { - Err(_) => { - assert!(true); - } - _ => assert!(false), - } - } -} From 4af69878e01ab2abc88433573ce52d4473b8c871 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 16 Feb 2021 18:19:14 -0400 Subject: [PATCH 44/81] fix: Fix new clippy warnings (#79) * Fix new clippy warnings * Allow clippy::suspicious-operation-groupings --- src/lib.rs | 3 ++- src/linalg/mod.rs | 3 ++- src/linalg/stats.rs | 2 +- src/linear/lasso_optimizer.rs | 2 +- src/optimization/first_order/lbfgs.rs | 1 + 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6e6205f..c7c99c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,8 @@ #![allow( clippy::type_complexity, clippy::too_many_arguments, - clippy::many_single_char_names + clippy::many_single_char_names, + clippy::unnecessary_wraps )] #![warn(missing_docs)] #![warn(missing_doc_code_examples)] diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 264815b..cadbc3a 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -1,3 +1,4 @@ +#![allow(clippy::wrong_self_convention)] //! # Linear Algebra and Matrix Decomposition //! //! Most machine learning algorithms in SmartCore depend on linear algebra and matrix decomposition methods from this module. @@ -265,7 +266,7 @@ pub trait BaseVector: Clone + Debug { sum += xi * xi; } mu /= div; - sum / div - mu * mu + sum / div - mu.powi(2) } /// Computes the standard deviation. fn std(&self) -> T { diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index 45a17af..5a1dd38 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -61,7 +61,7 @@ pub trait MatrixStats: BaseMatrix { sum += a * a; } mu /= div; - *x_i = sum / div - mu * mu; + *x_i = sum / div - mu.powi(2); } x diff --git a/src/linear/lasso_optimizer.rs b/src/linear/lasso_optimizer.rs index 4f5011f..c4340fc 100644 --- a/src/linear/lasso_optimizer.rs +++ b/src/linear/lasso_optimizer.rs @@ -138,7 +138,7 @@ impl> InteriorPointOptimizer { for i in 0..p { self.prb[i] = T::two() + self.d1[i]; - self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i] * self.d2[i]; + self.prs[i] = self.prb[i] * self.d1[i] - self.d2[i].powi(2); } let normg = grad.norm2(); diff --git a/src/optimization/first_order/lbfgs.rs b/src/optimization/first_order/lbfgs.rs index 5dedfe6..322df03 100644 --- a/src/optimization/first_order/lbfgs.rs +++ b/src/optimization/first_order/lbfgs.rs @@ -1,3 +1,4 @@ +#![allow(clippy::suspicious_operation_groupings)] use std::default::Default; use std::fmt::Debug; From a30802ec438cc9da1b439e9897f0de4fa884a5ec Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 16 Feb 2021 22:20:02 -0400 Subject: [PATCH 45/81] fix: Change to compile for wasm32-unknown-unknown target (#80) --- src/dataset/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index da790b4..31a12cf 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -83,7 +83,7 @@ pub(crate) fn deserialize_data( ) -> Result<(Vec, Vec, usize, usize), io::Error> { // read the same file back into a Vec of bytes let (num_samples, num_features) = { - let mut buffer = [0u8; 8]; + let mut buffer = [0u8; if cfg!(target_arch = "wasm32") { 4 } else { 8 }]; buffer.copy_from_slice(&bytes[0..8]); let num_features = usize::from_le_bytes(buffer); buffer.copy_from_slice(&bytes[8..16]); From 4fb2625a337646fc01dd904d817f2622572c54bf Mon Sep 17 00:00:00 2001 From: Chris McComb Date: Wed, 17 Feb 2021 21:22:06 -0500 Subject: [PATCH 46/81] 
Implemented make_moons generator per https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/datasets/_samples_generator.py#L683 --- src/dataset/generator.rs | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index 28a2224..4d454af 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -88,6 +88,44 @@ pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset Dataset { + + let num_samples_out = num_samples / 2; + let num_samples_in = num_samples - num_samples_out; + + let linspace_out = linspace(0.0, std::f32::consts::PI, num_samples_out); + let linspace_in = linspace(0.0, std::f32::consts::PI, num_samples_in); + + let noise = Normal::new(0.0, noise).unwrap(); + let mut rng = rand::thread_rng(); + + let mut x: Vec = Vec::with_capacity(num_samples * 2); + let mut y: Vec = Vec::with_capacity(num_samples); + + for v in linspace_out { + x.push(v.cos() + noise.sample(&mut rng)); + x.push(v.sin() + noise.sample(&mut rng)); + y.push(0.0); + } + + for v in linspace_in { + x.push(1.0 - v.cos() + noise.sample(&mut rng)); + x.push(1.0 - v.sin() + noise.sample(&mut rng) - 0.5); + y.push(1.0); + } + + Dataset { + data: x, + target: y, + num_samples, + num_features: 2, + feature_names: (0..2).map(|n| n.to_string()).collect(), + target_names: vec!["label".to_string()], + description: "Two interleaving half circles in 2d".to_string(), + } +} + fn linspace(start: f32, stop: f32, num: usize) -> Vec { let div = num as f32; let delta = stop - start; @@ -123,4 +161,17 @@ mod tests { assert_eq!(dataset.num_features, 2); assert_eq!(dataset.num_samples, 10); } + + #[test] + fn test_make_moons() { + let dataset = make_moons(100, 0.05); + println!("{:?}", dataset.data); + assert_eq!( + dataset.data.len(), + dataset.num_features * dataset.num_samples + ); + assert_eq!(dataset.target.len(), dataset.num_samples); + assert_eq!(dataset.num_features, 2); + assert_eq!(dataset.num_samples, 10); + } } From 483a21bec06269e277eded5a1525d4ce7b3a2648 Mon Sep 17 00:00:00 2001 From: Chris McComb Date: Wed, 17 Feb 2021 21:22:41 -0500 Subject: [PATCH 47/81] Oops, test was failing due to typo. Fixed now. --- src/dataset/generator.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index 4d454af..4367308 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -164,8 +164,7 @@ mod tests { #[test] fn test_make_moons() { - let dataset = make_moons(100, 0.05); - println!("{:?}", dataset.data); + let dataset = make_moons(10, 0.05); assert_eq!( dataset.data.len(), dataset.num_features * dataset.num_samples From fed11f005c4c5acf6194dbca8c517895208b7fa4 Mon Sep 17 00:00:00 2001 From: Chris McComb Date: Wed, 17 Feb 2021 21:29:51 -0500 Subject: [PATCH 48/81] Fixed formatting to pass cargo format check. 
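Taken together, patches 46-48 leave the new `make_moons` generator returning the same two-feature `Dataset` layout as the existing `make_circles`. A minimal usage sketch (the module path is assumed to match the crate's other dataset generators):

    use smartcore::dataset::generator::make_moons;

    let ds = make_moons(200, 0.05);
    assert_eq!(ds.num_features, 2);
    assert_eq!(ds.data.len(), ds.num_samples * ds.num_features);
    // Samples are stored row by row: point i is (data[2 * i], data[2 * i + 1]),
    // and target[i] is 0.0 for the outer half-circle, 1.0 for the inner one.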
--- src/dataset/generator.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index 4367308..39299a5 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -90,7 +90,6 @@ pub fn make_circles(num_samples: usize, factor: f32, noise: f32) -> Dataset Dataset { - let num_samples_out = num_samples / 2; let num_samples_in = num_samples - num_samples_out; From 0e9c517b1adc75aaa75b9ac0389a17d2f10739b3 Mon Sep 17 00:00:00 2001 From: zhangyiqun01 Date: Thu, 25 Feb 2021 15:59:09 +0800 Subject: [PATCH 49/81] rename svm svr to svc in tests and docs --- src/svm/svc.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 4fd70df..acc17b6 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -58,7 +58,7 @@ //! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0., //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; //! -//! let svr = SVC::fit(&x, &y, +//! let svc = SVC::fit(&x, &y, //! Kernels::linear(), //! SVCParameters { //! epoch: 2, @@ -66,7 +66,7 @@ //! tol: 1e-3, //! }).unwrap(); //! -//! let y_hat = svr.predict(&x).unwrap(); +//! let y_hat = svc.predict(&x).unwrap(); //! ``` //! //! ## References: @@ -802,11 +802,11 @@ mod tests { -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let svr = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); + let svc = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); - let deserialized_svr: SVC, LinearKernel> = - serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); + let deserialized_svc: SVC, LinearKernel> = + serde_json::from_str(&serde_json::to_string(&svc).unwrap()).unwrap(); - assert_eq!(svr, deserialized_svr); + assert_eq!(svc, deserialized_svc); } } From 1b42f8a396f52d1df77c8d6773c78afc2951a827 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Thu, 25 Feb 2021 15:44:34 -0400 Subject: [PATCH 50/81] feat: Add getters for naive bayes structs (#74) * feat: Add getters for GaussianNB * Add classes getter to BernoulliNB Add classes getter to CategoricalNB Add classes getter to MultinomialNB * Add feature_log_prob getter to MultinomialNB * Add class_count to NB structs * Add n_features getter for NB * Add feature_count to MultinomialNB and BernoulliNB * Add n_categories to CategoricalNB * Implement feature_log_prob and category_count getter for CategoricalNB * Implement feature_log_prob for BernoulliNB --- src/naive_bayes/bernoulli.rs | 144 +++++++++++++++++++++++++++---- src/naive_bayes/categorical.rs | 153 ++++++++++++++++++++++++++++----- src/naive_bayes/gaussian.rs | 78 ++++++++++++----- src/naive_bayes/multinomial.rs | 122 ++++++++++++++++++++++---- 4 files changed, 420 insertions(+), 77 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 6a7d0b4..286a4a5 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -47,12 +47,44 @@ use serde::{Deserialize, Serialize}; /// Naive Bayes classifier for Bearnoulli features #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Debug, PartialEq)] +#[derive(Debug)] struct BernoulliNBDistribution { /// class labels known to the classifier class_labels: Vec, + /// number of training samples observed in each class + class_count: Vec, + /// probability of each class class_priors: Vec, - feature_prob: Vec>, + /// Number of samples encountered for each (class, feature) + feature_count: Vec>, + /// probability of features per class + 
feature_log_prob: Vec>, + /// Number of features of each sample + n_features: usize, +} + +impl PartialEq for BernoulliNBDistribution { + fn eq(&self, other: &Self) -> bool { + if self.class_labels == other.class_labels + && self.class_count == other.class_count + && self.class_priors == other.class_priors + && self.feature_count == other.feature_count + && self.n_features == other.n_features + { + for (a, b) in self + .feature_log_prob + .iter() + .zip(other.feature_log_prob.iter()) + { + if !a.approximate_eq(b, T::epsilon()) { + return false; + } + } + true + } else { + false + } + } } impl> NBDistribution for BernoulliNBDistribution { @@ -65,9 +97,9 @@ impl> NBDistribution for BernoulliNBDistributi for feature in 0..j.len() { let value = j.get(feature); if value == T::one() { - likelihood += self.feature_prob[class_index][feature].ln(); + likelihood += self.feature_log_prob[class_index][feature]; } else { - likelihood += (T::one() - self.feature_prob[class_index][feature]).ln(); + likelihood += (T::one() - self.feature_log_prob[class_index][feature].exp()).ln(); } } likelihood @@ -157,10 +189,10 @@ impl BernoulliNBDistribution { let y = y.to_vec(); let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); - let mut class_count = vec![T::zero(); class_labels.len()]; + let mut class_count = vec![0_usize; class_labels.len()]; for class_index in indices.iter() { - class_count[*class_index] += T::one(); + class_count[*class_index] += 1; } let class_priors = if let Some(class_priors) = priors { @@ -173,25 +205,35 @@ impl BernoulliNBDistribution { } else { class_count .iter() - .map(|&c| c / T::from(n_samples).unwrap()) + .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap()) .collect() }; - let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + let mut feature_in_class_counter = vec![vec![0_usize; n_features]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices) { for (idx, row_i) in row.iter().enumerate().take(n_features) { - feature_in_class_counter[class_index][idx] += *row_i; + feature_in_class_counter[class_index][idx] += + row_i.to_usize().ok_or_else(|| { + Failed::fit(&format!( + "Elements of the matrix should be 1.0 or 0.0 |found|=[{}]", + row_i + )) + })?; } } - let feature_prob = feature_in_class_counter + let feature_log_prob = feature_in_class_counter .iter() .enumerate() .map(|(class_index, feature_count)| { feature_count .iter() - .map(|&count| (count + alpha) / (class_count[class_index] + alpha * T::two())) + .map(|&count| { + ((T::from(count).unwrap() + alpha) + / (T::from(class_count[class_index]).unwrap() + alpha * T::two())) + .ln() + }) .collect() }) .collect(); @@ -199,7 +241,10 @@ impl BernoulliNBDistribution { Ok(Self { class_labels, class_priors, - feature_prob, + class_count, + feature_count: feature_in_class_counter, + feature_log_prob, + n_features, }) } } @@ -266,6 +311,34 @@ impl> BernoulliNB { self.inner.predict(x) } } + + /// Class labels known to the classifier. + /// Returns a vector of size n_classes. + pub fn classes(&self) -> &Vec { + &self.inner.distribution.class_labels + } + + /// Number of training samples observed in each class. + /// Returns a vector of size n_classes. 
+ pub fn class_count(&self) -> &Vec { + &self.inner.distribution.class_count + } + + /// Number of features of each sample + pub fn n_features(&self) -> usize { + self.inner.distribution.n_features + } + + /// Number of samples encountered for each (class, feature) + /// Returns a 2d vector of shape (n_classes, n_features) + pub fn feature_count(&self) -> &Vec> { + &self.inner.distribution.feature_count + } + + /// Empirical log probability of features given a class + pub fn feature_log_prob(&self) -> &Vec> { + &self.inner.distribution.feature_log_prob + } } #[cfg(test)] @@ -296,10 +369,24 @@ mod tests { assert_eq!(bnb.inner.distribution.class_priors, &[0.75, 0.25]); assert_eq!( - bnb.inner.distribution.feature_prob, + bnb.feature_log_prob(), &[ - &[0.4, 0.8, 0.2, 0.4, 0.4, 0.2], - &[1. / 3.0, 2. / 3.0, 2. / 3.0, 1. / 3.0, 1. / 3.0, 2. / 3.0] + &[ + -0.916290731874155, + -0.2231435513142097, + -1.6094379124341003, + -0.916290731874155, + -0.916290731874155, + -1.6094379124341003 + ], + &[ + -1.0986122886681098, + -0.40546510810816444, + -0.40546510810816444, + -1.0986122886681098, + -1.0986122886681098, + -0.40546510810816444 + ] ] ); @@ -335,13 +422,36 @@ mod tests { let y_hat = bnb.predict(&x).unwrap(); + assert_eq!(bnb.classes(), &[0., 1., 2.]); + assert_eq!(bnb.class_count(), &[7, 3, 5]); + assert_eq!(bnb.n_features(), 10); + assert_eq!( + bnb.feature_count(), + &[ + &[5, 6, 6, 7, 6, 4, 6, 7, 7, 7], + &[3, 3, 3, 1, 3, 2, 3, 2, 2, 3], + &[4, 4, 3, 4, 5, 2, 4, 5, 3, 4] + ] + ); + assert!(bnb .inner .distribution .class_priors .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); - assert!(bnb.inner.distribution.feature_prob[1].approximate_eq( - &vec!(0.8, 0.8, 0.8, 0.4, 0.8, 0.6, 0.8, 0.6, 0.6, 0.8), + assert!(bnb.feature_log_prob()[1].approximate_eq( + &vec![ + -0.22314355, + -0.22314355, + -0.22314355, + -0.91629073, + -0.22314355, + -0.51082562, + -0.22314355, + -0.51082562, + -0.51082562, + -0.22314355 + ], 1e-1 )); assert!(y_hat.approximate_eq( diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index 2161528..e308a01 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -43,14 +43,31 @@ use serde::{Deserialize, Serialize}; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] struct CategoricalNBDistribution { + /// number of training samples observed in each class + class_count: Vec, + /// class labels known to the classifier class_labels: Vec, + /// probability of each class class_priors: Vec, coefficients: Vec>>, + /// Number of features of each sample + n_features: usize, + /// Number of categories for each feature + n_categories: Vec, + /// Holds arrays of shape (n_classes, n_categories of respective feature) + /// for each feature. Each array provides the number of samples + /// encountered for each class and category of the specific feature. 
+ category_count: Vec>>, } impl PartialEq for CategoricalNBDistribution { fn eq(&self, other: &Self) -> bool { - if self.class_labels == other.class_labels && self.class_priors == other.class_priors { + if self.class_labels == other.class_labels + && self.class_priors == other.class_priors + && self.n_features == other.n_features + && self.n_categories == other.n_categories + && self.class_count == other.class_count + { if self.coefficients.len() != other.coefficients.len() { return false; } @@ -90,8 +107,8 @@ impl> NBDistribution for CategoricalNBDistribu let mut likelihood = T::zero(); for feature in 0..j.len() { let value = j.get(feature).floor().to_usize().unwrap(); - if self.coefficients[class_index][feature].len() > value { - likelihood += self.coefficients[class_index][feature][value]; + if self.coefficients[feature][class_index].len() > value { + likelihood += self.coefficients[feature][class_index][value]; } else { return T::zero(); } @@ -149,12 +166,12 @@ impl CategoricalNBDistribution { let class_labels: Vec = (0..*y_max + 1) .map(|label| T::from(label).unwrap()) .collect(); - let mut classes_count: Vec = vec![T::zero(); class_labels.len()]; + let mut class_count = vec![0_usize; class_labels.len()]; for elem in y.iter() { - classes_count[*elem] += T::one(); + class_count[*elem] += 1; } - let mut feature_categories: Vec> = Vec::with_capacity(n_features); + let mut n_categories: Vec = Vec::with_capacity(n_features); for feature in 0..n_features { let feature_max = x .get_col_as_vec(feature) @@ -167,18 +184,15 @@ impl CategoricalNBDistribution { feature )) })?; - let feature_types = (0..feature_max + 1) - .map(|feat| T::from(feat).unwrap()) - .collect(); - feature_categories.push(feature_types); + n_categories.push(feature_max + 1); } let mut coefficients: Vec>> = Vec::with_capacity(class_labels.len()); - for (label, label_count) in class_labels.iter().zip(classes_count.iter()) { + let mut category_count: Vec>> = Vec::with_capacity(class_labels.len()); + for (feature_index, &n_categories_i) in n_categories.iter().enumerate().take(n_features) { let mut coef_i: Vec> = Vec::with_capacity(n_features); - for (feature_index, feature_options) in - feature_categories.iter().enumerate().take(n_features) - { + let mut category_count_i: Vec> = Vec::with_capacity(n_features); + for (label, &label_count) in class_labels.iter().zip(class_count.iter()) { let col = x .get_col_as_vec(feature_index) .iter() @@ -186,33 +200,41 @@ impl CategoricalNBDistribution { .filter(|(i, _j)| T::from(y[*i]).unwrap() == *label) .map(|(_, j)| *j) .collect::>(); - let mut feat_count: Vec = vec![T::zero(); feature_options.len()]; + let mut feat_count: Vec = vec![0_usize; n_categories_i]; for row in col.iter() { let index = row.floor().to_usize().unwrap(); - feat_count[index] += T::one(); + feat_count[index] += 1; } + let coef_i_j = feat_count .iter() .map(|c| { - ((*c + alpha) - / (*label_count + T::from(feature_options.len()).unwrap() * alpha)) + ((T::from(*c).unwrap() + alpha) + / (T::from(label_count).unwrap() + + T::from(n_categories_i).unwrap() * alpha)) .ln() }) .collect::>(); + category_count_i.push(feat_count); coef_i.push(coef_i_j); } + category_count.push(category_count_i); coefficients.push(coef_i); } - let class_priors = classes_count - .into_iter() - .map(|count| count / T::from(n_samples).unwrap()) + let class_priors = class_count + .iter() + .map(|&count| T::from(count).unwrap() / T::from(n_samples).unwrap()) .collect::>(); Ok(Self { + class_count, class_labels, class_priors, coefficients, + 
n_categories, + n_features, + category_count, }) } } @@ -287,6 +309,41 @@ impl> CategoricalNB { pub fn predict(&self, x: &M) -> Result { self.inner.predict(x) } + + /// Class labels known to the classifier. + /// Returns a vector of size n_classes. + pub fn classes(&self) -> &Vec { + &self.inner.distribution.class_labels + } + + /// Number of training samples observed in each class. + /// Returns a vector of size n_classes. + pub fn class_count(&self) -> &Vec { + &self.inner.distribution.class_count + } + + /// Number of features of each sample + pub fn n_features(&self) -> usize { + self.inner.distribution.n_features + } + + /// Number of features of each sample + pub fn n_categories(&self) -> &Vec { + &self.inner.distribution.n_categories + } + + /// Holds arrays of shape (n_classes, n_categories of respective feature) + /// for each feature. Each array provides the number of samples + /// encountered for each class and category of the specific feature. + pub fn category_count(&self) -> &Vec>> { + &self.inner.distribution.category_count + } + /// Holds arrays of shape (n_classes, n_categories of respective feature) + /// for each feature. Each array provides the empirical log probability + /// of categories given the respective feature and class, ``P(x_i|y)``. + pub fn feature_log_prob(&self) -> &Vec>> { + &self.inner.distribution.coefficients + } } #[cfg(test)] @@ -315,6 +372,60 @@ mod tests { let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); + + // checking parity with scikit + assert_eq!(cnb.classes(), &[0., 1.]); + assert_eq!(cnb.class_count(), &[5, 9]); + assert_eq!(cnb.n_features(), 4); + assert_eq!(cnb.n_categories(), &[3, 3, 2, 2]); + assert_eq!( + cnb.category_count(), + &vec![ + vec![vec![3, 0, 2], vec![2, 4, 3]], + vec![vec![1, 2, 2], vec![3, 4, 2]], + vec![vec![1, 4], vec![6, 3]], + vec![vec![2, 3], vec![6, 3]] + ] + ); + + assert_eq!( + cnb.feature_log_prob(), + &vec![ + vec![ + vec![ + -0.6931471805599453, + -2.0794415416798357, + -0.9808292530117262 + ], + vec![ + -1.3862943611198906, + -0.8754687373538999, + -1.0986122886681098 + ] + ], + vec![ + vec![ + -1.3862943611198906, + -0.9808292530117262, + -0.9808292530117262 + ], + vec![ + -1.0986122886681098, + -0.8754687373538999, + -1.3862943611198906 + ] + ], + vec![ + vec![-1.252762968495368, -0.3364722366212129], + vec![-0.45198512374305727, -1.0116009116784799] + ], + vec![ + vec![-0.8472978603872037, -0.5596157879354228], + vec![-0.45198512374305727, -1.0116009116784799] + ] + ] + ); + let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]); let y_hat = cnb.predict(&x_test).unwrap(); assert_eq!(y_hat, vec![0., 1.]); diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 28c4785..00c7962 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -39,10 +39,12 @@ use serde::{Deserialize, Serialize}; struct GaussianNBDistribution { /// class labels known to the classifier class_labels: Vec, + /// number of training samples observed in each class + class_count: Vec, /// probability of each class. 
class_priors: Vec, /// variance of each feature per class - sigma: Vec>, + var: Vec>, /// mean of each feature per class theta: Vec>, } @@ -57,18 +59,14 @@ impl> NBDistribution for GaussianNBDistributio } fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T { - if class_index < self.class_labels.len() { - let mut likelihood = T::zero(); - for feature in 0..j.len() { - let value = j.get(feature); - let mean = self.theta[class_index][feature]; - let variance = self.sigma[class_index][feature]; - likelihood += self.calculate_log_probability(value, mean, variance); - } - likelihood - } else { - T::zero() + let mut likelihood = T::zero(); + for feature in 0..j.len() { + let value = j.get(feature); + let mean = self.theta[class_index][feature]; + let variance = self.var[class_index][feature]; + likelihood += self.calculate_log_probability(value, mean, variance); } + likelihood } fn classes(&self) -> &Vec { @@ -121,12 +119,12 @@ impl GaussianNBDistribution { let y = y.to_vec(); let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); - let mut class_count = vec![T::zero(); class_labels.len()]; + let mut class_count = vec![0_usize; class_labels.len()]; let mut subdataset: Vec>> = vec![vec![]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices.iter()) { - class_count[*class_index] += T::one(); + class_count[*class_index] += 1; subdataset[*class_index].push(row); } @@ -139,8 +137,8 @@ impl GaussianNBDistribution { class_priors } else { class_count - .into_iter() - .map(|c| c / T::from(n_samples).unwrap()) + .iter() + .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap()) .collect() }; @@ -157,15 +155,16 @@ impl GaussianNBDistribution { }) .collect(); - let (sigma, theta): (Vec>, Vec>) = subdataset + let (var, theta): (Vec>, Vec>) = subdataset .iter() .map(|data| (data.var(0), data.mean(0))) .unzip(); Ok(Self { class_labels, + class_count, class_priors, - sigma, + var, theta, }) } @@ -223,6 +222,36 @@ impl> GaussianNB { pub fn predict(&self, x: &M) -> Result { self.inner.predict(x) } + + /// Class labels known to the classifier. + /// Returns a vector of size n_classes. + pub fn classes(&self) -> &Vec { + &self.inner.distribution.class_labels + } + + /// Number of training samples observed in each class. + /// Returns a vector of size n_classes. + pub fn class_count(&self) -> &Vec { + &self.inner.distribution.class_count + } + + /// Probability of each class + /// Returns a vector of size n_classes. + pub fn class_priors(&self) -> &Vec { + &self.inner.distribution.class_priors + } + + /// Mean of each feature per class + /// Returns a 2d vector of shape (n_classes, n_features). + pub fn theta(&self) -> &Vec> { + &self.inner.distribution.theta + } + + /// Variance of each feature per class + /// Returns a 2d vector of shape (n_classes, n_features). 
+ pub fn var(&self) -> &Vec> { + &self.inner.distribution.var + } } #[cfg(test)] @@ -245,18 +274,23 @@ mod tests { let gnb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); let y_hat = gnb.predict(&x).unwrap(); assert_eq!(y_hat, y); + + assert_eq!(gnb.classes(), &[1., 2.]); + + assert_eq!(gnb.class_count(), &[3, 3]); + assert_eq!( - gnb.inner.distribution.sigma, + gnb.var(), &[ &[0.666666666666667, 0.22222222222222232], &[0.666666666666667, 0.22222222222222232] ] ); - assert_eq!(gnb.inner.distribution.class_priors, &[0.5, 0.5]); + assert_eq!(gnb.class_priors(), &[0.5, 0.5]); assert_eq!( - gnb.inner.distribution.theta, + gnb.theta(), &[&[-2., -1.3333333333333333], &[2., 1.3333333333333333]] ); } @@ -277,7 +311,7 @@ mod tests { let parameters = GaussianNBParameters::default().with_priors(priors.clone()); let gnb = GaussianNB::fit(&x, &y, parameters).unwrap(); - assert_eq!(gnb.inner.distribution.class_priors, priors); + assert_eq!(gnb.class_priors(), &priors); } #[test] diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 06ee071..87e0ddd 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -51,8 +51,16 @@ use serde::{Deserialize, Serialize}; struct MultinomialNBDistribution { /// class labels known to the classifier class_labels: Vec, + /// number of training samples observed in each class + class_count: Vec, + /// probability of each class class_priors: Vec, - feature_prob: Vec>, + /// Empirical log probability of features given a class + feature_log_prob: Vec>, + /// Number of samples encountered for each (class, feature) + feature_count: Vec>, + /// Number of features of each sample + n_features: usize, } impl> NBDistribution for MultinomialNBDistribution { @@ -64,7 +72,7 @@ impl> NBDistribution for MultinomialNBDistribu let mut likelihood = T::zero(); for feature in 0..j.len() { let value = j.get(feature); - likelihood += value * self.feature_prob[class_index][feature].ln(); + likelihood += value * self.feature_log_prob[class_index][feature]; } likelihood } @@ -144,10 +152,10 @@ impl MultinomialNBDistribution { let y = y.to_vec(); let (class_labels, indices) = as RealNumberVector>::unique_with_indices(&y); - let mut class_count = vec![T::zero(); class_labels.len()]; + let mut class_count = vec![0_usize; class_labels.len()]; for class_index in indices.iter() { - class_count[*class_index] += T::one(); + class_count[*class_index] += 1; } let class_priors = if let Some(class_priors) = priors { @@ -160,33 +168,46 @@ impl MultinomialNBDistribution { } else { class_count .iter() - .map(|&c| c / T::from(n_samples).unwrap()) + .map(|&c| T::from(c).unwrap() / T::from(n_samples).unwrap()) .collect() }; - let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()]; + let mut feature_in_class_counter = vec![vec![0_usize; n_features]; class_labels.len()]; for (row, class_index) in row_iter(x).zip(indices) { for (idx, row_i) in row.iter().enumerate().take(n_features) { - feature_in_class_counter[class_index][idx] += *row_i; + feature_in_class_counter[class_index][idx] += + row_i.to_usize().ok_or_else(|| { + Failed::fit(&format!( + "Elements of the matrix should be convertible to usize |found|=[{}]", + row_i + )) + })?; } } - let feature_prob = feature_in_class_counter + let feature_log_prob = feature_in_class_counter .iter() .map(|feature_count| { - let n_c = feature_count.sum(); + let n_c: usize = feature_count.iter().sum(); feature_count .iter() - .map(|&count| (count + alpha) / (n_c + alpha * 
T::from(n_features).unwrap())) + .map(|&count| { + ((T::from(count).unwrap() + alpha) + / (T::from(n_c).unwrap() + alpha * T::from(n_features).unwrap())) + .ln() + }) .collect() }) .collect(); Ok(Self { + class_count, class_labels, class_priors, - feature_prob, + feature_log_prob, + feature_count: feature_in_class_counter, + n_features, }) } } @@ -240,6 +261,35 @@ impl> MultinomialNB { pub fn predict(&self, x: &M) -> Result { self.inner.predict(x) } + + /// Class labels known to the classifier. + /// Returns a vector of size n_classes. + pub fn classes(&self) -> &Vec { + &self.inner.distribution.class_labels + } + + /// Number of training samples observed in each class. + /// Returns a vector of size n_classes. + pub fn class_count(&self) -> &Vec { + &self.inner.distribution.class_count + } + + /// Empirical log probability of features given a class, P(x_i|y). + /// Returns a 2d vector of shape (n_classes, n_features) + pub fn feature_log_prob(&self) -> &Vec> { + &self.inner.distribution.feature_log_prob + } + + /// Number of features of each sample + pub fn n_features(&self) -> usize { + self.inner.distribution.n_features + } + + /// Number of samples encountered for each (class, feature) + /// Returns a 2d vector of shape (n_classes, n_features) + pub fn feature_count(&self) -> &Vec> { + &self.inner.distribution.feature_count + } } #[cfg(test)] @@ -268,12 +318,29 @@ mod tests { let y = vec![0., 0., 0., 1.]; let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + assert_eq!(mnb.classes(), &[0., 1.]); + assert_eq!(mnb.class_count(), &[3, 1]); + assert_eq!(mnb.inner.distribution.class_priors, &[0.75, 0.25]); assert_eq!( - mnb.inner.distribution.feature_prob, + mnb.feature_log_prob(), &[ - &[1. / 7., 3. / 7., 1. / 14., 1. / 7., 1. / 7., 1. / 14.], - &[1. / 9., 2. / 9.0, 2. / 9.0, 1. / 9.0, 1. / 9.0, 2. / 9.0] + &[ + (1_f64 / 7_f64).ln(), + (3_f64 / 7_f64).ln(), + (1_f64 / 14_f64).ln(), + (1_f64 / 7_f64).ln(), + (1_f64 / 7_f64).ln(), + (1_f64 / 14_f64).ln() + ], + &[ + (1_f64 / 9_f64).ln(), + (2_f64 / 9_f64).ln(), + (2_f64 / 9_f64).ln(), + (1_f64 / 9_f64).ln(), + (1_f64 / 9_f64).ln(), + (2_f64 / 9_f64).ln() + ] ] ); @@ -307,6 +374,16 @@ mod tests { let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.]; let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); + assert_eq!(nb.n_features(), 10); + assert_eq!( + nb.feature_count(), + &[ + &[12, 20, 11, 24, 12, 14, 13, 17, 13, 18], + &[9, 6, 9, 4, 7, 3, 8, 5, 4, 9], + &[10, 12, 9, 9, 11, 3, 9, 18, 10, 10] + ] + ); + let y_hat = nb.predict(&x).unwrap(); assert!(nb @@ -314,9 +391,20 @@ mod tests { .distribution .class_priors .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2)); - assert!(nb.inner.distribution.feature_prob[1].approximate_eq( - &vec!(0.07, 0.12, 0.07, 0.15, 0.07, 0.09, 0.08, 0.10, 0.08, 0.11), - 1e-1 + assert!(nb.feature_log_prob()[1].approximate_eq( + &vec![ + -2.00148, + -2.35815494, + -2.00148, + -2.69462718, + -2.22462355, + -2.91777073, + -2.10684052, + -2.51230562, + -2.69462718, + -2.00148 + ], + 1e-5 )); assert!(y_hat.approximate_eq( &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 2.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 2.0), From cd44f1d51542fe61b66e1dd85c9b7f699fa40cf9 Mon Sep 17 00:00:00 2001 From: zhangyiqun01 Date: Fri, 26 Feb 2021 10:47:21 +0800 Subject: [PATCH 51/81] reset --- src/svm/svc.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/svm/svc.rs b/src/svm/svc.rs index acc17b6..4fd70df 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -58,7 +58,7 @@ //! 
let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0., //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; //! -//! let svc = SVC::fit(&x, &y, +//! let svr = SVC::fit(&x, &y, //! Kernels::linear(), //! SVCParameters { //! epoch: 2, @@ -66,7 +66,7 @@ //! tol: 1e-3, //! }).unwrap(); //! -//! let y_hat = svc.predict(&x).unwrap(); +//! let y_hat = svr.predict(&x).unwrap(); //! ``` //! //! ## References: @@ -802,11 +802,11 @@ mod tests { -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let svc = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); + let svr = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); - let deserialized_svc: SVC, LinearKernel> = - serde_json::from_str(&serde_json::to_string(&svc).unwrap()).unwrap(); + let deserialized_svr: SVC, LinearKernel> = + serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); - assert_eq!(svc, deserialized_svc); + assert_eq!(svr, deserialized_svr); } } From 6d58dbe2a26ed40e7bd08e7d190e19417d121d2f Mon Sep 17 00:00:00 2001 From: zhangyiqun01 Date: Fri, 26 Feb 2021 10:52:04 +0800 Subject: [PATCH 52/81] rename svm svr to svc in tests and docs --- src/svm/svc.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 3101425..8870c41 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -57,9 +57,9 @@ //! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0., //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; //! -//! let svr = SVC::fit(&x, &y, SVCParameters::default().with_c(200.0)).unwrap(); +//! let svc = SVC::fit(&x, &y, SVCParameters::default().with_c(200.0)).unwrap(); //! -//! let y_hat = svr.predict(&x).unwrap(); +//! let y_hat = svc.predict(&x).unwrap(); //! ``` //! //! ## References: @@ -844,11 +844,11 @@ mod tests { -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let svr = SVC::fit(&x, &y, Default::default()).unwrap(); + let svc = SVC::fit(&x, &y, Default::default()).unwrap(); - let deserialized_svr: SVC, LinearKernel> = - serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); + let deserialized_svc: SVC, LinearKernel> = + serde_json::from_str(&serde_json::to_string(&svc).unwrap()).unwrap(); - assert_eq!(svr, deserialized_svr); + assert_eq!(svc, deserialized_svc); } } From 4c1dbc33278e76366f84a7f8daef7b10733f2e1f Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 26 Feb 2021 12:34:05 -0800 Subject: [PATCH 53/81] Fixes width and hight parameters of the logo --- smartcore.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smartcore.svg b/smartcore.svg index f8ff7e9..6f2e0cb 100644 --- a/smartcore.svg +++ b/smartcore.svg @@ -9,9 +9,9 @@ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" inkscape:version="1.0 (4035a4f, 2020-05-01)" sodipodi:docname="smartcore.svg" - width="396.01309mm" - height="86.286003mm" - viewBox="0 0 396.0131 86.286004" + width="1280" + height="320" + viewBox="0 0 400 100" version="1.1" id="svg512"> Date: Fri, 26 Feb 2021 12:43:10 -0800 Subject: [PATCH 54/81] Fixes width and hight parameters of the logo --- smartcore.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartcore.svg b/smartcore.svg index 6f2e0cb..3e4c68d 100644 --- a/smartcore.svg +++ b/smartcore.svg @@ -11,7 +11,7 @@ sodipodi:docname="smartcore.svg" width="1280" height="320" - viewBox="0 0 400 100" + viewBox="0 0 454 86.286004" version="1.1" id="svg512"> Date: Fri, 5 Mar 2021 10:25:34 -0400 Subject: [PATCH 
55/81] fix: Use usize time for usize::from_le_bytes buffer --- src/dataset/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 31a12cf..17d3b72 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -82,8 +82,9 @@ pub(crate) fn deserialize_data( bytes: &[u8], ) -> Result<(Vec, Vec, usize, usize), io::Error> { // read the same file back into a Vec of bytes + const USIZE_SIZE: usize = std::mem::size_of::(); let (num_samples, num_features) = { - let mut buffer = [0u8; if cfg!(target_arch = "wasm32") { 4 } else { 8 }]; + let mut buffer = [0u8; USIZE_SIZE]; buffer.copy_from_slice(&bytes[0..8]); let num_features = usize::from_le_bytes(buffer); buffer.copy_from_slice(&bytes[8..16]); From 3dc53365149ae83305bf4c0d8df6029693f0651b Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Fri, 5 Mar 2021 15:48:55 -0400 Subject: [PATCH 56/81] Move CI to github actions --- .circleci/config.yml | 59 ---------------------------------- .github/workflows/ci.yml | 50 ++++++++++++++++++++++++++++ .github/workflows/coverage.yml | 44 +++++++++++++++++++++++++ .github/workflows/lint.yml | 41 +++++++++++++++++++++++ 4 files changed, 135 insertions(+), 59 deletions(-) delete mode 100644 .circleci/config.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/coverage.yml create mode 100644 .github/workflows/lint.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 6cdd0e4..0000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,59 +0,0 @@ -version: 2.1 - -workflows: - version: 2.1 - build: - jobs: - - build - - clippy - - coverage - -jobs: - build: - docker: - - image: circleci/rust:latest - environment: - TZ: "/usr/share/zoneinfo/your/location" - steps: - - checkout - - restore_cache: - key: project-cache - - run: - name: Check formatting - command: cargo fmt -- --check - - run: - name: Stable Build - command: cargo build --all-features - - run: - name: Test - command: cargo test --all-features - - save_cache: - key: project-cache - paths: - - "~/.cargo" - - "./target" - clippy: - docker: - - image: circleci/rust:latest - steps: - - checkout - - run: - name: Install cargo clippy - command: rustup component add clippy - - run: - name: Run cargo clippy - command: cargo clippy --all-features -- -Drust-2018-idioms -Dwarnings - - coverage: - machine: true - steps: - - checkout - - run: - name: Generate report - command: > - docker run --security-opt seccomp=unconfined -v $PWD:/volume - xd009642/tarpaulin:latest-nightly cargo tarpaulin -v --ciserver circle-ci - --out Lcov --all-features -- --test-threads 1 - - run: - name: Upload - command: bash <(curl -s https://codecov.io/bash) -Z -f diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..89e9517 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,50 @@ +name: CI + +on: + push: + branches: [ main, development ] + pull_request: + branches: [ development ] + +jobs: + tests: + runs-on: "${{ matrix.platform.os }}-latest" + strategy: + matrix: + platform: [ + { os: "windows", target: "x86_64-pc-windows-msvc" }, + { os: "windows", target: "i686-pc-windows-msvc" }, + { os: "ubuntu", target: "x86_64-unknown-linux-gnu" }, + { os: "ubuntu", target: "i686-unknown-linux-gnu" }, + { os: "ubuntu", target: "wasm32-unknown-unknown" }, + { os: "macos", target: "aarch64-apple-darwin" }, + ] + env: + TZ: "/usr/share/zoneinfo/your/location" + steps: + - uses: actions/checkout@v2 + - name: Cache .cargo and 
target + uses: actions/cache@v2 + with: + path: | + ~/.cargo + ./target + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.toml') }} + restore-keys: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.toml') }} + - name: Install Rust toolchain + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + target: ${{ matrix.platform.target }} + profile: minimal + default: true + - name: Stable Build + uses: actions-rs/cargo@v1 + with: + command: build + args: --all-features --target ${{ matrix.platform.target }} + - name: Tests + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml new file mode 100644 index 0000000..793e79d --- /dev/null +++ b/.github/workflows/coverage.yml @@ -0,0 +1,44 @@ +name: Coverage + +on: + push: + branches: [ main, development ] + pull_request: + branches: [ development ] + +jobs: + coverage: + runs-on: ubuntu-latest + env: + TZ: "/usr/share/zoneinfo/your/location" + steps: + - uses: actions/checkout@v2 + - name: Cache .cargo + uses: actions/cache@v2 + with: + path: | + ~/.cargo + ./target + key: ${{ runner.os }}-coverage-cargo-${{ hashFiles('**/Cargo.toml') }} + restore-keys: ${{ runner.os }}-coverage-cargo-${{ hashFiles('**/Cargo.toml') }} + - name: Install Rust toolchain + uses: actions-rs/toolchain@v1 + with: + toolchain: nightly + profile: minimal + default: true + - name: Install cargo-tarpaulin + uses: actions-rs/install@v0.1 + with: + crate: cargo-tarpaulin + version: latest + use-tool-cache: true + - name: Run cargo-tarpaulin + uses: actions-rs/cargo@v1 + with: + command: tarpaulin + args: --out Lcov --all-features -- --test-threads 1 + - name: Upload to codecov.io + uses: codecov/codecov-action@v1 + with: + fail_ci_if_error: true diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..77a082f --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,41 @@ +name: Lint checks + +on: + push: + branches: [ main, development ] + pull_request: + branches: [ development ] + +jobs: + lint: + runs-on: ubuntu-latest + env: + TZ: "/usr/share/zoneinfo/your/location" + steps: + - uses: actions/checkout@v2 + - name: Cache .cargo and target + uses: actions/cache@v2 + with: + path: | + ~/.cargo + ./target + key: ${{ runner.os }}-lint-cargo-${{ hashFiles('**/Cargo.toml') }} + restore-keys: ${{ runner.os }}-lint-cargo-${{ hashFiles('**/Cargo.toml') }} + - name: Install Rust toolchain + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + profile: minimal + default: true + - run: rustup component add rustfmt + - name: Check formt + uses: actions-rs/cargo@v1 + with: + command: fmt + args: --all -- --check + - run: rustup component add clippy + - name: Run clippy + uses: actions-rs/cargo@v1 + with: + command: clippy + args: --all-features -- -Drust-2018-idioms -Dwarnings From 02200ae1e319e9f878adc081a00f3faca31c6739 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Fri, 5 Mar 2021 18:45:20 -0400 Subject: [PATCH 57/81] Only run tests once per OS --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89e9517..1955bc4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,6 +44,7 @@ jobs: command: build args: --all-features --target ${{ matrix.platform.target }} - name: Tests + if: matrix.platform.target == 'x86_64-unknown-linux-gnu' || matrix.platform.target == 'x86_64-pc-windows-msvc' || matrix.platform.target == 
'aarch64-apple-darwin' uses: actions-rs/cargo@v1 with: command: test From d9814c0918e1e9ee18693d06682a5ed801ee3a27 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Tue, 27 Apr 2021 09:32:01 -0400 Subject: [PATCH 58/81] style(lint): fix clippy warnings --- src/algorithm/neighbour/cover_tree.rs | 3 +-- src/lib.rs | 3 ++- src/linalg/naive/dense_matrix.rs | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 9c5c806..bceb897 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -316,8 +316,7 @@ impl> CoverTree point_set.append(&mut far); child } else { - let mut children: Vec> = Vec::new(); - children.push(child); + let mut children: Vec> = vec![child]; let mut new_point_set: Vec> = Vec::new(); let mut new_consumed_set: Vec> = Vec::new(); diff --git a/src/lib.rs b/src/lib.rs index c7c99c8..c117039 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,8 @@ clippy::type_complexity, clippy::too_many_arguments, clippy::many_single_char_names, - clippy::unnecessary_wraps + clippy::unnecessary_wraps, + clippy::upper_case_acronyms )] #![warn(missing_docs)] #![warn(missing_doc_code_examples)] diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 4faa77d..34c8259 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -524,9 +524,9 @@ impl PartialEq for DenseMatrix { } } -impl Into> for DenseMatrix { - fn into(self) -> Vec { - self.values +impl From> for Vec { + fn from(dense_matrix: DenseMatrix) -> Vec { + dense_matrix.values } } From 162bed2aa2242e35e02c875821d0d7abaf054070 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 28 Apr 2021 15:58:39 -0400 Subject: [PATCH 59/81] feat: added support to wasm (#94) * test: run tests also in wasm targets * fix: install rand with wasm-bindgen por wasm targets * fix: use actual usize size to access buffer. * fix: do not run functions that create files in wasm. * test: do not run in wasm test that panics. 
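The per-test change repeated across the modules below is mechanical; a minimal sketch of the attribute pattern (relying on the `wasm-bindgen-test` dev-dependency this patch adds for the wasm32 target):

    // Runs under `cargo test` natively and under `wasm-pack test --node` on wasm32.
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn runs_on_every_target() {
        assert_eq!(2 + 2, 4);
    }

The function name above is a toy stand-in; in the diff, each existing `#[test]` gains the same `cfg_attr` line, while tests that touch the file system are fenced off with `#[cfg(not(target_arch = "wasm32"))]` instead.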
Co-authored-by: Luis Moreno --- .github/workflows/ci.yml | 6 ++ Cargo.toml | 6 ++ src/algorithm/neighbour/bbd_tree.rs | 1 + src/algorithm/neighbour/cover_tree.rs | 5 +- src/algorithm/neighbour/linear_search.rs | 3 +- src/algorithm/sort/heap_select.rs | 5 ++ src/algorithm/sort/quick_sort.rs | 1 + src/cluster/dbscan.rs | 2 + src/cluster/kmeans.rs | 3 + src/dataset/boston.rs | 3 + src/dataset/breast_cancer.rs | 3 + src/dataset/diabetes.rs | 3 + src/dataset/digits.rs | 4 +- src/dataset/generator.rs | 3 + src/dataset/iris.rs | 3 + src/dataset/mod.rs | 10 +++- src/decomposition/pca.rs | 6 +- src/decomposition/svd.rs | 2 + src/ensemble/random_forest_classifier.rs | 2 + src/ensemble/random_forest_regressor.rs | 2 + src/linalg/cholesky.rs | 3 +- src/linalg/evd.rs | 6 +- src/linalg/lu.rs | 3 +- src/linalg/mod.rs | 5 ++ src/linalg/naive/dense_matrix.rs | 58 +++++++++---------- src/linalg/nalgebra_bindings.rs | 40 +++++++++++++ src/linalg/ndarray_bindings.rs | 45 +++++++++++++- src/linalg/qr.rs | 3 +- src/linalg/stats.rs | 8 +-- src/linalg/svd.rs | 7 ++- src/linear/bg_solver.rs | 1 + src/linear/elastic_net.rs | 3 + src/linear/lasso.rs | 2 + src/linear/linear_regression.rs | 2 + src/linear/logistic_regression.rs | 7 +++ src/linear/ridge_regression.rs | 2 + src/math/distance/euclidian.rs | 1 + src/math/distance/hamming.rs | 1 + src/math/distance/mahalanobis.rs | 1 + src/math/distance/manhattan.rs | 1 + src/math/distance/minkowski.rs | 1 + src/math/num.rs | 1 + src/math/vector.rs | 1 + src/metrics/accuracy.rs | 1 + src/metrics/auc.rs | 1 + src/metrics/cluster_hcv.rs | 1 + src/metrics/cluster_helpers.rs | 3 + src/metrics/f1.rs | 1 + src/metrics/mean_absolute_error.rs | 1 + src/metrics/mean_squared_error.rs | 1 + src/metrics/precision.rs | 1 + src/metrics/r2.rs | 1 + src/metrics/recall.rs | 1 + src/model_selection/kfold.rs | 7 +++ src/model_selection/mod.rs | 4 ++ src/naive_bayes/bernoulli.rs | 3 + src/naive_bayes/categorical.rs | 3 + src/naive_bayes/gaussian.rs | 3 + src/naive_bayes/multinomial.rs | 3 + src/neighbors/knn_classifier.rs | 3 + src/neighbors/knn_regressor.rs | 3 + .../first_order/gradient_descent.rs | 1 + src/optimization/first_order/lbfgs.rs | 1 + src/optimization/line_search.rs | 1 + src/preprocessing/categorical.rs | 5 ++ src/preprocessing/series_encoder.rs | 6 ++ src/svm/mod.rs | 4 ++ src/svm/svc.rs | 3 + src/svm/svr.rs | 2 + src/tree/decision_tree_classifier.rs | 4 ++ src/tree/decision_tree_regressor.rs | 2 + 71 files changed, 294 insertions(+), 51 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1955bc4..5041117 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,9 @@ jobs: target: ${{ matrix.platform.target }} profile: minimal default: true + - name: Install test runner for wasm + if: matrix.platform.target == 'wasm32-unknown-unknown' + run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh - name: Stable Build uses: actions-rs/cargo@v1 with: @@ -49,3 +52,6 @@ jobs: with: command: test args: --all-features + - name: Tests in WASM + if: matrix.platform.target == 'wasm32-unknown-unknown' + run: wasm-pack test --node -- --all-features diff --git a/Cargo.toml b/Cargo.toml index d941735..ef99307 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,11 +27,17 @@ rand = "0.7.3" rand_distr = "0.3.0" serde = { version = "1.0.115", features = ["derive"], optional = true } +[target.'cfg(target_arch = "wasm32")'.dependencies] +rand = { version = "0.7.3", features = ["wasm-bindgen"] } + [dev-dependencies] criterion = 
"0.3" serde_json = "1.0" bincode = "1.3.1" +[target.'cfg(target_arch = "wasm32")'.dev-dependencies] +wasm-bindgen-test = "0.3" + [[bench]] name = "distance" harness = false diff --git a/src/algorithm/neighbour/bbd_tree.rs b/src/algorithm/neighbour/bbd_tree.rs index 0d11fc6..293a822 100644 --- a/src/algorithm/neighbour/bbd_tree.rs +++ b/src/algorithm/neighbour/bbd_tree.rs @@ -314,6 +314,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn bbdtree_iris() { let data = DenseMatrix::from_2d_array(&[ diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index bceb897..e8fc937 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -466,6 +466,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn cover_tree_test() { let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; @@ -482,7 +483,7 @@ mod tests { let knn: Vec = knn.iter().map(|v| *v.2).collect(); assert_eq!(vec!(3, 4, 5, 6, 7), knn); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn cover_tree_test1() { let data = vec![ @@ -501,7 +502,7 @@ mod tests { assert_eq!(vec!(0, 1, 2), knn); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index b4a3c89..fd8cc6a 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -150,6 +150,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn knn_find() { let data1 = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; @@ -196,7 +197,7 @@ mod tests { assert_eq!(vec!(1, 2, 3), found_idxs2); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn knn_point_eq() { let point1 = KNNPoint { diff --git a/src/algorithm/sort/heap_select.rs b/src/algorithm/sort/heap_select.rs index a44b2bb..86a74ac 100644 --- a/src/algorithm/sort/heap_select.rs +++ b/src/algorithm/sort/heap_select.rs @@ -96,12 +96,14 @@ impl<'a, T: PartialOrd + Debug> HeapSelection { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn with_capacity() { let heap = HeapSelection::::with_capacity(3); assert_eq!(3, heap.k); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_add() { let mut heap = HeapSelection::with_capacity(3); @@ -119,6 +121,7 @@ mod tests { assert_eq!(vec![2, 0, -5], heap.get()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_add1() { let mut heap = HeapSelection::with_capacity(3); @@ -133,6 +136,7 @@ mod tests { assert_eq!(vec![0f64, -1f64, -5f64], heap.get()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_add2() { let mut heap = HeapSelection::with_capacity(3); @@ -145,6 +149,7 @@ mod tests { assert_eq!(vec![5.6568, 2.8284, 0.0], heap.get()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_add_ordered() { let mut heap = HeapSelection::with_capacity(3); diff --git a/src/algorithm/sort/quick_sort.rs b/src/algorithm/sort/quick_sort.rs index e160ed2..ddf2503 100644 --- a/src/algorithm/sort/quick_sort.rs +++ 
b/src/algorithm/sort/quick_sort.rs @@ -113,6 +113,7 @@ impl QuickArgSort for Vec { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn with_capacity() { let arr1 = vec![0.3, 0.1, 0.2, 0.4, 0.9, 0.5, 0.7, 0.6, 0.8]; diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index d7a706a..b1231c3 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -268,6 +268,7 @@ mod tests { #[cfg(feature = "serde")] use crate::math::distance::euclidian::Euclidian; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn fit_predict_dbscan() { let x = DenseMatrix::from_2d_array(&[ @@ -299,6 +300,7 @@ mod tests { assert_eq!(expected_labels, predicted_labels); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 6be52a5..69f40db 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -299,6 +299,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn invalid_k() { let x = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); @@ -312,6 +313,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn fit_predict_iris() { let x = DenseMatrix::from_2d_array(&[ @@ -346,6 +348,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/dataset/boston.rs b/src/dataset/boston.rs index 33f7700..1e4ee12 100644 --- a/src/dataset/boston.rs +++ b/src/dataset/boston.rs @@ -56,9 +56,11 @@ pub fn load_dataset() -> Dataset { #[cfg(test)] mod tests { + #[cfg(not(target_arch = "wasm32"))] use super::super::*; use super::*; + #[cfg(not(target_arch = "wasm32"))] #[test] #[ignore] fn refresh_boston_dataset() { @@ -67,6 +69,7 @@ mod tests { assert!(serialize_data(&dataset, "boston.xy").is_ok()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn boston_dataset() { let dataset = load_dataset(); diff --git a/src/dataset/breast_cancer.rs b/src/dataset/breast_cancer.rs index e469794..0e13be1 100644 --- a/src/dataset/breast_cancer.rs +++ b/src/dataset/breast_cancer.rs @@ -66,17 +66,20 @@ pub fn load_dataset() -> Dataset { #[cfg(test)] mod tests { + #[cfg(not(target_arch = "wasm32"))] use super::super::*; use super::*; #[test] #[ignore] + #[cfg(not(target_arch = "wasm32"))] fn refresh_cancer_dataset() { // run this test to generate breast_cancer.xy file. 
let dataset = load_dataset(); assert!(serialize_data(&dataset, "breast_cancer.xy").is_ok()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn cancer_dataset() { let dataset = load_dataset(); diff --git a/src/dataset/diabetes.rs b/src/dataset/diabetes.rs index 2a3e20c..cbee636 100644 --- a/src/dataset/diabetes.rs +++ b/src/dataset/diabetes.rs @@ -50,9 +50,11 @@ pub fn load_dataset() -> Dataset { #[cfg(test)] mod tests { + #[cfg(not(target_arch = "wasm32"))] use super::super::*; use super::*; + #[cfg(not(target_arch = "wasm32"))] #[test] #[ignore] fn refresh_diabetes_dataset() { @@ -61,6 +63,7 @@ mod tests { assert!(serialize_data(&dataset, "diabetes.xy").is_ok()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn boston_dataset() { let dataset = load_dataset(); diff --git a/src/dataset/digits.rs b/src/dataset/digits.rs index fd643d5..9120e59 100644 --- a/src/dataset/digits.rs +++ b/src/dataset/digits.rs @@ -45,9 +45,11 @@ pub fn load_dataset() -> Dataset { #[cfg(test)] mod tests { + #[cfg(not(target_arch = "wasm32"))] use super::super::*; use super::*; + #[cfg(not(target_arch = "wasm32"))] #[test] #[ignore] fn refresh_digits_dataset() { @@ -55,7 +57,7 @@ mod tests { let dataset = load_dataset(); assert!(serialize_data(&dataset, "digits.xy").is_ok()); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn digits_dataset() { let dataset = load_dataset(); diff --git a/src/dataset/generator.rs b/src/dataset/generator.rs index 39299a5..a73f546 100644 --- a/src/dataset/generator.rs +++ b/src/dataset/generator.rs @@ -137,6 +137,7 @@ mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_make_blobs() { let dataset = make_blobs(10, 2, 3); @@ -149,6 +150,7 @@ mod tests { assert_eq!(dataset.num_samples, 10); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_make_circles() { let dataset = make_circles(10, 0.5, 0.05); @@ -161,6 +163,7 @@ mod tests { assert_eq!(dataset.num_samples, 10); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_make_moons() { let dataset = make_moons(10, 0.05); diff --git a/src/dataset/iris.rs b/src/dataset/iris.rs index 3c92428..888d3e8 100644 --- a/src/dataset/iris.rs +++ b/src/dataset/iris.rs @@ -50,9 +50,11 @@ pub fn load_dataset() -> Dataset { #[cfg(test)] mod tests { + #[cfg(not(target_arch = "wasm32"))] use super::super::*; use super::*; + #[cfg(not(target_arch = "wasm32"))] #[test] #[ignore] fn refresh_iris_dataset() { @@ -61,6 +63,7 @@ mod tests { assert!(serialize_data(&dataset, "iris.xy").is_ok()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn iris_dataset() { let dataset = load_dataset(); diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 17d3b72..5fe4c45 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -8,9 +8,12 @@ pub mod digits; pub mod generator; pub mod iris; +#[cfg(not(target_arch = "wasm32"))] use crate::math::num::RealNumber; +#[cfg(not(target_arch = "wasm32"))] use std::fs::File; use std::io; +#[cfg(not(target_arch = "wasm32"))] use std::io::prelude::*; /// Dataset @@ -49,6 +52,8 @@ impl Dataset { } } +// Running this in wasm throws: operation not supported on this platform. 
+#[cfg(not(target_arch = "wasm32"))] #[allow(dead_code)] pub(crate) fn serialize_data<T: RealNumber>( dataset: &Dataset<T>, @@ -85,9 +90,9 @@ pub(crate) fn deserialize_data<T: RealNumber>( const USIZE_SIZE: usize = std::mem::size_of::<usize>(); let (num_samples, num_features) = { let mut buffer = [0u8; USIZE_SIZE]; - buffer.copy_from_slice(&bytes[0..8]); + buffer.copy_from_slice(&bytes[0..USIZE_SIZE]); let num_features = usize::from_le_bytes(buffer); - buffer.copy_from_slice(&bytes[8..16]); + buffer.copy_from_slice(&bytes[USIZE_SIZE..2 * USIZE_SIZE]); let num_samples = usize::from_le_bytes(buffer); (num_samples, num_features) }; @@ -116,6 +121,7 @@ pub(crate) fn deserialize_data<T: RealNumber>( mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn as_matrix() { let dataset = Dataset { diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index de258dc..9aebae2 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -325,7 +325,7 @@ mod tests { &[6.8, 161.0, 60.0, 15.6], ]) } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn pca_components() { let us_arrests = us_arrests_data(); @@ -341,7 +341,7 @@ mod tests { assert!(expected.approximate_eq(&pca.components().abs(), 0.4)); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose_covariance() { let us_arrests = us_arrests_data(); @@ -451,6 +451,7 @@ mod tests { .approximate_eq(&expected_projection.abs(), 1e-4)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose_correlation() { let us_arrests = us_arrests_data(); @@ -566,6 +567,7 @@ mod tests { .approximate_eq(&expected_projection.abs(), 1e-4)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs index 6f5a1bd..3807760 100644 --- a/src/decomposition/svd.rs +++ b/src/decomposition/svd.rs @@ -153,6 +153,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn svd_decompose() { // https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/USArrests.html @@ -227,6 +228,7 @@ mod tests { .approximate_eq(&expected, 1e-4)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 4127627..5d509c0 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -279,6 +279,7 @@ mod tests { use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::metrics::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn fit_predict_iris() { let x = DenseMatrix::from_2d_array(&[ @@ -324,6 +325,7 @@ mod tests { assert!(accuracy(&y, &classifier.predict(&x).unwrap()) >= 0.95); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 02eef99..82e299b 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -231,6 +231,7 @@ mod tests { use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::metrics::mean_absolute_error; + #[cfg_attr(target_arch = "wasm32",
wasm_bindgen_test::wasm_bindgen_test)] #[test] fn fit_longley() { let x = DenseMatrix::from_2d_array(&[ @@ -273,6 +274,7 @@ mod tests { assert!(mean_absolute_error(&y, &y_hat) < 1.0); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/linalg/cholesky.rs b/src/linalg/cholesky.rs index 724dc8a..053cbfa 100644 --- a/src/linalg/cholesky.rs +++ b/src/linalg/cholesky.rs @@ -168,7 +168,7 @@ pub trait CholeskyDecomposableMatrix: BaseMatrix { mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn cholesky_decompose() { let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); @@ -187,6 +187,7 @@ mod tests { .approximate_eq(&a.abs(), 1e-4)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn cholesky_solve_mut() { let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); diff --git a/src/linalg/evd.rs b/src/linalg/evd.rs index 4c1b6c3..78b6cc2 100644 --- a/src/linalg/evd.rs +++ b/src/linalg/evd.rs @@ -816,7 +816,7 @@ fn sort>(d: &mut Vec, e: &mut Vec, V: &mut mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose_symmetric() { let A = DenseMatrix::from_2d_array(&[ @@ -843,7 +843,7 @@ mod tests { assert!((0f64 - evd.e[i]).abs() < std::f64::EPSILON); } } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose_asymmetric() { let A = DenseMatrix::from_2d_array(&[ @@ -870,7 +870,7 @@ mod tests { assert!((0f64 - evd.e[i]).abs() < std::f64::EPSILON); } } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose_complex() { let A = DenseMatrix::from_2d_array(&[ diff --git a/src/linalg/lu.rs b/src/linalg/lu.rs index 6daed69..72d6079 100644 --- a/src/linalg/lu.rs +++ b/src/linalg/lu.rs @@ -260,6 +260,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[0., 1., 5.], &[5., 6., 0.]]); @@ -274,7 +275,7 @@ mod tests { assert!(lu.U().approximate_eq(&expected_U, 1e-4)); assert!(lu.pivot().approximate_eq(&expected_pivot, 1e-4)); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn inverse() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[0., 1., 5.], &[5., 6., 0.]]); diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index cadbc3a..d2d2212 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -706,6 +706,7 @@ mod tests { use crate::linalg::BaseMatrix; use crate::linalg::BaseVector; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn mean() { let m = vec![1., 2., 3.]; @@ -713,6 +714,7 @@ mod tests { assert_eq!(m.mean(), 2.0); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn std() { let m = vec![1., 2., 3.]; @@ -720,6 +722,7 @@ mod tests { assert!((m.std() - 0.81f64).abs() < 1e-2); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn var() { let m = vec![1., 2., 3., 4.]; @@ -727,6 +730,7 @@ mod tests { assert!((m.var() - 1.25f64).abs() < std::f64::EPSILON); } + #[cfg_attr(target_arch = 
"wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_take() { let m = vec![1., 2., 3., 4., 5.]; @@ -734,6 +738,7 @@ mod tests { assert_eq!(m.take(&vec!(0, 0, 4, 4)), vec![1., 1., 5., 5.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn take() { let m = DenseMatrix::from_2d_array(&[ diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 34c8259..ae68015 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -1060,14 +1060,14 @@ impl BaseMatrix for DenseMatrix { #[cfg(test)] mod tests { use super::*; - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_dot() { let v1 = vec![1., 2., 3.]; let v2 = vec![4., 5., 6.]; assert_eq!(32.0, BaseVector::dot(&v1, &v2)); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_copy_from() { let mut v1 = vec![1., 2., 3.]; @@ -1075,7 +1075,7 @@ mod tests { v1.copy_from(&v2); assert_eq!(v1, v2); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_approximate_eq() { let a = vec![1., 2., 3.]; @@ -1083,7 +1083,7 @@ mod tests { assert!(a.approximate_eq(&b, 1e-4)); assert!(!a.approximate_eq(&b, 1e-5)); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn from_array() { let vec = [1., 2., 3., 4., 5., 6.]; @@ -1096,7 +1096,7 @@ mod tests { DenseMatrix::new(2, 3, vec![1., 4., 2., 5., 3., 6.]) ); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn row_column_vec_from_array() { let vec = vec![1., 2., 3., 4., 5., 6.]; @@ -1109,7 +1109,7 @@ mod tests { DenseMatrix::new(6, 1, vec![1., 2., 3., 4., 5., 6.]) ); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn from_to_row_vec() { let vec = vec![1., 2., 3.]; @@ -1122,20 +1122,20 @@ mod tests { vec![1., 2., 3.] 
); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn col_matrix_to_row_vector() { let m: DenseMatrix = BaseMatrix::zeros(10, 1); assert_eq!(m.to_row_vector().len(), 10) } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn iter() { let vec = vec![1., 2., 3., 4., 5., 6.]; let m = DenseMatrix::from_array(3, 2, &vec); assert_eq!(vec, m.iter().collect::>()); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn v_stack() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]); @@ -1150,7 +1150,7 @@ mod tests { let result = a.v_stack(&b); assert_eq!(result, expected); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn h_stack() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]); @@ -1163,13 +1163,13 @@ mod tests { let result = a.h_stack(&b); assert_eq!(result, expected); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_row() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]); assert_eq!(vec![4., 5., 6.], a.get_row(1)); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn matmul() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); @@ -1178,7 +1178,7 @@ mod tests { let result = a.matmul(&b); assert_eq!(result, expected); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn ab() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); @@ -1201,14 +1201,14 @@ mod tests { DenseMatrix::from_2d_array(&[&[29., 39., 49.], &[40., 54., 68.,], &[51., 69., 87.]]) ); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn dot() { let a = DenseMatrix::from_array(1, 3, &[1., 2., 3.]); let b = DenseMatrix::from_array(1, 3, &[4., 5., 6.]); assert_eq!(a.dot(&b), 32.); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn copy_from() { let mut a = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.]]); @@ -1216,7 +1216,7 @@ mod tests { a.copy_from(&b); assert_eq!(a, b); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn slice() { let m = DenseMatrix::from_2d_array(&[ @@ -1228,7 +1228,7 @@ mod tests { let result = m.slice(0..2, 1..3); assert_eq!(result, expected); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn approximate_eq() { let m = DenseMatrix::from_2d_array(&[&[2., 3.], &[5., 6.]]); @@ -1237,7 +1237,7 @@ mod tests { assert!(m.approximate_eq(&m_eq, 0.5)); assert!(!m.approximate_eq(&m_neq, 0.5)); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn rand() { let m: DenseMatrix = DenseMatrix::rand(3, 3); @@ -1247,7 +1247,7 @@ mod tests { } } } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn transpose() { let m = DenseMatrix::from_2d_array(&[&[1.0, 3.0], &[2.0, 4.0]]); @@ -1259,7 +1259,7 @@ mod tests { } } } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn reshape() { let m_orig = DenseMatrix::row_vector_from_array(&[1., 2., 3., 4., 5., 6.]); @@ -1270,7 +1270,7 @@ mod tests { assert_eq!(m_result.get(0, 1), 2.); assert_eq!(m_result.get(0, 3), 4.); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn 
norm() { let v = DenseMatrix::row_vector_from_array(&[3., -2., 6.]); @@ -1279,7 +1279,7 @@ mod tests { assert_eq!(v.norm(std::f64::INFINITY), 6.); assert_eq!(v.norm(std::f64::NEG_INFINITY), 2.); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn softmax_mut() { let mut prob: DenseMatrix = DenseMatrix::row_vector_from_array(&[1., 2., 3.]); @@ -1288,14 +1288,14 @@ mod tests { assert!((prob.get(0, 1) - 0.24).abs() < 0.01); assert!((prob.get(0, 2) - 0.66).abs() < 0.01); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn col_mean() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]); let res = a.column_mean(); assert_eq!(res, vec![4., 5., 6.]); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn min_max_sum() { let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); @@ -1303,14 +1303,14 @@ mod tests { assert_eq!(1., a.min()); assert_eq!(6., a.max()); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn eye() { let a = DenseMatrix::from_2d_array(&[&[1., 0., 0.], &[0., 1., 0.], &[0., 0., 1.]]); let res = DenseMatrix::eye(3); assert_eq!(res, a); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn to_from_json() { @@ -1319,7 +1319,7 @@ mod tests { serde_json::from_str(&serde_json::to_string(&a).unwrap()).unwrap(); assert_eq!(a, deserialized_a); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn to_from_bincode() { @@ -1328,7 +1328,7 @@ mod tests { bincode::deserialize(&bincode::serialize(&a).unwrap()).unwrap(); assert_eq!(a, deserialized_a); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn to_string() { let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3], &[0.7, 0.3, 0.8]]); @@ -1337,7 +1337,7 @@ mod tests { "[[0.9, 0.4, 0.7], [0.4, 0.5, 0.3], [0.7, 0.3, 0.8]]" ); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn cov() { let a = DenseMatrix::from_2d_array(&[ diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index b976fbd..249f21f 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -579,6 +579,7 @@ mod tests { use crate::linear::linear_regression::*; use nalgebra::{DMatrix, Matrix2x3, RowDVector}; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_copy_from() { let mut v1 = RowDVector::from_vec(vec![1., 2., 3.]); @@ -589,12 +590,14 @@ mod tests { assert_ne!(v2, v1); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_len() { let v = RowDVector::from_vec(vec![1., 2., 3.]); assert_eq!(3, v.len()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_set_vector() { let mut v = RowDVector::from_vec(vec![1., 2., 3., 4.]); @@ -607,12 +610,14 @@ mod tests { assert_eq!(5., BaseVector::get(&v, 1)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_to_vec() { let v = RowDVector::from_vec(vec![1., 2., 3.]); assert_eq!(vec![1., 2., 3.], v.to_vec()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_init() { let zeros: RowDVector = BaseVector::zeros(3); @@ -623,6 +628,7 @@ mod tests { assert_eq!(twos, 
RowDVector::from_vec(vec![2., 2., 2.])); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_dot() { let v1 = RowDVector::from_vec(vec![1., 2., 3.]); @@ -630,6 +636,7 @@ mod tests { assert_eq!(32.0, BaseVector::dot(&v1, &v2)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_approximate_eq() { let a = RowDVector::from_vec(vec![1., 2., 3.]); @@ -638,6 +645,7 @@ mod tests { assert!(!a.approximate_eq(&(&noise + &a), 1e-5)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_set_dynamic() { let mut m = DMatrix::from_row_slice(2, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); @@ -650,6 +658,7 @@ mod tests { assert_eq!(10., BaseMatrix::get(&m, 1, 1)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn zeros() { let expected = DMatrix::from_row_slice(2, 2, &[0., 0., 0., 0.]); @@ -659,6 +668,7 @@ mod tests { assert_eq!(m, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn ones() { let expected = DMatrix::from_row_slice(2, 2, &[1., 1., 1., 1.]); @@ -668,6 +678,7 @@ mod tests { assert_eq!(m, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn eye() { let expected = DMatrix::from_row_slice(3, 3, &[1., 0., 0., 0., 1., 0., 0., 0., 1.]); @@ -675,6 +686,7 @@ mod tests { assert_eq!(m, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn shape() { let m: DMatrix = BaseMatrix::zeros(5, 10); @@ -684,6 +696,7 @@ mod tests { assert_eq!(ncols, 10); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn scalar_add_sub_mul_div() { let mut m = DMatrix::from_row_slice(2, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); @@ -697,6 +710,7 @@ mod tests { assert_eq!(m, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn add_sub_mul_div() { let mut m = DMatrix::from_row_slice(2, 2, &[1.0, 2.0, 3.0, 4.0]); @@ -715,6 +729,7 @@ mod tests { assert_eq!(m, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn to_from_row_vector() { let v = RowDVector::from_vec(vec![1., 2., 3., 4.]); @@ -723,12 +738,14 @@ mod tests { assert_eq!(m.to_row_vector(), expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn col_matrix_to_row_vector() { let m: DMatrix = BaseMatrix::zeros(10, 1); assert_eq!(m.to_row_vector().len(), 10) } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_row_col_as_vec() { let m = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]); @@ -737,12 +754,14 @@ mod tests { assert_eq!(m.get_col_as_vec(1), vec!(2., 5., 8.)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_row() { let a = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]); assert_eq!(RowDVector::from_vec(vec![4., 5., 6.]), a.get_row(1)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn copy_row_col_as_vec() { let m = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]); @@ -754,6 +773,7 @@ mod tests { assert_eq!(v, vec!(2., 5., 8.)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn element_add_sub_mul_div() { let mut m = DMatrix::from_row_slice(2, 2, &[1.0, 2.0, 3.0, 4.0]); 
@@ -767,6 +787,7 @@ mod tests { assert_eq!(m, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vstack_hstack() { let m1 = DMatrix::from_row_slice(2, 3, &[1., 2., 3., 4., 5., 6.]); @@ -782,6 +803,7 @@ mod tests { assert_eq!(result, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn matmul() { let a = DMatrix::from_row_slice(2, 3, &[1., 2., 3., 4., 5., 6.]); @@ -791,6 +813,7 @@ mod tests { assert_eq!(result, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn dot() { let a = DMatrix::from_row_slice(1, 3, &[1., 2., 3.]); @@ -798,6 +821,7 @@ mod tests { assert_eq!(14., a.dot(&b)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn slice() { let a = DMatrix::from_row_slice( @@ -810,6 +834,7 @@ mod tests { assert_eq!(result, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn approximate_eq() { let a = DMatrix::from_row_slice(3, 3, &[1., 2., 3., 4., 5., 6., 7., 8., 9.]); @@ -822,6 +847,7 @@ mod tests { assert!(!a.approximate_eq(&(&noise + &a), 1e-5)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn negative_mut() { let mut v = DMatrix::from_row_slice(1, 3, &[3., -2., 6.]); @@ -829,6 +855,7 @@ mod tests { assert_eq!(v, DMatrix::from_row_slice(1, 3, &[-3., 2., -6.])); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn transpose() { let m = DMatrix::from_row_slice(2, 2, &[1.0, 3.0, 2.0, 4.0]); @@ -837,6 +864,7 @@ mod tests { assert_eq!(m_transposed, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn rand() { let m: DMatrix = BaseMatrix::rand(3, 3); @@ -847,6 +875,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn norm() { let v = DMatrix::from_row_slice(1, 3, &[3., -2., 6.]); @@ -856,6 +885,7 @@ mod tests { assert_eq!(BaseMatrix::norm(&v, std::f64::NEG_INFINITY), 2.); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn col_mean() { let a = DMatrix::from_row_slice(3, 3, &[1., 2., 3., 4., 5., 6., 7., 8., 9.]); @@ -863,6 +893,7 @@ mod tests { assert_eq!(res, vec![4., 5., 6.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn reshape() { let m_orig = DMatrix::from_row_slice(1, 6, &[1., 2., 3., 4., 5., 6.]); @@ -874,6 +905,7 @@ mod tests { assert_eq!(BaseMatrix::get(&m_result, 0, 3), 4.); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn copy_from() { let mut src = DMatrix::from_row_slice(1, 3, &[1., 2., 3.]); @@ -882,6 +914,7 @@ mod tests { assert_eq!(src, dst); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn abs_mut() { let mut a = DMatrix::from_row_slice(2, 2, &[1., -2., 3., -4.]); @@ -890,6 +923,7 @@ mod tests { assert_eq!(a, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn min_max_sum() { let a = DMatrix::from_row_slice(2, 3, &[1., 2., 3., 4., 5., 6.]); @@ -898,6 +932,7 @@ mod tests { assert_eq!(6., a.max()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn max_diff() { let a1 = DMatrix::from_row_slice(2, 3, &[1., 2., 3., 4., -5., 6.]); @@ -906,6 +941,7 @@ mod tests { assert_eq!(a2.max_diff(&a2), 0.); } + #[cfg_attr(target_arch = "wasm32", 
wasm_bindgen_test::wasm_bindgen_test)] #[test] fn softmax_mut() { let mut prob: DMatrix = DMatrix::from_row_slice(1, 3, &[1., 2., 3.]); @@ -915,6 +951,7 @@ mod tests { assert!((BaseMatrix::get(&prob, 0, 2) - 0.66).abs() < 0.01); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn pow_mut() { let mut a = DMatrix::from_row_slice(1, 3, &[1., 2., 3.]); @@ -922,6 +959,7 @@ mod tests { assert_eq!(a, DMatrix::from_row_slice(1, 3, &[1., 8., 27.])); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn argmax() { let a = DMatrix::from_row_slice(3, 3, &[1., 2., 3., -5., -6., -7., 0.1, 0.2, 0.1]); @@ -929,6 +967,7 @@ mod tests { assert_eq!(res, vec![2, 0, 1]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn unique() { let a = DMatrix::from_row_slice(3, 3, &[1., 2., 2., -2., -6., -7., 2., 3., 4.]); @@ -937,6 +976,7 @@ mod tests { assert_eq!(res, vec![-7., -6., -2., 1., 2., 3., 4.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn ols_fit_predict() { let x = DMatrix::from_row_slice( diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 6ed40c8..0aa97aa 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -530,6 +530,7 @@ mod tests { use crate::metrics::mean_absolute_error; use ndarray::{arr1, arr2, Array1, Array2}; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_get_set() { let mut result = arr1(&[1., 2., 3.]); @@ -541,6 +542,7 @@ mod tests { assert_eq!(5., BaseVector::get(&result, 1)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_copy_from() { let mut v1 = arr1(&[1., 2., 3.]); @@ -551,18 +553,21 @@ mod tests { assert_ne!(v1, v2); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_len() { let v = arr1(&[1., 2., 3.]); assert_eq!(3, v.len()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_to_vec() { let v = arr1(&[1., 2., 3.]); assert_eq!(vec![1., 2., 3.], v.to_vec()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_dot() { let v1 = arr1(&[1., 2., 3.]); @@ -570,6 +575,7 @@ mod tests { assert_eq!(32.0, BaseVector::dot(&v1, &v2)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vec_approximate_eq() { let a = arr1(&[1., 2., 3.]); @@ -578,6 +584,7 @@ mod tests { assert!(!a.approximate_eq(&(&noise + &a), 1e-5)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn from_to_row_vec() { let vec = arr1(&[1., 2., 3.]); @@ -588,12 +595,14 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn col_matrix_to_row_vector() { let m: Array2 = BaseMatrix::zeros(10, 1); assert_eq!(m.to_row_vector().len(), 10) } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn add_mut() { let mut a1 = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -604,6 +613,7 @@ mod tests { assert_eq!(a1, a3); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn sub_mut() { let mut a1 = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -614,6 +624,7 @@ mod tests { assert_eq!(a1, a3); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn mul_mut() { let mut a1 = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -624,6 
+635,7 @@ mod tests { assert_eq!(a1, a3); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn div_mut() { let mut a1 = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -634,6 +646,7 @@ mod tests { assert_eq!(a1, a3); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn div_element_mut() { let mut a = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -642,6 +655,7 @@ mod tests { assert_eq!(BaseMatrix::get(&a, 1, 1), 1.); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn mul_element_mut() { let mut a = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -650,6 +664,7 @@ mod tests { assert_eq!(BaseMatrix::get(&a, 1, 1), 25.); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn add_element_mut() { let mut a = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -657,7 +672,7 @@ mod tests { assert_eq!(BaseMatrix::get(&a, 1, 1), 10.); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn sub_element_mut() { let mut a = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -666,6 +681,7 @@ mod tests { assert_eq!(BaseMatrix::get(&a, 1, 1), 0.); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn vstack_hstack() { let a1 = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -680,6 +696,7 @@ mod tests { assert_eq!(result, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_set() { let mut result = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -691,6 +708,7 @@ mod tests { assert_eq!(10., BaseMatrix::get(&result, 1, 1)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn matmul() { let a = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -700,6 +718,7 @@ mod tests { assert_eq!(result, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn dot() { let a = arr2(&[[1., 2., 3.]]); @@ -707,6 +726,7 @@ mod tests { assert_eq!(14., BaseMatrix::dot(&a, &b)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn slice() { let a = arr2(&[ @@ -719,6 +739,7 @@ mod tests { assert_eq!(result, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn scalar_ops() { let a = arr2(&[[1., 2., 3.]]); @@ -728,6 +749,7 @@ mod tests { assert_eq!(&arr2(&[[0.5, 1., 1.5]]), a.clone().div_scalar_mut(2.)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn transpose() { let m = arr2(&[[1.0, 3.0], [2.0, 4.0]]); @@ -736,6 +758,7 @@ mod tests { assert_eq!(m_transposed, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn norm() { let v = arr2(&[[3., -2., 6.]]); @@ -745,6 +768,7 @@ mod tests { assert_eq!(v.norm(std::f64::NEG_INFINITY), 2.); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn negative_mut() { let mut v = arr2(&[[3., -2., 6.]]); @@ -752,6 +776,7 @@ mod tests { assert_eq!(v, arr2(&[[-3., 2., -6.]])); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn reshape() { let m_orig = arr2(&[[1., 2., 3., 4., 5., 6.]]); @@ -763,6 +788,7 @@ mod tests { assert_eq!(BaseMatrix::get(&m_result, 0, 3), 4.); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn copy_from() { let mut src = arr2(&[[1., 2., 3.]]); @@ -771,6 +797,7 @@ mod tests { assert_eq!(src, dst); } + #[cfg_attr(target_arch = "wasm32", 
wasm_bindgen_test::wasm_bindgen_test)] #[test] fn min_max_sum() { let a = arr2(&[[1., 2., 3.], [4., 5., 6.]]); @@ -779,6 +806,7 @@ mod tests { assert_eq!(6., a.max()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn max_diff() { let a1 = arr2(&[[1., 2., 3.], [4., -5., 6.]]); @@ -787,6 +815,7 @@ mod tests { assert_eq!(a2.max_diff(&a2), 0.); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn softmax_mut() { let mut prob: Array2 = arr2(&[[1., 2., 3.]]); @@ -796,6 +825,7 @@ mod tests { assert!((BaseMatrix::get(&prob, 0, 2) - 0.66).abs() < 0.01); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn pow_mut() { let mut a = arr2(&[[1., 2., 3.]]); @@ -803,6 +833,7 @@ mod tests { assert_eq!(a, arr2(&[[1., 8., 27.]])); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn argmax() { let a = arr2(&[[1., 2., 3.], [-5., -6., -7.], [0.1, 0.2, 0.1]]); @@ -810,6 +841,7 @@ mod tests { assert_eq!(res, vec![2, 0, 1]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn unique() { let a = arr2(&[[1., 2., 2.], [-2., -6., -7.], [2., 3., 4.]]); @@ -818,6 +850,7 @@ mod tests { assert_eq!(res, vec![-7., -6., -2., 1., 2., 3., 4.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_row_as_vector() { let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); @@ -825,12 +858,14 @@ mod tests { assert_eq!(res, vec![4., 5., 6.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_row() { let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); assert_eq!(arr1(&[4., 5., 6.]), a.get_row(1)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn get_col_as_vector() { let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); @@ -838,6 +873,7 @@ mod tests { assert_eq!(res, vec![2., 5., 8.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn copy_row_col_as_vec() { let m = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); @@ -849,6 +885,7 @@ mod tests { assert_eq!(v, vec!(2., 5., 8.)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn col_mean() { let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); @@ -856,6 +893,7 @@ mod tests { assert_eq!(res, vec![4., 5., 6.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn eye() { let a = arr2(&[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]); @@ -863,6 +901,7 @@ mod tests { assert_eq!(res, a); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn rand() { let m: Array2 = BaseMatrix::rand(3, 3); @@ -873,6 +912,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn approximate_eq() { let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); @@ -881,6 +921,7 @@ mod tests { assert!(!a.approximate_eq(&(&noise + &a), 1e-5)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn abs_mut() { let mut a = arr2(&[[1., -2.], [3., -4.]]); @@ -889,6 +930,7 @@ mod tests { assert_eq!(a, expected); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn lr_fit_predict_iris() { let x = arr2(&[ @@ -930,6 +972,7 @@ mod tests { assert!(error <= 1.0); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] 
#[test] fn my_fit_longley_ndarray() { let x = arr2(&[ diff --git a/src/linalg/qr.rs b/src/linalg/qr.rs index a06a01f..3380fb4 100644 --- a/src/linalg/qr.rs +++ b/src/linalg/qr.rs @@ -195,7 +195,7 @@ pub trait QRDecomposableMatrix: BaseMatrix { mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose() { let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3], &[0.7, 0.3, 0.8]]); @@ -214,6 +214,7 @@ mod tests { assert!(qr.R().abs().approximate_eq(&r.abs(), 1e-4)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn qr_solve_mut() { let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3], &[0.7, 0.3, 0.8]]); diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs index 5a1dd38..10a3fc4 100644 --- a/src/linalg/stats.rs +++ b/src/linalg/stats.rs @@ -150,7 +150,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::linalg::BaseVector; - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn mean() { let m = DenseMatrix::from_2d_array(&[ @@ -164,7 +164,7 @@ mod tests { assert_eq!(m.mean(0), expected_0); assert_eq!(m.mean(1), expected_1); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn std() { let m = DenseMatrix::from_2d_array(&[ @@ -178,7 +178,7 @@ mod tests { assert!(m.std(0).approximate_eq(&expected_0, 1e-2)); assert!(m.std(1).approximate_eq(&expected_1, 1e-2)); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn var() { let m = DenseMatrix::from_2d_array(&[&[1., 2., 3., 4.], &[5., 6., 7., 8.]]); @@ -188,7 +188,7 @@ mod tests { assert!(m.var(0).approximate_eq(&expected_0, std::f64::EPSILON)); assert!(m.var(1).approximate_eq(&expected_1, std::f64::EPSILON)); } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn scale() { let mut m = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); diff --git a/src/linalg/svd.rs b/src/linalg/svd.rs index e370453..3746071 100644 --- a/src/linalg/svd.rs +++ b/src/linalg/svd.rs @@ -482,7 +482,7 @@ impl> SVD { mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose_symmetric() { let A = DenseMatrix::from_2d_array(&[ @@ -513,7 +513,7 @@ mod tests { assert!((s[i] - svd.s[i]).abs() < 1e-4); } } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose_asymmetric() { let A = DenseMatrix::from_2d_array(&[ @@ -714,7 +714,7 @@ mod tests { assert!((s[i] - svd.s[i]).abs() < 1e-4); } } - + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn solve() { let a = DenseMatrix::from_2d_array(&[&[0.9, 0.4, 0.7], &[0.4, 0.5, 0.3], &[0.7, 0.3, 0.8]]); @@ -725,6 +725,7 @@ mod tests { assert!(w.approximate_eq(&expected_w, 1e-2)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn decompose_restore() { let a = DenseMatrix::from_2d_array(&[&[1.0, 2.0, 3.0, 4.0], &[5.0, 6.0, 7.0, 8.0]]); diff --git a/src/linear/bg_solver.rs b/src/linear/bg_solver.rs index 46ef13d..28cc3d8 100644 --- a/src/linear/bg_solver.rs +++ b/src/linear/bg_solver.rs @@ -126,6 +126,7 @@ mod tests { impl> BiconjugateGradientSolver for BGSolver {} + #[cfg_attr(target_arch = "wasm32", 
wasm_bindgen_test::wasm_bindgen_test)] #[test] fn bg_solver() { let a = DenseMatrix::from_2d_array(&[&[25., 15., -5.], &[15., 18., 0.], &[-5., 0., 11.]]); diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index 479ae2a..ce13435 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -291,6 +291,7 @@ mod tests { use crate::linalg::naive::dense_matrix::*; use crate::metrics::mean_absolute_error; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn elasticnet_longley() { let x = DenseMatrix::from_2d_array(&[ @@ -334,6 +335,7 @@ mod tests { assert!(mean_absolute_error(&y_hat, &y) < 30.0); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn elasticnet_fit_predict1() { let x = DenseMatrix::from_2d_array(&[ @@ -400,6 +402,7 @@ mod tests { assert!(l1_model.coefficients().get(0, 0) > l1_model.coefficients().get(2, 0)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 8c59a4f..7edd325 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -226,6 +226,7 @@ mod tests { use crate::linalg::naive::dense_matrix::*; use crate::metrics::mean_absolute_error; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn lasso_fit_predict() { let x = DenseMatrix::from_2d_array(&[ @@ -274,6 +275,7 @@ mod tests { assert!(mean_absolute_error(&y_hat, &y) < 2.0); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 2734a78..a10b5ac 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -200,6 +200,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn ols_fit_predict() { let x = DenseMatrix::from_2d_array(&[ @@ -250,6 +251,7 @@ mod tests { .all(|(&a, &b)| (a - b).abs() <= 5.0)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 2a12c19..ad2cdb3 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -452,6 +452,7 @@ mod tests { use crate::linalg::naive::dense_matrix::*; use crate::metrics::accuracy; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn multiclass_objective_f() { let x = DenseMatrix::from_2d_array(&[ @@ -519,6 +520,7 @@ mod tests { assert!((g.get(0, 0).abs() - 32.0).abs() < 1e-4); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn binary_objective_f() { let x = DenseMatrix::from_2d_array(&[ @@ -575,6 +577,7 @@ mod tests { assert!((g.get(0, 2) - 3.8693).abs() < 1e-4); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn lr_fit_predict() { let x = DenseMatrix::from_2d_array(&[ @@ -612,6 +615,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn lr_fit_predict_multiclass() { let blobs = make_blobs(15, 4, 3); @@ -635,6 +639,7 @@ mod tests { assert!(lr_reg.coefficients().abs().sum() < lr.coefficients().abs().sum()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] 
#[test] fn lr_fit_predict_binary() { let blobs = make_blobs(20, 4, 2); @@ -658,6 +663,7 @@ mod tests { assert!(lr_reg.coefficients().abs().sum() < lr.coefficients().abs().sum()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { @@ -688,6 +694,7 @@ mod tests { assert_eq!(lr, deserialized_lr); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn lr_fit_predict_iris() { let x = DenseMatrix::from_2d_array(&[ diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 787c338..94ac700 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -274,6 +274,7 @@ mod tests { use crate::linalg::naive::dense_matrix::*; use crate::metrics::mean_absolute_error; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn ridge_fit_predict() { let x = DenseMatrix::from_2d_array(&[ @@ -329,6 +330,7 @@ mod tests { assert!(mean_absolute_error(&y_hat_svd, &y) < 2.0); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index b06d7d1..ed836f6 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -57,6 +57,7 @@ impl Distance, T> for Euclidian { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn squared_distance() { let a = vec![1., 2., 3.]; diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index d23b57f..da0d28f 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -52,6 +52,7 @@ impl Distance, F> for Hamming { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn hamming_distance() { let a = vec![1, 0, 0, 1, 0, 0, 1]; diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index 7ff86e9..5a3fae8 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -133,6 +133,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn mahalanobis_distance() { let data = DenseMatrix::from_2d_array(&[ diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 3162178..372f524 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -48,6 +48,7 @@ impl Distance, T> for Manhattan { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn manhattan_distance() { let a = vec![1., 2., 3.]; diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index 1e97ea8..bd9c1c4 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -61,6 +61,7 @@ impl Distance, T> for Minkowski { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn minkowski_distance() { let a = vec![1., 2., 3.]; diff --git a/src/math/num.rs b/src/math/num.rs index 490623c..7199949 100644 --- a/src/math/num.rs +++ b/src/math/num.rs @@ -136,6 +136,7 @@ impl RealNumber for f32 { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn sigmoid() { assert_eq!(1.0.sigmoid(), 0.7310585786300049); diff --git a/src/math/vector.rs 
b/src/math/vector.rs index 62cf63b..c38c7a4 100644 --- a/src/math/vector.rs +++ b/src/math/vector.rs @@ -30,6 +30,7 @@ impl> RealNumberVector for V { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn unique_with_indices() { let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0]; diff --git a/src/metrics/accuracy.rs b/src/metrics/accuracy.rs index 6912a4c..0c9ce06 100644 --- a/src/metrics/accuracy.rs +++ b/src/metrics/accuracy.rs @@ -57,6 +57,7 @@ impl Accuracy { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn accuracy() { let y_pred: Vec = vec![0., 2., 1., 3.]; diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index 508295b..c413dc4 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -93,6 +93,7 @@ impl AUC { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn auc() { let y_true: Vec = vec![0., 0., 1., 1.]; diff --git a/src/metrics/cluster_hcv.rs b/src/metrics/cluster_hcv.rs index d881bdc..f20f448 100644 --- a/src/metrics/cluster_hcv.rs +++ b/src/metrics/cluster_hcv.rs @@ -43,6 +43,7 @@ impl HCVScore { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn homogeneity_score() { let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0]; diff --git a/src/metrics/cluster_helpers.rs b/src/metrics/cluster_helpers.rs index a8fa7e5..05cf97c 100644 --- a/src/metrics/cluster_helpers.rs +++ b/src/metrics/cluster_helpers.rs @@ -101,6 +101,7 @@ pub fn mutual_info_score(contingency: &[Vec]) -> T { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn contingency_matrix_test() { let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0]; @@ -112,6 +113,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn entropy_test() { let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0]; @@ -119,6 +121,7 @@ mod tests { assert!((1.2770f32 - entropy(&v1).unwrap()).abs() < 1e-4); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn mutual_info_score_test() { let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0]; diff --git a/src/metrics/f1.rs b/src/metrics/f1.rs index d957d9b..4ad6a5d 100644 --- a/src/metrics/f1.rs +++ b/src/metrics/f1.rs @@ -59,6 +59,7 @@ impl F1 { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn f1() { let y_pred: Vec = vec![0., 0., 1., 1., 1., 1.]; diff --git a/src/metrics/mean_absolute_error.rs b/src/metrics/mean_absolute_error.rs index db3039f..3e8ce85 100644 --- a/src/metrics/mean_absolute_error.rs +++ b/src/metrics/mean_absolute_error.rs @@ -56,6 +56,7 @@ impl MeanAbsoluteError { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn mean_absolute_error() { let y_true: Vec = vec![3., -0.5, 2., 7.]; diff --git a/src/metrics/mean_squared_error.rs b/src/metrics/mean_squared_error.rs index 3003e5d..dce758d 100644 --- a/src/metrics/mean_squared_error.rs +++ b/src/metrics/mean_squared_error.rs @@ -56,6 +56,7 @@ impl MeanSquareError { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn mean_squared_error() { let y_true: Vec = vec![3., -0.5, 2., 7.]; diff --git a/src/metrics/precision.rs b/src/metrics/precision.rs index 2bd0dcf..a0171aa 100644 --- 
a/src/metrics/precision.rs +++ b/src/metrics/precision.rs @@ -77,6 +77,7 @@ impl Precision { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn precision() { let y_true: Vec = vec![0., 1., 1., 0.]; diff --git a/src/metrics/r2.rs b/src/metrics/r2.rs index c710ef5..738aae6 100644 --- a/src/metrics/r2.rs +++ b/src/metrics/r2.rs @@ -70,6 +70,7 @@ impl R2 { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn r2() { let y_true: Vec = vec![3., -0.5, 2., 7.]; diff --git a/src/metrics/recall.rs b/src/metrics/recall.rs index d1fad56..18863ae 100644 --- a/src/metrics/recall.rs +++ b/src/metrics/recall.rs @@ -77,6 +77,7 @@ impl Recall { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn recall() { let y_true: Vec = vec![0., 1., 1., 0.]; diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs index 63827c4..8706954 100644 --- a/src/model_selection/kfold.rs +++ b/src/model_selection/kfold.rs @@ -144,6 +144,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_kfold_return_test_indices_simple() { let k = KFold { @@ -158,6 +159,7 @@ mod tests { assert_eq!(test_indices[2], (22..33).collect::>()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_kfold_return_test_indices_odd() { let k = KFold { @@ -172,6 +174,7 @@ mod tests { assert_eq!(test_indices[2], (23..34).collect::>()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_kfold_return_test_mask_simple() { let k = KFold { @@ -197,6 +200,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_kfold_return_split_simple() { let k = KFold { @@ -212,6 +216,7 @@ mod tests { assert_eq!(train_test_splits[1].1, (11..22).collect::>()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_kfold_return_split_simple_shuffle() { let k = KFold { @@ -227,6 +232,7 @@ mod tests { assert_eq!(train_test_splits[1].1.len(), 11_usize); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn numpy_parity_test() { let k = KFold { @@ -247,6 +253,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn numpy_parity_test_shuffle() { let k = KFold { diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 0058367..d283176 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -285,6 +285,7 @@ mod tests { use crate::model_selection::kfold::KFold; use crate::neighbors::knn_regressor::KNNRegressor; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_train_test_split() { let n = 123; @@ -308,6 +309,7 @@ mod tests { #[derive(Clone)] struct NoParameters {} + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_cross_validate_biased() { struct BiasedEstimator {} @@ -367,6 +369,7 @@ mod tests { assert_eq!(0.4, results.mean_train_score()); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_cross_validate_knn() { let x = DenseMatrix::from_2d_array(&[ @@ -411,6 +414,7 @@ mod tests { assert!(results.mean_train_score() < results.mean_test_score()); } + 
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_cross_val_predict_knn() { let x = DenseMatrix::from_2d_array(&[ diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 286a4a5..69eb13c 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -346,6 +346,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_bernoulli_naive_bayes() { // Tests that BernoulliNB when alpha=1.0 gives the same values as @@ -398,6 +399,7 @@ mod tests { assert_eq!(y_hat, &[1.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn bernoulli_nb_scikit_parity() { let x = DenseMatrix::::from_2d_array(&[ @@ -460,6 +462,7 @@ mod tests { )); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index e308a01..51619b6 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -351,6 +351,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_categorical_naive_bayes() { let x = DenseMatrix::from_2d_array(&[ @@ -431,6 +432,7 @@ mod tests { assert_eq!(y_hat, vec![0., 1.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_categorical_naive_bayes2() { let x = DenseMatrix::from_2d_array(&[ @@ -459,6 +461,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 00c7962..b84e65f 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -259,6 +259,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_gaussian_naive_bayes() { let x = DenseMatrix::from_2d_array(&[ @@ -295,6 +296,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_gaussian_naive_bayes_with_priors() { let x = DenseMatrix::from_2d_array(&[ @@ -314,6 +316,7 @@ mod tests { assert_eq!(gnb.class_priors(), &priors); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 87e0ddd..43a022a 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -297,6 +297,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn run_multinomial_naive_bayes() { // Tests that MultinomialNB when alpha=1.0 gives the same values as @@ -352,6 +353,7 @@ mod tests { assert_eq!(y_hat, &[0.]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn multinomial_nb_scikit_parity() { let x = DenseMatrix::::from_2d_array(&[ @@ -411,6 +413,7 @@ mod tests { 1e-5 )); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/neighbors/knn_classifier.rs 
b/src/neighbors/knn_classifier.rs index ba6693e..8723900 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -251,6 +251,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn knn_fit_predict() { let x = @@ -262,6 +263,7 @@ mod tests { assert_eq!(y.to_vec(), y_hat); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn knn_fit_predict_weighted() { let x = DenseMatrix::from_2d_array(&[&[1.], &[2.], &[3.], &[4.], &[5.]]); @@ -279,6 +281,7 @@ mod tests { assert_eq!(vec![3.0], y_hat); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index ed52496..649cd1f 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -231,6 +231,7 @@ mod tests { use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::math::distance::Distances; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn knn_fit_predict_weighted() { let x = @@ -254,6 +255,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn knn_fit_predict_uniform() { let x = @@ -268,6 +270,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/optimization/first_order/gradient_descent.rs b/src/optimization/first_order/gradient_descent.rs index d57896f..aba48a5 100644 --- a/src/optimization/first_order/gradient_descent.rs +++ b/src/optimization/first_order/gradient_descent.rs @@ -88,6 +88,7 @@ mod tests { use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn gradient_descent() { let x0 = DenseMatrix::row_vector_from_array(&[-1., 1.]); diff --git a/src/optimization/first_order/lbfgs.rs b/src/optimization/first_order/lbfgs.rs index 322df03..aaf2c89 100644 --- a/src/optimization/first_order/lbfgs.rs +++ b/src/optimization/first_order/lbfgs.rs @@ -239,6 +239,7 @@ mod tests { use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn lbfgs() { let x0 = DenseMatrix::row_vector_from_array(&[0., 0.]); diff --git a/src/optimization/line_search.rs b/src/optimization/line_search.rs index 99457c9..bbaa3fc 100644 --- a/src/optimization/line_search.rs +++ b/src/optimization/line_search.rs @@ -112,6 +112,7 @@ impl LineSearchMethod for Backtracking { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn backtracking() { let f = |x: f64| -> f64 { x.powf(2.) 
+ x }; diff --git a/src/preprocessing/categorical.rs b/src/preprocessing/categorical.rs index 8571e74..adc85a6 100644 --- a/src/preprocessing/categorical.rs +++ b/src/preprocessing/categorical.rs @@ -225,6 +225,7 @@ mod tests { use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::preprocessing::series_encoder::CategoryMapper; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn adjust_idxs() { assert_eq!(find_new_idxs(0, &[], &[]), Vec::::new()); @@ -269,6 +270,7 @@ mod tests { (orig, oh_enc) } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn hash_encode_f64_series() { let series = vec![3.0, 1.0, 2.0, 1.0]; @@ -279,6 +281,7 @@ mod tests { let orig_val: f64 = inv.unwrap().into(); assert_eq!(orig_val, 2.0); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_fit() { let (x, _) = build_fake_matrix(); @@ -294,6 +297,7 @@ mod tests { assert_eq!(num_cat, vec![2, 4]); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn matrix_transform_test() { let (x, expected_x) = build_fake_matrix(); @@ -309,6 +313,7 @@ mod tests { assert_eq!(nm, expected_x); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn fail_on_bad_category() { let m = DenseMatrix::from_2d_array(&[ diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index e24eca1..2cd4133 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -201,6 +201,7 @@ where mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn from_categories() { let fake_categories: Vec = vec![1, 2, 3, 4, 5, 3, 5, 3, 1, 2, 4]; @@ -219,12 +220,14 @@ mod tests { let enc = CategoryMapper::<&str>::from_positional_category_vec(fake_category_pos); enc } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn ordinal_encoding() { let enc = build_fake_str_enc(); assert_eq!(1f64, enc.get_ordinal::(&"dog").unwrap()) } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn category_map_and_vec() { let category_map: HashMap<&str, usize> = vec![("background", 0), ("dog", 1), ("cat", 2)] @@ -239,6 +242,7 @@ mod tests { assert_eq!(oh_vec, res); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn positional_categories_vec() { let enc = build_fake_str_enc(); @@ -250,6 +254,7 @@ mod tests { assert_eq!(oh_vec, res); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn invert_label_test() { let enc = build_fake_str_enc(); @@ -262,6 +267,7 @@ mod tests { }; } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn test_many_categorys() { let enc = build_fake_str_enc(); diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 068f773..55df584 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -159,6 +159,7 @@ impl> Kernel for SigmoidKernel { mod tests { use super::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn linear_kernel() { let v1 = vec![1., 2., 3.]; @@ -167,6 +168,7 @@ mod tests { assert_eq!(32f64, Kernels::linear().apply(&v1, &v2)); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn rbf_kernel() { let v1 = vec![1., 2., 3.]; @@ -175,6 +177,7 @@ mod tests { assert!((0.2265f64 - Kernels::rbf(0.055).apply(&v1, &v2)).abs() < 1e-4); } 
+ #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn polynomial_kernel() { let v1 = vec![1., 2., 3.]; @@ -186,6 +189,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn sigmoid_kernel() { let v1 = vec![1., 2., 3.]; diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 8870c41..9c141e5 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -729,6 +729,7 @@ mod tests { #[cfg(feature = "serde")] use crate::svm::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn svc_fit_predict() { let x = DenseMatrix::from_2d_array(&[ @@ -771,6 +772,7 @@ mod tests { assert!(accuracy(&y_hat, &y) >= 0.9); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn svc_fit_predict_rbf() { let x = DenseMatrix::from_2d_array(&[ @@ -814,6 +816,7 @@ mod tests { assert!(accuracy(&y_hat, &y) >= 0.9); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn svc_serde() { diff --git a/src/svm/svr.rs b/src/svm/svr.rs index b160cca..455e51f 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -536,6 +536,7 @@ mod tests { #[cfg(feature = "serde")] use crate::svm::*; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn svr_fit_predict() { let x = DenseMatrix::from_2d_array(&[ @@ -569,6 +570,7 @@ mod tests { assert!(mean_squared_error(&y_hat, &y) < 2.5); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn svr_serde() { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index ba79d52..5ba096e 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -640,6 +640,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn gini_impurity() { assert!( @@ -656,6 +657,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn fit_predict_iris() { let x = DenseMatrix::from_2d_array(&[ @@ -708,6 +710,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn fit_predict_baloons() { let x = DenseMatrix::from_2d_array(&[ @@ -744,6 +747,7 @@ mod tests { ); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 307d357..21a6d00 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -506,6 +506,7 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] fn fit_longley() { let x = DenseMatrix::from_2d_array(&[ @@ -580,6 +581,7 @@ mod tests { } } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] fn serde() { From 790979a26de103f8cc15f4271781aecad004707d Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 28 Apr 2021 20:00:24 +0000 Subject: [PATCH 60/81] build(deps): update rand requirement from 0.7.3 to 0.8.3 Updates the requirements on [rand](https://github.com/rust-random/rand) to permit the latest version. 
- [Release notes](https://github.com/rust-random/rand/releases) - [Changelog](https://github.com/rust-random/rand/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-random/rand/compare/0.7.3...0.8.3) Signed-off-by: dependabot-preview[bot] --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ef99307..c2b6825 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ ndarray = { version = "0.14", optional = true } nalgebra = { version = "0.23.0", optional = true } num-traits = "0.2.12" num = "0.3.0" -rand = "0.7.3" +rand = "0.8.3" rand_distr = "0.3.0" serde = { version = "1.0.115", features = ["derive"], optional = true } From 703dc9688b1061959911b637e7f6f2c8ec89192b Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 28 Apr 2021 20:00:23 +0000 Subject: [PATCH 61/81] build(deps): update rand_distr requirement from 0.3.0 to 0.4.0 Updates the requirements on [rand_distr](https://github.com/rust-random/rand) to permit the latest version. - [Release notes](https://github.com/rust-random/rand/releases) - [Changelog](https://github.com/rust-random/rand/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-random/rand/compare/rand_distr-0.3.0...rand_distr-0.4.0) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c2b6825..61e922c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,7 @@ nalgebra = { version = "0.23.0", optional = true } num-traits = "0.2.12" num = "0.3.0" rand = "0.8.3" -rand_distr = "0.3.0" +rand_distr = "0.4.0" serde = { version = "1.0.115", features = ["derive"], optional = true } [target.'cfg(target_arch = "wasm32")'.dependencies] From c295a0d1bb1a8aeb18aa14f400c97ffac0a75bbf Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 28 Apr 2021 16:28:43 -0400 Subject: [PATCH 62/81] fix: fix code to be compatible with rand 0.8, following the recommendations of https://rust-random.github.io/book/update-0.8.html and https://docs.rs/getrandom/0.2.2/getrandom/#webassembly-support --- Cargo.toml | 2 +- src/cluster/kmeans.rs | 2 +- src/ensemble/random_forest_classifier.rs | 2 +- src/ensemble/random_forest_regressor.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 61e922c..f1e805c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ rand_distr = "0.4.0" serde = { version = "1.0.115", features = ["derive"], optional = true } [target.'cfg(target_arch = "wasm32")'.dependencies] -rand = { version = "0.7.3", features = ["wasm-bindgen"] } +getrandom = { version = "0.2", features = ["js"] } [dev-dependencies] criterion = "0.3" diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 69f40db..fd43a14 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -245,7 +245,7 @@ impl KMeans { let mut rng = rand::thread_rng(); let (n, m) = data.shape(); let mut y = vec![0; n]; - let mut centroid = data.get_row_as_vec(rng.gen_range(0, n)); + let mut centroid = data.get_row_as_vec(rng.gen_range(0..n)); let mut d = vec![T::max_value(); n]; diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 5d509c0..1d7884b 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -265,7 +265,7 @@ impl RandomForestClassifier { let size = ((n_samples as f64) / *class_weight_l) as usize; for _ in 0..size { - let xi: usize = rng.gen_range(0, 
n_samples); + let xi: usize = rng.gen_range(0..n_samples); samples[index[xi]] += 1; } } diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 82e299b..0351fc4 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -218,7 +218,7 @@ impl RandomForestRegressor { let mut rng = rand::thread_rng(); let mut samples = vec![0; nrows]; for _ in 0..nrows { - let xi = rng.gen_range(0, nrows); + let xi = rng.gen_range(0..nrows); samples[xi] += 1; } samples From 9ce448379ae9150c495f12ebb9c8937ed0f3195d Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 28 Apr 2021 16:58:15 -0400 Subject: [PATCH 63/81] docs: create changelog (#102) Co-authored-by: Luis Moreno --- CHANGELOG.md | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ade6825 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,60 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## Added +- L2 regularization penalty to the Logistic Regression +- Getters for the naive bayes structs +- One hot encoder +- Make moons data generator +- Support for WASM. + +## Changed +- Make serde optional + +## [0.2.0] - 2021-01-03 + +### Added +- DBSCAN +- Epsilon-SVR, SVC +- Ridge, Lasso, ElasticNet +- Bernoulli, Gaussian, Categorical and Multinomial Naive Bayes +- K-fold Cross Validation +- Singular value decomposition +- New api module +- Integration with Clippy +- Cholesky decomposition + +### Changed +- ndarray upgraded to 0.14 +- smartcore::error:FailedError is now non-exhaustive +- K-Means +- PCA +- Random Forest +- Linear and Logistic Regression +- KNN +- Decision Tree + +## [0.1.0] - 2020-09-25 + +### Added +- First release of smartcore. +- KNN + distance metrics (Euclidian, Minkowski, Manhattan, Hamming, Mahalanobis) +- Linear Regression (OLS) +- Logistic Regression +- Random Forest Classifier +- Decision Tree Classifier +- PCA +- K-Means +- Integrated with ndarray +- Abstract linear algebra methods +- RandomForest Regressor +- Decision Tree Regressor +- Serde integration +- Integrated with nalgebra +- LU, QR, SVD, EVD +- Evaluation Metrics From 5a2e1f12627f73795ee182d71961e1bd07512f70 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 28 Apr 2021 21:41:48 +0000 Subject: [PATCH 64/81] build(deps): update ndarray requirement from 0.14 to 0.15 Updates the requirements on [ndarray](https://github.com/rust-ndarray/ndarray) to permit the latest version. 
- [Release notes](https://github.com/rust-ndarray/ndarray/releases) - [Changelog](https://github.com/rust-ndarray/ndarray/blob/master/RELEASES.md) - [Commits](https://github.com/rust-ndarray/ndarray/compare/ndarray-rand-0.14.0...0.15.1) Signed-off-by: dependabot-preview[bot] --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f1e805c..4d37c80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ nalgebra-bindings = ["nalgebra"] datasets = [] [dependencies] -ndarray = { version = "0.14", optional = true } +ndarray = { version = "0.15", optional = true } nalgebra = { version = "0.23.0", optional = true } num-traits = "0.2.12" num = "0.3.0" From 513d3898c91c9220725ff189d1211d3faea53911 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 28 Apr 2021 21:44:02 +0000 Subject: [PATCH 65/81] build(deps): update num requirement from 0.3.0 to 0.4.0 Updates the requirements on [num](https://github.com/rust-num/num) to permit the latest version. - [Release notes](https://github.com/rust-num/num/releases) - [Changelog](https://github.com/rust-num/num/blob/master/RELEASES.md) - [Commits](https://github.com/rust-num/num/compare/num-0.3.0...num-0.4.0) Signed-off-by: dependabot-preview[bot] --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4d37c80..f662d5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ datasets = [] ndarray = { version = "0.15", optional = true } nalgebra = { version = "0.23.0", optional = true } num-traits = "0.2.12" -num = "0.3.0" +num = "0.4.0" rand = "0.8.3" rand_distr = "0.4.0" serde = { version = "1.0.115", features = ["derive"], optional = true } From 436d0a089f93e8500f583312630a9d9509f2468c Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Thu, 29 Apr 2021 16:13:20 +0000 Subject: [PATCH 66/81] Upgrade to GitHub-native Dependabot --- .github/dependabot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..95f9250 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: +- package-ecosystem: cargo + directory: "/" + schedule: + interval: daily + open-pull-requests-limit: 10 + ignore: + - dependency-name: rand_distr + versions: + - 0.4.0 From 763a8370ebf4fbc513ac3ff6c032f877ac8fcad2 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sat, 5 Jun 2021 00:25:34 -0400 Subject: [PATCH 67/81] docs: fix documentation of naive bayes structs --- src/naive_bayes/bernoulli.rs | 3 ++- src/naive_bayes/gaussian.rs | 5 +++-- src/naive_bayes/multinomial.rs | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 69eb13c..95c4d36 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -249,7 +249,8 @@ impl BernoulliNBDistribution { } } -/// BernoulliNB implements the categorical naive Bayes algorithm for categorically distributed data. +/// BernoulliNB implements the naive Bayes algorithm for data that follows the Bernoulli +/// distribution. 
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, PartialEq)] pub struct BernoulliNB> { diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index b84e65f..bd23919 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -33,7 +33,7 @@ use crate::naive_bayes::{BaseNaiveBayes, NBDistribution}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -/// Naive Bayes classifier for categorical features +/// Naive Bayes classifier using Gaussian distribution #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, PartialEq)] struct GaussianNBDistribution { @@ -179,7 +179,8 @@ impl GaussianNBDistribution { } } -/// GaussianNB implements the categorical naive Bayes algorithm for categorically distributed data. +/// GaussianNB implements the naive Bayes algorithm for data that follows the Gaussian +/// distribution. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, PartialEq)] pub struct GaussianNB> { diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 43a022a..f42b99e 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -212,7 +212,7 @@ impl MultinomialNBDistribution { } } -/// MultinomialNB implements the categorical naive Bayes algorithm for categorically distributed data. +/// MultinomialNB implements the naive Bayes algorithm for multinomially distributed data. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, PartialEq)] pub struct MultinomialNB> { From 0b3bf946dfc40153b910e61a05de6dd90c95b6d7 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Sat, 5 Jun 2021 01:00:38 -0400 Subject: [PATCH 68/81] chore: fix clippy warnings --- src/linalg/evd.rs | 2 +- src/linalg/ndarray_bindings.rs | 2 +- src/naive_bayes/categorical.rs | 2 +- src/preprocessing/series_encoder.rs | 6 ++---- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/linalg/evd.rs b/src/linalg/evd.rs index 78b6cc2..81e2315 100644 --- a/src/linalg/evd.rs +++ b/src/linalg/evd.rs @@ -93,7 +93,7 @@ pub trait EVDDecomposableMatrix: BaseMatrix { sort(&mut d, &mut e, &mut V); } - Ok(EVD { V, d, e }) + Ok(EVD { d, e, V }) } } diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 0aa97aa..e081dcc 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -966,7 +966,7 @@ mod tests { let error: f64 = y .into_iter() .zip(y_hat.into_iter()) - .map(|(&a, &b)| (a - b).abs()) + .map(|(a, b)| (a - b).abs()) .sum(); assert!(error <= 1.0); diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index 51619b6..44e5dd9 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -232,8 +232,8 @@ impl CategoricalNBDistribution { class_labels, class_priors, coefficients, - n_categories, n_features, + n_categories, category_count, }) } diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 2cd4133..ab99b08 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -134,10 +134,8 @@ where U: RealNumber, V: BaseVector, { - match self.get_num(category) { - None => None, - Some(&idx) => Some(make_one_hot::(idx, self.num_categories)), - } + self.get_num(category) + .map(|&idx| make_one_hot::(idx, self.num_categories)) } /// Invert one-hot vector, back to the category From e8cba343ca83ea2af06b0d0af97a3c538b17084f Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: 
Thu, 14 Oct 2021 09:33:55 +0200 Subject: [PATCH 69/81] Initial implementation of predict_oob. --- src/ensemble/random_forest_classifier.rs | 59 +++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 1d7884b..b3c810a 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -53,7 +53,7 @@ use rand::Rng; use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; -use crate::error::Failed; +use crate::error::{Failed, FailedError}; use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::tree::decision_tree_classifier::{ @@ -77,6 +77,8 @@ pub struct RandomForestClassifierParameters { pub n_trees: u16, /// Number of random sample of predictors to use as split candidates. pub m: Option, + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub keep_samples: bool, } /// Random Forest Classifier @@ -86,6 +88,7 @@ pub struct RandomForestClassifier { parameters: RandomForestClassifierParameters, trees: Vec>, classes: Vec, + samples: Option>>, } impl RandomForestClassifierParameters { @@ -119,6 +122,12 @@ impl RandomForestClassifierParameters { self.m = Some(m); self } + + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub fn with_keep_samples(mut self, keep_samples: bool) -> Self { + self.keep_samples = keep_samples; + self + } } impl PartialEq for RandomForestClassifier { @@ -150,6 +159,7 @@ impl Default for RandomForestClassifierParameters { min_samples_split: 2, n_trees: 100, m: Option::None, + keep_samples: false, } } } @@ -205,8 +215,17 @@ impl RandomForestClassifier { let k = classes.len(); let mut trees: Vec> = Vec::new(); + let mut maybe_all_samples: Option>> = Option::None; + if parameters.keep_samples { + maybe_all_samples = Some(Vec::new()); + } + for _ in 0..parameters.n_trees { let samples = RandomForestClassifier::::sample_with_replacement(&yi, k); + if let Some(ref mut all_samples) = maybe_all_samples { + all_samples.push(samples.iter().map(|x| *x != 0).collect()) + } + let params = DecisionTreeClassifierParameters { criterion: parameters.criterion.clone(), max_depth: parameters.max_depth, @@ -221,6 +240,7 @@ impl RandomForestClassifier { parameters, trees, classes, + samples: maybe_all_samples, }) } @@ -248,6 +268,42 @@ impl RandomForestClassifier { which_max(&result) } + /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. 
+ pub fn predict_oob>(&self, x: &M) -> Result { + let (n, _) = x.shape(); + if self.samples.is_none() { + Err(Failed::because( + FailedError::PredictFailed, + "Need samples=true for OOB predictions.", + )) + } else if self.samples.as_ref().unwrap()[0].len() != n { + Err(Failed::because( + FailedError::PredictFailed, + "Prediction matrix must match matrix used in training for OOB predictions.", + )) + } else { + let mut result = M::zeros(self.classes.len(), 1); + + for i in 0..n { + result.set(0, i, self.classes[self.predict_for_row_oob(x, i)]); + } + + Ok(result.to_row_vector()) + } + } + + fn predict_for_row_oob>(&self, x: &M, row: usize) -> usize { + let mut result = vec![0; self.classes.len()]; + + for (tree, samples) in self.trees.iter().zip(self.samples.as_ref().unwrap()) { + if !samples[row] { + result[tree.predict_for_row(x, row)] += 1; + } + } + + which_max(&result) + } + fn sample_with_replacement(y: &[usize], num_classes: usize) -> Vec { let mut rng = rand::thread_rng(); let class_weight = vec![1.; num_classes]; @@ -318,6 +374,7 @@ mod tests { min_samples_split: 2, n_trees: 100, m: Option::None, + keep_samples: false, }, ) .unwrap(); From 4bae62ab2f7776f56363c472a4257ee0c069fee7 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 14 Oct 2021 09:47:00 +0200 Subject: [PATCH 70/81] Test. --- src/ensemble/random_forest_classifier.rs | 51 +++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index b3c810a..f70604c 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -282,7 +282,7 @@ impl RandomForestClassifier { "Prediction matrix must match matrix used in training for OOB predictions.", )) } else { - let mut result = M::zeros(self.classes.len(), 1); + let mut result = M::zeros(1, n); for i in 0..n { result.set(0, i, self.classes[self.predict_for_row_oob(x, i)]); @@ -382,6 +382,55 @@ mod tests { assert!(accuracy(&y, &classifier.predict(&x).unwrap()) >= 0.95); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[test] + fn fit_predict_iris_oob() { + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let classifier = RandomForestClassifier::fit( + &x, + &y, + RandomForestClassifierParameters { + criterion: SplitCriterion::Gini, + max_depth: None, + min_samples_leaf: 1, + min_samples_split: 2, + n_trees: 100, + m: Option::None, + keep_samples: true, + }, + ) + .unwrap(); + assert!( + accuracy(&y, &classifier.predict_oob(&x).unwrap()) + < accuracy(&y, &classifier.predict(&x).unwrap()) + ); + } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] From d23931496758792938a1eebc989be760fc66c9b8 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 14 Oct 2021 09:59:26 +0200 Subject: [PATCH 71/81] Same for regressor. 
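Patches 69-71 together add out-of-bag (OOB) estimation: every tree trains on a bootstrap sample, so the rows a given tree never drew form a natural held-out set for that tree, and aggregating only those trees' votes row by row yields an almost-free generalization estimate. A minimal sketch of the aggregation step, using hypothetical names rather than smartcore's types (`in_bag[t][i]` marks whether tree `t` drew row `i`):

// Illustration only -- a hypothetical helper, not smartcore's API.
fn oob_average(predictions: &[Vec<f64>], in_bag: &[Vec<bool>]) -> Vec<Option<f64>> {
    let n = predictions[0].len();
    (0..n)
        .map(|i| {
            // Only trees that never drew row i may vote on it.
            let (mut sum, mut count) = (0.0, 0usize);
            for (preds, mask) in predictions.iter().zip(in_bag) {
                if !mask[i] {
                    sum += preds[i];
                    count += 1;
                }
            }
            // A row drawn by every tree has no OOB trees at all; returning
            // None makes that case explicit (the TODO in the regressor
            // patch below flags the same open question).
            (count > 0).then(|| sum / count as f64)
        })
        .collect()
}
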
--- src/ensemble/random_forest_regressor.rs | 108 +++++++++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 0351fc4..f1caa8e 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -51,7 +51,7 @@ use rand::Rng; use serde::{Deserialize, Serialize}; use crate::api::{Predictor, SupervisedEstimator}; -use crate::error::Failed; +use crate::error::{Failed, FailedError}; use crate::linalg::Matrix; use crate::math::num::RealNumber; use crate::tree::decision_tree_regressor::{ @@ -73,6 +73,8 @@ pub struct RandomForestRegressorParameters { pub n_trees: usize, /// Number of random sample of predictors to use as split candidates. pub m: Option, + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub keep_samples: bool, } /// Random Forest Regressor @@ -81,6 +83,7 @@ pub struct RandomForestRegressorParameters { pub struct RandomForestRegressor { parameters: RandomForestRegressorParameters, trees: Vec>, + samples: Option>>, } impl RandomForestRegressorParameters { @@ -109,6 +112,12 @@ impl RandomForestRegressorParameters { self.m = Some(m); self } + + /// Whether to keep samples used for tree generation. This is required for OOB prediction. + pub fn with_keep_samples(mut self, keep_samples: bool) -> Self { + self.keep_samples = keep_samples; + self + } } impl Default for RandomForestRegressorParameters { @@ -119,6 +128,7 @@ impl Default for RandomForestRegressorParameters { min_samples_split: 2, n_trees: 10, m: Option::None, + keep_samples: false, } } } @@ -174,8 +184,16 @@ impl RandomForestRegressor { let mut trees: Vec> = Vec::new(); + let mut maybe_all_samples: Option>> = Option::None; + if parameters.keep_samples { + maybe_all_samples = Some(Vec::new()); + } + for _ in 0..parameters.n_trees { let samples = RandomForestRegressor::::sample_with_replacement(n_rows); + if let Some(ref mut all_samples) = maybe_all_samples { + all_samples.push(samples.iter().map(|x| *x != 0).collect()) + } let params = DecisionTreeRegressorParameters { max_depth: parameters.max_depth, min_samples_leaf: parameters.min_samples_leaf, @@ -185,7 +203,7 @@ impl RandomForestRegressor { trees.push(tree); } - Ok(RandomForestRegressor { parameters, trees }) + Ok(RandomForestRegressor { parameters, trees, samples: maybe_all_samples }) } /// Predict class for `x` @@ -214,6 +232,46 @@ impl RandomForestRegressor { result / T::from(n_trees).unwrap() } + + /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. + pub fn predict_oob>(&self, x: &M) -> Result { + let (n, _) = x.shape(); + if self.samples.is_none() { + Err(Failed::because( + FailedError::PredictFailed, + "Need samples=true for OOB predictions.", + )) + } else if self.samples.as_ref().unwrap()[0].len() != n { + Err(Failed::because( + FailedError::PredictFailed, + "Prediction matrix must match matrix used in training for OOB predictions.", + )) + } else { + let mut result = M::zeros(1, n); + + for i in 0..n { + result.set(0, i, self.predict_for_row_oob(x, i)); + } + + Ok(result.to_row_vector()) + } + } + + fn predict_for_row_oob>(&self, x: &M, row: usize) -> T { + let mut n_trees = 0; + let mut result = T::zero(); + + for (tree, samples) in self.trees.iter().zip(self.samples.as_ref().unwrap()) { + if !samples[row] { + result += tree.predict_for_row(x, row); + n_trees += 1; + } + } + + // TODO: What to do if there are no oob trees? 
+ result / T::from(n_trees).unwrap() + } + fn sample_with_replacement(nrows: usize) -> Vec { let mut rng = rand::thread_rng(); let mut samples = vec![0; nrows]; @@ -266,6 +324,7 @@ mod tests { min_samples_split: 2, n_trees: 1000, m: Option::None, + keep_samples: false, }, ) .and_then(|rf| rf.predict(&x)) @@ -274,6 +333,51 @@ mod tests { assert!(mean_absolute_error(&y, &y_hat) < 1.0); } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] + #[test] + fn fit_predict_longley_oob() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let regressor = RandomForestRegressor::fit( + &x, + &y, + RandomForestRegressorParameters { + max_depth: None, + min_samples_leaf: 1, + min_samples_split: 2, + n_trees: 1000, + m: Option::None, + keep_samples: true, + }, + ).unwrap(); + + let y_hat = regressor.predict(&x).unwrap(); + let y_hat_oob = regressor.predict_oob(&x).unwrap(); + + assert!(mean_absolute_error(&y, &y_hat) < mean_absolute_error(&y, &y_hat_oob)); + } + #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)] #[test] #[cfg(feature = "serde")] From 85b9fde9a78c6bc3c9b8e2899240fc77a7d5adf4 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 20 Oct 2021 17:04:24 +0200 Subject: [PATCH 72/81] Another format. --- src/ensemble/random_forest_regressor.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index f1caa8e..90ac479 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -203,7 +203,11 @@ impl RandomForestRegressor { trees.push(tree); } - Ok(RandomForestRegressor { parameters, trees, samples: maybe_all_samples }) + Ok(RandomForestRegressor { + parameters, + trees, + samples: maybe_all_samples, + }) } /// Predict class for `x` @@ -232,7 +236,6 @@ impl RandomForestRegressor { result / T::from(n_trees).unwrap() } - /// Predict OOB classes for `x`. `x` is expected to be equal to the dataset used in training. pub fn predict_oob>(&self, x: &M) -> Result { let (n, _) = x.shape(); @@ -370,7 +373,8 @@ mod tests { m: Option::None, keep_samples: true, }, - ).unwrap(); + ) + .unwrap(); let y_hat = regressor.predict(&x).unwrap(); let y_hat_oob = regressor.predict_oob(&x).unwrap(); From d0a4ccbe202263ff2103891cbf163a691befe594 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 20 Oct 2021 17:08:52 +0200 Subject: [PATCH 73/81] Set keep_samples attribute. 
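This one-line patch exists because the test in `ndarray_bindings.rs` spells out every field of `RandomForestRegressorParameters`, so the literal went stale the moment `keep_samples` was added. Since the parameter structs implement `Default`, struct-update syntax would absorb such additions; a self-contained sketch with a hypothetical `Params` stand-in, not smartcore's type:

#[derive(Debug, Default)]
struct Params {
    n_trees: usize,
    keep_samples: bool,
    seed: u64,
}

fn main() {
    // Override only what the test cares about; fields added later with
    // sensible defaults no longer break every literal in the test suite.
    let params = Params { n_trees: 1000, ..Default::default() };
    assert!(!params.keep_samples);
    assert_eq!(params.seed, 0);
}
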
--- src/linalg/ndarray_bindings.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 0aa97aa..2ec8e3a 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -1007,6 +1007,7 @@ mod tests { min_samples_split: 2, n_trees: 1000, m: Option::None, + keep_samples: false, }, ) .unwrap() From 14245e15ad2452232052f56b22959a6aaa89af2a Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 20 Oct 2021 17:13:00 +0200 Subject: [PATCH 74/81] type error. --- src/linalg/ndarray_bindings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 2ec8e3a..f5b1c69 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -966,7 +966,7 @@ mod tests { let error: f64 = y .into_iter() .zip(y_hat.into_iter()) - .map(|(&a, &b)| (a - b).abs()) + .map(|(a, b)| (a - b).abs()) .sum(); assert!(error <= 1.0); From 4397c91570f1c7e8afb4bff346a4a1b8505216c4 Mon Sep 17 00:00:00 2001 From: Luis Moreno Date: Wed, 20 Oct 2021 13:50:14 -0500 Subject: [PATCH 75/81] Fix clippy warnings --- src/algorithm/neighbour/cover_tree.rs | 10 +++++----- src/algorithm/neighbour/linear_search.rs | 4 ++-- src/algorithm/sort/heap_select.rs | 3 +-- src/lib.rs | 2 +- src/linalg/naive/dense_matrix.rs | 2 +- src/linalg/ndarray_bindings.rs | 6 +++--- src/optimization/first_order/gradient_descent.rs | 6 +++--- src/optimization/first_order/lbfgs.rs | 6 +++--- src/preprocessing/series_encoder.rs | 6 ++---- src/svm/svc.rs | 4 ++-- src/tree/decision_tree_classifier.rs | 4 ++-- src/tree/decision_tree_regressor.rs | 2 +- 12 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index e8fc937..8fb8b7d 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -117,7 +117,7 @@ impl> CoverTree } let e = self.get_data_value(self.root.idx); - let mut d = self.distance.distance(&e, p); + let mut d = self.distance.distance(e, p); let mut current_cover_set: Vec<(F, &Node)> = Vec::new(); let mut zero_set: Vec<(F, &Node)> = Vec::new(); @@ -175,7 +175,7 @@ impl> CoverTree if ds.0 <= upper_bound { let v = self.get_data_value(ds.1.idx); if !self.identical_excluded || v != p { - neighbors.push((ds.1.idx, ds.0, &v)); + neighbors.push((ds.1.idx, ds.0, v)); } } } @@ -200,7 +200,7 @@ impl> CoverTree let mut zero_set: Vec<(F, &Node)> = Vec::new(); let e = self.get_data_value(self.root.idx); - let mut d = self.distance.distance(&e, p); + let mut d = self.distance.distance(e, p); current_cover_set.push((d, &self.root)); while !current_cover_set.is_empty() { @@ -230,7 +230,7 @@ impl> CoverTree for ds in zero_set { let v = self.get_data_value(ds.1.idx); if !self.identical_excluded || v != p { - neighbors.push((ds.1.idx, ds.0, &v)); + neighbors.push((ds.1.idx, ds.0, v)); } } @@ -287,7 +287,7 @@ impl> CoverTree if point_set.is_empty() { self.new_leaf(p) } else { - let max_dist = self.max(&point_set); + let max_dist = self.max(point_set); let next_scale = (max_scale - 1).min(self.get_scale(max_dist)); if next_scale == std::i64::MIN { let mut children: Vec> = Vec::new(); diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index fd8cc6a..e2a1b6d 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -74,7 +74,7 @@ impl> LinearKNNSearch { } for i in 
0..self.data.len() { - let d = self.distance.distance(&from, &self.data[i]); + let d = self.distance.distance(from, &self.data[i]); let datum = heap.peek_mut(); if d < datum.distance { datum.distance = d; @@ -104,7 +104,7 @@ impl> LinearKNNSearch { let mut neighbors: Vec<(usize, F, &T)> = Vec::new(); for i in 0..self.data.len() { - let d = self.distance.distance(&from, &self.data[i]); + let d = self.distance.distance(from, &self.data[i]); if d <= radius { neighbors.push((i, d, &self.data[i])); diff --git a/src/algorithm/sort/heap_select.rs b/src/algorithm/sort/heap_select.rs index 86a74ac..beb698f 100644 --- a/src/algorithm/sort/heap_select.rs +++ b/src/algorithm/sort/heap_select.rs @@ -53,8 +53,7 @@ impl<'a, T: PartialOrd + Debug> HeapSelection { if self.sorted { &self.heap[0] } else { - &self - .heap + self.heap .iter() .max_by(|a, b| a.partial_cmp(b).unwrap()) .unwrap() diff --git a/src/lib.rs b/src/lib.rs index c117039..2edada4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ clippy::upper_case_acronyms )] #![warn(missing_docs)] -#![warn(missing_doc_code_examples)] +#![warn(rustdoc::missing_doc_code_examples)] //! # SmartCore //! diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index ae68015..9866618 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -330,7 +330,7 @@ impl DenseMatrix { cur_r: 0, max_c: self.ncols, max_r: self.nrows, - m: &self, + m: self, } } } diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 0aa97aa..2e70e8d 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -178,7 +178,7 @@ impl BaseVector for ArrayBase, Ix } fn copy_from(&mut self, other: &Self) { - self.assign(&other); + self.assign(other); } } @@ -385,7 +385,7 @@ impl &Self { @@ -966,7 +966,7 @@ mod tests { let error: f64 = y .into_iter() .zip(y_hat.into_iter()) - .map(|(&a, &b)| (a - b).abs()) + .map(|(a, b)| (a - b).abs()) .sum(); assert!(error <= 1.0); diff --git a/src/optimization/first_order/gradient_descent.rs b/src/optimization/first_order/gradient_descent.rs index aba48a5..a936ae4 100644 --- a/src/optimization/first_order/gradient_descent.rs +++ b/src/optimization/first_order/gradient_descent.rs @@ -50,14 +50,14 @@ impl FirstOrderOptimizer for GradientDescent { let f_alpha = |alpha: T| -> T { let mut dx = step.clone(); dx.mul_scalar_mut(alpha); - f(&dx.add_mut(&x)) // f(x) = f(x .+ gvec .* alpha) + f(dx.add_mut(&x)) // f(x) = f(x .+ gvec .* alpha) }; let df_alpha = |alpha: T| -> T { let mut dx = step.clone(); let mut dg = gvec.clone(); dx.mul_scalar_mut(alpha); - df(&mut dg, &dx.add_mut(&x)); //df(x) = df(x .+ gvec .* alpha) + df(&mut dg, dx.add_mut(&x)); //df(x) = df(x .+ gvec .* alpha) gvec.dot(&dg) }; @@ -66,7 +66,7 @@ impl FirstOrderOptimizer for GradientDescent { let ls_r = ls.search(&f_alpha, &df_alpha, alpha, fx, df0); alpha = ls_r.alpha; fx = ls_r.f_x; - x.add_mut(&step.mul_scalar_mut(alpha)); + x.add_mut(step.mul_scalar_mut(alpha)); df(&mut gvec, &x); gnorm = gvec.norm2(); } diff --git a/src/optimization/first_order/lbfgs.rs b/src/optimization/first_order/lbfgs.rs index aaf2c89..6c0b89b 100644 --- a/src/optimization/first_order/lbfgs.rs +++ b/src/optimization/first_order/lbfgs.rs @@ -117,14 +117,14 @@ impl LBFGS { let f_alpha = |alpha: T| -> T { let mut dx = state.s.clone(); dx.mul_scalar_mut(alpha); - f(&dx.add_mut(&state.x)) // f(x) = f(x .+ gvec .* alpha) + f(dx.add_mut(&state.x)) // f(x) = f(x .+ gvec .* alpha) }; let df_alpha = |alpha: T| -> T { let 
mut dx = state.s.clone(); let mut dg = state.x_df.clone(); dx.mul_scalar_mut(alpha); - df(&mut dg, &dx.add_mut(&state.x)); //df(x) = df(x .+ gvec .* alpha) + df(&mut dg, dx.add_mut(&state.x)); //df(x) = df(x .+ gvec .* alpha) state.x_df.dot(&dg) }; @@ -206,7 +206,7 @@ impl FirstOrderOptimizer for LBFGS { ) -> OptimizerResult { let mut state = self.init_state(x0); - df(&mut state.x_df, &x0); + df(&mut state.x_df, x0); let g_converged = state.x_df.norm(T::infinity()) < self.g_atol; let mut converged = g_converged; diff --git a/src/preprocessing/series_encoder.rs b/src/preprocessing/series_encoder.rs index 2cd4133..ab99b08 100644 --- a/src/preprocessing/series_encoder.rs +++ b/src/preprocessing/series_encoder.rs @@ -134,10 +134,8 @@ where U: RealNumber, V: BaseVector, { - match self.get_num(category) { - None => None, - Some(&idx) => Some(make_one_hot::(idx, self.num_categories)), - } + self.get_num(category) + .map(|&idx| make_one_hot::(idx, self.num_categories)) } /// Invert one-hot vector, back to the category diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9c141e5..b12e558 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -377,7 +377,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, Optimizer { x, y, - parameters: ¶meters, + parameters, svmin: 0, svmax: 0, gmin: T::max_value(), @@ -589,7 +589,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, for i in 0..self.sv.len() { let v = &self.sv[i]; let z = v.grad - gm; - let k = cache.get(sv1, &v); + let k = cache.get(sv1, v); let mut curv = km + v.k - T::two() * k; if curv <= T::zero() { curv = self.tau; diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 5ba096e..200fee5 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -380,7 +380,7 @@ impl DecisionTreeClassifier { depth: 0, }; - let mut visitor = NodeVisitor::::new(0, samples, &order, &x, &yi, 1); + let mut visitor = NodeVisitor::::new(0, samples, &order, x, &yi, 1); let mut visitor_queue: LinkedList> = LinkedList::new(); @@ -541,7 +541,7 @@ impl DecisionTreeClassifier { - T::from(tc).unwrap() / T::from(n).unwrap() * impurity(&self.parameters.criterion, &true_count, tc) - T::from(fc).unwrap() / T::from(n).unwrap() - * impurity(&self.parameters.criterion, &false_count, fc); + * impurity(&self.parameters.criterion, false_count, fc); if self.nodes[visitor.node].split_score == Option::None || gain > self.nodes[visitor.node].split_score.unwrap() diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 21a6d00..6a0705f 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -280,7 +280,7 @@ impl DecisionTreeRegressor { depth: 0, }; - let mut visitor = NodeVisitor::::new(0, samples, &order, &x, &y_m, 1); + let mut visitor = NodeVisitor::::new(0, samples, &order, x, &y_m, 1); let mut visitor_queue: LinkedList> = LinkedList::new(); From 12c102d02b11413608bdeb231e2fbd145a3d7d6b Mon Sep 17 00:00:00 2001 From: Malte Londschien <61679398+mlondschien@users.noreply.github.com> Date: Thu, 11 Nov 2021 01:51:24 +0100 Subject: [PATCH 76/81] Allow setting seed for `RandomForestClassifier` and `Regressor` (#120) * Seed for the classifier. * Seed for the regressor. * Forgot one. * typo. 
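The change threads a single `StdRng`, seeded once per fit, through bootstrap sampling and feature shuffling instead of pulling `rand::thread_rng()` at every call site, which is what makes a seeded forest reproducible end to end. A small sketch of the guarantee being bought, using the same rand 0.8 APIs the patch relies on (including the `gen_range(0..n)` range syntax adopted in the earlier migration):

use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};

fn main() {
    // Two generators given the same seed replay identical draws, so every
    // bootstrap sample and feature shuffle derived from them matches too.
    let mut a = StdRng::seed_from_u64(87);
    let mut b = StdRng::seed_from_u64(87);
    let draws_a: Vec<usize> = (0..5).map(|_| a.gen_range(0..16)).collect();
    let draws_b: Vec<usize> = (0..5).map(|_| b.gen_range(0..16)).collect();
    assert_eq!(draws_a, draws_b);
}
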
--- src/ensemble/random_forest_classifier.rs | 24 +++++++++++++++++++----- src/ensemble/random_forest_regressor.rs | 24 ++++++++++++++++++------ src/linalg/ndarray_bindings.rs | 1 + src/tree/decision_tree_classifier.rs | 23 +++++++++++++++++------ src/tree/decision_tree_regressor.rs | 23 +++++++++++++++++------ 5 files changed, 72 insertions(+), 23 deletions(-) diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index f70604c..5cebced 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -45,10 +45,11 @@ //! //! //! +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use std::default::Default; use std::fmt::Debug; -use rand::Rng; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -79,6 +80,8 @@ pub struct RandomForestClassifierParameters { pub m: Option, /// Whether to keep samples used for tree generation. This is required for OOB prediction. pub keep_samples: bool, + /// Seed used for bootstrap sampling and feature selection for each tree. + pub seed: u64, } /// Random Forest Classifier @@ -128,6 +131,12 @@ impl RandomForestClassifierParameters { self.keep_samples = keep_samples; self } + + /// Seed used for bootstrap sampling and feature selection for each tree. + pub fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } } impl PartialEq for RandomForestClassifier { @@ -160,6 +169,7 @@ impl Default for RandomForestClassifierParameters { n_trees: 100, m: Option::None, keep_samples: false, + seed: 0, } } } @@ -211,6 +221,7 @@ impl RandomForestClassifier { .unwrap() }); + let mut rng = StdRng::seed_from_u64(parameters.seed); let classes = y_m.unique(); let k = classes.len(); let mut trees: Vec> = Vec::new(); @@ -221,7 +232,7 @@ impl RandomForestClassifier { } for _ in 0..parameters.n_trees { - let samples = RandomForestClassifier::::sample_with_replacement(&yi, k); + let samples = RandomForestClassifier::::sample_with_replacement(&yi, k, &mut rng); if let Some(ref mut all_samples) = maybe_all_samples { all_samples.push(samples.iter().map(|x| *x != 0).collect()) } @@ -232,7 +243,8 @@ impl RandomForestClassifier { min_samples_leaf: parameters.min_samples_leaf, min_samples_split: parameters.min_samples_split, }; - let tree = DecisionTreeClassifier::fit_weak_learner(x, y, samples, mtry, params)?; + let tree = + DecisionTreeClassifier::fit_weak_learner(x, y, samples, mtry, params, &mut rng)?; trees.push(tree); } @@ -304,8 +316,7 @@ impl RandomForestClassifier { which_max(&result) } - fn sample_with_replacement(y: &[usize], num_classes: usize) -> Vec { - let mut rng = rand::thread_rng(); + fn sample_with_replacement(y: &[usize], num_classes: usize, rng: &mut impl Rng) -> Vec { let class_weight = vec![1.; num_classes]; let nrows = y.len(); let mut samples = vec![0; nrows]; @@ -375,6 +386,7 @@ mod tests { n_trees: 100, m: Option::None, keep_samples: false, + seed: 87, }, ) .unwrap(); @@ -422,9 +434,11 @@ mod tests { n_trees: 100, m: Option::None, keep_samples: true, + seed: 87, }, ) .unwrap(); + assert!( accuracy(&y, &classifier.predict_oob(&x).unwrap()) < accuracy(&y, &classifier.predict(&x).unwrap()) diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 90ac479..c923cd8 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -43,10 +43,11 @@ //! //! 
+use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use std::default::Default; use std::fmt::Debug; -use rand::Rng; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -75,6 +76,8 @@ pub struct RandomForestRegressorParameters { pub m: Option, /// Whether to keep samples used for tree generation. This is required for OOB prediction. pub keep_samples: bool, + /// Seed used for bootstrap sampling and feature selection for each tree. + pub seed: u64, } /// Random Forest Regressor @@ -118,8 +121,13 @@ impl RandomForestRegressorParameters { self.keep_samples = keep_samples; self } -} + /// Seed used for bootstrap sampling and feature selection for each tree. + pub fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } +} impl Default for RandomForestRegressorParameters { fn default() -> Self { RandomForestRegressorParameters { @@ -129,6 +137,7 @@ impl Default for RandomForestRegressorParameters { n_trees: 10, m: Option::None, keep_samples: false, + seed: 0, } } } @@ -182,6 +191,7 @@ impl RandomForestRegressor { .m .unwrap_or((num_attributes as f64).sqrt().floor() as usize); + let mut rng = StdRng::seed_from_u64(parameters.seed); let mut trees: Vec> = Vec::new(); let mut maybe_all_samples: Option>> = Option::None; @@ -190,7 +200,7 @@ impl RandomForestRegressor { } for _ in 0..parameters.n_trees { - let samples = RandomForestRegressor::::sample_with_replacement(n_rows); + let samples = RandomForestRegressor::::sample_with_replacement(n_rows, &mut rng); if let Some(ref mut all_samples) = maybe_all_samples { all_samples.push(samples.iter().map(|x| *x != 0).collect()) } @@ -199,7 +209,8 @@ impl RandomForestRegressor { min_samples_leaf: parameters.min_samples_leaf, min_samples_split: parameters.min_samples_split, }; - let tree = DecisionTreeRegressor::fit_weak_learner(x, y, samples, mtry, params)?; + let tree = + DecisionTreeRegressor::fit_weak_learner(x, y, samples, mtry, params, &mut rng)?; trees.push(tree); } @@ -275,8 +286,7 @@ impl RandomForestRegressor { result / T::from(n_trees).unwrap() } - fn sample_with_replacement(nrows: usize) -> Vec { - let mut rng = rand::thread_rng(); + fn sample_with_replacement(nrows: usize, rng: &mut impl Rng) -> Vec { let mut samples = vec![0; nrows]; for _ in 0..nrows { let xi = rng.gen_range(0..nrows); @@ -328,6 +338,7 @@ mod tests { n_trees: 1000, m: Option::None, keep_samples: false, + seed: 87, }, ) .and_then(|rf| rf.predict(&x)) @@ -372,6 +383,7 @@ mod tests { n_trees: 1000, m: Option::None, keep_samples: true, + seed: 87, }, ) .unwrap(); diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 091aaaf..99e0918 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -1008,6 +1008,7 @@ mod tests { n_trees: 1000, m: Option::None, keep_samples: false, + seed: 0, }, ) .unwrap() diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 200fee5..751d5d1 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -68,6 +68,7 @@ use std::fmt::Debug; use std::marker::PhantomData; use rand::seq::SliceRandom; +use rand::Rng; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -328,7 +329,14 @@ impl DecisionTreeClassifier { ) -> Result, Failed> { let (x_nrows, num_attributes) = x.shape(); let samples = vec![1; x_nrows]; - DecisionTreeClassifier::fit_weak_learner(x, y, samples, num_attributes, parameters) + DecisionTreeClassifier::fit_weak_learner( + x, + y, + samples, + num_attributes, + 
parameters, + &mut rand::thread_rng(), + ) } pub(crate) fn fit_weak_learner>( @@ -337,6 +345,7 @@ impl DecisionTreeClassifier { samples: Vec, mtry: usize, parameters: DecisionTreeClassifierParameters, + rng: &mut impl Rng, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (_, y_ncols) = y_m.shape(); @@ -384,13 +393,13 @@ impl DecisionTreeClassifier { let mut visitor_queue: LinkedList> = LinkedList::new(); - if tree.find_best_cutoff(&mut visitor, mtry) { + if tree.find_best_cutoff(&mut visitor, mtry, rng) { visitor_queue.push_back(visitor); } while tree.depth < tree.parameters.max_depth.unwrap_or(std::u16::MAX) { match visitor_queue.pop_front() { - Some(node) => tree.split(node, mtry, &mut visitor_queue), + Some(node) => tree.split(node, mtry, &mut visitor_queue, rng), None => break, }; } @@ -443,6 +452,7 @@ impl DecisionTreeClassifier { &mut self, visitor: &mut NodeVisitor<'_, T, M>, mtry: usize, + rng: &mut impl Rng, ) -> bool { let (n_rows, n_attr) = visitor.x.shape(); @@ -482,7 +492,7 @@ impl DecisionTreeClassifier { let mut variables = (0..n_attr).collect::>(); if mtry < n_attr { - variables.shuffle(&mut rand::thread_rng()); + variables.shuffle(rng); } for variable in variables.iter().take(mtry) { @@ -566,6 +576,7 @@ impl DecisionTreeClassifier { mut visitor: NodeVisitor<'a, T, M>, mtry: usize, visitor_queue: &mut LinkedList>, + rng: &mut impl Rng, ) -> bool { let (n, _) = visitor.x.shape(); let mut tc = 0; @@ -614,7 +625,7 @@ impl DecisionTreeClassifier { visitor.level + 1, ); - if self.find_best_cutoff(&mut true_visitor, mtry) { + if self.find_best_cutoff(&mut true_visitor, mtry, rng) { visitor_queue.push_back(true_visitor); } @@ -627,7 +638,7 @@ impl DecisionTreeClassifier { visitor.level + 1, ); - if self.find_best_cutoff(&mut false_visitor, mtry) { + if self.find_best_cutoff(&mut false_visitor, mtry, rng) { visitor_queue.push_back(false_visitor); } diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 6a0705f..34f58a9 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -63,6 +63,7 @@ use std::default::Default; use std::fmt::Debug; use rand::seq::SliceRandom; +use rand::Rng; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -242,7 +243,14 @@ impl DecisionTreeRegressor { ) -> Result, Failed> { let (x_nrows, num_attributes) = x.shape(); let samples = vec![1; x_nrows]; - DecisionTreeRegressor::fit_weak_learner(x, y, samples, num_attributes, parameters) + DecisionTreeRegressor::fit_weak_learner( + x, + y, + samples, + num_attributes, + parameters, + &mut rand::thread_rng(), + ) } pub(crate) fn fit_weak_learner>( @@ -251,6 +259,7 @@ impl DecisionTreeRegressor { samples: Vec, mtry: usize, parameters: DecisionTreeRegressorParameters, + rng: &mut impl Rng, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -284,13 +293,13 @@ impl DecisionTreeRegressor { let mut visitor_queue: LinkedList> = LinkedList::new(); - if tree.find_best_cutoff(&mut visitor, mtry) { + if tree.find_best_cutoff(&mut visitor, mtry, rng) { visitor_queue.push_back(visitor); } while tree.depth < tree.parameters.max_depth.unwrap_or(std::u16::MAX) { match visitor_queue.pop_front() { - Some(node) => tree.split(node, mtry, &mut visitor_queue), + Some(node) => tree.split(node, mtry, &mut visitor_queue, rng), None => break, }; } @@ -343,6 +352,7 @@ impl DecisionTreeRegressor { &mut self, visitor: &mut NodeVisitor<'_, T, M>, mtry: usize, + rng: &mut impl Rng, ) -> bool { let (_, n_attr) = 
visitor.x.shape();
@@ -357,7 +367,7 @@ impl DecisionTreeRegressor {
 let mut variables = (0..n_attr).collect::>();
 if mtry < n_attr {
- variables.shuffle(&mut rand::thread_rng());
+ variables.shuffle(rng);
 }
 let parent_gain =
@@ -432,6 +442,7 @@ impl DecisionTreeRegressor {
 mut visitor: NodeVisitor<'a, T, M>,
 mtry: usize,
 visitor_queue: &mut LinkedList>,
+ rng: &mut impl Rng,
 ) -> bool {
 let (n, _) = visitor.x.shape();
 let mut tc = 0;
@@ -480,7 +491,7 @@ impl DecisionTreeRegressor {
 visitor.level + 1,
 );
- if self.find_best_cutoff(&mut true_visitor, mtry) {
+ if self.find_best_cutoff(&mut true_visitor, mtry, rng) {
 visitor_queue.push_back(true_visitor);
 }
@@ -493,7 +504,7 @@ impl DecisionTreeRegressor {
 visitor.level + 1,
 );
- if self.find_best_cutoff(&mut false_visitor, mtry) {
+ if self.find_best_cutoff(&mut false_visitor, mtry, rng) {
 visitor_queue.push_back(false_visitor);
 }

From f93286ffbd8179a8b848c31b5cd25335b28ff0c2 Mon Sep 17 00:00:00 2001
From: Kiran Eiden
Date: Sun, 2 Jan 2022 19:46:21 -0800
Subject: [PATCH 77/81] Fix bug in cover tree KNN algorithm

Prior to this change, the find function implementation for the CoverTree struct could return the wrong result when multiple points in the dataset were equidistant from p. For example, the current test passed for k=3 but failed to produce the correct result for k=4 (it claimed that 3, 4, 5, and 7 were the 4 closest points to 5 in the dataset rather than 3, 4, 5, and 6). Sorting the neighbors vector before collecting the first k values from it resolved this issue.

---
 src/algorithm/neighbour/cover_tree.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs
index 8fb8b7d..355a448 100644
--- a/src/algorithm/neighbour/cover_tree.rs
+++ b/src/algorithm/neighbour/cover_tree.rs
@@ -179,7 +179,8 @@ impl> CoverTree
 }
 }
 }
-
+
+ neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
 Ok(neighbors.into_iter().take(k).collect())
 }

From 389b0e8e6725d82940e7eca54798798e87ac429e Mon Sep 17 00:00:00 2001
From: Kiran Eiden
Date: Tue, 4 Jan 2022 14:50:47 -0800
Subject: [PATCH 78/81] Only sort in CoverTree::find function if there are more than k points

Sorting only needs to be done if the list of KNN candidates is longer than k.
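In isolation, the corrected selection logic reduces to the sketch below. It is a minimal stand-in, not the library's actual code: hypothetical (index, distance) tuples replace the tree's internal node types, and the data mirrors the k=4 failure described in the previous message.

    fn k_nearest(mut neighbors: Vec<(usize, f64)>, k: usize) -> Vec<(usize, f64)> {
        // Sort candidates by ascending distance so that `take(k)` keeps the k
        // closest points rather than the first k found during tree traversal.
        // When there are at most k candidates, all of them are returned
        // anyway, so the sort can be skipped.
        if neighbors.len() > k {
            neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        }
        neighbors.into_iter().take(k).collect()
    }

    fn main() {
        // Candidates in traversal order: the close point 6 (distance 1.0) is
        // discovered after the farther, mutually equidistant points 3 and 7.
        let candidates = vec![(5, 0.0), (4, 1.0), (3, 2.0), (7, 2.0), (6, 1.0)];
        let ids: Vec<usize> = k_nearest(candidates, 4).iter().map(|n| n.0).collect();
        // Without the sort, take(4) would return 5, 4, 3, 7 -- the wrong
        // result described above; with it, the closer point 6 makes the cut.
        assert_eq!(ids, vec![5, 4, 6, 3]);
    }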
---
 src/algorithm/neighbour/cover_tree.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs
index 355a448..ad2b071 100644
--- a/src/algorithm/neighbour/cover_tree.rs
+++ b/src/algorithm/neighbour/cover_tree.rs
@@ -180,7 +180,9 @@ impl> CoverTree
 }
 }
- neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+ if neighbors.len() > k {
+ neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+ }
 Ok(neighbors.into_iter().take(k).collect())
 }

From 820201e92079aa70f71ee780f504cc72817c6b60 Mon Sep 17 00:00:00 2001
From: morenol <22335041+morenol@users.noreply.github.com>
Date: Thu, 5 May 2022 10:39:18 -0400
Subject: [PATCH 79/81] Solve conflict with num-traits (#130)

* Solve conflict with num-traits

* Fix clippy warnings

Co-authored-by: Luis Moreno
---
 src/algorithm/neighbour/cover_tree.rs | 10 +++++-----
 src/cluster/dbscan.rs | 4 ++--
 src/cluster/kmeans.rs | 10 +++++-----
 src/dataset/mod.rs | 4 ++--
 src/ensemble/random_forest_classifier.rs | 4 ++--
 src/ensemble/random_forest_regressor.rs | 4 ++--
 src/linalg/cholesky.rs | 7 +++----
 src/linalg/evd.rs | 19 +++++++------------
 src/linalg/lu.rs | 6 +++---
 src/linalg/mod.rs | 9 ++++-----
 src/linalg/naive/dense_matrix.rs | 1 -
 src/linalg/svd.rs | 12 ++++++------
 src/linear/linear_regression.rs | 6 +++---
 src/linear/logistic_regression.rs | 2 +-
 src/linear/ridge_regression.rs | 4 ++--
 src/naive_bayes/categorical.rs | 2 +-
 src/optimization/first_order/lbfgs.rs | 1 +
 src/optimization/mod.rs | 1 +
 src/preprocessing/categorical.rs | 3 +--
 src/svm/svc.rs | 2 +-
 src/svm/svr.rs | 2 +-
 src/tree/decision_tree_classifier.rs | 6 +++---
 src/tree/decision_tree_regressor.rs | 4 ++--
 23 files changed, 58 insertions(+), 65 deletions(-)

diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs
index 8fb8b7d..e329220 100644
--- a/src/algorithm/neighbour/cover_tree.rs
+++ b/src/algorithm/neighbour/cover_tree.rs
@@ -65,7 +65,7 @@ struct Node {
 max_dist: F,
 parent_dist: F,
 children: Vec>,
- scale: i64,
+ _scale: i64,
 }
 #[derive(Debug)]
@@ -85,7 +85,7 @@ impl> CoverTree
 max_dist: F::zero(),
 parent_dist: F::zero(),
 children: Vec::new(),
- scale: 0,
+ _scale: 0,
 };
 let mut tree = CoverTree {
 base,
@@ -243,7 +243,7 @@ impl> CoverTree
 max_dist: F::zero(),
 parent_dist: F::zero(),
 children: Vec::new(),
- scale: 100,
+ _scale: 100,
 }
 }
@@ -304,7 +304,7 @@ impl> CoverTree
 max_dist: F::zero(),
 parent_dist: F::zero(),
 children,
- scale: 100,
+ _scale: 100,
 }
 } else {
 let mut far: Vec> = Vec::new();
@@ -373,7 +373,7 @@ impl> CoverTree
 max_dist: self.max(consumed_set),
 parent_dist: F::zero(),
 children,
- scale: (top_scale - max_scale),
+ _scale: (top_scale - max_scale),
 }
 }
 }

diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs
index b1231c3..7f2baef 100644
--- a/src/cluster/dbscan.rs
+++ b/src/cluster/dbscan.rs
@@ -155,11 +155,11 @@ impl, T>> DBSCAN {
 parameters: DBSCANParameters,
 ) -> Result, Failed> {
 if parameters.min_samples < 1 {
- return Err(Failed::fit(&"Invalid minPts".to_string()));
+ return Err(Failed::fit("Invalid minPts"));
 }
 if parameters.eps <= T::zero() {
- return Err(Failed::fit(&"Invalid radius: ".to_string()));
+ return Err(Failed::fit("Invalid radius: "));
 }
 let mut k = 0;

diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs
index fd43a14..05af680 100644
--- a/src/cluster/kmeans.rs
+++ b/src/cluster/kmeans.rs
@@ -71,9 +71,9 @@ use crate::math::num::RealNumber;
 #[derive(Debug)]
 pub struct KMeans {
 k: usize,
- y: Vec,
+ _y: 
Vec, size: Vec, - distortion: T, + _distortion: T, centroids: Vec>, } @@ -208,9 +208,9 @@ impl KMeans { Ok(KMeans { k: parameters.k, - y, + _y: y, size, - distortion, + _distortion: distortion, centroids, }) } @@ -344,7 +344,7 @@ mod tests { let y = kmeans.predict(&x).unwrap(); for i in 0..y.len() { - assert_eq!(y[i] as usize, kmeans.y[i]); + assert_eq!(y[i] as usize, kmeans._y[i]); } } diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 5fe4c45..acd7641 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -67,14 +67,14 @@ pub(crate) fn serialize_data( .data .iter() .copied() - .flat_map(|f| f.to_f32_bits().to_le_bytes().to_vec().into_iter()) + .flat_map(|f| f.to_f32_bits().to_le_bytes().to_vec()) .collect(); file.write_all(&x)?; let y: Vec = dataset .target .iter() .copied() - .flat_map(|f| f.to_f32_bits().to_le_bytes().to_vec().into_iter()) + .flat_map(|f| f.to_f32_bits().to_le_bytes().to_vec()) .collect(); file.write_all(&y)?; } diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 5cebced..247b502 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -88,7 +88,7 @@ pub struct RandomForestClassifierParameters { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] pub struct RandomForestClassifier { - parameters: RandomForestClassifierParameters, + _parameters: RandomForestClassifierParameters, trees: Vec>, classes: Vec, samples: Option>>, @@ -249,7 +249,7 @@ impl RandomForestClassifier { } Ok(RandomForestClassifier { - parameters, + _parameters: parameters, trees, classes, samples: maybe_all_samples, diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index c923cd8..08a7dcc 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -84,7 +84,7 @@ pub struct RandomForestRegressorParameters { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] pub struct RandomForestRegressor { - parameters: RandomForestRegressorParameters, + _parameters: RandomForestRegressorParameters, trees: Vec>, samples: Option>>, } @@ -215,7 +215,7 @@ impl RandomForestRegressor { } Ok(RandomForestRegressor { - parameters, + _parameters: parameters, trees, samples: maybe_all_samples, }) diff --git a/src/linalg/cholesky.rs b/src/linalg/cholesky.rs index 053cbfa..9b5b9cc 100644 --- a/src/linalg/cholesky.rs +++ b/src/linalg/cholesky.rs @@ -87,8 +87,7 @@ impl> Cholesky { if bn != rn { return Err(Failed::because( FailedError::SolutionFailed, - &"Can\'t solve Ax = b for x. Number of rows in b != number of rows in R." - .to_string(), + "Can\'t solve Ax = b for x. 
Number of rows in b != number of rows in R.", )); } @@ -128,7 +127,7 @@ pub trait CholeskyDecomposableMatrix: BaseMatrix { if m != n { return Err(Failed::because( FailedError::DecompositionFailed, - &"Can\'t do Cholesky decomposition on a non-square matrix".to_string(), + "Can\'t do Cholesky decomposition on a non-square matrix", )); } @@ -148,7 +147,7 @@ pub trait CholeskyDecomposableMatrix: BaseMatrix { if d < T::zero() { return Err(Failed::because( FailedError::DecompositionFailed, - &"The matrix is not positive definite.".to_string(), + "The matrix is not positive definite.", )); } diff --git a/src/linalg/evd.rs b/src/linalg/evd.rs index 81e2315..bf195a0 100644 --- a/src/linalg/evd.rs +++ b/src/linalg/evd.rs @@ -97,7 +97,7 @@ pub trait EVDDecomposableMatrix: BaseMatrix { } } -fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec) { +fn tred2>(V: &mut M, d: &mut [T], e: &mut [T]) { let (n, _) = V.shape(); for (i, d_i) in d.iter_mut().enumerate().take(n) { *d_i = V.get(n - 1, i); @@ -195,7 +195,7 @@ fn tred2>(V: &mut M, d: &mut Vec, e: &mut Vec e[0] = T::zero(); } -fn tql2>(V: &mut M, d: &mut Vec, e: &mut Vec) { +fn tql2>(V: &mut M, d: &mut [T], e: &mut [T]) { let (n, _) = V.shape(); for i in 1..n { e[i - 1] = e[i]; @@ -419,7 +419,7 @@ fn eltran>(A: &M, V: &mut M, perm: &[usize]) { } } -fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e: &mut Vec) { +fn hqr2>(A: &mut M, V: &mut M, d: &mut [T], e: &mut [T]) { let (n, _) = A.shape(); let mut z = T::zero(); let mut s = T::zero(); @@ -471,7 +471,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e A.set(nn, nn, x); A.set(nn - 1, nn - 1, y + t); if q >= T::zero() { - z = p + z.copysign(p); + z = p + RealNumber::copysign(z, p); d[nn - 1] = x + z; d[nn] = x + z; if z != T::zero() { @@ -570,7 +570,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e r /= x; } } - let s = (p * p + q * q + r * r).sqrt().copysign(p); + let s = RealNumber::copysign((p * p + q * q + r * r).sqrt(), p); if s != T::zero() { if k == m { if l != m { @@ -594,12 +594,7 @@ fn hqr2>(A: &mut M, V: &mut M, d: &mut Vec, e A.sub_element_mut(k + 1, j, p * y); A.sub_element_mut(k, j, p * x); } - let mmin; - if nn < k + 3 { - mmin = nn; - } else { - mmin = k + 3; - } + let mmin = if nn < k + 3 { nn } else { k + 3 }; for i in 0..mmin + 1 { p = x * A.get(i, k) + y * A.get(i, k + 1); if k + 1 != nn { @@ -783,7 +778,7 @@ fn balbak>(V: &mut M, scale: &[T]) { } } -fn sort>(d: &mut Vec, e: &mut Vec, V: &mut M) { +fn sort>(d: &mut [T], e: &mut [T], V: &mut M) { let n = d.len(); let mut temp = vec![T::zero(); n]; for j in 1..n { diff --git a/src/linalg/lu.rs b/src/linalg/lu.rs index 72d6079..cb001af 100644 --- a/src/linalg/lu.rs +++ b/src/linalg/lu.rs @@ -46,13 +46,13 @@ use crate::math::num::RealNumber; pub struct LU> { LU: M, pivot: Vec, - pivot_sign: i8, + _pivot_sign: i8, singular: bool, phantom: PhantomData, } impl> LU { - pub(crate) fn new(LU: M, pivot: Vec, pivot_sign: i8) -> LU { + pub(crate) fn new(LU: M, pivot: Vec, _pivot_sign: i8) -> LU { let (_, n) = LU.shape(); let mut singular = false; @@ -66,7 +66,7 @@ impl> LU { LU { LU, pivot, - pivot_sign, + _pivot_sign, singular, phantom: PhantomData, } diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index d2d2212..59b6089 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -689,12 +689,11 @@ impl<'a, T: RealNumber, M: BaseMatrix> Iterator for RowIter<'a, T, M> { type Item = Vec; fn next(&mut self) -> Option> { - let res; - if self.pos < self.max_pos { - res = Some(self.m.get_row_as_vec(self.pos)) + let res = if self.pos < self.max_pos { + 
Some(self.m.get_row_as_vec(self.pos)) } else { - res = None - } + None + }; self.pos += 1; res } diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index 9866618..1af926c 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -523,7 +523,6 @@ impl PartialEq for DenseMatrix { true } } - impl From> for Vec { fn from(dense_matrix: DenseMatrix) -> Vec { dense_matrix.values diff --git a/src/linalg/svd.rs b/src/linalg/svd.rs index 3746071..97d85ca 100644 --- a/src/linalg/svd.rs +++ b/src/linalg/svd.rs @@ -47,7 +47,7 @@ pub struct SVD> { pub V: M, /// Singular values of the original matrix pub s: Vec, - full: bool, + _full: bool, m: usize, n: usize, tol: T, @@ -116,7 +116,7 @@ pub trait SVDDecomposableMatrix: BaseMatrix { } let mut f = U.get(i, i); - g = -s.sqrt().copysign(f); + g = -RealNumber::copysign(s.sqrt(), f); let h = f * g - s; U.set(i, i, f - g); for j in l - 1..n { @@ -152,7 +152,7 @@ pub trait SVDDecomposableMatrix: BaseMatrix { } let f = U.get(i, l - 1); - g = -s.sqrt().copysign(f); + g = -RealNumber::copysign(s.sqrt(), f); let h = f * g - s; U.set(i, l - 1, f - g); @@ -299,7 +299,7 @@ pub trait SVDDecomposableMatrix: BaseMatrix { let mut h = rv1[k]; let mut f = ((y - z) * (y + z) + (g - h) * (g + h)) / (T::two() * h * y); g = f.hypot(T::one()); - f = ((x - z) * (x + z) + h * ((y / (f + g.copysign(f))) - h)) / x; + f = ((x - z) * (x + z) + h * ((y / (f + RealNumber::copysign(g, f))) - h)) / x; let mut c = T::one(); let mut s = T::one(); @@ -428,13 +428,13 @@ impl> SVD { pub(crate) fn new(U: M, V: M, s: Vec) -> SVD { let m = U.shape().0; let n = V.shape().0; - let full = s.len() == m.min(n); + let _full = s.len() == m.min(n); let tol = T::half() * (T::from(m + n).unwrap() + T::one()).sqrt() * s[0] * T::epsilon(); SVD { U, V, s, - full, + _full, m, n, tol, diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index a10b5ac..b1f7c51 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -94,7 +94,7 @@ pub struct LinearRegressionParameters { pub struct LinearRegression> { coefficients: M, intercept: T, - solver: LinearRegressionSolverName, + _solver: LinearRegressionSolverName, } impl LinearRegressionParameters { @@ -155,7 +155,7 @@ impl> LinearRegression { if x_nrows != y_nrows { return Err(Failed::fit( - &"Number of rows of X doesn\'t match number of rows of Y".to_string(), + "Number of rows of X doesn\'t match number of rows of Y", )); } @@ -171,7 +171,7 @@ impl> LinearRegression { Ok(LinearRegression { intercept: w.get(num_attributes, 0), coefficients: wights, - solver: parameters.solver, + _solver: parameters.solver, }) } diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index ad2cdb3..1a20077 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -321,7 +321,7 @@ impl> LogisticRegression { if x_nrows != y_nrows { return Err(Failed::fit( - &"Number of rows of X doesn\'t match number of rows of Y".to_string(), + "Number of rows of X doesn\'t match number of rows of Y", )); } diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 94ac700..ecad250 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -96,7 +96,7 @@ pub struct RidgeRegressionParameters { pub struct RidgeRegression> { coefficients: M, intercept: T, - solver: RidgeRegressionSolverName, + _solver: RidgeRegressionSolverName, } impl RidgeRegressionParameters { @@ -226,7 +226,7 @@ 
impl> RidgeRegression { Ok(RidgeRegression { intercept: b, coefficients: w, - solver: parameters.solver, + _solver: parameters.solver, }) } diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index 44e5dd9..8706702 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -161,7 +161,7 @@ impl CategoricalNBDistribution { let y_max = y .iter() .max() - .ok_or_else(|| Failed::fit(&"Failed to get the labels of y.".to_string()))?; + .ok_or_else(|| Failed::fit("Failed to get the labels of y."))?; let class_labels: Vec = (0..*y_max + 1) .map(|label| T::from(label).unwrap()) diff --git a/src/optimization/first_order/lbfgs.rs b/src/optimization/first_order/lbfgs.rs index 6c0b89b..1b3bfde 100644 --- a/src/optimization/first_order/lbfgs.rs +++ b/src/optimization/first_order/lbfgs.rs @@ -8,6 +8,7 @@ use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult}; use crate::optimization::line_search::LineSearchMethod; use crate::optimization::{DF, F}; +#[allow(clippy::upper_case_acronyms)] pub struct LBFGS { pub max_iter: usize, pub g_rtol: T, diff --git a/src/optimization/mod.rs b/src/optimization/mod.rs index e5e58d1..b0be9d6 100644 --- a/src/optimization/mod.rs +++ b/src/optimization/mod.rs @@ -4,6 +4,7 @@ pub mod line_search; pub type F<'a, T, X> = dyn for<'b> Fn(&'b X) -> T + 'a; pub type DF<'a, X> = dyn for<'b> Fn(&'b mut X, &'b X) + 'a; +#[allow(clippy::upper_case_acronyms)] #[derive(Debug, PartialEq)] pub enum FunctionOrder { SECOND, diff --git a/src/preprocessing/categorical.rs b/src/preprocessing/categorical.rs index adc85a6..478e706 100644 --- a/src/preprocessing/categorical.rs +++ b/src/preprocessing/categorical.rs @@ -78,8 +78,7 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) -> .zip( repeats .zip(offset) - .map(|(r, o)| iter::repeat(o).take(r)) - .flatten(), + .flat_map(|(r, o)| iter::repeat(o).take(r)), ) .map(|(idx, ofst)| idx + ofst) .collect(); diff --git a/src/svm/svc.rs b/src/svm/svc.rs index b12e558..7432b9c 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -222,7 +222,7 @@ impl, K: Kernel> SVC { if n != y.len() { return Err(Failed::fit( - &"Number of rows of X doesn\'t match number of rows of Y".to_string(), + "Number of rows of X doesn\'t match number of rows of Y", )); } diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 455e51f..3257111 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -212,7 +212,7 @@ impl, K: Kernel> SVR { if n != y.len() { return Err(Failed::fit( - &"Number of rows of X doesn\'t match number of rows of Y".to_string(), + "Number of rows of X doesn\'t match number of rows of Y", )); } diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 751d5d1..d86f59a 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -118,7 +118,7 @@ pub enum SplitCriterion { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] struct Node { - index: usize, + _index: usize, output: usize, split_feature: usize, split_value: Option, @@ -204,7 +204,7 @@ impl Default for DecisionTreeClassifierParameters { impl Node { fn new(index: usize, output: usize) -> Self { Node { - index, + _index: index, output, split_feature: 0, split_value: Option::None, @@ -514,7 +514,7 @@ impl DecisionTreeClassifier { visitor: &mut NodeVisitor<'_, T, M>, n: usize, count: &[usize], - false_count: &mut Vec, + false_count: &mut [usize], parent_impurity: T, j: usize, ) { diff --git 
a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 34f58a9..94fa0f8 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -97,7 +97,7 @@ pub struct DecisionTreeRegressor { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] struct Node { - index: usize, + _index: usize, output: T, split_feature: usize, split_value: Option, @@ -137,7 +137,7 @@ impl Default for DecisionTreeRegressorParameters { impl Node { fn new(index: usize, output: T) -> Self { Node { - index, + _index: index, output, split_feature: 0, split_value: Option::None, From 8297cbe67e56c91ee19d5d8cbeefdbcfd525a77b Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 9 May 2022 15:50:25 -0700 Subject: [PATCH 80/81] Fixes broken build --- src/algorithm/neighbour/cover_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 53ae286..5664acc 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -179,7 +179,7 @@ impl> CoverTree } } } - + if neighbors.len() > k { neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); } From 7ea620e6fd7b67a63c4f64ac49e94a00f4e6c51c Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 9 May 2022 16:03:05 -0700 Subject: [PATCH 81/81] Updates version to 0.2.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f662d5e..f83889e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "smartcore" description = "The most advanced machine learning library in rust." homepage = "https://smartcorelib.org" -version = "0.2.0" +version = "0.2.1" authors = ["SmartCore Developers"] edition = "2018" license = "Apache-2.0"
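To round out the series: the seed parameter introduced for the random forests earlier in these patches enables reproducible training. Below is a minimal usage sketch, not code from the patches themselves; it assumes the 0.2.x module paths and fit/predict signatures shown above, and the toy data and seed value are invented for illustration.

    use smartcore::ensemble::random_forest_regressor::{
        RandomForestRegressor, RandomForestRegressorParameters,
    };
    use smartcore::linalg::naive::dense_matrix::DenseMatrix;

    fn main() {
        // Toy regression data, for illustration only.
        let x = DenseMatrix::from_2d_array(&[
            &[1., 1.],
            &[2., 1.],
            &[3., 2.],
            &[4., 3.],
            &[5., 5.],
            &[6., 8.],
        ]);
        let y = vec![1., 2., 3., 4., 5., 6.];

        // The seed drives both bootstrap sampling and per-tree feature
        // selection, so two fits with identical parameters build identical
        // forests.
        let params = || RandomForestRegressorParameters::default().with_seed(42);
        let forest_a = RandomForestRegressor::fit(&x, &y, params()).unwrap();
        let forest_b = RandomForestRegressor::fit(&x, &y, params()).unwrap();
        assert_eq!(forest_a.predict(&x).unwrap(), forest_b.predict(&x).unwrap());
    }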