From 23b36997309e1cdd7ed3a82bfa7ec9b2b6ee0cdf Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Mon, 7 Nov 2022 12:48:44 +0000 Subject: [PATCH 01/21] Release 0.3 --- .github/DEVELOPERS.md | 5 ++- CHANGELOG.md | 7 +++-- Cargo.toml | 16 +++++++--- LICENSE | 2 +- README.md | 4 +-- smartcore.svg | 2 +- src/algorithm/neighbour/cover_tree.rs | 10 +++--- src/cluster/kmeans.rs | 6 ++-- src/dataset/mod.rs | 2 +- src/ensemble/mod.rs | 2 +- src/ensemble/random_forest_classifier.rs | 3 -- src/ensemble/random_forest_regressor.rs | 3 -- src/lib.rs | 39 ++++++++++++++++++------ src/linear/linear_regression.rs | 5 +-- src/linear/logistic_regression.rs | 2 +- src/linear/ridge_regression.rs | 5 +-- src/metrics/auc.rs | 2 +- src/metrics/mod.rs | 2 +- src/model_selection/mod.rs | 2 +- src/neighbors/knn_classifier.rs | 2 +- src/numbers/realnum.rs | 2 +- src/svm/mod.rs | 2 +- src/svm/svc.rs | 5 ++- src/svm/svr.rs | 2 -- src/tree/decision_tree_classifier.rs | 12 +++----- src/tree/decision_tree_regressor.rs | 14 +++------ src/tree/mod.rs | 2 +- 27 files changed, 83 insertions(+), 77 deletions(-) diff --git a/.github/DEVELOPERS.md b/.github/DEVELOPERS.md index 87c2506..b3a647b 100644 --- a/.github/DEVELOPERS.md +++ b/.github/DEVELOPERS.md @@ -1,4 +1,7 @@ -# Smartcore: Introduction to modules +# smartcore: Introduction to modules + +Important source of information: +* [Rust API guidelines](https://rust-lang.github.io/api-guidelines/about.html) ## Walkthrough: traits system and basic structures diff --git a/CHANGELOG.md b/CHANGELOG.md index a9dda10..6052e07 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,13 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.3] - 2022-11 ## Added +- WARNING: Breaking changes! 
- Seeds to multiple algorithims that depend on random number generation. - Added feature `js` to use WASM in browser - Drop `nalgebra-bindings` feature -- Complete refactoring with *extensive API changes* that includes: +- Complete refactoring with **extensive API changes** that includes: * moving to a new traits system, less structs more traits * adapting all the modules to the new traits system * moving towards Rust 2021, in particular the use of `dyn` and `as_ref` @@ -19,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## BREAKING CHANGE - Added a new parameter to `train_test_split` to define the seed. -## [0.2.1] - 2022-05-10 +## [0.2.1] - 2021-05-10 ## Added - L2 regularization penalty to the Logistic Regression diff --git a/Cargo.toml b/Cargo.toml index 0a23083..0c3adda 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "smartcore" -description = "The most advanced machine learning library in rust." +description = "Machine Learning in Rust." 
homepage = "https://smartcorelib.org" -version = "0.4.0" -authors = ["SmartCore Developers"] +version = "0.3.0" +authors = ["smartcore Developers"] edition = "2021" license = "Apache-2.0" documentation = "https://docs.rs/smartcore" @@ -11,6 +11,12 @@ repository = "https://github.com/smartcorelib/smartcore" readme = "README.md" keywords = ["machine-learning", "statistical", "ai", "optimization", "linear-algebra"] categories = ["science"] +exclude = [ + ".github", + ".gitignore", + "smartcore.iml", + "smartcore.svg", +] [dependencies] approx = "0.5.1" @@ -23,10 +29,10 @@ rand_distr = { version = "0.4", optional = true } serde = { version = "1", features = ["derive"], optional = true } [features] -default = ["serde", "datasets"] +default = [] serde = ["dep:serde"] ndarray-bindings = ["dep:ndarray"] -datasets = ["dep:rand_distr", "std"] +datasets = ["dep:rand_distr", "std", "serde"] std = ["rand/std_rng", "rand/std"] # wasm32 only js = ["getrandom/js"] diff --git a/LICENSE b/LICENSE index 3cd5786..9448cee 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2019-present at SmartCore developers (smartcorelib.org) + Copyright 2019-present at smartcore developers (smartcorelib.org) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index fd6f481..758a461 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

- SmartCore + smartcore

@@ -18,4 +18,4 @@ ----- [![CI](https://github.com/smartcorelib/smartcore/actions/workflows/ci.yml/badge.svg)](https://github.com/smartcorelib/smartcore/actions/workflows/ci.yml) -To start getting familiar with the new Smartcore v0.5 API, there is now available a [**Jupyter Notebook environment repository**](https://github.com/smartcorelib/smartcore-jupyter). Please see instructions there, contributions welcome see [CONTRIBUTING](.github/CONTRIBUTING.md). +To start getting familiar with the new smartcore v0.5 API, there is now available a [**Jupyter Notebook environment repository**](https://github.com/smartcorelib/smartcore-jupyter). Please see instructions there, contributions welcome see [CONTRIBUTING](.github/CONTRIBUTING.md). diff --git a/smartcore.svg b/smartcore.svg index 3e4c68d..eaffd58 100644 --- a/smartcore.svg +++ b/smartcore.svg @@ -76,5 +76,5 @@ y="81.876823" x="91.861809" id="tspan842" - sodipodi:role="line">SmartCore + sodipodi:role="line">smartcore diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index db062f9..011a9cc 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -64,7 +64,7 @@ struct Node { max_dist: f64, parent_dist: f64, children: Vec, - scale: i64, + _scale: i64, } #[derive(Debug)] @@ -84,7 +84,7 @@ impl> CoverTree { max_dist: 0f64, parent_dist: 0f64, children: Vec::new(), - scale: 0, + _scale: 0, }; let mut tree = CoverTree { base, @@ -245,7 +245,7 @@ impl> CoverTree { max_dist: 0f64, parent_dist: 0f64, children: Vec::new(), - scale: 100, + _scale: 100, } } @@ -306,7 +306,7 @@ impl> CoverTree { max_dist: 0f64, parent_dist: 0f64, children, - scale: 100, + _scale: 100, } } else { let mut far: Vec = Vec::new(); @@ -375,7 +375,7 @@ impl> CoverTree { max_dist: self.max(consumed_set), parent_dist: 0f64, children, - scale: (top_scale - max_scale), + _scale: (top_scale - max_scale), } } } diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 
9322d65..4384ddb 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -11,7 +11,7 @@ //! these re-calculated centroids becoming the new centers of their respective clusters. Next all instances of the training set are re-assigned to their closest cluster again. //! This iterative process continues until convergence is achieved and the clusters are considered settled. //! -//! Initial choice of K data points is very important and has big effect on performance of the algorithm. SmartCore uses k-means++ algorithm to initialize cluster centers. +//! Initial choice of K data points is very important and has big effect on performance of the algorithm. smartcore uses k-means++ algorithm to initialize cluster centers. //! //! Example: //! @@ -74,7 +74,7 @@ pub struct KMeans, Y: Array1> { k: usize, _y: Vec, size: Vec, - distortion: f64, + _distortion: f64, centroids: Vec>, _phantom_tx: PhantomData, _phantom_ty: PhantomData, @@ -313,7 +313,7 @@ impl, Y: Array1> KMeans k: parameters.k, _y: y, size, - distortion, + _distortion: distortion, centroids, _phantom_tx: PhantomData, _phantom_ty: PhantomData, diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 5b32d02..ac48bf8 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -1,6 +1,6 @@ //! Datasets //! -//! In this module you will find small datasets that are used in SmartCore mostly for demonstration purposes. +//! In this module you will find small datasets that are used in smartcore mostly for demonstration purposes. pub mod boston; pub mod breast_cancer; pub mod diabetes; diff --git a/src/ensemble/mod.rs b/src/ensemble/mod.rs index 1ddf4b4..161df96 100644 --- a/src/ensemble/mod.rs +++ b/src/ensemble/mod.rs @@ -7,7 +7,7 @@ //! set and then aggregate their individual predictions to form a final prediction. In classification setting the overall prediction is the most commonly //! occurring majority class among the individual predictions. //! -//! 
In SmartCore you will find implementation of RandomForest - a popular averaging algorithms based on randomized [decision trees](../tree/index.html). +//! In smartcore you will find implementation of RandomForest - a popular averaging algorithms based on randomized [decision trees](../tree/index.html). //! Random forests provide an improvement over bagged trees by way of a small tweak that decorrelates the trees. As in bagging, we build a number of //! decision trees on bootstrapped training samples. But when building these decision trees, each time a split in a tree is considered, //! a random sample of _m_ predictors is chosen as split candidates from the full set of _p_ predictors. diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index d01acef..3db103b 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -104,7 +104,6 @@ pub struct RandomForestClassifier< X: Array2, Y: Array1, > { - parameters: Option, trees: Option>>, classes: Option>, samples: Option>>, @@ -198,7 +197,6 @@ impl, Y: { fn new() -> Self { Self { - parameters: Option::None, trees: Option::None, classes: Option::None, samples: Option::None, @@ -501,7 +499,6 @@ impl, Y: Array1, Y: Array1, > { - parameters: Option, trees: Option>>, samples: Option>>, } @@ -177,7 +176,6 @@ impl, Y: Array1 { fn new() -> Self { Self { - parameters: Option::None, trees: Option::None, samples: Option::None, } @@ -434,7 +432,6 @@ impl, Y: Array1 } Ok(RandomForestRegressor { - parameters: Some(parameters), trees: Some(trees), samples: maybe_all_samples, }) diff --git a/src/lib.rs b/src/lib.rs index a955de2..8746dbf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,25 +8,38 @@ #![warn(missing_docs)] #![warn(rustdoc::missing_doc_code_examples)] -//! # SmartCore +//! # smartcore //! -//! Welcome to SmartCore, machine learning in Rust! +//! Welcome to smartcore, machine learning in Rust! //! -//! 
SmartCore features various classification, regression and clustering algorithms including support vector machines, random forests, k-means and DBSCAN, +//! `smartcore` features various classification, regression and clustering algorithms including support vector machines, random forests, k-means and DBSCAN, //! as well as tools for model selection and model evaluation. //! -//! SmartCore provides its own traits system that extends Rust standard library, to deal with linear algebra and common +//! `smartcore` provides its own traits system that extends Rust standard library, to deal with linear algebra and common //! computational models. Its API is designed using well recognizable patterns. Extra features (like support for [ndarray](https://docs.rs/ndarray) //! structures) is available via optional features. //! //! ## Getting Started //! -//! To start using SmartCore simply add the following to your Cargo.toml file: +//! To start using `smartcore` latest stable version simply add the following to your `Cargo.toml` file: +//! ```ignore +//! [dependencies] +//! smartcore = "*" +//! ``` +//! +//! To start using smartcore development version with latest unstable additions: //! ```ignore //! [dependencies] //! smartcore = { git = "https://github.com/smartcorelib/smartcore", branch = "development" } //! ``` //! +//! There are different features that can be added to the base library, for example to add sample datasets: +//! ```ignore +//! [dependencies] +//! smartcore = { git = "https://github.com/smartcorelib/smartcore", features = ["datasets"] } +//! ``` +//! Check `smartcore`'s `Cargo.toml` for available features. +//! //! ## Using Jupyter //! For quick introduction, Jupyter Notebooks are available [here](https://github.com/smartcorelib/smartcore-jupyter/tree/main/notebooks). //! You can set up a local environment to run Rust notebooks using [EVCXR](https://github.com/google/evcxr) @@ -37,7 +50,7 @@ //! 
For example, you can use this code to fit a [K Nearest Neighbors classifier](neighbors/knn_classifier/index.html) to a dataset that is defined as standard Rust vector: //! //! ``` -//! // DenseMatrix defenition +//! // DenseMatrix definition //! use smartcore::linalg::basic::matrix::DenseMatrix; //! // KNNClassifier //! use smartcore::neighbors::knn_classifier::*; @@ -62,7 +75,9 @@ //! ``` //! //! ## Overview -//! All machine learning algorithms in SmartCore are grouped into these broad categories: +//! +//! ### Supported algorithms +//! All machine learning algorithms are grouped into these broad categories: //! * [Clustering](cluster/index.html), unsupervised clustering of unlabeled data. //! * [Matrix Decomposition](decomposition/index.html), various methods for matrix decomposition. //! * [Linear Models](linear/index.html), regression and classification methods where output is assumed to have linear relation to explanatory variables @@ -71,11 +86,14 @@ //! * [Nearest Neighbors](neighbors/index.html), K Nearest Neighbors for classification and regression //! * [Naive Bayes](naive_bayes/index.html), statistical classification technique based on Bayes Theorem //! * [SVM](svm/index.html), support vector machines +//! +//! ### Linear Algebra traits system +//! 
For an introduction to `smartcore`'s traits system see [this notebook](https://github.com/smartcorelib/smartcore-jupyter/blob/5523993c53c6ec1fd72eea130ef4e7883121c1ea/notebooks/01-A-little-bit-about-numbers.ipynb) /// Foundamental numbers traits pub mod numbers; -/// Various algorithms and helper methods that are used elsewhere in SmartCore +/// Various algorithms and helper methods that are used elsewhere in smartcore pub mod algorithm; pub mod api; @@ -89,7 +107,7 @@ pub mod decomposition; /// Ensemble methods, including Random Forest classifier and regressor pub mod ensemble; pub mod error; -/// Diverse collection of linear algebra abstractions and methods that power SmartCore algorithms +/// Diverse collection of linear algebra abstractions and methods that power smartcore algorithms pub mod linalg; /// Supervised classification and regression models that assume linear relationship between dependent and explanatory variables. pub mod linear; @@ -105,7 +123,8 @@ pub mod neighbors; pub mod optimization; /// Preprocessing utilities pub mod preprocessing; -/// Reading in data from serialized foramts +/// Reading in data from serialized formats +#[cfg(feature = "serde")] pub mod readers; /// Support Vector Machines pub mod svm; diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 1f7d540..7f6dfad 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -12,7 +12,7 @@ //! \\[\hat{\beta} = (X^TX)^{-1}X^Ty \\] //! //! the \\((X^TX)^{-1}\\) term is both computationally expensive and numerically unstable. An alternative approach is to use a matrix decomposition to avoid this operation. -//! SmartCore uses [SVD](../../linalg/svd/index.html) and [QR](../../linalg/qr/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). +//! smartcore uses [SVD](../../linalg/svd/index.html) and [QR](../../linalg/qr/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). //! 
The QR decomposition is more computationally efficient and more numerically stable than calculating the normal equation directly, //! but does not work for all data matrices. Unlike the QR decomposition, all matrices have an SVD decomposition. //! @@ -113,7 +113,6 @@ pub struct LinearRegression< > { coefficients: Option, intercept: Option, - solver: LinearRegressionSolverName, _phantom_ty: PhantomData, _phantom_y: PhantomData, } @@ -210,7 +209,6 @@ impl< Self { coefficients: Option::None, intercept: Option::None, - solver: LinearRegressionParameters::default().solver, _phantom_ty: PhantomData, _phantom_y: PhantomData, } @@ -276,7 +274,6 @@ impl< Ok(LinearRegression { intercept: Some(*w.get((num_attributes, 0))), coefficients: Some(weights), - solver: parameters.solver, _phantom_ty: PhantomData, _phantom_y: PhantomData, }) diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 7dd269c..e8c08d8 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -5,7 +5,7 @@ //! //! \\[ Pr(y=1) \approx \frac{e^{\beta_0 + \sum_{i=1}^n \beta_iX_i}}{1 + e^{\beta_0 + \sum_{i=1}^n \beta_iX_i}} \\] //! -//! SmartCore uses [limited memory BFGS](https://en.wikipedia.org/wiki/Limited-memory_BFGS) method to find estimates of regression coefficients, \\(\beta\\) +//! smartcore uses [limited memory BFGS](https://en.wikipedia.org/wiki/Limited-memory_BFGS) method to find estimates of regression coefficients, \\(\beta\\) //! //! Example: //! diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 914afc2..e03948d 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -12,7 +12,7 @@ //! where \\(\alpha \geq 0\\) is a tuning parameter that controls strength of regularization. When \\(\alpha = 0\\) the penalty term has no effect, and ridge regression will produce the least squares estimates. //! 
However, as \\(\alpha \rightarrow \infty\\), the impact of the shrinkage penalty grows, and the ridge regression coefficient estimates will approach zero. //! -//! SmartCore uses [SVD](../../linalg/svd/index.html) and [Cholesky](../../linalg/cholesky/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). +//! smartcore uses [SVD](../../linalg/svd/index.html) and [Cholesky](../../linalg/cholesky/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). //! The Cholesky decomposition is more computationally efficient and more numerically stable than calculating the normal equation directly, //! but does not work for all data matrices. Unlike the Cholesky decomposition, all matrices have an SVD decomposition. //! @@ -197,7 +197,6 @@ pub struct RidgeRegression< > { coefficients: Option, intercept: Option, - solver: Option, _phantom_ty: PhantomData, _phantom_y: PhantomData, } @@ -259,7 +258,6 @@ impl< Self { coefficients: Option::None, intercept: Option::None, - solver: Option::None, _phantom_ty: PhantomData, _phantom_y: PhantomData, } @@ -367,7 +365,6 @@ impl< Ok(RidgeRegression { intercept: Some(b), coefficients: Some(w), - solver: Some(parameters.solver), _phantom_ty: PhantomData, _phantom_y: PhantomData, }) diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index ecaf646..5848fbc 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -2,7 +2,7 @@ //! Computes the area under the receiver operating characteristic (ROC) curve that is equal to the probability that a classifier will rank a //! randomly chosen positive instance higher than a randomly chosen negative one. //! -//! SmartCore calculates ROC AUC from Wilcoxon or Mann-Whitney U test. +//! smartcore calculates ROC AUC from Wilcoxon or Mann-Whitney U test. //! //! Example: //! ``` diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 06d44a1..40086af 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -4,7 +4,7 @@ //! 
In a feedback loop you build your model first, then you get feedback from metrics, improve it and repeat until your model achieve desirable performance. //! Evaluation metrics helps to explain the performance of a model and compare models based on an objective criterion. //! -//! Choosing the right metric is crucial while evaluating machine learning models. In SmartCore you will find metrics for these classes of ML models: +//! Choosing the right metric is crucial while evaluating machine learning models. In smartcore you will find metrics for these classes of ML models: //! //! * [Classification metrics](struct.ClassificationMetrics.html) //! * [Regression metrics](struct.RegressionMetrics.html) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index b8e4e7f..b712d67 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -7,7 +7,7 @@ //! Splitting data into multiple subsets helps us to find the right combination of hyperparameters, estimate model performance and choose the right model for //! the data. //! -//! In SmartCore a random split into training and test sets can be quickly computed with the [train_test_split](./fn.train_test_split.html) helper function. +//! In smartcore a random split into training and test sets can be quickly computed with the [train_test_split](./fn.train_test_split.html) helper function. //! //! ``` //! use smartcore::linalg::basic::matrix::DenseMatrix; diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 67d094a..d13dce6 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -1,6 +1,6 @@ //! # K Nearest Neighbors Classifier //! -//! SmartCore relies on 2 backend algorithms to speedup KNN queries: +//! smartcore relies on 2 backend algorithms to speedup KNN queries: //! * [`LinearSearch`](../../algorithm/neighbour/linear_search/index.html) //! * [`CoverTree`](../../algorithm/neighbour/cover_tree/index.html) //! 
diff --git a/src/numbers/realnum.rs b/src/numbers/realnum.rs index 8c60e47..cb5336a 100644 --- a/src/numbers/realnum.rs +++ b/src/numbers/realnum.rs @@ -1,5 +1,5 @@ //! # Real Number -//! Most algorithms in SmartCore rely on basic linear algebra operations like dot product, matrix decomposition and other subroutines that are defined for a set of real numbers, ℝ. +//! Most algorithms in smartcore rely on basic linear algebra operations like dot product, matrix decomposition and other subroutines that are defined for a set of real numbers, ℝ. //! This module defines real number and some useful functions that are used in [Linear Algebra](../../linalg/index.html) module. use num_traits::Float; diff --git a/src/svm/mod.rs b/src/svm/mod.rs index a30fe87..92b3ab4 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -9,7 +9,7 @@ //! SVM is memory efficient since it uses only a subset of training data to find a decision boundary. This subset is called support vectors. //! //! In SVM distance between a data point and the support vectors is defined by the kernel function. -//! SmartCore supports multiple kernel functions but you can always define a new kernel function by implementing the `Kernel` trait. Not all functions can be a kernel. +//! smartcore supports multiple kernel functions but you can always define a new kernel function by implementing the `Kernel` trait. Not all functions can be a kernel. //! Building a new kernel requires a good mathematical understanding of the [Mercer theorem](https://en.wikipedia.org/wiki/Mercer%27s_theorem) //! that gives necessary and sufficient condition for a function to be a kernel function. //! diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9cb140d..c886ba1 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -20,7 +20,7 @@ //! //! Where \\( m \\) is a number of training samples, \\( y_i \\) is a label value (either 1 or -1) and \\(\langle\vec{w}, \vec{x}_i \rangle + b\\) is a decision boundary. //! -//! 
To solve this optimization problem, SmartCore uses an [approximate SVM solver](https://leon.bottou.org/projects/lasvm). +//! To solve this optimization problem, smartcore uses an [approximate SVM solver](https://leon.bottou.org/projects/lasvm). //! The optimizer reaches accuracies similar to that of a real SVM after performing two passes through the training examples. You can choose the number of passes //! through the data that the algorithm takes by changing the `epoch` parameter of the classifier. //! @@ -934,8 +934,7 @@ mod tests { use super::*; use crate::linalg::basic::matrix::DenseMatrix; use crate::metrics::accuracy; - #[cfg(feature = "serde")] - use crate::svm::*; + use crate::svm::Kernels; #[cfg_attr( all(target_arch = "wasm32", not(target_os = "wasi")), diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 7a39a56..8d49525 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -596,7 +596,6 @@ mod tests { use super::*; use crate::linalg::basic::matrix::DenseMatrix; use crate::metrics::mean_squared_error; - #[cfg(feature = "serde")] use crate::svm::Kernels; // #[test] @@ -617,7 +616,6 @@ mod tests { // assert!(iter.next().is_none()); // } - //TODO: had to disable this test as it runs for too long #[cfg_attr( all(target_arch = "wasm32", not(target_os = "wasi")), wasm_bindgen_test::wasm_bindgen_test diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 6341ab4..a7b0228 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -163,7 +163,6 @@ impl Default for SplitCriterion { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] struct Node { - index: usize, output: usize, split_feature: usize, split_value: Option, @@ -406,9 +405,8 @@ impl Default for DecisionTreeClassifierSearchParameters { } impl Node { - fn new(index: usize, output: usize) -> Self { + fn new(output: usize) -> Self { Node { - index, output, split_feature: 0, split_value: Option::None, 
@@ -582,7 +580,7 @@ impl, Y: Array1> count[yi[i]] += samples[i]; } - let root = Node::new(0, which_max(&count)); + let root = Node::new(which_max(&count)); change_nodes.push(root); let mut order: Vec> = Vec::new(); @@ -831,11 +829,9 @@ impl, Y: Array1> let true_child_idx = self.nodes().len(); - self.nodes - .push(Node::new(true_child_idx, visitor.true_child_output)); + self.nodes.push(Node::new(visitor.true_child_output)); let false_child_idx = self.nodes().len(); - self.nodes - .push(Node::new(false_child_idx, visitor.false_child_output)); + self.nodes.push(Node::new(visitor.false_child_output)); self.nodes[visitor.node].true_child = Some(true_child_idx); self.nodes[visitor.node].false_child = Some(false_child_idx); diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 12ea978..cb6eb4f 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -11,7 +11,7 @@ //! //! where \\(\hat{y}_{Rk}\\) is the mean response for the training observations withing region _k_. //! -//! SmartCore uses recursive binary splitting approach to build \\(R_1, R_2, ..., R_K\\) regions. The approach begins at the top of the tree and then successively splits the predictor space +//! smartcore uses recursive binary splitting approach to build \\(R_1, R_2, ..., R_K\\) regions. The approach begins at the top of the tree and then successively splits the predictor space //! one predictor at a time. At each step of the tree-building process, the best split is made at that particular step, rather than looking ahead and picking a split that will lead to a better //! tree in some future step. //! 
@@ -128,7 +128,6 @@ impl, Y: Array1> #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug, Clone)] struct Node { - index: usize, output: f64, split_feature: usize, split_value: Option, @@ -299,9 +298,8 @@ impl Default for DecisionTreeRegressorSearchParameters { } impl Node { - fn new(index: usize, output: f64) -> Self { + fn new(output: f64) -> Self { Node { - index, output, split_feature: 0, split_value: Option::None, @@ -450,7 +448,7 @@ impl, Y: Array1> sum += *sample_i as f64 * y_m.get(i).to_f64().unwrap(); } - let root = Node::new(0, sum / (n as f64)); + let root = Node::new(sum / (n as f64)); nodes.push(root); let mut order: Vec> = Vec::new(); @@ -662,11 +660,9 @@ impl, Y: Array1> let true_child_idx = self.nodes().len(); - self.nodes - .push(Node::new(true_child_idx, visitor.true_child_output)); + self.nodes.push(Node::new(visitor.true_child_output)); let false_child_idx = self.nodes().len(); - self.nodes - .push(Node::new(false_child_idx, visitor.false_child_output)); + self.nodes.push(Node::new(visitor.false_child_output)); self.nodes[visitor.node].true_child = Some(true_child_idx); self.nodes[visitor.node].false_child = Some(false_child_idx); diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 700dc76..a1b82c8 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -9,7 +9,7 @@ //! Decision trees suffer from high variance and often does not deliver best prediction accuracy when compared to other supervised learning approaches, such as linear and logistic regression. //! Hence some techniques such as [Random Forests](../ensemble/index.html) use more than one decision tree to improve performance of the algorithm. //! -//! SmartCore uses [CART](https://en.wikipedia.org/wiki/Predictive_analytics#Classification_and_regression_trees_.28CART.29) learning technique to build both classification and regression trees. +//! 
smartcore uses [CART](https://en.wikipedia.org/wiki/Predictive_analytics#Classification_and_regression_trees_.28CART.29) learning technique to build both classification and regression trees. //! //! ## References: //! From 0ec89402e8ff5eb97d05c5d5ab6c0ca650342e7d Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Mon, 7 Nov 2022 12:50:32 +0000 Subject: [PATCH 02/21] minor fix --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 8746dbf..b06d668 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,7 +10,7 @@ //! # smartcore //! -//! Welcome to smartcore, machine learning in Rust! +//! Welcome to `smartcore`, machine learning in Rust! //! //! `smartcore` features various classification, regression and clustering algorithms including support vector machines, random forests, k-means and DBSCAN, //! as well as tools for model selection and model evaluation. From cc91e31a0efd8c4d6ba8b75e8348b30963d9c3b7 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Mon, 7 Nov 2022 13:00:51 +0000 Subject: [PATCH 03/21] minor fixes --- src/cluster/kmeans.rs | 2 +- src/dataset/mod.rs | 2 +- src/lib.rs | 4 ++-- src/numbers/realnum.rs | 2 +- src/tree/mod.rs | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 4384ddb..c542ae2 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -11,7 +11,7 @@ //! these re-calculated centroids becoming the new centers of their respective clusters. Next all instances of the training set are re-assigned to their closest cluster again. //! This iterative process continues until convergence is achieved and the clusters are considered settled. //! -//! Initial choice of K data points is very important and has big effect on performance of the algorithm. smartcore uses k-means++ algorithm to initialize cluster centers. +//! 
Initial choice of K data points is very important and has big effect on performance of the algorithm. `smartcore` uses k-means++ algorithm to initialize cluster centers. //! //! Example: //! diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index ac48bf8..855b288 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -1,6 +1,6 @@ //! Datasets //! -//! In this module you will find small datasets that are used in smartcore mostly for demonstration purposes. +//! In this module you will find small datasets that are used in `smartcore` mostly for demonstration purposes. pub mod boston; pub mod breast_cancer; pub mod diabetes; diff --git a/src/lib.rs b/src/lib.rs index b06d668..03bfc03 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -75,7 +75,7 @@ //! ``` //! //! ## Overview -//! +//! //! ### Supported algorithms //! All machine learning algorithms are grouped into these broad categories: //! * [Clustering](cluster/index.html), unsupervised clustering of unlabeled data. @@ -86,7 +86,7 @@ //! * [Nearest Neighbors](neighbors/index.html), K Nearest Neighbors for classification and regression //! * [Naive Bayes](naive_bayes/index.html), statistical classification technique based on Bayes Theorem //! * [SVM](svm/index.html), support vector machines -//! +//! //! ### Linear Algebra traits system //! For an introduction to `smartcore`'s traits system see [this notebook](https://github.com/smartcorelib/smartcore-jupyter/blob/5523993c53c6ec1fd72eea130ef4e7883121c1ea/notebooks/01-A-little-bit-about-numbers.ipynb) diff --git a/src/numbers/realnum.rs b/src/numbers/realnum.rs index cb5336a..f4d9aec 100644 --- a/src/numbers/realnum.rs +++ b/src/numbers/realnum.rs @@ -1,5 +1,5 @@ //! # Real Number -//! Most algorithms in smartcore rely on basic linear algebra operations like dot product, matrix decomposition and other subroutines that are defined for a set of real numbers, ℝ. +//! 
Most algorithms in `smartcore` rely on basic linear algebra operations like dot product, matrix decomposition and other subroutines that are defined for a set of real numbers, ℝ. //! This module defines real number and some useful functions that are used in [Linear Algebra](../../linalg/index.html) module. use num_traits::Float; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index a1b82c8..340b0a8 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -9,7 +9,7 @@ //! Decision trees suffer from high variance and often does not deliver best prediction accuracy when compared to other supervised learning approaches, such as linear and logistic regression. //! Hence some techniques such as [Random Forests](../ensemble/index.html) use more than one decision tree to improve performance of the algorithm. //! -//! smartcore uses [CART](https://en.wikipedia.org/wiki/Predictive_analytics#Classification_and_regression_trees_.28CART.29) learning technique to build both classification and regression trees. +//! `smartcore` uses [CART](https://en.wikipedia.org/wiki/Predictive_analytics#Classification_and_regression_trees_.28CART.29) learning technique to build both classification and regression trees. //! //! ## References: //! 
From 3ac6598951600d28c2576230a92f3408abfd7812 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Mon, 7 Nov 2022 13:56:29 +0000 Subject: [PATCH 04/21] Exclude datasets test for wasm/wasi --- src/cluster/kmeans.rs | 1 + src/ensemble/random_forest_classifier.rs | 1 + src/tree/decision_tree_classifier.rs | 1 + 3 files changed, 3 insertions(+) diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index c542ae2..144f8c5 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -469,6 +469,7 @@ mod tests { all(target_arch = "wasm32", not(target_os = "wasi")), wasm_bindgen_test::wasm_bindgen_test )] + #[cfg(feature = "datasets")] #[test] fn fit_predict_iris() { let x = DenseMatrix::from_2d_array(&[ diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 3db103b..ca06e2f 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -634,6 +634,7 @@ mod tests { wasm_bindgen_test::wasm_bindgen_test )] #[test] + #[cfg(feature = "datasets")] fn fit_predict_iris() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index a7b0228..cbce14e 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -919,6 +919,7 @@ mod tests { wasm_bindgen_test::wasm_bindgen_test )] #[test] + #[cfg(feature = "datasets")] fn fit_predict_iris() { let x: DenseMatrix = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], From bf7b714126b47808947b61f449f3872bc15dcfab Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Mon, 7 Nov 2022 18:16:13 +0000 Subject: [PATCH 05/21] Add static analyzer to doc --- .github/CONTRIBUTING.md | 5 +++++ .gitignore | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index c09dfa7..48bce72 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -25,6 
+25,11 @@ Take a look to the conventions established by existing code: * Every module should provide a Rust doctest, a brief test embedded with the documentation that explains how to use the procedure implemented. * Every module should provide comprehensive tests at the end, in its `mod tests {}` sub-module. These tests can be flagged or not with configuration flags to allow WebAssembly target. * Run `cargo doc --no-deps --open` and read the generated documentation in the browser to be sure that your changes reflects in the documentation and new code is documented. +* a nice overview of the codebase is given by [static analyzer](https://mozilla.github.io/rust-code-analysis/metrics.html): +``` +$ cargo install rust-code-analysis-cli +$ rust-code-analysis-cli -m -O json -o . -p src/ --pr +``` ## Issue Report Process diff --git a/.gitignore b/.gitignore index 9c0651c..e2976f7 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ src.dot out.svg FlameGraph/ -out.stacks \ No newline at end of file +out.stacks +*.json \ No newline at end of file From 8e6e5f9e68d33ff906acd1f37f9eb38e5e33c7a5 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 11:47:31 +0000 Subject: [PATCH 06/21] Use getrandom as default (for no-std feature) --- .github/CONTRIBUTING.md | 6 ++++++ .gitignore | 3 ++- Cargo.toml | 20 +++++++++----------- src/rand_custom.rs | 7 ++++++- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 48bce72..15b3906 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -25,11 +25,17 @@ Take a look to the conventions established by existing code: * Every module should provide a Rust doctest, a brief test embedded with the documentation that explains how to use the procedure implemented. * Every module should provide comprehensive tests at the end, in its `mod tests {}` sub-module. 
These tests can be flagged or not with configuration flags to allow WebAssembly target. * Run `cargo doc --no-deps --open` and read the generated documentation in the browser to be sure that your changes reflects in the documentation and new code is documented. + +#### digging deeper * a nice overview of the codebase is given by [static analyzer](https://mozilla.github.io/rust-code-analysis/metrics.html): ``` $ cargo install rust-code-analysis-cli +// print metrics for every module $ rust-code-analysis-cli -m -O json -o . -p src/ --pr +// print full AST for a module +$ rust-code-analysis-cli -p src/algorithm/neighbour/fastpair.rs --ls 22 --le 213 -d > ast.txt ``` +* find more information about what happens in your binary with [`twiggy`](https://rustwasm.github.io/twiggy/install.html). This need a compiled binary so create a brief `main {}` function using `smartcore` and then point `twiggy` to that file. ## Issue Report Process diff --git a/.gitignore b/.gitignore index e2976f7..0983a15 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,5 @@ out.svg FlameGraph/ out.stacks -*.json \ No newline at end of file +*.json +*.txt \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 0c3adda..0dc84ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ exclude = [ ".gitignore", "smartcore.iml", "smartcore.svg", + "tests/" ] [dependencies] @@ -25,6 +26,7 @@ ndarray = { version = "0.15", optional = true } num-traits = "0.2.12" num = "0.4" rand = { version = "0.8.5", default-features = false, features = ["small_rng"] } +getrandom = { version = "*", features = ["js"] } rand_distr = { version = "0.4", optional = true } serde = { version = "1", features = ["derive"], optional = true } @@ -32,25 +34,21 @@ serde = { version = "1", features = ["derive"], optional = true } default = [] serde = ["dep:serde"] ndarray-bindings = ["dep:ndarray"] -datasets = ["dep:rand_distr", "std", "serde"] -std = ["rand/std_rng", "rand/std"] -# wasm32 only -js = 
["getrandom/js"] +datasets = ["dep:rand_distr", "std_rand", "serde"] +std_rand = ["rand/std_rng", "rand/std"] [target.'cfg(target_arch = "wasm32")'.dependencies] getrandom = { version = "0.2", optional = true } -[dev-dependencies] -itertools = "*" -criterion = { version = "0.4", default-features = false } -serde_json = "1.0" -bincode = "1.3.1" - [target.'cfg(all(target_arch = "wasm32", not(target_os = "wasi")))'.dev-dependencies] wasm-bindgen-test = "0.3" +[dev-dependencies] +itertools = "*" +serde_json = "1.0" +bincode = "1.3.1" + [workspace] -resolver = "2" [profile.test] debug = 1 diff --git a/src/rand_custom.rs b/src/rand_custom.rs index 15f9e73..d06c344 100644 --- a/src/rand_custom.rs +++ b/src/rand_custom.rs @@ -1,5 +1,7 @@ #[cfg(not(feature = "std"))] pub(crate) use rand::rngs::SmallRng as RngImpl; +#[cfg(not(feature = "std"))] +use getrandom; #[cfg(feature = "std")] pub(crate) use rand::rngs::StdRng as RngImpl; use rand::SeedableRng; @@ -13,7 +15,10 @@ pub(crate) fn get_rng_impl(seed: Option) -> RngImpl { use rand::RngCore; RngImpl::seed_from_u64(rand::thread_rng().next_u64()) } else { - panic!("seed number needed for non-std build"); + // non-std build, use getrandom + let mut buf = [0u8; 64]; + getrandom::getrandom(&mut buf).unwrap(); + RngImpl::seed_from_u64(buf[0] as u64) } } } From 2fa454ea94c4eaf021571ed69b37e188ff75756b Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 11:48:14 +0000 Subject: [PATCH 07/21] fmt --- src/rand_custom.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rand_custom.rs b/src/rand_custom.rs index d06c344..7b4a3a5 100644 --- a/src/rand_custom.rs +++ b/src/rand_custom.rs @@ -1,7 +1,7 @@ #[cfg(not(feature = "std"))] -pub(crate) use rand::rngs::SmallRng as RngImpl; -#[cfg(not(feature = "std"))] use getrandom; +#[cfg(not(feature = "std"))] +pub(crate) use rand::rngs::SmallRng as RngImpl; #[cfg(feature = "std")] pub(crate) use rand::rngs::StdRng as RngImpl; use rand::SeedableRng; From 
c1af60cafb37fb799ea8457384fdf05aed924c8b Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 11:55:32 +0000 Subject: [PATCH 08/21] cleanup --- src/rand_custom.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/rand_custom.rs b/src/rand_custom.rs index 7b4a3a5..2156ab0 100644 --- a/src/rand_custom.rs +++ b/src/rand_custom.rs @@ -1,6 +1,4 @@ #[cfg(not(feature = "std"))] -use getrandom; -#[cfg(not(feature = "std"))] pub(crate) use rand::rngs::SmallRng as RngImpl; #[cfg(feature = "std")] pub(crate) use rand::rngs::StdRng as RngImpl; From 3c4a807be8a9ce998b8bbf81501c96fe322d9be4 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 12:04:39 +0000 Subject: [PATCH 09/21] Fix std_rand feature --- src/rand_custom.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/rand_custom.rs b/src/rand_custom.rs index 2156ab0..b22390e 100644 --- a/src/rand_custom.rs +++ b/src/rand_custom.rs @@ -1,19 +1,20 @@ -#[cfg(not(feature = "std_rand"))] +pub use rand::rngs::SmallRng as RngImpl; #[cfg(feature = "std_rand")] +pub use rand::rngs::StdRng as RngImpl; use rand::SeedableRng; -pub(crate) fn get_rng_impl(seed: Option) -> RngImpl { +/// Custom switch for random functions +pub fn get_rng_impl(seed: Option) -> RngImpl { match seed { Some(seed) => RngImpl::seed_from_u64(seed), None => { cfg_if::cfg_if!
{ - if #[cfg(feature = "std")] { + if #[cfg(feature = "std_rand")] { use rand::RngCore; RngImpl::seed_from_u64(rand::thread_rng().next_u64()) } else { - // non-std build, use getrandom + // no std_rand feature build, use getrandom let mut buf = [0u8; 64]; getrandom::getrandom(&mut buf).unwrap(); RngImpl::seed_from_u64(buf[0] as u64) } } } From b4206c4b08f31a28c9b81264c45a89eb5d4762a2 Mon Sep 17 00:00:00 2001 From: Lorenzo Date: Tue, 8 Nov 2022 12:15:10 +0000 Subject: [PATCH 10/21] minor fix --- src/ensemble/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ensemble/mod.rs b/src/ensemble/mod.rs index 161df96..8cebd5c 100644 --- a/src/ensemble/mod.rs +++ b/src/ensemble/mod.rs @@ -7,7 +7,7 @@ //! set and then aggregate their individual predictions to form a final prediction. In classification setting the overall prediction is the most commonly //! occurring majority class among the individual predictions. //! -//! In smartcore you will find implementation of RandomForest - a popular averaging algorithms based on randomized [decision trees](../tree/index.html). +//! In `smartcore` you will find implementation of RandomForest - a popular averaging algorithm based on randomized [decision trees](../tree/index.html). //! Random forests provide an improvement over bagged trees by way of a small tweak that decorrelates the trees. As in bagging, we build a number of //! decision trees on bootstrapped training samples. But when building these decision trees, each time a split in a tree is considered, //! a random sample of _m_ predictors is chosen as split candidates from the full set of _p_ predictors. 
From a60fdaf235be3a8447b5c436f7669d94bc140bc7 Mon Sep 17 00:00:00 2001 From: Lorenzo Date: Tue, 8 Nov 2022 12:17:04 +0000 Subject: [PATCH 11/21] minor fix --- src/linear/linear_regression.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 7f6dfad..a5c7699 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -12,7 +12,7 @@ //! \\[\hat{\beta} = (X^TX)^{-1}X^Ty \\] //! //! the \\((X^TX)^{-1}\\) term is both computationally expensive and numerically unstable. An alternative approach is to use a matrix decomposition to avoid this operation. -//! smartcore uses [SVD](../../linalg/svd/index.html) and [QR](../../linalg/qr/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). +//! `smartcore` uses [SVD](../../linalg/svd/index.html) and [QR](../../linalg/qr/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). //! The QR decomposition is more computationally efficient and more numerically stable than calculating the normal equation directly, //! but does not work for all data matrices. Unlike the QR decomposition, all matrices have an SVD decomposition. //! From 78bf75b5d8fc3a8cf044071896991bdf012fc128 Mon Sep 17 00:00:00 2001 From: Lorenzo Date: Tue, 8 Nov 2022 12:17:32 +0000 Subject: [PATCH 12/21] minor fix --- src/linear/logistic_regression.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index e8c08d8..8bf65bf 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -5,7 +5,7 @@ //! //! \\[ Pr(y=1) \approx \frac{e^{\beta_0 + \sum_{i=1}^n \beta_iX_i}}{1 + e^{\beta_0 + \sum_{i=1}^n \beta_iX_i}} \\] //! -//! smartcore uses [limited memory BFGS](https://en.wikipedia.org/wiki/Limited-memory_BFGS) method to find estimates of regression coefficients, \\(\beta\\) +//! 
`smartcore` uses [limited memory BFGS](https://en.wikipedia.org/wiki/Limited-memory_BFGS) method to find estimates of regression coefficients, \\(\beta\\) //! //! Example: //! From b71c7b49cb59d18d9ad4a97370832a4f96c9f82e Mon Sep 17 00:00:00 2001 From: Lorenzo Date: Tue, 8 Nov 2022 12:18:03 +0000 Subject: [PATCH 13/21] minor fix --- src/linear/ridge_regression.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index e03948d..6bd5595 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -12,7 +12,7 @@ //! where \\(\alpha \geq 0\\) is a tuning parameter that controls strength of regularization. When \\(\alpha = 0\\) the penalty term has no effect, and ridge regression will produce the least squares estimates. //! However, as \\(\alpha \rightarrow \infty\\), the impact of the shrinkage penalty grows, and the ridge regression coefficient estimates will approach zero. //! -//! smartcore uses [SVD](../../linalg/svd/index.html) and [Cholesky](../../linalg/cholesky/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). +//! `smartcore` uses [SVD](../../linalg/svd/index.html) and [Cholesky](../../linalg/cholesky/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). //! The Cholesky decomposition is more computationally efficient and more numerically stable than calculating the normal equation directly, //! but does not work for all data matrices. Unlike the Cholesky decomposition, all matrices have an SVD decomposition. //! From a4097fce152ece11f6bb4f1ce7b4636f54cfcdd6 Mon Sep 17 00:00:00 2001 From: Lorenzo Date: Tue, 8 Nov 2022 12:18:35 +0000 Subject: [PATCH 14/21] minor fix --- src/metrics/auc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metrics/auc.rs b/src/metrics/auc.rs index 5848fbc..0a7ddf4 100644 --- a/src/metrics/auc.rs +++ b/src/metrics/auc.rs @@ -2,7 +2,7 @@ //! 
Computes the area under the receiver operating characteristic (ROC) curve that is equal to the probability that a classifier will rank a //! randomly chosen positive instance higher than a randomly chosen negative one. //! -//! smartcore calculates ROC AUC from Wilcoxon or Mann-Whitney U test. +//! `smartcore` calculates ROC AUC from Wilcoxon or Mann-Whitney U test. //! //! Example: //! ``` From 6c6f92697fe30cf73a7c50ba598ba53e76cc2002 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 12:21:34 +0000 Subject: [PATCH 15/21] minor fixes to doc --- src/metrics/mod.rs | 2 +- src/model_selection/mod.rs | 2 +- src/neighbors/knn_classifier.rs | 2 +- src/svm/mod.rs | 2 +- src/svm/svc.rs | 2 +- src/tree/decision_tree_regressor.rs | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 40086af..c7e1be3 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -4,7 +4,7 @@ //! In a feedback loop you build your model first, then you get feedback from metrics, improve it and repeat until your model achieve desirable performance. //! Evaluation metrics helps to explain the performance of a model and compare models based on an objective criterion. //! -//! Choosing the right metric is crucial while evaluating machine learning models. In smartcore you will find metrics for these classes of ML models: +//! Choosing the right metric is crucial while evaluating machine learning models. In `smartcore` you will find metrics for these classes of ML models: //! //! * [Classification metrics](struct.ClassificationMetrics.html) //! * [Regression metrics](struct.RegressionMetrics.html) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index b712d67..222b9d7 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -7,7 +7,7 @@ //! 
Splitting data into multiple subsets helps us to find the right combination of hyperparameters, estimate model performance and choose the right model for //! the data. //! -//! In smartcore a random split into training and test sets can be quickly computed with the [train_test_split](./fn.train_test_split.html) helper function. +//! In `smartcore` a random split into training and test sets can be quickly computed with the [train_test_split](./fn.train_test_split.html) helper function. //! //! ``` //! use smartcore::linalg::basic::matrix::DenseMatrix; diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index d13dce6..882ac55 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -1,6 +1,6 @@ //! # K Nearest Neighbors Classifier //! -//! smartcore relies on 2 backend algorithms to speedup KNN queries: +//! `smartcore` relies on 2 backend algorithms to speedup KNN queries: //! * [`LinearSearch`](../../algorithm/neighbour/linear_search/index.html) //! * [`CoverTree`](../../algorithm/neighbour/cover_tree/index.html) //! diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 92b3ab4..ef0f003 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -9,7 +9,7 @@ //! SVM is memory efficient since it uses only a subset of training data to find a decision boundary. This subset is called support vectors. //! //! In SVM distance between a data point and the support vectors is defined by the kernel function. -//! smartcore supports multiple kernel functions but you can always define a new kernel function by implementing the `Kernel` trait. Not all functions can be a kernel. +//! `smartcore` supports multiple kernel functions but you can always define a new kernel function by implementing the `Kernel` trait. Not all functions can be a kernel. //! Building a new kernel requires a good mathematical understanding of the [Mercer theorem](https://en.wikipedia.org/wiki/Mercer%27s_theorem) //! 
that gives necessary and sufficient condition for a function to be a kernel function. //! diff --git a/src/svm/svc.rs b/src/svm/svc.rs index c886ba1..74998f5 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -20,7 +20,7 @@ //! //! Where \\( m \\) is a number of training samples, \\( y_i \\) is a label value (either 1 or -1) and \\(\langle\vec{w}, \vec{x}_i \rangle + b\\) is a decision boundary. //! -//! To solve this optimization problem, smartcore uses an [approximate SVM solver](https://leon.bottou.org/projects/lasvm). +//! To solve this optimization problem, `smartcore` uses an [approximate SVM solver](https://leon.bottou.org/projects/lasvm). //! The optimizer reaches accuracies similar to that of a real SVM after performing two passes through the training examples. You can choose the number of passes //! through the data that the algorithm takes by changing the `epoch` parameter of the classifier. //! diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index cb6eb4f..0146cbc 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -11,7 +11,7 @@ //! //! where \\(\hat{y}_{Rk}\\) is the mean response for the training observations withing region _k_. //! -//! smartcore uses recursive binary splitting approach to build \\(R_1, R_2, ..., R_K\\) regions. The approach begins at the top of the tree and then successively splits the predictor space +//! `smartcore` uses recursive binary splitting approach to build \\(R_1, R_2, ..., R_K\\) regions. The approach begins at the top of the tree and then successively splits the predictor space //! one predictor at a time. At each step of the tree-building process, the best split is made at that particular step, rather than looking ahead and picking a split that will lead to a better //! tree in some future step. //! 
From 98b18c4dae40c63ae8cb5cc6dc4adedc8dbeb9e3 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 13:53:50 +0000 Subject: [PATCH 16/21] Remove unused tests flags --- src/cluster/kmeans.rs | 3 +-- src/ensemble/random_forest_classifier.rs | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 144f8c5..18f8308 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -469,9 +469,8 @@ mod tests { all(target_arch = "wasm32", not(target_os = "wasi")), wasm_bindgen_test::wasm_bindgen_test )] - #[cfg(feature = "datasets")] #[test] - fn fit_predict_iris() { + fn fit_predict() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2], diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index ca06e2f..8ea174b 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -634,8 +634,7 @@ mod tests { wasm_bindgen_test::wasm_bindgen_test )] #[test] - #[cfg(feature = "datasets")] - fn fit_predict_iris() { + fn fit_predict() { let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2], From dad0d01f6df55160d8c8d857169f4522aae945ce Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 13:59:49 +0000 Subject: [PATCH 17/21] Update CHANGELOG --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6052e07..06d6d79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,13 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Added - WARNING: Breaking changes! - Seeds to multiple algorithims that depend on random number generation. 
-- Added feature `js` to use WASM in browser - Drop `nalgebra-bindings` feature - Complete refactoring with **extensive API changes** that includes: * moving to a new traits system, less structs more traits * adapting all the modules to the new traits system - * moving towards Rust 2021, in particular the use of `dyn` and `as_ref` + * moving to Rust 2021, in particular the use of `dyn` and `as_ref` * reorganization of the code base, trying to eliminate duplicates +- usage of `serde` is now optional, use the `serde` feature +- default feature is now Wasm-/Wasi-first for minimal binary size ## BREAKING CHANGE - Added a new parameter to `train_test_split` to define the seed. From 48f1d6b74d6b11d3a13abb4235c8e77c2103f6ff Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 14:19:40 +0000 Subject: [PATCH 18/21] use getrandom/js --- Cargo.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0dc84ff..8ec0e97 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ ndarray = { version = "0.15", optional = true } num-traits = "0.2.12" num = "0.4" rand = { version = "0.8.5", default-features = false, features = ["small_rng"] } -getrandom = { version = "*", features = ["js"] } +getrandom = "*" rand_distr = { version = "0.4", optional = true } serde = { version = "1", features = ["derive"], optional = true } @@ -36,6 +36,8 @@ serde = ["dep:serde"] ndarray-bindings = ["dep:ndarray"] datasets = ["dep:rand_distr", "std_rand", "serde"] std_rand = ["rand/std_rng", "rand/std"] +# used by wasm32-unknown-unknown +js = ["getrandom/js"] [target.'cfg(target_arch = "wasm32")'.dependencies] getrandom = { version = "0.2", optional = true } From c934f6b6cf89b1e09af33b7a4599c8abbbfec67f Mon Sep 17 00:00:00 2001 From: Lorenzo Date: Tue, 8 Nov 2022 14:23:13 +0000 Subject: [PATCH 19/21] update comment --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 
8ec0e97..4fb260b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,7 @@ serde = ["dep:serde"] ndarray-bindings = ["dep:ndarray"] datasets = ["dep:rand_distr", "std_rand", "serde"] std_rand = ["rand/std_rng", "rand/std"] -# used by wasm32-unknown-unknown +# used by wasm32-unknown-unknown for in-browser usage js = ["getrandom/js"] [target.'cfg(target_arch = "wasm32")'.dependencies] From 6c03e6e0b344188bec8b5b04cd818469649f42e1 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 15:17:31 +0000 Subject: [PATCH 20/21] update CHANGELOG --- CHANGELOG.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 06d6d79..d105432 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,22 +4,27 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.3] - 2022-11 +## [0.3.0] - 2022-11-09 ## Added - WARNING: Breaking changes! -- Seeds to multiple algorithims that depend on random number generation. 
-- Drop `nalgebra-bindings` feature - Complete refactoring with **extensive API changes** that includes: * moving to a new traits system, less structs more traits * adapting all the modules to the new traits system - * moving to Rust 2021, in particular the use of `dyn` and `as_ref` - * reorganization of the code base, trying to eliminate duplicates -- usage of `serde` is now optional, use the `serde` feature -- default feature is now Wasm-/Wasi-first for minimal binary size + * moving to Rust 2021, use of object-safe traits and `as_ref` + * reorganization of the code base, eliminate duplicates +- implements `readers` (needs "serde" feature) for read/write CSV file, extendible to other formats +- default feature is now Wasm-/Wasi-first -## BREAKING CHANGE -- Added a new parameter to `train_test_split` to define the seed. +## Changed +- WARNING: Breaking changes! +- Seeds to multiple algorithims that depend on random number generation +- Added a new parameter to `train_test_split` to define the seed +- changed use of "serde" feature + +## Dropped +- WARNING: Breaking changes! +- Drop `nalgebra-bindings` feature, only `ndarray` as supported library ## [0.2.1] - 2021-05-10 From c683073b143fdf7f104612d38489158140b4eca4 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Tue, 8 Nov 2022 15:35:04 +0000 Subject: [PATCH 21/21] make work cargo build --target wasm32-unknown-unknown --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4fb260b..42faefa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,7 +40,7 @@ std_rand = ["rand/std_rng", "rand/std"] js = ["getrandom/js"] [target.'cfg(target_arch = "wasm32")'.dependencies] -getrandom = { version = "0.2", optional = true } +getrandom = { version = "*", features = ["js"] } [target.'cfg(all(target_arch = "wasm32", not(target_os = "wasi")))'.dev-dependencies] wasm-bindgen-test = "0.3"