From 9475d500dbe08d6b7c98ba68ac2bf4ce47c2fe31 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Sun, 27 Dec 2020 18:39:37 -0800 Subject: [PATCH] feat: version change + api documentation updated --- Cargo.toml | 2 +- src/cluster/dbscan.rs | 16 +++++- src/lib.rs | 23 ++++----- src/model_selection/mod.rs | 103 +++++++++++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 21 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 32d8695..5e21aef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "smartcore" description = "The most advanced machine learning library in rust." homepage = "https://smartcorelib.org" -version = "0.1.0" +version = "0.2.0" authors = ["SmartCore Developers"] edition = "2018" license = "Apache-2.0" diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index 9aed2f0..7d641cd 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -1,6 +1,20 @@ //! # DBSCAN Clustering //! -//! DBSCAN - Density-Based Spatial Clustering of Applications with Noise. +//! DBSCAN stands for density-based spatial clustering of applications with noise. This algorithm is good for arbitrary shaped clusters and clusters with noise. +//! The main idea behind DBSCAN is that a point belongs to a cluster if it is close to many points from that cluster. There are two key parameters of DBSCAN: +//! +//! * `eps`, the maximum distance that specifies a neighborhood. Two points are considered to be neighbors if the distance between them is less than or equal to `eps`. +//! * `min_samples`, minimum number of data points that defines a cluster. +//! +//! Based on these two parameters, points are classified as core point, border point, or outlier: +//! +//! * A point is a core point if there are at least `min_samples` number of points, including the point itself in its vicinity. +//! * A point is a border point if it is reachable from a core point and there are less than `min_samples` number of points within its surrounding area. +//! 
* All points not reachable from any other point are outliers or noise points. +//! +//! The algorithm starts by picking an arbitrary point in the dataset. +//! If there are at least `min_samples` points within a radius of `eps` to the point then we consider all these points to be part of the same cluster. +//! The clusters are then expanded by recursively repeating the neighborhood calculation for each neighboring point. //! //! Example: //! diff --git a/src/lib.rs b/src/lib.rs index 297fcc4..d962894 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,16 +10,11 @@ //! //! Welcome to SmartCore, the most advanced machine learning library in Rust! //! -//! In SmartCore you will find implementation of these ML algorithms: -//! * __Regression__: Linear Regression (OLS), Decision Tree Regressor, Random Forest Regressor, K Nearest Neighbors -//! * __Classification__: Logistic Regressor, Decision Tree Classifier, Random Forest Classifier, Supervised Nearest Neighbors (KNN) -//! * __Clustering__: K-Means -//! * __Matrix Decomposition__: PCA, LU, QR, SVD, EVD -//! * __Distance Metrics__: Euclidian, Minkowski, Manhattan, Hamming, Mahalanobis -//! * __Evaluation Metrics__: Accuracy, AUC, Recall, Precision, F1, Mean Absolute Error, Mean Squared Error, R2 +//! SmartCore features various classification, regression and clustering algorithms including support vector machines, random forests, k-means and DBSCAN, +//! as well as tools for model selection and model evaluation. //! -//! Most of algorithms implemented in SmartCore operate on n-dimentional arrays. While you can use Rust vectors with all functions defined in this library -//! we do recommend to go with one of the popular linear algebra libraries available in Rust. At this moment we support these packages: +//! SmartCore is well integrated with a wide variety of libraries that provide support for large, multi-dimensional arrays and matrices. At this moment, +//! 
all Smartcore's algorithms work with ordinary Rust vectors, as well as matrices and vectors defined in these packages: //! * [ndarray](https://docs.rs/ndarray) //! * [nalgebra](https://docs.rs/nalgebra/) //! //! @@ -28,21 +23,21 @@ //! To start using SmartCore simply add the following to your Cargo.toml file: //! ```ignore //! [dependencies] -//! smartcore = "0.1.0" +//! smartcore = "0.2.0" //! ``` //! -//! All ML algorithms in SmartCore are grouped into these generic categories: +//! All machine learning algorithms in SmartCore are grouped into these broad categories: //! * [Clustering](cluster/index.html), unsupervised clustering of unlabeled data. //! * [Martix Decomposition](decomposition/index.html), various methods for matrix decomposition. //! * [Linear Models](linear/index.html), regression and classification methods where output is assumed to have linear relation to explanatory variables //! * [Ensemble Models](ensemble/index.html), variety of regression and classification ensemble models //! * [Tree-based Models](tree/index.html), classification and regression trees //! * [Nearest Neighbors](neighbors/index.html), K Nearest Neighbors for classification and regression +//! * [Naive Bayes](naive_bayes/index.html), statistical classification technique based on Bayes Theorem +//! * [SVM](svm/index.html), support vector machines //! -//! Each category is assigned to a separate module. //! -//! For example, KNN classifier is defined in [smartcore::neighbors::knn_classifier](neighbors/knn_classifier/index.html). To train and run it using standard Rust vectors you will -//! run this code: +//! For example, you can use this code to fit a [K Nearest Neighbors classifier](neighbors/knn_classifier/index.html) to a dataset that is defined as a standard Rust vector: //! //! ``` //! 
// DenseMatrix defenition diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 18dfa35..0058367 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -1,13 +1,106 @@ //! # Model Selection methods //! -//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), -//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. +//! In statistics and machine learning we usually split our data into two sets: one for training and the other one for testing. +//! We fit our model to the training data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. //! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. //! Underfitted is bad because the model is undetrained and does not fit the training data well. -//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for -//! your data. +//! Splitting data into multiple subsets helps us to find the right combination of hyperparameters, estimate model performance and choose the right model for +//! the data. //! -//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +//! In SmartCore a random split into training and test sets can be quickly computed with the [train_test_split](./fn.train_test_split.html) helper function. +//! +//! ``` +//! use crate::smartcore::linalg::BaseMatrix; +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::model_selection::train_test_split; +//! +//! //Iris data +//! let x = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! 
&[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! let y: Vec<f64> = vec![ +//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., +//! ]; +//! +//! let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true); +//! +//! println!("X train: {:?}, y train: {}, X test: {:?}, y test: {}", +//! x_train.shape(), y_train.len(), x_test.shape(), y_test.len()); +//! ``` +//! +//! When we partition the available data into two disjoint sets, we drastically reduce the number of samples that can be used for training. +//! +//! One way to solve this problem is to use k-fold cross-validation. With k-fold validation, the dataset is split into k disjoint sets. +//! A model is trained using k - 1 of the folds, and the resulting model is validated on the remaining portion of the data. +//! +//! The simplest way to run cross-validation is to use the [cross_validate](./fn.cross_validate.html) helper function on your estimator and the dataset. +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::DenseMatrix; +//! use smartcore::model_selection::{KFold, cross_validate}; +//! use smartcore::metrics::accuracy; +//! use smartcore::linear::logistic_regression::LogisticRegression; +//! +//! //Iris data +//! let x = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! 
&[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! let y: Vec<f64> = vec![ +//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., +//! ]; +//! +//! let cv = KFold::default().with_n_splits(3); +//! +//! let results = cross_validate(LogisticRegression::fit, //estimator +//! &x, &y, //data +//! Default::default(), //hyperparameters +//! cv, //cross validation split +//! &accuracy).unwrap(); //metric +//! +//! println!("Training accuracy: {}, test accuracy: {}", +//! results.mean_train_score(), results.mean_test_score()); +//! ``` +//! +//! The function [cross_val_predict](./fn.cross_val_predict.html) has a similar interface to `cross_validate`, +//! but instead of test error it calculates predictions for all samples in the test set. use crate::api::Predictor; use crate::error::Failed;