fix: clippy, documentation and formatting

This commit is contained in:
Volodymyr Orlov
2020-12-22 16:35:28 -08:00
parent a2be9e117f
commit 9b221979da
7 changed files with 80 additions and 62 deletions
+5 -5
View File
@@ -281,8 +281,8 @@ pub trait BaseVector<T: RealNumber>: Clone + Debug {
let mut result = Self::zeros(n); let mut result = Self::zeros(n);
for i in 0..n { for (i, idx) in index.iter().enumerate() {
result.set(i, self.get(index[i])); result.set(i, self.get(*idx));
} }
result result
@@ -639,11 +639,11 @@ pub trait BaseMatrix<T: RealNumber>: Clone + Debug {
_ => Self::zeros(n, index.len()), _ => Self::zeros(n, index.len()),
}; };
for i in 0..index.len() { for (i, idx) in index.iter().enumerate() {
for j in 0..k { for j in 0..k {
match axis { match axis {
0 => result.set(i, j, self.get(index[i], j)), 0 => result.set(i, j, self.get(*idx, j)),
_ => result.set(j, i, self.get(j, index[i])), _ => result.set(j, i, self.get(j, *idx)),
}; };
} }
} }
+7 -5
View File
@@ -69,8 +69,7 @@ use crate::optimization::FunctionOrder;
/// Logistic Regression parameters /// Logistic Regression parameters
#[derive(Serialize, Deserialize, Debug, Clone)] #[derive(Serialize, Deserialize, Debug, Clone)]
pub struct LogisticRegressionParameters { pub struct LogisticRegressionParameters {}
}
/// Logistic Regression /// Logistic Regression
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
@@ -105,8 +104,7 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix<T>> {
impl Default for LogisticRegressionParameters { impl Default for LogisticRegressionParameters {
fn default() -> Self { fn default() -> Self {
LogisticRegressionParameters { LogisticRegressionParameters {}
}
} }
} }
@@ -231,7 +229,11 @@ impl<T: RealNumber, M: Matrix<T>> LogisticRegression<T, M> {
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
/// * `y` - target class values /// * `y` - target class values
/// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result<LogisticRegression<T, M>, Failed> { pub fn fit(
x: &M,
y: &M::RowVector,
_parameters: LogisticRegressionParameters,
) -> Result<LogisticRegression<T, M>, Failed> {
let y_m = M::from_row_vector(y.clone()); let y_m = M::from_row_vector(y.clone());
let (x_nrows, num_attributes) = x.shape(); let (x_nrows, num_attributes) = x.shape();
let (_, y_nrows) = y_m.shape(); let (_, y_nrows) = y_m.shape();
+6 -23
View File
@@ -1,30 +1,13 @@
//! # KFold //! # KFold
//! //!
//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), //! Defines k-fold cross validator.
//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data.
//! Overfitting is bad because the model we trained fits the training data too well and can't make any inferences on new data.
//! Underfitting is bad because the model is undertrained and does not fit the training data well.
//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for
//! your data.
//!
//! In SmartCore you can split your data into training and test datasets using `train_test_split` function.
use crate::linalg::Matrix; use crate::linalg::Matrix;
use crate::math::num::RealNumber; use crate::math::num::RealNumber;
use crate::model_selection::BaseKFold;
use rand::seq::SliceRandom; use rand::seq::SliceRandom;
use rand::thread_rng; use rand::thread_rng;
/// An interface for the K-Folds cross-validator
pub trait BaseKFold {
/// An iterator over indices that split data into training and test set.
type Output: Iterator<Item = (Vec<usize>, Vec<usize>)>;
/// Return a tuple containing the training set indices for that split and
/// the testing set indices for that split.
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Self::Output;
/// Returns the number of splits
fn n_splits(&self) -> usize;
}
/// K-Folds cross-validator /// K-Folds cross-validator
pub struct KFold { pub struct KFold {
/// Number of folds. Must be at least 2. /// Number of folds. Must be at least 2.
@@ -101,12 +84,12 @@ impl KFold {
} }
/// An iterator over indices that split data into training and test set. /// An iterator over indices that split data into training and test set.
pub struct BaseKFoldIter { pub struct KFoldIter {
indices: Vec<usize>, indices: Vec<usize>,
test_indices: Vec<Vec<bool>>, test_indices: Vec<Vec<bool>>,
} }
impl Iterator for BaseKFoldIter { impl Iterator for KFoldIter {
type Item = (Vec<usize>, Vec<usize>); type Item = (Vec<usize>, Vec<usize>);
fn next(&mut self) -> Option<(Vec<usize>, Vec<usize>)> { fn next(&mut self) -> Option<(Vec<usize>, Vec<usize>)> {
@@ -133,7 +116,7 @@ impl Iterator for BaseKFoldIter {
/// Abstract class for all KFold functionalities /// Abstract class for all KFold functionalities
impl BaseKFold for KFold { impl BaseKFold for KFold {
type Output = BaseKFoldIter; type Output = KFoldIter;
fn n_splits(&self) -> usize { fn n_splits(&self) -> usize {
self.n_splits self.n_splits
@@ -148,7 +131,7 @@ impl BaseKFold for KFold {
let mut test_indices = self.test_masks(x); let mut test_indices = self.test_masks(x);
test_indices.reverse(); test_indices.reverse();
BaseKFoldIter { KFoldIter {
indices, indices,
test_indices, test_indices,
} }
+36 -7
View File
@@ -14,15 +14,27 @@ use crate::error::Failed;
use crate::linalg::BaseVector; use crate::linalg::BaseVector;
use crate::linalg::Matrix; use crate::linalg::Matrix;
use crate::math::num::RealNumber; use crate::math::num::RealNumber;
use crate::model_selection::kfold::BaseKFold;
use rand::seq::SliceRandom; use rand::seq::SliceRandom;
use rand::thread_rng; use rand::thread_rng;
pub mod kfold; pub(crate) mod kfold;
pub use kfold::{KFold, KFoldIter};
/// An interface for the K-Folds cross-validator
pub trait BaseKFold {
/// An iterator over indices that split data into training and test set.
type Output: Iterator<Item = (Vec<usize>, Vec<usize>)>;
/// Return a tuple containing the training set indices for that split and
/// the testing set indices for that split.
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Self::Output;
/// Returns the number of splits
fn n_splits(&self) -> usize;
}
/// Splits data into 2 disjoint datasets. /// Splits data into 2 disjoint datasets.
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
/// * `y` - target values, should be of size _M_ /// * `y` - target values, should be of size _N_
/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
/// * `shuffle`, - whether or not to shuffle the data before splitting /// * `shuffle`, - whether or not to shuffle the data before splitting
pub fn train_test_split<T: RealNumber, M: Matrix<T>>( pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
@@ -65,22 +77,33 @@ pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
(x_train, x_test, y_train, y_test) (x_train, x_test, y_train, y_test)
} }
/// Cross validation results.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct CrossValidationResult<T: RealNumber> { pub struct CrossValidationResult<T: RealNumber> {
/// Vector with test scores on each cv split
pub test_score: Vec<T>, pub test_score: Vec<T>,
/// Vector with training scores on each cv split
pub train_score: Vec<T>, pub train_score: Vec<T>,
} }
impl<T: RealNumber> CrossValidationResult<T> { impl<T: RealNumber> CrossValidationResult<T> {
/// Average test score
pub fn mean_test_score(&self) -> T { pub fn mean_test_score(&self) -> T {
self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap()
} }
/// Average training score
pub fn mean_train_score(&self) -> T { pub fn mean_train_score(&self) -> T {
self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap()
} }
} }
/// Evaluate an estimator by cross-validation using given metric.
/// * `fit_estimator` - a `fit` function of an estimator
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
/// * `y` - target values, should be of size _N_
/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters.
/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html)
/// * `score` - a metric to use for evaluation, see [metrics](../metrics/index.html)
pub fn cross_validate<T, M, H, E, K, F, S>( pub fn cross_validate<T, M, H, E, K, F, S>(
fit_estimator: F, fit_estimator: F,
x: &M, x: &M,
@@ -302,7 +325,6 @@ mod tests {
#[test] #[test]
fn test_some_classifier() { fn test_some_classifier() {
let x = DenseMatrix::from_2d_array(&[ let x = DenseMatrix::from_2d_array(&[
&[5.1, 3.5, 1.4, 0.2], &[5.1, 3.5, 1.4, 0.2],
&[4.9, 3.0, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2],
@@ -334,8 +356,15 @@ mod tests {
..KFold::default() ..KFold::default()
}; };
let results = let results = cross_validate(
cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); DecisionTreeClassifier::fit,
&x,
&y,
Default::default(),
cv,
&accuracy,
)
.unwrap();
println!("{}", results.mean_test_score()); println!("{}", results.mean_test_score());
println!("{}", results.mean_train_score()); println!("{}", results.mean_train_score());
+1 -1
View File
@@ -188,7 +188,7 @@ pub struct MultinomialNB<T: RealNumber, M: Matrix<T>> {
inner: BaseNaiveBayes<T, M, MultinomialNBDistribution<T>>, inner: BaseNaiveBayes<T, M, MultinomialNBDistribution<T>>,
} }
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for MultinomialNB <T, M> { impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for MultinomialNB<T, M> {
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> { fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
self.predict(x) self.predict(x)
} }
+12 -6
View File
@@ -167,8 +167,8 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVCParameters<T, M
epoch: self.epoch, epoch: self.epoch,
c: self.c, c: self.c,
tol: self.tol, tol: self.tol,
kernel: kernel, kernel,
m: PhantomData m: PhantomData,
} }
} }
} }
@@ -180,12 +180,14 @@ impl<T: RealNumber, M: Matrix<T>> Default for SVCParameters<T, M, LinearKernel>
c: T::one(), c: T::one(),
tol: T::from_f64(1e-3).unwrap(), tol: T::from_f64(1e-3).unwrap(),
kernel: Kernels::linear(), kernel: Kernels::linear(),
m: PhantomData m: PhantomData,
} }
} }
} }
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector> for SVC<T, M, K> { impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector>
for SVC<T, M, K>
{
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> { fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
self.predict(x) self.predict(x)
} }
@@ -743,7 +745,9 @@ mod tests {
let y_hat = SVC::fit( let y_hat = SVC::fit(
&x, &x,
&y, &y,
SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), SVCParameters::default()
.with_c(200.0)
.with_kernel(Kernels::linear()),
) )
.and_then(|lr| lr.predict(&x)) .and_then(|lr| lr.predict(&x))
.unwrap(); .unwrap();
@@ -784,7 +788,9 @@ mod tests {
let y_hat = SVC::fit( let y_hat = SVC::fit(
&x, &x,
&y, &y,
SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), SVCParameters::default()
.with_c(1.0)
.with_kernel(Kernels::rbf(0.7)),
) )
.and_then(|lr| lr.predict(&x)) .and_then(|lr| lr.predict(&x))
.unwrap(); .unwrap();
+9 -11
View File
@@ -156,8 +156,8 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVRParameters<T, M
eps: self.eps, eps: self.eps,
c: self.c, c: self.c,
tol: self.tol, tol: self.tol,
kernel: kernel, kernel,
m: PhantomData m: PhantomData,
} }
} }
} }
@@ -169,12 +169,14 @@ impl<T: RealNumber, M: Matrix<T>> Default for SVRParameters<T, M, LinearKernel>
c: T::one(), c: T::one(),
tol: T::from_f64(1e-3).unwrap(), tol: T::from_f64(1e-3).unwrap(),
kernel: Kernels::linear(), kernel: Kernels::linear(),
m: PhantomData m: PhantomData,
} }
} }
} }
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector> for SVR<T, M, K> { impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector>
for SVR<T, M, K>
{
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> { fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
self.predict(x) self.predict(x)
} }
@@ -544,13 +546,9 @@ mod tests {
114.2, 115.7, 116.9, 114.2, 115.7, 116.9,
]; ];
let y_hat = SVR::fit( let y_hat = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0))
&x, .and_then(|lr| lr.predict(&x))
&y, .unwrap();
SVRParameters::default().with_eps(2.0).with_c(10.0),
)
.and_then(|lr| lr.predict(&x))
.unwrap();
assert!(mean_squared_error(&y_hat, &y) < 2.5); assert!(mean_squared_error(&y_hat, &y) < 2.5);
} }