fix: clippy, documentation and formatting
This commit is contained in:
+5
-5
@@ -281,8 +281,8 @@ pub trait BaseVector<T: RealNumber>: Clone + Debug {
|
|||||||
|
|
||||||
let mut result = Self::zeros(n);
|
let mut result = Self::zeros(n);
|
||||||
|
|
||||||
for i in 0..n {
|
for (i, idx) in index.iter().enumerate() {
|
||||||
result.set(i, self.get(index[i]));
|
result.set(i, self.get(*idx));
|
||||||
}
|
}
|
||||||
|
|
||||||
result
|
result
|
||||||
@@ -639,11 +639,11 @@ pub trait BaseMatrix<T: RealNumber>: Clone + Debug {
|
|||||||
_ => Self::zeros(n, index.len()),
|
_ => Self::zeros(n, index.len()),
|
||||||
};
|
};
|
||||||
|
|
||||||
for i in 0..index.len() {
|
for (i, idx) in index.iter().enumerate() {
|
||||||
for j in 0..k {
|
for j in 0..k {
|
||||||
match axis {
|
match axis {
|
||||||
0 => result.set(i, j, self.get(index[i], j)),
|
0 => result.set(i, j, self.get(*idx, j)),
|
||||||
_ => result.set(j, i, self.get(j, index[i])),
|
_ => result.set(j, i, self.get(j, *idx)),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -69,8 +69,7 @@ use crate::optimization::FunctionOrder;
|
|||||||
|
|
||||||
/// Logistic Regression parameters
|
/// Logistic Regression parameters
|
||||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct LogisticRegressionParameters {
|
pub struct LogisticRegressionParameters {}
|
||||||
}
|
|
||||||
|
|
||||||
/// Logistic Regression
|
/// Logistic Regression
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
@@ -105,8 +104,7 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix<T>> {
|
|||||||
|
|
||||||
impl Default for LogisticRegressionParameters {
|
impl Default for LogisticRegressionParameters {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
LogisticRegressionParameters {
|
LogisticRegressionParameters {}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -231,7 +229,11 @@ impl<T: RealNumber, M: Matrix<T>> LogisticRegression<T, M> {
|
|||||||
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||||
/// * `y` - target class values
|
/// * `y` - target class values
|
||||||
/// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
|
/// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
|
||||||
pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result<LogisticRegression<T, M>, Failed> {
|
pub fn fit(
|
||||||
|
x: &M,
|
||||||
|
y: &M::RowVector,
|
||||||
|
_parameters: LogisticRegressionParameters,
|
||||||
|
) -> Result<LogisticRegression<T, M>, Failed> {
|
||||||
let y_m = M::from_row_vector(y.clone());
|
let y_m = M::from_row_vector(y.clone());
|
||||||
let (x_nrows, num_attributes) = x.shape();
|
let (x_nrows, num_attributes) = x.shape();
|
||||||
let (_, y_nrows) = y_m.shape();
|
let (_, y_nrows) = y_m.shape();
|
||||||
|
|||||||
@@ -1,30 +1,13 @@
|
|||||||
//! # KFold
|
//! # KFold
|
||||||
//!
|
//!
|
||||||
//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate),
|
//! Defines k-fold cross validator.
|
||||||
//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data.
|
|
||||||
//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data.
|
|
||||||
//! Underfitting is bad because the model is undertrained and does not fit the training data well.
|
|
||||||
//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for
|
|
||||||
//! your data.
|
|
||||||
//!
|
|
||||||
//! In SmartCore you can split your data into training and test datasets using `train_test_split` function.
|
|
||||||
|
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
|
use crate::model_selection::BaseKFold;
|
||||||
use rand::seq::SliceRandom;
|
use rand::seq::SliceRandom;
|
||||||
use rand::thread_rng;
|
use rand::thread_rng;
|
||||||
|
|
||||||
/// An interface for the K-Folds cross-validator
|
|
||||||
pub trait BaseKFold {
|
|
||||||
/// An iterator over indices that split data into training and test set.
|
|
||||||
type Output: Iterator<Item = (Vec<usize>, Vec<usize>)>;
|
|
||||||
/// Return a tuple containing the training set indices for that split and
|
|
||||||
/// the testing set indices for that split.
|
|
||||||
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Self::Output;
|
|
||||||
/// Returns the number of splits
|
|
||||||
fn n_splits(&self) -> usize;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// K-Folds cross-validator
|
/// K-Folds cross-validator
|
||||||
pub struct KFold {
|
pub struct KFold {
|
||||||
/// Number of folds. Must be at least 2.
|
/// Number of folds. Must be at least 2.
|
||||||
@@ -101,12 +84,12 @@ impl KFold {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// An iterator over indices that split data into training and test set.
|
/// An iterator over indices that split data into training and test set.
|
||||||
pub struct BaseKFoldIter {
|
pub struct KFoldIter {
|
||||||
indices: Vec<usize>,
|
indices: Vec<usize>,
|
||||||
test_indices: Vec<Vec<bool>>,
|
test_indices: Vec<Vec<bool>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Iterator for BaseKFoldIter {
|
impl Iterator for KFoldIter {
|
||||||
type Item = (Vec<usize>, Vec<usize>);
|
type Item = (Vec<usize>, Vec<usize>);
|
||||||
|
|
||||||
fn next(&mut self) -> Option<(Vec<usize>, Vec<usize>)> {
|
fn next(&mut self) -> Option<(Vec<usize>, Vec<usize>)> {
|
||||||
@@ -133,7 +116,7 @@ impl Iterator for BaseKFoldIter {
|
|||||||
|
|
||||||
/// Abstract class for all KFold functionalities
|
/// Abstract class for all KFold functionalities
|
||||||
impl BaseKFold for KFold {
|
impl BaseKFold for KFold {
|
||||||
type Output = BaseKFoldIter;
|
type Output = KFoldIter;
|
||||||
|
|
||||||
fn n_splits(&self) -> usize {
|
fn n_splits(&self) -> usize {
|
||||||
self.n_splits
|
self.n_splits
|
||||||
@@ -148,7 +131,7 @@ impl BaseKFold for KFold {
|
|||||||
let mut test_indices = self.test_masks(x);
|
let mut test_indices = self.test_masks(x);
|
||||||
test_indices.reverse();
|
test_indices.reverse();
|
||||||
|
|
||||||
BaseKFoldIter {
|
KFoldIter {
|
||||||
indices,
|
indices,
|
||||||
test_indices,
|
test_indices,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,15 +14,27 @@ use crate::error::Failed;
|
|||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
use crate::model_selection::kfold::BaseKFold;
|
|
||||||
use rand::seq::SliceRandom;
|
use rand::seq::SliceRandom;
|
||||||
use rand::thread_rng;
|
use rand::thread_rng;
|
||||||
|
|
||||||
pub mod kfold;
|
pub(crate) mod kfold;
|
||||||
|
|
||||||
|
pub use kfold::{KFold, KFoldIter};
|
||||||
|
|
||||||
|
/// An interface for the K-Folds cross-validator
|
||||||
|
pub trait BaseKFold {
|
||||||
|
/// An iterator over indices that split data into training and test set.
|
||||||
|
type Output: Iterator<Item = (Vec<usize>, Vec<usize>)>;
|
||||||
|
/// Return a tuple containing the training set indices for that split and
|
||||||
|
/// the testing set indices for that split.
|
||||||
|
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Self::Output;
|
||||||
|
/// Returns the number of splits
|
||||||
|
fn n_splits(&self) -> usize;
|
||||||
|
}
|
||||||
|
|
||||||
/// Splits data into 2 disjoint datasets.
|
/// Splits data into 2 disjoint datasets.
|
||||||
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
|
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
|
||||||
/// * `y` - target values, should be of size _M_
|
/// * `y` - target values, should be of size _N_
|
||||||
/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
|
/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
|
||||||
/// * `shuffle`, - whether or not to shuffle the data before splitting
|
/// * `shuffle`, - whether or not to shuffle the data before splitting
|
||||||
pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
||||||
@@ -65,22 +77,33 @@ pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
|||||||
(x_train, x_test, y_train, y_test)
|
(x_train, x_test, y_train, y_test)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Cross validation results.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct CrossValidationResult<T: RealNumber> {
|
pub struct CrossValidationResult<T: RealNumber> {
|
||||||
|
/// Vector with test scores on each cv split
|
||||||
pub test_score: Vec<T>,
|
pub test_score: Vec<T>,
|
||||||
|
/// Vector with training scores on each cv split
|
||||||
pub train_score: Vec<T>,
|
pub train_score: Vec<T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber> CrossValidationResult<T> {
|
impl<T: RealNumber> CrossValidationResult<T> {
|
||||||
|
/// Average test score
|
||||||
pub fn mean_test_score(&self) -> T {
|
pub fn mean_test_score(&self) -> T {
|
||||||
self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap()
|
self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap()
|
||||||
}
|
}
|
||||||
|
/// Average training score
|
||||||
pub fn mean_train_score(&self) -> T {
|
pub fn mean_train_score(&self) -> T {
|
||||||
self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap()
|
self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Evaluate an estimator by cross-validation using given metric.
|
||||||
|
/// * `fit_estimator` - a `fit` function of an estimator
|
||||||
|
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
|
||||||
|
/// * `y` - target values, should be of size _N_
|
||||||
|
/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters.
|
||||||
|
/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html)
|
||||||
|
/// * `score` - a metric to use for evaluation, see [metrics](../metrics/index.html)
|
||||||
pub fn cross_validate<T, M, H, E, K, F, S>(
|
pub fn cross_validate<T, M, H, E, K, F, S>(
|
||||||
fit_estimator: F,
|
fit_estimator: F,
|
||||||
x: &M,
|
x: &M,
|
||||||
@@ -302,7 +325,6 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_some_classifier() {
|
fn test_some_classifier() {
|
||||||
|
|
||||||
let x = DenseMatrix::from_2d_array(&[
|
let x = DenseMatrix::from_2d_array(&[
|
||||||
&[5.1, 3.5, 1.4, 0.2],
|
&[5.1, 3.5, 1.4, 0.2],
|
||||||
&[4.9, 3.0, 1.4, 0.2],
|
&[4.9, 3.0, 1.4, 0.2],
|
||||||
@@ -334,8 +356,15 @@ mod tests {
|
|||||||
..KFold::default()
|
..KFold::default()
|
||||||
};
|
};
|
||||||
|
|
||||||
let results =
|
let results = cross_validate(
|
||||||
cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap();
|
DecisionTreeClassifier::fit,
|
||||||
|
&x,
|
||||||
|
&y,
|
||||||
|
Default::default(),
|
||||||
|
cv,
|
||||||
|
&accuracy,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
println!("{}", results.mean_test_score());
|
println!("{}", results.mean_test_score());
|
||||||
println!("{}", results.mean_train_score());
|
println!("{}", results.mean_train_score());
|
||||||
|
|||||||
@@ -188,7 +188,7 @@ pub struct MultinomialNB<T: RealNumber, M: Matrix<T>> {
|
|||||||
inner: BaseNaiveBayes<T, M, MultinomialNBDistribution<T>>,
|
inner: BaseNaiveBayes<T, M, MultinomialNBDistribution<T>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for MultinomialNB <T, M> {
|
impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for MultinomialNB<T, M> {
|
||||||
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
self.predict(x)
|
self.predict(x)
|
||||||
}
|
}
|
||||||
|
|||||||
+13
-7
@@ -167,8 +167,8 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVCParameters<T, M
|
|||||||
epoch: self.epoch,
|
epoch: self.epoch,
|
||||||
c: self.c,
|
c: self.c,
|
||||||
tol: self.tol,
|
tol: self.tol,
|
||||||
kernel: kernel,
|
kernel,
|
||||||
m: PhantomData
|
m: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -180,12 +180,14 @@ impl<T: RealNumber, M: Matrix<T>> Default for SVCParameters<T, M, LinearKernel>
|
|||||||
c: T::one(),
|
c: T::one(),
|
||||||
tol: T::from_f64(1e-3).unwrap(),
|
tol: T::from_f64(1e-3).unwrap(),
|
||||||
kernel: Kernels::linear(),
|
kernel: Kernels::linear(),
|
||||||
m: PhantomData
|
m: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector> for SVC<T, M, K> {
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector>
|
||||||
|
for SVC<T, M, K>
|
||||||
|
{
|
||||||
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
self.predict(x)
|
self.predict(x)
|
||||||
}
|
}
|
||||||
@@ -743,10 +745,12 @@ mod tests {
|
|||||||
let y_hat = SVC::fit(
|
let y_hat = SVC::fit(
|
||||||
&x,
|
&x,
|
||||||
&y,
|
&y,
|
||||||
SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()),
|
SVCParameters::default()
|
||||||
|
.with_c(200.0)
|
||||||
|
.with_kernel(Kernels::linear()),
|
||||||
)
|
)
|
||||||
.and_then(|lr| lr.predict(&x))
|
.and_then(|lr| lr.predict(&x))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
assert!(accuracy(&y_hat, &y) >= 0.9);
|
assert!(accuracy(&y_hat, &y) >= 0.9);
|
||||||
}
|
}
|
||||||
@@ -784,7 +788,9 @@ mod tests {
|
|||||||
let y_hat = SVC::fit(
|
let y_hat = SVC::fit(
|
||||||
&x,
|
&x,
|
||||||
&y,
|
&y,
|
||||||
SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)),
|
SVCParameters::default()
|
||||||
|
.with_c(1.0)
|
||||||
|
.with_kernel(Kernels::rbf(0.7)),
|
||||||
)
|
)
|
||||||
.and_then(|lr| lr.predict(&x))
|
.and_then(|lr| lr.predict(&x))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|||||||
+12
-14
@@ -134,7 +134,7 @@ struct Cache<T: Clone> {
|
|||||||
data: Vec<RefCell<Option<Vec<T>>>>,
|
data: Vec<RefCell<Option<Vec<T>>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVRParameters<T, M, K> {
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVRParameters<T, M, K> {
|
||||||
/// Epsilon in the epsilon-SVR model.
|
/// Epsilon in the epsilon-SVR model.
|
||||||
pub fn with_eps(mut self, eps: T) -> Self {
|
pub fn with_eps(mut self, eps: T) -> Self {
|
||||||
self.eps = eps;
|
self.eps = eps;
|
||||||
@@ -153,11 +153,11 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVRParameters<T, M
|
|||||||
/// The kernel function.
|
/// The kernel function.
|
||||||
pub fn with_kernel<KK: Kernel<T, M::RowVector>>(&self, kernel: KK) -> SVRParameters<T, M, KK> {
|
pub fn with_kernel<KK: Kernel<T, M::RowVector>>(&self, kernel: KK) -> SVRParameters<T, M, KK> {
|
||||||
SVRParameters {
|
SVRParameters {
|
||||||
eps: self.eps,
|
eps: self.eps,
|
||||||
c: self.c,
|
c: self.c,
|
||||||
tol: self.tol,
|
tol: self.tol,
|
||||||
kernel: kernel,
|
kernel,
|
||||||
m: PhantomData
|
m: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -169,12 +169,14 @@ impl<T: RealNumber, M: Matrix<T>> Default for SVRParameters<T, M, LinearKernel>
|
|||||||
c: T::one(),
|
c: T::one(),
|
||||||
tol: T::from_f64(1e-3).unwrap(),
|
tol: T::from_f64(1e-3).unwrap(),
|
||||||
kernel: Kernels::linear(),
|
kernel: Kernels::linear(),
|
||||||
m: PhantomData
|
m: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector> for SVR<T, M, K> {
|
impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector>
|
||||||
|
for SVR<T, M, K>
|
||||||
|
{
|
||||||
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||||
self.predict(x)
|
self.predict(x)
|
||||||
}
|
}
|
||||||
@@ -188,7 +190,7 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVR<T, M, K> {
|
|||||||
/// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values.
|
/// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values.
|
||||||
pub fn fit(
|
pub fn fit(
|
||||||
x: &M,
|
x: &M,
|
||||||
y: &M::RowVector,
|
y: &M::RowVector,
|
||||||
parameters: SVRParameters<T, M, K>,
|
parameters: SVRParameters<T, M, K>,
|
||||||
) -> Result<SVR<T, M, K>, Failed> {
|
) -> Result<SVR<T, M, K>, Failed> {
|
||||||
let (n, _) = x.shape();
|
let (n, _) = x.shape();
|
||||||
@@ -544,13 +546,9 @@ mod tests {
|
|||||||
114.2, 115.7, 116.9,
|
114.2, 115.7, 116.9,
|
||||||
];
|
];
|
||||||
|
|
||||||
let y_hat = SVR::fit(
|
let y_hat = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0))
|
||||||
&x,
|
.and_then(|lr| lr.predict(&x))
|
||||||
&y,
|
.unwrap();
|
||||||
SVRParameters::default().with_eps(2.0).with_c(10.0),
|
|
||||||
)
|
|
||||||
.and_then(|lr| lr.predict(&x))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert!(mean_squared_error(&y_hat, &y) < 2.5);
|
assert!(mean_squared_error(&y_hat, &y) < 2.5);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user