diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index 66ec531..1517850 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -1,14 +1,34 @@ +//! # Euclidian Metric Distance +//! +//! The Euclidean distance (L2) between two points \\( x \\) and \\( y \\) in n-space is defined as +//! +//! \\[ d(x, y) = \sqrt{\sum_{i=1}^n (x_i - y_i)^2} \\] +//! +//! Example: +//! +//! ``` +//! use smartcore::math::distance::Distance; +//! use smartcore::math::distance::euclidian::Euclidian; +//! +//! let x = vec![1., 1.]; +//! let y = vec![2., 2.]; +//! +//! let l2: f64 = Euclidian{}.distance(&x, &y); +//! ``` +//! +//! use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; +/// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space. #[derive(Serialize, Deserialize, Debug)] pub struct Euclidian {} impl Euclidian { - pub fn squared_distance(x: &Vec, y: &Vec) -> T { + pub(crate) fn squared_distance(x: &Vec, y: &Vec) -> T { if x.len() != y.len() { panic!("Input vector sizes are different."); } diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index 0f83be2..eebae55 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -1,9 +1,30 @@ +//! # Hamming Distance +//! +//! Hamming Distance measures the similarity between two integer-valued vectors of the same length. +//! Given two vectors \\( x \in ℝ^n \\), \\( y \in ℝ^n \\), the Hamming distance between \\( x \\) and \\( y \\), \\( d(x, y) \\), is the number of places where \\( x \\) and \\( y \\) differ. +//! +//! Example: +//! +//! ``` +//! use smartcore::math::distance::Distance; +//! use smartcore::math::distance::hamming::Hamming; +//! +//! let a = vec![1, 0, 0, 1, 0, 0, 1]; +//! let b = vec![1, 1, 0, 0, 1, 0, 1]; +//! +//! let h: f64 = Hamming {}.distance(&a, &b); +//! +//! ``` +//! +//! 
+ use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; +/// When comparing two integer-valued vectors of equal length, the Hamming distance is the number of positions at which the corresponding elements differ. #[derive(Serialize, Deserialize, Debug)] pub struct Hamming {} @@ -29,7 +50,7 @@ mod tests { use super::*; #[test] - fn minkowski_distance() { + fn hamming_distance() { let a = vec![1, 0, 0, 1, 0, 0, 1]; let b = vec![1, 1, 0, 0, 1, 0, 1]; diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index 7c26ae1..64a68f6 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -1,3 +1,44 @@ +//! # Mahalanobis Distance +//! +//! The Mahalanobis distance (MD) is the distance between two points in multivariate space. +//! In a regular Euclidean space the distance between any two points can be measured with [Euclidean distance](euclidian/index.html). +//! For uncorrelated variables, the Euclidean distance equals the MD. However, if two or more variables are correlated the measurements become impossible +//! with Euclidean distance because the axes are no longer at right angles to each other. MD, on the other hand, is scale-invariant: +//! it takes into account the covariance matrix of the dataset when calculating distance between 2 points that belong to the same space as the dataset. +//! +//! MD between two vectors \\( x \in ℝ^n \\) and \\( y \in ℝ^n \\) is defined as +//! \\[ d(x, y) = \sqrt{(x - y)^TS^{-1}(x - y)}\\] +//! +//! where \\( S \\) is the covariance matrix of the dataset. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::math::distance::Distance; +//! use smartcore::math::distance::mahalanobis::Mahalanobis; +//! +//! let data = DenseMatrix::from_array(&[ +//! &[64., 580., 29.], +//! &[66., 570., 33.], +//! &[68., 590., 37.], +//! &[69., 660., 46.], +//! &[73., 600., 55.], +//! ]); +//! +//! 
let a = data.column_mean(); +//! let b = vec![66., 640., 44.]; +//! +//! let mahalanobis = Mahalanobis::new(&data); +//! +//! mahalanobis.distance(&a, &b); +//! ``` +//! +//! ## References +//! * ["Introduction to Multivariate Statistical Analysis in Chemometrics", Varmuza, K., Filzmoser, P., 2016, p.46](https://www.taylorfrancis.com/books/9780429145049) +//! * ["Example of Calculating the Mahalanobis Distance", McCaffrey, J.D.](https://jamesmccaffrey.wordpress.com/2017/11/09/example-of-calculating-the-mahalanobis-distance/) +//! +//! #![allow(non_snake_case)] use std::marker::PhantomData; @@ -9,14 +50,19 @@ use crate::math::num::RealNumber; use super::Distance; use crate::linalg::Matrix; +/// Mahalanobis distance. #[derive(Serialize, Deserialize, Debug)] pub struct Mahalanobis> { + /// covariance matrix of the dataset pub sigma: M, + /// inverse of the covariance matrix pub sigmaInv: M, t: PhantomData, } impl> Mahalanobis { + /// Constructs new instance of `Mahalanobis` from given dataset + /// * `data` - a matrix of _NxM_ where _N_ is number of observations and _M_ is number of attributes pub fn new(data: &M) -> Mahalanobis { let sigma = data.cov(); let sigmaInv = sigma.lu().inverse(); @@ -27,6 +73,8 @@ impl> Mahalanobis { } } + /// Constructs new instance of `Mahalanobis` from given covariance matrix + /// * `cov` - a covariance matrix pub fn new_from_covariance(cov: &M) -> Mahalanobis { let sigma = cov.clone(); let sigmaInv = sigma.lu().inverse(); @@ -99,6 +147,8 @@ mod tests { let mahalanobis = Mahalanobis::new(&data); - println!("{}", mahalanobis.distance(&a, &b)); + let md: f64 = mahalanobis.distance(&a, &b); + + assert!((md - 5.33).abs() < 1e-2); } } diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 13ab983..cbf1a92 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -1,9 +1,28 @@ +//! # Manhattan Distance +//! +//! 
The Manhattan distance between two points \\(x \in ℝ^n \\) and \\( y \in ℝ^n \\) in n-dimensional space is the sum of the distances in each dimension. +//! +//! \\[ d(x, y) = \sum_{i=1}^n \lvert x_i - y_i \rvert \\] +//! +//! Example: +//! +//! ``` +//! use smartcore::math::distance::Distance; +//! use smartcore::math::distance::manhattan::Manhattan; +//! +//! let x = vec![1., 1.]; +//! let y = vec![2., 2.]; +//! +//! let l1: f64 = Manhattan {}.distance(&x, &y); +//! ``` +//! use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; +/// Manhattan distance #[derive(Serialize, Deserialize, Debug)] pub struct Manhattan {} diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index fdf83ed..e345fce 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -1,11 +1,35 @@ +//! # Minkowski Distance +//! +//! The Minkowski distance of order _p_ (where _p_ is an integer) is a metric in a normed vector space which can be considered as a generalization of both the Euclidean distance and the Manhattan distance. +//! The Minkowski distance between two points \\(x \in ℝ^n \\) and \\( y \in ℝ^n \\) in n-dimensional space is defined as: +//! +//! \\[ d(x, y) = \left(\sum_{i=1}^n \lvert x_i - y_i \rvert^p\right)^{1/p} \\] +//! +//! Example: +//! +//! ``` +//! use smartcore::math::distance::Distance; +//! use smartcore::math::distance::minkowski::Minkowski; +//! +//! let x = vec![1., 1.]; +//! let y = vec![2., 2.]; +//! +//! let l1: f64 = Minkowski { p: 1 }.distance(&x, &y); +//! let l2: f64 = Minkowski { p: 2 }.distance(&x, &y); +//! +//! ``` +//! 
+ use serde::{Deserialize, Serialize}; use crate::math::num::RealNumber; use super::Distance; +/// Defines the Minkowski distance of order `p` #[derive(Serialize, Deserialize, Debug)] pub struct Minkowski { + /// order, integer pub p: u16, } diff --git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 9ee1b65..7b5f1f8 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -23,6 +23,7 @@ pub mod manhattan; /// A generalization of both the Euclidean distance and the Manhattan distance. pub mod minkowski; +use crate::linalg::Matrix; use crate::math::num::RealNumber; /// Distance metric, a function that calculates distance between two points @@ -35,24 +36,29 @@ pub trait Distance { pub struct Distances {} impl Distances { - /// Euclidian distance + /// Euclidian distance, see [`Euclidian`](euclidian/index.html) pub fn euclidian() -> euclidian::Euclidian { euclidian::Euclidian {} } - /// Minkowski distance + /// Minkowski distance, see [`Minkowski`](minkowski/index.html) /// * `p` - function order. Should be >= 1 pub fn minkowski(p: u16) -> minkowski::Minkowski { minkowski::Minkowski { p: p } } - /// Manhattan distance + /// Manhattan distance, see [`Manhattan`](manhattan/index.html) pub fn manhattan() -> manhattan::Manhattan { manhattan::Manhattan {} } - /// Hamming distance + /// Hamming distance, see [`Hamming`](hamming/index.html) pub fn hamming() -> hamming::Hamming { hamming::Hamming {} } + + /// Mahalanobis distance, see [`Mahalanobis`](mahalanobis/index.html) + pub fn mahalanobis>(data: &M) -> mahalanobis::Mahalanobis { + mahalanobis::Mahalanobis::new(data) + } } diff --git a/src/math/num.rs b/src/math/num.rs index c87a73d..b5b7946 100644 --- a/src/math/num.rs +++ b/src/math/num.rs @@ -1,21 +1,34 @@ +//! # Real Number +//! Most algorithms in SmartCore rely on basic linear algebra operations like dot product, matrix decomposition and other subroutines that are defined for a set of real numbers, ℝ. +//! 
This module defines a real number type and some useful functions that are used in the [Linear Algebra](../../linalg/index.html) module. + use num_traits::{Float, FromPrimitive}; use rand::prelude::*; use std::fmt::{Debug, Display}; use std::iter::{Product, Sum}; +/// Defines real number +/// pub trait RealNumber: Float + FromPrimitive + Debug + Display + Copy + Sum + Product { + /// Copy sign from `sign` - another real number fn copysign(self, sign: Self) -> Self; + /// Calculates \\( \ln(1+e^x) \\) without overflow. fn ln_1pe(self) -> Self; + /// Efficient implementation of Sigmoid function, \\( S(x) = \frac{1}{1 + e^{-x}} \\), see [Sigmoid function](https://en.wikipedia.org/wiki/Sigmoid_function) fn sigmoid(self) -> Self; + /// Returns pseudorandom number between 0 and 1 fn rand() -> Self; + /// Returns 2 fn two() -> Self; + /// Returns .5 fn half() -> Self; + /// Returns \\( x^2 \\) fn square(self) -> Self { self * self }