feat: + cluster metrics

This commit is contained in:
Volodymyr Orlov
2020-09-22 20:23:51 -07:00
parent 0803532e79
commit 750015b861
15 changed files with 477 additions and 16 deletions
+6 -7
View File
@@ -71,11 +71,11 @@ impl<T: RealNumber> BBDTree<T> {
) -> T { ) -> T {
let k = centroids.len(); let k = centroids.len();
counts.iter_mut().for_each(|x| *x = 0); counts.iter_mut().for_each(|v| *v = 0);
let mut candidates = vec![0; k]; let mut candidates = vec![0; k];
for i in 0..k { for i in 0..k {
candidates[i] = i; candidates[i] = i;
sums[i].iter_mut().for_each(|x| *x = T::zero()); sums[i].iter_mut().for_each(|v| *v = T::zero());
} }
self.filter( self.filter(
@@ -124,7 +124,7 @@ impl<T: RealNumber> BBDTree<T> {
if !BBDTree::prune( if !BBDTree::prune(
&self.nodes[node].center, &self.nodes[node].center,
&self.nodes[node].radius, &self.nodes[node].radius,
&centroids, centroids,
closest, closest,
candidates[i], candidates[i],
) { ) {
@@ -135,7 +135,7 @@ impl<T: RealNumber> BBDTree<T> {
// Recurse if there's at least two // Recurse if there's at least two
if newk > 1 { if newk > 1 {
let result = self.filter( return self.filter(
self.nodes[node].lower.unwrap(), self.nodes[node].lower.unwrap(),
centroids, centroids,
&mut new_candidates, &mut new_candidates,
@@ -152,7 +152,6 @@ impl<T: RealNumber> BBDTree<T> {
counts, counts,
membership, membership,
); );
return result;
} }
} }
@@ -198,7 +197,7 @@ impl<T: RealNumber> BBDTree<T> {
} }
} }
return lhs >= T::two() * rhs; lhs >= T::two() * rhs
} }
fn build_node<M: Matrix<T>>(&mut self, data: &M, begin: usize, end: usize) -> usize { fn build_node<M: Matrix<T>>(&mut self, data: &M, begin: usize, end: usize) -> usize {
@@ -336,7 +335,7 @@ mod tests {
use crate::linalg::naive::dense_matrix::DenseMatrix; use crate::linalg::naive::dense_matrix::DenseMatrix;
#[test] #[test]
fn fit_predict_iris() { fn bbdtree_iris() {
let data = DenseMatrix::from_2d_array(&[ let data = DenseMatrix::from_2d_array(&[
&[5.1, 3.5, 1.4, 0.2], &[5.1, 3.5, 1.4, 0.2],
&[4.9, 3.0, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2],
+16 -8
View File
@@ -189,15 +189,18 @@ impl<T: RealNumber + Sum> KMeans<T> {
/// Predict clusters for `x` /// Predict clusters for `x`
/// * `x` - matrix with new data to transform of size _KxM_ , where _K_ is number of new samples and _M_ is number of features. /// * `x` - matrix with new data to transform of size _KxM_ , where _K_ is number of new samples and _M_ is number of features.
pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> { pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> {
let (n, _) = x.shape(); let (n, m) = x.shape();
let mut result = M::zeros(1, n); let mut result = M::zeros(1, n);
let mut row = vec![T::zero(); m];
for i in 0..n { for i in 0..n {
let mut min_dist = T::max_value(); let mut min_dist = T::max_value();
let mut best_cluster = 0; let mut best_cluster = 0;
for j in 0..self.k { for j in 0..self.k {
let dist = Euclidian::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]); x.copy_row_as_vec(i, &mut row);
let dist = Euclidian::squared_distance(&row, &self.centroids[j]);
if dist < min_dist { if dist < min_dist {
min_dist = dist; min_dist = dist;
best_cluster = j; best_cluster = j;
@@ -211,19 +214,22 @@ impl<T: RealNumber + Sum> KMeans<T> {
fn kmeans_plus_plus<M: Matrix<T>>(data: &M, k: usize) -> Vec<usize> { fn kmeans_plus_plus<M: Matrix<T>>(data: &M, k: usize) -> Vec<usize> {
let mut rng = rand::thread_rng(); let mut rng = rand::thread_rng();
let (n, _) = data.shape(); let (n, m) = data.shape();
let mut y = vec![0; n]; let mut y = vec![0; n];
let mut centroid = data.get_row_as_vec(rng.gen_range(0, n)); let mut centroid = data.get_row_as_vec(rng.gen_range(0, n));
let mut d = vec![T::max_value(); n]; let mut d = vec![T::max_value(); n];
let mut row = vec![T::zero(); m];
// pick the next center // pick the next center
for j in 1..k { for j in 1..k {
// Loop over the samples and compare them to the most recent center. Store // Loop over the samples and compare them to the most recent center. Store
// the distance from each sample to its closest center in scores. // the distance from each sample to its closest center in scores.
for i in 0..n { for i in 0..n {
// compute the distance between this sample and the current center // compute the distance between this sample and the current center
let dist = Euclidian::squared_distance(&data.get_row_as_vec(i), &centroid); data.copy_row_as_vec(i, &mut row);
let dist = Euclidian::squared_distance(&row, &centroid);
if dist < d[i] { if dist < d[i] {
d[i] = dist; d[i] = dist;
@@ -237,20 +243,22 @@ impl<T: RealNumber + Sum> KMeans<T> {
} }
let cutoff = T::from(rng.gen::<f64>()).unwrap() * sum; let cutoff = T::from(rng.gen::<f64>()).unwrap() * sum;
let mut cost = T::zero(); let mut cost = T::zero();
let index = 0; let mut index = 0;
for index in 0..n { while index < n {
cost = cost + d[index]; cost = cost + d[index];
if cost >= cutoff { if cost >= cutoff {
break; break;
} }
index += 1;
} }
centroid = data.get_row_as_vec(index); data.copy_row_as_vec(index, &mut centroid);
} }
for i in 0..n { for i in 0..n {
data.copy_row_as_vec(i, &mut row);
// compute the distance between this sample and the current center // compute the distance between this sample and the current center
let dist = Euclidian::squared_distance(&data.get_row_as_vec(i), &centroid); let dist = Euclidian::squared_distance(&row, &centroid);
if dist < d[i] { if dist < d[i] {
d[i] = dist; d[i] = dist;
+67
View File
@@ -0,0 +1,67 @@
//! # Optical Recognition of Handwritten Digits Data Set
//!
//! | Number of Instances | Number of Attributes | Missing Values? | Associated Tasks: |
//! |-|-|-|-|
//! | 1797 | 64 | No | Classification, Clustering |
//!
//! [Digits dataset](https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits) contains normalized bitmaps of handwritten digits (0-9) from a preprinted form.
//! This multivariate dataset is frequently used to demonstrate various machine learning algorithms.
//!
//! All input attributes are integers in the range 0..16.
//!
use crate::dataset::deserialize_data;
use crate::dataset::Dataset;
/// Get dataset
/// Loads the Optical Recognition of Handwritten Digits dataset.
///
/// Deserializes the embedded `digits.xy` payload into a `Dataset` of
/// 1797 samples with 64 pixel-intensity features (8x8 bitmaps) and the
/// digit labels as targets.
///
/// # Panics
/// Panics if the embedded `digits.xy` payload cannot be deserialized.
pub fn load_dataset() -> Dataset<f32, f32> {
    let (x, y, num_samples, num_features) = match deserialize_data(std::include_bytes!("digits.xy"))
    {
        Err(why) => panic!("Can't deserialize digits.xy. {}", why),
        Ok((x, y, num_samples, num_features)) => (x, y, num_samples, num_features),
    };

    Dataset {
        data: x,
        target: y,
        num_samples,
        num_features,
        // One feature per pixel of the 8x8 bitmap, named by (row, column).
        // (The previous names were copy-pasted from the iris dataset.)
        feature_names: (0..8)
            .flat_map(|r| (0..8).map(move |c| format!("pixel_{}_{}", r, c)))
            .collect(),
        // Targets are the ten digit classes 0-9.
        target_names: (0..10).map(|d| d.to_string()).collect(),
        description: "Digits dataset: https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits".to_string(),
    }
}
#[cfg(test)]
mod tests {
    use super::super::*;
    use super::*;

    // Regenerates the serialized `digits.xy` fixture; marked #[ignore]
    // so it only runs when requested explicitly.
    #[test]
    #[ignore]
    fn refresh_digits_dataset() {
        // run this test to generate digits.xy file.
        let dataset = load_dataset();
        assert!(serialize_data(&dataset, "digits.xy").is_ok());
    }

    // Sanity-checks the deserialized dataset dimensions:
    // 1797 samples x 64 features.
    #[test]
    fn digits_dataset() {
        let dataset = load_dataset();
        assert_eq!(dataset.data.len(), 1797 * 64);
        assert_eq!(dataset.target.len(), 1797);
        assert_eq!(dataset.num_features, 64);
        assert_eq!(dataset.num_samples, 1797);
    }
}
Binary file not shown.
+1
View File
@@ -4,6 +4,7 @@
pub mod boston; pub mod boston;
pub mod breast_cancer; pub mod breast_cancer;
pub mod diabetes; pub mod diabetes;
pub mod digits;
pub mod iris; pub mod iris;
use crate::math::num::RealNumber; use crate::math::num::RealNumber;
+10
View File
@@ -110,10 +110,20 @@ pub trait BaseMatrix<T: RealNumber>: Clone + Debug {
/// * `row` - row number /// * `row` - row number
fn get_row_as_vec(&self, row: usize) -> Vec<T>; fn get_row_as_vec(&self, row: usize) -> Vec<T>;
/// Copies a vector with elements of the `row`'th row into `result`
/// * `row` - row number
/// * `result` - receiver for the row
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>);
/// Get a vector with elements of the `col`'th column /// Get a vector with elements of the `col`'th column
/// * `col` - column number /// * `col` - column number
fn get_col_as_vec(&self, col: usize) -> Vec<T>; fn get_col_as_vec(&self, col: usize) -> Vec<T>;
/// Copies a vector with elements of the `col`'th column into `result`
/// * `col` - column number
/// * `result` - receiver for the col
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>);
/// Set an element at `col`, `row` to `x` /// Set an element at `col`, `row` to `x`
fn set(&mut self, row: usize, col: usize, x: T); fn set(&mut self, row: usize, col: usize, x: T);
+59
View File
@@ -54,6 +54,16 @@ pub struct DenseMatrix<T: RealNumber> {
values: Vec<T>, values: Vec<T>,
} }
/// Iterator over the elements of a dense matrix, yielding values in
/// row-major order (all elements of row 0, then row 1, ...).
#[derive(Debug)]
pub struct DenseMatrixIterator<'a, T: RealNumber> {
    cur_c: usize, // current column
    cur_r: usize, // current row
    max_c: usize, // number of columns
    max_r: usize, // number of rows
    m: &'a DenseMatrix<T>,
}
impl<T: RealNumber> fmt::Display for DenseMatrix<T> { impl<T: RealNumber> fmt::Display for DenseMatrix<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut rows: Vec<Vec<f64>> = Vec::new(); let mut rows: Vec<Vec<f64>> = Vec::new();
@@ -162,6 +172,36 @@ impl<T: RealNumber> DenseMatrix<T> {
values: values, values: values,
} }
} }
/// Returns an iterator over the matrix elements in row-major order.
pub fn iter<'a>(&'a self) -> DenseMatrixIterator<'a, T> {
    DenseMatrixIterator {
        cur_c: 0,
        cur_r: 0,
        max_c: self.ncols,
        max_r: self.nrows,
        m: &self,
    }
}
}
impl<'a, T: RealNumber> Iterator for DenseMatrixIterator<'a, T> {
type Item = T;
fn next(&mut self) -> Option<T> {
if self.cur_r * self.max_c + self.cur_c >= self.max_c * self.max_r {
None
} else {
let v = self.m.get(self.cur_r, self.cur_c);
self.cur_c += 1;
if self.cur_c >= self.max_c {
self.cur_c = 0;
self.cur_r += 1;
}
Some(v)
}
}
} }
impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for DenseMatrix<T> { impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for DenseMatrix<T> {
@@ -339,6 +379,12 @@ impl<T: RealNumber> BaseMatrix<T> for DenseMatrix<T> {
result result
} }
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
for c in 0..self.ncols {
result[c] = self.get(row, c);
}
}
fn get_col_as_vec(&self, col: usize) -> Vec<T> { fn get_col_as_vec(&self, col: usize) -> Vec<T> {
let mut result = vec![T::zero(); self.nrows]; let mut result = vec![T::zero(); self.nrows];
for r in 0..self.nrows { for r in 0..self.nrows {
@@ -347,6 +393,12 @@ impl<T: RealNumber> BaseMatrix<T> for DenseMatrix<T> {
result result
} }
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
for r in 0..self.nrows {
result[r] = self.get(r, col);
}
}
fn set(&mut self, row: usize, col: usize, x: T) { fn set(&mut self, row: usize, col: usize, x: T) {
self.values[col * self.nrows + row] = x; self.values[col * self.nrows + row] = x;
} }
@@ -852,6 +904,13 @@ mod tests {
); );
} }
#[test]
fn iter() {
let vec = vec![1., 2., 3., 4., 5., 6.];
let m = DenseMatrix::from_array(3, 2, &vec);
assert_eq!(vec, m.iter().collect::<Vec<f32>>());
}
#[test] #[test]
fn v_stack() { fn v_stack() {
let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]); let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]);
+27
View File
@@ -102,10 +102,26 @@ impl<T: RealNumber + Scalar + AddAssign + SubAssign + MulAssign + DivAssign + Su
self.row(row).iter().map(|v| *v).collect() self.row(row).iter().map(|v| *v).collect()
} }
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
let mut r = 0;
for e in self.row(row).iter() {
result[r] = *e;
r += 1;
}
}
fn get_col_as_vec(&self, col: usize) -> Vec<T> { fn get_col_as_vec(&self, col: usize) -> Vec<T> {
self.column(col).iter().map(|v| *v).collect() self.column(col).iter().map(|v| *v).collect()
} }
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
let mut r = 0;
for e in self.column(col).iter() {
result[r] = *e;
r += 1;
}
}
fn set(&mut self, row: usize, col: usize, x: T) { fn set(&mut self, row: usize, col: usize, x: T) {
*self.get_mut((row, col)).unwrap() = x; *self.get_mut((row, col)).unwrap() = x;
} }
@@ -563,6 +579,17 @@ mod tests {
assert_eq!(m.get_col_as_vec(1), vec!(2., 5., 8.)); assert_eq!(m.get_col_as_vec(1), vec!(2., 5., 8.));
} }
#[test]
fn copy_row_col_as_vec() {
let m = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
let mut v = vec![0f32; 3];
m.copy_row_as_vec(1, &mut v);
assert_eq!(v, vec!(4., 5., 6.));
m.copy_col_as_vec(1, &mut v);
assert_eq!(v, vec!(2., 5., 8.));
}
#[test] #[test]
fn element_add_sub_mul_div() { fn element_add_sub_mul_div() {
let mut m = DMatrix::from_row_slice(2, 2, &[1.0, 2.0, 3.0, 4.0]); let mut m = DMatrix::from_row_slice(2, 2, &[1.0, 2.0, 3.0, 4.0]);
+27
View File
@@ -109,10 +109,26 @@ impl<T: RealNumber + ScalarOperand + AddAssign + SubAssign + MulAssign + DivAssi
self.row(row).to_vec() self.row(row).to_vec()
} }
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
let mut r = 0;
for e in self.row(row).iter() {
result[r] = *e;
r += 1;
}
}
fn get_col_as_vec(&self, col: usize) -> Vec<T> { fn get_col_as_vec(&self, col: usize) -> Vec<T> {
self.column(col).to_vec() self.column(col).to_vec()
} }
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
let mut r = 0;
for e in self.column(col).iter() {
result[r] = *e;
r += 1;
}
}
fn set(&mut self, row: usize, col: usize, x: T) { fn set(&mut self, row: usize, col: usize, x: T) {
self[[row, col]] = x; self[[row, col]] = x;
} }
@@ -669,6 +685,17 @@ mod tests {
assert_eq!(res, vec![2., 5., 8.]); assert_eq!(res, vec![2., 5., 8.]);
} }
#[test]
fn copy_row_col_as_vec() {
let m = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]);
let mut v = vec![0f32; 3];
m.copy_row_as_vec(1, &mut v);
assert_eq!(v, vec!(4., 5., 6.));
m.copy_col_as_vec(1, &mut v);
assert_eq!(v, vec!(2., 5., 8.));
}
#[test] #[test]
fn col_mean() { fn col_mean() {
let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]); let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]);
+3 -1
View File
@@ -29,6 +29,7 @@ use super::Distance;
pub struct Euclidian {} pub struct Euclidian {}
impl Euclidian { impl Euclidian {
#[inline]
pub(crate) fn squared_distance<T: RealNumber>(x: &Vec<T>, y: &Vec<T>) -> T { pub(crate) fn squared_distance<T: RealNumber>(x: &Vec<T>, y: &Vec<T>) -> T {
if x.len() != y.len() { if x.len() != y.len() {
panic!("Input vector sizes are different."); panic!("Input vector sizes are different.");
@@ -36,7 +37,8 @@ impl Euclidian {
let mut sum = T::zero(); let mut sum = T::zero();
for i in 0..x.len() { for i in 0..x.len() {
sum = sum + (x[i] - y[i]).powf(T::two()); let d = x[i] - y[i];
sum = sum + d * d;
} }
sum sum
+1
View File
@@ -1,3 +1,4 @@
/// Multitude of distance metrics are defined here /// Multitude of distance metrics are defined here
pub mod distance; pub mod distance;
pub mod num; pub mod num;
pub(crate) mod vector;
+40
View File
@@ -0,0 +1,40 @@
use crate::math::num::RealNumber;
use std::collections::HashMap;
/// Extra operations on vectors of real numbers.
pub trait RealNumberVector<T: RealNumber> {
    /// Returns the sorted distinct values of the vector together with, for
    /// each element, the index of that element's value in the distinct list.
    fn unique(&self) -> (Vec<T>, Vec<usize>);
}
impl<T: RealNumber> RealNumberVector<T> for Vec<T> {
fn unique(&self) -> (Vec<T>, Vec<usize>) {
let mut unique = self.clone();
unique.sort_by(|a, b| a.partial_cmp(b).unwrap());
unique.dedup();
let mut index = HashMap::with_capacity(unique.len());
for (i, u) in unique.iter().enumerate() {
index.insert(u.to_i64().unwrap(), i);
}
let mut unique_index = Vec::with_capacity(self.len());
for e in self {
unique_index.push(index[&e.to_i64().unwrap()]);
}
(unique, unique_index)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn unique() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        // Sorted distinct values, and each element's index into them.
        assert_eq!(
            (vec!(0.0, 1.0, 2.0, 4.0), vec!(0, 0, 1, 1, 2, 0, 3)),
            v1.unique()
        );
    }
}
+54
View File
@@ -0,0 +1,54 @@
use serde::{Deserialize, Serialize};
use crate::linalg::BaseVector;
use crate::math::num::RealNumber;
use crate::metrics::cluster_helpers::*;
#[derive(Serialize, Deserialize, Debug)]
/// Homogeneity, completeness and V-measure scores for a cluster labeling
/// against a ground truth.
pub struct HCVScore {}
impl HCVScore {
    /// Computes the homogeneity, completeness and V-measure scores of a
    /// cluster labeling given a ground truth, returned in that order.
    /// * `labels_true` - ground truth class labels.
    /// * `labels_pred` - cluster labels to evaluate.
    pub fn get_score<T: RealNumber, V: BaseVector<T>>(
        &self,
        labels_true: &V,
        labels_pred: &V,
    ) -> (T, T, T) {
        let labels_true = labels_true.to_vec();
        let labels_pred = labels_pred.to_vec();
        let entropy_c = entropy(&labels_true);
        let entropy_k = entropy(&labels_pred);
        let contingency = contingency_matrix(&labels_true, &labels_pred);
        let mi: T = mutual_info_score(&contingency);

        // A zero-entropy labeling (everything in one class/cluster) is
        // perfectly homogeneous/complete by convention; filter it out so we
        // never divide by zero (which would yield NaN/inf).
        let homogeneity = entropy_c
            .filter(|&e| e > T::zero())
            .map(|e| mi / e)
            .unwrap_or_else(|| T::one());
        let completeness = entropy_k
            .filter(|&e| e > T::zero())
            .map(|e| mi / e)
            .unwrap_or_else(|| T::one());

        // V-measure is the harmonic mean of homogeneity and completeness.
        let v_measure_score = if homogeneity + completeness == T::zero() {
            T::zero()
        } else {
            T::two() * homogeneity * completeness / (homogeneity + completeness)
        };

        (homogeneity, completeness, v_measure_score)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Expected values follow the standard homogeneity / completeness /
    // V-measure definitions (MI over the entropies, harmonic mean).
    #[test]
    fn homogeneity_score() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
        let scores = HCVScore {}.get_score(&v1, &v2);
        assert!((0.2548f32 - scores.0).abs() < 1e-4);
        assert!((0.5440f32 - scores.1).abs() < 1e-4);
        assert!((0.3471f32 - scores.2).abs() < 1e-4);
    }
}
+127
View File
@@ -0,0 +1,127 @@
use std::collections::HashMap;
use crate::math::num::RealNumber;
use crate::math::vector::RealNumberVector;
/// Builds the contingency matrix of two labelings: cell `[i][j]` counts the
/// samples assigned class `i` in `labels_true` and cluster `j` in
/// `labels_pred`.
pub fn contingency_matrix<T: RealNumber>(
    labels_true: &Vec<T>,
    labels_pred: &Vec<T>,
) -> Vec<Vec<usize>> {
    // Map each labeling onto dense indices 0..k over its distinct values.
    let (classes, class_idx) = labels_true.unique();
    let (clusters, cluster_idx) = labels_pred.unique();

    // Count co-occurrences of (class, cluster) pairs.
    let mut matrix = vec![vec![0usize; clusters.len()]; classes.len()];
    for (&ci, &ki) in class_idx.iter().zip(cluster_idx.iter()) {
        matrix[ci][ki] += 1;
    }

    matrix
}
/// Shannon entropy (in nats) of the label distribution in `data`.
/// Returns `None` for empty input, where entropy is undefined
/// (previously `Some(0)` was returned, which is indistinguishable from a
/// genuine zero-entropy labeling).
///
/// Labels are binned by their integral value (`to_i64`), matching the
/// integer-valued labels used elsewhere in the cluster metrics.
pub fn entropy<T: RealNumber>(data: &Vec<T>) -> Option<T> {
    if data.is_empty() {
        return None;
    }

    // Histogram of label occurrences.
    let mut bincounts = HashMap::with_capacity(data.len());
    for e in data.iter() {
        *bincounts.entry(e.to_i64().unwrap()).or_insert(0) += 1;
    }

    let sum = T::from_usize(bincounts.values().sum()).unwrap();
    let mut entropy = T::zero();
    for &c in bincounts.values() {
        if c > 0 {
            let pi = T::from_usize(c).unwrap();
            // -(pi/n) * ln(pi/n), written as -(pi/n) * (ln pi - ln n).
            entropy = entropy - (pi / sum) * (pi.ln() - sum.ln());
        }
    }

    Some(entropy)
}
/// Mutual information (in nats) between two labelings, computed from their
/// contingency matrix (rows = classes, columns = clusters).
///
/// MI = sum over cells of p_ij * ln( n_ij * N / (a_i * b_j) ), where n_ij is
/// the cell count, N the total count, and a_i, b_j the row/column marginals.
/// Only non-zero cells contribute, so they are gathered first.
pub fn mutual_info_score<T: RealNumber>(contingency: &Vec<Vec<usize>>) -> T {
    let mut contingency_sum = 0;
    // Marginal counts: pi per row (class), pj per column (cluster).
    let mut pi = vec![0; contingency.len()];
    let mut pj = vec![0; contingency[0].len()];
    // Coordinates (nzx, nzy) and counts (nz_val) of the non-zero cells.
    let (mut nzx, mut nzy, mut nz_val) = (Vec::new(), Vec::new(), Vec::new());

    for r in 0..contingency.len() {
        for c in 0..contingency[0].len() {
            contingency_sum += contingency[r][c];
            pi[r] += contingency[r][c];
            pj[c] += contingency[r][c];
            if contingency[r][c] > 0 {
                nzx.push(r);
                nzy.push(c);
                nz_val.push(contingency[r][c]);
            }
        }
    }

    let contingency_sum = T::from_usize(contingency_sum).unwrap();
    let contingency_sum_ln = contingency_sum.ln();
    // Both marginals sum to the total sample count; ln of those totals.
    let pi_sum_l = T::from_usize(pi.iter().sum()).unwrap().ln();
    let pj_sum_l = T::from_usize(pj.iter().sum()).unwrap().ln();

    // ln(n_ij) for every non-zero cell.
    let log_contingency_nm: Vec<T> = nz_val
        .iter()
        .map(|v| T::from_usize(*v).unwrap().ln())
        .collect();
    // Joint probability p_ij = n_ij / N for every non-zero cell.
    let contingency_nm: Vec<T> = nz_val
        .iter()
        .map(|v| T::from_usize(*v).unwrap() / contingency_sum)
        .collect();
    // Product of marginal counts a_i * b_j per non-zero cell.
    let outer: Vec<usize> = nzx
        .iter()
        .zip(nzy.iter())
        .map(|(&x, &y)| pi[x] * pj[y])
        .collect();
    // ln( sum(a) * sum(b) / (a_i * b_j) ) component of each term.
    let log_outer: Vec<T> = outer
        .iter()
        .map(|&o| -T::from_usize(o).unwrap().ln() + pi_sum_l + pj_sum_l)
        .collect();

    // Accumulate p_ij*(ln n_ij - ln N) + p_ij*log_outer per non-zero cell.
    let mut result = T::zero();
    for i in 0..log_outer.len() {
        result = result
            + ((contingency_nm[i] * (log_contingency_nm[i] - contingency_sum_ln))
                + contingency_nm[i] * log_outer[i])
    }

    // Clamp tiny negative floating-point error to zero; MI is non-negative.
    result.max(T::zero())
}
#[cfg(test)]
mod tests {
    use super::*;

    // Rows are the distinct classes of v1 (0, 1, 2, 4); columns the
    // distinct clusters of v2 (0, 1). Previously this only printed the
    // matrix and asserted nothing.
    #[test]
    fn contingency_matrix_test() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
        assert_eq!(
            vec![vec![1, 2], vec![2, 0], vec![1, 0], vec![1, 0]],
            contingency_matrix(&v1, &v2)
        );
    }

    // H = -(3/7)ln(3/7) - (2/7)ln(2/7) - 2*(1/7)ln(1/7) ~= 1.2770 nats.
    #[test]
    fn entropy_test() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        assert!((1.2770f32 - entropy(&v1).unwrap()).abs() < 1e-4);
    }

    // MI ~= 0.32548 nats; consistent with the 0.2548 homogeneity and
    // 0.5440 completeness expected in the HCVScore test.
    #[test]
    fn mutual_info_score_test() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
        let s: f32 = mutual_info_score(&contingency_matrix(&v1, &v2));
        assert!((0.32548 - s).abs() < 1e-4);
    }
}
+39
View File
@@ -54,6 +54,8 @@
pub mod accuracy; pub mod accuracy;
/// Computes Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. /// Computes Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
pub mod auc; pub mod auc;
pub mod cluster_hcv;
pub(crate) mod cluster_helpers;
/// F1 score, also known as balanced F-score or F-measure. /// F1 score, also known as balanced F-score or F-measure.
pub mod f1; pub mod f1;
/// Mean absolute error regression loss. /// Mean absolute error regression loss.
@@ -76,6 +78,9 @@ pub struct ClassificationMetrics {}
/// Metrics for regression models. /// Metrics for regression models.
pub struct RegressionMetrics {} pub struct RegressionMetrics {}
/// Cluster metrics.
pub struct ClusterMetrics {}
impl ClassificationMetrics { impl ClassificationMetrics {
/// Accuracy score, see [accuracy](accuracy/index.html). /// Accuracy score, see [accuracy](accuracy/index.html).
pub fn accuracy() -> accuracy::Accuracy { pub fn accuracy() -> accuracy::Accuracy {
@@ -120,6 +125,13 @@ impl RegressionMetrics {
} }
} }
impl ClusterMetrics {
    /// Homogeneity, completeness and V-measure scores, see
    /// [HCV score](cluster_hcv/index.html).
    pub fn hcv_score() -> cluster_hcv::HCVScore {
        cluster_hcv::HCVScore {}
    }
}
/// Function that calculated accuracy score, see [accuracy](accuracy/index.html). /// Function that calculated accuracy score, see [accuracy](accuracy/index.html).
/// * `y_true` - cround truth (correct) labels /// * `y_true` - cround truth (correct) labels
/// * `y_pred` - predicted labels, as returned by a classifier. /// * `y_pred` - predicted labels, as returned by a classifier.
@@ -175,3 +187,30 @@ pub fn mean_absolute_error<T: RealNumber, V: BaseVector<T>>(y_true: &V, y_pred:
pub fn r2<T: RealNumber, V: BaseVector<T>>(y_true: &V, y_pred: &V) -> T { pub fn r2<T: RealNumber, V: BaseVector<T>>(y_true: &V, y_pred: &V) -> T {
RegressionMetrics::r2().get_score(y_true, y_pred) RegressionMetrics::r2().get_score(y_true, y_pred)
} }
/// Computes homogeneity score, see [HCV score](cluster_hcv/index.html).
/// * `labels_true` - Ground truth class labels.
/// * `labels_pred` - Cluster labels to evaluate.
pub fn homogeneity_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
    ClusterMetrics::hcv_score()
        .get_score(labels_true, labels_pred)
        .0
}
/// Computes completeness score, see [HCV score](cluster_hcv/index.html).
/// * `labels_true` - Ground truth class labels.
/// * `labels_pred` - Cluster labels to evaluate.
pub fn completeness_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
    ClusterMetrics::hcv_score()
        .get_score(labels_true, labels_pred)
        .1
}
/// Computes V-measure score (harmonic mean of homogeneity and
/// completeness), see [HCV score](cluster_hcv/index.html).
/// * `labels_true` - Ground truth class labels.
/// * `labels_pred` - Cluster labels to evaluate.
pub fn v_measure_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
    ClusterMetrics::hcv_score()
        .get_score(labels_true, labels_pred)
        .2
}