feat: + cluster metrics
This commit is contained in:
@@ -71,11 +71,11 @@ impl<T: RealNumber> BBDTree<T> {
|
||||
) -> T {
|
||||
let k = centroids.len();
|
||||
|
||||
counts.iter_mut().for_each(|x| *x = 0);
|
||||
counts.iter_mut().for_each(|v| *v = 0);
|
||||
let mut candidates = vec![0; k];
|
||||
for i in 0..k {
|
||||
candidates[i] = i;
|
||||
sums[i].iter_mut().for_each(|x| *x = T::zero());
|
||||
sums[i].iter_mut().for_each(|v| *v = T::zero());
|
||||
}
|
||||
|
||||
self.filter(
|
||||
@@ -124,7 +124,7 @@ impl<T: RealNumber> BBDTree<T> {
|
||||
if !BBDTree::prune(
|
||||
&self.nodes[node].center,
|
||||
&self.nodes[node].radius,
|
||||
¢roids,
|
||||
centroids,
|
||||
closest,
|
||||
candidates[i],
|
||||
) {
|
||||
@@ -135,7 +135,7 @@ impl<T: RealNumber> BBDTree<T> {
|
||||
|
||||
// Recurse if there's at least two
|
||||
if newk > 1 {
|
||||
let result = self.filter(
|
||||
return self.filter(
|
||||
self.nodes[node].lower.unwrap(),
|
||||
centroids,
|
||||
&mut new_candidates,
|
||||
@@ -152,7 +152,6 @@ impl<T: RealNumber> BBDTree<T> {
|
||||
counts,
|
||||
membership,
|
||||
);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,7 +197,7 @@ impl<T: RealNumber> BBDTree<T> {
|
||||
}
|
||||
}
|
||||
|
||||
return lhs >= T::two() * rhs;
|
||||
lhs >= T::two() * rhs
|
||||
}
|
||||
|
||||
fn build_node<M: Matrix<T>>(&mut self, data: &M, begin: usize, end: usize) -> usize {
|
||||
@@ -336,7 +335,7 @@ mod tests {
|
||||
use crate::linalg::naive::dense_matrix::DenseMatrix;
|
||||
|
||||
#[test]
|
||||
fn fit_predict_iris() {
|
||||
fn bbdtree_iris() {
|
||||
let data = DenseMatrix::from_2d_array(&[
|
||||
&[5.1, 3.5, 1.4, 0.2],
|
||||
&[4.9, 3.0, 1.4, 0.2],
|
||||
|
||||
+16
-8
@@ -189,15 +189,18 @@ impl<T: RealNumber + Sum> KMeans<T> {
|
||||
/// Predict clusters for `x`
|
||||
/// * `x` - matrix with new data to transform of size _KxM_ , where _K_ is number of new samples and _M_ is number of features.
|
||||
pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> {
|
||||
let (n, _) = x.shape();
|
||||
let (n, m) = x.shape();
|
||||
let mut result = M::zeros(1, n);
|
||||
|
||||
let mut row = vec![T::zero(); m];
|
||||
|
||||
for i in 0..n {
|
||||
let mut min_dist = T::max_value();
|
||||
let mut best_cluster = 0;
|
||||
|
||||
for j in 0..self.k {
|
||||
let dist = Euclidian::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]);
|
||||
x.copy_row_as_vec(i, &mut row);
|
||||
let dist = Euclidian::squared_distance(&row, &self.centroids[j]);
|
||||
if dist < min_dist {
|
||||
min_dist = dist;
|
||||
best_cluster = j;
|
||||
@@ -211,19 +214,22 @@ impl<T: RealNumber + Sum> KMeans<T> {
|
||||
|
||||
fn kmeans_plus_plus<M: Matrix<T>>(data: &M, k: usize) -> Vec<usize> {
|
||||
let mut rng = rand::thread_rng();
|
||||
let (n, _) = data.shape();
|
||||
let (n, m) = data.shape();
|
||||
let mut y = vec![0; n];
|
||||
let mut centroid = data.get_row_as_vec(rng.gen_range(0, n));
|
||||
|
||||
let mut d = vec![T::max_value(); n];
|
||||
|
||||
let mut row = vec![T::zero(); m];
|
||||
|
||||
// pick the next center
|
||||
for j in 1..k {
|
||||
// Loop over the samples and compare them to the most recent center. Store
|
||||
// the distance from each sample to its closest center in scores.
|
||||
for i in 0..n {
|
||||
// compute the distance between this sample and the current center
|
||||
let dist = Euclidian::squared_distance(&data.get_row_as_vec(i), ¢roid);
|
||||
data.copy_row_as_vec(i, &mut row);
|
||||
let dist = Euclidian::squared_distance(&row, ¢roid);
|
||||
|
||||
if dist < d[i] {
|
||||
d[i] = dist;
|
||||
@@ -237,20 +243,22 @@ impl<T: RealNumber + Sum> KMeans<T> {
|
||||
}
|
||||
let cutoff = T::from(rng.gen::<f64>()).unwrap() * sum;
|
||||
let mut cost = T::zero();
|
||||
let index = 0;
|
||||
for index in 0..n {
|
||||
let mut index = 0;
|
||||
while index < n {
|
||||
cost = cost + d[index];
|
||||
if cost >= cutoff {
|
||||
break;
|
||||
}
|
||||
index += 1;
|
||||
}
|
||||
|
||||
centroid = data.get_row_as_vec(index);
|
||||
data.copy_row_as_vec(index, &mut centroid);
|
||||
}
|
||||
|
||||
for i in 0..n {
|
||||
data.copy_row_as_vec(i, &mut row);
|
||||
// compute the distance between this sample and the current center
|
||||
let dist = Euclidian::squared_distance(&data.get_row_as_vec(i), ¢roid);
|
||||
let dist = Euclidian::squared_distance(&row, ¢roid);
|
||||
|
||||
if dist < d[i] {
|
||||
d[i] = dist;
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
//! # Optical Recognition of Handwritten Digits Data Set
|
||||
//!
|
||||
//! | Number of Instances | Number of Attributes | Missing Values? | Associated Tasks: |
|
||||
//! |-|-|-|-|
|
||||
//! | 1797 | 64 | No | Classification, Clusteing |
|
||||
//!
|
||||
//! [Digits dataset](https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits) contains normalized bitmaps of handwritten digits (0-9) from a preprinted form.
|
||||
//! This multivariate dataset is frequently used to demonstrate various machine learning algorithms.
|
||||
//!
|
||||
//! All input attributes are integers in the range 0..16.
|
||||
//!
|
||||
use crate::dataset::deserialize_data;
|
||||
use crate::dataset::Dataset;
|
||||
|
||||
/// Get dataset
|
||||
pub fn load_dataset() -> Dataset<f32, f32> {
|
||||
let (x, y, num_samples, num_features) = match deserialize_data(std::include_bytes!("digits.xy"))
|
||||
{
|
||||
Err(why) => panic!("Can't deserialize digits.xy. {}", why),
|
||||
Ok((x, y, num_samples, num_features)) => (x, y, num_samples, num_features),
|
||||
};
|
||||
|
||||
Dataset {
|
||||
data: x,
|
||||
target: y,
|
||||
num_samples: num_samples,
|
||||
num_features: num_features,
|
||||
feature_names: vec![
|
||||
"sepal length (cm)",
|
||||
"sepal width (cm)",
|
||||
"petal length (cm)",
|
||||
"petal width (cm)",
|
||||
]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect(),
|
||||
target_names: vec!["setosa", "versicolor", "virginica"]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect(),
|
||||
description: "Digits dataset: https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::*;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn refresh_digits_dataset() {
|
||||
// run this test to generate digits.xy file.
|
||||
let dataset = load_dataset();
|
||||
assert!(serialize_data(&dataset, "digits.xy").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn digits_dataset() {
|
||||
let dataset = load_dataset();
|
||||
assert_eq!(dataset.data.len(), 1797 * 64);
|
||||
assert_eq!(dataset.target.len(), 1797);
|
||||
assert_eq!(dataset.num_features, 64);
|
||||
assert_eq!(dataset.num_samples, 1797);
|
||||
}
|
||||
}
|
||||
Binary file not shown.
@@ -4,6 +4,7 @@
|
||||
pub mod boston;
|
||||
pub mod breast_cancer;
|
||||
pub mod diabetes;
|
||||
pub mod digits;
|
||||
pub mod iris;
|
||||
|
||||
use crate::math::num::RealNumber;
|
||||
|
||||
@@ -110,10 +110,20 @@ pub trait BaseMatrix<T: RealNumber>: Clone + Debug {
|
||||
/// * `row` - row number
|
||||
fn get_row_as_vec(&self, row: usize) -> Vec<T>;
|
||||
|
||||
/// Copies a vector with elements of the `row`'th row into `result`
|
||||
/// * `row` - row number
|
||||
/// * `result` - receiver for the row
|
||||
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>);
|
||||
|
||||
/// Get a vector with elements of the `col`'th column
|
||||
/// * `col` - column number
|
||||
fn get_col_as_vec(&self, col: usize) -> Vec<T>;
|
||||
|
||||
/// Copies a vector with elements of the `col`'th column into `result`
|
||||
/// * `col` - column number
|
||||
/// * `result` - receiver for the col
|
||||
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>);
|
||||
|
||||
/// Set an element at `col`, `row` to `x`
|
||||
fn set(&mut self, row: usize, col: usize, x: T);
|
||||
|
||||
|
||||
@@ -54,6 +54,16 @@ pub struct DenseMatrix<T: RealNumber> {
|
||||
values: Vec<T>,
|
||||
}
|
||||
|
||||
/// Column-major, dense matrix. See [Simple Dense Matrix](../index.html).
|
||||
#[derive(Debug)]
|
||||
pub struct DenseMatrixIterator<'a, T: RealNumber> {
|
||||
cur_c: usize,
|
||||
cur_r: usize,
|
||||
max_c: usize,
|
||||
max_r: usize,
|
||||
m: &'a DenseMatrix<T>,
|
||||
}
|
||||
|
||||
impl<T: RealNumber> fmt::Display for DenseMatrix<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let mut rows: Vec<Vec<f64>> = Vec::new();
|
||||
@@ -162,6 +172,36 @@ impl<T: RealNumber> DenseMatrix<T> {
|
||||
values: values,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates new column vector (_1xN_ matrix) from a vector.
|
||||
/// * `values` - values to initialize the matrix.
|
||||
pub fn iter<'a>(&'a self) -> DenseMatrixIterator<'a, T> {
|
||||
DenseMatrixIterator {
|
||||
cur_c: 0,
|
||||
cur_r: 0,
|
||||
max_c: self.ncols,
|
||||
max_r: self.nrows,
|
||||
m: &self,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: RealNumber> Iterator for DenseMatrixIterator<'a, T> {
|
||||
type Item = T;
|
||||
|
||||
fn next(&mut self) -> Option<T> {
|
||||
if self.cur_r * self.max_c + self.cur_c >= self.max_c * self.max_r {
|
||||
None
|
||||
} else {
|
||||
let v = self.m.get(self.cur_r, self.cur_c);
|
||||
self.cur_c += 1;
|
||||
if self.cur_c >= self.max_c {
|
||||
self.cur_c = 0;
|
||||
self.cur_r += 1;
|
||||
}
|
||||
Some(v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for DenseMatrix<T> {
|
||||
@@ -339,6 +379,12 @@ impl<T: RealNumber> BaseMatrix<T> for DenseMatrix<T> {
|
||||
result
|
||||
}
|
||||
|
||||
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
|
||||
for c in 0..self.ncols {
|
||||
result[c] = self.get(row, c);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_col_as_vec(&self, col: usize) -> Vec<T> {
|
||||
let mut result = vec![T::zero(); self.nrows];
|
||||
for r in 0..self.nrows {
|
||||
@@ -347,6 +393,12 @@ impl<T: RealNumber> BaseMatrix<T> for DenseMatrix<T> {
|
||||
result
|
||||
}
|
||||
|
||||
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
|
||||
for r in 0..self.nrows {
|
||||
result[r] = self.get(r, col);
|
||||
}
|
||||
}
|
||||
|
||||
fn set(&mut self, row: usize, col: usize, x: T) {
|
||||
self.values[col * self.nrows + row] = x;
|
||||
}
|
||||
@@ -852,6 +904,13 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter() {
|
||||
let vec = vec![1., 2., 3., 4., 5., 6.];
|
||||
let m = DenseMatrix::from_array(3, 2, &vec);
|
||||
assert_eq!(vec, m.iter().collect::<Vec<f32>>());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v_stack() {
|
||||
let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]);
|
||||
|
||||
@@ -102,10 +102,26 @@ impl<T: RealNumber + Scalar + AddAssign + SubAssign + MulAssign + DivAssign + Su
|
||||
self.row(row).iter().map(|v| *v).collect()
|
||||
}
|
||||
|
||||
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
|
||||
let mut r = 0;
|
||||
for e in self.row(row).iter() {
|
||||
result[r] = *e;
|
||||
r += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn get_col_as_vec(&self, col: usize) -> Vec<T> {
|
||||
self.column(col).iter().map(|v| *v).collect()
|
||||
}
|
||||
|
||||
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
|
||||
let mut r = 0;
|
||||
for e in self.column(col).iter() {
|
||||
result[r] = *e;
|
||||
r += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn set(&mut self, row: usize, col: usize, x: T) {
|
||||
*self.get_mut((row, col)).unwrap() = x;
|
||||
}
|
||||
@@ -563,6 +579,17 @@ mod tests {
|
||||
assert_eq!(m.get_col_as_vec(1), vec!(2., 5., 8.));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn copy_row_col_as_vec() {
|
||||
let m = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
|
||||
let mut v = vec![0f32; 3];
|
||||
|
||||
m.copy_row_as_vec(1, &mut v);
|
||||
assert_eq!(v, vec!(4., 5., 6.));
|
||||
m.copy_col_as_vec(1, &mut v);
|
||||
assert_eq!(v, vec!(2., 5., 8.));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn element_add_sub_mul_div() {
|
||||
let mut m = DMatrix::from_row_slice(2, 2, &[1.0, 2.0, 3.0, 4.0]);
|
||||
|
||||
@@ -109,10 +109,26 @@ impl<T: RealNumber + ScalarOperand + AddAssign + SubAssign + MulAssign + DivAssi
|
||||
self.row(row).to_vec()
|
||||
}
|
||||
|
||||
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
|
||||
let mut r = 0;
|
||||
for e in self.row(row).iter() {
|
||||
result[r] = *e;
|
||||
r += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn get_col_as_vec(&self, col: usize) -> Vec<T> {
|
||||
self.column(col).to_vec()
|
||||
}
|
||||
|
||||
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
|
||||
let mut r = 0;
|
||||
for e in self.column(col).iter() {
|
||||
result[r] = *e;
|
||||
r += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn set(&mut self, row: usize, col: usize, x: T) {
|
||||
self[[row, col]] = x;
|
||||
}
|
||||
@@ -669,6 +685,17 @@ mod tests {
|
||||
assert_eq!(res, vec![2., 5., 8.]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn copy_row_col_as_vec() {
|
||||
let m = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]);
|
||||
let mut v = vec![0f32; 3];
|
||||
|
||||
m.copy_row_as_vec(1, &mut v);
|
||||
assert_eq!(v, vec!(4., 5., 6.));
|
||||
m.copy_col_as_vec(1, &mut v);
|
||||
assert_eq!(v, vec!(2., 5., 8.));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn col_mean() {
|
||||
let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]);
|
||||
|
||||
@@ -29,6 +29,7 @@ use super::Distance;
|
||||
pub struct Euclidian {}
|
||||
|
||||
impl Euclidian {
|
||||
#[inline]
|
||||
pub(crate) fn squared_distance<T: RealNumber>(x: &Vec<T>, y: &Vec<T>) -> T {
|
||||
if x.len() != y.len() {
|
||||
panic!("Input vector sizes are different.");
|
||||
@@ -36,7 +37,8 @@ impl Euclidian {
|
||||
|
||||
let mut sum = T::zero();
|
||||
for i in 0..x.len() {
|
||||
sum = sum + (x[i] - y[i]).powf(T::two());
|
||||
let d = x[i] - y[i];
|
||||
sum = sum + d * d;
|
||||
}
|
||||
|
||||
sum
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
/// Multitude of distance metrics are defined here
|
||||
pub mod distance;
|
||||
pub mod num;
|
||||
pub(crate) mod vector;
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
use crate::math::num::RealNumber;
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub trait RealNumberVector<T: RealNumber> {
|
||||
fn unique(&self) -> (Vec<T>, Vec<usize>);
|
||||
}
|
||||
|
||||
impl<T: RealNumber> RealNumberVector<T> for Vec<T> {
|
||||
fn unique(&self) -> (Vec<T>, Vec<usize>) {
|
||||
let mut unique = self.clone();
|
||||
unique.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
unique.dedup();
|
||||
|
||||
let mut index = HashMap::with_capacity(unique.len());
|
||||
for (i, u) in unique.iter().enumerate() {
|
||||
index.insert(u.to_i64().unwrap(), i);
|
||||
}
|
||||
|
||||
let mut unique_index = Vec::with_capacity(self.len());
|
||||
for e in self {
|
||||
unique_index.push(index[&e.to_i64().unwrap()]);
|
||||
}
|
||||
|
||||
(unique, unique_index)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn unique() {
|
||||
let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
|
||||
assert_eq!(
|
||||
(vec!(0.0, 1.0, 2.0, 4.0), vec!(0, 0, 1, 1, 2, 0, 3)),
|
||||
v1.unique()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::linalg::BaseVector;
|
||||
use crate::math::num::RealNumber;
|
||||
use crate::metrics::cluster_helpers::*;
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
/// Mean Absolute Error
|
||||
pub struct HCVScore {}
|
||||
|
||||
impl HCVScore {
|
||||
/// Computes mean absolute error
|
||||
/// * `y_true` - Ground truth (correct) target values.
|
||||
/// * `y_pred` - Estimated target values.
|
||||
pub fn get_score<T: RealNumber, V: BaseVector<T>>(
|
||||
&self,
|
||||
labels_true: &V,
|
||||
labels_pred: &V,
|
||||
) -> (T, T, T) {
|
||||
let labels_true = labels_true.to_vec();
|
||||
let labels_pred = labels_pred.to_vec();
|
||||
let entropy_c = entropy(&labels_true);
|
||||
let entropy_k = entropy(&labels_pred);
|
||||
let contingency = contingency_matrix(&labels_true, &labels_pred);
|
||||
let mi: T = mutual_info_score(&contingency);
|
||||
|
||||
let homogeneity = entropy_c.map(|e| mi / e).unwrap_or(T::one());
|
||||
let completeness = entropy_k.map(|e| mi / e).unwrap_or(T::one());
|
||||
|
||||
let v_measure_score = if homogeneity + completeness == T::zero() {
|
||||
T::zero()
|
||||
} else {
|
||||
T::two() * homogeneity * completeness / (T::one() * homogeneity + completeness)
|
||||
};
|
||||
|
||||
(homogeneity, completeness, v_measure_score)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn homogeneity_score() {
|
||||
let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
|
||||
let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
|
||||
let scores = HCVScore {}.get_score(&v1, &v2);
|
||||
|
||||
assert!((0.2548f32 - scores.0).abs() < 1e-4);
|
||||
assert!((0.5440f32 - scores.1).abs() < 1e-4);
|
||||
assert!((0.3471f32 - scores.2).abs() < 1e-4);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,127 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::math::num::RealNumber;
|
||||
use crate::math::vector::RealNumberVector;
|
||||
|
||||
pub fn contingency_matrix<T: RealNumber>(
|
||||
labels_true: &Vec<T>,
|
||||
labels_pred: &Vec<T>,
|
||||
) -> Vec<Vec<usize>> {
|
||||
let (classes, class_idx) = labels_true.unique();
|
||||
let (clusters, cluster_idx) = labels_pred.unique();
|
||||
|
||||
let mut contingency_matrix = Vec::with_capacity(classes.len());
|
||||
|
||||
for _ in 0..classes.len() {
|
||||
contingency_matrix.push(vec![0; clusters.len()]);
|
||||
}
|
||||
|
||||
for i in 0..class_idx.len() {
|
||||
contingency_matrix[class_idx[i]][cluster_idx[i]] += 1;
|
||||
}
|
||||
|
||||
contingency_matrix
|
||||
}
|
||||
|
||||
pub fn entropy<T: RealNumber>(data: &Vec<T>) -> Option<T> {
|
||||
let mut bincounts = HashMap::with_capacity(data.len());
|
||||
|
||||
for e in data.iter() {
|
||||
let k = e.to_i64().unwrap();
|
||||
bincounts.insert(k, bincounts.get(&k).unwrap_or(&0) + 1);
|
||||
}
|
||||
|
||||
let mut entropy = T::zero();
|
||||
let sum = T::from_usize(bincounts.values().sum()).unwrap();
|
||||
|
||||
for &c in bincounts.values() {
|
||||
if c > 0 {
|
||||
let pi = T::from_usize(c).unwrap();
|
||||
entropy = entropy - (pi / sum) * (pi.ln() - sum.ln());
|
||||
}
|
||||
}
|
||||
|
||||
Some(entropy)
|
||||
}
|
||||
|
||||
pub fn mutual_info_score<T: RealNumber>(contingency: &Vec<Vec<usize>>) -> T {
|
||||
let mut contingency_sum = 0;
|
||||
let mut pi = vec![0; contingency.len()];
|
||||
let mut pj = vec![0; contingency[0].len()];
|
||||
let (mut nzx, mut nzy, mut nz_val) = (Vec::new(), Vec::new(), Vec::new());
|
||||
|
||||
for r in 0..contingency.len() {
|
||||
for c in 0..contingency[0].len() {
|
||||
contingency_sum += contingency[r][c];
|
||||
pi[r] += contingency[r][c];
|
||||
pj[c] += contingency[r][c];
|
||||
if contingency[r][c] > 0 {
|
||||
nzx.push(r);
|
||||
nzy.push(c);
|
||||
nz_val.push(contingency[r][c]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let contingency_sum = T::from_usize(contingency_sum).unwrap();
|
||||
let contingency_sum_ln = contingency_sum.ln();
|
||||
let pi_sum_l = T::from_usize(pi.iter().sum()).unwrap().ln();
|
||||
let pj_sum_l = T::from_usize(pj.iter().sum()).unwrap().ln();
|
||||
|
||||
let log_contingency_nm: Vec<T> = nz_val
|
||||
.iter()
|
||||
.map(|v| T::from_usize(*v).unwrap().ln())
|
||||
.collect();
|
||||
let contingency_nm: Vec<T> = nz_val
|
||||
.iter()
|
||||
.map(|v| T::from_usize(*v).unwrap() / contingency_sum)
|
||||
.collect();
|
||||
let outer: Vec<usize> = nzx
|
||||
.iter()
|
||||
.zip(nzy.iter())
|
||||
.map(|(&x, &y)| pi[x] * pj[y])
|
||||
.collect();
|
||||
let log_outer: Vec<T> = outer
|
||||
.iter()
|
||||
.map(|&o| -T::from_usize(o).unwrap().ln() + pi_sum_l + pj_sum_l)
|
||||
.collect();
|
||||
|
||||
let mut result = T::zero();
|
||||
|
||||
for i in 0..log_outer.len() {
|
||||
result = result
|
||||
+ ((contingency_nm[i] * (log_contingency_nm[i] - contingency_sum_ln))
|
||||
+ contingency_nm[i] * log_outer[i])
|
||||
}
|
||||
|
||||
result.max(T::zero())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn contingency_matrix_test() {
|
||||
let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
|
||||
let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
|
||||
|
||||
println!("{:?}", contingency_matrix(&v1, &v2));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn entropy_test() {
|
||||
let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
|
||||
|
||||
println!("{:?}", entropy(&v1));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mutual_info_score_test() {
|
||||
let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
|
||||
let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
|
||||
let s: f32 = mutual_info_score(&contingency_matrix(&v1, &v2));
|
||||
|
||||
println!("{}", s);
|
||||
}
|
||||
}
|
||||
@@ -54,6 +54,8 @@
|
||||
pub mod accuracy;
|
||||
/// Computes Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
|
||||
pub mod auc;
|
||||
pub mod cluster_hcv;
|
||||
pub(crate) mod cluster_helpers;
|
||||
/// F1 score, also known as balanced F-score or F-measure.
|
||||
pub mod f1;
|
||||
/// Mean absolute error regression loss.
|
||||
@@ -76,6 +78,9 @@ pub struct ClassificationMetrics {}
|
||||
/// Metrics for regression models.
|
||||
pub struct RegressionMetrics {}
|
||||
|
||||
/// Cluster metrics.
|
||||
pub struct ClusterMetrics {}
|
||||
|
||||
impl ClassificationMetrics {
|
||||
/// Accuracy score, see [accuracy](accuracy/index.html).
|
||||
pub fn accuracy() -> accuracy::Accuracy {
|
||||
@@ -120,6 +125,13 @@ impl RegressionMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusterMetrics {
|
||||
/// Mean squared error, see [mean squared error](mean_squared_error/index.html).
|
||||
pub fn hcv_score() -> cluster_hcv::HCVScore {
|
||||
cluster_hcv::HCVScore {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Function that calculated accuracy score, see [accuracy](accuracy/index.html).
|
||||
/// * `y_true` - cround truth (correct) labels
|
||||
/// * `y_pred` - predicted labels, as returned by a classifier.
|
||||
@@ -175,3 +187,30 @@ pub fn mean_absolute_error<T: RealNumber, V: BaseVector<T>>(y_true: &V, y_pred:
|
||||
pub fn r2<T: RealNumber, V: BaseVector<T>>(y_true: &V, y_pred: &V) -> T {
|
||||
RegressionMetrics::r2().get_score(y_true, y_pred)
|
||||
}
|
||||
|
||||
/// Computes R2 score, see [R2](r2/index.html).
|
||||
/// * `y_true` - Ground truth (correct) target values.
|
||||
/// * `y_pred` - Estimated target values.
|
||||
pub fn homogeneity_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
|
||||
ClusterMetrics::hcv_score()
|
||||
.get_score(labels_true, labels_pred)
|
||||
.0
|
||||
}
|
||||
|
||||
/// Computes R2 score, see [R2](r2/index.html).
|
||||
/// * `y_true` - Ground truth (correct) target values.
|
||||
/// * `y_pred` - Estimated target values.
|
||||
pub fn completeness_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
|
||||
ClusterMetrics::hcv_score()
|
||||
.get_score(labels_true, labels_pred)
|
||||
.1
|
||||
}
|
||||
|
||||
/// Computes R2 score, see [R2](r2/index.html).
|
||||
/// * `y_true` - Ground truth (correct) target values.
|
||||
/// * `y_pred` - Estimated target values.
|
||||
pub fn v_measure_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
|
||||
ClusterMetrics::hcv_score()
|
||||
.get_score(labels_true, labels_pred)
|
||||
.2
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user