feat: + cluster metrics

This commit is contained in:
Volodymyr Orlov
2020-09-22 20:23:51 -07:00
parent 0803532e79
commit 750015b861
15 changed files with 477 additions and 16 deletions
+6 -7
View File
@@ -71,11 +71,11 @@ impl<T: RealNumber> BBDTree<T> {
) -> T {
let k = centroids.len();
counts.iter_mut().for_each(|x| *x = 0);
counts.iter_mut().for_each(|v| *v = 0);
let mut candidates = vec![0; k];
for i in 0..k {
candidates[i] = i;
sums[i].iter_mut().for_each(|x| *x = T::zero());
sums[i].iter_mut().for_each(|v| *v = T::zero());
}
self.filter(
@@ -124,7 +124,7 @@ impl<T: RealNumber> BBDTree<T> {
if !BBDTree::prune(
&self.nodes[node].center,
&self.nodes[node].radius,
&centroids,
centroids,
closest,
candidates[i],
) {
@@ -135,7 +135,7 @@ impl<T: RealNumber> BBDTree<T> {
// Recurse if there's at least two
if newk > 1 {
let result = self.filter(
return self.filter(
self.nodes[node].lower.unwrap(),
centroids,
&mut new_candidates,
@@ -152,7 +152,6 @@ impl<T: RealNumber> BBDTree<T> {
counts,
membership,
);
return result;
}
}
@@ -198,7 +197,7 @@ impl<T: RealNumber> BBDTree<T> {
}
}
return lhs >= T::two() * rhs;
lhs >= T::two() * rhs
}
fn build_node<M: Matrix<T>>(&mut self, data: &M, begin: usize, end: usize) -> usize {
@@ -336,7 +335,7 @@ mod tests {
use crate::linalg::naive::dense_matrix::DenseMatrix;
#[test]
fn fit_predict_iris() {
fn bbdtree_iris() {
let data = DenseMatrix::from_2d_array(&[
&[5.1, 3.5, 1.4, 0.2],
&[4.9, 3.0, 1.4, 0.2],
+16 -8
View File
@@ -189,15 +189,18 @@ impl<T: RealNumber + Sum> KMeans<T> {
/// Predict clusters for `x`
/// * `x` - matrix with new data to transform of size _KxM_ , where _K_ is number of new samples and _M_ is number of features.
pub fn predict<M: Matrix<T>>(&self, x: &M) -> Result<M::RowVector, Failed> {
let (n, _) = x.shape();
let (n, m) = x.shape();
let mut result = M::zeros(1, n);
let mut row = vec![T::zero(); m];
for i in 0..n {
let mut min_dist = T::max_value();
let mut best_cluster = 0;
for j in 0..self.k {
let dist = Euclidian::squared_distance(&x.get_row_as_vec(i), &self.centroids[j]);
x.copy_row_as_vec(i, &mut row);
let dist = Euclidian::squared_distance(&row, &self.centroids[j]);
if dist < min_dist {
min_dist = dist;
best_cluster = j;
@@ -211,19 +214,22 @@ impl<T: RealNumber + Sum> KMeans<T> {
fn kmeans_plus_plus<M: Matrix<T>>(data: &M, k: usize) -> Vec<usize> {
let mut rng = rand::thread_rng();
let (n, _) = data.shape();
let (n, m) = data.shape();
let mut y = vec![0; n];
let mut centroid = data.get_row_as_vec(rng.gen_range(0, n));
let mut d = vec![T::max_value(); n];
let mut row = vec![T::zero(); m];
// pick the next center
for j in 1..k {
// Loop over the samples and compare them to the most recent center. Store
// the distance from each sample to its closest center in scores.
for i in 0..n {
// compute the distance between this sample and the current center
let dist = Euclidian::squared_distance(&data.get_row_as_vec(i), &centroid);
data.copy_row_as_vec(i, &mut row);
let dist = Euclidian::squared_distance(&row, &centroid);
if dist < d[i] {
d[i] = dist;
@@ -237,20 +243,22 @@ impl<T: RealNumber + Sum> KMeans<T> {
}
let cutoff = T::from(rng.gen::<f64>()).unwrap() * sum;
let mut cost = T::zero();
let index = 0;
for index in 0..n {
let mut index = 0;
while index < n {
cost = cost + d[index];
if cost >= cutoff {
break;
}
index += 1;
}
centroid = data.get_row_as_vec(index);
data.copy_row_as_vec(index, &mut centroid);
}
for i in 0..n {
data.copy_row_as_vec(i, &mut row);
// compute the distance between this sample and the current center
let dist = Euclidian::squared_distance(&data.get_row_as_vec(i), &centroid);
let dist = Euclidian::squared_distance(&row, &centroid);
if dist < d[i] {
d[i] = dist;
+67
View File
@@ -0,0 +1,67 @@
//! # Optical Recognition of Handwritten Digits Data Set
//!
//! | Number of Instances | Number of Attributes | Missing Values? | Associated Tasks: |
//! |-|-|-|-|
//! | 1797 | 64 | No | Classification, Clustering |
//!
//! [Digits dataset](https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits) contains normalized bitmaps of handwritten digits (0-9) from a preprinted form.
//! This multivariate dataset is frequently used to demonstrate various machine learning algorithms.
//!
//! All input attributes are integers in the range 0..16.
//!
use crate::dataset::deserialize_data;
use crate::dataset::Dataset;
/// Loads the Optical Recognition of Handwritten Digits dataset
/// (1797 samples, 64 pixel features, classes 0-9) from the bundled `digits.xy` file.
pub fn load_dataset() -> Dataset<f32, f32> {
    let (x, y, num_samples, num_features) = match deserialize_data(std::include_bytes!("digits.xy"))
    {
        // The bundled file is generated by `refresh_digits_dataset`; failure to
        // parse it is a packaging bug, so panicking here is acceptable.
        Err(why) => panic!("Can't deserialize digits.xy. {}", why),
        Ok((x, y, num_samples, num_features)) => (x, y, num_samples, num_features),
    };

    Dataset {
        data: x,
        target: y,
        num_samples,
        num_features,
        // Each feature is one pixel of the 8x8 normalized bitmap.
        // (The previous names were copy-pasted from the iris dataset.)
        feature_names: (0..num_features).map(|i| format!("pixel_{}", i)).collect(),
        // Target classes are the digits 0 through 9.
        target_names: (0..10).map(|i| i.to_string()).collect(),
        description: "Digits dataset: https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits".to_string(),
    }
}
#[cfg(test)]
mod tests {
    use super::super::*;
    use super::*;

    #[test]
    #[ignore]
    fn refresh_digits_dataset() {
        // run this test to generate digits.xy file.
        let dataset = load_dataset();
        assert!(serialize_data(&dataset, "digits.xy").is_ok());
    }

    #[test]
    fn digits_dataset() {
        // Expect 1797 samples x 64 features, flattened row data.
        let dataset = load_dataset();
        assert_eq!(dataset.data.len(), 1797 * 64);
        assert_eq!(dataset.target.len(), 1797);
        assert_eq!(dataset.num_features, 64);
        assert_eq!(dataset.num_samples, 1797);
    }
}
Binary file not shown.
+1
View File
@@ -4,6 +4,7 @@
pub mod boston;
pub mod breast_cancer;
pub mod diabetes;
pub mod digits;
pub mod iris;
use crate::math::num::RealNumber;
+10
View File
@@ -110,10 +110,20 @@ pub trait BaseMatrix<T: RealNumber>: Clone + Debug {
/// * `row` - row number
fn get_row_as_vec(&self, row: usize) -> Vec<T>;
/// Copies a vector with elements of the `row`'th row into `result`
/// * `row` - row number
/// * `result` - receiver for the row
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>);
/// Get a vector with elements of the `col`'th column
/// * `col` - column number
fn get_col_as_vec(&self, col: usize) -> Vec<T>;
/// Copies a vector with elements of the `col`'th column into `result`
/// * `col` - column number
/// * `result` - receiver for the col
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>);
/// Set an element at `col`, `row` to `x`
fn set(&mut self, row: usize, col: usize, x: T);
+59
View File
@@ -54,6 +54,16 @@ pub struct DenseMatrix<T: RealNumber> {
values: Vec<T>,
}
/// Element-by-element iterator over a [DenseMatrix], visiting entries row by row.
/// (The previous doc comment was copy-pasted from the matrix type itself.)
#[derive(Debug)]
pub struct DenseMatrixIterator<'a, T: RealNumber> {
    // current column position
    cur_c: usize,
    // current row position
    cur_r: usize,
    // number of columns in the underlying matrix
    max_c: usize,
    // number of rows in the underlying matrix
    max_r: usize,
    // matrix being traversed
    m: &'a DenseMatrix<T>,
}
impl<T: RealNumber> fmt::Display for DenseMatrix<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut rows: Vec<Vec<f64>> = Vec::new();
@@ -162,6 +172,36 @@ impl<T: RealNumber> DenseMatrix<T> {
values: values,
}
}
/// Returns an iterator over all elements of the matrix, traversing row by row.
/// (The previous doc comment was copy-pasted from a constructor.)
pub fn iter<'a>(&'a self) -> DenseMatrixIterator<'a, T> {
    DenseMatrixIterator {
        cur_c: 0,
        cur_r: 0,
        max_c: self.ncols,
        max_r: self.nrows,
        // `self` is already `&'a DenseMatrix<T>`; the old `&self` relied on
        // deref coercion (clippy: needless_borrow).
        m: self,
    }
}
}
impl<'a, T: RealNumber> Iterator for DenseMatrixIterator<'a, T> {
    type Item = T;

    fn next(&mut self) -> Option<T> {
        // Linear position check: done once every element has been yielded.
        // Comparing against max_c * max_r (rather than cur_r >= max_r) also
        // terminates immediately for degenerate matrices where max_c is 0.
        if self.cur_r * self.max_c + self.cur_c >= self.max_c * self.max_r {
            None
        } else {
            let v = self.m.get(self.cur_r, self.cur_c);
            // Advance column-first; wrap to the start of the next row.
            self.cur_c += 1;
            if self.cur_c >= self.max_c {
                self.cur_c = 0;
                self.cur_r += 1;
            }
            Some(v)
        }
    }
}
impl<'de, T: RealNumber + fmt::Debug + Deserialize<'de>> Deserialize<'de> for DenseMatrix<T> {
@@ -339,6 +379,12 @@ impl<T: RealNumber> BaseMatrix<T> for DenseMatrix<T> {
result
}
/// Copies the `row`'th row into the caller-provided `result` buffer,
/// which must hold at least `ncols` elements.
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
    (0..self.ncols).for_each(|c| result[c] = self.get(row, c));
}
fn get_col_as_vec(&self, col: usize) -> Vec<T> {
let mut result = vec![T::zero(); self.nrows];
for r in 0..self.nrows {
@@ -347,6 +393,12 @@ impl<T: RealNumber> BaseMatrix<T> for DenseMatrix<T> {
result
}
/// Copies the `col`'th column into the caller-provided `result` buffer,
/// which must hold at least `nrows` elements.
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
    (0..self.nrows).for_each(|r| result[r] = self.get(r, col));
}
fn set(&mut self, row: usize, col: usize, x: T) {
    // Column-major storage: element (row, col) lives at col * nrows + row.
    self.values[col * self.nrows + row] = x;
}
@@ -852,6 +904,13 @@ mod tests {
);
}
#[test]
fn iter() {
    // Iterating the matrix should yield the elements in the same order
    // as the source array it was built from.
    let vec = vec![1., 2., 3., 4., 5., 6.];
    let m = DenseMatrix::from_array(3, 2, &vec);
    assert_eq!(vec, m.iter().collect::<Vec<f32>>());
}
#[test]
fn v_stack() {
let a = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]);
+27
View File
@@ -102,10 +102,26 @@ impl<T: RealNumber + Scalar + AddAssign + SubAssign + MulAssign + DivAssign + Su
self.row(row).iter().map(|v| *v).collect()
}
/// Copies the `row`'th row into the caller-provided `result` buffer.
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
    // `enumerate` replaces the hand-rolled counter variable
    // (clippy: explicit_counter_loop); behavior is unchanged.
    for (r, e) in self.row(row).iter().enumerate() {
        result[r] = *e;
    }
}
fn get_col_as_vec(&self, col: usize) -> Vec<T> {
self.column(col).iter().map(|v| *v).collect()
}
/// Copies the `col`'th column into the caller-provided `result` buffer.
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
    // `enumerate` replaces the hand-rolled counter variable
    // (clippy: explicit_counter_loop); behavior is unchanged.
    for (r, e) in self.column(col).iter().enumerate() {
        result[r] = *e;
    }
}
fn set(&mut self, row: usize, col: usize, x: T) {
    // get_mut returns Option; unwrap panics if (row, col) is out of bounds.
    *self.get_mut((row, col)).unwrap() = x;
}
@@ -563,6 +579,17 @@ mod tests {
assert_eq!(m.get_col_as_vec(1), vec!(2., 5., 8.));
}
#[test]
fn copy_row_col_as_vec() {
    // Copies into a preallocated buffer should match get_row/get_col results.
    let m = DMatrix::from_row_slice(3, 3, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
    let mut v = vec![0f32; 3];
    m.copy_row_as_vec(1, &mut v);
    assert_eq!(v, vec!(4., 5., 6.));
    m.copy_col_as_vec(1, &mut v);
    assert_eq!(v, vec!(2., 5., 8.));
}
#[test]
fn element_add_sub_mul_div() {
let mut m = DMatrix::from_row_slice(2, 2, &[1.0, 2.0, 3.0, 4.0]);
+27
View File
@@ -109,10 +109,26 @@ impl<T: RealNumber + ScalarOperand + AddAssign + SubAssign + MulAssign + DivAssi
self.row(row).to_vec()
}
/// Copies the `row`'th row into the caller-provided `result` buffer.
fn copy_row_as_vec(&self, row: usize, result: &mut Vec<T>) {
    // `enumerate` replaces the hand-rolled counter variable
    // (clippy: explicit_counter_loop); behavior is unchanged.
    for (r, e) in self.row(row).iter().enumerate() {
        result[r] = *e;
    }
}
fn get_col_as_vec(&self, col: usize) -> Vec<T> {
self.column(col).to_vec()
}
/// Copies the `col`'th column into the caller-provided `result` buffer.
fn copy_col_as_vec(&self, col: usize, result: &mut Vec<T>) {
    // `enumerate` replaces the hand-rolled counter variable
    // (clippy: explicit_counter_loop); behavior is unchanged.
    for (r, e) in self.column(col).iter().enumerate() {
        result[r] = *e;
    }
}
fn set(&mut self, row: usize, col: usize, x: T) {
    // Direct 2-D index assignment; panics if (row, col) is out of bounds.
    self[[row, col]] = x;
}
@@ -669,6 +685,17 @@ mod tests {
assert_eq!(res, vec![2., 5., 8.]);
}
#[test]
fn copy_row_col_as_vec() {
    // Copies into a preallocated buffer should match get_row/get_col results.
    let m = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]);
    let mut v = vec![0f32; 3];
    m.copy_row_as_vec(1, &mut v);
    assert_eq!(v, vec!(4., 5., 6.));
    m.copy_col_as_vec(1, &mut v);
    assert_eq!(v, vec!(2., 5., 8.));
}
#[test]
fn col_mean() {
let a = arr2(&[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]);
+3 -1
View File
@@ -29,6 +29,7 @@ use super::Distance;
pub struct Euclidian {}
impl Euclidian {
#[inline]
pub(crate) fn squared_distance<T: RealNumber>(x: &Vec<T>, y: &Vec<T>) -> T {
if x.len() != y.len() {
panic!("Input vector sizes are different.");
@@ -36,7 +37,8 @@ impl Euclidian {
let mut sum = T::zero();
for i in 0..x.len() {
sum = sum + (x[i] - y[i]).powf(T::two());
let d = x[i] - y[i];
sum = sum + d * d;
}
sum
+1
View File
@@ -1,3 +1,4 @@
/// Multitude of distance metrics are defined here
pub mod distance;
pub mod num;
pub(crate) mod vector;
+40
View File
@@ -0,0 +1,40 @@
use crate::math::num::RealNumber;
use std::collections::HashMap;
pub trait RealNumberVector<T: RealNumber> {
fn unique(&self) -> (Vec<T>, Vec<usize>);
}
impl<T: RealNumber> RealNumberVector<T> for Vec<T> {
fn unique(&self) -> (Vec<T>, Vec<usize>) {
let mut unique = self.clone();
unique.sort_by(|a, b| a.partial_cmp(b).unwrap());
unique.dedup();
let mut index = HashMap::with_capacity(unique.len());
for (i, u) in unique.iter().enumerate() {
index.insert(u.to_i64().unwrap(), i);
}
let mut unique_index = Vec::with_capacity(self.len());
for e in self {
unique_index.push(index[&e.to_i64().unwrap()]);
}
(unique, unique_index)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn unique() {
        // Distinct values come back sorted; the second vector maps each
        // input element to its index in that sorted distinct list.
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        assert_eq!(
            (vec!(0.0, 1.0, 2.0, 4.0), vec!(0, 0, 1, 1, 2, 0, 3)),
            v1.unique()
        );
    }
}
+54
View File
@@ -0,0 +1,54 @@
use serde::{Deserialize, Serialize};
use crate::linalg::BaseVector;
use crate::math::num::RealNumber;
use crate::metrics::cluster_helpers::*;
#[derive(Serialize, Deserialize, Debug)]
/// Homogeneity, completeness and V-measure scores for a cluster labeling,
/// given the ground truth classes. (The previous doc comment was
/// copy-pasted from the Mean Absolute Error metric.)
pub struct HCVScore {}

impl HCVScore {
    /// Computes the (homogeneity, completeness, v-measure) triple.
    /// * `labels_true` - ground truth class labels.
    /// * `labels_pred` - cluster labels to evaluate.
    pub fn get_score<T: RealNumber, V: BaseVector<T>>(
        &self,
        labels_true: &V,
        labels_pred: &V,
    ) -> (T, T, T) {
        let labels_true = labels_true.to_vec();
        let labels_pred = labels_pred.to_vec();
        // Entropies of the class distribution and the cluster distribution.
        let entropy_c = entropy(&labels_true);
        let entropy_k = entropy(&labels_pred);
        let contingency = contingency_matrix(&labels_true, &labels_pred);
        let mi: T = mutual_info_score(&contingency);

        // By convention, a labeling with no entropy is perfectly
        // homogeneous / complete.
        let homogeneity = entropy_c.map(|e| mi / e).unwrap_or(T::one());
        let completeness = entropy_k.map(|e| mi / e).unwrap_or(T::one());

        // V-measure is the harmonic mean of homogeneity and completeness.
        // (Dropped the redundant `T::one() *` factor in the denominator.)
        let v_measure_score = if homogeneity + completeness == T::zero() {
            T::zero()
        } else {
            T::two() * homogeneity * completeness / (homogeneity + completeness)
        };

        (homogeneity, completeness, v_measure_score)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn homogeneity_score() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
        // (homogeneity, completeness, v-measure); expected values agree with
        // scikit-learn's metrics for the same labelings.
        let scores = HCVScore {}.get_score(&v1, &v2);
        assert!((0.2548f32 - scores.0).abs() < 1e-4);
        assert!((0.5440f32 - scores.1).abs() < 1e-4);
        assert!((0.3471f32 - scores.2).abs() < 1e-4);
    }
}
+127
View File
@@ -0,0 +1,127 @@
use std::collections::HashMap;
use crate::math::num::RealNumber;
use crate::math::vector::RealNumberVector;
/// Builds the contingency matrix for two labelings: cell `[i][j]` counts the
/// samples that belong to true class `i` and predicted cluster `j`
/// (indices follow the sorted distinct values returned by `unique`).
pub fn contingency_matrix<T: RealNumber>(
    labels_true: &Vec<T>,
    labels_pred: &Vec<T>,
) -> Vec<Vec<usize>> {
    let (classes, class_idx) = labels_true.unique();
    let (clusters, cluster_idx) = labels_pred.unique();

    // classes.len() x clusters.len() matrix of zeros, built with the vec!
    // macro instead of a push loop.
    let mut contingency_matrix = vec![vec![0; clusters.len()]; classes.len()];

    // class_idx and cluster_idx have the same length (one entry per sample).
    for (&r, &c) in class_idx.iter().zip(cluster_idx.iter()) {
        contingency_matrix[r][c] += 1;
    }

    contingency_matrix
}
/// Shannon entropy (natural logarithm) of the label distribution in `data`.
/// Labels are binned by their integer value via `to_i64`.
/// Always returns `Some`; the `Option` is kept for interface compatibility.
pub fn entropy<T: RealNumber>(data: &Vec<T>) -> Option<T> {
    let mut bincounts = HashMap::with_capacity(data.len());
    for e in data.iter() {
        let k = e.to_i64().unwrap();
        // Entry API: one hash lookup instead of get-then-insert.
        *bincounts.entry(k).or_insert(0) += 1;
    }

    let mut entropy = T::zero();
    // Total number of samples (sum of all bin counts).
    let sum = T::from_usize(bincounts.values().sum()).unwrap();
    for &c in bincounts.values() {
        if c > 0 {
            let pi = T::from_usize(c).unwrap();
            // Accumulates -(pi/sum) * ln(pi/sum), written as ln(pi) - ln(sum).
            entropy = entropy - (pi / sum) * (pi.ln() - sum.ln());
        }
    }

    Some(entropy)
}
/// Mutual information between the two labelings summarized by `contingency`
/// (as produced by `contingency_matrix`), clamped to be non-negative.
/// Assumes `contingency` is non-empty and rectangular.
pub fn mutual_info_score<T: RealNumber>(contingency: &Vec<Vec<usize>>) -> T {
    let mut contingency_sum = 0;
    // pi / pj: row and column marginal counts of the contingency matrix.
    let mut pi = vec![0; contingency.len()];
    let mut pj = vec![0; contingency[0].len()];
    // Coordinates and values of the non-zero cells (zero cells contribute
    // nothing to the MI sum and would make ln() blow up).
    let (mut nzx, mut nzy, mut nz_val) = (Vec::new(), Vec::new(), Vec::new());

    for r in 0..contingency.len() {
        for c in 0..contingency[0].len() {
            contingency_sum += contingency[r][c];
            pi[r] += contingency[r][c];
            pj[c] += contingency[r][c];
            if contingency[r][c] > 0 {
                nzx.push(r);
                nzy.push(c);
                nz_val.push(contingency[r][c]);
            }
        }
    }

    // Switch to the real-number domain; logs of the totals are reused below.
    let contingency_sum = T::from_usize(contingency_sum).unwrap();
    let contingency_sum_ln = contingency_sum.ln();
    let pi_sum_l = T::from_usize(pi.iter().sum()).unwrap().ln();
    let pj_sum_l = T::from_usize(pj.iter().sum()).unwrap().ln();

    // ln(n_ij) for each non-zero cell.
    let log_contingency_nm: Vec<T> = nz_val
        .iter()
        .map(|v| T::from_usize(*v).unwrap().ln())
        .collect();
    // n_ij / N for each non-zero cell.
    let contingency_nm: Vec<T> = nz_val
        .iter()
        .map(|v| T::from_usize(*v).unwrap() / contingency_sum)
        .collect();
    // Outer product of the marginals, restricted to the non-zero cells.
    let outer: Vec<usize> = nzx
        .iter()
        .zip(nzy.iter())
        .map(|(&x, &y)| pi[x] * pj[y])
        .collect();
    // -ln(a_i * b_j) + ln(sum(a)) + ln(sum(b)) per non-zero cell.
    let log_outer: Vec<T> = outer
        .iter()
        .map(|&o| -T::from_usize(o).unwrap().ln() + pi_sum_l + pj_sum_l)
        .collect();

    // MI = sum over cells of (n_ij/N) * (ln(n_ij) - ln(N)) + (n_ij/N) * log_outer.
    let mut result = T::zero();
    for i in 0..log_outer.len() {
        result = result
            + ((contingency_nm[i] * (log_contingency_nm[i] - contingency_sum_ln))
                + contingency_nm[i] * log_outer[i])
    }

    // Floating-point round-off can push the sum slightly below zero.
    result.max(T::zero())
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn contingency_matrix_test() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
        // Classes [0, 1, 2, 4] x clusters [0, 1]; counts verified by hand.
        // (The test previously only printed the result and asserted nothing.)
        assert_eq!(
            vec![vec![1, 2], vec![2, 0], vec![1, 0], vec![1, 0]],
            contingency_matrix(&v1, &v2)
        );
    }

    #[test]
    fn entropy_test() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        // -(3/7 ln(3/7) + 2/7 ln(2/7) + 1/7 ln(1/7) + 1/7 ln(1/7)) ~= 1.2770
        assert!((1.2770f64 - entropy(&v1).unwrap()).abs() < 1e-4);
    }

    #[test]
    fn mutual_info_score_test() {
        let v1 = vec![0.0, 0.0, 1.0, 1.0, 2.0, 0.0, 4.0];
        let v2 = vec![1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0];
        let s: f32 = mutual_info_score(&contingency_matrix(&v1, &v2));
        // Consistent with the homogeneity fixture: 0.32548 / 1.27703 ~= 0.2548.
        assert!((0.3254f32 - s).abs() < 1e-4);
    }
}
+39
View File
@@ -54,6 +54,8 @@
pub mod accuracy;
/// Computes Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
pub mod auc;
pub mod cluster_hcv;
pub(crate) mod cluster_helpers;
/// F1 score, also known as balanced F-score or F-measure.
pub mod f1;
/// Mean absolute error regression loss.
@@ -76,6 +78,9 @@ pub struct ClassificationMetrics {}
/// Metrics for regression models.
pub struct RegressionMetrics {}
/// Cluster metrics.
pub struct ClusterMetrics {}
impl ClassificationMetrics {
/// Accuracy score, see [accuracy](accuracy/index.html).
pub fn accuracy() -> accuracy::Accuracy {
@@ -120,6 +125,13 @@ impl RegressionMetrics {
}
}
impl ClusterMetrics {
    /// Homogeneity, completeness and V-measure scores,
    /// see [HCV score](cluster_hcv/index.html).
    /// (The previous doc comment was copy-pasted from mean squared error.)
    pub fn hcv_score() -> cluster_hcv::HCVScore {
        cluster_hcv::HCVScore {}
    }
}
/// Function that calculates accuracy score, see [accuracy](accuracy/index.html).
/// * `y_true` - ground truth (correct) labels
/// * `y_pred` - predicted labels, as returned by a classifier.
@@ -175,3 +187,30 @@ pub fn mean_absolute_error<T: RealNumber, V: BaseVector<T>>(y_true: &V, y_pred:
pub fn r2<T: RealNumber, V: BaseVector<T>>(y_true: &V, y_pred: &V) -> T {
RegressionMetrics::r2().get_score(y_true, y_pred)
}
/// Computes homogeneity score, see [HCV score](cluster_hcv/index.html):
/// the extent to which each cluster contains only members of a single class.
/// * `labels_true` - ground truth class labels.
/// * `labels_pred` - cluster labels to evaluate.
pub fn homogeneity_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
    ClusterMetrics::hcv_score()
        .get_score(labels_true, labels_pred)
        .0
}
/// Computes completeness score, see [HCV score](cluster_hcv/index.html):
/// the extent to which all members of a given class are assigned to the same cluster.
/// * `labels_true` - ground truth class labels.
/// * `labels_pred` - cluster labels to evaluate.
pub fn completeness_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
    ClusterMetrics::hcv_score()
        .get_score(labels_true, labels_pred)
        .1
}
/// Computes V-measure, see [HCV score](cluster_hcv/index.html):
/// the harmonic mean of homogeneity and completeness.
/// * `labels_true` - ground truth class labels.
/// * `labels_pred` - cluster labels to evaluate.
pub fn v_measure_score<T: RealNumber, V: BaseVector<T>>(labels_true: &V, labels_pred: &V) -> T {
    ClusterMetrics::hcv_score()
        .get_score(labels_true, labels_pred)
        .2
}