Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
70212c71e0 | ||
|
|
63f86f7bc9 | ||
|
|
e633afa520 | ||
|
|
b6e32fb328 | ||
|
|
948d78a4d0 | ||
|
|
448b6f77e3 |
@@ -0,0 +1,41 @@
|
||||
cff-version: 1.2.0
|
||||
message: "If this software contributes to published work, please cite smartcore."
|
||||
type: software
|
||||
title: "smartcore: Machine Learning in Rust"
|
||||
abstract: "smartcore is a comprehensive machine learning and numerical computing library for Rust, offering supervised and unsupervised algorithms, model evaluation tools, and linear algebra abstractions, with optional ndarray integration."
|
||||
repository-code: "https://github.com/smartcorelib/smartcore"
|
||||
url: "https://github.com/smartcorelib"
|
||||
license: "Apache-2.0"
|
||||
keywords:
|
||||
- Rust
|
||||
- machine learning
|
||||
- numerical computing
|
||||
- linear algebra
|
||||
- classification
|
||||
- regression
|
||||
- clustering
|
||||
- SVM
|
||||
- Random Forest
|
||||
- XGBoost
|
||||
authors:
|
||||
- name: "smartcore Developers"
|
||||
- name: "Lorenzo (contributor)"
|
||||
- name: "Community contributors"
|
||||
version: "0.4.2"
|
||||
date-released: "2025-09-14"
|
||||
preferred-citation:
|
||||
type: software
|
||||
title: "smartcore: Machine Learning in Rust"
|
||||
authors:
|
||||
- name: "smartcore Developers"
|
||||
url: "https://github.com/smartcorelib"
|
||||
repository-code: "https://github.com/smartcorelib/smartcore"
|
||||
license: "Apache-2.0"
|
||||
references:
|
||||
- type: manual
|
||||
title: "smartcore Documentation"
|
||||
url: "https://docs.rs/smartcore"
|
||||
- type: webpage
|
||||
title: "smartcore Homepage"
|
||||
url: "https://github.com/smartcorelib"
|
||||
notes: "For development features, see the docs.rs page and the repository README; smartcore includes algorithms such as SVM, Random Forest, K-Means, PCA, DBSCAN, and XGBoost."
|
||||
+2
-1
@@ -2,7 +2,7 @@
|
||||
name = "smartcore"
|
||||
description = "Machine Learning in Rust."
|
||||
homepage = "https://smartcorelib.org"
|
||||
version = "0.4.3"
|
||||
version = "0.4.5"
|
||||
authors = ["smartcore Developers"]
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
@@ -28,6 +28,7 @@ num = "0.4"
|
||||
rand = { version = "0.8.5", default-features = false, features = ["small_rng"] }
|
||||
rand_distr = { version = "0.4", optional = true }
|
||||
serde = { version = "1", features = ["derive"], optional = true }
|
||||
ordered-float = "5.1.0"
|
||||
|
||||
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||
typetag = { version = "0.2", optional = true }
|
||||
|
||||
@@ -16,6 +16,132 @@
|
||||
</p>
|
||||
|
||||
-----
|
||||
[](https://github.com/smartcorelib/smartcore/actions/workflows/ci.yml)
|
||||
[](https://github.com/smartcorelib/smartcore/actions/workflows/ci.yml) [](https://doi.org/10.5281/zenodo.17219259)
|
||||
|
||||
To get familiar with the new smartcore v0.4 API, a [**Jupyter Notebook environment repository**](https://github.com/smartcorelib/smartcore-jupyter) is now available. Please see the instructions there. Contributions are welcome; see [CONTRIBUTING](.github/CONTRIBUTING.md).
|
||||
|
||||
smartcore is a fast, ergonomic machine learning library for Rust, covering classical supervised and unsupervised methods with a modular linear algebra abstraction and optional ndarray support. It aims to provide production-friendly APIs, strong typing, and good defaults while remaining flexible for research and experimentation.
|
||||
|
||||
|
||||
## Highlights
|
||||
|
||||
- Broad algorithm coverage: linear models, tree-based methods, ensembles, SVMs, neighbors, clustering, decomposition, and preprocessing.
|
||||
- Strong linear algebra traits with optional ndarray integration for users who prefer array-first workflows.
|
||||
- WASM-first defaults with attention to portability; features such as serde and datasets are opt-in.
|
||||
- Practical utilities for model selection, evaluation, readers (CSV), dataset generators, and built-in sample datasets.
|
||||
|
||||
|
||||
## Install
|
||||
|
||||
Add to Cargo.toml:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
smartcore = "^0.4.3"
|
||||
```
|
||||
|
||||
For the latest development branch:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
smartcore = { git = "https://github.com/smartcorelib/smartcore", branch = "development" }
|
||||
```
|
||||
|
||||
Optional features (examples):
|
||||
|
||||
- datasets
|
||||
- serde
|
||||
- ndarray-bindings (the former nalgebra-bindings feature has been dropped in favor of ndarray-only support)
|
||||
|
||||
Check Cargo.toml for available features and compatibility notes.
|
||||
|
||||
## Quick start
|
||||
|
||||
Here is a minimal example fitting a KNN classifier from native Rust vectors using DenseMatrix:
|
||||
|
||||
```rust
|
||||
use smartcore::linalg::basic::matrix::DenseMatrix;
|
||||
use smartcore::neighbors::knn_classifier::KNNClassifier;
|
||||
|
||||
// Turn vector slices into a matrix
|
||||
let x = DenseMatrix::from_2d_array(&[
|
||||
&[1., 2.],
|
||||
&[3., 4.],
|
||||
&[5., 6.],
|
||||
&[7., 8.],
|
||||
&[9., 10.],
|
||||
]).unwrap();
|
||||
|
||||
// Class labels
|
||||
let y = vec![2, 2, 2, 3, 3];
|
||||
|
||||
// Train classifier
|
||||
let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap();
|
||||
|
||||
// Predict
|
||||
let yhat = knn.predict(&x).unwrap();
|
||||
```
|
||||
|
||||
This example mirrors the “First Example” section of the crate docs and demonstrates smartcore’s ergonomic API surface.
|
||||
|
||||
## Algorithms
|
||||
|
||||
smartcore organizes algorithms into clear modules with consistent traits:
|
||||
|
||||
- Clustering: K-Means, DBSCAN, agglomerative (including single-linkage), with K-Means++ initialization and utilities.
|
||||
- Matrix decomposition: SVD, EVD, Cholesky, LU, QR, plus related linear algebra helpers.
|
||||
- Linear models: OLS, Ridge, Lasso, ElasticNet, Logistic Regression.
|
||||
- Ensemble and tree-based: Random Forest (classifier and regressor), Extra Trees, shared reusable components across trees and forests.
|
||||
- SVM: SVC/SVR with kernel enum support and multiclass extensions.
|
||||
- Neighbors: KNN classification and regression with distance metrics and fast selection helpers.
|
||||
- Naive Bayes: Gaussian, Bernoulli, Categorical, Multinomial.
|
||||
- Preprocessing: encoders, split utilities, and common transforms.
|
||||
- Model selection and metrics: K-fold, search parameters, and evaluation utilities.
|
||||
|
||||
Recent refactors emphasize reusable components in trees/forests and expanded multiclass SVM capabilities. XGBoost-style regression and single-linkage clustering have been added. See CHANGELOG for API changes and migration notes.
|
||||
|
||||
## Data access and readers
|
||||
|
||||
- CSV readers: Read matrices from CSV with configurable delimiter and header rows, with helpful error messages and testing utilities (including non-IO reader abstractions).
|
||||
- Dataset generators: make_blobs, make_circles, make_moons for quick experiments.
|
||||
- Built-in datasets (feature-gated): digits, diabetes, breast cancer, boston, with serialization utilities to persist or refresh .xy bundles.
|
||||
|
||||
|
||||
## WebAssembly and portability
|
||||
|
||||
smartcore adopts a WASM/WASI-first posture in defaults to ease browser and embedded deployments. Some file-system operations are restricted in wasm targets; tests and IO utilities are structured to avoid unsupported calls where possible. Enable features like serde selectively to minimize footprint. Consult module-level docs and CHANGELOG for target-specific caveats.
|
||||
|
||||
## Notebooks
|
||||
|
||||
A curated set of Jupyter notebooks is available via the companion repository to explore smartcore interactively. To run locally, use EVCXR to enable Rust notebooks. This is the recommended path to quickly experiment with the v0.4 API.
|
||||
|
||||
## Roadmap and recent changes
|
||||
|
||||
- Trait-system refactor, fewer structs and more object-safe traits, large codebase reorganization.
|
||||
- Move to Rust 2021 edition and cleanup of duplicate code paths.
|
||||
- Seeds and deterministic controls across algorithms using RNG plumbing.
|
||||
- Search parameter API for hyperparameter exploration in K-Means and SVM families.
|
||||
- Tree and forest components refactored for reuse; Extra Trees added.
|
||||
- SVM multiclass support; SVR kernel enum and related improvements.
|
||||
- XGBoost-style regression introduced; single-linkage clustering implemented.
|
||||
|
||||
See CHANGELOG.md for precise details, deprecations, and breaking changes. Some features like nalgebra-bindings have been dropped in favor of ndarray-only paths. Default features are tuned for WASM/WASI builds; enable serde/datasets as needed.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome:
|
||||
|
||||
- Open an issue describing the change and link it in the PR.
|
||||
- Keep PRs in sync with the development branch and ensure tests pass on stable Rust.
|
||||
- Provide or update tests; run clippy and apply formatting. Coverage and linting are part of the workflow.
|
||||
- Use the provided PR and issue templates to describe behavior changes, new features, and expectations.
|
||||
|
||||
If adding IO, prefer abstractions that make non-IO testing straightforward (see readers/iotesting). For datasets, keep serialization helpers in tests gated appropriately to avoid unintended file writes in wasm targets.
|
||||
|
||||
## License
|
||||
|
||||
smartcore is open source under a permissive license; see Cargo.toml and LICENSE for details. The crate metadata identifies “smartcore Developers” as authors; community contributions are credited via Git history and releases.
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
smartcore’s design incorporates well-known ML patterns while staying idiomatic to Rust. Thanks to all contributors who have helped expand algorithms, improve docs, modernize traits, and harden the codebase for production.
|
||||
|
||||
@@ -23,7 +23,10 @@
|
||||
/// ```
|
||||
/// <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||
/// <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
use std::collections::HashMap;
|
||||
use ordered_float::{FloatCore, OrderedFloat};
|
||||
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
|
||||
use num::Bounded;
|
||||
|
||||
@@ -34,6 +37,25 @@ use crate::metrics::distance::{Distance, PairwiseDistance};
|
||||
use crate::numbers::floatnum::FloatNumber;
|
||||
use crate::numbers::realnum::RealNumber;
|
||||
|
||||
/// Parameters for CosinePair construction
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CosinePairParameters {
|
||||
/// Maximum number of neighbors to consider per point (default: all points)
|
||||
pub top_k: Option<usize>,
|
||||
/// Whether to use approximate nearest neighbor search
|
||||
pub approximate: bool,
|
||||
}
|
||||
|
||||
#[allow(clippy::derivable_impls)]
|
||||
impl Default for CosinePairParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
top_k: None,
|
||||
approximate: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Inspired by Python implementation:
|
||||
/// <https://github.com/carsonfarmer/fastpair/blob/b8b4d3000ab6f795a878936667eee1b557bf353d/fastpair/base.py>
|
||||
@@ -49,12 +71,29 @@ pub struct CosinePair<'a, T: RealNumber + FloatNumber, M: Array2<T>> {
|
||||
pub distances: HashMap<usize, PairwiseDistance<T>>,
|
||||
/// conga line used to keep track of the closest pair
|
||||
pub neighbours: Vec<usize>,
|
||||
/// parameters used during construction
|
||||
pub parameters: CosinePairParameters,
|
||||
}
|
||||
|
||||
impl<'a, T: RealNumber + FloatNumber, M: Array2<T>> CosinePair<'a, T, M> {
|
||||
/// Constructor
|
||||
/// Instantiate and initialize the algorithm
|
||||
impl<'a, T: RealNumber + FloatNumber + FloatCore, M: Array2<T>> CosinePair<'a, T, M> {
|
||||
/// Constructor with default parameters (backward compatibility)
|
||||
pub fn new(m: &'a M) -> Result<Self, Failed> {
|
||||
Self::with_parameters(m, CosinePairParameters::default())
|
||||
}
|
||||
|
||||
/// Constructor with top-k limiting for faster performance
|
||||
pub fn with_top_k(m: &'a M, top_k: usize) -> Result<Self, Failed> {
|
||||
Self::with_parameters(
|
||||
m,
|
||||
CosinePairParameters {
|
||||
top_k: Some(top_k),
|
||||
approximate: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Constructor with full parameter control
|
||||
pub fn with_parameters(m: &'a M, parameters: CosinePairParameters) -> Result<Self, Failed> {
|
||||
if m.shape().0 < 2 {
|
||||
return Err(Failed::because(
|
||||
FailedError::FindFailed,
|
||||
@@ -64,96 +103,156 @@ impl<'a, T: RealNumber + FloatNumber, M: Array2<T>> CosinePair<'a, T, M> {
|
||||
|
||||
let mut init = Self {
|
||||
samples: m,
|
||||
// to be computed in init(..)
|
||||
distances: HashMap::with_capacity(m.shape().0),
|
||||
neighbours: Vec::with_capacity(m.shape().0 + 1),
|
||||
neighbours: Vec::with_capacity(m.shape().0),
|
||||
parameters,
|
||||
};
|
||||
init.init();
|
||||
Ok(init)
|
||||
}
|
||||
|
||||
/// Initialise `CosinePair` by passing a `Array2`.
|
||||
/// Build a CosinePairs data-structure from a set of (new) points.
|
||||
/// Helper function to create ordered float wrapper
|
||||
fn ordered_float(value: T) -> OrderedFloat<T> {
|
||||
OrderedFloat(value)
|
||||
}
|
||||
|
||||
/// Helper function to extract value from ordered float wrapper
|
||||
fn extract_float(ordered: OrderedFloat<T>) -> T {
|
||||
ordered.into_inner()
|
||||
}
|
||||
|
||||
/// Optimized initialization with top-k neighbor limiting
|
||||
fn init(&mut self) {
|
||||
// basic measures
|
||||
let len = self.samples.shape().0;
|
||||
let max_index = self.samples.shape().0 - 1;
|
||||
let max_neighbors: usize = self.parameters.top_k.unwrap_or(len - 1).min(len - 1);
|
||||
|
||||
// Store all closest neighbors
|
||||
let _distances = Box::new(HashMap::with_capacity(len));
|
||||
let _neighbours = Box::new(Vec::with_capacity(len));
|
||||
let mut distances = HashMap::with_capacity(len);
|
||||
let mut neighbours = Vec::with_capacity(len);
|
||||
|
||||
let mut distances = *_distances;
|
||||
let mut neighbours = *_neighbours;
|
||||
|
||||
// fill neighbours with the row indices 0..len
|
||||
neighbours.extend(0..len);
|
||||
|
||||
// init closest neighbour pairwise data
|
||||
for index_row_i in 0..(max_index) {
|
||||
// Initialize with max distances
|
||||
for i in 0..len {
|
||||
distances.insert(
|
||||
index_row_i,
|
||||
i,
|
||||
PairwiseDistance {
|
||||
node: index_row_i,
|
||||
neighbour: Option::None,
|
||||
node: i,
|
||||
neighbour: None,
|
||||
distance: Some(<T as Bounded>::max_value()),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// loop through indices and neighbours
|
||||
for index_row_i in 0..(len) {
|
||||
// start looking for the neighbour in the second element
|
||||
let mut index_closest = index_row_i + 1; // closest neighbour index
|
||||
let mut nbd: Option<T> = distances[&index_row_i].distance; // init neighbour distance
|
||||
for index_row_j in (index_row_i + 1)..len {
|
||||
distances.insert(
|
||||
index_row_j,
|
||||
PairwiseDistance {
|
||||
node: index_row_j,
|
||||
neighbour: Some(index_row_i),
|
||||
distance: nbd,
|
||||
},
|
||||
);
|
||||
// Compute distances for each point using top-k optimization
|
||||
for i in 0..len {
|
||||
let mut candidate_distances = BinaryHeap::new();
|
||||
|
||||
let d = Cosine::new().distance(
|
||||
for j in 0..len {
|
||||
if i != j {
|
||||
let distance = T::from(Cosine::new().distance(
|
||||
&Vec::from_iterator(
|
||||
self.samples.get_row(index_row_i).iterator(0).copied(),
|
||||
self.samples.get_row(i).iterator(0).copied(),
|
||||
self.samples.shape().1,
|
||||
),
|
||||
&Vec::from_iterator(
|
||||
self.samples.get_row(index_row_j).iterator(0).copied(),
|
||||
self.samples.get_row(j).iterator(0).copied(),
|
||||
self.samples.shape().1,
|
||||
),
|
||||
);
|
||||
if d < nbd.unwrap().to_f64().unwrap() {
|
||||
// set this j-value to be the closest neighbour
|
||||
index_closest = index_row_j;
|
||||
nbd = Some(T::from(d).unwrap());
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
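// Use OrderedFloat for stable ordering
// NOTE(review): `Reverse` turns the BinaryHeap into a min-heap, so the `pop()` below discards the smallest distance (the nearest candidate) whenever more than `max_neighbors` entries are held; confirm this is intended.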
|
||||
candidate_distances.push(Reverse((Self::ordered_float(distance), j)));
|
||||
|
||||
if candidate_distances.len() > max_neighbors {
|
||||
candidate_distances.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add that edge
|
||||
distances.entry(index_row_i).and_modify(|e| {
|
||||
e.distance = nbd;
|
||||
e.neighbour = Some(index_closest);
|
||||
// Find the closest neighbor from candidates
|
||||
if let Some(Reverse((closest_distance, closest_neighbor))) =
|
||||
candidate_distances.iter().min_by_key(|Reverse((d, _))| *d)
|
||||
{
|
||||
distances.entry(i).and_modify(|e| {
|
||||
e.distance = Some(Self::extract_float(*closest_distance));
|
||||
e.neighbour = Some(*closest_neighbor);
|
||||
});
|
||||
}
|
||||
// No more neighbors, terminate conga line.
|
||||
// Last person on the line has no neighbors
|
||||
distances.get_mut(&max_index).unwrap().neighbour = Some(max_index);
|
||||
distances.get_mut(&(len - 1)).unwrap().distance = Some(<T as Bounded>::max_value());
|
||||
|
||||
// compute sparse matrix (connectivity matrix)
|
||||
let mut sparse_matrix = M::zeros(len, len);
|
||||
for (_, p) in distances.iter() {
|
||||
sparse_matrix.set((p.node, p.neighbour.unwrap()), p.distance.unwrap());
|
||||
}
|
||||
|
||||
self.distances = distances;
|
||||
self.neighbours = neighbours;
|
||||
}
|
||||
|
||||
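/// Query up to `k` neighbours of a dataset row, computing cosine distances on demand
/// against a strided sample of at most `top_k` candidate rows (every other row when
/// `top_k` is `None`); distances are ordered through `OrderedFloat`.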
|
||||
pub fn query_row_top_k(
|
||||
&self,
|
||||
query_row_index: usize,
|
||||
k: usize,
|
||||
) -> Result<Vec<(T, usize)>, Failed> {
|
||||
if query_row_index >= self.samples.shape().0 {
|
||||
return Err(Failed::because(
|
||||
FailedError::FindFailed,
|
||||
"Query row index out of bounds",
|
||||
));
|
||||
}
|
||||
|
||||
if k == 0 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let max_candidates = self.parameters.top_k.unwrap_or(self.samples.shape().0);
|
||||
let actual_k: usize = k.min(max_candidates);
|
||||
|
||||
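// Use binary heap with ordered-float for reliable ordering
// NOTE(review): entries are wrapped in `Reverse`, which makes this a min-heap, so the `pop()` below evicts the smallest distance (the nearest candidate) once the heap exceeds `actual_k`; confirm this is intended.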
|
||||
let mut heap = BinaryHeap::with_capacity(actual_k + 1);
|
||||
|
||||
let candidates = if let Some(top_k) = self.parameters.top_k {
|
||||
let step = (self.samples.shape().0 / top_k).max(1);
|
||||
(0..self.samples.shape().0)
|
||||
.step_by(step)
|
||||
.filter(|&i| i != query_row_index)
|
||||
.take(top_k)
|
||||
.collect::<Vec<_>>()
|
||||
} else {
|
||||
(0..self.samples.shape().0)
|
||||
.filter(|&i| i != query_row_index)
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
for &candidate_idx in &candidates {
|
||||
let distance = T::from(Cosine::new().distance(
|
||||
&Vec::from_iterator(
|
||||
self.samples.get_row(query_row_index).iterator(0).copied(),
|
||||
self.samples.shape().1,
|
||||
),
|
||||
&Vec::from_iterator(
|
||||
self.samples.get_row(candidate_idx).iterator(0).copied(),
|
||||
self.samples.shape().1,
|
||||
),
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
heap.push(Reverse((Self::ordered_float(distance), candidate_idx)));
|
||||
|
||||
if heap.len() > actual_k {
|
||||
heap.pop();
|
||||
}
|
||||
}
|
||||
|
||||
// Convert heap to sorted vector
|
||||
let mut neighbors: Vec<_> = heap
|
||||
.into_vec()
|
||||
.into_iter()
|
||||
.map(|Reverse((dist, idx))| (Self::extract_float(dist), idx))
|
||||
.collect();
|
||||
|
||||
neighbors.sort_by(|a, b| Self::ordered_float(a.0).cmp(&Self::ordered_float(b.0)));
|
||||
|
||||
Ok(neighbors)
|
||||
}
|
||||
|
||||
/// Query k nearest neighbors for a row that's already in the dataset
|
||||
pub fn query_row(&self, query_row_index: usize, k: usize) -> Result<Vec<(T, usize)>, Failed> {
|
||||
if query_row_index >= self.samples.shape().0 {
|
||||
@@ -318,7 +417,7 @@ impl<'a, T: RealNumber + FloatNumber, M: Array2<T>> CosinePair<'a, T, M> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::linalg::basic::{arrays::Array, matrix::DenseMatrix};
|
||||
use approx::assert_relative_eq;
|
||||
use approx::{assert_relative_eq, relative_eq};
|
||||
|
||||
#[cfg_attr(
|
||||
all(target_arch = "wasm32", not(target_os = "wasi")),
|
||||
@@ -499,10 +598,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(
|
||||
all(target_arch = "wasm32", not(target_os = "wasi")),
|
||||
wasm_bindgen_test::wasm_bindgen_test
|
||||
)]
|
||||
#[test]
|
||||
fn cosine_pair_query_row_bounds_error() {
|
||||
let x = DenseMatrix::<f64>::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
|
||||
@@ -520,10 +615,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(
|
||||
all(target_arch = "wasm32", not(target_os = "wasi")),
|
||||
wasm_bindgen_test::wasm_bindgen_test
|
||||
)]
|
||||
#[test]
|
||||
fn cosine_pair_query_row_k_zero() {
|
||||
let x =
|
||||
@@ -635,6 +726,206 @@ mod tests {
|
||||
assert!(distance >= 0.0 && distance <= 2.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn query_row_top_k_top_k_limiting() {
|
||||
// Test that query_row_top_k respects top_k parameter and returns correct results
|
||||
let x = DenseMatrix::<f64>::from_2d_array(&[
|
||||
&[1.0, 0.0, 0.0], // Point 0
|
||||
&[0.0, 1.0, 0.0], // Point 1 - orthogonal to point 0
|
||||
&[0.0, 0.0, 1.0], // Point 2 - orthogonal to point 0
|
||||
&[1.0, 1.0, 0.0], // Point 3 - closer to point 0 than points 1,2
|
||||
&[0.5, 0.0, 0.0], // Point 4 - very close to point 0 (parallel)
|
||||
&[2.0, 0.0, 0.0], // Point 5 - very close to point 0 (parallel)
|
||||
&[0.0, 1.0, 1.0], // Point 6 - far from point 0
|
||||
&[3.0, 3.0, 3.0], // Point 7 - moderately close to point 0
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
// Create CosinePair with top_k=4 to limit candidates
|
||||
let cosine_pair = CosinePair::with_top_k(&x, 4).unwrap();
|
||||
|
||||
// Query for 3 nearest neighbors to point 0
|
||||
let neighbors = cosine_pair.query_row_top_k(0, 3).unwrap();
|
||||
|
||||
// Should return exactly 3 neighbors
|
||||
assert_eq!(neighbors.len(), 3);
|
||||
|
||||
// Verify that distances are in ascending order
|
||||
for i in 1..neighbors.len() {
|
||||
assert!(
|
||||
neighbors[i - 1].0 <= neighbors[i].0,
|
||||
"Distances should be in ascending order: {} <= {}",
|
||||
neighbors[i - 1].0,
|
||||
neighbors[i].0
|
||||
);
|
||||
}
|
||||
|
||||
// All distances should be valid cosine distances (0 to 2)
|
||||
for (distance, index) in &neighbors {
|
||||
assert!(
|
||||
*distance >= 0.0 && *distance <= 2.0,
|
||||
"Cosine distance {} should be between 0 and 2",
|
||||
distance
|
||||
);
|
||||
assert!(
|
||||
*index < x.shape().0,
|
||||
"Neighbor index {} should be less than dataset size {}",
|
||||
index,
|
||||
x.shape().0
|
||||
);
|
||||
assert!(
|
||||
*index != 0,
|
||||
"Neighbor index should not include query point itself"
|
||||
);
|
||||
}
|
||||
|
||||
// The closest neighbor should be either point 4 or 5 (parallel vectors)
|
||||
// These should have cosine distance ≈ 0
|
||||
let closest_distance = neighbors[0].0;
|
||||
assert!(
|
||||
closest_distance < 0.01,
|
||||
"Closest parallel vector should have distance close to 0, got {}",
|
||||
closest_distance
|
||||
);
|
||||
|
||||
// Compare against the full (non-top_k) query for the same row
|
||||
let cosine_pair_full = CosinePair::new(&x).unwrap();
|
||||
let neighbors_full = cosine_pair_full.query_row(0, 3).unwrap();
|
||||
|
||||
// Results should be the same or very close since we're asking for top 3
|
||||
// but the algorithm might find different candidates due to top_k limiting
|
||||
assert_eq!(neighbors.len(), neighbors_full.len());
|
||||
|
||||
// The closest neighbor should be the same in both cases
|
||||
let closest_idx_fast = neighbors[0].1;
|
||||
let closest_idx_full = neighbors_full[0].1;
|
||||
let closest_dist_fast = neighbors[0].0;
|
||||
let closest_dist_full = neighbors_full[0].0;
|
||||
|
||||
// Either we get the same closest neighbor, or distances are very close
|
||||
if closest_idx_fast == closest_idx_full {
|
||||
assert!(relative_eq!(
|
||||
closest_dist_fast,
|
||||
closest_dist_full,
|
||||
epsilon = 1e-10
|
||||
));
|
||||
} else {
|
||||
// Different neighbors, but distances should be very close (parallel vectors)
|
||||
assert!(relative_eq!(
|
||||
closest_dist_fast,
|
||||
closest_dist_full,
|
||||
epsilon = 1e-6
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn query_row_top_k_performance_vs_accuracy() {
|
||||
// Test that query_row_top_k provides reasonable performance/accuracy tradeoff
|
||||
// and handles edge cases properly
|
||||
let large_dataset = DenseMatrix::<f32>::from_2d_array(&[
|
||||
&[1.0f32, 2.0, 3.0, 4.0], // Point 0 - query point
|
||||
&[1.1f32, 2.1, 3.1, 4.1], // Point 1 - very close to 0
|
||||
&[1.05f32, 2.05, 3.05, 4.05], // Point 2 - very close to 0
|
||||
&[2.0f32, 4.0, 6.0, 8.0], // Point 3 - parallel to 0 (2x scaling)
|
||||
&[0.5f32, 1.0, 1.5, 2.0], // Point 4 - parallel to 0 (0.5x scaling)
|
||||
&[-1.0f32, -2.0, -3.0, -4.0], // Point 5 - opposite to 0
|
||||
&[4.0f32, 3.0, 2.0, 1.0], // Point 6 - different direction
|
||||
&[0.0f32, 0.0, 0.0, 0.1], // Point 7 - mostly orthogonal
|
||||
&[10.0f32, 20.0, 30.0, 40.0], // Point 8 - parallel but far
|
||||
&[1.0f32, 0.0, 0.0, 0.0], // Point 9 - partially similar
|
||||
&[0.0f32, 2.0, 0.0, 0.0], // Point 10 - partially similar
|
||||
&[0.0f32, 0.0, 3.0, 0.0], // Point 11 - partially similar
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
// Test with aggressive top_k limiting (only consider 5 out of 11 other points)
|
||||
let cosine_pair_limited = CosinePair::with_top_k(&large_dataset, 5).unwrap();
|
||||
|
||||
// Query for 4 nearest neighbors
|
||||
let neighbors_limited = cosine_pair_limited.query_row_top_k(0, 4).unwrap();
|
||||
|
||||
// Should return exactly 4 neighbors
|
||||
assert_eq!(neighbors_limited.len(), 4);
|
||||
|
||||
// Test error handling - out of bounds query
|
||||
let result_oob = cosine_pair_limited.query_row_top_k(15, 2);
|
||||
assert!(result_oob.is_err());
|
||||
if let Err(e) = result_oob {
|
||||
assert_eq!(
|
||||
e,
|
||||
Failed::because(FailedError::FindFailed, "Query row index out of bounds")
|
||||
);
|
||||
}
|
||||
|
||||
// Test k=0 case
|
||||
let neighbors_zero = cosine_pair_limited.query_row_top_k(0, 0).unwrap();
|
||||
assert_eq!(neighbors_zero.len(), 0);
|
||||
|
||||
// Test k > available candidates
|
||||
let neighbors_large_k = cosine_pair_limited.query_row_top_k(0, 20).unwrap();
|
||||
assert!(neighbors_large_k.len() <= 11); // At most 11 other points
|
||||
|
||||
// Verify ordering is correct
|
||||
for i in 1..neighbors_limited.len() {
|
||||
assert!(
|
||||
neighbors_limited[i - 1].0 <= neighbors_limited[i].0,
|
||||
"Distance ordering violation at position {}: {} > {}",
|
||||
i,
|
||||
neighbors_limited[i - 1].0,
|
||||
neighbors_limited[i].0
|
||||
);
|
||||
}
|
||||
|
||||
// The closest neighbors should be the parallel vectors (points 1, 2, 3, 4)
|
||||
// since they have the smallest cosine distances
|
||||
let closest_distance = neighbors_limited[0].0;
|
||||
assert!(
|
||||
closest_distance < 0.1,
|
||||
"Closest neighbor should be nearly parallel, distance: {}",
|
||||
closest_distance
|
||||
);
|
||||
|
||||
// Compare with full algorithm for accuracy assessment
|
||||
let cosine_pair_full = CosinePair::new(&large_dataset).unwrap();
|
||||
let neighbors_full = cosine_pair_full.query_row(0, 4).unwrap();
|
||||
|
||||
// The fast version might not find the exact same neighbors due to sampling,
|
||||
// but the closest neighbor's distance should be very similar
|
||||
let dist_diff = (neighbors_limited[0].0 - neighbors_full[0].0).abs();
|
||||
assert!(
|
||||
dist_diff < 0.01,
|
||||
"Fast and full algorithms should give similar closest distances. Diff: {}",
|
||||
dist_diff
|
||||
);
|
||||
|
||||
// Verify that all returned indices are valid and unique
|
||||
let mut indices: Vec<usize> = neighbors_limited.iter().map(|(_, idx)| *idx).collect();
|
||||
indices.sort();
|
||||
indices.dedup();
|
||||
assert_eq!(
|
||||
indices.len(),
|
||||
neighbors_limited.len(),
|
||||
"All neighbor indices should be unique"
|
||||
);
|
||||
|
||||
for &idx in &indices {
|
||||
assert!(
|
||||
idx < large_dataset.shape().0,
|
||||
"Neighbor index {} should be valid",
|
||||
idx
|
||||
);
|
||||
assert!(idx != 0, "Neighbor should not include query point itself");
|
||||
}
|
||||
|
||||
// Test with f32 precision to ensure type compatibility
|
||||
for (distance, _) in &neighbors_limited {
|
||||
assert!(!distance.is_nan(), "Distance should not be NaN");
|
||||
assert!(distance.is_finite(), "Distance should be finite");
|
||||
assert!(*distance >= 0.0, "Distance should be non-negative");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cosine_pair_float_precision() {
|
||||
// Test with f32 precision
|
||||
|
||||
@@ -92,7 +92,7 @@ impl<T: Number> Cosine<T> {
|
||||
let magnitude_y = Self::magnitude(y);
|
||||
|
||||
if magnitude_x == 0.0 || magnitude_y == 0.0 {
|
||||
panic!("Cannot compute cosine distance for zero-magnitude vectors.");
|
||||
return f64::MIN;
|
||||
}
|
||||
|
||||
dot_product / (magnitude_x * magnitude_y)
|
||||
@@ -188,12 +188,12 @@ mod tests {
|
||||
wasm_bindgen_test::wasm_bindgen_test
|
||||
)]
|
||||
#[test]
|
||||
#[should_panic(expected = "Cannot compute cosine distance for zero-magnitude vectors.")]
|
||||
fn cosine_distance_zero_vector() {
|
||||
let a = vec![0, 0, 0];
|
||||
let b = vec![1, 2, 3];
|
||||
|
||||
let _dist: f64 = Cosine::new().distance(&a, &b);
|
||||
let dist: f64 = Cosine::new().distance(&a, &b);
|
||||
assert!(dist > 1e300)
|
||||
}
|
||||
|
||||
#[cfg_attr(
|
||||
|
||||
Reference in New Issue
Block a user