diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index b49743f..f633cac 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -1,6 +1,7 @@ -//! # K Nearest Neighbors Regressor +//! # K Nearest Neighbors Regressor with Feature Sparsing //! //! Regressor that predicts estimated values as a function of k nearest neightbours. +//! Now supports feature sparsing - the ability to consider only a subset of features during prediction. //! //! `KNNRegressor` relies on 2 backend algorithms to speedup KNN queries: //! * [`LinearSearch`](../../algorithm/neighbour/linear_search/index.html) @@ -29,6 +30,10 @@ //! //! let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); +//! +//! // Predict using only features at indices 0 +//! let feature_indices = vec![0]; +//! let y_hat_sparse = knn.predict_sparse(&x, &feature_indices).unwrap(); //! ``` //! //! variable `y_hat` will hold predicted value @@ -77,12 +82,13 @@ pub struct KNNRegressorParameters>> { pub struct KNNRegressor, Y: Array1, D: Distance>> { y: Option, + x: Option, // Store training data for sparse feature prediction knn_algorithm: Option>, + distance: Option, // Store distance function for sparse prediction weight: Option, k: Option, _phantom_tx: PhantomData, _phantom_ty: PhantomData, - _phantom_x: PhantomData, } impl, Y: Array1, D: Distance>> @@ -92,12 +98,20 @@ impl, Y: Array1, D: Distance>> self.y.as_ref().unwrap() } + fn x(&self) -> &X { + self.x.as_ref().unwrap() + } + fn knn_algorithm(&self) -> &KNNAlgorithm { self.knn_algorithm .as_ref() .expect("Missing parameter: KNNAlgorithm") } + fn distance(&self) -> &D { + self.distance.as_ref().expect("Missing parameter: distance") + } + fn weight(&self) -> &KNNWeightFunction { self.weight.as_ref().expect("Missing parameter: weight") } @@ -176,12 +190,13 @@ impl, Y: Array1, D: Distance>> fn new() -> Self { Self { y: Option::None, + x: Option::None, knn_algorithm: Option::None, + distance: Option::None, weight: Option::None, k: Option::None, _phantom_tx: PhantomData, _phantom_ty: PhantomData, - _phantom_x: PhantomData, } } @@ -231,16 +246,17 @@ impl, Y: Array1, D: Distance>> ))); } - let knn_algo = parameters.algorithm.fit(data, parameters.distance)?; + let knn_algo = parameters.algorithm.fit(data, parameters.distance.clone())?; Ok(KNNRegressor { y: Some(y.clone()), + x: Some(x.clone()), k: Some(parameters.k), knn_algorithm: Some(knn_algo), + distance: Some(parameters.distance), weight: Some(parameters.weight), _phantom_tx: PhantomData, _phantom_ty: PhantomData, - _phantom_x: PhantomData, }) } @@ -262,6 +278,45 @@ impl, Y: Array1, D: Distance>> Ok(result) } + /// Predict the target for the provided data using only specified features. + /// * `x` - data of shape NxM where N is number of data points to estimate and M is number of features. + /// * `feature_indices` - indices of features to consider (e.g., [0, 2, 4] to use only features at positions 0, 2, and 4) + /// + /// Returns a vector of size N with estimates. + pub fn predict_sparse(&self, x: &X, feature_indices: &[usize]) -> Result { + let (n_samples, n_features) = x.shape(); + + // Validate feature indices + for &idx in feature_indices { + if idx >= n_features { + return Err(Failed::predict(&format!( + "Feature index {} out of bounds (max: {})", + idx, + n_features - 1 + ))); + } + } + + if feature_indices.is_empty() { + return Err(Failed::predict( + "feature_indices cannot be empty" + )); + } + + let mut result = Y::zeros(n_samples); + + let mut row_vec = vec![TX::zero(); feature_indices.len()]; + for (i, row) in x.row_iter().enumerate() { + // Extract only the specified features + for (j, &feat_idx) in feature_indices.iter().enumerate() { + row_vec[j] = *row.get(feat_idx); + } + result.set(i, self.predict_for_row_sparse(&row_vec, feature_indices)?); + } + + Ok(result) + } + fn predict_for_row(&self, row: &Vec) -> Result { let search_result = self.knn_algorithm().find(row, self.k.unwrap())?; let mut result = TY::zero(); @@ -277,6 +332,50 @@ impl, Y: Array1, D: Distance>> Ok(result) } + + fn predict_for_row_sparse( + &self, + row: &Vec, + feature_indices: &[usize], + ) -> Result { + let training_data = self.x(); + let (n_training_samples, _) = training_data.shape(); + let k = self.k.unwrap(); + + // Manually compute distances using only specified features + let mut distances: Vec<(usize, f64)> = Vec::with_capacity(n_training_samples); + + for i in 0..n_training_samples { + let train_row = training_data.get_row(i); + + // Extract sparse features from training data + let mut train_sparse = Vec::with_capacity(feature_indices.len()); + for &feat_idx in feature_indices { + train_sparse.push(*train_row.get(feat_idx)); + } + + // Compute distance using only selected features + let dist = self.distance().distance(row, &train_sparse); + distances.push((i, dist)); + } + + // Sort by distance and take k nearest + distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + let k_nearest: Vec<(usize, f64)> = distances.into_iter().take(k).collect(); + + // Compute weighted prediction + let mut result = TY::zero(); + let weights = self + .weight() + .calc_weights(k_nearest.iter().map(|v| v.1).collect()); + let w_sum: f64 = weights.iter().copied().sum(); + + for (neighbor, w) in k_nearest.iter().zip(weights.iter()) { + result += *self.y().get(neighbor.0) * TY::from_f64(*w / w_sum).unwrap(); + } + + Ok(result) + } } #[cfg(test)] @@ -332,6 +431,91 @@ mod tests { } } + #[cfg_attr( + all(target_arch = "wasm32", not(target_os = "wasi")), + wasm_bindgen_test::wasm_bindgen_test + )] + #[test] + fn knn_predict_sparse() { + // Training data with 3 features + let x = DenseMatrix::from_2d_array(&[ + &[1., 2., 10.], + &[3., 4., 20.], + &[5., 6., 30.], + &[7., 8., 40.], + &[9., 10., 50.], + ]) + .unwrap(); + let y: Vec = vec![1., 2., 3., 4., 5.]; + + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); + + // Test data + let x_test = DenseMatrix::from_2d_array(&[ + &[1., 2., 999.], // Third feature is very different + &[5., 6., 999.], + ]) + .unwrap(); + + // Predict using only first two features (ignore the third) + let feature_indices = vec![0, 1]; + let y_hat_sparse = knn.predict_sparse(&x_test, &feature_indices).unwrap(); + + // Should get good predictions since we're ignoring the mismatched third feature + assert_eq!(2, Vec::len(&y_hat_sparse)); + assert!((y_hat_sparse[0] - 2.0).abs() < 1.0); // Should be close to 1-2 + assert!((y_hat_sparse[1] - 3.0).abs() < 1.0); // Should be close to 3 + } + + #[cfg_attr( + all(target_arch = "wasm32", not(target_os = "wasi")), + wasm_bindgen_test::wasm_bindgen_test + )] + #[test] + fn knn_predict_sparse_single_feature() { + let x = DenseMatrix::from_2d_array(&[ + &[1., 100., 1000.], + &[2., 200., 2000.], + &[3., 300., 3000.], + &[4., 400., 4000.], + &[5., 500., 5000.], + ]) + .unwrap(); + let y: Vec = vec![1., 2., 3., 4., 5.]; + + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); + + let x_test = DenseMatrix::from_2d_array(&[&[1.5, 999., 9999.]]).unwrap(); + + // Use only first feature + let y_hat = knn.predict_sparse(&x_test, &[0]).unwrap(); + + // Should predict based on first feature only + assert_eq!(1, Vec::len(&y_hat)); + assert!((y_hat[0] - 1.5).abs() < 1.0); + } + + #[cfg_attr( + all(target_arch = "wasm32", not(target_os = "wasi")), + wasm_bindgen_test::wasm_bindgen_test + )] + #[test] + fn knn_predict_sparse_invalid_indices() { + let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.]]).unwrap(); + let y: Vec = vec![1., 2.]; + + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); + let x_test = DenseMatrix::from_2d_array(&[&[1., 2.]]).unwrap(); + + // Index out of bounds + let result = knn.predict_sparse(&x_test, &[5]); + assert!(result.is_err()); + + // Empty indices + let result = knn.predict_sparse(&x_test, &[]); + assert!(result.is_err()); + } + #[cfg_attr( all(target_arch = "wasm32", not(target_os = "wasi")), wasm_bindgen_test::wasm_bindgen_test @@ -350,4 +534,4 @@ mod tests { assert_eq!(knn, deserialized_knn); } -} +} \ No newline at end of file