From 5cc55283670f58d867eddad786760d420fd8502b Mon Sep 17 00:00:00 2001 From: Daniel Lacina <127566471+DanielLacina@users.noreply.github.com> Date: Wed, 9 Jul 2025 09:25:45 -0500 Subject: [PATCH] implemented xgdboost_regression (#314) * implemented xgd_regression --- src/lib.rs | 1 + src/xgboost/mod.rs | 16 + src/xgboost/xgb_regressor.rs | 762 +++++++++++++++++++++++++++++++++++ 3 files changed, 779 insertions(+) create mode 100644 src/xgboost/mod.rs create mode 100644 src/xgboost/xgb_regressor.rs diff --git a/src/lib.rs b/src/lib.rs index c6f9349..c68368f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -130,5 +130,6 @@ pub mod readers; pub mod svm; /// Supervised tree-based learning methods pub mod tree; +pub mod xgboost; pub(crate) mod rand_custom; diff --git a/src/xgboost/mod.rs b/src/xgboost/mod.rs new file mode 100644 index 0000000..68e087b --- /dev/null +++ b/src/xgboost/mod.rs @@ -0,0 +1,16 @@ +//! # XGBoost +//! +//! XGBoost, which stands for Extreme Gradient Boosting, is a powerful and efficient implementation of the gradient boosting framework. Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. +//! +//! The core idea of boosting is to build the model in a stage-wise fashion. It learns from its mistakes by sequentially adding new models that correct the errors of the previous ones. Unlike bagging, which trains models in parallel, boosting is a sequential process. Each new tree is fit on a modified version of the original data set, specifically focusing on the instances where the previous models performed poorly. +//! +//! XGBoost enhances this process through several key innovations. It employs a more regularized model formalization to control over-fitting, which gives it better performance. 
It also has a highly optimized and parallelized tree construction process, making it significantly faster and more scalable than traditional gradient boosting implementations. +//! +//! ## References: +//! +//! * "Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., 10. Boosting and Additive Trees +//! * XGBoost: A Scalable Tree Boosting System, Chen T., Guestrin C. + +// xgboost implementation +pub mod xgb_regressor; +pub use xgb_regressor::{XGRegressor, XGRegressorParameters}; diff --git a/src/xgboost/xgb_regressor.rs b/src/xgboost/xgb_regressor.rs new file mode 100644 index 0000000..ac6ec75 --- /dev/null +++ b/src/xgboost/xgb_regressor.rs @@ -0,0 +1,762 @@ +//! # Extreme Gradient Boosting (XGBoost) +//! +//! XGBoost is a highly efficient and effective implementation of the gradient boosting framework. +//! Like other boosting models, it builds an ensemble of sequential decision trees, where each new tree +//! is trained to correct the errors of the previous ones. +//! +//! What makes XGBoost powerful is its use of both the first and second derivatives (gradient and hessian) +//! of the loss function, which allows for more accurate approximations and faster convergence. It also +//! includes built-in regularization techniques (L1/`alpha` and L2/`lambda`) to prevent overfitting. +//! +//! This implementation was ported to Rust from the concepts and algorithm explained in the blog post +//! ["XGBoost from Scratch"](https://randomrealizations.com/posts/xgboost-from-scratch/). It is designed +//! to be a general-purpose regressor that can be used with any objective function that provides a gradient +//! and a hessian. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::basic::matrix::DenseMatrix; +//! use smartcore::xgboost::{XGRegressor, XGRegressorParameters}; +//! +//! // Simple dataset: predict y = 2*x +//! let x = DenseMatrix::from_2d_array(&[ +//! &[1.0], &[2.0], &[3.0], &[4.0], &[5.0] +//! ]).unwrap(); +//! 
let y = vec![2.0, 4.0, 6.0, 8.0, 10.0]; +//! +//! // Use default parameters, but set a few for demonstration +//! let parameters = XGRegressorParameters::default() +//! .with_n_estimators(50) +//! .with_max_depth(3) +//! .with_learning_rate(0.1); +//! +//! // Train the model +//! let model = XGRegressor::fit(&x, &y, parameters).unwrap(); +//! +//! // Make predictions +//! let x_test = DenseMatrix::from_2d_array(&[&[6.0], &[7.0]]).unwrap(); +//! let y_hat = model.predict(&x_test).unwrap(); +//! +//! // y_hat should be close to [12.0, 14.0] +//! ``` +//! + +use rand::{seq::SliceRandom, Rng}; +use std::{iter::zip, marker::PhantomData}; + +use crate::{ + api::{PredictorBorrow, SupervisedEstimatorBorrow}, + error::{Failed, FailedError}, + linalg::basic::arrays::{Array1, Array2}, + numbers::basenum::Number, + rand_custom::get_rng_impl, +}; + +/// Defines the objective function to be optimized. +/// The objective function provides the loss, gradient (first derivative), and +/// hessian (second derivative) required for the XGBoost algorithm. +#[derive(Clone, Debug)] +pub enum Objective { + /// The objective for regression tasks using Mean Squared Error. + /// Loss: 0.5 * (y_true - y_pred)^2 + MeanSquaredError, +} + +impl Objective { + /// Calculates the loss for each sample given the true and predicted values. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. + /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// The mean of the calculated loss values. + pub fn loss_function>(&self, y_true: &Y, y_pred: &Vec) -> f64 { + match self { + Objective::MeanSquaredError => { + zip(y_true.iterator(0), y_pred) + .map(|(true_val, pred_val)| { + 0.5 * (true_val.to_f64().unwrap() - pred_val).powi(2) + }) + .sum::() + / y_true.shape() as f64 + } + } + } + + /// Calculates the gradient (first derivative) of the loss function. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. 
+ /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// A vector of gradients for each sample. + pub fn gradient>(&self, y_true: &Y, y_pred: &Vec) -> Vec { + match self { + Objective::MeanSquaredError => zip(y_true.iterator(0), y_pred) + .map(|(true_val, pred_val)| (*pred_val - true_val.to_f64().unwrap())) + .collect(), + } + } + + /// Calculates the hessian (second derivative) of the loss function. + /// + /// # Arguments + /// * `y_true` - A vector of the true target values. + /// * `y_pred` - A vector of the predicted values. + /// + /// # Returns + /// A vector of hessians for each sample. + #[allow(unused_variables)] + pub fn hessian>(&self, y_true: &Y, y_pred: &[f64]) -> Vec { + match self { + Objective::MeanSquaredError => vec![1.0; y_true.shape()], + } + } +} + +/// Represents a single decision tree in the XGBoost ensemble. +/// +/// This is a recursive data structure where each `TreeRegressor` is a node +/// that can have a left and a right child, also of type `TreeRegressor`. +#[allow(dead_code)] +struct TreeRegressor, Y: Array1> { + left: Option>>, + right: Option>>, + /// The output value of this node. If it's a leaf, this is the final prediction. + value: f64, + /// The feature value threshold used to split this node. + threshold: f64, + /// The index of the feature used for splitting. + split_feature_idx: usize, + /// The gain in score achieved by this split. + split_score: f64, + _phantom_tx: PhantomData, + _phantom_ty: PhantomData, + _phantom_x: PhantomData, + _phantom_y: PhantomData, +} + +impl, Y: Array1> + TreeRegressor +{ + /// Recursively builds a decision tree (a `TreeRegressor` node). + /// + /// This function determines the optimal split for the given set of samples (`idxs`) + /// and then recursively calls itself to build the left and right child nodes. + /// + /// # Arguments + /// * `data` - The full training dataset. + /// * `g` - Gradients for all samples. + /// * `h` - Hessians for all samples. 
+ /// * `idxs` - The indices of the samples belonging to the current node. + /// * `max_depth` - The maximum remaining depth for this branch. + /// * `min_child_weight` - The minimum sum of hessians required in a child node. + /// * `lambda` - L2 regularization term on weights. + /// * `gamma` - Minimum loss reduction required to make a further partition. + pub fn fit( + data: &X, + g: &Vec, + h: &Vec, + idxs: &[usize], + max_depth: u16, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) -> Self { + let g_sum = idxs.iter().map(|&i| g[i]).sum::(); + let h_sum = idxs.iter().map(|&i| h[i]).sum::(); + let value = -g_sum / (h_sum + lambda); + + let mut best_feature_idx = usize::MAX; + let mut best_split_score = 0.0; + let mut best_threshold = 0.0; + let mut left = Option::None; + let mut right = Option::None; + + if max_depth > 0 { + Self::insert_child_nodes( + data, + g, + h, + idxs, + &mut best_feature_idx, + &mut best_split_score, + &mut best_threshold, + &mut left, + &mut right, + max_depth, + min_child_weight, + lambda, + gamma, + ); + } + + Self { + left, + right, + value, + threshold: best_threshold, + split_feature_idx: best_feature_idx, + split_score: best_split_score, + _phantom_tx: PhantomData, + _phantom_ty: PhantomData, + _phantom_x: PhantomData, + _phantom_y: PhantomData, + } + } + + /// Finds the best split and creates child nodes if a valid split is found. + fn insert_child_nodes( + data: &X, + g: &Vec, + h: &Vec, + idxs: &[usize], + best_feature_idx: &mut usize, + best_split_score: &mut f64, + best_threshold: &mut f64, + left: &mut Option>, + right: &mut Option>, + max_depth: u16, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) { + let (_, n_features) = data.shape(); + for i in 0..n_features { + Self::find_best_split( + data, + g, + h, + idxs, + i, + best_feature_idx, + best_split_score, + best_threshold, + min_child_weight, + lambda, + gamma, + ); + } + + // A split is only valid if it results in a positive gain. 
+ if *best_split_score > 0.0 { + let mut left_idxs = Vec::new(); + let mut right_idxs = Vec::new(); + for idx in idxs.iter() { + if data.get((*idx, *best_feature_idx)).to_f64().unwrap() <= *best_threshold { + left_idxs.push(*idx); + } else { + right_idxs.push(*idx); + } + } + + *left = Some(Box::new(TreeRegressor::fit( + data, + g, + h, + &left_idxs, + max_depth - 1, + min_child_weight, + lambda, + gamma, + ))); + *right = Some(Box::new(TreeRegressor::fit( + data, + g, + h, + &right_idxs, + max_depth - 1, + min_child_weight, + lambda, + gamma, + ))); + } + } + + /// Iterates through a single feature to find the best possible split point. + fn find_best_split( + data: &X, + g: &[f64], + h: &[f64], + idxs: &[usize], + feature_idx: usize, + best_feature_idx: &mut usize, + best_split_score: &mut f64, + best_threshold: &mut f64, + min_child_weight: f64, + lambda: f64, + gamma: f64, + ) { + let mut sorted_idxs = idxs.to_owned(); + sorted_idxs.sort_by(|a, b| { + data.get((*a, feature_idx)) + .partial_cmp(data.get((*b, feature_idx))) + .unwrap() + }); + + let sum_g = sorted_idxs.iter().map(|&i| g[i]).sum::(); + let sum_h = sorted_idxs.iter().map(|&i| h[i]).sum::(); + + let mut sum_g_right = sum_g; + let mut sum_h_right = sum_h; + let mut sum_g_left = 0.0; + let mut sum_h_left = 0.0; + + for i in 0..sorted_idxs.len() - 1 { + let idx = sorted_idxs[i]; + let next_idx = sorted_idxs[i + 1]; + + let g_i = g[idx]; + let h_i = h[idx]; + let x_i = data.get((idx, feature_idx)).to_f64().unwrap(); + let x_i_next = data.get((next_idx, feature_idx)).to_f64().unwrap(); + + sum_g_left += g_i; + sum_h_left += h_i; + sum_g_right -= g_i; + sum_h_right -= h_i; + + if sum_h_left < min_child_weight || x_i == x_i_next { + continue; + } + if sum_h_right < min_child_weight { + break; + } + + let gain = 0.5 + * ((sum_g_left * sum_g_left / (sum_h_left + lambda)) + + (sum_g_right * sum_g_right / (sum_h_right + lambda)) + - (sum_g * sum_g / (sum_h + lambda))) + - gamma; + + if gain > *best_split_score 
{
+                *best_split_score = gain;
+                *best_threshold = (x_i + x_i_next) / 2.0;
+                *best_feature_idx = feature_idx;
+            }
+        }
+    }
+
+    /// Predicts the output values for a dataset.
+    pub fn predict(&self, data: &X) -> Vec<f64> {
+        let (n_samples, n_features) = data.shape();
+        (0..n_samples)
+            .map(|i| {
+                self.predict_for_row(&Vec::from_iterator(
+                    data.get_row(i).iterator(0).copied(),
+                    n_features,
+                ))
+            })
+            .collect()
+    }
+
+    /// Predicts the output value for a single row of data by traversing the tree.
+    pub fn predict_for_row(&self, row: &Vec<TX>) -> f64 {
+        // A leaf node is identified by having no children.
+        if self.left.is_none() {
+            return self.value;
+        }
+
+        // Recurse down the appropriate branch.
+        let child = if row[self.split_feature_idx].to_f64().unwrap() <= self.threshold {
+            self.left.as_ref().unwrap()
+        } else {
+            self.right.as_ref().unwrap()
+        };
+
+        child.predict_for_row(row)
+    }
+}
+
+/// Parameters for the `XGRegressor` model.
+///
+/// This struct holds all the hyperparameters that control the training process.
+#[derive(Clone, Debug)]
+pub struct XGRegressorParameters {
+    /// The number of boosting rounds or trees to build.
+    pub n_estimators: usize,
+    /// The maximum depth of each tree.
+    pub max_depth: u16,
+    /// Step size shrinkage used to prevent overfitting.
+    pub learning_rate: f64,
+    /// Minimum sum of instance weight (hessian) needed in a child.
+    pub min_child_weight: usize,
+    /// L2 regularization term on weights.
+    pub lambda: f64,
+    /// Minimum loss reduction required to make a further partition on a leaf node.
+    pub gamma: f64,
+    /// The initial prediction score for all instances.
+    pub base_score: f64,
+    /// The fraction of samples to be used for fitting the individual base learners.
+    pub subsample: f64,
+    /// The seed for the random number generator for reproducibility.
+    pub seed: u64,
+    /// The objective function to be optimized.
+    pub objective: Objective,
+}
+
+impl Default for XGRegressorParameters {
+    /// Creates a new set of `XGRegressorParameters` with default values.
+    fn default() -> Self {
+        Self {
+            n_estimators: 100,
+            learning_rate: 0.3,
+            max_depth: 6,
+            min_child_weight: 1,
+            lambda: 1.0,
+            gamma: 0.0,
+            base_score: 0.5,
+            subsample: 1.0,
+            seed: 0,
+            objective: Objective::MeanSquaredError,
+        }
+    }
+}
+
+// Builder pattern for XGRegressorParameters
+impl XGRegressorParameters {
+    /// Sets the number of boosting rounds or trees to build.
+    pub fn with_n_estimators(mut self, n_estimators: usize) -> Self {
+        self.n_estimators = n_estimators;
+        self
+    }
+
+    /// Sets the step size shrinkage used to prevent overfitting.
+    ///
+    /// Also known as `eta`. A smaller value makes the model more robust by preventing
+    /// too much weight being given to any single tree.
+    pub fn with_learning_rate(mut self, learning_rate: f64) -> Self {
+        self.learning_rate = learning_rate;
+        self
+    }
+
+    /// Sets the maximum depth of each individual tree.
+    /// A lower value helps prevent overfitting.
+    pub fn with_max_depth(mut self, max_depth: u16) -> Self {
+        self.max_depth = max_depth;
+        self
+    }
+
+    /// Sets the minimum sum of instance weight (hessian) needed in a child node.
+    ///
+    /// If the tree partition step results in a leaf node with the sum of
+    /// instance weight less than `min_child_weight`, then the building process
+    /// will give up further partitioning.
+    pub fn with_min_child_weight(mut self, min_child_weight: usize) -> Self {
+        self.min_child_weight = min_child_weight;
+        self
+    }
+
+    /// Sets the L2 regularization term on weights (`lambda`).
+    ///
+    /// Increasing this value will make the model more conservative.
+    pub fn with_lambda(mut self, lambda: f64) -> Self {
+        self.lambda = lambda;
+        self
+    }
+
+    /// Sets the minimum loss reduction required to make a further partition on a leaf node.
+    ///
+    /// The larger `gamma` is, the more conservative the algorithm will be.
+ pub fn with_gamma(mut self, gamma: f64) -> Self { + self.gamma = gamma; + self + } + + /// Sets the initial prediction score for all instances. + pub fn with_base_score(mut self, base_score: f64) -> Self { + self.base_score = base_score; + self + } + + /// Sets the fraction of samples to be used for fitting individual base learners. + /// + /// A value of less than 1.0 introduces randomness and helps prevent overfitting. + pub fn with_subsample(mut self, subsample: f64) -> Self { + self.subsample = subsample; + self + } + + /// Sets the seed for the random number generator for reproducibility. + pub fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } + + /// Sets the objective function to be optimized during training. + pub fn with_objective(mut self, objective: Objective) -> Self { + self.objective = objective; + self + } +} + +/// An Extreme Gradient Boosting (XGBoost) model for regression and classification tasks. +pub struct XGRegressor, Y: Array1> { + regressors: Option>>, + parameters: Option, + _phantom_ty: PhantomData, + _phantom_tx: PhantomData, + _phantom_y: PhantomData, + _phantom_x: PhantomData, +} + +impl, Y: Array1> XGRegressor { + /// Fits the XGBoost model to the training data. 
+ pub fn fit(data: &X, y: &Y, parameters: XGRegressorParameters) -> Result { + if parameters.subsample > 1.0 || parameters.subsample <= 0.0 { + return Err(Failed::because( + FailedError::ParametersError, + "Subsample ratio must be in (0, 1].", + )); + } + + let (n_samples, _) = data.shape(); + let learning_rate = parameters.learning_rate; + let mut predictions = vec![parameters.base_score; n_samples]; + + let mut regressors = Vec::new(); + let mut rng = get_rng_impl(Some(parameters.seed)); + + for _ in 0..parameters.n_estimators { + let gradients = parameters.objective.gradient(y, &predictions); + let hessians = parameters.objective.hessian(y, &predictions); + + let sample_idxs = if parameters.subsample < 1.0 { + Self::sample_without_replacement(n_samples, parameters.subsample, &mut rng) + } else { + (0..n_samples).collect::>() + }; + + let regressor = TreeRegressor::fit( + data, + &gradients, + &hessians, + &sample_idxs, + parameters.max_depth, + parameters.min_child_weight as f64, + parameters.lambda, + parameters.gamma, + ); + + let corrections = regressor.predict(data); + predictions = zip(predictions, corrections) + .map(|(pred, correction)| pred + (learning_rate * correction)) + .collect(); + + regressors.push(regressor); + } + + Ok(Self { + regressors: Some(regressors), + parameters: Some(parameters), + _phantom_ty: PhantomData, + _phantom_y: PhantomData, + _phantom_tx: PhantomData, + _phantom_x: PhantomData, + }) + } + + /// Predicts target values for the given input data. 
+ pub fn predict(&self, data: &X) -> Result, Failed> { + let (n_samples, _) = data.shape(); + + let parameters = self.parameters.as_ref().unwrap(); + let mut predictions = vec![parameters.base_score; n_samples]; + let regressors = self.regressors.as_ref().unwrap(); + + for regressor in regressors.iter() { + let corrections = regressor.predict(data); + predictions = zip(predictions, corrections) + .map(|(pred, correction)| pred + (parameters.learning_rate * correction)) + .collect(); + } + + Ok(predictions + .into_iter() + .map(|p| TX::from_f64(p).unwrap()) + .collect()) + } + + /// Creates a random sample of indices without replacement. + fn sample_without_replacement( + population_size: usize, + subsample_ratio: f64, + rng: &mut impl Rng, + ) -> Vec { + let mut indices: Vec = (0..population_size).collect(); + indices.shuffle(rng); + indices.truncate((population_size as f64 * subsample_ratio) as usize); + indices + } +} + +// Boilerplate implementation for the smartcore traits +impl, Y: Array1> + SupervisedEstimatorBorrow<'_, X, Y, XGRegressorParameters> for XGRegressor +{ + fn new() -> Self { + Self { + regressors: None, + parameters: None, + _phantom_ty: PhantomData, + _phantom_y: PhantomData, + _phantom_tx: PhantomData, + _phantom_x: PhantomData, + } + } + + fn fit(x: &X, y: &Y, parameters: &XGRegressorParameters) -> Result { + XGRegressor::fit(x, y, parameters.clone()) + } +} + +impl, Y: Array1> PredictorBorrow<'_, X, TX> + for XGRegressor +{ + fn predict(&self, x: &X) -> Result, Failed> { + self.predict(x) + } +} + +// ------------------- TESTS ------------------- +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::basic::{arrays::Array, matrix::DenseMatrix}; + + /// Tests the gradient and hessian calculations for MeanSquaredError. 
+ #[test] + fn test_mse_objective() { + let objective = Objective::MeanSquaredError; + let y_true = vec![1.0, 2.0, 3.0]; + let y_pred = vec![1.5, 2.5, 2.5]; + + let gradients = objective.gradient(&y_true, &y_pred); + let hessians = objective.hessian(&y_true, &y_pred); + + // Gradients should be (pred - true) + assert_eq!(gradients, vec![0.5, 0.5, -0.5]); + // Hessians should be all 1.0 for MSE + assert_eq!(hessians, vec![1.0, 1.0, 1.0]); + } + + #[test] + fn test_find_best_split_multidimensional() { + // Data has two features. The second feature is a better predictor. + let data = vec![ + vec![1.0, 10.0], // g = -0.5 + vec![1.0, 20.0], // g = -1.0 + vec![1.0, 30.0], // g = 1.0 + vec![1.0, 40.0], // g = 1.5 + ]; + let data = DenseMatrix::from_2d_vec(&data).unwrap(); + let g = vec![-0.5, -1.0, 1.0, 1.5]; + let h = vec![1.0, 1.0, 1.0, 1.0]; + let idxs = (0..4).collect::>(); + + let mut best_feature_idx = usize::MAX; + let mut best_split_score = 0.0; + let mut best_threshold = 0.0; + + // Manually calculated expected gain for the best split (on feature 1, with lambda=1.0). + // G_left = -1.5, H_left = 2.0 + // G_right = 2.5, H_right = 2.0 + // G_total = 1.0, H_total = 4.0 + // Gain = 0.5 * (G_l^2/(H_l+λ) + G_r^2/(H_r+λ) - G_t^2/(H_t+λ)) + // Gain = 0.5 * ((-1.5)^2/(2+1) + (2.5)^2/(2+1) - (1.0)^2/(4+1)) + // Gain = 0.5 * (2.25/3 + 6.25/3 - 1.0/5) = 0.5 * (0.75 + 2.0833 - 0.2) = 1.3166... + let expected_gain = 1.3166666666666667; + + // Search both features. The algorithm must find the best split on feature 1. 
+ let (_, n_features) = data.shape(); + for i in 0..n_features { + TreeRegressor::, Vec>::find_best_split( + &data, + &g, + &h, + &idxs, + i, + &mut best_feature_idx, + &mut best_split_score, + &mut best_threshold, + 1.0, + 1.0, + 0.0, + ); + } + + assert_eq!(best_feature_idx, 1); // Should choose the second feature + assert!((best_split_score - expected_gain).abs() < 1e-9); + assert_eq!(best_threshold, 25.0); // (20 + 30) / 2 + } + + /// Tests that the TreeRegressor can build a simple one-level tree on multidimensional data. + #[test] + fn test_tree_regressor_fit_multidimensional() { + let data = vec![ + vec![1.0, 10.0], + vec![1.0, 20.0], + vec![1.0, 30.0], + vec![1.0, 40.0], + ]; + let data = DenseMatrix::from_2d_vec(&data).unwrap(); + let g = vec![-0.5, -1.0, 1.0, 1.5]; + let h = vec![1.0, 1.0, 1.0, 1.0]; + let idxs = (0..4).collect::>(); + + let tree = TreeRegressor::, Vec>::fit( + &data, &g, &h, &idxs, 2, 1.0, 1.0, 0.0, + ); + + // Check that the root node was split on the correct feature + assert!(tree.left.is_some()); + assert!(tree.right.is_some()); + assert_eq!(tree.split_feature_idx, 1); // Should split on the second feature + assert_eq!(tree.threshold, 25.0); + + // Check leaf values (G/H+lambda) + // Left leaf: G = -1.5, H = 2.0 => value = -(-1.5)/(2+1) = 0.5 + // Right leaf: G = 2.5, H = 2.0 => value = -(2.5)/(2+1) = -0.8333 + assert!((tree.left.unwrap().value - 0.5).abs() < 1e-9); + assert!((tree.right.unwrap().value - (-0.833333333)).abs() < 1e-9); + } + + /// A "smoke test" to ensure the main XGRegressor can fit and predict on multidimensional data. 
+ #[test] + fn test_xgregressor_fit_predict_multidimensional() { + // Simple 2D data where y is roughly 2*x1 + 3*x2 + let x_vec = vec![ + vec![1.0, 1.0], + vec![2.0, 1.0], + vec![1.0, 2.0], + vec![2.0, 2.0], + ]; + let x = DenseMatrix::from_2d_vec(&x_vec).unwrap(); + let y = vec![5.0, 7.0, 8.0, 10.0]; + + let params = XGRegressorParameters::default() + .with_n_estimators(10) + .with_max_depth(2); + + let fit_result = XGRegressor::fit(&x, &y, params); + assert!( + fit_result.is_ok(), + "Fit failed with error: {:?}", + fit_result.err() + ); + + let model = fit_result.unwrap(); + let predict_result = model.predict(&x); + assert!( + predict_result.is_ok(), + "Predict failed with error: {:?}", + predict_result.err() + ); + + let predictions = predict_result.unwrap(); + assert_eq!(predictions.len(), 4); + } +}