//! # Extreme Gradient Boosting (XGBoost) //! //! XGBoost is a highly efficient and effective implementation of the gradient boosting framework. //! Like other boosting models, it builds an ensemble of sequential decision trees, where each new tree //! is trained to correct the errors of the previous ones. //! //! What makes XGBoost powerful is its use of both the first and second derivatives (gradient and hessian) //! of the loss function, which allows for more accurate approximations and faster convergence. It also //! includes built-in regularization techniques (L1/`alpha` and L2/`lambda`) to prevent overfitting. //! //! This implementation was ported to Rust from the concepts and algorithm explained in the blog post //! ["XGBoost from Scratch"](https://randomrealizations.com/posts/xgboost-from-scratch/). It is designed //! to be a general-purpose regressor that can be used with any objective function that provides a gradient //! and a hessian. //! //! Example: //! //! ``` //! use smartcore::linalg::basic::matrix::DenseMatrix; //! use smartcore::xgboost::{XGRegressor, XGRegressorParameters}; //! //! // Simple dataset: predict y = 2*x //! let x = DenseMatrix::from_2d_array(&[ //! &[1.0], &[2.0], &[3.0], &[4.0], &[5.0] //! ]).unwrap(); //! let y = vec![2.0, 4.0, 6.0, 8.0, 10.0]; //! //! // Use default parameters, but set a few for demonstration //! let parameters = XGRegressorParameters::default() //! .with_n_estimators(50) //! .with_max_depth(3) //! .with_learning_rate(0.1); //! //! // Train the model //! let model = XGRegressor::fit(&x, &y, parameters).unwrap(); //! //! // Make predictions //! let x_test = DenseMatrix::from_2d_array(&[&[6.0], &[7.0]]).unwrap(); //! let y_hat = model.predict(&x_test).unwrap(); //! //! // y_hat should be close to [12.0, 14.0] //! ``` //! use rand::{seq::SliceRandom, Rng}; use std::{iter::zip, marker::PhantomData}; use crate::{ api::{PredictorBorrow, SupervisedEstimatorBorrow}, error::{Failed, FailedError}, linalg::basic::arrays::{Array1, Array2}, numbers::basenum::Number, rand_custom::get_rng_impl, }; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; /// Defines the objective function to be optimized. /// The objective function provides the loss, gradient (first derivative), and /// hessian (second derivative) required for the XGBoost algorithm. #[derive(Clone, Debug)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum Objective { /// The objective for regression tasks using Mean Squared Error. /// Loss: 0.5 * (y_true - y_pred)^2 MeanSquaredError, } impl Objective { /// Calculates the loss for each sample given the true and predicted values. /// /// # Arguments /// * `y_true` - A vector of the true target values. /// * `y_pred` - A vector of the predicted values. /// /// # Returns /// The mean of the calculated loss values. pub fn loss_function>(&self, y_true: &Y, y_pred: &Vec) -> f64 { match self { Objective::MeanSquaredError => { zip(y_true.iterator(0), y_pred) .map(|(true_val, pred_val)| { 0.5 * (true_val.to_f64().unwrap() - pred_val).powi(2) }) .sum::() / y_true.shape() as f64 } } } /// Calculates the gradient (first derivative) of the loss function. /// /// # Arguments /// * `y_true` - A vector of the true target values. /// * `y_pred` - A vector of the predicted values. /// /// # Returns /// A vector of gradients for each sample. pub fn gradient>(&self, y_true: &Y, y_pred: &Vec) -> Vec { match self { Objective::MeanSquaredError => zip(y_true.iterator(0), y_pred) .map(|(true_val, pred_val)| *pred_val - true_val.to_f64().unwrap()) .collect(), } } /// Calculates the hessian (second derivative) of the loss function. /// /// # Arguments /// * `y_true` - A vector of the true target values. /// * `y_pred` - A vector of the predicted values. /// /// # Returns /// A vector of hessians for each sample. #[allow(unused_variables)] pub fn hessian>(&self, y_true: &Y, y_pred: &[f64]) -> Vec { match self { Objective::MeanSquaredError => vec![1.0; y_true.shape()], } } } /// Represents a single decision tree in the XGBoost ensemble. /// /// This is a recursive data structure where each `TreeRegressor` is a node /// that can have a left and a right child, also of type `TreeRegressor`. #[allow(dead_code)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] struct TreeRegressor, Y: Array1> { left: Option>>, right: Option>>, /// The output value of this node. If it's a leaf, this is the final prediction. value: f64, /// The feature value threshold used to split this node. threshold: f64, /// The index of the feature used for splitting. split_feature_idx: usize, /// The gain in score achieved by this split. split_score: f64, _phantom_tx: PhantomData, _phantom_ty: PhantomData, _phantom_x: PhantomData, _phantom_y: PhantomData, } impl, Y: Array1> TreeRegressor { /// Recursively builds a decision tree (a `TreeRegressor` node). /// /// This function determines the optimal split for the given set of samples (`idxs`) /// and then recursively calls itself to build the left and right child nodes. /// /// # Arguments /// * `data` - The full training dataset. /// * `g` - Gradients for all samples. /// * `h` - Hessians for all samples. /// * `idxs` - The indices of the samples belonging to the current node. /// * `max_depth` - The maximum remaining depth for this branch. /// * `min_child_weight` - The minimum sum of hessians required in a child node. /// * `lambda` - L2 regularization term on weights. /// * `gamma` - Minimum loss reduction required to make a further partition. pub fn fit( data: &X, g: &Vec, h: &Vec, idxs: &[usize], max_depth: u16, min_child_weight: f64, lambda: f64, gamma: f64, ) -> Self { let g_sum = idxs.iter().map(|&i| g[i]).sum::(); let h_sum = idxs.iter().map(|&i| h[i]).sum::(); let value = -g_sum / (h_sum + lambda); let mut best_feature_idx = usize::MAX; let mut best_split_score = 0.0; let mut best_threshold = 0.0; let mut left = Option::None; let mut right = Option::None; if max_depth > 0 { Self::insert_child_nodes( data, g, h, idxs, &mut best_feature_idx, &mut best_split_score, &mut best_threshold, &mut left, &mut right, max_depth, min_child_weight, lambda, gamma, ); } Self { left, right, value, threshold: best_threshold, split_feature_idx: best_feature_idx, split_score: best_split_score, _phantom_tx: PhantomData, _phantom_ty: PhantomData, _phantom_x: PhantomData, _phantom_y: PhantomData, } } /// Finds the best split and creates child nodes if a valid split is found. fn insert_child_nodes( data: &X, g: &Vec, h: &Vec, idxs: &[usize], best_feature_idx: &mut usize, best_split_score: &mut f64, best_threshold: &mut f64, left: &mut Option>, right: &mut Option>, max_depth: u16, min_child_weight: f64, lambda: f64, gamma: f64, ) { let (_, n_features) = data.shape(); for i in 0..n_features { Self::find_best_split( data, g, h, idxs, i, best_feature_idx, best_split_score, best_threshold, min_child_weight, lambda, gamma, ); } // A split is only valid if it results in a positive gain. if *best_split_score > 0.0 { let mut left_idxs = Vec::new(); let mut right_idxs = Vec::new(); for idx in idxs.iter() { if data.get((*idx, *best_feature_idx)).to_f64().unwrap() <= *best_threshold { left_idxs.push(*idx); } else { right_idxs.push(*idx); } } *left = Some(Box::new(TreeRegressor::fit( data, g, h, &left_idxs, max_depth - 1, min_child_weight, lambda, gamma, ))); *right = Some(Box::new(TreeRegressor::fit( data, g, h, &right_idxs, max_depth - 1, min_child_weight, lambda, gamma, ))); } } /// Iterates through a single feature to find the best possible split point. fn find_best_split( data: &X, g: &[f64], h: &[f64], idxs: &[usize], feature_idx: usize, best_feature_idx: &mut usize, best_split_score: &mut f64, best_threshold: &mut f64, min_child_weight: f64, lambda: f64, gamma: f64, ) { let mut sorted_idxs = idxs.to_owned(); sorted_idxs.sort_by(|a, b| { data.get((*a, feature_idx)) .partial_cmp(data.get((*b, feature_idx))) .unwrap() }); let sum_g = sorted_idxs.iter().map(|&i| g[i]).sum::(); let sum_h = sorted_idxs.iter().map(|&i| h[i]).sum::(); let mut sum_g_right = sum_g; let mut sum_h_right = sum_h; let mut sum_g_left = 0.0; let mut sum_h_left = 0.0; for i in 0..sorted_idxs.len() - 1 { let idx = sorted_idxs[i]; let next_idx = sorted_idxs[i + 1]; let g_i = g[idx]; let h_i = h[idx]; let x_i = data.get((idx, feature_idx)).to_f64().unwrap(); let x_i_next = data.get((next_idx, feature_idx)).to_f64().unwrap(); sum_g_left += g_i; sum_h_left += h_i; sum_g_right -= g_i; sum_h_right -= h_i; if sum_h_left < min_child_weight || x_i == x_i_next { continue; } if sum_h_right < min_child_weight { break; } let gain = 0.5 * ((sum_g_left * sum_g_left / (sum_h_left + lambda)) + (sum_g_right * sum_g_right / (sum_h_right + lambda)) - (sum_g * sum_g / (sum_h + lambda))) - gamma; if gain > *best_split_score { *best_split_score = gain; *best_threshold = (x_i + x_i_next) / 2.0; *best_feature_idx = feature_idx; } } } /// Predicts the output values for a dataset. pub fn predict(&self, data: &X) -> Vec { let (n_samples, n_features) = data.shape(); (0..n_samples) .map(|i| { self.predict_for_row(&Vec::from_iterator( data.get_row(i).iterator(0).copied(), n_features, )) }) .collect() } /// Predicts the output value for a single row of data by traversing the tree. pub fn predict_for_row(&self, row: &Vec) -> f64 { // A leaf node is identified by having no children. if self.left.is_none() { return self.value; } // Recurse down the appropriate branch. let child = if row[self.split_feature_idx].to_f64().unwrap() <= self.threshold { self.left.as_ref().unwrap() } else { self.right.as_ref().unwrap() }; child.predict_for_row(row) } } /// Parameters for the `jRegressor` model. /// /// This struct holds all the hyperparameters that control the training process. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, Debug)] pub struct XGRegressorParameters { /// The number of boosting rounds or trees to build. pub n_estimators: usize, /// The maximum depth of each tree. pub max_depth: u16, /// Step size shrinkage used to prevent overfitting. pub learning_rate: f64, /// Minimum sum of instance weight (hessian) needed in a child. pub min_child_weight: usize, /// L2 regularization term on weights. pub lambda: f64, /// Minimum loss reduction required to make a further partition on a leaf node. pub gamma: f64, /// The initial prediction score for all instances. pub base_score: f64, /// The fraction of samples to be used for fitting the individual base learners. pub subsample: f64, /// The seed for the random number generator for reproducibility. pub seed: u64, /// The objective function to be optimized. pub objective: Objective, } impl Default for XGRegressorParameters { /// Creates a new set of `XGRegressorParameters` with default values. fn default() -> Self { Self { n_estimators: 100, learning_rate: 0.3, max_depth: 6, min_child_weight: 1, lambda: 1.0, gamma: 0.0, base_score: 0.5, subsample: 1.0, seed: 0, objective: Objective::MeanSquaredError, } } } // Builder pattern for XGRegressorParameters impl XGRegressorParameters { /// Sets the number of boosting rounds or trees to build. pub fn with_n_estimators(mut self, n_estimators: usize) -> Self { self.n_estimators = n_estimators; self } /// Sets the step size shrinkage used to prevent overfitting. /// /// Also known as `eta`. A smaller value makes the model more robust by preventing /// too much weight being given to any single tree. pub fn with_learning_rate(mut self, learning_rate: f64) -> Self { self.learning_rate = learning_rate; self } /// Sets the maximum depth of each individual tree. // A lower value helps prevent overfitting.* pub fn with_max_depth(mut self, max_depth: u16) -> Self { self.max_depth = max_depth; self } /// Sets the minimum sum of instance weight (hessian) needed in a child node. /// /// If the tree partition step results in a leaf node with the sum of // instance weight less than `min_child_weight`, then the building process* /// will give up further partitioning. pub fn with_min_child_weight(mut self, min_child_weight: usize) -> Self { self.min_child_weight = min_child_weight; self } /// Sets the L2 regularization term on weights (`lambda`). /// /// Increasing this value will make the model more conservative. pub fn with_lambda(mut self, lambda: f64) -> Self { self.lambda = lambda; self } /// Sets the minimum loss reduction required to make a further partition on a leaf node. /// /// The larger `gamma` is, the more conservative the algorithm will be. pub fn with_gamma(mut self, gamma: f64) -> Self { self.gamma = gamma; self } /// Sets the initial prediction score for all instances. pub fn with_base_score(mut self, base_score: f64) -> Self { self.base_score = base_score; self } /// Sets the fraction of samples to be used for fitting individual base learners. /// /// A value of less than 1.0 introduces randomness and helps prevent overfitting. pub fn with_subsample(mut self, subsample: f64) -> Self { self.subsample = subsample; self } /// Sets the seed for the random number generator for reproducibility. pub fn with_seed(mut self, seed: u64) -> Self { self.seed = seed; self } /// Sets the objective function to be optimized during training. pub fn with_objective(mut self, objective: Objective) -> Self { self.objective = objective; self } } /// An Extreme Gradient Boosting (XGBoost) model for regression and classification tasks. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Debug)] pub struct XGRegressor, Y: Array1> { regressors: Option>>, parameters: Option, _phantom_ty: PhantomData, _phantom_tx: PhantomData, _phantom_y: PhantomData, _phantom_x: PhantomData, } impl, Y: Array1> XGRegressor { /// Fits the XGBoost model to the training data. pub fn fit(data: &X, y: &Y, parameters: XGRegressorParameters) -> Result { if parameters.subsample > 1.0 || parameters.subsample <= 0.0 { return Err(Failed::because( FailedError::ParametersError, "Subsample ratio must be in (0, 1].", )); } let (n_samples, _) = data.shape(); let learning_rate = parameters.learning_rate; let mut predictions = vec![parameters.base_score; n_samples]; let mut regressors = Vec::new(); let mut rng = get_rng_impl(Some(parameters.seed)); for _ in 0..parameters.n_estimators { let gradients = parameters.objective.gradient(y, &predictions); let hessians = parameters.objective.hessian(y, &predictions); let sample_idxs = if parameters.subsample < 1.0 { Self::sample_without_replacement(n_samples, parameters.subsample, &mut rng) } else { (0..n_samples).collect::>() }; let regressor = TreeRegressor::fit( data, &gradients, &hessians, &sample_idxs, parameters.max_depth, parameters.min_child_weight as f64, parameters.lambda, parameters.gamma, ); let corrections = regressor.predict(data); predictions = zip(predictions, corrections) .map(|(pred, correction)| pred + (learning_rate * correction)) .collect(); regressors.push(regressor); } Ok(Self { regressors: Some(regressors), parameters: Some(parameters), _phantom_ty: PhantomData, _phantom_y: PhantomData, _phantom_tx: PhantomData, _phantom_x: PhantomData, }) } /// Predicts target values for the given input data. pub fn predict(&self, data: &X) -> Result, Failed> { let (n_samples, _) = data.shape(); let parameters = self.parameters.as_ref().unwrap(); let mut predictions = vec![parameters.base_score; n_samples]; let regressors = self.regressors.as_ref().unwrap(); for regressor in regressors.iter() { let corrections = regressor.predict(data); predictions = zip(predictions, corrections) .map(|(pred, correction)| pred + (parameters.learning_rate * correction)) .collect(); } Ok(predictions .into_iter() .map(|p| TX::from_f64(p).unwrap()) .collect()) } /// Creates a random sample of indices without replacement. fn sample_without_replacement( population_size: usize, subsample_ratio: f64, rng: &mut impl Rng, ) -> Vec { let mut indices: Vec = (0..population_size).collect(); indices.shuffle(rng); indices.truncate((population_size as f64 * subsample_ratio) as usize); indices } } // Boilerplate implementation for the smartcore traits impl, Y: Array1> SupervisedEstimatorBorrow<'_, X, Y, XGRegressorParameters> for XGRegressor { fn new() -> Self { Self { regressors: None, parameters: None, _phantom_ty: PhantomData, _phantom_y: PhantomData, _phantom_tx: PhantomData, _phantom_x: PhantomData, } } fn fit(x: &X, y: &Y, parameters: &XGRegressorParameters) -> Result { XGRegressor::fit(x, y, parameters.clone()) } } impl, Y: Array1> PredictorBorrow<'_, X, TX> for XGRegressor { fn predict(&self, x: &X) -> Result, Failed> { self.predict(x) } } // ------------------- TESTS ------------------- #[cfg(test)] mod tests { use super::*; use crate::linalg::basic::{arrays::Array, matrix::DenseMatrix}; /// Tests the gradient and hessian calculations for MeanSquaredError. #[test] fn test_mse_objective() { let objective = Objective::MeanSquaredError; let y_true = vec![1.0, 2.0, 3.0]; let y_pred = vec![1.5, 2.5, 2.5]; let gradients = objective.gradient(&y_true, &y_pred); let hessians = objective.hessian(&y_true, &y_pred); // Gradients should be (pred - true) assert_eq!(gradients, vec![0.5, 0.5, -0.5]); // Hessians should be all 1.0 for MSE assert_eq!(hessians, vec![1.0, 1.0, 1.0]); } #[test] fn test_find_best_split_multidimensional() { // Data has two features. The second feature is a better predictor. let data = vec![ vec![1.0, 10.0], // g = -0.5 vec![1.0, 20.0], // g = -1.0 vec![1.0, 30.0], // g = 1.0 vec![1.0, 40.0], // g = 1.5 ]; let data = DenseMatrix::from_2d_vec(&data).unwrap(); let g = vec![-0.5, -1.0, 1.0, 1.5]; let h = vec![1.0, 1.0, 1.0, 1.0]; let idxs = (0..4).collect::>(); let mut best_feature_idx = usize::MAX; let mut best_split_score = 0.0; let mut best_threshold = 0.0; // Manually calculated expected gain for the best split (on feature 1, with lambda=1.0). // G_left = -1.5, H_left = 2.0 // G_right = 2.5, H_right = 2.0 // G_total = 1.0, H_total = 4.0 // Gain = 0.5 * (G_l^2/(H_l+λ) + G_r^2/(H_r+λ) - G_t^2/(H_t+λ)) // Gain = 0.5 * ((-1.5)^2/(2+1) + (2.5)^2/(2+1) - (1.0)^2/(4+1)) // Gain = 0.5 * (2.25/3 + 6.25/3 - 1.0/5) = 0.5 * (0.75 + 2.0833 - 0.2) = 1.3166... let expected_gain = 1.3166666666666667; // Search both features. The algorithm must find the best split on feature 1. let (_, n_features) = data.shape(); for i in 0..n_features { TreeRegressor::, Vec>::find_best_split( &data, &g, &h, &idxs, i, &mut best_feature_idx, &mut best_split_score, &mut best_threshold, 1.0, 1.0, 0.0, ); } assert_eq!(best_feature_idx, 1); // Should choose the second feature assert!((best_split_score - expected_gain).abs() < 1e-9); assert_eq!(best_threshold, 25.0); // (20 + 30) / 2 } /// Tests that the TreeRegressor can build a simple one-level tree on multidimensional data. #[test] fn test_tree_regressor_fit_multidimensional() { let data = vec![ vec![1.0, 10.0], vec![1.0, 20.0], vec![1.0, 30.0], vec![1.0, 40.0], ]; let data = DenseMatrix::from_2d_vec(&data).unwrap(); let g = vec![-0.5, -1.0, 1.0, 1.5]; let h = vec![1.0, 1.0, 1.0, 1.0]; let idxs = (0..4).collect::>(); let tree = TreeRegressor::, Vec>::fit( &data, &g, &h, &idxs, 2, 1.0, 1.0, 0.0, ); // Check that the root node was split on the correct feature assert!(tree.left.is_some()); assert!(tree.right.is_some()); assert_eq!(tree.split_feature_idx, 1); // Should split on the second feature assert_eq!(tree.threshold, 25.0); // Check leaf values (G/H+lambda) // Left leaf: G = -1.5, H = 2.0 => value = -(-1.5)/(2+1) = 0.5 // Right leaf: G = 2.5, H = 2.0 => value = -(2.5)/(2+1) = -0.8333 assert!((tree.left.unwrap().value - 0.5).abs() < 1e-9); assert!((tree.right.unwrap().value - (-0.833333333)).abs() < 1e-9); } /// A "smoke test" to ensure the main XGRegressor can fit and predict on multidimensional data. #[test] fn test_xgregressor_fit_predict_multidimensional() { // Simple 2D data where y is roughly 2*x1 + 3*x2 let x_vec = vec![ vec![1.0, 1.0], vec![2.0, 1.0], vec![1.0, 2.0], vec![2.0, 2.0], ]; let x = DenseMatrix::from_2d_vec(&x_vec).unwrap(); let y = vec![5.0, 7.0, 8.0, 10.0]; let params = XGRegressorParameters::default() .with_n_estimators(10) .with_max_depth(2); let fit_result = XGRegressor::fit(&x, &y, params); assert!( fit_result.is_ok(), "Fit failed with error: {:?}", fit_result.err() ); let model = fit_result.unwrap(); let predict_result = model.predict(&x); assert!( predict_result.is_ok(), "Predict failed with error: {:?}", predict_result.err() ); let predictions = predict_result.unwrap(); assert_eq!(predictions.len(), 4); } }