15 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Lorenzo | `78780787db` | Update ci.yml | 2025-01-22 12:12:07 +00:00 |
| Lorenzo Mec-iS | `4aee603ae4` | fix test conditions | 2025-01-22 12:08:11 +00:00 |
| Lorenzo Mec-iS | `4878042392` | Merge branch 'issue-50-predict-proba-for-randomforest' of github.com:smartcorelib/smartcore into issue-50-predict-proba-for-randomforest | 2025-01-20 20:13:04 +00:00 |
| Lorenzo Mec-iS | `d427c91cef` | try to fix test error | 2025-01-20 20:12:41 +00:00 |
| Lorenzo Mec-iS | `0262dae872` | Merge branch 'development' of github.com:smartcorelib/smartcore into issue-50-predict-proba-for-randomforest | 2025-01-20 18:51:36 +00:00 |
| Lorenzo | `5d6ed49071` | Merge branch 'development' into issue-50-predict-proba-for-randomforest | 2025-01-20 18:51:06 +00:00 |
| Lorenzo Mec-iS | `bb356e6a28` | fix test | 2025-01-20 17:29:29 +00:00 |
| Lorenzo Mec-iS | `52b797d520` | format | 2025-01-20 17:18:09 +00:00 |
| Lorenzo Mec-iS | `63fa00334b` | Fix clippy error | 2025-01-20 17:17:41 +00:00 |
| Lorenzo Mec-iS | `40ee35b04f` | Implement predict_proba for RandomForestClassifier | 2025-01-20 17:15:52 +00:00 |
| Lorenzo Mec-iS | `5711788fd8` | add proper error handling | 2025-01-20 16:08:29 +00:00 |
| Lorenzo Mec-iS | `fc7f2e61d9` | format | 2025-01-20 15:27:39 +00:00 |
| Lorenzo Mec-iS | `609f8024bc` | more clippy fixes | 2025-01-20 15:23:36 +00:00 |
| Lorenzo Mec-iS | `58ee0cb8d1` | Some automated fixes suggested by cargo clippy --fix | 2025-01-20 15:04:21 +00:00 |
| Lorenzo Mec-iS | `68fd27f8f4` | Implement predict_proba for DecisionTreeClassifier | 2025-01-20 14:59:50 +00:00 |
15 changed files with 247 additions and 606 deletions
.github/CONTRIBUTING.md · +12
@@ -70,3 +70,15 @@ $ rust-code-analysis-cli -p src/algorithm/neighbour/fastpair.rs --ls 22 --le 213
* **PRs on development**: any change should first be opened as a PR against `development`
* **testing**: everything should work and be tested as defined in the workflow. If any test fails for unrelated reasons, note the failure in a comment on the PR.
## Suggestions for debugging
1. Install `lldb` for your platform
2. Run `rust-lldb target/debug/libsmartcore.rlib` from your command line
3. In lldb, set up some breakpoints using `b func_name` or `b src/path/to/file.rs:linenumber`
4. In lldb, run a single test with `r the_name_of_your_test`
5. In lldb, display variables in scope with `frame variable <name>`
6. In lldb, evaluate an expression with `p <expr>`
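For example, a session following these steps might look like the sketch below (the breakpoint location and test name are placeholders; output is omitted):

```
$ rust-lldb target/debug/libsmartcore.rlib
(lldb) b src/algorithm/neighbour/fastpair.rs:100
(lldb) r fastpair_ordered_pairs
(lldb) frame variable distances
(lldb) p distances.len()
```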
.github/workflows/ci.yml · +17 -6
@@ -19,13 +19,15 @@ jobs:
{ os: "ubuntu", target: "i686-unknown-linux-gnu" },
{ os: "ubuntu", target: "wasm32-unknown-unknown" },
{ os: "macos", target: "aarch64-apple-darwin" },
{ os: "ubuntu", target: "wasm32-wasi" },
]
env:
TZ: "/usr/share/zoneinfo/your/location"
RUST_BACKTRACE: "1"
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Cache .cargo and target
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: |
~/.cargo
@@ -35,13 +37,16 @@ jobs:
- name: Install Rust toolchain
uses: actions-rs/toolchain@v1
with:
toolchain: stable
toolchain: 1.81 # 1.82 seems to break wasm32 tests https://github.com/rustwasm/wasm-bindgen/issues/4274
target: ${{ matrix.platform.target }}
profile: minimal
default: true
- name: Install test runner for wasm
if: matrix.platform.target == 'wasm32-unknown-unknown'
run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
- name: Install test runner for wasi
if: matrix.platform.target == 'wasm32-wasi'
run: curl https://wasmtime.dev/install.sh -sSf | bash
- name: Stable Build with all features
uses: actions-rs/cargo@v1
with:
@@ -61,7 +66,13 @@ jobs:
- name: Tests in WASM
if: matrix.platform.target == 'wasm32-unknown-unknown'
run: wasm-pack test --node -- --all-features
- name: Tests in WASI
if: matrix.platform.target == 'wasm32-wasi'
run: |
export WASMTIME_HOME="$HOME/.wasmtime"
export PATH="$WASMTIME_HOME/bin:$PATH"
cargo install cargo-wasi && cargo wasi test
check_features:
runs-on: "${{ matrix.platform.os }}-latest"
strategy:
@@ -71,9 +82,9 @@ jobs:
env:
TZ: "/usr/share/zoneinfo/your/location"
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Cache .cargo and target
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: |
~/.cargo
+2 -2
@@ -12,9 +12,9 @@ jobs:
env:
TZ: "/usr/share/zoneinfo/your/location"
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Cache .cargo
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: |
~/.cargo
+1 -1
@@ -14,7 +14,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Cache .cargo and target
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: |
~/.cargo
Cargo.toml · +1 -1
@@ -2,7 +2,7 @@
name = "smartcore"
description = "Machine Learning in Rust."
homepage = "https://smartcorelib.org"
version = "0.4.1"
version = "0.4.0"
authors = ["smartcore Developers"]
edition = "2021"
license = "Apache-2.0"
README.md · +1 -1
@@ -18,4 +18,4 @@
-----
[![CI](https://github.com/smartcorelib/smartcore/actions/workflows/ci.yml/badge.svg)](https://github.com/smartcorelib/smartcore/actions/workflows/ci.yml)
To start getting familiar with the new smartcore v0.4 API, there is now available a [**Jupyter Notebook environment repository**](https://github.com/smartcorelib/smartcore-jupyter). Please see instructions there, contributions welcome see [CONTRIBUTING](.github/CONTRIBUTING.md).
To start getting familiar with the new smartcore v0.3 API, there is now available a [**Jupyter Notebook environment repository**](https://github.com/smartcorelib/smartcore-jupyter). Please see instructions there, contributions welcome see [CONTRIBUTING](.github/CONTRIBUTING.md).
src/algorithm/neighbour/fastpair.rs · -114
@@ -173,21 +173,6 @@ impl<'a, T: RealNumber + FloatNumber, M: Array2<T>> FastPair<'a, T, M> {
}
}
///
/// Return ordered dissimilarities, from closest to furthest
///
#[allow(dead_code)]
pub fn ordered_pairs(&self) -> std::vec::IntoIter<&PairwiseDistance<T>> {
// improvement: implement this to return `impl Iterator<Item = &PairwiseDistance<T>>`
// need to implement trait `Iterator` for `Vec<&PairwiseDistance<T>>`
let mut distances = self
.distances
.values()
.collect::<Vec<&PairwiseDistance<T>>>();
distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
distances.into_iter()
}
//
// Compute distances from `input` to all other points in the data structure;
// `input` is the row index in the sample matrix.
@@ -603,103 +588,4 @@ mod tests_fastpair {
assert_eq!(closest, min_dissimilarity);
}
#[test]
fn fastpair_ordered_pairs() {
let x = DenseMatrix::<f64>::from_2d_array(&[
&[5.1, 3.5, 1.4, 0.2],
&[4.9, 3.0, 1.4, 0.2],
&[4.7, 3.2, 1.3, 0.2],
&[4.6, 3.1, 1.5, 0.2],
&[5.0, 3.6, 1.4, 0.2],
&[5.4, 3.9, 1.7, 0.4],
&[4.9, 3.1, 1.5, 0.1],
&[7.0, 3.2, 4.7, 1.4],
&[6.4, 3.2, 4.5, 1.5],
&[6.9, 3.1, 4.9, 1.5],
&[5.5, 2.3, 4.0, 1.3],
&[6.5, 2.8, 4.6, 1.5],
&[4.6, 3.4, 1.4, 0.3],
&[5.0, 3.4, 1.5, 0.2],
&[4.4, 2.9, 1.4, 0.2],
])
.unwrap();
let fastpair = FastPair::new(&x).unwrap();
let ordered = fastpair.ordered_pairs();
let mut previous: f64 = -1.0;
for p in ordered {
if previous == -1.0 {
previous = p.distance.unwrap();
} else {
let current = p.distance.unwrap();
assert!(current >= previous);
previous = current;
}
}
}
#[test]
fn test_empty_set() {
let empty_matrix = DenseMatrix::<f64>::zeros(0, 0);
let result = FastPair::new(&empty_matrix);
assert!(result.is_err());
if let Err(e) = result {
assert_eq!(
e,
Failed::because(FailedError::FindFailed, "min number of rows should be 3")
);
}
}
#[test]
fn test_single_point() {
let single_point = DenseMatrix::from_2d_array(&[&[1.0, 2.0, 3.0]]).unwrap();
let result = FastPair::new(&single_point);
assert!(result.is_err());
if let Err(e) = result {
assert_eq!(
e,
Failed::because(FailedError::FindFailed, "min number of rows should be 3")
);
}
}
#[test]
fn test_two_points() {
let two_points = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
let result = FastPair::new(&two_points);
assert!(result.is_err());
if let Err(e) = result {
assert_eq!(
e,
Failed::because(FailedError::FindFailed, "min number of rows should be 3")
);
}
}
#[test]
fn test_three_identical_points() {
let identical_points =
DenseMatrix::from_2d_array(&[&[1.0, 1.0], &[1.0, 1.0], &[1.0, 1.0]]).unwrap();
let result = FastPair::new(&identical_points);
assert!(result.is_ok());
let fastpair = result.unwrap();
let closest_pair = fastpair.closest_pair();
assert_eq!(closest_pair.distance, Some(0.0));
}
#[test]
fn test_result_unwrapping() {
let valid_matrix =
DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0], &[5.0, 6.0], &[7.0, 8.0]])
.unwrap();
let result = FastPair::new(&valid_matrix);
assert!(result.is_ok());
// This should not panic
let _fastpair = result.unwrap();
}
}
src/ensemble/random_forest_classifier.rs · +162
@@ -55,7 +55,9 @@ use serde::{Deserialize, Serialize};
use crate::api::{Predictor, SupervisedEstimator};
use crate::error::{Failed, FailedError};
use crate::linalg::basic::arrays::MutArray;
use crate::linalg::basic::arrays::{Array1, Array2};
use crate::linalg::basic::matrix::DenseMatrix;
use crate::numbers::basenum::Number;
use crate::numbers::floatnum::FloatNumber;
@@ -602,11 +604,76 @@ impl<TX: FloatNumber + PartialOrd, TY: Number + Ord, X: Array2<TX>, Y: Array1<TY
}
samples
}
/// Predict class probabilities for X.
///
/// The predicted class probabilities of an input sample are computed as
/// the mean predicted class probabilities of the trees in the forest.
/// The class probability of a single tree is the fraction of samples of
/// the same class in a leaf.
///
/// # Arguments
///
/// * `x` - The input samples. A matrix of shape (n_samples, n_features).
///
/// # Returns
///
/// * `Result<DenseMatrix<f64>, Failed>` - The class probabilities of the input samples.
/// The order of the classes corresponds to that in the fitted `classes` attribute.
/// The matrix has shape (n_samples, n_classes).
///
/// # Errors
///
/// Returns a `Failed` error if:
/// * The model has not been fitted yet.
/// * The input `x` is not compatible with the model's expected input.
/// * Any of the tree predictions fail.
///
/// # Examples
///
/// ```
/// use smartcore::ensemble::random_forest_classifier::RandomForestClassifier;
/// use smartcore::linalg::basic::matrix::DenseMatrix;
/// use smartcore::linalg::basic::arrays::Array;
///
/// let x = DenseMatrix::from_2d_array(&[
/// &[5.1, 3.5, 1.4, 0.2],
/// &[4.9, 3.0, 1.4, 0.2],
/// &[7.0, 3.2, 4.7, 1.4],
/// ]).unwrap();
/// let y = vec![0, 0, 1];
///
/// let forest = RandomForestClassifier::fit(&x, &y, Default::default()).unwrap();
/// let probas = forest.predict_proba(&x).unwrap();
///
/// assert_eq!(probas.shape(), (3, 2));
/// ```
pub fn predict_proba(&self, x: &X) -> Result<DenseMatrix<f64>, Failed> {
let (n_samples, _) = x.shape();
let n_classes = self.classes.as_ref().unwrap().len();
let mut probas = DenseMatrix::<f64>::zeros(n_samples, n_classes);
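// Each tree votes with its hard class prediction; tally one vote per (sample, class) cell.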
for tree in self.trees.as_ref().unwrap().iter() {
let tree_predictions: Y = tree.predict(x).unwrap();
for (i, &class_idx) in tree_predictions.iterator(0).enumerate() {
let class_ = class_idx.to_usize().unwrap();
probas.add_element_mut((i, class_), 1.0);
}
}
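// Average the vote counts over all trees so that each row sums to 1.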
let n_trees: f64 = self.trees.as_ref().unwrap().len() as f64;
probas.mul_scalar_mut(1.0 / n_trees);
Ok(probas)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::ensemble::random_forest_classifier::RandomForestClassifier;
use crate::linalg::basic::arrays::Array;
use crate::linalg::basic::matrix::DenseMatrix;
use crate::metrics::*;
@@ -760,6 +827,101 @@ mod tests {
);
}
#[cfg_attr(
all(target_arch = "wasm32", not(target_os = "wasi")),
wasm_bindgen_test::wasm_bindgen_test
)]
#[test]
fn test_random_forest_predict_proba() {
use num_traits::FromPrimitive;
// Iris-like dataset (subset)
let x: DenseMatrix<f64> = DenseMatrix::from_2d_array(&[
&[5.1, 3.5, 1.4, 0.2],
&[4.9, 3.0, 1.4, 0.2],
&[4.7, 3.2, 1.3, 0.2],
&[4.6, 3.1, 1.5, 0.2],
&[5.0, 3.6, 1.4, 0.2],
&[7.0, 3.2, 4.7, 1.4],
&[6.4, 3.2, 4.5, 1.5],
&[6.9, 3.1, 4.9, 1.5],
&[5.5, 2.3, 4.0, 1.3],
&[6.5, 2.8, 4.6, 1.5],
])
.unwrap();
let y: Vec<u32> = vec![0, 0, 0, 0, 0, 1, 1, 1, 1, 1];
let forest = RandomForestClassifier::fit(&x, &y, Default::default()).unwrap();
let probas = forest.predict_proba(&x).unwrap();
// Test shape
assert_eq!(probas.shape(), (10, 2));
let (pro_n_rows, _) = probas.shape();
// Test probability sum
for i in 0..pro_n_rows {
let row_sum: f64 = probas.get_row(i).sum();
assert!(
(row_sum - 1.0).abs() < 1e-6,
"Row probabilities should sum to 1"
);
}
// Test class prediction
let predictions: Vec<u32> = (0..pro_n_rows)
.map(|i| {
if probas.get((i, 0)) > probas.get((i, 1)) {
0
} else {
1
}
})
.collect();
let acc = accuracy(&y, &predictions);
assert!(acc > 0.8, "Accuracy should be high for the training set");
// Test probability values
// These values are approximate and based on typical random forest behavior
for i in 0..(pro_n_rows / 2) {
assert!(
f64::from_f32(0.6).unwrap().lt(probas.get((i, 0))),
"Class 0 samples should have high probability for class 0"
);
assert!(
f64::from_f32(0.4).unwrap().gt(probas.get((i, 1))),
"Class 0 samples should have low probability for class 1"
);
}
for i in (pro_n_rows / 2)..pro_n_rows {
assert!(
f64::from_f32(0.6).unwrap().lt(probas.get((i, 1))),
"Class 1 samples should have high probability for class 1"
);
assert!(
f64::from_f32(0.4).unwrap().gt(probas.get((i, 0))),
"Class 1 samples should have low probability for class 0"
);
}
// Test with new data
let x_new = DenseMatrix::from_2d_array(&[
&[5.0, 3.4, 1.5, 0.2], // Should be close to class 0
&[6.3, 3.3, 4.7, 1.6], // Should be close to class 1
])
.unwrap();
let probas_new = forest.predict_proba(&x_new).unwrap();
assert_eq!(probas_new.shape(), (2, 2));
assert!(
probas_new.get((0, 0)) > probas_new.get((0, 1)),
"First sample should be predicted as class 0"
);
assert!(
probas_new.get((1, 1)) > probas_new.get((1, 0)),
"Second sample should be predicted as class 1"
);
}
#[cfg_attr(
all(target_arch = "wasm32", not(target_os = "wasi")),
wasm_bindgen_test::wasm_bindgen_test
src/lib.rs · +1
@@ -7,6 +7,7 @@
clippy::approx_constant
)]
#![warn(missing_docs)]
#![warn(rustdoc::missing_doc_code_examples)]
//! # smartcore
//!
src/linalg/basic/matrix.rs · -1
@@ -663,7 +663,6 @@ mod tests {
#[test]
fn test_instantiate_err_view3() {
let x = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.], &[7., 8., 9.]]).unwrap();
#[allow(clippy::reversed_empty_ranges)]
let v = DenseMatrixView::new(&x, 0..3, 4..3);
assert!(v.is_err());
}
src/naive_bayes/bernoulli.rs · +2 -1
@@ -257,7 +257,8 @@ impl<TY: Number + Ord + Unsigned> BernoulliNBDistribution<TY> {
/// Fits the distribution to a NxM matrix where N is number of samples and M is number of features.
/// * `x` - training data.
/// * `y` - vector with target values (classes) of length N.
/// * `priors` - Optional vector with prior probabilities of the classes. If not defined, priors are adjusted according to the data.
/// * `priors` - Optional vector with prior probabilities of the classes. If not defined,
/// priors are adjusted according to the data.
/// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter.
/// * `binarize` - Threshold for binarizing.
fn fit<TX: Number + PartialOrd, X: Array2<TX>, Y: Array1<TY>>(
src/naive_bayes/gaussian.rs · +2 -1
@@ -174,7 +174,8 @@ impl<TY: Number + Ord + Unsigned> GaussianNBDistribution<TY> {
/// Fits the distribution to a NxM matrix where N is number of samples and M is number of features.
/// * `x` - training data.
/// * `y` - vector with target values (classes) of length N.
/// * `priors` - Optional vector with prior probabilities of the classes. If not defined, priors are adjusted according to the data.
/// * `priors` - Optional vector with prior probabilities of the classes. If not defined,
/// priors are adjusted according to the data.
pub fn fit<TX: Number + RealNumber, X: Array2<TX>, Y: Array1<TY>>(
x: &X,
y: &Y,
src/naive_bayes/mod.rs · +37 -474
@@ -40,7 +40,7 @@ use crate::linalg::basic::arrays::{Array1, Array2, ArrayView1};
use crate::numbers::basenum::Number;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::marker::PhantomData;
use std::{cmp::Ordering, marker::PhantomData};
/// Distribution used in the Naive Bayes classifier.
pub(crate) trait NBDistribution<X: Number, Y: Number>: Clone {
@@ -93,41 +93,41 @@ impl<TX: Number, TY: Number, X: Array2<TX>, Y: Array1<TY>, D: NBDistribution<TX,
/// Returns a vector of size N with class estimates.
pub fn predict(&self, x: &X) -> Result<Y, Failed> {
let y_classes = self.distribution.classes();
if y_classes.is_empty() {
return Err(Failed::predict("Failed to predict, no classes available"));
}
let (rows, _) = x.shape();
let mut predictions = Vec::with_capacity(rows);
let mut all_probs_nan = true;
for row_index in 0..rows {
let row = x.get_row(row_index);
let mut max_log_prob = f64::NEG_INFINITY;
let mut max_class = None;
for (class_index, class) in y_classes.iter().enumerate() {
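// Unnormalized log-posterior via Bayes' rule: ln P(y | x) ∝ ln P(x | y) + ln P(y).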
let log_likelihood = self.distribution.log_likelihood(class_index, &row);
let log_prob = log_likelihood + self.distribution.prior(class_index).ln();
if !log_prob.is_nan() && log_prob > max_log_prob {
max_log_prob = log_prob;
max_class = Some(*class);
all_probs_nan = false;
}
}
predictions.push(max_class.unwrap_or(y_classes[0]));
}
if all_probs_nan {
Err(Failed::predict(
"Failed to predict, all probabilities were NaN",
))
} else {
Ok(Y::from_vec_slice(&predictions))
}
let predictions = x
.row_iter()
.map(|row| {
y_classes
.iter()
.enumerate()
.map(|(class_index, class)| {
(
class,
self.distribution.log_likelihood(class_index, &row)
+ self.distribution.prior(class_index).ln(),
)
})
// `max_by` needs a total order, but `partial_cmp` returns `None` when either
// operand is NaN, and unwrapping that `None` would panic.
// Treating NaN as the minimum value means a NaN probability is never chosen
// as the maximum, so the comparison below cannot panic.
.max_by(|(_, p1), (_, p2)| match p1.partial_cmp(p2) {
Some(ordering) => ordering,
None => {
if p1.is_nan() {
Ordering::Less
} else if p2.is_nan() {
Ordering::Greater
} else {
Ordering::Equal
}
}
})
.map(|(prediction, _probability)| *prediction)
.ok_or_else(|| Failed::predict("Failed to predict, there is no result"))
})
.collect::<Result<Vec<TY>, Failed>>()?;
let y_hat = Y::from_vec_slice(&predictions);
Ok(y_hat)
}
}
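A minimal standalone sketch of the NaN-as-minimum comparison used above (illustrative values, not smartcore API):

```rust
use std::cmp::Ordering;

fn main() {
    let log_probs = [0.3_f64, f64::NAN, 0.9];
    // `partial_cmp` returns `None` when either operand is NaN; unwrapping it
    // inside `max_by` would panic. Mapping NaN to `Ordering::Less` makes it
    // lose every comparison, so it can never be selected as the maximum.
    let best = log_probs
        .iter()
        .enumerate()
        .max_by(|(_, a), (_, b)| match a.partial_cmp(b) {
            Some(ordering) => ordering,
            None if a.is_nan() => Ordering::Less,
            None => Ordering::Greater,
        })
        .map(|(index, _)| index);
    assert_eq!(best, Some(2)); // index of 0.9, the largest non-NaN value
}
```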
pub mod bernoulli;
@@ -177,7 +177,7 @@ mod tests {
Ok(_) => panic!("Should return error in case of empty classes"),
Err(err) => assert_eq!(
err.to_string(),
"Predict failed: Failed to predict, no classes available"
"Predict failed: Failed to predict, there is no result"
),
}
@@ -193,441 +193,4 @@ mod tests {
Err(_) => panic!("Should succeed in the normal case without NaNs"),
}
}
// A simple test distribution using float
#[derive(Debug, PartialEq, Clone)]
struct TestDistributionAgain {
classes: Vec<u32>,
probs: Vec<f64>,
}
impl NBDistribution<f64, u32> for TestDistributionAgain {
fn classes(&self) -> &Vec<u32> {
&self.classes
}
fn prior(&self, class_index: usize) -> f64 {
self.probs[class_index]
}
fn log_likelihood<'a>(
&'a self,
class_index: usize,
_j: &'a Box<dyn ArrayView1<f64> + 'a>,
) -> f64 {
self.probs[class_index].ln()
}
}
type TestNB = BaseNaiveBayes<f64, u32, DenseMatrix<f64>, Vec<u32>, TestDistributionAgain>;
#[test]
fn test_predict_empty_classes() {
let dist = TestDistributionAgain {
classes: vec![],
probs: vec![],
};
let nb = TestNB::fit(dist).unwrap();
let x = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
assert!(nb.predict(&x).is_err());
}
#[test]
fn test_predict_single_class() {
let dist = TestDistributionAgain {
classes: vec![1],
probs: vec![1.0],
};
let nb = TestNB::fit(dist).unwrap();
let x = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
let result = nb.predict(&x).unwrap();
assert_eq!(result, vec![1, 1]);
}
#[test]
fn test_predict_multiple_classes() {
let dist = TestDistributionAgain {
classes: vec![1, 2, 3],
probs: vec![0.2, 0.5, 0.3],
};
let nb = TestNB::fit(dist).unwrap();
let x = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0], &[5.0, 6.0]]).unwrap();
let result = nb.predict(&x).unwrap();
assert_eq!(result, vec![2, 2, 2]);
}
#[test]
fn test_predict_with_nans() {
let dist = TestDistributionAgain {
classes: vec![1, 2],
probs: vec![f64::NAN, 0.5],
};
let nb = TestNB::fit(dist).unwrap();
let x = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
let result = nb.predict(&x).unwrap();
assert_eq!(result, vec![2, 2]);
}
#[test]
fn test_predict_all_nans() {
let dist = TestDistributionAgain {
classes: vec![1, 2],
probs: vec![f64::NAN, f64::NAN],
};
let nb = TestNB::fit(dist).unwrap();
let x = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
assert!(nb.predict(&x).is_err());
}
#[test]
fn test_predict_extreme_probabilities() {
let dist = TestDistributionAgain {
classes: vec![1, 2],
probs: vec![1e-300, 1e-301],
};
let nb = TestNB::fit(dist).unwrap();
let x = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
let result = nb.predict(&x).unwrap();
assert_eq!(result, vec![1, 1]);
}
#[test]
fn test_predict_with_infinity() {
let dist = TestDistributionAgain {
classes: vec![1, 2, 3],
probs: vec![f64::INFINITY, 1.0, 2.0],
};
let nb = TestNB::fit(dist).unwrap();
let x = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
let result = nb.predict(&x).unwrap();
assert_eq!(result, vec![1, 1]);
}
#[test]
fn test_predict_with_negative_infinity() {
let dist = TestDistributionAgain {
classes: vec![1, 2, 3],
probs: vec![f64::NEG_INFINITY, 1.0, 2.0],
};
let nb = TestNB::fit(dist).unwrap();
let x = DenseMatrix::from_2d_array(&[&[1.0, 2.0], &[3.0, 4.0]]).unwrap();
let result = nb.predict(&x).unwrap();
assert_eq!(result, vec![3, 3]);
}
#[test]
fn test_gaussian_naive_bayes_numerical_stability() {
#[derive(Debug, PartialEq, Clone)]
struct GaussianTestDistribution {
classes: Vec<u32>,
means: Vec<Vec<f64>>,
variances: Vec<Vec<f64>>,
priors: Vec<f64>,
}
impl NBDistribution<f64, u32> for GaussianTestDistribution {
fn classes(&self) -> &Vec<u32> {
&self.classes
}
fn prior(&self, class_index: usize) -> f64 {
self.priors[class_index]
}
fn log_likelihood<'a>(
&'a self,
class_index: usize,
j: &'a Box<dyn ArrayView1<f64> + 'a>,
) -> f64 {
let means = &self.means[class_index];
let variances = &self.variances[class_index];
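// Per-feature Gaussian log-density, summed under the naive independence assumption:
// ln N(x_i; mean_i, var_i) = -0.5 * ln(2*pi*var_i) - (x_i - mean_i)^2 / (2*var_i)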
j.iterator(0)
.enumerate()
.map(|(i, &xi)| {
let mean = means[i];
let var = variances[i] + 1e-9; // Small smoothing for numerical stability
let coeff = -0.5 * (2.0 * std::f64::consts::PI * var).ln();
let exponent = -(xi - mean).powi(2) / (2.0 * var);
coeff + exponent
})
.sum()
}
}
fn train_distribution(x: &DenseMatrix<f64>, y: &[u32]) -> GaussianTestDistribution {
let mut classes: Vec<u32> = y
.iter()
.cloned()
.collect::<std::collections::HashSet<u32>>()
.into_iter()
.collect();
classes.sort();
let n_classes = classes.len();
let n_features = x.shape().1;
let mut means = vec![vec![0.0; n_features]; n_classes];
let mut variances = vec![vec![0.0; n_features]; n_classes];
let mut class_counts = vec![0; n_classes];
// Calculate means and count samples per class
for (sample, &class) in x.row_iter().zip(y.iter()) {
let class_idx = classes.iter().position(|&c| c == class).unwrap();
class_counts[class_idx] += 1;
for (i, &value) in sample.iterator(0).enumerate() {
means[class_idx][i] += value;
}
}
// Normalize means
for (class_idx, mean) in means.iter_mut().enumerate() {
for value in mean.iter_mut() {
*value /= class_counts[class_idx] as f64;
}
}
// Calculate variances
for (sample, &class) in x.row_iter().zip(y.iter()) {
let class_idx = classes.iter().position(|&c| c == class).unwrap();
for (i, &value) in sample.iterator(0).enumerate() {
let diff = value - means[class_idx][i];
variances[class_idx][i] += diff * diff;
}
}
// Normalize variances and add small epsilon to avoid zero variance
let epsilon = 1e-9;
for (class_idx, variance) in variances.iter_mut().enumerate() {
for value in variance.iter_mut() {
*value = *value / class_counts[class_idx] as f64 + epsilon;
}
}
// Calculate priors
let total_samples = y.len() as f64;
let priors: Vec<f64> = class_counts
.iter()
.map(|&count| count as f64 / total_samples)
.collect();
GaussianTestDistribution {
classes,
means,
variances,
priors,
}
}
type TestNBGaussian =
BaseNaiveBayes<f64, u32, DenseMatrix<f64>, Vec<u32>, GaussianTestDistribution>;
// Create a constant training dataset
let n_samples = 1000;
let n_features = 5;
let n_classes = 4;
let mut x_data = Vec::with_capacity(n_samples * n_features);
let mut y_data = Vec::with_capacity(n_samples);
for i in 0..n_samples {
for j in 0..n_features {
x_data.push((i * j) as f64 % 10.0);
}
y_data.push((i % n_classes) as u32);
}
let x = DenseMatrix::new(n_samples, n_features, x_data, true).unwrap();
let y = y_data;
// Train the model
let dist = train_distribution(&x, &y);
let nb = TestNBGaussian::fit(dist).unwrap();
// Create constant test data
let n_test_samples = 100;
let mut test_x_data = Vec::with_capacity(n_test_samples * n_features);
for i in 0..n_test_samples {
for j in 0..n_features {
test_x_data.push((i * j * 2) as f64 % 15.0);
}
}
let test_x = DenseMatrix::new(n_test_samples, n_features, test_x_data, true).unwrap();
// Make predictions
let predictions = nb
.predict(&test_x)
.map_err(|e| format!("Prediction failed: {}", e))
.unwrap();
// Check numerical stability
assert_eq!(
predictions.len(),
n_test_samples,
"Number of predictions should match number of test samples"
);
// Check that all predictions are valid class labels
for &pred in predictions.iter() {
assert!(pred < n_classes as u32, "Predicted class should be valid");
}
// Check consistency of predictions
let repeated_predictions = nb
.predict(&test_x)
.map_err(|e| format!("Repeated prediction failed: {}", e))
.unwrap();
assert_eq!(
predictions, repeated_predictions,
"Predictions should be consistent when repeated"
);
// Check extreme values
let extreme_x =
DenseMatrix::new(2, n_features, vec![f64::MAX; n_features * 2], true).unwrap();
let extreme_predictions = nb.predict(&extreme_x);
assert!(
extreme_predictions.is_err(),
"Extreme value input should result in an error"
);
assert_eq!(
extreme_predictions.unwrap_err().to_string(),
"Predict failed: Failed to predict, all probabilities were NaN",
"Incorrect error message for extreme values"
);
// Check for NaN handling
let nan_x = DenseMatrix::new(2, n_features, vec![f64::NAN; n_features * 2], true).unwrap();
let nan_predictions = nb.predict(&nan_x);
assert!(
nan_predictions.is_err(),
"NaN input should result in an error"
);
// Check for very small values
let small_x =
DenseMatrix::new(2, n_features, vec![f64::MIN_POSITIVE; n_features * 2], true).unwrap();
let small_predictions = nb
.predict(&small_x)
.map_err(|e| format!("Small value prediction failed: {}", e))
.unwrap();
for &pred in small_predictions.iter() {
assert!(
pred < n_classes as u32,
"Predictions for very small values should be valid"
);
}
// Check for values close to zero
let near_zero_x =
DenseMatrix::new(2, n_features, vec![1e-300; n_features * 2], true).unwrap();
let near_zero_predictions = nb
.predict(&near_zero_x)
.map_err(|e| format!("Near-zero value prediction failed: {}", e))
.unwrap();
for &pred in near_zero_predictions.iter() {
assert!(
pred < n_classes as u32,
"Predictions for near-zero values should be valid"
);
}
println!("All numerical stability checks passed!");
}
#[test]
fn test_gaussian_naive_bayes_numerical_stability_random_data() {
#[derive(Debug)]
struct MySimpleRng {
state: u64,
}
impl MySimpleRng {
fn new(seed: u64) -> Self {
MySimpleRng { state: seed }
}
/// Get the next u64 in the sequence.
fn next_u64(&mut self) -> u64 {
// LCG parameters; these are somewhat arbitrary but commonly used.
// Feel free to tweak the multiplier/adder etc.
self.state = self.state.wrapping_mul(6364136223846793005).wrapping_add(1);
self.state
}
/// Get an f64 in the range [min, max).
fn next_f64(&mut self, min: f64, max: f64) -> f64 {
let fraction = (self.next_u64() as f64) / (u64::MAX as f64);
min + fraction * (max - min)
}
/// Get a usize in the range [min, max). This floors the floating result.
fn gen_range_usize(&mut self, min: usize, max: usize) -> usize {
let v = self.next_f64(min as f64, max as f64);
// Truncate into the integer range. Because of floating inexactness,
// ensure we also clamp.
let int_v = v.floor() as isize;
// simple clamp to avoid any float rounding out of range
let clamped = int_v.max(min as isize).min((max - 1) as isize);
clamped as usize
}
}
use crate::naive_bayes::gaussian::GaussianNB;
// We will generate random data in a reproducible way (using a fixed seed).
// We will generate random data in a reproducible way:
let mut rng = MySimpleRng::new(42);
let n_samples = 1000;
let n_features = 5;
let n_classes = 4;
// Our feature matrix and label vector
let mut x_data = Vec::with_capacity(n_samples * n_features);
let mut y_data = Vec::with_capacity(n_samples);
// Fill x_data with random values and y_data with random class labels.
for _i in 0..n_samples {
for _j in 0..n_features {
// We'll pick random values in [-10, 10).
x_data.push(rng.next_f64(-10.0, 10.0));
}
let class = rng.gen_range_usize(0, n_classes) as u32;
y_data.push(class);
}
// Create DenseMatrix from x_data
let x = DenseMatrix::new(n_samples, n_features, x_data, true).unwrap();
// Train GaussianNB
let gnb = GaussianNB::fit(&x, &y_data, Default::default())
.expect("Fitting GaussianNB with random data failed.");
// Predict on the same training data to verify no numerical instability
let predictions = gnb.predict(&x).expect("Prediction on random data failed.");
// Basic sanity checks
assert_eq!(
predictions.len(),
n_samples,
"Prediction size must match n_samples"
);
for &pred_class in &predictions {
assert!(
(pred_class as usize) < n_classes,
"Predicted class {} is out of range [0..n_classes).",
pred_class
);
}
// If you want to compare with scikit-learn, you can do something like:
// println!("X = {:?}", &x);
// println!("Y = {:?}", &y_data);
// println!("predictions = {:?}", &predictions);
// and then in Python:
// import numpy as np
// from sklearn.naive_bayes import GaussianNB
// X = np.reshape(np.array(x), (1000, 5), order='F')
// Y = np.array(y)
// gnb = GaussianNB().fit(X, Y)
// preds = gnb.predict(X)
// expected = np.array(predictions)
// assert expected == preds
// They should match closely (or exactly) depending on floating rounding.
}
}
src/naive_bayes/multinomial.rs · +2 -1
@@ -207,7 +207,8 @@ impl<TY: Number + Ord + Unsigned> MultinomialNBDistribution<TY> {
/// Fits the distribution to a NxM matrix where N is number of samples and M is number of features.
/// * `x` - training data.
/// * `y` - vector with target values (classes) of length N.
/// * `priors` - Optional vector with prior probabilities of the classes. If not defined, priors are adjusted according to the data.
/// * `priors` - Optional vector with prior probabilities of the classes. If not defined,
/// priors are adjusted according to the data.
/// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter.
pub fn fit<TX: Number + Unsigned, X: Array2<TX>, Y: Array1<TY>>(
x: &X,
src/preprocessing/categorical.rs · +7 -3
@@ -24,7 +24,7 @@
//! // &[1.5, 1.0, 0.0, 1.5, 0.0, 0.0, 1.0, 0.0]
//! // &[1.5, 0.0, 1.0, 1.5, 0.0, 0.0, 0.0, 1.0]
//! ```
use std::iter::repeat_n;
use std::iter;
use crate::error::Failed;
use crate::linalg::basic::arrays::Array2;
@@ -75,7 +75,11 @@ fn find_new_idxs(num_params: usize, cat_sizes: &[usize], cat_idxs: &[usize]) ->
let offset = (0..1).chain(offset_);
let new_param_idxs: Vec<usize> = (0..num_params)
.zip(repeats.zip(offset).flat_map(|(r, o)| repeat_n(o, r)))
.zip(
repeats
.zip(offset)
.flat_map(|(r, o)| iter::repeat(o).take(r)),
)
.map(|(idx, ofst)| idx + ofst)
.collect();
new_param_idxs
@@ -120,7 +124,7 @@ impl OneHotEncoder {
let (nrows, _) = data.shape();
// col buffer to avoid allocations
let mut col_buf: Vec<T> = repeat_n(T::zero(), nrows).collect();
let mut col_buf: Vec<T> = iter::repeat(T::zero()).take(nrows).collect();
let mut res: Vec<CategoryMapper<CategoricalFloat>> = Vec::with_capacity(idxs.len());
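The iterator swap above replaces `std::iter::repeat_n`, which was only stabilized in Rust 1.82, with the equivalent `iter::repeat(..).take(..)`, presumably so the crate still builds on the 1.81 toolchain pinned in ci.yml. A minimal sketch of the equivalence:

```rust
use std::iter;

fn main() {
    // `iter::repeat(x).take(n)` yields `x` exactly `n` times: the same
    // sequence that `iter::repeat_n(x, n)` produces on Rust 1.82+.
    let ones: Vec<usize> = iter::repeat(1).take(3).collect();
    assert_eq!(ones, vec![1, 1, 1]);
}
```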