diff --git a/benches/naive_bayes.rs b/benches/naive_bayes.rs index 2a4595b..ba8cb6f 100644 --- a/benches/naive_bayes.rs +++ b/benches/naive_bayes.rs @@ -6,7 +6,7 @@ use ndarray::Array2; use smartcore::linalg::naive::dense_matrix::DenseMatrix; use smartcore::linalg::BaseMatrix; use smartcore::linalg::BaseVector; -use smartcore::naive_bayes::GaussianNB; +use smartcore::naive_bayes::gaussian::GaussianNB; pub fn gaussian_naive_bayes_fit_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("GaussianNB::fit"); diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index 057b447..c478d58 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -1,3 +1,38 @@ +//! # Bernoulli Naive Bayes +//! +//! Bernoulli Naive Bayes classifier is a variant of [Naive Bayes](../index.html) for the data that is distributed according to multivariate Bernoulli distribution. +//! It is used for discrete data with binary features. One example of a binary feature is a word that occurs in the text or not. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::naive_bayes::bernoulli::BernoulliNB; +//! +//! // Training data points are: +//! // Chinese Beijing Chinese (class: China) +//! // Chinese Chinese Shanghai (class: China) +//! // Chinese Macao (class: China) +//! // Tokyo Japan Chinese (class: Japan) +//! let x = DenseMatrix::<f64>::from_2d_array(&[ +//! &[1., 1., 0., 0., 0., 0.], +//! &[0., 1., 0., 0., 1., 0.], +//! &[0., 1., 0., 1., 0., 0.], +//! &[0., 1., 1., 0., 0., 1.], +//! ]); +//! let y = vec![0., 0., 0., 1.]; +//! +//! let nb = BernoulliNB::fit(&x, &y, Default::default()).unwrap(); +//! +//! // Testing data point is: +//! // Chinese Chinese Chinese Tokyo Japan +//! let x_test = DenseMatrix::<f64>::from_2d_array(&[&[0., 1., 1., 0., 0., 1.]]); +//! let y_hat = nb.predict(&x_test).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["Introduction to Information Retrieval", Manning C. 
D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index d32c34d..d6b24a2 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -1,3 +1,35 @@ +//! # Categorical Naive Bayes +//! +//! Categorical Naive Bayes is a variant of [Naive Bayes](../index.html) for the categorically distributed data. +//! It assumes that each feature has its own categorical distribution. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::naive_bayes::categorical::CategoricalNB; +//! +//! let x = DenseMatrix::from_2d_array(&[ +//! &[3., 4., 0., 1.], +//! &[3., 0., 0., 1.], +//! &[4., 4., 1., 2.], +//! &[4., 2., 4., 3.], +//! &[4., 2., 4., 2.], +//! &[4., 1., 1., 0.], +//! &[1., 1., 1., 1.], +//! &[0., 4., 1., 0.], +//! &[0., 3., 2., 1.], +//! &[0., 3., 1., 1.], +//! &[3., 4., 0., 1.], +//! &[3., 4., 2., 4.], +//! &[0., 3., 1., 2.], +//! &[0., 4., 1., 2.], +//! ]); +//! let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]; +//! +//! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); +//! let y_hat = nb.predict(&x).unwrap(); +//! ``` use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index af5732d..fc11b49 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -1,3 +1,27 @@ +//! # Gaussian Naive Bayes +//! +//! Gaussian Naive Bayes is a variant of [Naive Bayes](../index.html) for the data that follows Gaussian distribution and +//! it supports continuous valued features conforming to a normal distribution. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! 
use smartcore::naive_bayes::gaussian::GaussianNB; +//! +//! let x = DenseMatrix::from_2d_array(&[ +//! &[-1., -1.], +//! &[-2., -1.], +//! &[-3., -2.], +//! &[ 1., 1.], +//! &[ 2., 1.], +//! &[ 3., 2.], +//! ]); +//! let y = vec![1., 1., 1., 2., 2., 2.]; +//! +//! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); +//! let y_hat = nb.predict(&x).unwrap(); +//! ``` use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/naive_bayes/mod.rs b/src/naive_bayes/mod.rs index 508b976..7ab8b85 100644 --- a/src/naive_bayes/mod.rs +++ b/src/naive_bayes/mod.rs @@ -1,3 +1,40 @@ +//! # Naive Bayes +//! +//! Naive Bayes (NB) is a simple but powerful machine learning algorithm. +//! Naive Bayes classifier is based on Bayes’ Theorem with an assumption of conditional independence +//! between every pair of features given the value of the class variable. +//! +//! Bayes’ theorem can be written as +//! +//! \\[ P(y | X) = \frac{P(y)P(X| y)}{P(X)} \\] +//! +//! where +//! +//! * \\(X = (x_1,...x_n)\\) represents the predictors. +//! * \\(P(y | X)\\) is the probability of class _y_ given the data X +//! * \\(P(X| y)\\) is the probability of data X given the class _y_. +//! * \\(P(y)\\) is the probability of class y. This is called the prior probability of y. +//! * \\(P(X)\\) is the probability of the data (regardless of the class value). +//! +//! The naive conditional independence assumption lets us rewrite this equation as +//! +//! \\[ P(y | x_1,...x_n) = \frac{P(y)\prod_{i=1}^nP(x_i|y)}{P(x_1,...x_n)} \\] +//! +//! +//! The denominator can be removed since \\(P(x_1,...x_n)\\) is constant for all the entries in the dataset. +//! +//! \\[ P(y | x_1,...x_n) \propto P(y)\prod_{i=1}^nP(x_i|y) \\] +//! +//! To find class y from predictors X we use this equation +//! +//! \\[ y = \underset{y}{argmax} P(y)\prod_{i=1}^nP(x_i|y) \\] +//! +//! ## References: +//! +//! * ["Machine Learning: A Probabilistic Perspective", Kevin P. 
Murphy, 2012, Chapter 3 ](https://mitpress.mit.edu/books/machine-learning-1) +//! +//! +//! use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -64,12 +101,7 @@ impl, D: NBDistribution> BaseNaiveBayes::from_2d_array(&[ +//! &[1., 2., 0., 0., 0., 0.], +//! &[0., 2., 0., 0., 1., 0.], +//! &[0., 1., 0., 1., 0., 0.], +//! &[0., 1., 1., 0., 0., 1.], +//! ]); +//! let y = vec![0., 0., 0., 1.]; +//! let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap(); +//! +//! // Testing data point is: +//! // Chinese Chinese Chinese Tokyo Japan +//! let x_test = DenseMatrix::<f64>::from_2d_array(&[&[0., 3., 1., 0., 0., 1.]]); +//! let y_hat = nb.predict(&x_test).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 4fd70df..9e166d5 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -28,7 +28,6 @@ //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; -//! use smartcore::linear::linear_regression::*; //! use smartcore::svm::Kernels; //! use smartcore::svm::svc::{SVC, SVCParameters}; //!