diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs
index 201918e..021be83 100644
--- a/src/cluster/kmeans.rs
+++ b/src/cluster/kmeans.rs
@@ -49,7 +49,7 @@
//!
//! ## References:
//!
-//! * ["An Introduction to Statistical Learning", James et al., 10.3.1 K-Means Clustering](http://faculty.marshall.usc.edu/gareth-james/ISL/)
+//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 10.3.1 K-Means Clustering](http://faculty.marshall.usc.edu/gareth-james/ISL/)
//! * ["k-means++: The Advantages of Careful Seeding", Arthur D., Vassilvitskii S.](http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf)
extern crate rand;
diff --git a/src/decomposition/mod.rs b/src/decomposition/mod.rs
index a6bac38..bdbd3e9 100644
--- a/src/decomposition/mod.rs
+++ b/src/decomposition/mod.rs
@@ -1 +1,15 @@
+//! # Dimension Reduction Methods
+//! Dimension reduction is a popular approach for deriving a low-dimensional set of features from a large set of variables.
+//!
+//! High-dimensional data (a large number of input features) often degrades the performance of machine learning algorithms due to the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality).
+//! The more dimensions a data set has, the more difficult it becomes to predict certain quantities. While it may seem that more explanatory variables are always better,
+//! the opposite is often true when it comes to adding variables: each added variable can result in an exponential decrease in predictive power.
+//! Therefore, it is often desirable to reduce the number of input features.
+//!
+//! Dimension reduction is also used for the purposes of data visualization.
+//!
+//! ## References
+//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 6.3 Dimension Reduction Methods](http://faculty.marshall.usc.edu/gareth-james/ISL/)
+
+/// PCA is a popular approach for deriving a low-dimensional set of features from a large set of variables.
pub mod pca;
diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs
index e1f7027..b565ce4 100644
--- a/src/decomposition/pca.rs
+++ b/src/decomposition/pca.rs
@@ -1,3 +1,49 @@
+//! # PCA
+//!
+//! Principal components analysis (PCA) is a method that is used to select several linear combinations that capture most of the variation in your data.
+//! PCA is an unsupervised approach, since it involves only a set of features \\(X_1, X_2, \\ldots, X_n\\), and no associated response \\(Y\\).
+//! Apart from producing derived variables for use in supervised learning problems, PCA also serves as a tool for data visualization.
+//!
+//! PCA is scale-sensitive. Before PCA is performed, the variables should be centered to have mean zero.
+//! Furthermore, the results obtained also depend on whether the variables have been individually scaled.
+//! Use `use_correlation_matrix` parameter to standardize your variables (to mean 0 and standard deviation 1).
+//!
+//! Example:
+//! ```
+//! use smartcore::linalg::naive::dense_matrix::*;
+//! use smartcore::decomposition::pca::*;
+//!
+//! // Iris data
+//! let iris = DenseMatrix::from_array(&[
+//! &[5.1, 3.5, 1.4, 0.2],
+//! &[4.9, 3.0, 1.4, 0.2],
+//! &[4.7, 3.2, 1.3, 0.2],
+//! &[4.6, 3.1, 1.5, 0.2],
+//! &[5.0, 3.6, 1.4, 0.2],
+//! &[5.4, 3.9, 1.7, 0.4],
+//! &[4.6, 3.4, 1.4, 0.3],
+//! &[5.0, 3.4, 1.5, 0.2],
+//! &[4.4, 2.9, 1.4, 0.2],
+//! &[4.9, 3.1, 1.5, 0.1],
+//! &[7.0, 3.2, 4.7, 1.4],
+//! &[6.4, 3.2, 4.5, 1.5],
+//! &[6.9, 3.1, 4.9, 1.5],
+//! &[5.5, 2.3, 4.0, 1.3],
+//! &[6.5, 2.8, 4.6, 1.5],
+//! &[5.7, 2.8, 4.5, 1.3],
+//! &[6.3, 3.3, 4.7, 1.6],
+//! &[4.9, 2.4, 3.3, 1.0],
+//! &[6.6, 2.9, 4.6, 1.3],
+//! &[5.2, 2.7, 3.9, 1.4],
+//! ]);
+//!
+//! let pca = PCA::new(&iris, 2, Default::default()); // Reduce number of features to 2
+//!
+//! let iris_reduced = pca.transform(&iris);
+//!
+//! ```
+//!
+//!
use std::fmt::Debug;
use serde::{Deserialize, Serialize};
@@ -5,6 +51,7 @@ use serde::{Deserialize, Serialize};
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
+/// Principal components analysis algorithm
#[derive(Serialize, Deserialize, Debug)]
pub struct PCA<T: RealNumber, M: Matrix<T>> {
eigenvectors: M,
@@ -32,8 +79,11 @@ impl<T: RealNumber, M: Matrix<T>> PartialEq for PCA<T, M> {
}
#[derive(Debug, Clone)]
+/// PCA parameters
pub struct PCAParameters {
- use_correlation_matrix: bool,
+ /// By default, covariance matrix is used to compute principal components.
+ /// Enable this flag if you want to use correlation matrix instead.
+ pub use_correlation_matrix: bool,
}
impl Default for PCAParameters {
@@ -45,6 +95,10 @@ impl Default for PCAParameters {
}
impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
+ /// Fits PCA to your data.
+ /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
+ /// * `n_components` - number of components to keep.
+ /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
pub fn new(data: &M, n_components: usize, parameters: PCAParameters) -> PCA<T, M> {
let (m, n) = data.shape();
@@ -143,6 +197,8 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
}
}
+ /// Run dimensionality reduction for `x`
+ /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features.
pub fn transform(&self, x: &M) -> M {
let (nrows, ncols) = x.shape();
let (_, n_components) = self.projection.shape();