
Scikit-learn.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scikit-learn"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Scikit-learn contains simple and efficient tools for data mining and data analysis. It implements a wide variety of machine learning algorithms and processes to conduct advanced analytics.\n",
"\n",
"Library documentation: <a href=\"http://scikit-learn.org/stable/\">http://scikit-learn.org/stable/</a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### General"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn import datasets\n",
"from sklearn import svm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0. 0. 5. ..., 0. 0. 0.]\n",
" [ 0. 0. 0. ..., 10. 0. 0.]\n",
" [ 0. 0. 0. ..., 16. 9. 0.]\n",
" ..., \n",
" [ 0. 0. 1. ..., 6. 0. 0.]\n",
" [ 0. 0. 2. ..., 12. 0. 0.]\n",
" [ 0. 0. 10. ..., 12. 1. 0.]]\n"
]
}
],
"source": [
"# import a sample dataset and view the data\n",
"digits = datasets.load_digits()\n",
"print(digits.data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, ..., 8, 9, 8])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# view the target variable\n",
"digits.target"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,\n",
" gamma=0.001, kernel='rbf', max_iter=-1, probability=False,\n",
" random_state=None, shrinking=True, tol=0.001, verbose=False)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# train a support vector machine using everything but the last example\n",
"classifier = svm.SVC(gamma=0.001, C=100.)\n",
"classifier.fit(digits.data[:-1], digits.target[:-1])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([8])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# predict the target of the last example\n",
"classifier.predict(digits.data[-1])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([8])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# persist the model and reload\n",
"import pickle\n",
"from sklearn.externals import joblib\n",
"joblib.dump(classifier, 'model.pkl')\n",
"classifier2 = joblib.load('model.pkl')\n",
"classifier2.predict(digits.data[-1])"
]
},
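{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `pickle` module imported above can do the same round trip in memory. A minimal sketch (joblib is generally preferred for models holding large numpy arrays):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# serialize the trained model to a byte string and restore it\n",
"s = pickle.dumps(classifier)\n",
"classifier3 = pickle.loads(s)\n",
"classifier3.predict(digits.data[-1])"
]
},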
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"os.remove('model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.97999999999999998"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# another example with the digits data set\n",
"svc = svm.SVC(C=1, kernel='linear')\n",
"svc.fit(digits.data[:-100], digits.target[:-100]).score(digits.data[-100:], digits.target[-100:])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train: [2 3 4 5] | test: [0 1]\n",
"Train: [0 1 4 5] | test: [2 3]\n",
"Train: [0 1 2 3] | test: [4 5]\n"
]
}
],
"source": [
"# perform cross-validation on the estimator's predictions\n",
"from sklearn import cross_validation\n",
"k_fold = cross_validation.KFold(n=6, n_folds=3)\n",
"for train_indices, test_indices in k_fold:\n",
"    print('Train: %s | test: %s' % (train_indices, test_indices))"
]
},
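{
"cell_type": "markdown",
"metadata": {},
"source": [
"For a quick single holdout rather than explicit folds, the same module offers `train_test_split`. A minimal sketch (the 25% test size and fixed seed are arbitrary choices):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hold out a random 25% of the digits data and score on it\n",
"X_train, X_test, y_train, y_test = cross_validation.train_test_split(\n",
"    digits.data, digits.target, test_size=0.25, random_state=0)\n",
"svc.fit(X_train, y_train).score(X_test, y_test)"
]
},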
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.93489149, 0.95659432, 0.93989983])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# apply to the model\n",
"kfold = cross_validation.KFold(len(digits.data), n_folds=3)\n",
"cross_validation.cross_val_score(svc, digits.data, digits.target, cv=kfold, n_jobs=-1)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=None,\n",
" estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
" kernel='linear', max_iter=-1, probability=False, random_state=None,\n",
" shrinking=True, tol=0.001, verbose=False),\n",
" fit_params={}, iid=True, loss_func=None, n_jobs=-1,\n",
" param_grid={'gamma': array([ 1.00000e-06, 3.59381e-06, 1.29155e-05, 4.64159e-05,\n",
" 1.66810e-04, 5.99484e-04, 2.15443e-03, 7.74264e-03,\n",
" 2.78256e-02, 1.00000e-01])},\n",
" pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n",
" verbose=0)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# use the grid search module to optimize model parameters\n",
"from sklearn.grid_search import GridSearchCV\n",
"gammas = np.logspace(-6, -1, 10)\n",
"classifier = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas), n_jobs=-1)\n",
"classifier.fit(digits.data[:1000], digits.target[:1000])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.92400000000000004"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"9.9999999999999995e-07"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.best_estimator_.gamma"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.94228356336260977"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# run against the test set\n",
"classifier.score(digits.data[1000:], digits.target[1000:])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.93521595, 0.95826377, 0.93791946])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# nested cross-validation example\n",
"cross_validation.cross_val_score(classifier, digits.data, digits.target)"
]
},
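{
"cell_type": "markdown",
"metadata": {},
"source": [
"The winning grid point can also be read off directly from the fitted search object (a small sketch using the standard `best_params_` attribute):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# dictionary of the parameter setting that gave the best score\n",
"classifier.best_params_"
]
},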
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Other Classifiers"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# import the iris dataset\n",
"iris = datasets.load_iris()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_neighbors=5, p=2, weights='uniform')"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# k nearest neighbors\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"knn = KNeighborsClassifier()\n",
"knn.fit(iris.data, iris.target)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"DecisionTreeClassifier(compute_importances=None, criterion='gini',\n",
" max_depth=None, max_features=None, max_leaf_nodes=None,\n",
" min_density=None, min_samples_leaf=1, min_samples_split=2,\n",
" random_state=None, splitter='best')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# decision tree\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"dtree = DecisionTreeClassifier()\n",
"dtree.fit(iris.data, iris.target)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,\n",
" fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',\n",
" loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,\n",
" random_state=None, shuffle=False, verbose=0, warm_start=False)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# stochastic gradient descent\n",
"from sklearn.linear_model import SGDClassifier\n",
"sgd = SGDClassifier(loss=\"hinge\", penalty=\"l2\")\n",
"sgd.fit(iris.data, iris.target)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of mislabeled points : 6\n"
]
}
],
"source": [
"# naive bayes\n",
"from sklearn.naive_bayes import GaussianNB\n",
"gnb = GaussianNB()\n",
"y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)\n",
"print(\"Number of mislabeled points : %d\" % (iris.target != y_pred).sum())"
]
},
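{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough comparison of the classifiers above, each can be scored with cross-validation on the iris data. A minimal sketch (3-fold accuracy; results vary with the fold split):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# mean cross-validated accuracy for each classifier defined above\n",
"for name, model in [('knn', knn), ('tree', dtree), ('sgd', sgd), ('gnb', gnb)]:\n",
"    scores = cross_validation.cross_val_score(model, iris.data, iris.target, cv=3)\n",
"    print('%s: %.3f' % (name, scores.mean()))"
]
},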
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Regression"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# load another sample dataset\n",
"diabetes = datasets.load_diabetes()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# linear regression\n",
"from sklearn import linear_model\n",
"regr = linear_model.LinearRegression()\n",
"regr.fit(diabetes.data, diabetes.target)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ -10.01219782 -239.81908937 519.83978679 324.39042769 -792.18416163\n",
" 476.74583782 101.04457032 177.06417623 751.27932109 67.62538639]\n"
]
}
],
"source": [
"# regression coefficients\n",
"print(regr.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2859.6903987680657"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mean squared error\n",
"np.mean((regr.predict(diabetes.data)-diabetes.target)**2)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.51774942541329338"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# explained variance\n",
"regr.score(diabetes.data, diabetes.target)"
]
},
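{
"cell_type": "markdown",
"metadata": {},
"source": [
"The score above is computed on the training data, so it is optimistic. A minimal sketch holding out the last 20 examples instead (the split size is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# refit on all but the last 20 examples and score on the holdout\n",
"regr.fit(diabetes.data[:-20], diabetes.target[:-20])\n",
"regr.score(diabetes.data[-20:], diabetes.target[-20:])"
]
},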
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,\n",
" normalize=False, solver='auto', tol=0.001)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ridge regression\n",
"regr = linear_model.Ridge(alpha=.1)\n",
"regr.fit(diabetes.data, diabetes.target)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n",
" normalize=False, positive=False, precompute='auto', tol=0.0001,\n",
" warm_start=False)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# lasso regression\n",
"regr = linear_model.Lasso()\n",
"regr.fit(diabetes.data, diabetes.target)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"LogisticRegression(C=100000.0, class_weight=None, dual=False,\n",
" fit_intercept=True, intercept_scaling=1, penalty='l2',\n",
" random_state=None, tol=0.0001)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# logistic regression (this is actually a classifier)\n",
"iris = datasets.load_iris()\n",
"logistic = linear_model.LogisticRegression(C=1e5)\n",
"logistic.fit(iris.data, iris.target)"
]
},
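{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because logistic regression is probabilistic, per-class probabilities are available as well. A small sketch on the first example (with C this large the estimates will be close to 0 or 1):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# probability estimates for each of the three iris classes\n",
"logistic.predict_proba(iris.data[:1])"
]
},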
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# feature scaling\n",
"from sklearn import preprocessing\n",
"X = np.array([[ 1., -1., 2.],\n",
"              [ 2., 0., 0.],\n",
"              [ 0., 1., -1.]])\n",
"X_scaled = preprocessing.scale(X)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"StandardScaler(copy=True, with_mean=True, with_std=True)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# save the scaling transform to apply to new data later\n",
"scaler = preprocessing.StandardScaler().fit(X)\n",
"scaler"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0. , -1.22474487, 1.33630621],\n",
" [ 1.22474487, 0. , -0.26726124],\n",
" [-1.22474487, 1.22474487, -1.06904497]])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scaler.transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.5 , 0. , 1. ],\n",
" [ 1. , 0.5 , 0.33333333],\n",
" [ 0. , 1. , 0. ]])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# range scaling\n",
"min_max_scaler = preprocessing.MinMaxScaler()\n",
"X_minmax = min_max_scaler.fit_transform(X)\n",
"X_minmax"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.40824829, -0.40824829, 0.81649658],\n",
" [ 1. , 0. , 0. ],\n",
" [ 0. , 0.70710678, -0.70710678]])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# instance normalization using L2 norm\n",
"X_normalized = preprocessing.normalize(X, norm='l2')\n",
"X_normalized"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# category encoding\n",
"enc = preprocessing.OneHotEncoder()\n",
"enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])\n",
"enc.transform([[0, 1, 3]]).toarray()"
]
},
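{
"cell_type": "markdown",
"metadata": {},
"source": [
"For string-valued labels, `LabelEncoder` maps categories to integers first. A minimal sketch with made-up city labels:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# encode string categories as integers and map them back\n",
"le = preprocessing.LabelEncoder()\n",
"le.fit(['paris', 'tokyo', 'amsterdam'])\n",
"print(le.transform(['tokyo', 'paris']))\n",
"print(le.inverse_transform([0, 1, 2]))"
]
},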
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1., 0., 1.],\n",
" [ 1., 0., 0.],\n",
" [ 0., 1., 0.]])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# binarization (threshold features at zero)\n",
"binarizer = preprocessing.Binarizer().fit(X)\n",
"binarizer.transform(X)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clustering"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,\n",
" n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,\n",
" verbose=0)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# k means clustering\n",
"from sklearn import cluster\n",
"k_means = cluster.KMeans(n_clusters=3)\n",
"k_means.fit(iris.data)"
]
},
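{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cluster assignments live in `labels_` and can be eyeballed against the true species (a quick sketch; cluster numbering is arbitrary, so only the grouping pattern should match):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# compare every 10th cluster assignment with the true label\n",
"print(k_means.labels_[::10])\n",
"print(iris.target[::10])"
]
},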
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Decomposition"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# create a signal with 2 useful dimensions\n",
"x1 = np.random.normal(size=100)\n",
"x2 = np.random.normal(size=100)\n",
"x3 = x1 + x2\n",
"X = np.c_[x1, x2, x3]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"PCA(copy=True, n_components=None, whiten=False)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute principal component analysis\n",
"from sklearn import decomposition\n",
"pca = decomposition.PCA()\n",
"pca.fit(X)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 2.77625101e+00, 9.03048616e-01, 3.02456658e-31])"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pca.explained_variance_"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(100L, 2L)"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# only the first 2 components are useful\n",
"pca.n_components = 2\n",
"X_reduced = pca.fit_transform(X)\n",
"X_reduced.shape"
]
},
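{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check of how much variance the two retained components explain (a small sketch using the standard `explained_variance_ratio_` attribute):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# fraction of total variance carried by each kept component\n",
"pca.explained_variance_ratio_"
]
},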
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# generate more sample data\n",
"time = np.linspace(0, 10, 2000)\n",
"s1 = np.sin(2 * time) # signal 1 : sinusoidal signal\n",
"s2 = np.sign(np.sin(3 * time)) # signal 2 : square signal\n",
"S = np.c_[s1, s2]\n",
"S += 0.2 * np.random.normal(size=S.shape) # Add noise\n",
"S /= S.std(axis=0) # standardize data"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# mix data\n",
"A = np.array([[1, 1], [0.5, 2]]) # mixing matrix\n",
"X = np.dot(S, A.T) # generate observations"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute independent component analysis\n",
"ica = decomposition.FastICA()\n",
"S_ = ica.fit_transform(X) # get the estimated sources\n",
"A_ = ica.mixing_.T\n",
"np.allclose(X, np.dot(S_, A_) + ica.mean_)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Machine learning is increasingly applied in fields such as aircraft and robotics, with the aim of using computers to achieve human-like intelligence and thereby make equipment intelligent and autonomous. This course guides students in mastering the fundamentals, typical methods, and techniques of machine learning, sparks interest in the discipline through concrete application cases, and encourages students to analyze and solve the problems and challenges faced by aircraft and robots from an artificial-intelligence perspective. The main content covers Python programming basics; machine learning models; the fundamentals and implementation of unsupervised learning, supervised learning, and deep learning; and how to apply machine learning to practical problems, thereby strengthening students' overall competence.