|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Scikit-learn"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Scikit-learn contains simple and efficient tools for data mining and data analysis. It implements a wide variety of machine learning algorithms and processes to conduct advanced analytics.\n",
- "\n",
- "Library documentation: [http://scikit-learn.org/stable/](http://scikit-learn.org/stable/)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### General"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "from sklearn import datasets\n",
- "from sklearn import svm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[[ 0. 0. 5. ..., 0. 0. 0.]\n",
- " [ 0. 0. 0. ..., 10. 0. 0.]\n",
- " [ 0. 0. 0. ..., 16. 9. 0.]\n",
- " ..., \n",
- " [ 0. 0. 1. ..., 6. 0. 0.]\n",
- " [ 0. 0. 2. ..., 12. 0. 0.]\n",
- " [ 0. 0. 10. ..., 12. 1. 0.]]\n"
- ]
- }
- ],
- "source": [
- "# import a sample dataset and view the data\n",
- "digits = datasets.load_digits()\n",
- "print(digits.data)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([0, 1, 2, ..., 8, 9, 8])"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# view the target variable\n",
- "digits.target"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,\n",
- " gamma=0.001, kernel='rbf', max_iter=-1, probability=False,\n",
- " random_state=None, shrinking=True, tol=0.001, verbose=False)"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# train a support vector machine using everything but the last example \n",
- "classifier = svm.SVC(gamma=0.001, C=100.)\n",
- "classifier.fit(digits.data[:-1], digits.target[:-1])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([8])"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# predict the target of the last example\n",
- "classifier.predict(digits.data[-1])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([8])"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# persist the model and reload\n",
- "import pickle\n",
- "from sklearn.externals import joblib\n",
- "joblib.dump(classifier, 'model.pkl')\n",
- "classifier2 = joblib.load('model.pkl')\n",
- "classifier2.predict(digits.data[-1])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import os\n",
- "os.remove('model.pkl')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.97999999999999998"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# another example with the digits data set\n",
- "svc = svm.SVC(C=1, kernel='linear')\n",
- "svc.fit(digits.data[:-100], digits.target[:-100]).score(digits.data[-100:], digits.target[-100:])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train: [2 3 4 5] | test: [0 1]\n",
- "Train: [0 1 4 5] | test: [2 3]\n",
- "Train: [0 1 2 3] | test: [4 5]\n"
- ]
- }
- ],
- "source": [
- "# generate k-fold train/test index splits for cross-validation\n",
- "from sklearn import cross_validation\n",
- "k_fold = cross_validation.KFold(n=6, n_folds=3)\n",
- "for train_indices, test_indices in k_fold:\n",
- " print('Train: %s | test: %s' % (train_indices, test_indices))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 0.93489149, 0.95659432, 0.93989983])"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# score the model with 3-fold cross-validation over the full digits data\n",
- "kfold = cross_validation.KFold(len(digits.data), n_folds=3)\n",
- "cross_validation.cross_val_score(svc, digits.data, digits.target, cv=kfold, n_jobs=-1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "GridSearchCV(cv=None,\n",
- " estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
- " kernel='linear', max_iter=-1, probability=False, random_state=None,\n",
- " shrinking=True, tol=0.001, verbose=False),\n",
- " fit_params={}, iid=True, loss_func=None, n_jobs=-1,\n",
- " param_grid={'gamma': array([ 1.00000e-06, 3.59381e-06, 1.29155e-05, 4.64159e-05,\n",
- " 1.66810e-04, 5.99484e-04, 2.15443e-03, 7.74264e-03,\n",
- " 2.78256e-02, 1.00000e-01])},\n",
- " pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n",
- " verbose=0)"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# use the grid search module to optimize model parameters\n",
- "from sklearn.grid_search import GridSearchCV\n",
- "gammas = np.logspace(-6, -1, 10)\n",
- "classifier = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas), n_jobs=-1)\n",
- "classifier.fit(digits.data[:1000], digits.target[:1000])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.92400000000000004"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "classifier.best_score_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "9.9999999999999995e-07"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "classifier.best_estimator_.gamma"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.94228356336260977"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# run against the test set\n",
- "classifier.score(digits.data[1000:], digits.target[1000:])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 0.93521595, 0.95826377, 0.93791946])"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# nested cross-validation example\n",
- "cross_validation.cross_val_score(classifier, digits.data, digits.target)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Other Classifiers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# import the iris dataset\n",
- "iris = datasets.load_iris()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
- " metric_params=None, n_neighbors=5, p=2, weights='uniform')"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# k nearest neighbors\n",
- "from sklearn.neighbors import KNeighborsClassifier\n",
- "knn = KNeighborsClassifier()\n",
- "knn.fit(iris.data, iris.target)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "DecisionTreeClassifier(compute_importances=None, criterion='gini',\n",
- " max_depth=None, max_features=None, max_leaf_nodes=None,\n",
- " min_density=None, min_samples_leaf=1, min_samples_split=2,\n",
- " random_state=None, splitter='best')"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# decision tree\n",
- "from sklearn.tree import DecisionTreeClassifier\n",
- "dtree = DecisionTreeClassifier()\n",
- "dtree.fit(iris.data, iris.target)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,\n",
- " fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',\n",
- " loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,\n",
- " random_state=None, shuffle=False, verbose=0, warm_start=False)"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# stochastic gradient descent\n",
- "from sklearn.linear_model import SGDClassifier\n",
- "sgd = SGDClassifier(loss=\"hinge\", penalty=\"l2\")\n",
- "sgd.fit(iris.data, iris.target)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Number of mislabeled points : 6\n"
- ]
- }
- ],
- "source": [
- "# naive bayes\n",
- "from sklearn.naive_bayes import GaussianNB\n",
- "gnb = GaussianNB()\n",
- "y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)\n",
- "print(\"Number of mislabeled points : %d\" % (iris.target != y_pred).sum())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Regression"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# load another sample dataset\n",
- "diabetes = datasets.load_diabetes()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# linear regression\n",
- "from sklearn import linear_model\n",
- "regr = linear_model.LinearRegression()\n",
- "regr.fit(diabetes.data, diabetes.target)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[ -10.01219782 -239.81908937 519.83978679 324.39042769 -792.18416163\n",
- " 476.74583782 101.04457032 177.06417623 751.27932109 67.62538639]\n"
- ]
- }
- ],
- "source": [
- "# regression coefficients\n",
- "print(regr.coef_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2859.6903987680657"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# mean squared error\n",
- "np.mean((regr.predict(diabetes.data)-diabetes.target)**2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.51774942541329338"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# coefficient of determination (R^2) on the training data\n",
- "regr.score(diabetes.data, diabetes.target)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,\n",
- " normalize=False, solver='auto', tol=0.001)"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# ridge regression\n",
- "regr = linear_model.Ridge(alpha=.1)\n",
- "regr.fit(diabetes.data, diabetes.target)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n",
- " normalize=False, positive=False, precompute='auto', tol=0.0001,\n",
- " warm_start=False)"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# lasso regression\n",
- "regr = linear_model.Lasso()\n",
- "regr.fit(diabetes.data, diabetes.target)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "LogisticRegression(C=100000.0, class_weight=None, dual=False,\n",
- " fit_intercept=True, intercept_scaling=1, penalty='l2',\n",
- " random_state=None, tol=0.0001)"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# logistic regression (this is actually a classifier)\n",
- "iris = datasets.load_iris()\n",
- "logistic = linear_model.LogisticRegression(C=1e5)\n",
- "logistic.fit(iris.data, iris.target)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Preprocessing"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# feature scaling\n",
- "from sklearn import preprocessing\n",
- "X = np.array([[ 1., -1., 2.],\n",
- " [ 2., 0., 0.],\n",
- " [ 0., 1., -1.]])\n",
- "X_scaled = preprocessing.scale(X)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "StandardScaler(copy=True, with_mean=True, with_std=True)"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# save the scaling transform to apply to new data later\n",
- "scaler = preprocessing.StandardScaler().fit(X)\n",
- "scaler"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 0. , -1.22474487, 1.33630621],\n",
- " [ 1.22474487, 0. , -0.26726124],\n",
- " [-1.22474487, 1.22474487, -1.06904497]])"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "scaler.transform(X)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 0.5 , 0. , 1. ],\n",
- " [ 1. , 0.5 , 0.33333333],\n",
- " [ 0. , 1. , 0. ]])"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# range scaling\n",
- "min_max_scaler = preprocessing.MinMaxScaler()\n",
- "X_minmax = min_max_scaler.fit_transform(X)\n",
- "X_minmax"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 0.40824829, -0.40824829, 0.81649658],\n",
- " [ 1. , 0. , 0. ],\n",
- " [ 0. , 0.70710678, -0.70710678]])"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# instance normalization using L2 norm\n",
- "X_normalized = preprocessing.normalize(X, norm='l2')\n",
- "X_normalized"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# category encoding\n",
- "enc = preprocessing.OneHotEncoder()\n",
- "enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])\n",
- "enc.transform([[0, 1, 3]]).toarray()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 1., 0., 1.],\n",
- " [ 1., 0., 0.],\n",
- " [ 0., 1., 0.]])"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# binarization (threshold each feature to 0 or 1)\n",
- "binarizer = preprocessing.Binarizer().fit(X)\n",
- "binarizer.transform(X)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Clustering"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,\n",
- " n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,\n",
- " verbose=0)"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# k means clustering\n",
- "from sklearn import cluster\n",
- "k_means = cluster.KMeans(n_clusters=3)\n",
- "k_means.fit(iris.data)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Decomposition"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# create a signal with 2 useful dimensions\n",
- "x1 = np.random.normal(size=100)\n",
- "x2 = np.random.normal(size=100)\n",
- "x3 = x1 + x2\n",
- "X = np.c_[x1, x2, x3]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "PCA(copy=True, n_components=None, whiten=False)"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# compute principal component analysis\n",
- "from sklearn import decomposition\n",
- "pca = decomposition.PCA()\n",
- "pca.fit(X)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 2.77625101e+00, 9.03048616e-01, 3.02456658e-31])"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pca.explained_variance_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(100L, 2L)"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# only the first 2 components are useful\n",
- "pca.n_components = 2\n",
- "X_reduced = pca.fit_transform(X)\n",
- "X_reduced.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# generate more sample data\n",
- "time = np.linspace(0, 10, 2000)\n",
- "s1 = np.sin(2 * time) # signal 1 : sinusoidal signal\n",
- "s2 = np.sign(np.sin(3 * time)) # signal 2 : square signal\n",
- "S = np.c_[s1, s2]\n",
- "S += 0.2 * np.random.normal(size=S.shape) # Add noise\n",
- "S /= S.std(axis=0) # standardize data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# mix data\n",
- "A = np.array([[1, 1], [0.5, 2]]) # mixing matrix\n",
- "X = np.dot(S, A.T) # generate observations"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# compute independent component analysis\n",
- "ica = decomposition.FastICA()\n",
- "S_ = ica.fit_transform(X) # get the estimated sources\n",
- "A_ = ica.mixing_.T\n",
- "np.allclose(X, np.dot(S_, A_) + ica.mean_)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
|