# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---

# # Chaining a PCA and a logistic regression

# The PCA performs an unsupervised dimensionality reduction, while the
# logistic regression does the prediction.
#
# We use a GridSearchCV to choose the number of PCA components.

# +
# %matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

logistic = linear_model.LogisticRegression()

pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

# Plot the PCA spectrum
pca.fit(X_digits)

plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
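
# A common alternative for choosing n_components (a sketch, not part of
# the original example): keep the smallest number of components that
# retains, say, 95% of the variance, read off the cumulative
# explained-variance ratio.
cum_var = np.cumsum(pca.explained_variance_ratio_)
print("components for 95% variance:", np.searchsorted(cum_var, 0.95) + 1)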

# Prediction
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)

# Parameters of pipelines can be set using '__' separated parameter names:
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))
estimator.fit(X_digits, y_digits)

plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()
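
# Quick diagnostic (a sketch, not in the original example): inspect the
# hyper-parameters and cross-validation score the grid search selected.
print(estimator.best_params_)
print(estimator.best_score_)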

# +
# Compare the performance
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn import decomposition
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


# load the digits data
digits, dig_label = load_digits(return_X_y=True)
print(digits.shape)

# plot one digit
plt.gray()
plt.matshow(digits[0].reshape([8, 8]))
plt.show()


# +

# calculate train/test data number
N = len(digits)
N_train = int(N * 0.8)
N_test = N - N_train

# split train/test data
x_train = digits[:N_train, :]
y_train = dig_label[:N_train]
x_test = digits[N_train:, :]
y_test = dig_label[N_train:]
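
# Note: this is an ordered split with no shuffling. A shuffled split via
# scikit-learn would look like the sketch below (commented out so the
# ordered split above stays in effect):
#   from sklearn.model_selection import train_test_split
#   x_train, x_test, y_train, y_test = train_test_split(
#       digits, dig_label, test_size=0.2, random_state=0)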

# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)

pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy test = %f" % (acc_train, acc_test))


# +
# do PCA with n_components=40
pca = decomposition.PCA(n_components=40)
pca.fit(x_train)

x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)
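
# Diagnostic (a sketch, not in the original example): fraction of the
# training-set variance retained by the 40 components.
print("variance retained:", pca.explained_variance_ratio_.sum())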

# do logistic regression on the PCA-reduced features
lr = LogisticRegression()
lr.fit(x_train_pca, y_train)

pred_train = lr.predict(x_train_pca)
pred_test = lr.predict(x_test_pca)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy test = %f" % (acc_train, acc_test))

# -

# ## References
# * [Pipelining: chaining a PCA and a logistic regression](http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html)