# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---
# # Chaining a PCA and a logistic regression
#
# The PCA performs an unsupervised dimensionality reduction, while the
# logistic regression does the prediction.
#
# We use GridSearchCV to set the dimensionality of the PCA.
# +
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

# Plot the PCA spectrum
pca.fit(X_digits)

plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

# Prediction
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)

# Parameters of pipelines can be set using '__' separated parameter names:
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))
estimator.fit(X_digits, y_digits)

plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()
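# `best_params_` and `best_score_` are standard `GridSearchCV` attributes; as
# a quick sanity check (an addition, not part of the original example) we can
# print what the search actually selected:
# +
# Report the winning (pca__n_components, logistic__C) pair and its mean
# cross-validated accuracy.
print("best parameters:", estimator.best_params_)
print("best CV accuracy: %f" % estimator.best_score_)
# -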
# +
# Compare the performance
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn import decomposition
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# load the digits data
digits, dig_label = load_digits(return_X_y=True)
print(digits.shape)

# draw one digit
plt.gray()
plt.matshow(digits[0].reshape([8, 8]))
plt.show()
# +
# compute the train/test split sizes
N = len(digits)
N_train = int(N*0.8)
N_test = N - N_train

# split the train/test data (a simple sequential 80/20 split)
x_train = digits[:N_train, :]
y_train = dig_label[:N_train]
x_test = digits[N_train:, :]
y_test = dig_label[N_train:]
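# Note: this sequential split does not shuffle the samples. An equivalent
# alternative (not used in the original) is scikit-learn's train_test_split,
# which shuffles by default:
#   from sklearn.model_selection import train_test_split
#   x_train, x_test, y_train, y_test = train_test_split(
#       digits, dig_label, test_size=0.2, random_state=0)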
# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)

pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)

# compute the train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy test = %f" % (acc_train, acc_test))
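# `confusion_matrix` is imported above but never used; it shows which digits
# are mistaken for which (rows are true labels, columns are predictions):
# +
cm = confusion_matrix(y_test, pred_test)
print(cm)
# -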
# +
# do PCA with n_components=40
pca = decomposition.PCA(n_components=40)
pca.fit(x_train)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

# do logistic regression on the PCA-reduced features
lr = LogisticRegression()
lr.fit(x_train_pca, y_train)

pred_train = lr.predict(x_train_pca)
pred_test = lr.predict(x_test_pca)

# compute the train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy test = %f" % (acc_train, acc_test))
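# How much variance do the 40 components keep? `explained_variance_ratio_` is
# a standard PCA attribute; its sum is the retained fraction (a small check,
# not in the original):
# +
print("variance retained by 40 components: %f"
      % pca.explained_variance_ratio_.sum())
# -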
# +
# do kernel PCA
# Ref: http://scikit-learn.org/stable/auto_examples/decomposition/plot_kernel_pca.html
from sklearn.decomposition import PCA, KernelPCA

kpca = KernelPCA(n_components=45, kernel="rbf", fit_inverse_transform=True, gamma=10)
kpca.fit(x_train)
x_train_pca = kpca.transform(x_train)
x_test_pca = kpca.transform(x_test)

# do logistic regression on the kernel-PCA features
lr = LogisticRegression()
lr.fit(x_train_pca, y_train)

pred_train = lr.predict(x_train_pca)
pred_test = lr.predict(x_test_pca)

# compute the train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy test = %f" % (acc_train, acc_test))
# -
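# The RBF kernel's gamma strongly affects KernelPCA, and gamma=10 above is not
# tuned. A minimal sketch (an addition, not part of the original) that reuses
# the Pipeline/GridSearchCV pattern from the first section to search gamma:
# +
kpca_pipe = Pipeline(steps=[('kpca', KernelPCA(n_components=45, kernel="rbf")),
                            ('logistic', LogisticRegression())])
# grid over a few orders of magnitude; these gamma values are assumptions,
# not tuned results
param_grid = dict(kpca__gamma=[0.001, 0.01, 0.1, 1.0, 10.0])
search = GridSearchCV(kpca_pipe, param_grid)
search.fit(x_train, y_train)
print("best gamma:", search.best_params_['kpca__gamma'])
print("test accuracy = %f" % search.score(x_test, y_test))
# -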
# ## References
# * [Pipelining: chaining a PCA and a logistic regression](http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html)
# * [Unsupervised dimensionality reduction with PCA (in Chinese)](https://ljalphabeta.gitbooks.io/python-/content/pca.html)

Machine learning is increasingly applied to aircraft, robotics, and related fields, with the goal of using computers to achieve human-like intelligence and thereby make equipment intelligent and autonomous. This course guides students through the fundamentals, typical methods, and techniques of machine learning, uses concrete application cases to spark interest in the discipline, and encourages students to analyze and solve the problems and challenges faced by aircraft and robots from the perspective of artificial intelligence. The main content covers Python programming basics, machine learning models, the fundamentals and implementation of unsupervised learning, supervised learning, and deep learning, and how to apply machine learning to real problems, thereby comprehensively improving students' overall abilities.