# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---
# # Logistic Regression
#
# The logistic regression (LR) model is just linear regression with a logistic function applied on top, but it is precisely this logistic function that has made logistic regression a shining star of machine learning and a core technique of computational advertising. This section covers the fundamentals of the logistic regression model.
#
#
# ## 1 The logistic regression model
# Regression is a fairly easy model to understand: it amounts to $y=f(x)$, expressing how the independent variable $x$ relates to the dependent variable $y$. A typical example is a doctor's diagnosis: observing, listening, questioning, and taking the pulse yield the independent variables $x$ (the feature data), while deciding whether and how the patient is ill corresponds to the dependent variable $y$ (the predicted class).
#
# The simplest regression is linear regression. Borrowing a figure from Andrew Ng's lecture notes, $X$ is the data point (tumor size) and $Y$ is the observation (whether the tumor is malignant). After fitting a linear regression model $h_\theta(x)$, we can predict malignancy from tumor size: $h_\theta(x) \ge 0.5$ means malignant, $h_\theta(x) \lt 0.5$ means benign.
#
# ![LinearRegression](images/fig1.gif)
#
# Linear regression, however, is not robust: if we fit a regression on the data set above, the noisy point on the far right makes the model perform poorly even on the training set. This is because linear regression is equally sensitive over the whole real line, while classification needs outputs confined to $[0,1]$.
#
# Logistic regression is a regression model that narrows the prediction range, confining predicted values to $(0,1)$; its regression equation and curve are shown below. The logistic curve is very sensitive near $z=0$ and insensitive for $z \gg 0$ or $z \ll 0$, which squeezes the predictions into $(0,1)$.
#
#
# +
# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

plt.figure()
plt.axis([-10, 10, 0, 1])
plt.grid(True)
X = np.arange(-10, 10, 0.1)
y = 1 / (1 + np.exp(-X))
plt.plot(X, y, 'b-')
plt.title("Logistic function")
plt.show()
# -
# ### The logistic regression expression
#
# The function above is called the logistic function, also known as the sigmoid function. Its formula is:
#
# $$
# g(z) = \frac{1}{1+e^{-z}}
# $$
#
# As $z$ approaches $+\infty$, $g(z)$ approaches 1; as $z$ approaches $-\infty$, $g(z)$ approaches 0. The graph of the logistic function is shown above. The derivative of the logistic function has a special property, which will be used in the derivation below:
# $$
# g'(z) = \frac{d}{dz} \frac{1}{1+e^{-z}} \\
# = \frac{e^{-z}}{(1+e^{-z})^2} \\
# = \frac{1}{1+e^{-z}} \left(1 - \frac{1}{1+e^{-z}}\right) \\
# = g(z)(1-g(z))
# $$
#
#
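# As a quick sanity check (our addition, not in the original), we can verify the identity $g'(z) = g(z)(1-g(z))$ numerically by comparing it against a central finite-difference approximation of the derivative:

# +
import numpy as np

def g(z):
    return 1 / (1 + np.exp(-z))

z = np.linspace(-5, 5, 11)
eps = 1e-6
numeric = (g(z + eps) - g(z - eps)) / (2 * eps)  # finite-difference derivative
analytic = g(z) * (1 - g(z))                     # the identity derived above
print(np.max(np.abs(numeric - analytic)))        # should be close to zero
# -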
# Logistic regression is at heart linear regression, with one extra function applied in the mapping from features to outcome: first take a linear combination of the features, then pass it through $g(z)$, which serves as the hypothesis function for prediction. $g(z)$ maps continuous values into $(0,1)$. Substituting the linear regression model into $g(z)$ yields the logistic regression expression:
#
# $$
# h_\theta(x) = g(\theta^T x) = \frac{1}{1+e^{-\theta^T x}}
# $$
# ### Soft classification with logistic regression
#
# We now take $h_\theta(x)$, normalized into $(0,1)$ by the logistic function, as the value of $y$. This value has a special meaning: it is the probability that the outcome is 1. For an input $x$, the probabilities that the classification result is class 1 or class 0 are therefore:
#
# $$
# P(y=1|x,\theta) = h_\theta(x) \\
# P(y=0|x,\theta) = 1 - h_\theta(x)
# $$
#
# Merging the two expressions into one:
#
# $$
# p(y|x,\theta) = (h_\theta(x))^y (1 - h_\theta(x))^{1-y}
# $$
#
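# A tiny numeric illustration (our addition, with an assumed value $h_\theta(x)=0.8$): the merged expression reproduces the two cases above, because the exponents select one factor and reduce the other to 1.

# +
h = 0.8                                 # suppose h_theta(x) = 0.8 for some x
for y in (0, 1):
    p = h**y * (1 - h)**(1 - y)         # merged expression p(y|x,theta)
    print("P(y=%d|x) = %.1f" % (y, p))  # 0.2 for y=0, 0.8 for y=1
# -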
#
# ### Gradient ascent
#
# With the logistic regression expression in hand, the next step parallels linear regression: construct the likelihood function, apply maximum likelihood estimation, and finally derive the iterative update rule for $\theta$. The only difference is that we use gradient ascent rather than gradient descent, because here we maximize the likelihood function.
#
# Assuming the training samples are mutually independent, the likelihood function is:
# ![Loss](images/eq_loss.png)
#
# Taking the log of the likelihood turns it into:
# ![LogLoss](images/eq_logloss.png)
#
# We then take the partial derivative of the log-likelihood with respect to $\theta$, using the case of a single training sample as an example:
# ![LogLossDiff](images/eq_logloss_diff.png)
#
# In this differentiation:
# * The first step rewrites the partial derivative using the rule: if $y=\ln x$ then $y'=1/x$.
# * The second step uses the derivative property $g'(z) = g(z)(1 - g(z))$.
# * The third step is ordinary algebraic manipulation.
#
# This gives the update direction for each gradient-ascent iteration, so the iterative update for $\theta$ is:
# $$
# \theta_j := \theta_j + \alpha(y^i - h_\theta(x^i)) x_j^i
# $$
#
#
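# To make the update rule concrete, here is a minimal batch gradient-ascent sketch (our own illustration, not part of the original program; the implementation below uses a stochastic variant instead):

# +
import numpy as np

def gradient_ascent(X, y, alpha=0.1, num_iter=1000):
    """Fit logistic regression weights by batch gradient ascent.

    Assumes X is (n_samples, n_features) with a leading column of
    ones acting as the bias term.
    """
    theta = np.zeros(X.shape[1])
    for _ in range(num_iter):
        h = 1 / (1 + np.exp(-X.dot(theta)))  # h_theta(x) for every sample
        theta += alpha * X.T.dot(y - h)      # theta_j += alpha * sum_i (y^i - h^i) x_j^i
    return theta
# -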
# ## Program

# +
# %matplotlib inline
from __future__ import division
import numpy as np
import sklearn.datasets
import matplotlib.pyplot as plt

np.random.seed(0)
# +
# load sample data
data, label = sklearn.datasets.make_moons(200, noise=0.30)

print("data = ", data[:10, :])
print("label = ", label[:10])

plt.scatter(data[:, 0], data[:, 1], c=label)
plt.title("Original Data")
# +
def plot_decision_boundary(predict_func, data, label):
    """Plot the decision boundary of a classifier.

    Args:
        predict_func (callable): prediction function
        data (numpy.ndarray): training data
        label (numpy.ndarray): training labels
    """
    x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
    y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5
    h = 0.01

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = predict_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(data[:, 0], data[:, 1], c=label, cmap=plt.cm.Spectral)
    plt.show()
# +
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))


class Logistic(object):
    """Logistic regression model."""

    def __init__(self, data, label):
        self.data = data
        self.label = label
        self.data_num, n = np.shape(data)
        self.weights = np.ones(n)
        self.b = 1

    def train(self, num_iteration=150):
        """Stochastic gradient ascent.

        Args:
            num_iteration (int): number of iterations
        """
        for j in range(num_iteration):
            data_index = list(range(self.data_num))
            for i in range(self.data_num):
                # learning rate
                alpha = 0.01
                # pick a random sample not yet used in this pass
                rand_index = int(np.random.uniform(0, len(data_index)))
                sample = data_index[rand_index]
                # update rule: theta_j += alpha * (y - h(x)) * x_j
                error = self.label[sample] - sigmoid(np.dot(self.data[sample], self.weights) + self.b)
                self.weights += alpha * error * self.data[sample]
                self.b += alpha * error
                del data_index[rand_index]

    def predict(self, predict_data):
        """Predict a label (0 or 1) for each row of predict_data."""
        result = list(map(lambda x: 1 if np.dot(self.weights, x) + self.b > 0 else 0,
                          predict_data))
        return np.array(result)
# -
logistic = Logistic(data, label)
logistic.train(200)
plot_decision_boundary(lambda x: logistic.predict(x), data, label)
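# As a quick check (our addition, not in the original), compute the training
# accuracy of the handwritten model on the moons data:
acc = np.mean(logistic.predict(data) == label)
print("train accuracy = %f" % acc)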
# ## How to use sklearn to solve the problem
#
# +
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# calculate train/test data number
N = len(data)
N_train = int(N*0.8)
N_test = N - N_train

# split train/test data
x_train = data[:N_train, :]
y_train = label[:N_train]
x_test = data[N_train:, :]
y_test = label[N_train:]

# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)

pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f" % acc_train)
print("accuracy test = %f" % acc_test)

# plot confusion matrix
cm = confusion_matrix(y_test, pred_test)
plt.matshow(cm)
plt.title(u'Confusion Matrix')
plt.colorbar()
plt.ylabel(u'Groundtruth')
plt.xlabel(u'Predict')
plt.show()
# -
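# Note (our addition): the sequential 80/20 split above works because `make_moons` shuffles its samples; for ordered data, sklearn's `train_test_split` is a safer choice since it shuffles by default. A minimal sketch (`x_train2` and friends are our own names):

# +
from sklearn.model_selection import train_test_split

x_train2, x_test2, y_train2, y_test2 = train_test_split(
    data, label, test_size=0.2, random_state=0)
print(x_train2.shape, x_test2.shape)
# -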
# ## Multi-class recognition
# ### Load & show the data

# +
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits

# load data
digits = load_digits()

# copied from notebook 02_sklearn_data.ipynb
fig = plt.figure(figsize=(6, 6))  # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary)
    # label the image with the target value
    ax.text(0, 7, str(digits.target[i]))
# -
# ### Visualizing the Data
#
# A good first step for many problems is to visualize the data using one of the Dimensionality Reduction techniques we saw earlier. We'll start with the most straightforward one, Principal Component Analysis (PCA).
#
# PCA seeks orthogonal linear combinations of the features which show the greatest variance, and as such, can help give you a good idea of the structure of the data set. Here we'll use PCA with the randomized SVD solver, because it's faster for large N.

# +
from sklearn.decomposition import PCA
pca = PCA(n_components=2, svd_solver="randomized")
proj = pca.fit_transform(digits.data)
plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)
plt.colorbar()
# -
# A weakness of PCA is that it produces a linear dimensionality reduction:
# this may miss some interesting relationships in the data. If we want to
# see a nonlinear mapping of the data, we can use one of the several
# methods in the `manifold` module. Here we'll use [Isomap](https://blog.csdn.net/VictoriaW/article/details/78497316) (short for
# Isometric Mapping), which is a manifold learning method based on
# graph theory:

# +
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=5, n_components=2)
proj = iso.fit_transform(digits.data)
plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)
plt.colorbar()
# -
# ## Program

# +
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# load digit data
digits, dig_label = load_digits(return_X_y=True)
print(digits.shape)

# calculate train/test data number
N = len(digits)
N_train = int(N*0.8)
N_test = N - N_train

# split train/test data
x_train = digits[:N_train, :]
y_train = dig_label[:N_train]
x_test = digits[N_train:, :]
y_test = dig_label[N_train:]

# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)

pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy_test = %f" % (acc_train, acc_test))

# score() returns the mean accuracy, so it matches accuracy_score here
score_train = lr.score(x_train, y_train)
score_test = lr.score(x_test, y_test)
print("score_train = %f, score_test = %f" % (score_train, score_test))
# +
from sklearn.metrics import confusion_matrix

# plot confusion matrix
cm = confusion_matrix(y_test, pred_test)
plt.matshow(cm)
plt.title(u'Confusion Matrix')
plt.colorbar()
plt.ylabel(u'Groundtruth')
plt.xlabel(u'Predict')
plt.show()
# -
# ## Exercise - How to draw mis-classified data?
#
# 1. How to obtain the mis-classified indices?
# 2. How to draw them? (a possible sketch follows below)
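# One possible approach (our sketch, reusing the `x_test`, `y_test`, and `pred_test` variables from the multi-class program above): find the mis-classified indices with `np.where`, then draw those digit images labelled with predicted and true values.

# +
# indices where the prediction disagrees with the ground truth
wrong = np.where(pred_test != y_test)[0]
print("number of mis-classified samples:", len(wrong))

fig = plt.figure(figsize=(6, 6))
for k, idx in enumerate(wrong[:16]):  # show at most 16 of them
    ax = fig.add_subplot(4, 4, k + 1, xticks=[], yticks=[])
    ax.imshow(x_test[idx].reshape(8, 8), cmap=plt.cm.binary)
    # p = predicted label, t = true label
    ax.text(0, 7, "p=%d, t=%d" % (pred_test[idx], y_test[idx]))
plt.show()
# -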
# ## References
#
# * [逻辑回归模型(Logistic Regression, LR)基础](https://www.cnblogs.com/sparkwen/p/3441197.html)
# * [逻辑回归(Logistic Regression)](http://www.cnblogs.com/BYRans/p/4713624.html)
