# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---
# # Logistic Regression
#
# The logistic regression (LR) model is simply linear regression with a logistic function applied on top, yet it is precisely this logistic function that has made logistic regression a shining star of machine learning and a cornerstone of computational advertising. This section covers the fundamentals of the logistic regression model.
#
#
# ## 1 Logistic regression model
# Regression is a fairly intuitive model: it amounts to $y=f(x)$, expressing the relationship between the independent variable $x$ and the dependent variable $y$. A familiar example is a doctor's diagnosis: observing, listening, questioning, and pulse-taking yield the independent variables $x$ (the feature data), while judging whether the patient is ill corresponds to the dependent variable $y$ (the predicted class).
#
# The simplest regression is linear regression. Borrowing a figure from Andrew Ng's lecture notes, $X$ is the data point (tumor size) and $Y$ is the observation (whether the tumor is malignant). After fitting a linear regression model $h_\theta(x)$, tumor size can be used to predict malignancy: $h_\theta(x) \ge 0.5$ predicts malignant, $h_\theta(x) \lt 0.5$ predicts benign.
#
# ![LinearRegression](images/fig1.gif)
#
# Linear regression, however, is not robust: fitting a regression on the data set above, the noisy point at the far right makes the model perform poorly even on the training set. This is because linear regression is equally sensitive over the entire real line, while classification requires outputs in $[0,1]$.
#
# Logistic regression is a regression model that shrinks the prediction range, confining predicted values to $(0,1)$. Its regression equation and curve are shown in Figure 2. The logistic curve is highly sensitive near $z=0$ and insensitive where $z \gg 0$ or $z \ll 0$, which squashes predictions into $(0,1)$.
#
# ![LogisticFunction](images/fig2.gif)
#
#
# ### Logistic regression expression
#
# This function is called the logistic function, also known as the sigmoid function. Its formula is:
#
# $$
# g(z) = \frac{1}{1+e^{-z}}
# $$
#
# As $z$ tends to $+\infty$, $g(z)$ approaches 1; as $z$ tends to $-\infty$, $g(z)$ approaches 0. The graph of the logistic function is shown in the figure above. The derivative of the logistic function has a property that will be used in the derivation below:
# $$
# g'(z) = \frac{d}{dz} \frac{1}{1+e^{-z}} \\
# = \frac{1}{(1+e^{-z})^2}(e^{-z}) \\
# = \frac{1}{(1+e^{-z})} \left(1 - \frac{1}{(1+e^{-z})}\right) \\
# = g(z)(1-g(z))
# $$
#
#
# +
# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# plot the logistic (sigmoid) curve over [-10, 10]
plt.figure()
plt.axis([-10, 10, 0, 1])
plt.grid(True)
X = np.arange(-10, 10, 0.1)
y = 1 / (1 + np.exp(-X))
plt.plot(X, y, 'b-')
plt.title("Logistic function")
# -
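
# As a quick numerical sanity check of the identity $g'(z) = g(z)(1-g(z))$ (an
# illustration added here, not part of the original derivation), we can compare
# a central finite difference of $g$ with the closed form:

# +
z = np.linspace(-5, 5, 11)
g = 1 / (1 + np.exp(-z))

eps = 1e-6
# central finite-difference approximation of g'(z)
g_num = (1 / (1 + np.exp(-(z + eps))) - 1 / (1 + np.exp(-(z - eps)))) / (2 * eps)
# closed form g(z)(1 - g(z))
g_closed = g * (1 - g)

print("max |difference| =", np.abs(g_num - g_closed).max())  # agrees up to floating-point error
# -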
# Logistic regression is essentially linear regression with one extra function mapping from features to outcome: first take a linear combination of the features, then apply the function $g(z)$ as the hypothesis to make the prediction. $g(z)$ maps continuous values into $(0, 1)$. Substituting the linear regression model into $g(z)$ gives the logistic regression expression:
#
# $$
# h_\theta(x) = g(\theta^T x) = \frac{1}{1+e^{-\theta^T x}}
# $$
# ### Soft classification with logistic regression
#
# The logistic function normalizes $h_\theta(x)$ into $(0,1)$, and this value of $y$ has a special meaning: it is the probability that the outcome is 1. For an input $x$, the probabilities of class 1 and class 0 are therefore:
#
# $$
# P(y=1|x,\theta) = h_\theta(x) \\
# P(y=0|x,\theta) = 1 - h_\theta(x)
# $$
#
# Merging the two expressions gives:
#
# $$
# p(y|x,\theta) = (h_\theta(x))^y (1 - h_\theta(x))^{1-y}
# $$
#
#
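# A minimal sketch of these probabilities; the parameter vector `theta` and the
# 2-D sample `x` below are invented here purely for illustration:

# +
theta = np.array([1.0, -2.0])   # hypothetical parameters
x = np.array([0.5, 0.3])        # hypothetical 2-D sample

h = 1 / (1 + np.exp(-theta.dot(x)))   # h_theta(x) = P(y=1 | x, theta)
print("P(y=1|x) =", h)
print("P(y=0|x) =", 1 - h)

# the merged expression recovers each case for y in {0, 1}
for y in (0, 1):
    print("p(y=%d|x) =" % y, h**y * (1 - h)**(1 - y))
# -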
# ### Gradient ascent
#
# With the logistic regression expression in hand, the next step is similar to linear regression: build the likelihood function, apply maximum likelihood estimation, and derive the iterative update for $\theta$. The only difference is that we use gradient ascent rather than gradient descent, since here we are maximizing the likelihood.
#
# Assuming the training samples are mutually independent, the likelihood function is:
# ![Loss](images/eq_loss.png)
#
# Taking the log of the likelihood gives:
# ![LogLoss](images/eq_logloss.png)
#
# Taking the partial derivative of the log-likelihood with respect to $\theta$, here for the case of a single training sample:
# ![LogLossDiff](images/eq_logloss_diff.png)
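#
# In case the images do not render, the standard forms of these three equations, written to match the notation above, are:
#
# $$
# L(\theta) = \prod_{i=1}^{m} (h_\theta(x^i))^{y^i} (1 - h_\theta(x^i))^{1-y^i} \\
# \ell(\theta) = \log L(\theta) = \sum_{i=1}^{m} \left[ y^i \log h_\theta(x^i) + (1-y^i) \log (1 - h_\theta(x^i)) \right] \\
# \frac{\partial \ell(\theta)}{\partial \theta_j} = (y - h_\theta(x)) x_j
# $$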
#
# In this derivation:
# * The first step rewrites the partial derivative using the derivative rule for the logarithm: if $y=\ln x$, then $y'=1/x$.
# * The second step uses the derivative property of $g(z)$: $g'(z) = g(z)(1 - g(z))$.
# * The third step is ordinary algebraic rearrangement.
#
# This gives the update direction for each gradient-ascent iteration, so the iterative update for $\theta$ is:
# $$
# \theta_j := \theta_j + \alpha(y^i - h_\theta(x^i)) x_j^i
# $$
#
#
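# As a minimal sketch of this update in batch form (the stochastic version is
# implemented in the Program section below), with toy data and step size chosen
# arbitrarily for illustration:

# +
# toy, linearly separable 2-D data (invented for illustration; no intercept term)
X_toy = np.array([[0.5, 1.5], [1.0, 2.0], [0.2, 1.0],
                  [1.5, 0.5], [2.0, 1.0], [1.0, 0.2]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

theta = np.zeros(X_toy.shape[1])
alpha = 0.1
for _ in range(1000):
    h = 1 / (1 + np.exp(-X_toy.dot(theta)))   # h_theta(x) for every sample
    theta += alpha * X_toy.T.dot(y_toy - h)   # theta_j += alpha * sum_i (y^i - h^i) x_j^i
print("theta =", theta)   # points along the separating direction
# -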
# ## Program

# +
# %matplotlib inline
from __future__ import division
import numpy as np
import sklearn.datasets
import matplotlib.pyplot as plt

np.random.seed(0)
# +
# load sample data
data, label = sklearn.datasets.make_moons(200, noise=0.30)
print("data = ", data[:10, :])
print("label = ", label[:10])

plt.scatter(data[:, 0], data[:, 1], c=label)
plt.title("Original Data")
# +
def plot_decision_boundary(predict_func, data, label):
    """Plot the decision boundary.

    Args:
        predict_func (callable): prediction function
        data (numpy.ndarray): training data
        label (numpy.ndarray): training labels
    """
    x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
    y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5
    h = 0.01

    # evaluate the prediction function on a dense grid and draw filled contours
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = predict_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(data[:, 0], data[:, 1], c=label, cmap=plt.cm.Spectral)
    plt.show()
# +
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))


class Logistic(object):
    """Logistic regression model."""

    def __init__(self, data, label):
        self.data = data
        self.label = label
        self.data_num, n = np.shape(data)
        self.weights = np.ones(n)
        self.b = 1

    def train(self, num_iteration=150):
        """Stochastic gradient ascent.

        Args:
            num_iteration (int): number of iterations
        """
        for j in range(num_iteration):
            data_index = list(range(self.data_num))
            for i in range(self.data_num):
                # learning rate
                alpha = 0.01
                # pick a random sample without replacement within this pass
                rand_index = int(np.random.uniform(0, len(data_index)))
                idx = data_index[rand_index]
                error = self.label[idx] - sigmoid(sum(self.data[idx] * self.weights) + self.b)
                self.weights += alpha * error * self.data[idx]
                self.b += alpha * error
                del(data_index[rand_index])

    def predict(self, predict_data):
        """Predict the class (1 or 0) for each sample."""
        result = list(map(lambda x: 1 if sum(self.weights * x) + self.b > 0 else 0,
                          predict_data))
        return np.array(result)
# -
logistic = Logistic(data, label)
logistic.train(200)
plot_decision_boundary(lambda x: logistic.predict(x), data, label)
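
# The fixed learning rate above can leave the weights oscillating late in training.
# A common refinement (an addition here, not part of the original code) is to decay
# alpha as training proceeds; a minimal sketch as a subclass:

# +
class LogisticDecay(Logistic):
    """Logistic regression with a decaying learning rate (illustrative variant)."""

    def train(self, num_iteration=150):
        for j in range(num_iteration):
            data_index = list(range(self.data_num))
            for i in range(self.data_num):
                # step size shrinks with both the pass (j) and the step (i)
                alpha = 4 / (1.0 + j + i) + 0.01
                rand_index = int(np.random.uniform(0, len(data_index)))
                idx = data_index[rand_index]
                error = self.label[idx] - sigmoid(sum(self.data[idx] * self.weights) + self.b)
                self.weights += alpha * error * self.data[idx]
                self.b += alpha * error
                del(data_index[rand_index])

logistic2 = LogisticDecay(data, label)
logistic2.train(200)
plot_decision_boundary(lambda x: logistic2.predict(x), data, label)
# -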
# ## How to use sklearn to solve the problem
#
# +
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# calculate train/test data number
N = len(data)
N_train = int(N*0.8)
N_test = N - N_train

# split train/test data
x_train = data[:N_train, :]
y_train = label[:N_train]
x_test = data[N_train:, :]
y_test = label[N_train:]

# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)

pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f" % acc_train)
print("accuracy test = %f" % acc_test)

# plot confusion matrix
cm = confusion_matrix(y_test, pred_test)
plt.matshow(cm)
plt.title(u'Confusion Matrix')
plt.colorbar()
plt.ylabel(u'Groundtruth')
plt.xlabel(u'Predict')
plt.show()
# -
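
# Logistic regression is a soft classifier (see the soft-classification section
# above), so sklearn also exposes the per-class probabilities via predict_proba;
# predict() is equivalent to thresholding P(y=1|x) at 0.5:

# +
proba_test = lr.predict_proba(x_test)   # shape (N_test, 2): columns P(y=0|x), P(y=1|x)
print("first 5 probabilities:\n", proba_test[:5])
print((proba_test[:, 1] > 0.5).astype(int)[:5], pred_test[:5])   # same as lr.predict
# -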
# ## Multi-class recognition
# +
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

# load digits data (each sample is an 8x8 image flattened to 64 features)
digits, dig_label = load_digits(return_X_y=True)
print(digits.shape)

# draw one digit
plt.gray()
plt.matshow(digits[0].reshape([8, 8]))
plt.show()

# calculate train/test data number
N = len(digits)
N_train = int(N*0.8)
N_test = N - N_train

# split train/test data
x_train = digits[:N_train, :]
y_train = dig_label[:N_train]
x_test = digits[N_train:, :]
y_test = dig_label[N_train:]

# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)

pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy test = %f" % (acc_train, acc_test))

# score() computes the same mean accuracy directly
score_train = lr.score(x_train, y_train)
score_test = lr.score(x_test, y_test)
print("score_train = %f, score_test = %f" % (score_train, score_test))

# plot confusion matrix
cm = confusion_matrix(y_test, pred_test)
plt.matshow(cm)
plt.title(u'Confusion Matrix')
plt.colorbar()
plt.ylabel(u'Groundtruth')
plt.xlabel(u'Predict')
plt.show()
# -
# ## Exercise - How to draw mis-classified data?
#
# 1. How to obtain the indices of the mis-classified samples?
# 2. How to draw them? (One possible approach is sketched below.)
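
# One possible solution sketch, using the digits test split from the previous
# cell: np.where yields the mis-classified indices, and each corresponding
# 8x8 image can then be drawn with its groundtruth and prediction.

# +
# indices where the prediction disagrees with the ground truth
mis_idx = np.where(pred_test != y_test)[0]
print("number of mis-classified samples:", len(mis_idx))

# draw the first few mis-classified digits
for i in mis_idx[:3]:
    plt.matshow(x_test[i].reshape([8, 8]))
    plt.title("groundtruth = %d, predict = %d" % (y_test[i], pred_test[i]))
    plt.show()
# -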
# ## References
#
# * [逻辑回归模型(Logistic Regression, LR)基础](https://www.cnblogs.com/sparkwen/p/3441197.html)
# * [逻辑回归(Logistic Regression)](http://www.cnblogs.com/BYRans/p/4713624.html)
