
Remove some unused files

fetches/feikei/master
Shuhui Bu 6 years ago
commit b708c58ef6
12 changed files with 4188 additions and 371 deletions
1. +1 −1      0_numpy_matplotlib_scipy_sympy/matplotlib_ani2.ipynb
2. +3954 −11  1_logistic_regression/Least_squares.ipynb
3. +74 −1     1_logistic_regression/Least_squares.py
4. +27 −1     1_logistic_regression/Logistic_regression.ipynb
5. +132 −0    1_logistic_regression/Logistic_regression.py
6. BIN        1_logistic_regression/images/fig1.gif
7. BIN        1_logistic_regression/images/fig2.gif
8. BIN        1_logistic_regression/images/fig3.gif
9. +0 −149    1_logistic_regression/linear models.ipynb
10. +0 −66    1_logistic_regression/linear_regression.py
11. +0 −70    1_logistic_regression/logistic3.py
12. +0 −72    1_logistic_regression/logistic_demo.py

+1 −1   0_numpy_matplotlib_scipy_sympy/matplotlib_ani2.ipynb
(File diff suppressed because it is too large)


+3954 −11   1_logistic_regression/Least_squares.ipynb
(File diff suppressed because it is too large)


+74 −1   1_logistic_regression/Least_squares.py

@@ -113,6 +113,78 @@ plt.legend()
plt.show()
# -


# ## How to use an iterative method to estimate the parameters?
#

# +
n_epoch = 3000     # number of epochs
a, b = 1, 1        # initial parameters
epsilon = 0.001    # learning rate

for i in range(n_epoch):
    # stochastic updates: one gradient step per sample
    for j in range(N):
        a = a + epsilon*2*(Y[j] - a*X[j] - b)*X[j]
        b = b + epsilon*2*(Y[j] - a*X[j] - b)

    # total squared loss after this epoch
    L = 0
    for j in range(N):
        L = L + (Y[j]-a*X[j]-b)**2
    print("epoch %4d: loss = %f, a = %f, b = %f" % (i, L, a, b))

x_min = np.min(X)
x_max = np.max(X)
y_min = a * x_min + b
y_max = a * x_max + b

plt.scatter(X, Y, label='original data')
plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
plt.legend()
plt.show()
# -
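
# For comparison, the closed-form least-squares solution on the same data, mirroring the normal-equation code from the removed `linear_regression.py` (a sketch assuming the same `X`, `Y`, `N` as above):

# +
S_X2 = np.sum(X*X)
S_X = np.sum(X)
S_XY = np.sum(X*Y)
S_Y = np.sum(Y)

# solve the 2x2 normal equations [[S_X2, S_X], [S_X, N]] [a, b]^T = [S_XY, S_Y]^T
A1 = np.array([[S_X2, S_X], [S_X, N]])
B1 = np.array([S_XY, S_Y])
coeff = np.linalg.inv(A1).dot(B1)
print("closed-form: a = %f, b = %f" % (coeff[0], coeff[1]))
# -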

# ## How to visualize the iterative process?

# +
# %matplotlib nbagg

import matplotlib.pyplot as plt
import matplotlib.animation as animation

n_epoch = 3000     # number of epochs
a, b = 1, 1        # initial parameters
epsilon = 0.001    # learning rate

fig = plt.figure()
imgs = []

for i in range(n_epoch):
    for j in range(N):
        a = a + epsilon*2*(Y[j] - a*X[j] - b)*X[j]
        b = b + epsilon*2*(Y[j] - a*X[j] - b)

    L = 0
    for j in range(N):
        L = L + (Y[j]-a*X[j]-b)**2
    #print("epoch %4d: loss = %f, a = %f, b = %f" % (i, L, a, b))

    # record a frame every 50 epochs
    if i % 50 == 0:
        x_min = np.min(X)
        x_max = np.max(X)
        y_min = a * x_min + b
        y_max = a * x_max + b

        # collect this frame's artists (scatter + fitted line)
        scat = plt.scatter(X, Y, label='original data')
        line, = plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
        imgs.append([scat, line])

ani = animation.ArtistAnimation(fig, imgs)
plt.show()
# -
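
# (To export the animation as a GIF one could use, e.g., `ani.save("fig.gif", writer="imagemagick")`, assuming ImageMagick is installed.)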

# ## How to use batch update method?
#
# If some of the data are outliers, the per-sample updates used above can be pulled around by them; a batch update instead accumulates the gradient over the whole dataset and applies one averaged step per epoch, as sketched below.
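#
# A minimal sketch of such a batch update, assuming the same `X`, `Y`, `N` as in the cells above:

# +
n_epoch = 1000     # number of epochs
a, b = 1, 1        # initial parameters
epsilon = 0.001    # learning rate

for i in range(n_epoch):
    # accumulate the gradient of the squared loss over all samples
    grad_a, grad_b = 0.0, 0.0
    for j in range(N):
        r = Y[j] - a*X[j] - b       # residual of sample j
        grad_a += -2 * r * X[j]
        grad_b += -2 * r
    # one averaged parameter update per epoch
    a = a - epsilon * grad_a / N
    b = b - epsilon * grad_b / N
# -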

# ## How to fit a polynomial function?
#
# If we observe a missile at several instants in time, how can we estimate its trajectory? According to physics, the trajectory can be formulated as a polynomial in time.
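#
# For instance, a quadratic trajectory $y(t) = c_2 t^2 + c_1 t + c_0$ can be estimated by least squares; a minimal sketch with synthetic, hypothetical observations `t`, `y`:

# +
# fit a quadratic trajectory to noisy synthetic observations
t = np.linspace(0, 10, 50)
y = -4.9*t**2 + 49*t + 2 + np.random.normal(0, 5, t.shape)
coeffs = np.polyfit(t, y, deg=2)    # least-squares fit, highest power first
print("estimated [c2, c1, c0] =", coeffs)
# -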
@@ -217,8 +289,9 @@ Y_est = regr.predict(X_test)
print("Y_est = ", Y_est) print("Y_est = ", Y_est)
print("Y_test = ", Y_test) print("Y_test = ", Y_test)
err = (Y_est - Y_test)**2 err = (Y_est - Y_test)**2
err2 = sklearn.metrics.mean_squared_error(Y_test, Y_est)
score = regr.score(X_test, Y_test) score = regr.score(X_test, Y_test)
print("err = %f, score = %f" % (np.sqrt(np.sum(err))/N_test, score))
print("err = %f (%f), score = %f" % (np.sqrt(np.sum(err))/N_test, np.sqrt(err2), score))




# plot data


+27 −1   1_logistic_regression/Logistic_regression.ipynb

@@ -5,12 +5,28 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Logistic Regression\n", "# Logistic Regression\n",
"\n",
"逻辑回归(Logistic Regression, LR)模型其实仅在线性回归的基础上,套用了一个逻辑函数,但也就由于这个逻辑函数,使得逻辑回归模型成为了机器学习领域一颗耀眼的明星,更是计算广告学的核心。本节主要详述逻辑回归模型的基础。\n",
"\n",
"\n",
"## 1 逻辑回归模型\n",
"回归是一种比较容易理解的模型,就相当于$y=f(x)$,表明自变量$x$与因变量$y$的关系。最常见问题有如医生治病时的望、闻、问、切,之后判定病人是否生病或生了什么病,其中的望闻问切就是获取自变量$x$,即特征数据,判断是否生病就相当于获取因变量$y$,即预测分类。\n",
"\n",
"最简单的回归是线性回归,在此借用Andrew NG的讲义,有如图所示,$X$为数据点——肿瘤的大小,$Y$为观测值——是否是恶性肿瘤。通过构建线性回归模型,如$h_\\theta(x)$所示,构建线性回归模型后,即可以根据肿瘤大小,预测是否为恶性肿瘤$h_\\theta(x)) \\ge 0.5$为恶性,$h_\\theta(x) \\lt 0.5$为良性。\n",
"\n",
"![LinearRegression](images/fig1.gif)\n",
"\n",
"然而线性回归的鲁棒性很差,例如在上图的数据集上建立回归,因最右边噪点的存在,使回归模型在训练集上表现都很差。这主要是由于线性回归在整个实数域内敏感度一致,而分类范围,需要在$[0,1]$。\n",
"\n",
"逻辑回归就是一种减小预测范围,将预测值限定为$[0,1]$间的一种回归模型,其回归方程与回归曲线如图2所示。逻辑曲线在$z=0$时,十分敏感,在$z>>0$或$z<<0$处,都不敏感,将预测值限定为$(0,1)$。\n",
"\n",
"![LogisticFunction](images/fig2.gif)\n",
"\n" "\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -171,6 +187,16 @@
"logistic.train(200)\n", "logistic.train(200)\n",
"plot_decision_boundary(lambda x: logistic.predict(x), data, label)" "plot_decision_boundary(lambda x: logistic.predict(x), data, label)"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"\n",
"* [逻辑回归模型(Logistic Regression, LR)基础](https://www.cnblogs.com/sparkwen/p/3441197.html)\n",
"* [逻辑回归(Logistic Regression)](http://www.cnblogs.com/BYRans/p/4713624.html)"
]
}
],
"metadata": {


+132 −0   1_logistic_regression/Logistic_regression.py

@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---

# # Logistic Regression
#
# The logistic regression (LR) model is simply linear regression with a logistic function applied on top of it, yet it is precisely this logistic function that has made logistic regression a shining star of machine learning and the core of computational advertising. This section covers the fundamentals of the logistic regression model.
#
#
# ## 1 The logistic regression model
# Regression is an easy model to understand: it is essentially $y=f(x)$, expressing the relationship between an independent variable $x$ and a dependent variable $y$. A familiar example is a doctor examining a patient and then deciding whether, and with what, the patient is ill: the examination collects the independent variable $x$, i.e. the feature data, while the diagnosis is the dependent variable $y$, i.e. the predicted class.
#
# The simplest regression is linear regression. Borrowing a figure from Andrew NG's lecture notes, $X$ is the data point (tumor size) and $Y$ is the observation (whether the tumor is malignant). After fitting a linear regression model $h_\theta(x)$, malignancy can be predicted from tumor size: $h_\theta(x) \ge 0.5$ means malignant, $h_\theta(x) \lt 0.5$ means benign.
#
# ![LinearRegression](images/fig1.gif)
#
# However, linear regression is not robust: fitted on the dataset above, the noisy point on the far right makes the model perform poorly even on the training set. This is because linear regression is equally sensitive over the whole real line, while classification needs outputs in $[0,1]$.
#
# Logistic regression is a regression model that shrinks this range and confines predictions to $[0,1]$; its equation and curve are shown in Figure 2. The logistic curve is very sensitive near $z=0$ and insensitive for $z \gg 0$ or $z \ll 0$, which restricts the predicted values to $(0,1)$.
#
# ![LogisticFunction](images/fig2.gif)
#
#
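#
# To see this saturation numerically, a quick self-contained check of the logistic function at a few points:

# +
import numpy as np
z = np.array([-10.0, -2.0, 0.0, 2.0, 10.0])
print(1.0 / (1.0 + np.exp(-z)))    # -> [~0.00005, 0.119, 0.5, 0.881, ~0.99995]
# -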

# +
# %matplotlib inline

from __future__ import division
import numpy as np
import sklearn.datasets
import matplotlib.pyplot as plt

np.random.seed(0)


# +
# load sample data
data, label = sklearn.datasets.make_moons(200, noise=0.30)

print("data = ", data[:10, :])
print("label = ", label[:10])

plt.scatter(data[:,0], data[:,1], c=label)
plt.title("Original Data")

# +
def plot_decision_boundary(predict_func, data, label):
    """Plot the decision boundary.
    Args:
        predict_func (callable): prediction function
        data (numpy.ndarray): training data
        label (numpy.ndarray): training labels
    """
    x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
    y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5
    h = 0.01

    # evaluate the prediction function on a dense grid
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    Z = predict_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(data[:, 0], data[:, 1], c=label, cmap=plt.cm.Spectral)
    plt.show()



# +
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

class Logistic(object):
    """Logistic regression model."""
    def __init__(self, data, label):
        self.data = data
        self.label = label

        self.data_num, n = np.shape(data)
        self.weights = np.ones(n)
        self.b = 1

    def train(self, num_iteration=150):
        """Stochastic gradient ascent.
        Args:
            num_iteration (int): number of iterations
        """
        for j in range(num_iteration):
            data_index = list(range(self.data_num))
            for i in range(self.data_num):
                # learning rate
                alpha = 0.01
                # pick a random remaining sample (without replacement)
                pos = int(np.random.uniform(0, len(data_index)))
                rand_index = data_index[pos]
                error = self.label[rand_index] - sigmoid(sum(self.data[rand_index] * self.weights + self.b))
                self.weights += alpha * error * self.data[rand_index]
                self.b += alpha * error
                del(data_index[pos])

    def predict(self, predict_data):
        """Predict class 1 if the linear score is positive (i.e. sigmoid > 0.5)."""
        result = list(map(lambda x: 1 if sum(self.weights * x + self.b) > 0 else 0,
                          predict_data))
        return np.array(result)

# -

logistic = Logistic(data, label)
logistic.train(200)
plot_decision_boundary(lambda x: logistic.predict(x), data, label)

# ## References
#
# * [逻辑回归模型(Logistic Regression, LR)基础](https://www.cnblogs.com/sparkwen/p/3441197.html)
# * [逻辑回归(Logistic Regression)](http://www.cnblogs.com/BYRans/p/4713624.html)

BIN   1_logistic_regression/images/fig1.gif   (Width: 515 | Height: 348 | Size: 19 kB)

BIN   1_logistic_regression/images/fig2.gif   (Width: 586 | Height: 297 | Size: 9.3 kB)

BIN   1_logistic_regression/images/fig3.gif   (Width: 433 | Height: 139 | Size: 4.1 kB)

+0 −149   1_logistic_regression/linear models.ipynb
(File diff suppressed because it is too large)


+0 −66   1_logistic_regression/linear_regression.py

@@ -1,66 +0,0 @@

import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import datasets

# load data
d = datasets.load_diabetes()

X = d.data[:, 2]
Y = d.target

# draw original data
plt.scatter(X, Y)
plt.show()


###############################################################################
# Least squares
###############################################################################

# L = \sum_{i=1, N} (y_i - a*x_i - b)^2
N = X.shape[0]

S_X2 = np.sum(X*X)
S_X = np.sum(X)
S_XY = np.sum(X*Y)
S_Y = np.sum(Y)

A1 = np.array([[S_X2, S_X], [S_X, N]])
B1 = np.array([S_XY, S_Y])

coeff = np.linalg.inv(A1).dot(B1)

x_min = np.min(X)
x_max = np.max(X)
y_min = coeff[0] * x_min + coeff[1]
y_max = coeff[0] * x_max + coeff[1]

plt.scatter(X, Y)
plt.plot([x_min, x_max], [y_min, y_max], 'r')
plt.show()


###############################################################################
# Linear regression
###############################################################################
# the loss function
# L = \sum_{i=1, N} (y_i - a*x_i - b)^2

n_train = 1000


a, b = 1, 1
epsilon = 0.001

for i in range(n_train):
    for j in range(N):
        a = a + epsilon*2*(Y[j] - a*X[j] - b)*X[j]
        b = b + epsilon*2*(Y[j] - a*X[j] - b)

    L = 0
    for j in range(N):
        L = L + (Y[j]-a*X[j]-b)**2
    print("epoch %4d: loss = %f" % (i, L))


+0 −70   1_logistic_regression/logistic3.py

@@ -1,70 +0,0 @@
# -*- coding=utf8 -*-
from __future__ import division
import numpy as np
import sklearn.datasets
import matplotlib.pyplot as plt

np.random.seed(0)
data, label = sklearn.datasets.make_moons(200, noise=0.30)

def plot_decision_boundary(predict_func, data, label):
    """Plot the decision boundary.
    Args:
        predict_func (callable): prediction function
        data (numpy.ndarray): training data
        label (numpy.ndarray): training labels
    """
    x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
    y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5
    h = 0.01

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    Z = predict_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(data[:, 0], data[:, 1], c=label, cmap=plt.cm.Spectral)
    plt.show()

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

class Logistic(object):
    """Logistic regression model."""
    def __init__(self, data, label):
        self.data = data
        self.label = label

        self.data_num, n = np.shape(data)
        self.weights = np.ones(n)
        self.b = 1

    def train(self, num_iteration=150):
        """Stochastic gradient ascent.
        Args:
            num_iteration (int): number of iterations
        """
        for j in range(num_iteration):
            data_index = list(range(self.data_num))
            for i in range(self.data_num):
                # learning rate
                alpha = 0.01
                rand_index = int(np.random.uniform(0, len(data_index)))
                error = self.label[rand_index] - sigmoid(sum(self.data[rand_index] * self.weights + self.b))
                self.weights += alpha * error * self.data[rand_index]
                self.b += alpha * error
                del(data_index[rand_index])

    def predict(self, predict_data):
        """Prediction function."""
        result = list(map(lambda x: 1 if sum(self.weights * x + self.b) > 0 else 0,
                          predict_data))
        return np.array(result)


if __name__ == '__main__':
    logistic = Logistic(data, label)
    logistic.train(200)
    plot_decision_boundary(lambda x: logistic.predict(x), data, label)

+0 −72   1_logistic_regression/logistic_demo.py

@@ -1,72 +0,0 @@
# -*- coding=utf8 -*-
from __future__ import division
import numpy as np
import sklearn.datasets
import matplotlib.pyplot as plt

np.random.seed(0)
data, label = sklearn.datasets.make_moons(200, noise=0.30)

def plot_decision_boundary(predict_func, data, label):
    """Plot the decision boundary.
    Args:
        predict_func (callable): prediction function
        data (numpy.ndarray): training data
        label (numpy.ndarray): training labels
    """
    x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
    y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5
    h = 0.01

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    Z = predict_func(np.c_[xx.ravel(), yy.ravel()])
    print(Z.shape)
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(data[:, 0], data[:, 1], c=label, cmap=plt.cm.Spectral)
    plt.show()

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

class Logistic(object):
    """Logistic regression model."""
    def __init__(self, data, label):
        self.data = data
        self.label = label

        self.data_num, n = np.shape(data)
        self.weights = np.ones(n)
        self.b = 1

    def train(self, num_iteration=150):
        """Stochastic gradient ascent.
        Args:
            num_iteration (int): number of iterations
        """
        for j in range(num_iteration):
            data_index = range(self.data_num)
            for i in range(self.data_num):
                # learning rate
                alpha = 0.01
                rand_index = int(np.random.uniform(0, len(data_index)))
                error = self.label[rand_index] - sigmoid(sum(self.data[rand_index] * self.weights + self.b))
                self.weights += alpha * error * self.data[rand_index]
                self.b += alpha * error

    def predict(self, predict_data):
        """Prediction function."""
        result = map(lambda x: 1 if sum(self.weights * x + self.b) > 0 else 0,
                     predict_data)
        print(result)
        return np.array(result)


if __name__ == '__main__':
    logistic = Logistic(data, label)
    logistic.train(200)
    plot_decision_boundary(lambda x: logistic.predict(x), data, label)
