- # ---
- # jupyter:
- #   jupytext_format_version: '1.2'
- #   kernelspec:
- #     display_name: Python 3
- #     language: python
- #     name: python3
- #   language_info:
- #     codemirror_mode:
- #       name: ipython
- #       version: 3
- #     file_extension: .py
- #     mimetype: text/x-python
- #     name: python
- #     nbconvert_exporter: python
- #     pygments_lexer: ipython3
- #     version: 3.5.2
- # ---
-
- # # Least squares
- #
- # Least squares is a mathematical procedure for finding the best-fitting curve to a given set of points by minimizing the sum of the squares of the offsets ("the residuals") of the points from the curve. The sum of squares is used instead of the absolute values of the offsets because it makes the loss a continuously differentiable quantity. However, because the offsets are squared, outlying points can have a disproportionate effect on the fit, a property which may or may not be desirable depending on the problem at hand.
- #
-
- # ### Show the data
- #
-
- # +
- # %matplotlib inline
-
- import matplotlib.pyplot as plt
- import numpy as np
- import sklearn
- from sklearn import datasets
-
- # load data
- d = datasets.load_diabetes()
-
- X = d.data[:, 2]   # use a single feature (column 2, the BMI feature)
- Y = d.target
-
- # draw original data
- plt.scatter(X, Y)
- plt.xlabel("X")
- plt.ylabel("Y")
- plt.show()
-
- # -
-
- # ### Theory
- # Given $N$ observed data points:
- # $$
- # \mathbf{X} = \{x_1, x_2, ..., x_N \} \\
- # \mathbf{Y} = \{y_1, y_2, ..., y_N \}
- # $$
- #
- # We want to find a model that can predict the data. The simplest choice is a linear model, which has the form
- # $$
- # y = ax + b
- # $$
- #
- # The goal is to find the parameters $a, b$ that best fit the model to the observed data.
- #
- # We use the sum of squared differences (the loss function) between the model's predictions and the observed data:
- # $$
- # L = \sum_{i=1}^{N} (y_i - a x_i - b)^2
- # $$
- #
- # To minimize the loss function, we take its partial derivatives with respect to the parameters:
- # $$
- # \frac{\partial L}{\partial a} = -2 \sum_{i=1}^{N} (y_i - a x_i - b) x_i \\
- # \frac{\partial L}{\partial b} = -2 \sum_{i=1}^{N} (y_i - a x_i - b)
- # $$
- # At the minimum the partial derivatives are zero, which gives:
- # $$
- # -2 \sum_{i=1}^{N} (y_i - a x_i - b) x_i = 0 \\
- # -2 \sum_{i=1}^{N} (y_i - a x_i - b) = 0 \\
- # $$
- #
- # Rearranging the terms gives the normal equations:
- # $$
- # a \sum x_i^2 + b \sum x_i = \sum y_i x_i \\
- # a \sum x_i + b N = \sum y_i
- # $$
- # Solving this $2 \times 2$ linear system yields the model parameters.
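- #
- # For reference, Cramer's rule gives the closed-form solution:
- # $$
- # a = \frac{N \sum x_i y_i - \sum x_i \sum y_i}{N \sum x_i^2 - \left( \sum x_i \right)^2}, \qquad
- # b = \frac{\sum y_i - a \sum x_i}{N}
- # $$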
-
- # ### Program
-
- # +
- N = X.shape[0]
-
- S_X2 = np.sum(X*X)
- S_X = np.sum(X)
- S_XY = np.sum(X*Y)
- S_Y = np.sum(Y)
-
- A1 = np.array([[S_X2, S_X],
-                [S_X, N]])
- B1 = np.array([S_XY, S_Y])
-
- coeff = np.linalg.solve(A1, B1)   # solve the normal equations directly
-
- print('a = %f, b = %f' % (coeff[0], coeff[1]))
-
- x_min = np.min(X)
- x_max = np.max(X)
- y_min = coeff[0] * x_min + coeff[1]
- y_max = coeff[0] * x_max + coeff[1]
-
- plt.scatter(X, Y, label='original data')
- plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
- plt.legend()
- plt.show()
- # -
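-
- # As a sanity check, the same coefficients can be obtained with NumPy's least-squares solver:
-
- # +
- # design matrix with a column of ones for the intercept; solution is [a, b]
- M = np.vstack([X, np.ones(N)]).T
- print(np.linalg.lstsq(M, Y, rcond=None)[0])
- # -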
-
- # ## How to use an iterative method to estimate the parameters?
- #
-
- # +
- n_epoch = 3000 # number of epochs
- a, b = 1, 1 # initial parameters
- epsilon = 0.001 # learning rate
-
- for i in range(n_epoch):
-     for j in range(N):
-         a = a + epsilon*2*(Y[j] - a*X[j] - b)*X[j]
-         b = b + epsilon*2*(Y[j] - a*X[j] - b)
-
-     # loss over the whole data set at the end of this epoch
-     L = 0
-     for j in range(N):
-         L = L + (Y[j]-a*X[j]-b)**2
-     print("epoch %4d: loss = %f, a = %f, b = %f" % (i, L, a, b))
-
- x_min = np.min(X)
- x_max = np.max(X)
- y_min = a * x_min + b
- y_max = a * x_max + b
-
- plt.scatter(X, Y, label='original data')
- plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
- plt.legend()
- plt.show()
- # -
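-
- # Note that this is per-sample (stochastic) gradient descent; in practice the sample order is usually shuffled each epoch. A minimal sketch of one shuffled epoch, reusing the `a`, `b`, and `epsilon` defined above:
-
- # +
- # one epoch of per-sample updates, visiting the samples in random order
- for j in np.random.permutation(N):
-     a = a + epsilon*2*(Y[j] - a*X[j] - b)*X[j]
-     b = b + epsilon*2*(Y[j] - a*X[j] - b)
- # -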
-
- # ## How to visualize the iterative process?
-
- # +
- # %matplotlib nbagg
-
- import matplotlib.pyplot as plt
- import matplotlib.animation as animation
-
- n_epoch = 3000 # number of epochs
- a, b = 1, 1 # initial parameters
- epsilon = 0.001 # learning rate
-
- fig = plt.figure()
- imgs = []
-
- for i in range(n_epoch):
-     for j in range(N):
-         a = a + epsilon*2*(Y[j] - a*X[j] - b)*X[j]
-         b = b + epsilon*2*(Y[j] - a*X[j] - b)
-
-     L = 0
-     for j in range(N):
-         L = L + (Y[j]-a*X[j]-b)**2
-     #print("epoch %4d: loss = %f, a = %f, b = %f" % (i, L, a, b))
-
-     # record a frame every 50 epochs
-     if i % 50 == 0:
-         x_min = np.min(X)
-         x_max = np.max(X)
-         y_min = a * x_min + b
-         y_max = a * x_max + b
-
-         # each frame is a list of artists: the scatter plus the fitted line
-         scat = plt.scatter(X, Y, color='C0', label='original data')
-         line, = plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
-         imgs.append([scat, line])
-
- ani = animation.ArtistAnimation(fig, imgs)
- plt.show()
- # -
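-
- # The animation can also be written to a file; a minimal sketch using Matplotlib's Pillow writer (assuming the `pillow` package is installed):
-
- # +
- # uncomment to save the animation as a GIF
- # ani.save('least_squares.gif', writer='pillow')
- # -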
-
- # ## How to use the batch update method?
- #
- # In the per-sample update above, every single observation moves the parameters, so an outlier can cause a large jump. The batch method instead accumulates the gradient over all $N$ samples before updating, which averages out the influence of any single point. A minimal sketch of the batch update, with an illustrative learning rate and epoch count:
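-
- # +
- # batch (full-gradient) update: accumulate the gradient over all samples,
- # then take one step per epoch
- a, b = 1.0, 1.0
- epsilon = 0.001
- n_epoch = 3000
-
- for i in range(n_epoch):
-     err = Y - a*X - b                   # residuals of all N samples at once
-     a = a + epsilon*2*np.sum(err*X)     # full-batch gradient step for a
-     b = b + epsilon*2*np.sum(err)       # full-batch gradient step for b
-
- print("a = %f, b = %f" % (a, b))
- # -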
-
- # ## How to fit a polynomial function?
- #
- # If we observe a missile at several points in time, how do we estimate its trajectory? According to physics, the trajectory can be modeled as:
- # $$
- # y = at^2 + bt + c
- # $$
- # Then we need at least three data points to compute the parameters $a, b, c$.
- #
- #
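- # With observations $(t_i, y_i)$ the parameters satisfy the linear system
- # $$
- # \begin{pmatrix} t_1^2 & t_1 & 1 \\ t_2^2 & t_2 & 1 \\ t_3^2 & t_3 & 1 \end{pmatrix}
- # \begin{pmatrix} a \\ b \\ c \end{pmatrix} =
- # \begin{pmatrix} y_1 \\ y_2 \\ y_3 \end{pmatrix}
- # $$
- # With more than three observations, the same least-squares approach applies.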
-
- # +
- t = np.array([2, 4, 6, 8])
- #t = np.linspace(0, 10)
-
- pa = -20
- pb = 90
- pc = 800
-
- y = pa*t**2 + pb*t + pc
-
-
- plt.scatter(t, y)
- plt.show()
- # -
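-
- # As a quick check, NumPy's polynomial least-squares fit recovers the coefficients directly:
-
- # +
- # degree-2 fit returns the coefficients highest degree first: [a, b, c]
- print(np.polyfit(t, y, 2))
- # -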
-
- # ## How to use sklearn to solve the linear problem?
- #
- #
-
- # +
- from sklearn import linear_model
-
- # load data
- d = datasets.load_diabetes()
-
- X = d.data[:, np.newaxis, 2]
- Y = d.target
-
- # create regression model
- regr = linear_model.LinearRegression()
- regr.fit(X, Y)
-
- a, b = regr.coef_[0], regr.intercept_
- print("a = %f, b = %f" % (a, b))
-
- x_min = np.min(X)
- x_max = np.max(X)
- y_min = a * x_min + b
- y_max = a * x_max + b
-
- plt.scatter(X, Y)
- plt.plot([x_min, x_max], [y_min, y_max], 'r')
- plt.show()
- # -
-
- # ## How to use sklearn to fit a polynomial function?
-
- # +
- # Fitting polynomial functions
-
- from sklearn.preprocessing import PolynomialFeatures
- from sklearn.linear_model import LinearRegression
- from sklearn.pipeline import Pipeline
-
- t = np.array([2, 4, 6, 8])
-
- pa = -20
- pb = 90
- pc = 800
-
- y = pa*t**2 + pb*t + pc
-
- model = Pipeline([('poly', PolynomialFeatures(degree=2)),
-                   ('linear', LinearRegression(fit_intercept=False))])
- model = model.fit(t[:, np.newaxis], y)
-
- # coefficients are ordered lowest degree first: [c, b, a]
- print(model.named_steps['linear'].coef_)
-
- # -
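-
- # The fitted pipeline can then be evaluated at new time points:
-
- # +
- # predict the trajectory at times not seen during fitting
- t_new = np.array([3.0, 5.0, 7.0])
- print(model.predict(t_new[:, np.newaxis]))
- # -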
-
- # ## How to estimate unseen values with the model?
- #
-
- # +
- from sklearn import metrics
-
- # load data
- d = datasets.load_diabetes()
-
- N = d.target.shape[0]
- N_train = int(N*0.9)
- N_test = N - N_train
-
- X = d.data[:N_train, np.newaxis, 2]
- Y = d.target[:N_train]
-
- X_test = d.data[N_train:, np.newaxis, 2]
- Y_test = d.target[N_train:]
-
- # create regression model
- regr = linear_model.LinearRegression()
- regr.fit(X, Y)
-
- Y_est = regr.predict(X_test)
- print("Y_est = ", Y_est)
- print("Y_test = ", Y_test)
- # root-mean-square error computed by hand and via sklearn (they should match)
- err = (Y_est - Y_test)**2
- err2 = metrics.mean_squared_error(Y_test, Y_est)
- score = regr.score(X_test, Y_test)
- print("rmse = %f (%f), score = %f" % (np.sqrt(np.sum(err)/N_test), np.sqrt(err2), score))
-
-
- # plot data
- a, b = regr.coef_[0], regr.intercept_
- print("a = %f, b = %f" % (a, b))
-
- x_min = np.min(X)
- x_max = np.max(X)
- y_min = a * x_min + b
- y_max = a * x_max + b
-
-
- plt.scatter(X, Y, label='train data')
- plt.scatter(X_test, Y_test, label='test data')
- plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
- plt.legend()
- plt.show()
- # -
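-
- # The split above simply holds out the last 10% of the rows; a minimal alternative sketch using sklearn's `train_test_split`, which shuffles the data before splitting:
-
- # +
- from sklearn.model_selection import train_test_split
-
- # randomly hold out 10% of the samples for testing
- X_all = d.data[:, np.newaxis, 2]
- X_tr, X_te, Y_tr, Y_te = train_test_split(X_all, d.target, test_size=0.1, random_state=0)
-
- regr = linear_model.LinearRegression().fit(X_tr, Y_tr)
- print("score = %f" % regr.score(X_te, Y_te))
- # -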
-