@@ -17,11 +17,7 @@
# version: 3.5.2
# ---
# # Linear regression
#
#
# ## Least squares
#
# A mathematical procedure for finding the best-fitting curve to a given set of points by minimizing the sum of the squares of the offsets ("the residuals") of the points from the curve. The sum of the squares of the offsets is used instead of the offset absolute values because this allows the residuals to be treated as a continuous differentiable quantity. However, because squares of the offsets are used, outlying points can have a disproportionate effect on the fit, a property which may or may not be desirable depending on the problem at hand.
#
@@ -45,7 +41,10 @@ Y = d.target
# draw original data
# Visualize the raw diabetes samples (single feature vs. target)
# before any model is fitted.
ax = plt.gca()
ax.scatter(X, Y)
ax.set_xlabel("X")
ax.set_ylabel("Y")
plt.show()
# -
# ### Theory
@@ -108,6 +107,135 @@ x_max = np.max(X)
# Evaluate the fitted line y = coeff[0]*x + coeff[1] at the two data
# extremes; two points are enough to draw a straight line.
line_x = [x_min, x_max]
line_y = [coeff[0] * x + coeff[1] for x in line_x]
y_min, y_max = line_y  # keep the original names bound for later cells
# Overlay the model line on the original scatter.
plt.scatter(X, Y, label='original data')
plt.plot(line_x, line_y, 'r', label='model')
plt.legend()
plt.show()
# -
# ## How to fit polynomial function?
#
# If we observe a missile at some time, then how to estimate the trajectory? According to physical theory, the trajectory can be formulated as:
# $$
# y = at^2 + bt + c
# $$
# Then we need at least three data points to compute the parameters $a, b, c$.
#
#
# +
# Sample the quadratic trajectory y = pa*t^2 + pb*t + pc at four times.
t = np.array([2, 4, 6, 8])
#t = np.linspace(0, 10)
pa, pb, pc = -20, 90, 800
# np.polyval evaluates the polynomial with coefficients in decreasing order.
y = np.polyval([pa, pb, pc], t)
plt.scatter(t, y)
plt.show()
# -
# ## How to use sklearn to solve linear problem?
#
#
# +
from sklearn import linear_model
# load data: keep a single feature (column 2) so the fit is 1-D
d = datasets.load_diabetes()
X = d.data[:, np.newaxis, 2]
Y = d.target
# create regression model and fit y = a*x + b by ordinary least squares
regr = linear_model.LinearRegression()
regr.fit(X, Y)
# coef_ is a length-1 array here; take the scalar element so that "%f"
# formatting works on all NumPy versions (implicit conversion of a
# size-1 array to a Python float is deprecated).
a, b = regr.coef_[0], regr.intercept_
print("a = %f, b = %f" % (a, b))
# draw the fitted line across the observed x range
x_min = np.min(X)
x_max = np.max(X)
y_min = a * x_min + b
y_max = a * x_max + b
plt.scatter(X, Y)
plt.plot([x_min, x_max], [y_min, y_max], 'r')
plt.show()
# -
# ## How to use sklearn to fit polynomial function?
# +
# Fitting polynomial functions
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# Regenerate the quadratic samples y = -20*t^2 + 90*t + 800.
t = np.array([2, 4, 6, 8])
pa, pb, pc = -20, 90, 800
y = pa*t**2 + pb*t + pc
# Degree-2 feature expansion followed by a bias-free linear fit: the
# intercept is absorbed by the constant polynomial feature, so the
# learned coefficients are [c, b, a].
steps = [('poly', PolynomialFeatures(degree=2)),
         ('linear', LinearRegression(fit_intercept=False))]
model = Pipeline(steps)
model = model.fit(t[:, np.newaxis], y)
model.named_steps['linear'].coef_
# -
# ## How to estimate some missing value by the model?
#
# +
# load data: first 90% of samples for training, the last 10% held out
d = datasets.load_diabetes()
N = d.target.shape[0]
N_train = int(N*0.9)
N_test = N - N_train
X = d.data[:N_train, np.newaxis, 2]
Y = d.target[:N_train]
X_test = d.data[N_train:, np.newaxis, 2]
Y_test = d.target[N_train:]
# create regression model and fit on the training split only
regr = linear_model.LinearRegression()
regr.fit(X, Y)
# estimate the held-out targets from the held-out inputs
Y_est = regr.predict(X_test)
print("Y_est = ", Y_est)
print("Y_test = ", Y_test)
err = (Y_est - Y_test)**2
score = regr.score(X_test, Y_test)
# report RMSE = sqrt(mean squared error); the original printed
# sqrt(sum(err))/N_test, which is not a standard error measure.
print("err = %f, score = %f" % (np.sqrt(np.mean(err)), score))
# plot data; coef_ is a length-1 array, take the scalar so "%f"
# formatting works on all NumPy versions
a, b = regr.coef_[0], regr.intercept_
print("a = %f, b = %f" % (a, b))
x_min = np.min(X)
x_max = np.max(X)
y_min = a * x_min + b
y_max = a * x_max + b
plt.scatter(X, Y, label='train data')
plt.scatter(X_test, Y_test, label='test data')
plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
plt.legend()
plt.show()
# -