Add perceptron

fetches/feikei/master
Shuhui Bu · 6 years ago
commit bbd30e5078
14 changed files with 1788 additions and 55 deletions

  1. 1_kmeans/ClusteringAlgorithms.ipynb (+224, -0)
  2. 1_kmeans/ClusteringAlgorithms.py (+191, -0)
  3. 1_logistic_regression/Least_squares.ipynb (+844, -46)
  4. 1_logistic_regression/PCA_and_Logistic_Regression.ipynb (+45, -7)
  5. 1_logistic_regression/PCA_and_Logistic_Regression.py (+26, -0)
  6. References.md (+4, -2)
  7. nn/Perceptron.ipynb (+252, -0)
  8. nn/Perceptron.py (+202, -0)
  9. nn/images/L_b.png (BIN)
  10. nn/images/L_w.png (BIN)
  11. nn/images/neuron.png (BIN)
  12. nn/images/perceptron_2.PNG (BIN)
  13. nn/images/perceptron_geometry_def.png (BIN)
  14. nn/images/sign.png (BIN)

1_kmeans/ClusteringAlgorithms.ipynb (+224, -0)
File diff suppressed because it is too large


1_kmeans/ClusteringAlgorithms.py (+191, -0)

@@ -0,0 +1,191 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---

# # Comparing different clustering algorithms on toy datasets
#
# This example shows characteristics of different clustering algorithms on datasets that are “interesting” but still in 2D. With the exception of the last dataset, the parameters of each of these dataset-algorithm pairs have been tuned to produce good clustering results. Some algorithms are more sensitive to parameter values than others.
#
# The last dataset is an example of a ‘null’ situation for clustering: the data is homogeneous, and there is no good clustering. For this example, the null dataset uses the same parameters as the dataset in the row above it, which represents a mismatch between the parameter values and the data structure.
#
# While these examples give some intuition about the algorithms, this intuition might not apply to very high-dimensional data.

# +
%matplotlib inline

import time
import warnings

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

np.random.seed(0)

# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

# Anisotropicly distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)


# ============
# Set up cluster parameters
# ============
plt.figure(figsize=(9 * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

plot_num = 1

default_base = {'quantile': .3,
                'eps': .3,
                'damping': .9,
                'preference': -200,
                'n_neighbors': 10,
                'n_clusters': 3}

datasets = [
    (noisy_circles, {'damping': .77, 'preference': -240,
                     'quantile': .2, 'n_clusters': 2}),
    (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
    (varied, {'eps': .18, 'n_neighbors': 2}),
    (aniso, {'eps': .15, 'n_neighbors': 2}),
    (blobs, {}),
    (no_structure, {})]

for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X, y = dataset

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        X, n_neighbors=params['n_neighbors'], include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(
        n_clusters=params['n_clusters'], linkage='ward',
        connectivity=connectivity)
    spectral = cluster.SpectralClustering(
        n_clusters=params['n_clusters'], eigen_solver='arpack',
        affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock",
        n_clusters=params['n_clusters'], connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(
        n_components=params['n_clusters'], covariance_type='full')

    clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('AffinityPropagation', affinity_propagation),
        ('MeanShift', ms),
        ('SpectralClustering', spectral),
        ('Ward', ward),
        ('AgglomerativeClustering', average_linkage),
        ('DBSCAN', dbscan),
        ('Birch', birch),
        ('GaussianMixture', gmm)
    )

    for name, algorithm in clustering_algorithms:
        t0 = time.time()

        # catch warnings related to kneighbors_graph
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the " +
                "connectivity matrix is [0-9]{1,2}" +
                " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning)
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding" +
                " may not work as expected.",
                category=UserWarning)
            algorithm.fit(X)

        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)

        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                             '#f781bf', '#a65628', '#984ea3',
                                             '#999999', '#e41a1c', '#dede00']),
                                      int(max(y_pred) + 1))))
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1

plt.show()
# -

# ## Reference
# * [Comparing different clustering algorithms on toy datasets](http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html)

1_logistic_regression/Least_squares.ipynb (+844, -46)
File diff suppressed because it is too large


1_logistic_regression/PCA_and_Logistic_Regression.ipynb (+45, -7)

@@ -18,7 +18,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 2,
 "metadata": {},
 "outputs": [
 {
@@ -82,7 +82,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 3,
 "metadata": {},
 "outputs": [
 {
@@ -136,7 +136,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 4,
 "metadata": {},
 "outputs": [
 {
@@ -175,7 +175,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 19,
+"execution_count": 13,
 "metadata": {},
 "outputs": [
 {
@@ -208,11 +208,50 @@
 ]
 },
 {
+"cell_type": "code",
+"execution_count": 16,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"accuracy train = 0.148921, accuracy_test = 0.102778\n"
+]
+}
+],
+"source": [
+"# do kernel PCA\n",
+"# Ref: http://scikit-learn.org/stable/auto_examples/decomposition/plot_kernel_pca.html\n",
+"\n",
+"from sklearn.decomposition import PCA, KernelPCA\n",
+"\n",
+"kpca = KernelPCA(n_components=45, kernel=\"rbf\", fit_inverse_transform=True, gamma=10)\n",
+"kpca.fit(x_train)\n",
+"\n",
+"x_train_pca = kpca.transform(x_train)\n",
+"x_test_pca = kpca.transform(x_test)\n",
+"\n",
+"# do logistic regression\n",
+"lr=LogisticRegression()\n",
+"lr.fit(x_train_pca,y_train)\n",
+"\n",
+"pred_train = lr.predict(x_train_pca)\n",
+"pred_test = lr.predict(x_test_pca)\n",
+"\n",
+"# calculate train/test accuracy\n",
+"acc_train = accuracy_score(y_train, pred_train)\n",
+"acc_test = accuracy_score(y_test, pred_test)\n",
+"print(\"accuracy train = %f, accuracy_test = %f\" % (acc_train, acc_test))\n"
+]
+},
+{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "## References\n",
-"* [Pipelining: chaining a PCA and a logistic regression](http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html)"
+"* [Pipelining: chaining a PCA and a logistic regression](http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html)\n",
+"* [PCA进行无监督降维](https://ljalphabeta.gitbooks.io/python-/content/pca.html)"
 ]
 }
 ],
@@ -233,8 +272,7 @@
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
 "version": "3.5.2"
-},
-"main_language": "python"
-}
+}
 },
 "nbformat": 4,
 "nbformat_minor": 2


1_logistic_regression/PCA_and_Logistic_Regression.py (+26, -0)

@@ -135,7 +135,33 @@ acc_train = accuracy_score(y_train, pred_train)
 acc_test = accuracy_score(y_test, pred_test)
 print("accuracy train = %f, accuracy_test = %f" % (acc_train, acc_test))
 
 
+# +
+# do kernel PCA
+# Ref: http://scikit-learn.org/stable/auto_examples/decomposition/plot_kernel_pca.html
+
+from sklearn.decomposition import PCA, KernelPCA
+
+kpca = KernelPCA(n_components=45, kernel="rbf", fit_inverse_transform=True, gamma=10)
+kpca.fit(x_train)
+
+x_train_pca = kpca.transform(x_train)
+x_test_pca = kpca.transform(x_test)
+
+# do logistic regression
+lr=LogisticRegression()
+lr.fit(x_train_pca,y_train)
+
+pred_train = lr.predict(x_train_pca)
+pred_test = lr.predict(x_test_pca)
+
+# calculate train/test accuracy
+acc_train = accuracy_score(y_train, pred_train)
+acc_test = accuracy_score(y_test, pred_test)
+print("accuracy train = %f, accuracy_test = %f" % (acc_train, acc_test))
+
 # -
 
 # ## References
 # * [Pipelining: chaining a PCA and a logistic regression](http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html)
+# * [PCA进行无监督降维](https://ljalphabeta.gitbooks.io/python-/content/pca.html)

References.md (+4, -2)

@@ -1,5 +1,6 @@
 # References
-More learning materials: you can look through the list below to find ones that suit you.
+You can look through the list below to find learning materials that suit you. Although quite a few are listed, it is best to pick one and read and practice it in depth; once you have practiced to a certain level, look at the other materials to make up for the gaps any single resource may have.
+
 
 
 ## Python & IPython
 * [Python教程](https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000)
@@ -8,7 +9,6 @@
 * [IPython tutorials](https://nbviewer.jupyter.org/github/ipython/ipython/blob/master/examples/IPython%20Kernel/Index.ipynb)
 * [Examples from the IPython mini-book](https://github.com/rossant/ipython-minibook)
 * [Code of the IPython Cookbook, Second Edition (2018)](https://github.com/ipython-books/cookbook-2nd-code)
-* [scientific-python-lectures](http://nbviewer.jupyter.org/github/jrjohansson/scientific-python-lectures/tree/master/)
 
 
 ## Libs
@@ -24,6 +24,8 @@
 * [ipython-notebooks: A collection of IPython notebooks covering various topics](https://github.com/jdwittenauer/ipython-notebooks)
 * [Learn Data Science](http://learnds.com/)
 * [AM207 2016](https://github.com/AM207/2016/tree/master)
+* [Python机器学习](https://ljalphabeta.gitbooks.io/python-/content/)
+* [scientific-python-lectures](http://nbviewer.jupyter.org/github/jrjohansson/scientific-python-lectures/tree/master/)
 
 
 ## Awesome series


nn/Perceptron.ipynb (+252, -0)

@@ -0,0 +1,252 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 感知机\n",
"\n",
"感知机(perceptron)是二分类的线性分类模型,输入为实例的特征向量,输出为实例的类别(取+1和-1)。感知机对应于输入空间中将实例划分为两类的分离超平面。感知机旨在求出该超平面,为求得超平面导入了基于误分类的损失函数,利用梯度下降法 对损失函数进行最优化(最优化)。感知机的学习算法具有简单而易于实现的优点,分为原始形式和对偶形式。感知机预测是用学习得到的感知机模型对新的实例进行预测的,因此属于判别模型。感知机由Rosenblatt于1957年提出的,是神经网络和支持向量机的基础。\n",
"\n",
"模仿的是生物神经系统内的神经元,它能够接受来自多个源的信号输入,然后将信号转化为便于传播的信号在进行输出(在生物体内表现为电信号)。\n",
"\n",
"![neuron](images/neuron.png)\n",
"\n",
"* dendrites - 树突\n",
"* nucleus - 细胞核\n",
"* axon - 轴突\n",
"\n",
"心理学家Rosenblatt构想了感知机,它作为简化的数学模型解释大脑神经元如何工作:它取一组二进制输入值(附近的神经元),将每个输入值乘以一个连续值权重(每个附近神经元的突触强度),并设立一个阈值,如果这些加权输入值的和超过这个阈值,就输出1,否则输出0(同理于神经元是否放电)。对于感知机,绝大多数输入值不是一些数据,就是别的感知机的输出值。\n",
"\n",
"麦卡洛克-皮兹模型缺乏一个对AI而言至关重要的学习机制。这就是感知机更出色的地方所在——罗森布拉特受到唐纳德·赫布(Donald Hebb) 基础性工作的启发,想出一个让这种人工神经元学习的办法。赫布提出了一个出人意料并影响深远的想法,称知识和学习发生在大脑主要是通过神经元间突触的形成与变化,简要表述为赫布法则:\n",
"\n",
">当细胞A的轴突足以接近以激发细胞B,并反复持续地对细胞B放电,一些生长过程或代谢变化将发生在某一个或这两个细胞内,以致A作为对B放电的细胞中的一个,效率增加。\n",
"\n",
"\n",
"感知机并没有完全遵循这个想法,**但通过调输入值的权重,可以有一个非常简单直观的学习方案:给定一个有输入输出实例的训练集,感知机应该「学习」一个函数:对每个例子,若感知机的输出值比实例低太多,则增加它的权重,否则若设比实例高太多,则减少它的权重。**\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 感知机模型\n",
"\n",
"假设输入空间(特征向量)为X⊆Rn,输出空间为Y={-1, +1}。输入x∈X表示实例的特征向量,对应于输入空间的点;输出y∈Y表示示例的类别。由输入空间到输出空间的函数为\n",
"\n",
"$$\n",
"f(x) = sign(w x + b)\n",
"$$\n",
"\n",
"称为感知机。其中,参数w叫做权值向量,b称为偏置。w·x表示w和x的内积。sign为符号函数,即\n",
"![sign_function](images/sign.png)\n",
"\n",
"### 几何解释 \n",
"感知机模型是线性分类模型,感知机模型的假设空间是定义在特征空间中的所有线性分类模型,即函数集合{f|f(x)=w·x+b}。线性方程 w·x+b=0对应于特征空间Rn中的一个超平面S,其中w是超平面的法向量,b是超平面的截踞。这个超平面把特征空间划分为两部分。位于两侧的点分别为正负两类。超平面S称为分离超平面,如下图:\n",
"![perceptron_geometry_def](images/perceptron_geometry_def.png)\n",
"\n",
"### 生物学类比\n",
"![perceptron_2](images/perceptron_2.PNG)\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 感知机学习策略\n",
"\n",
"假设训练数据集是线性可分的,感知机学习的目标是求得一个能够将训练数据的正负实例点完全分开的分离超平面,即最终求得参数w、b。这需要一个学习策略,即定义(经验)损失函数并将损失函数最小化。\n",
"\n",
"损失函数的一个自然的选择是误分类的点的总数。但是这样得到的损失函数不是参数w、b的连续可导函数,不宜优化。损失函数的另一个选择是误分类点到分里面的距离之和。\n",
"\n",
"首先,对于任意一点xo到超平面的距离为\n",
"$$\n",
"\\frac{1}{||w||} | w \\cdot xo + b |\n",
"$$\n",
"\n",
"其次,对于误分类点(xi,yi)来说 -yi(w·xi+b)>0\n",
"\n",
"这样,假设超平面S的总的误分类点集合为M,那么所有误分类点到S的距离之和为\n",
"$$\n",
"-\\frac{1}{||w||} \\sum_{x_i \\in M} y_i (w \\cdot x_i + b)\n",
"$$\n",
"不考虑1/||w||,就得到了感知机学习的损失函数。\n",
"\n",
"### 经验风险函数\n",
"\n",
"给定数据集T={(x1,y1),(x2,y2)...(xN,yN)}(其中xi∈X=Rn,yi∈Y={-1, +1},i=1,2...N),感知机sign(w·x+b)学习的损失函数定义为\n",
"$$\n",
"L(w, b) = - \\sum_{x_i \\in M} y_i (w \\cdot x_i + b)\n",
"$$\n",
"其中M为误分类点的集合,这个损失函数就是感知机学习的[经验风险函数](https://blog.csdn.net/zhzhx1204/article/details/70163099)。\n",
"\n",
"显然,损失函数L(w,b)是非负的。如果没有误分类点,那么L(w,b)为0,误分类点数越少,L(w,b)值越小。一个特定的损失函数:在误分类时是参数w,b的线性函数,在正确分类时,是0.因此,给定训练数据集T,损失函数L(w,b)是w,b的连续可导函数。\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. 感知机学习算法\n",
"\n",
"\n",
"最优化问题:给定数据集T={(x1,y1),(x2,y2)...(xN,yN)}(其中xi∈X=Rn,yi∈Y={-1, +1},i=1,2...N),求参数w,b,使其成为损失函数的解(M为误分类的集合):\n",
"\n",
"$$\n",
"min_{w,b} L(w, b) = - \\sum_{x_i \\in M} y_i (w \\cdot x_i + b)\n",
"$$\n",
"\n",
"感知机学习是误分类驱动的,具体采用[随机梯度下降法](https://blog.csdn.net/zbc1090549839/article/details/38149561)。首先,任意选定$w_0$、$b_0$,然后用梯度下降法不断极小化目标函数,极小化的过程不知一次性的把M中的所有误分类点梯度下降,而是一次随机选取一个误分类点使其梯度下降。\n",
"\n",
"假设误分类集合M是固定的,那么损失函数L(w,b)的梯度为\n",
"$$\n",
"\\triangledown_w L(w, b) = - \\sum_{x_i \\in M} y_i x_i \\\\\n",
"\\triangledown_b L(w, b) = - \\sum_{x_i \\in M} y_i \\\\\n",
"$$\n",
"\n",
"随机选取一个误分类点$(x_i,y_i)$,对$w,b$进行更新:\n",
"$$\n",
"w = w + \\eta y_i x_i \\\\\n",
"b = b + \\eta y_i\n",
"$$\n",
"\n",
"式中$\\eta$(0 ≤ $ \\eta $ ≤ 1)是步长,在统计学是中成为学习速率。步长越大,梯度下降的速度越快,更能接近极小点。如果步长过大,有可能导致跨过极小点,导致函数发散;如果步长过小,有可能会耗很长时间才能达到极小点。\n",
"\n",
"直观解释:当一个实例点被误分类时,调整w,b,使分离超平面向该误分类点的一侧移动,以减少该误分类点与超平面的距离,直至超越该点被正确分类。\n",
"\n",
"\n",
"\n",
"算法\n",
"```\n",
"输入:T={(x1,y1),(x2,y2)...(xN,yN)}(其中xi∈X=Rn,yi∈Y={-1, +1},i=1,2...N,学习速率为η)\n",
"输出:w, b;感知机模型f(x)=sign(w·x+b)\n",
"(1) 初始化w0,b0\n",
"(2) 在训练数据集中选取(xi, yi)\n",
"(3) 如果yi(w xi+b)≤0\n",
" w = w + ηyixi\n",
" b = b + ηyi\n",
"(4) 转至(2)\n",
"```\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Program\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"update weight and bias: 1.5 4.0 0.5\n",
"update weight and bias: -0.5 3.5 0.0\n",
"update weight and bias: -2.0 3.0 -0.5\n",
"w = [-2.0, 3.0]\n",
"b = -0.5\n",
"[ 1 1 1 1 -1 -1 -1 -1]\n",
"[1, 1, 1, 1, -1, -1, -1, -1]\n"
]
}
],
"source": [
"import random\n",
"import numpy as np\n",
"\n",
"# 符号函数\n",
"def sign(v):\n",
" if v > 0: return 1\n",
" else: return -1\n",
" \n",
"def perceptron_train(train_data, eta=0.5, n_iter=100):\n",
" weight = [0, 0] # 权重\n",
" bias = 0 # 偏置量\n",
" learning_rate = eta # 学习速率\n",
"\n",
" train_num = n_iter # 迭代次数\n",
"\n",
" for i in range(train_num):\n",
" train = random.choice(train_data)\n",
" x1, x2, y = train\n",
" predict = sign(weight[0] * x1 + weight[1] * x2 + bias) # 输出\n",
" #print(\"train data: x: (%2d, %2d) y: %2d ==> predict: %2d\" % (x1, x2, y, predict))\n",
" \n",
" if y * predict <= 0: # 判断误分类点\n",
" weight[0] = weight[0] + learning_rate * y * x1 # 更新权重\n",
" weight[1] = weight[1] + learning_rate * y * x2\n",
" bias = bias + learning_rate * y # 更新偏置量\n",
" print(\"update weight and bias: \", weight[0], weight[1], bias)\n",
"\n",
" #print(\"stop training: \", weight[0], weight[1], bias)\n",
"\n",
" return weight, bias\n",
"\n",
"def perceptron_pred(data, w, b):\n",
" y_pred = []\n",
" for d in data:\n",
" x1, x2, y = d\n",
" yi = sign(w[0]*x1 + w[1]*x2 + b)\n",
" y_pred.append(yi)\n",
" \n",
" return y_pred\n",
"\n",
"# set training data\n",
"train_data = np.array([[1, 3, 1], [2, 5, 1], [3, 8, 1], [2, 6, 1], \n",
" [3, 1, -1], [4, 1, -1], [6, 2, -1], [7, 3, -1]])\n",
"\n",
"# do training\n",
"w, b = perceptron_train(train_data)\n",
"print(\"w = \", w)\n",
"print(\"b = \", b)\n",
"\n",
"# predict \n",
"y_pred = perceptron_pred(train_data, w, b)\n",
"\n",
"print(train_data[:, 2])\n",
"print(y_pred)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reference\n",
"* [感知机(Python实现)](http://www.cnblogs.com/kaituorensheng/p/3561091.html)\n",
"* [Programming a Perceptron in Python](https://blog.dbrgn.ch/2013/3/26/perceptrons-in-python/)\n",
"* [损失函数、风险函数、经验风险最小化、结构风险最小化](https://blog.csdn.net/zhzhx1204/article/details/70163099)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"main_language": "python"
},
"nbformat": 4,
"nbformat_minor": 2
}

nn/Perceptron.py (+202, -0)

@@ -0,0 +1,202 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---

# ## Perceptron
#
# The perceptron is a linear model for binary classification: its input is the feature vector of an instance and its output is the instance's class (+1 or -1). The perceptron corresponds to a separating hyperplane that divides the input space into two classes. Learning aims to find this hyperplane; to do so, a loss function based on misclassification is introduced and minimized by gradient descent. The perceptron learning algorithm is simple and easy to implement, and comes in a primal form and a dual form. Prediction applies the learned model to new instances, so the perceptron is a discriminative model. It was proposed by Rosenblatt in 1957 and is the foundation of neural networks and support vector machines.
#
# It imitates the neuron of a biological nervous system: a neuron receives signals from multiple sources and converts them into a signal suitable for transmission before passing it on (in living organisms, an electrical signal).
#
# ![neuron](images/neuron.png)
#
# * dendrites
# * nucleus
# * axon
#
# The psychologist Rosenblatt conceived the perceptron as a simplified mathematical model of how neurons in the brain work: it takes a set of binary inputs (from nearby neurons), multiplies each input by a continuous-valued weight (the synaptic strength of each nearby neuron), and sets a threshold; if the weighted sum of the inputs exceeds the threshold it outputs 1, otherwise 0 (analogous to whether a neuron fires). For a perceptron, most inputs are either data or the outputs of other perceptrons.
#
# The McCulloch-Pitts model lacked a learning mechanism, which is crucial for AI. This is where the perceptron does better: inspired by the foundational work of Donald Hebb, Rosenblatt devised a way for such artificial neurons to learn. Hebb had proposed a surprising and far-reaching idea, that knowledge and learning occur in the brain mainly through the formation and modification of synapses between neurons, summarized as Hebb's rule:
#
# > When the axon of cell A is close enough to excite cell B and repeatedly and persistently fires at it, some growth process or metabolic change takes place in one or both cells such that A's efficiency, as one of the cells firing at B, is increased.
#
#
# The perceptron does not follow this idea exactly, **but adjusting the weights of the inputs gives a very simple and intuitive learning scheme: given a training set of input-output examples, the perceptron should "learn" a function: for each example, if the perceptron's output is much too low, increase its weights; if it is much too high, decrease them.**
#

# ## 1. The perceptron model
#
# Assume the input space (feature vectors) is X ⊆ R^n and the output space is Y = {-1, +1}. An input x ∈ X is the feature vector of an instance, a point in the input space; an output y ∈ Y is the class of the instance. The function from the input space to the output space
#
# $$
# f(x) = sign(w \cdot x + b)
# $$
#
# is called the perceptron. The parameter w is the weight vector and b is the bias; w·x denotes the inner product of w and x, and sign is the sign function:
# ![sign_function](images/sign.png)
#
# ### Geometric interpretation
# The perceptron is a linear classifier; its hypothesis space is the set of all linear classification models defined on the feature space, i.e. the function set {f | f(x) = w·x + b}. The linear equation w·x + b = 0 corresponds to a hyperplane S in the feature space R^n, where w is the normal vector of the hyperplane and b is its intercept. The hyperplane divides the feature space into two parts; the points on the two sides form the positive and negative classes. S is called the separating hyperplane, as shown below:
# ![perceptron_geometry_def](images/perceptron_geometry_def.png)
#
# ### Biological analogy
# ![perceptron_2](images/perceptron_2.PNG)
#
#
#

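# As a quick numerical check of the model above, the values below are taken from
# this notebook itself: w = [-2.0, 3.0] and b = -0.5 are the parameters the
# training run in section 4 ends up with, and x = (1, 3) is its first positive
# training sample.
import numpy as np

w = np.array([-2.0, 3.0])   # weight vector (normal of the separating hyperplane)
b = -0.5                    # bias (intercept)
x = np.array([1.0, 3.0])    # one input point

# f(x) = sign(w·x + b): +1 on one side of the hyperplane, -1 on the other
print(np.sign(np.dot(w, x) + b))   # w·x + b = 6.5 > 0, so the point is classified as +1
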
# ## 2. Perceptron learning strategy
#
# Assuming the training data are linearly separable, the goal of perceptron learning is to find a separating hyperplane that completely separates the positive and negative training instances, i.e. to determine the parameters w and b. This requires a learning strategy: define an (empirical) loss function and minimize it.
#
# A natural choice of loss function is the total number of misclassified points, but that loss is not a continuous, differentiable function of w and b and is hard to optimize. Another choice is the total distance from the misclassified points to the separating hyperplane.
#
# First, the distance from an arbitrary point $x_0$ to the hyperplane is
# $$
# \frac{1}{||w||} | w \cdot x_0 + b |
# $$
#
# Second, for a misclassified point $(x_i, y_i)$ we have $-y_i(w \cdot x_i + b) > 0$.
#
# Thus, letting M be the set of points misclassified by the hyperplane S, the total distance from the misclassified points to S is
# $$
# -\frac{1}{||w||} \sum_{x_i \in M} y_i (w \cdot x_i + b)
# $$
# Dropping the factor 1/||w|| gives the loss function of perceptron learning.
#
# ### Empirical risk function
#
# Given a dataset T = {(x1, y1), (x2, y2), ..., (xN, yN)} (where xi ∈ X = R^n, yi ∈ Y = {-1, +1}, i = 1, 2, ..., N), the loss function for learning the perceptron sign(w·x + b) is defined as
# $$
# L(w, b) = - \sum_{x_i \in M} y_i (w \cdot x_i + b)
# $$
# where M is the set of misclassified points; this loss function is the [empirical risk function](https://blog.csdn.net/zhzhx1204/article/details/70163099) of perceptron learning.
#
# Clearly L(w, b) is non-negative: if there are no misclassified points it is 0, and the fewer the misclassified points, the smaller its value. For a given misclassified point the loss is a linear function of w and b, and correctly classified points contribute 0; therefore, given the training set T, L(w, b) is a continuous, differentiable function of w and b.
#

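# A small numeric sketch of the loss above: L(w, b) adds up -y_i (w·x_i + b) over
# the misclassified points only. The three points are taken from the training set
# in section 4; the candidate w and b are a deliberately poor guess, chosen just
# for illustration.
import numpy as np

X = np.array([[1.0, 3.0],
              [3.0, 1.0],
              [4.0, 1.0]])   # feature vectors x_i
y = np.array([1, -1, -1])    # labels y_i
w = np.array([0.5, 0.5])     # candidate weight vector
b = 0.0                      # candidate bias

margins = y * (X @ w + b)    # y_i (w·x_i + b) for every point
mis = margins <= 0           # misclassified points (the set M)
L = -np.sum(margins[mis])    # perceptron loss L(w, b)
print(mis, L)                # -> [False  True  True] 4.5
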
# ## 3. The perceptron learning algorithm
#
#
# The optimization problem: given a dataset T = {(x1, y1), (x2, y2), ..., (xN, yN)} (where xi ∈ X = R^n, yi ∈ Y = {-1, +1}, i = 1, 2, ..., N), find the parameters w, b that minimize the loss (M is the set of misclassified points):
#
# $$
# \min_{w,b} L(w, b) = - \sum_{x_i \in M} y_i (w \cdot x_i + b)
# $$
#
# Perceptron learning is driven by misclassification and uses [stochastic gradient descent](https://blog.csdn.net/zbc1090549839/article/details/38149561). First pick arbitrary initial values $w_0$, $b_0$, then repeatedly minimize the objective by gradient descent; instead of taking a gradient step over all misclassified points in M at once, each step randomly selects a single misclassified point and descends on it.
#
# Assuming the set of misclassified points M is fixed, the gradient of the loss L(w, b) is
# $$
# \triangledown_w L(w, b) = - \sum_{x_i \in M} y_i x_i \\
# \triangledown_b L(w, b) = - \sum_{x_i \in M} y_i \\
# $$
#
# Randomly select a misclassified point $(x_i, y_i)$ and update $w, b$:
# $$
# w = w + \eta y_i x_i \\
# b = b + \eta y_i
# $$
#
# Here $\eta$ (0 < $\eta$ ≤ 1) is the step size, called the learning rate in statistics. The larger the step size, the faster gradient descent approaches the minimum; if it is too large it may overshoot the minimum and diverge, and if it is too small it may take a very long time to reach the minimum.
#
# Intuitive interpretation: when an instance is misclassified, w and b are adjusted so that the separating hyperplane moves toward that point, reducing its distance to the hyperplane, until the point is correctly classified.
#
#
#
# Algorithm
# ```
# Input: T = {(x1, y1), (x2, y2), ..., (xN, yN)} (where xi ∈ X = R^n, yi ∈ Y = {-1, +1}, i = 1, 2, ..., N), learning rate η
# Output: w, b; the perceptron model f(x) = sign(w·x + b)
# (1) Initialize w0, b0
# (2) Pick a sample (xi, yi) from the training set
# (3) If yi(w·xi + b) ≤ 0:
#         w = w + η yi xi
#         b = b + η yi
# (4) Go to (2)
# ```
#
#
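# As a concrete check of the update rule, the numbers below retrace the first
# update in the recorded output of the program in section 4: starting from
# $w = (0, 0)$, $b = 0$ with $\eta = 0.5$, the sampled point is the positive
# example $x = (3, 8)$, $y = +1$, which satisfies $y(w \cdot x + b) \le 0$
# (here $0 \le 0$), so
#
# $$
# w \leftarrow w + \eta y x = (0, 0) + 0.5 \cdot 1 \cdot (3, 8) = (1.5, 4.0) \\
# b \leftarrow b + \eta y = 0 + 0.5 \cdot 1 = 0.5
# $$
#
# which matches the first "update weight and bias: 1.5 4.0 0.5" line printed below.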

# ## 4. Program
#

# +
import random
import numpy as np

# sign function
def sign(v):
    if v > 0: return 1
    else: return -1

def perceptron_train(train_data, eta=0.5, n_iter=100):
    weight = [0, 0]      # weights
    bias = 0             # bias
    learning_rate = eta  # learning rate

    train_num = n_iter   # number of iterations

    for i in range(train_num):
        train = random.choice(train_data)
        x1, x2, y = train
        predict = sign(weight[0] * x1 + weight[1] * x2 + bias)  # output
        #print("train data: x: (%2d, %2d) y: %2d ==> predict: %2d" % (x1, x2, y, predict))
        if y * predict <= 0:  # misclassified point
            weight[0] = weight[0] + learning_rate * y * x1  # update weights
            weight[1] = weight[1] + learning_rate * y * x2
            bias = bias + learning_rate * y                 # update bias
            print("update weight and bias: ", weight[0], weight[1], bias)

    #print("stop training: ", weight[0], weight[1], bias)

    return weight, bias

def perceptron_pred(data, w, b):
    y_pred = []
    for d in data:
        x1, x2, y = d
        yi = sign(w[0]*x1 + w[1]*x2 + b)
        y_pred.append(yi)
    return y_pred

# set training data
train_data = np.array([[1, 3, 1], [2, 5, 1], [3, 8, 1], [2, 6, 1],
                       [3, 1, -1], [4, 1, -1], [6, 2, -1], [7, 3, -1]])

# do training
w, b = perceptron_train(train_data)
print("w = ", w)
print("b = ", b)

# predict
y_pred = perceptron_pred(train_data, w, b)

print(train_data[:, 2])
print(y_pred)


# -

# ## Reference
# * [感知机(Python实现)](http://www.cnblogs.com/kaituorensheng/p/3561091.html)
# * [Programming a Perceptron in Python](https://blog.dbrgn.ch/2013/3/26/perceptrons-in-python/)
# * [损失函数、风险函数、经验风险最小化、结构风险最小化](https://blog.csdn.net/zhzhx1204/article/details/70163099)

nn/images/L_b.png (BIN)
Width: 194 | Height: 36 | Size: 1.7 kB

nn/images/L_w.png (BIN)
Width: 232 | Height: 36 | Size: 2.0 kB

nn/images/neuron.png (BIN)
Width: 626 | Height: 489 | Size: 54 kB

nn/images/perceptron_2.PNG (BIN)
Width: 560 | Height: 266 | Size: 13 kB

nn/images/perceptron_geometry_def.png (BIN)
Width: 261 | Height: 127 | Size: 6.4 kB

nn/images/sign.png (BIN)
Width: 196 | Height: 56 | Size: 2.9 kB
