
Scikit-learn.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scikit-learn"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Scikit-learn contains simple and efficient tools for data mining and data analysis. It implements a wide variety of machine learning algorithms and processes to conduct advanced analytics.\n",
"\n",
"Library documentation: <a href=\"http://scikit-learn.org/stable/\">http://scikit-learn.org/stable/</a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### General"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn import datasets\n",
"from sklearn import svm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0. 0. 5. ..., 0. 0. 0.]\n",
" [ 0. 0. 0. ..., 10. 0. 0.]\n",
" [ 0. 0. 0. ..., 16. 9. 0.]\n",
" ..., \n",
" [ 0. 0. 1. ..., 6. 0. 0.]\n",
" [ 0. 0. 2. ..., 12. 0. 0.]\n",
" [ 0. 0. 10. ..., 12. 1. 0.]]\n"
]
}
],
"source": [
"# import a sample dataset and view the data\n",
"digits = datasets.load_digits()\n",
"print(digits.data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, ..., 8, 9, 8])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# view the target variable\n",
"digits.target"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,\n",
" gamma=0.001, kernel='rbf', max_iter=-1, probability=False,\n",
" random_state=None, shrinking=True, tol=0.001, verbose=False)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# train a support vector machine using everything but the last example\n",
"classifier = svm.SVC(gamma=0.001, C=100.)\n",
"classifier.fit(digits.data[:-1], digits.target[:-1])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([8])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# predict the target of the last example\n",
"classifier.predict(digits.data[-1])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([8])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# persist the model and reload\n",
"import pickle\n",
"from sklearn.externals import joblib\n",
"joblib.dump(classifier, 'model.pkl')\n",
"classifier2 = joblib.load('model.pkl')\n",
"classifier2.predict(digits.data[-1])"
]
},
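{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `pickle` module imported above can do the same round trip in memory. A minimal sketch (joblib is generally preferred for models holding large numpy arrays):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# serialize the trained model to a byte string and restore it\n",
"s = pickle.dumps(classifier)\n",
"classifier3 = pickle.loads(s)\n",
"classifier3.predict(digits.data[-1])"
]
},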
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"os.remove('model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.97999999999999998"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# another example with the digits data set\n",
"svc = svm.SVC(C=1, kernel='linear')\n",
"svc.fit(digits.data[:-100], digits.target[:-100]).score(digits.data[-100:], digits.target[-100:])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train: [2 3 4 5] | test: [0 1]\n",
"Train: [0 1 4 5] | test: [2 3]\n",
"Train: [0 1 2 3] | test: [4 5]\n"
]
}
],
"source": [
"# perform cross-validation on the estimator's predictions\n",
"from sklearn import cross_validation\n",
"k_fold = cross_validation.KFold(n=6, n_folds=3)\n",
"for train_indices, test_indices in k_fold:\n",
"    print('Train: %s | test: %s' % (train_indices, test_indices))"
]
},
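{
"cell_type": "markdown",
"metadata": {},
"source": [
"For a quick single holdout rather than explicit folds, the same module offers `train_test_split`. A minimal sketch (the 25% test size and fixed seed are arbitrary choices):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hold out a random 25% of the digits data and score on it\n",
"X_train, X_test, y_train, y_test = cross_validation.train_test_split(\n",
"    digits.data, digits.target, test_size=0.25, random_state=0)\n",
"svc.fit(X_train, y_train).score(X_test, y_test)"
]
},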
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.93489149, 0.95659432, 0.93989983])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# apply to the model\n",
"kfold = cross_validation.KFold(len(digits.data), n_folds=3)\n",
"cross_validation.cross_val_score(svc, digits.data, digits.target, cv=kfold, n_jobs=-1)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=None,\n",
" estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
" kernel='linear', max_iter=-1, probability=False, random_state=None,\n",
" shrinking=True, tol=0.001, verbose=False),\n",
" fit_params={}, iid=True, loss_func=None, n_jobs=-1,\n",
" param_grid={'gamma': array([ 1.00000e-06, 3.59381e-06, 1.29155e-05, 4.64159e-05,\n",
" 1.66810e-04, 5.99484e-04, 2.15443e-03, 7.74264e-03,\n",
" 2.78256e-02, 1.00000e-01])},\n",
" pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n",
" verbose=0)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# use the grid search module to optimize model parameters\n",
"from sklearn.grid_search import GridSearchCV\n",
"gammas = np.logspace(-6, -1, 10)\n",
"classifier = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas), n_jobs=-1)\n",
"classifier.fit(digits.data[:1000], digits.target[:1000])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.92400000000000004"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"9.9999999999999995e-07"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.best_estimator_.gamma"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.94228356336260977"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# run against the test set\n",
"classifier.score(digits.data[1000:], digits.target[1000:])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.93521595, 0.95826377, 0.93791946])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# nested cross-validation example\n",
"cross_validation.cross_val_score(classifier, digits.data, digits.target)"
]
},
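{
"cell_type": "markdown",
"metadata": {},
"source": [
"The winning grid point can also be read off directly from the fitted search object (a small sketch using the standard `best_params_` attribute):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# dictionary of the parameter setting that gave the best score\n",
"classifier.best_params_"
]
},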
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Other Classifiers"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# import the iris dataset\n",
"iris = datasets.load_iris()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_neighbors=5, p=2, weights='uniform')"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# k nearest neighbors\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"knn = KNeighborsClassifier()\n",
"knn.fit(iris.data, iris.target)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"DecisionTreeClassifier(compute_importances=None, criterion='gini',\n",
" max_depth=None, max_features=None, max_leaf_nodes=None,\n",
" min_density=None, min_samples_leaf=1, min_samples_split=2,\n",
" random_state=None, splitter='best')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# decision tree\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"dtree = DecisionTreeClassifier()\n",
"dtree.fit(iris.data, iris.target)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,\n",
" fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',\n",
" loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,\n",
" random_state=None, shuffle=False, verbose=0, warm_start=False)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# stochastic gradient descent\n",
"from sklearn.linear_model import SGDClassifier\n",
"sgd = SGDClassifier(loss=\"hinge\", penalty=\"l2\")\n",
"sgd.fit(iris.data, iris.target)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of mislabeled points : 6\n"
]
}
],
"source": [
"# naive bayes\n",
"from sklearn.naive_bayes import GaussianNB\n",
"gnb = GaussianNB()\n",
"y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)\n",
"print(\"Number of mislabeled points : %d\" % (iris.target != y_pred).sum())"
]
},
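{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough comparison of the classifiers above, each can be scored with cross-validation on the iris data. A minimal sketch (3-fold accuracy; results vary with the fold split):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# mean cross-validated accuracy for each classifier defined above\n",
"for name, model in [('knn', knn), ('tree', dtree), ('sgd', sgd), ('gnb', gnb)]:\n",
"    scores = cross_validation.cross_val_score(model, iris.data, iris.target, cv=3)\n",
"    print('%s: %.3f' % (name, scores.mean()))"
]
},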
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Regression"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# load another sample dataset\n",
"diabetes = datasets.load_diabetes()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# linear regression\n",
"from sklearn import linear_model\n",
"regr = linear_model.LinearRegression()\n",
"regr.fit(diabetes.data, diabetes.target)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ -10.01219782 -239.81908937 519.83978679 324.39042769 -792.18416163\n",
" 476.74583782 101.04457032 177.06417623 751.27932109 67.62538639]\n"
]
}
],
"source": [
"# regression coefficients\n",
"print(regr.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2859.6903987680657"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mean squared error\n",
"np.mean((regr.predict(diabetes.data)-diabetes.target)**2)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.51774942541329338"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# explained variance\n",
"regr.score(diabetes.data, diabetes.target)"
]
},
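{
"cell_type": "markdown",
"metadata": {},
"source": [
"The score above is computed on the training data, so it is optimistic. A minimal sketch holding out the last 20 examples instead (the split size is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# refit on all but the last 20 examples and score on the holdout\n",
"regr.fit(diabetes.data[:-20], diabetes.target[:-20])\n",
"regr.score(diabetes.data[-20:], diabetes.target[-20:])"
]
},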
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,\n",
" normalize=False, solver='auto', tol=0.001)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ridge regression\n",
"regr = linear_model.Ridge(alpha=.1)\n",
"regr.fit(diabetes.data, diabetes.target)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n",
" normalize=False, positive=False, precompute='auto', tol=0.0001,\n",
" warm_start=False)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# lasso regression\n",
"regr = linear_model.Lasso()\n",
"regr.fit(diabetes.data, diabetes.target)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"LogisticRegression(C=100000.0, class_weight=None, dual=False,\n",
" fit_intercept=True, intercept_scaling=1, penalty='l2',\n",
" random_state=None, tol=0.0001)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# logistic regression (this is actually a classifier)\n",
"iris = datasets.load_iris()\n",
"logistic = linear_model.LogisticRegression(C=1e5)\n",
"logistic.fit(iris.data, iris.target)"
]
},
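{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because logistic regression is probabilistic, per-class probabilities are available as well. A small sketch on the first example (with C this large the estimates will be close to 0 or 1):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# probability estimates for each of the three iris classes\n",
"logistic.predict_proba(iris.data[:1])"
]
},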
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# feature scaling\n",
"from sklearn import preprocessing\n",
"X = np.array([[ 1., -1., 2.],\n",
"              [ 2., 0., 0.],\n",
"              [ 0., 1., -1.]])\n",
"X_scaled = preprocessing.scale(X)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"StandardScaler(copy=True, with_mean=True, with_std=True)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# save the scaling transform to apply to new data later\n",
"scaler = preprocessing.StandardScaler().fit(X)\n",
"scaler"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0. , -1.22474487, 1.33630621],\n",
" [ 1.22474487, 0. , -0.26726124],\n",
" [-1.22474487, 1.22474487, -1.06904497]])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scaler.transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.5 , 0. , 1. ],\n",
" [ 1. , 0.5 , 0.33333333],\n",
" [ 0. , 1. , 0. ]])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# range scaling\n",
"min_max_scaler = preprocessing.MinMaxScaler()\n",
"X_minmax = min_max_scaler.fit_transform(X)\n",
"X_minmax"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.40824829, -0.40824829, 0.81649658],\n",
" [ 1. , 0. , 0. ],\n",
" [ 0. , 0.70710678, -0.70710678]])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# instance normalization using L2 norm\n",
"X_normalized = preprocessing.normalize(X, norm='l2')\n",
"X_normalized"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# category encoding\n",
"enc = preprocessing.OneHotEncoder()\n",
"enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])\n",
"enc.transform([[0, 1, 3]]).toarray()"
]
},
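{
"cell_type": "markdown",
"metadata": {},
"source": [
"For string-valued labels, `LabelEncoder` maps categories to integers first. A minimal sketch with made-up city labels:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# encode string categories as integers and map them back\n",
"le = preprocessing.LabelEncoder()\n",
"le.fit(['paris', 'tokyo', 'amsterdam'])\n",
"print(le.transform(['tokyo', 'paris']))\n",
"print(le.inverse_transform([0, 1, 2]))"
]
},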
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1., 0., 1.],\n",
" [ 1., 0., 0.],\n",
" [ 0., 1., 0.]])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# binarization (threshold features at zero)\n",
"binarizer = preprocessing.Binarizer().fit(X)\n",
"binarizer.transform(X)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clustering"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,\n",
" n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,\n",
" verbose=0)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# k means clustering\n",
"from sklearn import cluster\n",
"k_means = cluster.KMeans(n_clusters=3)\n",
"k_means.fit(iris.data)"
]
},
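{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cluster assignments live in `labels_` and can be eyeballed against the true species (a quick sketch; cluster numbering is arbitrary, so only the grouping pattern should match):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# compare every 10th cluster assignment with the true label\n",
"print(k_means.labels_[::10])\n",
"print(iris.target[::10])"
]
},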
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Decomposition"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# create a signal with 2 useful dimensions\n",
"x1 = np.random.normal(size=100)\n",
"x2 = np.random.normal(size=100)\n",
"x3 = x1 + x2\n",
"X = np.c_[x1, x2, x3]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"PCA(copy=True, n_components=None, whiten=False)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute principal component analysis\n",
"from sklearn import decomposition\n",
"pca = decomposition.PCA()\n",
"pca.fit(X)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 2.77625101e+00, 9.03048616e-01, 3.02456658e-31])"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pca.explained_variance_"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(100L, 2L)"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# only the first 2 components are useful\n",
"pca.n_components = 2\n",
"X_reduced = pca.fit_transform(X)\n",
"X_reduced.shape"
]
},
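{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check of how much variance the two retained components explain (a small sketch using the standard `explained_variance_ratio_` attribute):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# fraction of total variance carried by each kept component\n",
"pca.explained_variance_ratio_"
]
},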
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# generate more sample data\n",
"time = np.linspace(0, 10, 2000)\n",
"s1 = np.sin(2 * time) # signal 1 : sinusoidal signal\n",
"s2 = np.sign(np.sin(3 * time)) # signal 2 : square signal\n",
"S = np.c_[s1, s2]\n",
"S += 0.2 * np.random.normal(size=S.shape) # Add noise\n",
"S /= S.std(axis=0) # standardize data"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# mix data\n",
"A = np.array([[1, 1], [0.5, 2]]) # mixing matrix\n",
"X = np.dot(S, A.T) # generate observations"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute independent component analysis\n",
"ica = decomposition.FastICA()\n",
"S_ = ica.fit_transform(X) # get the estimated sources\n",
"A_ = ica.mixing_.T\n",
"np.allclose(X, np.dot(S_, A_) + ica.mean_)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Machine learning is increasingly applied in fields such as aircraft and robotics, with the aim of using computers to achieve human-like intelligence and thereby make equipment intelligent and autonomous. This course guides students in mastering the fundamentals, typical methods, and techniques of machine learning, sparks interest in the discipline through concrete application cases, and encourages students to analyze and solve the problems and challenges faced by aircraft and robots from an artificial-intelligence perspective. The main content covers Python programming basics; machine learning models; the fundamentals and implementation of unsupervised learning, supervised learning, and deep learning; and how to apply machine learning to practical problems, thereby strengthening students' overall competence.