You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

fastNLP_padding_tutorial.ipynb 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [
  8. {
  9. "name": "stderr",
  10. "output_type": "stream",
  11. "text": [
  12. "/Users/yh/miniconda2/envs/python3/lib/python3.6/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
  13. " \" (e.g. in jupyter console)\", TqdmExperimentalWarning)\n"
  14. ]
  15. },
  16. {
  17. "data": {
  18. "text/plain": [
  19. "DataSet({'raw_sent': this is a bad idea . type=str,\n",
  20. "'label': 0 type=int,\n",
  21. "'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n",
  22. "'words': [4, 2, 5, 6, 7, 3] type=list},\n",
  23. "{'raw_sent': it is great . type=str,\n",
  24. "'label': 1 type=int,\n",
  25. "'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n",
  26. "'words': [8, 2, 9, 3] type=list})"
  27. ]
  28. },
  29. "execution_count": 1,
  30. "metadata": {},
  31. "output_type": "execute_result"
  32. }
  33. ],
  34. "source": [
  35. "# 假设有以下的DataSet, 这里只是为了举例所以只选择了两个sample\n",
  36. "import sys\n",
  37. "import os\n",
  38. "sys.path.append('/Users/yh/Desktop/fastNLP/fastNLP')\n",
  39. "\n",
  40. "from fastNLP import DataSet\n",
  41. "from fastNLP import Instance\n",
  42. "from fastNLP import Vocabulary\n",
  43. "\n",
  44. "dataset = DataSet()\n",
  45. "dataset.append(Instance(raw_sent='This is a bad idea .', label=0))\n",
  46. "dataset.append(Instance(raw_sent='It is great .', label=1))\n",
  47. "\n",
  48. "# 按照fastNLP_10min_tutorial.ipynb的步骤,对数据进行一些处理。这里为了演示padding操作,把field的名称做了一些改变\n",
  49. "dataset.apply(lambda x:x['raw_sent'].lower(), new_field_name='raw_sent')\n",
  50. "dataset.apply(lambda x:x['raw_sent'].split(), new_field_name='word_str_lst')\n",
  51. "\n",
  52. "# 建立Vocabulary\n",
  53. "word_vocab = Vocabulary()\n",
  54. "dataset.apply(lambda x:word_vocab.update(x['word_str_lst']))\n",
  55. "dataset.apply(lambda x:[word_vocab.to_index(word) for word in x['word_str_lst']], new_field_name='words')\n",
  56. "\n",
  57. "# 检查一下是否得到我们想要的结果了\n",
  58. "dataset[:2]"
  59. ]
  60. },
  61. {
  62. "cell_type": "code",
  63. "execution_count": 2,
  64. "metadata": {},
  65. "outputs": [
  66. {
  67. "name": "stdout",
  68. "output_type": "stream",
  69. "text": [
  70. "batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
  71. " list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[4, 2, 5, 6, 7, 3],\n",
  72. " [8, 2, 9, 3, 0, 0]])}\n",
  73. "batch_y has: {'label': tensor([0, 1])}\n"
  74. ]
  75. },
  76. {
  77. "data": {
  78. "text/plain": [
  79. "'\"\\n结果中\\n Batch会对元素类型(元素即最内层的数据,raw_sent为str,word_str_lst为str,words为int, label为int)为int或者float的数据进行默认\\n padding,而非int或float的则不进行padding。但若每个Instance中该field为二维数据,也不进行padding。因为二维数据的padding涉及到\\n 两个维度的padding,不容易自动判断padding的形式。\\n'"
  80. ]
  81. },
  82. "execution_count": 2,
  83. "metadata": {},
  84. "output_type": "execute_result"
  85. }
  86. ],
  87. "source": [
  88. "# 将field设置为input或者target\n",
  89. "dataset.set_input('word_str_lst')\n",
  90. "dataset.set_input('words')\n",
  91. "dataset.set_target('label')\n",
  92. "\n",
  93. "# 使用Batch取出batch数据\n",
  94. "from fastNLP.core.batch import Batch\n",
  95. "from fastNLP.core.sampler import RandomSampler\n",
  96. "\n",
  97. "batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
  98. "for batch_x, batch_y in batch_iterator:\n",
  99. " print(\"batch_x has: \", batch_x)\n",
  100. " print(\"batch_y has: \", batch_y)\n",
  101. "\"\"\"\"\n",
  102. "结果中\n",
  103. " Batch会对元素类型(元素即最内层的数据,raw_sent为str,word_str_lst为str,words为int, label为int)为int或者float的数据进行默认\n",
  104. " padding,而非int或float的则不进行padding。但若每个Instance中该field为二维数据,也不进行padding。因为二维数据的padding涉及到\n",
  105. " 两个维度的padding,不容易自动判断padding的形式。\n",
  106. "\"\"\""
  107. ]
  108. },
  109. {
  110. "cell_type": "code",
  111. "execution_count": 3,
  112. "metadata": {},
  113. "outputs": [
  114. {
  115. "name": "stdout",
  116. "output_type": "stream",
  117. "text": [
  118. "batch_x has: {'word_str_lst': array([list(['it', 'is', 'great', '.']),\n",
  119. " list(['this', 'is', 'a', 'bad', 'idea', '.'])], dtype=object), 'words': tensor([[ 8, 2, 9, 3, -100, -100],\n",
  120. " [ 4, 2, 5, 6, 7, 3]])}\n",
  121. "batch_y has: {'label': tensor([1, 0])}\n"
  122. ]
  123. }
  124. ],
  125. "source": [
  126. "# 所有的pad_val都默认为0,如果需要修改某一个field的默认pad值,可以通过DataSet.set_pad_val(field_name, pad_val)进行修改\n",
  127. "# 若需要将words的padding修改为-100\n",
  128. "dataset.set_pad_val('words', pad_val=-100)\n",
  129. "batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
  130. "for batch_x, batch_y in batch_iterator:\n",
  131. " print(\"batch_x has: \", batch_x)\n",
  132. " print(\"batch_y has: \", batch_y)\n",
  133. "# pad的值修改为-100了"
  134. ]
  135. },
  136. {
  137. "cell_type": "code",
  138. "execution_count": 4,
  139. "metadata": {},
  140. "outputs": [
  141. {
  142. "data": {
  143. "text/plain": [
  144. "DataSet({'raw_sent': this is a bad idea . type=str,\n",
  145. "'label': 0 type=int,\n",
  146. "'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n",
  147. "'words': [4, 2, 5, 6, 7, 3] type=list,\n",
  148. "'char_str_lst': [['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['b', 'a', 'd'], ['i', 'd', 'e', 'a'], ['.']] type=list,\n",
  149. "'chars': [[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]] type=list},\n",
  150. "{'raw_sent': it is great . type=str,\n",
  151. "'label': 1 type=int,\n",
  152. "'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n",
  153. "'words': [8, 2, 9, 3] type=list,\n",
  154. "'char_str_lst': [['i', 't'], ['i', 's'], ['g', 'r', 'e', 'a', 't'], ['.']] type=list,\n",
  155. "'chars': [[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]] type=list})"
  156. ]
  157. },
  158. "execution_count": 4,
  159. "metadata": {},
  160. "output_type": "execute_result"
  161. }
  162. ],
  163. "source": [
  164. "# 若需要使用二维padding或指定padding方式,可以通过设置该field的padder实现,下面以英文的character padding为例。在某些场景下,可能想要\n",
  165. "# 使用英文word的character作为特征,character的padding为二维padding,fastNLP默认只会进行一维padding。\n",
  166. "\n",
  167. "dataset.apply(lambda x: [[c for c in word] for word in x['word_str_lst']], new_field_name='char_str_lst')\n",
  168. "char_vocab = Vocabulary()\n",
  169. "dataset.apply(lambda x:[char_vocab.update(chars) for chars in x['char_str_lst']])\n",
  170. "dataset.apply(lambda x:[[char_vocab.to_index(c) for c in chars] for chars in x['char_str_lst']],new_field_name='chars')\n",
  171. "dataset[:2]"
  172. ]
  173. },
  174. {
  175. "cell_type": "code",
  176. "execution_count": 5,
  177. "metadata": {},
  178. "outputs": [
  179. {
  180. "name": "stdout",
  181. "output_type": "stream",
  182. "text": [
  183. "batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
  184. " list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
  185. " [ 8, 2, 9, 3, -100, -100]]), 'chars': array([list([[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]]),\n",
  186. " list([[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]])], dtype=object)}\n",
  187. "batch_y has: {'label': tensor([0, 1])}\n"
  188. ]
  189. },
  190. {
  191. "data": {
  192. "text/plain": [
  193. "'\\n 其它field与之前的是相同的。chars因为存在两个维度需要padding,不能自动决定padding方式,所以直接输出了原始形式。\\n'"
  194. ]
  195. },
  196. "execution_count": 5,
  197. "metadata": {},
  198. "output_type": "execute_result"
  199. }
  200. ],
  201. "source": [
  202. "# 如果不针对二维的character指定padding方法\n",
  203. "dataset.set_input('chars')\n",
  204. "batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
  205. "for batch_x, batch_y in batch_iterator:\n",
  206. " print(\"batch_x has: \", batch_x)\n",
  207. " print(\"batch_y has: \", batch_y)\n",
  208. " \n",
  209. "\"\"\"\n",
  210. " 其它field与之前的是相同的。chars因为存在两个维度需要padding,不能自动决定padding方式,所以直接输出了原始形式。\n",
  211. "\"\"\""
  212. ]
  213. },
  214. {
  215. "cell_type": "code",
  216. "execution_count": 6,
  217. "metadata": {},
  218. "outputs": [
  219. {
  220. "name": "stdout",
  221. "output_type": "stream",
  222. "text": [
  223. "batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
  224. " list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
  225. " [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n",
  226. " [ 2, 5, 0, 0],\n",
  227. " [ 3, 0, 0, 0],\n",
  228. " [10, 3, 6, 0],\n",
  229. " [ 2, 6, 7, 3],\n",
  230. " [ 8, 0, 0, 0]],\n",
  231. "\n",
  232. " [[ 2, 4, 0, 0],\n",
  233. " [ 2, 5, 0, 0],\n",
  234. " [11, 12, 7, 3],\n",
  235. " [ 8, 0, 0, 0],\n",
  236. " [ 0, 0, 0, 0],\n",
  237. " [ 0, 0, 0, 0]]])}\n",
  238. "batch_y has: {'label': tensor([0, 1])}\n"
  239. ]
  240. },
  241. {
  242. "data": {
  243. "text/plain": [
  244. "'\\n chars被正确padding了\\n'"
  245. ]
  246. },
  247. "execution_count": 6,
  248. "metadata": {},
  249. "output_type": "execute_result"
  250. }
  251. ],
  252. "source": [
  253. "# 若要使用二维padding,需要手动设置padding方式\n",
  254. "from fastNLP.core.fieldarray import EngChar2DPadder\n",
  255. "dataset.set_padder('chars', EngChar2DPadder())\n",
  256. "batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
  257. "for batch_x, batch_y in batch_iterator:\n",
  258. " print(\"batch_x has: \", batch_x)\n",
  259. " print(\"batch_y has: \", batch_y)\n",
  260. " \n",
  261. "\"\"\"\n",
  262. " chars被正确padding了\n",
  263. "\"\"\""
  264. ]
  265. },
  266. {
  267. "cell_type": "code",
  268. "execution_count": 7,
  269. "metadata": {},
  270. "outputs": [
  271. {
  272. "name": "stdout",
  273. "output_type": "stream",
  274. "text": [
  275. "batch_x has: {'raw_sent': ['this is a bad idea .', 'it is great . '], 'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
  276. " list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
  277. " [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n",
  278. " [ 2, 5, 0, 0],\n",
  279. " [ 3, 0, 0, 0],\n",
  280. " [10, 3, 6, 0],\n",
  281. " [ 2, 6, 7, 3],\n",
  282. " [ 8, 0, 0, 0]],\n",
  283. "\n",
  284. " [[ 2, 4, 0, 0],\n",
  285. " [ 2, 5, 0, 0],\n",
  286. " [11, 12, 7, 3],\n",
  287. " [ 8, 0, 0, 0],\n",
  288. " [ 0, 0, 0, 0],\n",
  289. " [ 0, 0, 0, 0]]])}\n",
  290. "batch_y has: {'label': tensor([0, 1])}\n"
  291. ]
  292. },
  293. {
  294. "data": {
  295. "text/plain": [
  296. "'\\n raw_sent正确输出,对应内容也进行了pad。\\n'"
  297. ]
  298. },
  299. "execution_count": 7,
  300. "metadata": {},
  301. "output_type": "execute_result"
  302. }
  303. ],
  304. "source": [
  305. "# 如果AutoPad与EngChar2DPadder不能满足需要,可以自己实现Padder对象。这里举一个例子,比如需要把raw_sent pad到一样长\n",
  306. "from fastNLP.core.fieldarray import PadderBase\n",
  307. "\n",
  308. "class PadStr(PadderBase):\n",
  309. " def __init__(self, pad_val=' '):\n",
  310. " super().__init__(pad_val=pad_val) #让父类管理pad_val的值,这样可以通过DataSet.set_pad_val()修改到该值\n",
  311. " \n",
  312. " def __call__(self, contents, field_name, field_ele_dtype):\n",
  313. " \"\"\"\n",
  314. " 如果以上面的例子举例,在raw_sent这个field进行pad时,传入的\n",
  315. " contents:\n",
  316. " [\n",
  317. " 'this is a bad idea .',\n",
  318. " 'it is great .'\n",
  319. " ]\n",
  320. " field_name: 'raw_sent',当前field的名称,主要用于帮助debug。\n",
  321. " field_ele_dtype: np.str. 这个参数基本都用不上,是该field中内部元素的类型\n",
  322. " \"\"\"\n",
  323. " max_len = max([len(str_) for str_ in contents])\n",
  324. " pad_strs = []\n",
  325. " for content in contents:\n",
  326. " pad_strs.append(content + (max_len-len(content))*self.pad_val)\n",
  327. " return pad_strs\n",
  328. "\n",
  329. "dataset.set_input('raw_sent')\n",
  330. "dataset.set_padder('raw_sent', PadStr())\n",
  331. "batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
  332. "for batch_x, batch_y in batch_iterator:\n",
  333. " print(\"batch_x has: \", batch_x)\n",
  334. " print(\"batch_y has: \", batch_y)\n",
  335. "\n",
  336. "\"\"\"\n",
  337. " raw_sent正确输出,对应内容也进行了pad。\n",
  338. "\"\"\""
  339. ]
  340. },
  341. {
  342. "cell_type": "code",
  343. "execution_count": null,
  344. "metadata": {},
  345. "outputs": [],
  346. "source": []
  347. }
  348. ],
  349. "metadata": {
  350. "kernelspec": {
  351. "display_name": "Python 3",
  352. "language": "python",
  353. "name": "python3"
  354. },
  355. "language_info": {
  356. "codemirror_mode": {
  357. "name": "ipython",
  358. "version": 3
  359. },
  360. "file_extension": ".py",
  361. "mimetype": "text/x-python",
  362. "name": "python",
  363. "nbconvert_exporter": "python",
  364. "pygments_lexer": "ipython3",
  365. "version": "3.6.7"
  366. }
  367. },
  368. "nbformat": 4,
  369. "nbformat_minor": 2
  370. }