
char_language_model.py 12 kB

import os
import re
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel
class CharLM(BaseModel):
    """
    Controller of the Character-level Neural Language Model
    """

    def __init__(self):
        super(CharLM, self).__init__()
        # Settings
        self.word_embed_dim = 300
        self.char_embedding_dim = 15
        self.cnn_batch_size = 700
        self.lstm_seq_len = 35
        self.lstm_batch_size = 20
        self.vocab_size = 100
        self.num_char = 150
        self.learning_rate = 0.1  # SGD learning rate

        self.data = None  # named tuple to store all data sets
        self.data_ready = False
        self.criterion = nn.CrossEntropyLoss()
        self._loss = None  # holds the loss of the latest forward pass
        self.use_gpu = False

        # word_embed_dim == hidden_size / num of hidden units
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

        self.model = charLM(self.char_embedding_dim,
                            self.word_embed_dim,
                            self.vocab_size,
                            self.num_char,
                            use_gpu=self.use_gpu)

        # build the optimizer after the sub-model so that its parameters are registered
        self.optimizer = optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.85)
    def prepare_input(self, raw_text):
        """
        Do the preparation work: transform raw text into input vectors.
        """
        if not self.data_ready:
            # TODO: move this preprocessing out of prepare_input. (below)
            if os.path.exists("cache/prep.pt") is False:
                self.preprocess()
            objects = torch.load("cache/prep.pt")
            word_dict = objects["word_dict"]
            char_dict = objects["char_dict"]
            max_word_len = objects["max_word_len"]
            self.data_ready = True
            print("word/char dictionary built. Start making inputs.")

            if os.path.exists("cache/data_sets.pt") is False:
                train_text = read_data("./train.txt")
                valid_text = read_data("./valid.txt")
                test_text = read_data("./tests.txt")
                # TODO: move this preprocessing out of prepare_input. (above)

                input_vec = np.array(text2vec(raw_text, char_dict, max_word_len))
                # Labels are next-word indices in word_dict, with the same length as the inputs
                input_label = np.array([word_dict[w] for w in raw_text[1:]] + [word_dict[raw_text[-1]]])
                category = {"features": input_vec, "label": input_label}
                torch.save(category, "cache/data_sets.pt")
            else:
                data_sets = torch.load("cache/data_sets.pt")
                input_vec = data_sets["features"]
                input_label = data_sets["label"]

            DataTuple = namedtuple("DataTuple", ["feature", "label"])
            self.data = DataTuple(feature=input_vec, label=input_label)

        return self.data.feature, self.data.label
    def mode(self, test=False):
        raise NotImplementedError

    def data_forward(self, x):
        # detach the hidden state of the LSTM from the last batch
        hidden = [state.detach() for state in self.hidden]
        output, self.hidden = self.model(to_var(x), hidden)
        return output

    def grad_backward(self):
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()

    def loss(self, predict, truth):
        self._loss = self.criterion(predict, to_var(truth))
        return self._loss
    @staticmethod
    def preprocess():
        word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "tests.txt")
        num_char = len(char_dict)
        char_dict["BOW"] = num_char + 1
        char_dict["EOW"] = num_char + 2
        char_dict["PAD"] = 0
        # dict of (int, string)
        reverse_word_dict = {value: key for key, value in word_dict.items()}
        max_word_len = max([len(word) for word in word_dict])
        objects = {
            "word_dict": word_dict,
            "char_dict": char_dict,
            "reverse_word_dict": reverse_word_dict,
            "max_word_len": max_word_len
        }
        torch.save(objects, "cache/prep.pt")
        print("Preprocess done.")
    def forward(self, x, hidden):
        # CharLM itself holds no layers; forward delegates to the
        # underlying charLM network defined below.
        return self.model(x, hidden)
  120. """
  121. Global Functions
  122. """
  123. def batch_generator(x, batch_size):
  124. # x: [num_words, in_channel, height, width]
  125. # partitions x into batches
  126. num_step = x.size()[0] // batch_size
  127. for t in range(num_step):
  128. yield x[t * batch_size:(t + 1) * batch_size]
  129. def text2vec(words, char_dict, max_word_len):
  130. """ Return list of list of int """
  131. word_vec = []
  132. for word in words:
  133. vec = [char_dict[ch] for ch in word]
  134. if len(vec) < max_word_len:
  135. vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
  136. vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
  137. word_vec.append(vec)
  138. return word_vec
  139. def read_data(file_name):
  140. with open(file_name, 'r') as f:
  141. corpus = f.read().lower()
  142. import re
  143. corpus = re.sub(r"<unk>", "unk", corpus)
  144. return corpus.split()
  145. def get_char_dict(vocabulary):
  146. char_dict = dict()
  147. count = 1
  148. for word in vocabulary:
  149. for ch in word:
  150. if ch not in char_dict:
  151. char_dict[ch] = count
  152. count += 1
  153. return char_dict
  154. def create_word_char_dict(*file_name):
  155. text = []
  156. for file in file_name:
  157. text += read_data(file)
  158. word_dict = {word: ix for ix, word in enumerate(set(text))}
  159. char_dict = get_char_dict(word_dict)
  160. return word_dict, char_dict
  161. def to_var(x):
  162. if torch.cuda.is_available():
  163. x = x.cuda()
  164. return Variable(x)
class Highway(nn.Module):
    """Highway network: y = t * relu(W2 x) + (1 - t) * x, where t = sigmoid(W1 x) is the transform gate."""

    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        t = F.sigmoid(self.fc1(x))  # transform gate
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)
class charLM(nn.Module):
    """Character-level Neural Language Model
    CNN + highway network + LSTM
    # Input:
        4D tensor with shape [batch_size, in_channel, height, width]
    # Output:
        2D tensor with shape [batch_size, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes,
        # kept in an nn.ModuleList so their parameters are registered
        self.convolutions = nn.ModuleList()

        # list of tuples: (the number of filters, width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,            # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2
        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu is True:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()
    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size] (batch_first=True)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len+2-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)
        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
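
As a usage illustration (not part of the original file), here is a minimal sketch of how the charLM network above might be exercised on random data. The module name char_language_model, the hyperparameter values, and the random input are assumptions that mirror the defaults in CharLM.__init__.

# Minimal sketch: run charLM once on random character indices.
import torch
from torch.autograd import Variable

from char_language_model import charLM  # assumed module name for this file

char_emb_dim = 15
word_emb_dim = 300
vocab_size = 100
num_char = 150
batch_size = 20      # number of sequences per batch
seq_len = 35         # words per sequence
max_word_len = 10    # characters per word before BOW/EOW are added

model = charLM(char_emb_dim, word_emb_dim, vocab_size, num_char, use_gpu=False)

# random character indices with shape [batch_size, seq_len, max_word_len + 2]
x = Variable(torch.LongTensor(batch_size, seq_len, max_word_len + 2).random_(0, num_char))

# initial LSTM state: (h_0, c_0), each [num_layers, batch_size, hidden_size]
hidden = (Variable(torch.zeros(2, batch_size, word_emb_dim)),
          Variable(torch.zeros(2, batch_size, word_emb_dim)))

output, hidden = model(x, hidden)
print(output.size())  # expected: [batch_size * seq_len, vocab_size]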

A lightweight natural language processing (NLP) toolkit whose goal is to reduce the engineering code in user projects, such as data-processing loops, training loops, and multi-GPU execution.