
char_language_model.py 12 kB

import os
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel

USE_GPU = True


class CharLM(BaseModel):
    """
    Controller of the Character-level Neural Language Model
    To do:
        - where the data goes, call data savers.
    """
    DataTuple = namedtuple("DataTuple", ["feature", "label"])

    def __init__(self, lstm_batch_size, lstm_seq_len):
        super(CharLM, self).__init__()
        """
        Settings: should come from config loader or pre-processing
        """
        self.word_embed_dim = 300
        self.char_embedding_dim = 15
        self.cnn_batch_size = lstm_batch_size * lstm_seq_len
        self.lstm_seq_len = lstm_seq_len
        self.lstm_batch_size = lstm_batch_size
        self.num_epoch = 10
        self.old_PPL = 100000
        self.best_PPL = 100000

        """
        These parameters are set by pre-processing.
        """
        self.max_word_len = None
        self.num_char = None
        self.vocab_size = None
        self.preprocess("./data_for_tests/charlm.txt")

        self.data = None  # named tuple to store all data sets
        self.data_ready = False
        self.criterion = nn.CrossEntropyLoss()
        self._loss = None
        self.use_gpu = USE_GPU

        # word_emb_dim == hidden_size / num of hidden units
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

        self.model = charLM(self.char_embedding_dim,
                            self.word_embed_dim,
                            self.vocab_size,
                            self.num_char,
                            use_gpu=self.use_gpu)
        for param in self.model.parameters():
            nn.init.uniform(param.data, -0.05, 0.05)

        self.learning_rate = 0.1
        self.optimizer = None

    def prepare_input(self, raw_text):
        """
        :param raw_text: raw input text consisting of words
        :return: torch.Tensor, torch.Tensor
                 feature matrix, label vector
        This function is only called once in Trainer.train, but may be called multiple times
        in Tester.test, so Tester saves the test input for repeated calls.
        """
        if os.path.exists("cache/prep.pt") is False:
            self.preprocess("./data_for_tests/charlm.txt")  # To do: this is not good, needs fixing
        objects = torch.load("cache/prep.pt")
        word_dict = objects["word_dict"]
        char_dict = objects["char_dict"]
        max_word_len = self.max_word_len
        print("word/char dictionary built. Start making inputs.")

        words = raw_text
        input_vec = np.array(text2vec(words, char_dict, max_word_len))
        # Labels are the next-word indices in word_dict, with the same length as the inputs
        # (the last word is paired with itself as a placeholder label).
        input_label = np.array([word_dict[w] for w in words[1:]] + [word_dict[words[-1]]])
        feature_input = torch.from_numpy(input_vec)
        label_input = torch.from_numpy(input_label)
        return feature_input, label_input

    def mode(self, test=False):
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):
        """
        :param x: Tensor of size [num_words, max_word_len+2]
        :return: Tensor of size [num_words, vocab_size]
        """
        # additional processing of inputs after batching
        num_seq = x.size()[0] // self.lstm_seq_len
        x = x[:num_seq * self.lstm_seq_len, :]
        x = x.view(-1, self.lstm_seq_len, self.max_word_len + 2)

        # detach hidden state of LSTM from last batch
        hidden = [state.detach() for state in self.hidden]
        output, self.hidden = self.model(to_var(x), hidden)
        return output
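
    # Shape example (illustrative, not from the original file): with lstm_batch_size=20,
    # lstm_seq_len=35 and max_word_len=10, a batch of 700 words enters data_forward as
    # [700, 12], is reshaped to [20, 35, 12], and the returned logits are [700, vocab_size].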

    def grad_backward(self):
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()

    def get_loss(self, predict, truth):
        self._loss = self.criterion(predict, to_var(truth))
        return self._loss.data  # no pytorch data structure exposed outside

    def define_optimizer(self):
        # redefine optimizer for every new epoch
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)

    def save(self):
        print("network saved")
        # torch.save(self.model, "cache/model.pkl")

    def preprocess(self, all_text_files):
        word_dict, char_dict = create_word_char_dict(all_text_files)
        num_char = len(char_dict)
        self.vocab_size = len(word_dict)
        char_dict["BOW"] = num_char + 1
        char_dict["EOW"] = num_char + 2
        char_dict["PAD"] = 0
        self.num_char = num_char + 3
        # char_dict maps characters (str) to int indices, counting from 0 (PAD) upwards
        reverse_word_dict = {value: key for key, value in word_dict.items()}
        self.max_word_len = max([len(word) for word in word_dict])
        objects = {
            "word_dict": word_dict,
            "char_dict": char_dict,
            "reverse_word_dict": reverse_word_dict,
        }
        torch.save(objects, "cache/prep.pt")
        print("Preprocess done.")
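
    # Illustrative note (not in the original file): cache/prep.pt ends up holding
    #   word_dict:         word -> index in [0, vocab_size)
    #   char_dict:         char -> index, with PAD = 0 and BOW/EOW appended after the real characters
    #   reverse_word_dict: index -> word
    # so preprocess() must run once before prepare_input() can look words up.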
  127. """
  128. Global Functions
  129. """
  130. def batch_generator(x, batch_size):
  131. # x: [num_words, in_channel, height, width]
  132. # partitions x into batches
  133. num_step = x.size()[0] // batch_size
  134. for t in range(num_step):
  135. yield x[t * batch_size:(t + 1) * batch_size]
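
# Example (illustrative, not part of the original file): if x.size()[0] == 10 and
# batch_size == 4, batch_generator yields x[0:4] and x[4:8]; the trailing 2 rows are dropped.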


def text2vec(words, char_dict, max_word_len):
    """ Return list of list of int """
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec
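
# Example (illustrative, with a made-up char_dict): for char_dict = {"PAD": 0, "a": 1, "b": 2,
# "BOW": 3, "EOW": 4} and max_word_len = 3, text2vec(["ab"], char_dict, 3) returns
# [[3, 1, 2, 0, 4]]: each word is padded to max_word_len and wrapped with BOW/EOW,
# so every row has length max_word_len + 2.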


def read_data(file_name):
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    import re
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict


def to_var(x):
    if torch.cuda.is_available() and USE_GPU:
        x = x.cuda()
    return Variable(x)
  172. """
  173. Neural Network
  174. """
  175. class Highway(nn.Module):
  176. """Highway network"""
  177. def __init__(self, input_size):
  178. super(Highway, self).__init__()
  179. self.fc1 = nn.Linear(input_size, input_size, bias=True)
  180. self.fc2 = nn.Linear(input_size, input_size, bias=True)
  181. def forward(self, x):
  182. t = F.sigmoid(self.fc1(x))
  183. return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)
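
# Reading of the forward pass above (added note): with transform gate t = sigmoid(fc1(x)),
# the highway layer outputs t * relu(fc2(x)) + (1 - t) * x, i.e. it interpolates element-wise
# between a nonlinear transform of the input and the input itself.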


class charLM(nn.Module):
    """Character-level Neural Language Model
    CNN + highway network + LSTM
    # Input:
        4D tensor with shape [batch_size, in_channel, height, width]
    # Output:
        2D tensor with shape [batch_size, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different widths, kept in a ModuleList
        # so their parameters are registered with the module (and thus reached
        # by parameters(), initialization, and the optimizer)
        self.convolutions = nn.ModuleList()

        # list of tuples: (number of filters, width)
        # self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
        self.filter_num_width = [(25, 1), (50, 2), (75, 3)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,  # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2
        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu is True:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size] (batch_first=True)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)
        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
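
For context, here is a minimal driver sketch. It is not part of the original file: the batch sizes are illustrative, it assumes the cache/ directory, ./data_for_tests/charlm.txt and (with USE_GPU = True) a CUDA device are available, and in the real project CharLM is driven by the external Trainer/Tester mentioned in the docstrings above.

# Hypothetical driver; normally an external Trainer/Tester calls these methods.
model = CharLM(lstm_batch_size=20, lstm_seq_len=35)
model.define_optimizer()
model.mode(test=False)

words = read_data("./data_for_tests/charlm.txt")
feature, label = model.prepare_input(words)

step = model.lstm_batch_size * model.lstm_seq_len
for batch_x, batch_y in zip(batch_generator(feature, step),
                            batch_generator(label, step)):
    output = model.data_forward(batch_x)    # [step, vocab_size] logits
    loss = model.get_loss(output, batch_y)  # cross-entropy against next-word labels
    model.grad_backward()                   # clip gradients and take an SGD step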

A lightweight Natural Language Processing (NLP) toolkit whose goal is to reduce the engineering code in user projects, such as data-processing loops, training loops, and multi-GPU execution.