
char_language_model.py 12 kB

import os
import re
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel

class CharLM(BaseModel):
    """
    Controller of the Character-level Neural Language Model.
    To do:
        - decide where the data goes and call the data savers there.
    """

    def __init__(self):
        super(CharLM, self).__init__()

        # Settings
        self.word_embed_dim = 300
        self.char_embedding_dim = 15
        self.cnn_batch_size = 700
        self.lstm_seq_len = 35
        self.lstm_batch_size = 20
        self.vocab_size = 100
        self.num_char = 150
        self.max_word_len = 10
        self.num_epoch = 10
        self.old_PPL = 100000
        self.best_PPL = 100000
        self.data = None  # named tuple to store all data sets
        self.data_ready = False
        self.criterion = nn.CrossEntropyLoss()
        # stored as `_loss` so it does not shadow the loss() method below
        self._loss = None
        self.use_gpu = False

        # word_emb_dim == hidden_size / num of hidden units
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

        self.model = charLM(self.char_embedding_dim,
                            self.word_embed_dim,
                            self.vocab_size,
                            self.num_char,
                            use_gpu=self.use_gpu)
        for param in self.model.parameters():
            nn.init.uniform_(param.data, -0.05, 0.05)

        self.learning_rate = 0.1
        self.optimizer = None

    def prepare_input(self, raw_text):
        """
        :param raw_text: raw input data
        :return: torch.Tensor, torch.Tensor
                 feature matrix, label vector
        """
        if not self.data_ready:
            # To do: These need to be dropped out from here. (below)
            if os.path.exists("cache/prep.pt") is False:
                self.preprocess()
            objects = torch.load("cache/prep.pt")
            word_dict = objects["word_dict"]
            char_dict = objects["char_dict"]
            max_word_len = objects["max_word_len"]
            self.data_ready = True
            print("word/char dictionary built. Start making inputs.")

            if os.path.exists("cache/data_sets.pt") is False:
                train_text = read_data("./train.txt")
                valid_text = read_data("./valid.txt")
                test_text = read_data("./tests.txt")
                # To do: These need to be dropped out from here. (above)

                input_vec = np.array(text2vec(raw_text, char_dict, max_word_len))
                # Labels are next-word index in word_dict with the same length as inputs
                input_label = np.array([word_dict[w] for w in raw_text[1:]] + [word_dict[raw_text[-1]]])

                category = {"features": input_vec, "label": input_label}
                torch.save(category, "cache/data_sets.pt")
            else:
                data_sets = torch.load("cache/data_sets.pt")
                input_vec = data_sets["features"]
                input_label = data_sets["label"]

            DataTuple = namedtuple("DataTuple", ["feature", "label"])
            self.data = DataTuple(feature=input_vec, label=input_label)

        feature_input = torch.from_numpy(self.data.feature)
        label_input = torch.from_numpy(self.data.label)
        num_seq = feature_input.size()[0] // self.lstm_seq_len
        feature_input = feature_input[:num_seq * self.lstm_seq_len, :]
        feature_input = feature_input.view(-1, self.lstm_seq_len, self.max_word_len + 2)
        self.num_iter_per_epoch = feature_input.size()[0] // self.lstm_batch_size
        return feature_input, label_input

    def mode(self, test=False):
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):
        # detach hidden state of LSTM from last batch
        hidden = [state.detach() for state in self.hidden]
        output, self.hidden = self.model(to_var(x), hidden)
        return output

    def grad_backward(self):
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()

    def loss(self, predict, truth):
        # keep the loss on `_loss` so grad_backward() can call backward() on it
        self._loss = self.criterion(predict, to_var(truth))
        return self._loss

    def define_optimizer(self):
        # redefine optimizer for every new epoch
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)

    def save(self):
        torch.save(self.model, "cache/model.pkl")

    @staticmethod
    def preprocess():
        word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "tests.txt")
        num_char = len(char_dict)
        char_dict["BOW"] = num_char + 1
        char_dict["EOW"] = num_char + 2
        char_dict["PAD"] = 0
        # dict of (int, string)
        reverse_word_dict = {value: key for key, value in word_dict.items()}
        max_word_len = max([len(word) for word in word_dict])
        objects = {
            "word_dict": word_dict,
            "char_dict": char_dict,
            "reverse_word_dict": reverse_word_dict,
            "max_word_len": max_word_len
        }
        torch.save(objects, "cache/prep.pt")
        print("Preprocess done.")

  129. """
  130. Global Functions
  131. """
  132. def batch_generator(x, batch_size):
  133. # x: [num_words, in_channel, height, width]
  134. # partitions x into batches
  135. num_step = x.size()[0] // batch_size
  136. for t in range(num_step):
  137. yield x[t * batch_size:(t + 1) * batch_size]
  138. def text2vec(words, char_dict, max_word_len):
  139. """ Return list of list of int """
  140. word_vec = []
  141. for word in words:
  142. vec = [char_dict[ch] for ch in word]
  143. if len(vec) < max_word_len:
  144. vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
  145. vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
  146. word_vec.append(vec)
  147. return word_vec
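# Illustrative example for text2vec (the toy char_dict below is an assumption,
# not data from this repository):
#   char_dict = {"PAD": 0, "a": 1, "b": 2, "BOW": 3, "EOW": 4}
#   text2vec(["ab", "a"], char_dict, max_word_len=3)
#   -> [[3, 1, 2, 0, 4], [3, 1, 0, 0, 4]]
# Each vector has length max_word_len + 2: BOW + characters (PAD-padded) + EOW.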
def read_data(file_name):
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)

class Highway(nn.Module):
    """Highway network: y = t * relu(fc2(x)) + (1 - t) * x, with gate t = sigmoid(fc1(x))."""

    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        t = torch.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)

class charLM(nn.Module):
    """Character-level Neural Language Model
    CNN + highway network + LSTM
    # Input:
        3D tensor with shape [num_seq, seq_len, max_word_len+2]
    # Output:
        2D tensor with shape [num_seq*seq_len, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes;
        # nn.ModuleList registers the conv parameters with this module
        self.convolutions = nn.ModuleList()

        # list of tuples: (the number of filters, width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,            # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2
        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu is True:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size] (batch_first=True)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = torch.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
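

if __name__ == "__main__":
    # Minimal shape sanity check for charLM: a sketch added for illustration,
    # not part of the original training pipeline; every size below is an
    # assumed toy value, not one taken from the data set.
    num_char, char_emb_dim, word_emb_dim, vocab_size = 60, 15, 300, 100
    max_word_len, seq_len, num_seq = 10, 35, 4

    net = charLM(char_emb_dim, word_emb_dim, vocab_size, num_char, use_gpu=False)
    x = torch.randint(0, num_char, (num_seq, seq_len, max_word_len + 2)).long()
    hidden = (to_var(torch.zeros(2, num_seq, word_emb_dim)),
              to_var(torch.zeros(2, num_seq, word_emb_dim)))

    out, hidden = net(to_var(x), hidden)
    print(out.size())  # expected: [num_seq * seq_len, vocab_size] == [140, 100]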

A lightweight Natural Language Processing (NLP) toolkit that aims to reduce the amount of engineering code in users' projects, such as data-processing loops, training loops, and multi-GPU execution.