
preprocess.py
import os
import pickle
import json
import nltk
from nltk.tokenize import stanford

# First pass (already run, hence commented out): stream the raw Yelp
# dataset and pickle it in chunks of 5000 (stars, text) pairs.
# f = open('dataset/review.json', encoding='utf-8')
# samples = []
# j = 0
# for i, line in enumerate(f):
#     review = json.loads(line)
#     samples.append((review['stars'], review['text']))
#     if (i + 1) % 5000 == 0:
#         print(i)
#         pickle.dump(samples, open('review/samples%d.pkl' % j, 'wb'))
#         j += 1
#         samples = []
# pickle.dump(samples, open('review/samples%d.pkl' % j, 'wb'))  # last partial chunk

# Sanity check: load the first chunk and inspect one sample.
samples = pickle.load(open('review/samples0.pkl', 'rb'))
# print(samples[0])

# CoreNLPTokenizer (deprecated in newer NLTK in favor of
# nltk.parse.corenlp.CoreNLPParser) sends requests to a CoreNLP server;
# path_to_jar records which distribution was used but is not read here.
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()

dirname = 'review'    # input: pickled (stars, raw text) chunks
dirname1 = 'reviews'  # output: pickled (stars, tokenized sentences) chunks
os.makedirs(dirname1, exist_ok=True)

# Second pass: sentence-split each review with NLTK, tokenize each sentence
# with CoreNLP, and pickle the results chunk by chunk.
for fn in os.listdir(dirname):
    print(fn)
    processed = []
    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
        tokens = []
        for s in nltk.tokenize.sent_tokenize(text):
            tokens.append(tokenizer.tokenize(s))
        processed.append((stars, tokens))
        if len(processed) % 100 == 0:
            print(len(processed))  # progress indicator
    pickle.dump(processed, open(os.path.join(dirname1, fn), 'wb'))
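
The script assumes a CoreNLP server is already listening on CoreNLPTokenizer's default address (http://localhost:9000); it does not launch one from path_to_jar. Below is a minimal sketch of starting such a server from Python, reusing the distribution directory from the script; the memory flag, port, and timeout values are assumptions, not anything the original specifies:

import subprocess

# Assumption: standard CoreNLP server invocation, run from the directory
# containing the CoreNLP jars; port 9000 matches CoreNLPTokenizer's default.
server = subprocess.Popen(
    ['java', '-mx4g', '-cp', '*',
     'edu.stanford.nlp.pipeline.StanfordCoreNLPServer',
     '-port', '9000', '-timeout', '15000'],
    cwd='E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27',
)

Each output pickle in reviews/ then holds (stars, tokens) pairs, where tokens is a list of token lists, one per sentence:

processed = pickle.load(open('reviews/samples0.pkl', 'rb'))
stars, sents = processed[0]  # sents[0] is the token list of the first sentence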

A lightweight natural language processing (NLP) toolkit that aims to reduce the amount of engineering code in user projects, such as data-processing loops, training loops, and multi-GPU execution.