You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

subset.py 3.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. #!/usr/bin/env python
  2. import os, sys, math, random
  3. from collections import defaultdict
  4. if sys.version_info[0] >= 3:
  5. xrange = range
  6. def exit_with_help(argv):
  7. print("""\
  8. Usage: {0} [options] dataset subset_size [output1] [output2]
  9. This script randomly selects a subset of the dataset.
  10. options:
  11. -s method : method of selection (default 0)
  12. 0 -- stratified selection (classification only)
  13. 1 -- random selection
  14. output1 : the subset (optional)
  15. output2 : rest of the data (optional)
  16. If output1 is omitted, the subset will be printed on the screen.""".format(argv[0]))
  17. exit(1)
  18. def process_options(argv):
  19. argc = len(argv)
  20. if argc < 3:
  21. exit_with_help(argv)
  22. # default method is stratified selection
  23. method = 0
  24. subset_file = sys.stdout
  25. rest_file = None
  26. i = 1
  27. while i < argc:
  28. if argv[i][0] != "-":
  29. break
  30. if argv[i] == "-s":
  31. i = i + 1
  32. method = int(argv[i])
  33. if method not in [0,1]:
  34. print("Unknown selection method {0}".format(method))
  35. exit_with_help(argv)
  36. i = i + 1
  37. dataset = argv[i]
  38. subset_size = int(argv[i+1])
  39. if i+2 < argc:
  40. subset_file = open(argv[i+2],'w')
  41. if i+3 < argc:
  42. rest_file = open(argv[i+3],'w')
  43. return dataset, subset_size, method, subset_file, rest_file
  44. def random_selection(dataset, subset_size):
  45. l = sum(1 for line in open(dataset,'r'))
  46. return sorted(random.sample(xrange(l), subset_size))
  47. def stratified_selection(dataset, subset_size):
  48. labels = [line.split(None,1)[0] for line in open(dataset)]
  49. label_linenums = defaultdict(list)
  50. for i, label in enumerate(labels):
  51. label_linenums[label] += [i]
  52. l = len(labels)
  53. remaining = subset_size
  54. ret = []
  55. # classes with fewer data are sampled first; otherwise
  56. # some rare classes may not be selected
  57. for label in sorted(label_linenums, key=lambda x: len(label_linenums[x])):
  58. linenums = label_linenums[label]
  59. label_size = len(linenums)
  60. # at least one instance per class
  61. s = int(min(remaining, max(1, math.ceil(label_size*(float(subset_size)/l)))))
  62. if s == 0:
  63. sys.stderr.write('''\
  64. Error: failed to have at least one instance per class
  65. 1. You may have regression data.
  66. 2. Your classification data is unbalanced or too small.
  67. Please use -s 1.
  68. ''')
  69. sys.exit(-1)
  70. remaining -= s
  71. ret += [linenums[i] for i in random.sample(xrange(label_size), s)]
  72. return sorted(ret)
  73. def main(argv=sys.argv):
  74. dataset, subset_size, method, subset_file, rest_file = process_options(argv)
  75. #uncomment the following line to fix the random seed
  76. #random.seed(0)
  77. selected_lines = []
  78. if method == 0:
  79. selected_lines = stratified_selection(dataset, subset_size)
  80. elif method == 1:
  81. selected_lines = random_selection(dataset, subset_size)
  82. #select instances based on selected_lines
  83. dataset = open(dataset,'r')
  84. prev_selected_linenum = -1
  85. for i in xrange(len(selected_lines)):
  86. for cnt in xrange(selected_lines[i]-prev_selected_linenum-1):
  87. line = dataset.readline()
  88. if rest_file:
  89. rest_file.write(line)
  90. subset_file.write(dataset.readline())
  91. prev_selected_linenum = selected_lines[i]
  92. subset_file.close()
  93. if rest_file:
  94. for line in dataset:
  95. rest_file.write(line)
  96. rest_file.close()
  97. dataset.close()
  98. if __name__ == '__main__':
  99. main(sys.argv)

A Python package for graph kernels, graph edit distances and graph pre-image problem.