#!/usr/bin/env python
# Provenance: recovered from the git patch "New translations subset.py
# (Chinese Simplified)" (commit a5fcae41420caf3216a71b4c1ce4ab6b00de3572),
# which adds lang/zh/gklearn/gedlib/lib/libsvm.3.22/tools/subset.py.
"""Randomly select a subset of the lines of a dataset file.

Two selection methods are available:

* 0 -- stratified selection: lines are grouped by their first
  whitespace-separated token (the label) and every label contributes a
  share of the subset proportional to its frequency, with at least one
  line per label.
* 1 -- random selection: lines are drawn uniformly, ignoring labels.

The selected lines are written to the subset output (stdout by default)
and, optionally, all remaining lines are written to a second file.
"""

import math
import os
import random
import sys
from collections import defaultdict

# Keep the script runnable on both Python 2 and Python 3.
if sys.version_info[0] >= 3:
    xrange = range


def exit_with_help(argv):
    """Print the usage message and terminate with exit status 1.

    argv -- the command line; argv[0] is shown as the program name.
    """
    print("""\
Usage: {0} [options] dataset subset_size [output1] [output2]

This script randomly selects a subset of the dataset.

options:
-s method : method of selection (default 0)
     0 -- stratified selection (classification only)
     1 -- random selection

output1 : the subset (optional)
output2 : rest of the data (optional)
If output1 is omitted, the subset will be printed on the screen.""".format(argv[0]))
    # sys.exit is always available; the bare exit() helper is only
    # injected by the site module.
    sys.exit(1)


def process_options(argv):
    """Parse the command line.

    argv -- full argument list, program name first.

    Returns a tuple (dataset, subset_size, method, subset_file, rest_file):
    the dataset path, the requested number of lines, the selection method
    (0 or 1), a writable file for the subset (sys.stdout unless output1 is
    given) and a writable file for the rest of the data (None unless
    output2 is given).

    Prints the usage text and exits when required arguments are missing or
    the selection method is unknown. A non-integer method or subset size
    raises ValueError from int().
    """
    argc = len(argv)
    if argc < 3:
        exit_with_help(argv)

    # default method is stratified selection
    method = 0
    subset_file = sys.stdout
    rest_file = None

    i = 1
    while i < argc:
        # startswith() also copes with an empty-string argument, which
        # would make argv[i][0] raise IndexError.
        if not argv[i].startswith("-"):
            break
        if argv[i] == "-s":
            i += 1
            if i >= argc:
                # "-s" was the last token: its value is missing.
                exit_with_help(argv)
            method = int(argv[i])
            if method not in (0, 1):
                print("Unknown selection method {0}".format(method))
                exit_with_help(argv)
        i += 1

    # Options may have consumed everything; both positionals are required.
    if i + 1 >= argc:
        exit_with_help(argv)

    dataset = argv[i]
    subset_size = int(argv[i + 1])
    if i + 2 < argc:
        subset_file = open(argv[i + 2], 'w')
    if i + 3 < argc:
        rest_file = open(argv[i + 3], 'w')

    return dataset, subset_size, method, subset_file, rest_file


def random_selection(dataset, subset_size):
    """Return subset_size distinct line numbers drawn uniformly at random.

    dataset     -- path of the data file.
    subset_size -- number of lines to pick.

    The result is a sorted list of 0-based line numbers. random.sample
    raises ValueError when subset_size exceeds the number of lines.
    """
    with open(dataset, 'r') as data:
        line_count = sum(1 for _ in data)
    return sorted(random.sample(xrange(line_count), subset_size))


def stratified_selection(dataset, subset_size):
    """Return line numbers chosen so that every label is represented.

    dataset     -- path of the data file; the first whitespace-separated
                   token of each line is taken as its label.
    subset_size -- number of lines to pick.

    The result is a sorted list of 0-based line numbers. The process exits
    with status -1 (after writing an explanation to stderr) when the
    budget runs out before every label has received a line. A line with
    no tokens raises IndexError, and random.sample raises ValueError when
    a label is asked for more lines than it has.
    """
    with open(dataset) as data:
        labels = [line.split(None, 1)[0] for line in data]

    label_linenums = defaultdict(list)
    for linenum, label in enumerate(labels):
        label_linenums[label].append(linenum)

    total = len(labels)
    if total == 0:
        return []

    # Share of the whole dataset that the subset represents.
    fraction = float(subset_size) / total
    remaining = subset_size
    ret = []

    # classes with fewer data are sampled first; otherwise
    # some rare classes may not be selected
    for label in sorted(label_linenums, key=lambda x: len(label_linenums[x])):
        linenums = label_linenums[label]
        label_size = len(linenums)
        # at least one instance per class, never more than the budget left
        s = int(min(remaining, max(1, math.ceil(label_size * fraction))))
        if s == 0:
            sys.stderr.write('''\
Error: failed to have at least one instance per class
    1. You may have regression data.
    2. Your classification data is unbalanced or too small.
Please use -s 1.
''')
            sys.exit(-1)
        remaining -= s
        ret += random.sample(linenums, s)
    return sorted(ret)


def main(argv=sys.argv):
    """Run the tool: pick the line numbers, then split the dataset.

    argv -- full argument list, program name first.

    Selected lines go to the subset output in their original order; when a
    rest file was requested, every other line goes there.
    """
    dataset, subset_size, method, subset_file, rest_file = process_options(argv)
    # uncomment the following line to fix the random seed
    # random.seed(0)
    try:
        selected_lines = []
        if method == 0:
            selected_lines = stratified_selection(dataset, subset_size)
        elif method == 1:
            selected_lines = random_selection(dataset, subset_size)

        # select instances based on selected_lines
        wanted = set(selected_lines)
        last_wanted = selected_lines[-1] if selected_lines else -1
        with open(dataset, 'r') as data:
            for linenum, line in enumerate(data):
                if linenum in wanted:
                    subset_file.write(line)
                elif rest_file is not None:
                    rest_file.write(line)
                elif linenum > last_wanted:
                    # No rest file and nothing left to select.
                    break
    finally:
        # Only close what process_options opened: closing sys.stdout would
        # break any later output from the calling process.
        if subset_file is sys.stdout:
            subset_file.flush()
        else:
            subset_file.close()
        if rest_file is not None:
            rest_file.close()


if __name__ == '__main__':
    main(sys.argv)