Browse Source

New translations subset.py (French)

l10n_v0.2.x
linlin 4 years ago
parent
commit
506f92fbca
1 changed files with 120 additions and 0 deletions
  1. +120
    -0
      lang/fr/gklearn/gedlib/lib/libsvm.3.22/tools/subset.py

+ 120
- 0
lang/fr/gklearn/gedlib/lib/libsvm.3.22/tools/subset.py View File

@@ -0,0 +1,120 @@
#!/usr/bin/env python

import os, sys, math, random
from collections import defaultdict

if sys.version_info[0] >= 3:
xrange = range

def exit_with_help(argv):
print("""\
Usage: {0} [options] dataset subset_size [output1] [output2]

This script randomly selects a subset of the dataset.

options:
-s method : method of selection (default 0)
0 -- stratified selection (classification only)
1 -- random selection

output1 : the subset (optional)
output2 : rest of the data (optional)
If output1 is omitted, the subset will be printed on the screen.""".format(argv[0]))
exit(1)

def process_options(argv):
argc = len(argv)
if argc < 3:
exit_with_help(argv)

# default method is stratified selection
method = 0
subset_file = sys.stdout
rest_file = None

i = 1
while i < argc:
if argv[i][0] != "-":
break
if argv[i] == "-s":
i = i + 1
method = int(argv[i])
if method not in [0,1]:
print("Unknown selection method {0}".format(method))
exit_with_help(argv)
i = i + 1

dataset = argv[i]
subset_size = int(argv[i+1])
if i+2 < argc:
subset_file = open(argv[i+2],'w')
if i+3 < argc:
rest_file = open(argv[i+3],'w')

return dataset, subset_size, method, subset_file, rest_file

def random_selection(dataset, subset_size):
l = sum(1 for line in open(dataset,'r'))
return sorted(random.sample(xrange(l), subset_size))

def stratified_selection(dataset, subset_size):
labels = [line.split(None,1)[0] for line in open(dataset)]
label_linenums = defaultdict(list)
for i, label in enumerate(labels):
label_linenums[label] += [i]

l = len(labels)
remaining = subset_size
ret = []

# classes with fewer data are sampled first; otherwise
# some rare classes may not be selected
for label in sorted(label_linenums, key=lambda x: len(label_linenums[x])):
linenums = label_linenums[label]
label_size = len(linenums)
# at least one instance per class
s = int(min(remaining, max(1, math.ceil(label_size*(float(subset_size)/l)))))
if s == 0:
sys.stderr.write('''\
Error: failed to have at least one instance per class
1. You may have regression data.
2. Your classification data is unbalanced or too small.
Please use -s 1.
''')
sys.exit(-1)
remaining -= s
ret += [linenums[i] for i in random.sample(xrange(label_size), s)]
return sorted(ret)

def main(argv=sys.argv):
dataset, subset_size, method, subset_file, rest_file = process_options(argv)
#uncomment the following line to fix the random seed
#random.seed(0)
selected_lines = []

if method == 0:
selected_lines = stratified_selection(dataset, subset_size)
elif method == 1:
selected_lines = random_selection(dataset, subset_size)

#select instances based on selected_lines
dataset = open(dataset,'r')
prev_selected_linenum = -1
for i in xrange(len(selected_lines)):
for cnt in xrange(selected_lines[i]-prev_selected_linenum-1):
line = dataset.readline()
if rest_file:
rest_file.write(line)
subset_file.write(dataset.readline())
prev_selected_linenum = selected_lines[i]
subset_file.close()

if rest_file:
for line in dataset:
rest_file.write(line)
rest_file.close()
dataset.close()

if __name__ == '__main__':
main(sys.argv)


Loading…
Cancel
Save