First commit
This commit is contained in:
120
libsvm-3.36/tools/subset.py
Executable file
120
libsvm-3.36/tools/subset.py
Executable file
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os, sys, math, random
|
||||
from collections import defaultdict
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
xrange = range
|
||||
|
||||
def exit_with_help(argv):
|
||||
print("""\
|
||||
Usage: {0} [options] dataset subset_size [output1] [output2]
|
||||
|
||||
This script randomly selects a subset of the dataset.
|
||||
|
||||
options:
|
||||
-s method : method of selection (default 0)
|
||||
0 -- stratified selection (classification only)
|
||||
1 -- random selection
|
||||
|
||||
output1 : the subset (optional)
|
||||
output2 : rest of the data (optional)
|
||||
If output1 is omitted, the subset will be printed on the screen.""".format(argv[0]))
|
||||
exit(1)
|
||||
|
||||
def process_options(argv):
|
||||
argc = len(argv)
|
||||
if argc < 3:
|
||||
exit_with_help(argv)
|
||||
|
||||
# default method is stratified selection
|
||||
method = 0
|
||||
subset_file = sys.stdout
|
||||
rest_file = None
|
||||
|
||||
i = 1
|
||||
while i < argc:
|
||||
if argv[i][0] != "-":
|
||||
break
|
||||
if argv[i] == "-s":
|
||||
i = i + 1
|
||||
method = int(argv[i])
|
||||
if method not in [0,1]:
|
||||
print("Unknown selection method {0}".format(method))
|
||||
exit_with_help(argv)
|
||||
i = i + 1
|
||||
|
||||
dataset = argv[i]
|
||||
subset_size = int(argv[i+1])
|
||||
if i+2 < argc:
|
||||
subset_file = open(argv[i+2],'w')
|
||||
if i+3 < argc:
|
||||
rest_file = open(argv[i+3],'w')
|
||||
|
||||
return dataset, subset_size, method, subset_file, rest_file
|
||||
|
||||
def random_selection(dataset, subset_size):
|
||||
l = sum(1 for line in open(dataset,'r'))
|
||||
return sorted(random.sample(xrange(l), subset_size))
|
||||
|
||||
def stratified_selection(dataset, subset_size):
|
||||
labels = [line.split(None,1)[0] for line in open(dataset)]
|
||||
label_linenums = defaultdict(list)
|
||||
for i, label in enumerate(labels):
|
||||
label_linenums[label] += [i]
|
||||
|
||||
l = len(labels)
|
||||
remaining = subset_size
|
||||
ret = []
|
||||
|
||||
# classes with fewer data are sampled first; otherwise
|
||||
# some rare classes may not be selected
|
||||
for label in sorted(label_linenums, key=lambda x: len(label_linenums[x])):
|
||||
linenums = label_linenums[label]
|
||||
label_size = len(linenums)
|
||||
# at least one instance per class
|
||||
s = int(min(remaining, max(1, math.ceil(label_size*(float(subset_size)/l)))))
|
||||
if s == 0:
|
||||
sys.stderr.write('''\
|
||||
Error: failed to have at least one instance per class
|
||||
1. You may have regression data.
|
||||
2. Your classification data is unbalanced or too small.
|
||||
Please use -s 1.
|
||||
''')
|
||||
sys.exit(-1)
|
||||
remaining -= s
|
||||
ret += [linenums[i] for i in random.sample(xrange(label_size), s)]
|
||||
return sorted(ret)
|
||||
|
||||
def main(argv=sys.argv):
|
||||
dataset, subset_size, method, subset_file, rest_file = process_options(argv)
|
||||
#uncomment the following line to fix the random seed
|
||||
#random.seed(0)
|
||||
selected_lines = []
|
||||
|
||||
if method == 0:
|
||||
selected_lines = stratified_selection(dataset, subset_size)
|
||||
elif method == 1:
|
||||
selected_lines = random_selection(dataset, subset_size)
|
||||
|
||||
#select instances based on selected_lines
|
||||
dataset = open(dataset,'r')
|
||||
prev_selected_linenum = -1
|
||||
for i in xrange(len(selected_lines)):
|
||||
for cnt in xrange(selected_lines[i]-prev_selected_linenum-1):
|
||||
line = dataset.readline()
|
||||
if rest_file:
|
||||
rest_file.write(line)
|
||||
subset_file.write(dataset.readline())
|
||||
prev_selected_linenum = selected_lines[i]
|
||||
subset_file.close()
|
||||
|
||||
if rest_file:
|
||||
for line in dataset:
|
||||
rest_file.write(line)
|
||||
rest_file.close()
|
||||
dataset.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv)
|
||||
|
Reference in New Issue
Block a user