First commit
This commit is contained in:
108
libsvm-3.36/tools/checkdata.py
Executable file
108
libsvm-3.36/tools/checkdata.py
Executable file
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#
|
||||
# A format checker for LIBSVM
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2007, Rong-En Fan
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# This program is distributed under the same license of the LIBSVM package.
|
||||
#
|
||||
|
||||
from sys import argv, exit
|
||||
import os.path
|
||||
|
||||
def err(line_no, msg):
    """Report a format problem found on one line of the dataset.

    Args:
        line_no: 1-based number of the offending line.
        msg: Human-readable description of the problem.

    The report is written to standard output as ``line <line_no>: <msg>``.
    """
    print("line {0}: {1}".format(line_no, msg))
|
||||
|
||||
# works like float() but does not accept nan and inf
|
||||
def my_float(x):
    """Convert the string *x* to a float, rejecting NaN and infinities.

    Args:
        x: Text of the number to parse.

    Returns:
        The value of ``float(x)``.

    Raises:
        ValueError: If *x* contains "nan" or "inf" in any letter case
            (this also covers "infinity"), or if ``float(x)`` cannot
            parse it.
    """
    # Lower-case once; float() itself would happily accept these spellings.
    lowered = x.lower()
    if "nan" in lowered or "inf" in lowered:
        raise ValueError

    return float(x)
|
||||
|
||||
def _check_label(line_no, nodes):
    """Validate and remove the leading label token of one dataset line.

    Args:
        line_no: 1-based line number, used only for error reports.
        nodes: Whitespace-separated tokens of the line. The first token
            (the label) is popped, so afterwards only features remain.

    Returns:
        True if the label is valid, False if a problem was reported.
    """
    if not nodes:
        err(line_no, "missing label, perhaps an empty line?")
        return False

    label = nodes.pop(0)
    if ',' in label:
        # multi-label format: every comma-separated part must be a number
        try:
            for part in label.split(','):
                my_float(part)
        except ValueError:
            err(line_no, "label {0} is not a valid multi-label form".format(label))
            return False
    else:
        try:
            my_float(label)
        except ValueError:
            err(line_no, "label {0} is not a number".format(label))
            return False
    return True


def _check_features(line_no, features):
    """Validate the ``<index>:<value>`` tokens of one dataset line.

    Args:
        line_no: 1-based line number, used only for error reports.
        features: Feature tokens of the line (label already removed).

    Returns:
        True if every feature is well formed and the indices are
        non-negative and strictly ascending, False otherwise.
    """
    ok = True
    prev_index = -1
    # Text of the feature that produced prev_index, so the ordering
    # message names the feature actually compared against.
    prev_feature = None
    for feature in features:
        try:
            (index, value) = feature.split(':')
            index = int(index)
            my_float(value)
        except ValueError:
            err(line_no, "feature '{0}' not an <index>:<value> pair, <index> integer, <value> real number ".format(feature))
            ok = False
            continue

        # precomputed kernel's index starts from 0 and LIBSVM
        # checks it. Hence, don't treat index 0 as an error.
        if index < 0:
            err(line_no, "feature index must be positive; wrong feature {0}".format(feature))
            ok = False
        elif index <= prev_index:
            err(line_no, "feature indices must be in an ascending order, previous/current features {0} {1}".format(prev_feature, feature))
            ok = False
        prev_index = index
        prev_feature = feature
    return ok


def main():
    """Check that the file named on the command line is in LIBSVM format.

    Expects exactly one command-line argument, the dataset path. Prints a
    usage or "not found" message and exits with status 1 when the
    argument is missing or the path does not exist.

    Each line is checked for a trailing newline, a numeric label (or a
    comma-separated multi-label) and ``<index>:<value>`` features with
    non-negative, strictly ascending integer indices. Every problem is
    reported through ``err``.

    Returns:
        0 if no line has an error, 1 otherwise.
    """
    if len(argv) != 2:
        print("Usage: {0} dataset".format(argv[0]))
        exit(1)

    dataset = argv[1]

    if not os.path.exists(dataset):
        print("dataset {0} not found".format(dataset))
        exit(1)

    error_line_count = 0
    # 'with' guarantees the file is closed even if checking is interrupted.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            line_error = False

            # each line must end with a newline character
            if not line.endswith('\n'):
                err(line_no, "missing a newline character in the end")
                line_error = True

            nodes = line.split()

            # check label (removes it from nodes)
            if not _check_label(line_no, nodes):
                line_error = True

            # check features
            if not _check_features(line_no, nodes):
                line_error = True

            if line_error:
                error_line_count += 1

    if error_line_count > 0:
        print("Found {0} lines with error.".format(error_line_count))
        return 1
    else:
        print("No error.")
        return 0
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value (0 = no error, 1 = errors found) as the
    # process exit status.
    exit(main())
|
Reference in New Issue
Block a user