You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

checkdata.py 2.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. #!/usr/bin/env python
  2. #
  3. # A format checker for LIBSVM
  4. #
  5. #
  6. # Copyright (c) 2007, Rong-En Fan
  7. #
  8. # All rights reserved.
  9. #
  10. # This program is distributed under the same license of the LIBSVM package.
  11. #
  12. from sys import argv, exit
  13. import os.path
  14. def err(line_no, msg):
  15. print("line {0}: {1}".format(line_no, msg))
  16. # works like float() but does not accept nan and inf
  17. def my_float(x):
  18. if x.lower().find("nan") != -1 or x.lower().find("inf") != -1:
  19. raise ValueError
  20. return float(x)
  21. def main():
  22. if len(argv) != 2:
  23. print("Usage: {0} dataset".format(argv[0]))
  24. exit(1)
  25. dataset = argv[1]
  26. if not os.path.exists(dataset):
  27. print("dataset {0} not found".format(dataset))
  28. exit(1)
  29. line_no = 1
  30. error_line_count = 0
  31. for line in open(dataset, 'r'):
  32. line_error = False
  33. # each line must end with a newline character
  34. if line[-1] != '\n':
  35. err(line_no, "missing a newline character in the end")
  36. line_error = True
  37. nodes = line.split()
  38. # check label
  39. try:
  40. label = nodes.pop(0)
  41. if label.find(',') != -1:
  42. # multi-label format
  43. try:
  44. for l in label.split(','):
  45. l = my_float(l)
  46. except:
  47. err(line_no, "label {0} is not a valid multi-label form".format(label))
  48. line_error = True
  49. else:
  50. try:
  51. label = my_float(label)
  52. except:
  53. err(line_no, "label {0} is not a number".format(label))
  54. line_error = True
  55. except:
  56. err(line_no, "missing label, perhaps an empty line?")
  57. line_error = True
  58. # check features
  59. prev_index = -1
  60. for i in range(len(nodes)):
  61. try:
  62. (index, value) = nodes[i].split(':')
  63. index = int(index)
  64. value = my_float(value)
  65. # precomputed kernel's index starts from 0 and LIBSVM
  66. # checks it. Hence, don't treat index 0 as an error.
  67. if index < 0:
  68. err(line_no, "feature index must be positive; wrong feature {0}".format(nodes[i]))
  69. line_error = True
  70. elif index <= prev_index:
  71. err(line_no, "feature indices must be in an ascending order, previous/current features {0} {1}".format(nodes[i-1], nodes[i]))
  72. line_error = True
  73. prev_index = index
  74. except:
  75. err(line_no, "feature '{0}' not an <index>:<value> pair, <index> integer, <value> real number ".format(nodes[i]))
  76. line_error = True
  77. line_no += 1
  78. if line_error:
  79. error_line_count += 1
  80. if error_line_count > 0:
  81. print("Found {0} lines with error.".format(error_line_count))
  82. return 1
  83. else:
  84. print("No error.")
  85. return 0
  86. if __name__ == "__main__":
  87. exit(main())

A Python package for graph kernels, graph edit distances and graph pre-image problem.