
validate.py 62 kB

first commit · 4 years ago
  1. #!/usr/bin/env python3
  2. #
  3. # This script validates that problem and dataset descriptions match
  4. # standards and conventions (schemas, naming and directory structure, etc.).
  5. #
  6. # This script expects that there is a clone of the "data-supply"
  7. # repository in the same directory as this script.
  8. #
  9. # Checks done by this script:
  10. # - Dataset description validates according to its schema.
  11. # - Problem description validates according to its schema.
  12. # - Dataset description filename should be "datasetDoc.json".
  13. # - Problem description filename should be "problemDoc.json".
  14. # - There should be no duplicate dataset IDs or problem IDs.
  15. # - Dataset directory names should match the dataset IDs, and be under
  16. # a matching parent directory based on that ID (where ID should
  17. # have an expected suffix).
  18. # - All problem descriptions for dataset views/splits should be the same.
  19. # - Dataset splits should match in ID the original dataset based on the directory
  20. # structure they are in, but have a "TEST", "TRAIN", or "SCORE" suffix (see the example layout sketched below).
  21. # - Problem descriptions should reference existing datasets and columns.
  22. # - Dataset and problem descriptions should be (almost) equal between splits.
  23. # - Clustering problems require numClusters in target specifications.
  24. # - Clustering problems should not have data splitting configuration.
  25. # - Test and train split of datasets used in clustering problems should be the same.
  26. # - Require dataset digest.
  27. # - Dataset entry points should have "learningData" as resource ID.
  28. # - Problem descriptions using "f1", "precision", "recall", and "jaccardSimilarityScore"
  29. # metrics should have only two distinct values in target columns, have "posLabel" provided,
  30. # and that "posLabel" value should be among target values.
  31. # - No other metric should have "posLabel" set.
  32. # - "hammingLoss" metric can be used only with multi-label problems.
  33. # - "precisionAtTopK" should be used only with forecasting.
  34. # - Problem descriptions should have only one target, except for multi-variate
  35. # problems, which should have more than one target, and object detection problems, which can have one or two.
  36. # - Dataset entry point cannot be a collection.
  37. # - Dataset entry point has to have columns metadata.
  38. # - There is at most one "index" or "multiIndex" column per resource.
  39. # - "index" and "multiIndex" cannot be set at the same time.
  40. # - Dataset entry point is required to have an "index" or "multiIndex" column.
  41. # - Columns cannot be both "index" and "key" at the same time.
  42. # - Columns cannot be both "multiIndex" and "key" at the same time.
  43. # - "index" columns have to have unique values and no missing values.
  44. # - "multiIndex" columns have to have no missing values.
  45. # - "key" columns have to have unique values.
  46. # - Every metric should be listed only once in a problem description.
  47. # - Some task keywords can be used only together with corresponding other task keywords.
  48. # - All resource formats used by a resource should be from the standard list.
  49. # - All files used in a collection resource should have a file extension belonging to a
  50. # resource format from the standard list.
  51. # - Collection resource should contain at least one file.
  52. # - Resource path of a collection resource should end with "/".
  53. # - Any file referenced in a collection resource must exist.
  54. # - On edgelist resources, both an "edgeSource" and an "edgeTarget" column should exist in the
  55. # same resource, only one of each. Each should have two additional column roles for direction
  56. # and simple/multi, and those should match between the source and target columns (both directed
  57. # or both undirected, and both simple or both multi, but not mixed).
  58. # - When there is a "multiIndex" column, all rows for the same index value should have the same
  59. # values in all columns except "suggestedTarget" columns.
  60. # - Makes sure that "columnsCount" matches the number of columns, when it exists.
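#
# As a rough illustration of the strict naming convention checked below (path
# components like <base_id> are placeholders), split dataset descriptions are
# expected at locations such as:
#
#   <base_id>/TRAIN/dataset_TRAIN/datasetDoc.json   (dataset ID "<base_id>_dataset_TRAIN")
#   <base_id>/TEST/dataset_TEST/datasetDoc.json     (dataset ID "<base_id>_dataset_TEST")
#   <base_id>/SCORE/dataset_SCORE/datasetDoc.json   (dataset ID "<base_id>_dataset_SCORE";
#                                                    a SCORE split may also sit in a "dataset_TEST" directory)
#
# and an unsplit dataset at:
#
#   <base_id>/<base_id>_dataset/datasetDoc.json     (dataset ID "<base_id>_dataset")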
  61. import argparse
  62. import collections
  63. import copy
  64. import functools
  65. import json
  66. import traceback
  67. import os
  68. import os.path
  69. import sys
  70. import cerberus
  71. import deep_dircmp
  72. import pandas
  73. LIMIT_OUTPUT = 10
  74. EDGELIST_COLUMN_ROLES = [
  75. 'edgeSource',
  76. 'directedEdgeSource',
  77. 'undirectedEdgeSource',
  78. 'multiEdgeSource',
  79. 'simpleEdgeSource',
  80. 'edgeTarget',
  81. 'directedEdgeTarget',
  82. 'undirectedEdgeTarget',
  83. 'multiEdgeTarget',
  84. 'simpleEdgeTarget',
  85. ]
  86. if not os.path.exists(os.path.join(os.path.dirname(__file__), 'data-supply')):
  87. raise Exception("\"data-supply\" directory is missing. You should clone the repository to be in the same directory as this script.")
  88. with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'schemas', 'datasetSchema.json')) as dataset_description_schema_file:
  89. dataset_description_validator = cerberus.Validator(json.load(dataset_description_schema_file))
  90. with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'schemas', 'problemSchema.json')) as problem_description_schema_file:
  91. problem_description_validator = cerberus.Validator(json.load(problem_description_schema_file))
  92. with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'documentation', 'supportedResourceTypesFormats.json')) as supported_resource_types_formats_file:
  93. supported_resource_types_formats = json.load(supported_resource_types_formats_file)
  94. res_format_to_extensions = {}
  95. for supported_resource in supported_resource_types_formats['supported_resource_types_and_formats']:
  96. for res_format, extensions in supported_resource['resFormat'].items():
  97. if res_format not in res_format_to_extensions:
  98. res_format_to_extensions[res_format] = sorted(set(extensions))
  99. else:
  100. res_format_to_extensions[res_format] = sorted(set(extensions) | set(res_format_to_extensions[res_format]))
  101. @functools.lru_cache(maxsize=10)
  102. def read_csv(data_path):
  103. return pandas.read_csv(
  104. data_path,
  105. # We do not want to do any conversion of values.
  106. dtype=str,
  107. # We always expect one row header.
  108. header=0,
  109. # We want empty strings and not NaNs.
  110. na_filter=False,
  111. encoding='utf8',
  112. )
  113. def validate_dataset_path(description_id, description_path, *, strict_naming=True):
  114. if os.path.basename(description_path) != 'datasetDoc.json':
  115. print("ERROR: Dataset description filename is not 'datasetDoc.json'.")
  116. return True
  117. if strict_naming:
  118. split_path = os.path.dirname(description_path).split(os.sep)
  119. for suffix in ['_dataset_TEST', '_dataset_TRAIN', '_dataset_SCORE']:
  120. if description_id.endswith(suffix):
  121. expected_paths = [[description_id[:-len(suffix)], suffix[len('_dataset_'):], suffix[1:]]]
  122. # A special case, SCORE dataset/problem can be in TEST directory.
  123. if suffix == '_dataset_SCORE':
  124. expected_paths.append([description_id[:-len(suffix)], suffix[len('_dataset_'):], 'dataset_TEST'])
  125. if split_path[-3:] not in expected_paths:
  126. print("ERROR: Dataset directory path {directory_path} does not match any of expected paths: {expected_paths}".format(
  127. directory_path=split_path[-3:],
  128. expected_paths=', '.join(str(expected_path) for expected_path in expected_paths),
  129. ))
  130. return True
  131. break
  132. else:
  133. if not description_id.endswith('_dataset'):
  134. print("ERROR: Dataset ID does not end with allowed suffix: {description_id}".format(
  135. description_id=description_id,
  136. ))
  137. return True
  138. expected_path = [description_id[:-len('_dataset')], description_id]
  139. if split_path[-2:] != expected_path:
  140. print("ERROR: Dataset directory path {directory_path} does not match expected path: {expected_path}".format(
  141. directory_path=split_path[-2:],
  142. expected_path=expected_path,
  143. ))
  144. return True
  145. return False
  146. def validate_metrics(problem_description):
  147. error = False
  148. existing_metrics = set()
  149. for metric in problem_description.get('inputs', {}).get('performanceMetrics', []):
  150. if metric['metric'] in ['f1', 'precision', 'recall', 'jaccardSimilarityScore']:
  151. if 'posLabel' not in metric:
  152. print("ERROR: Problem uses '{metric}' metric, but 'posLabel' is not provided.".format(
  153. metric=metric['metric'],
  154. ))
  155. error = True
  156. if set(problem_description['about']['taskKeywords']) & {'multiClass', 'multiLabel'}:
  157. print("ERROR: Problem uses '{metric}' metric, but it is a multi-class or a multi-label problem.".format(
  158. metric=metric['metric'],
  159. ))
  160. error = True
  161. elif 'posLabel' in metric:
  162. print("ERROR: Problem does not use 'f1', 'precision', 'recall', or 'jaccardSimilarityScore' metric, but 'posLabel' is provided.".format(
  163. metric=metric['metric'],
  164. ))
  165. error = True
  166. if metric['metric'] == 'hammingLoss' and 'multiLabel' not in set(problem_description['about']['taskKeywords']):
  167. print("ERROR: Problem uses 'hammingLoss' metric, but it is not a multi-label problem.")
  168. error = True
  169. if metric['metric'] == 'precisionAtTopK' and 'forecasting' not in set(problem_description['about']['taskKeywords']):
  170. print("ERROR: Problem uses 'precisionAtTopK' metric, but it is not forecasting problem.")
  171. error = True
  172. if metric['metric'] in existing_metrics:
  173. print("ERROR: Problem uses same metric '{metric}' multiple times.".format(metric=metric['metric']))
  174. error = True
  175. existing_metrics.add(metric['metric'])
  176. return error
  177. def validate_keywords(problem_description):
  178. task_keywords = set(problem_description['about']['taskKeywords'])
  179. targets_number = 0
  180. for data in problem_description.get('inputs', {}).get('data', []):
  181. targets_number += len(data.get('targets', []))
  182. if 'regression' in task_keywords and 'multivariate' in task_keywords:
  183. if targets_number < 2:
  184. print("ERROR: Problem is a multi-variate problem, but it does not have more than 1 target.")
  185. return True
  186. elif 'objectDetection' in task_keywords:
  187. if targets_number != 1 and targets_number != 2:
  188. print("ERROR: Problem is an object detection problem, but it does not have 1 or 2 targets.")
  189. return True
  190. elif targets_number != 1:
  191. print("ERROR: Problem has more than 1 target.")
  192. return True
  193. if task_keywords & {'binary', 'multiClass', 'multiLabel'} and not task_keywords & {'classification', 'vertexClassification'}:
  194. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  195. task_keywords=task_keywords,
  196. ))
  197. return True
  198. if task_keywords & {'classification', 'vertexClassification'} and not task_keywords & {'binary', 'multiClass', 'multiLabel'}:
  199. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  200. task_keywords=task_keywords,
  201. ))
  202. return True
  203. if task_keywords & {'univariate', 'multivariate'} and 'regression' not in task_keywords:
  204. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  205. task_keywords=task_keywords,
  206. ))
  207. return True
  208. if 'regression' in task_keywords and not task_keywords & {'univariate', 'multivariate'}:
  209. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  210. task_keywords=task_keywords,
  211. ))
  212. return True
  213. if task_keywords & {'overlapping', 'nonOverlapping'} and not task_keywords & {'clustering', 'communityDetection'}:
  214. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  215. task_keywords=task_keywords,
  216. ))
  217. return True
  218. if task_keywords & {'clustering', 'communityDetection'} and not task_keywords & {'overlapping', 'nonOverlapping'}:
  219. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  220. task_keywords=task_keywords,
  221. ))
  222. return True
  223. return False
  224. def validate_files(dataset_description_path, data_resource, dataset_description, column_index, collection_resource_id):
  225. for collection_data_resource in dataset_description['dataResources']:
  226. if collection_data_resource['resID'] == collection_resource_id:
  227. break
  228. else:
  229. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a collection resource '{collection_resource_id}', but the resource does not exixt.".format(
  230. dataset_path=dataset_description_path,
  231. resource_id=data_resource['resID'],
  232. column_index=column_index,
  233. collection_resource_id=collection_resource_id,
  234. ))
  235. # We cannot do much more here.
  236. return True
  237. if not collection_data_resource.get('isCollection', False):
  238. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a collection resource '{collection_resource_id}', but the resource is not a collection.".format(
  239. dataset_path=dataset_description_path,
  240. resource_id=data_resource['resID'],
  241. column_index=column_index,
  242. collection_resource_id=collection_resource_id,
  243. ))
  244. # We cannot do much more here.
  245. return True
  246. error = False
  247. data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
  248. data = read_csv(data_path)
  249. collection_dir = os.path.join(os.path.dirname(dataset_description_path), collection_data_resource['resPath'])
  250. count = 0
  251. for filename in data.iloc[:, column_index]:
  252. filepath = os.path.join(collection_dir, filename)
  253. if not os.path.isfile(filepath):
  254. count += 1
  255. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a file in a collection resource '{collection_resource_id}', but the file does not exist: {filename}".format(
  256. dataset_path=dataset_description_path,
  257. resource_id=data_resource['resID'],
  258. column_index=column_index,
  259. collection_resource_id=collection_resource_id,
  260. filename=filename,
  261. ))
  262. error = True
  263. if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
  264. break
  265. return error
  266. def validate_collection(dataset_description_path, data_resource):
  267. error = False
  268. if not data_resource['resPath'].endswith('/'):
  269. print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' where resource path is not ending with '/': {res_path}".format(
  270. dataset_path=dataset_description_path,
  271. resource_id=data_resource['resID'],
  272. res_path=data_resource['resPath'],
  273. ))
  274. error = True
  275. allowed_file_extensions = set()
  276. for res_format, extensions in data_resource['resFormat'].items():
  277. unsupported_extensions = set(extensions) - set(res_format_to_extensions[res_format])
  278. if unsupported_extensions:
  279. print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' and resource format '{res_format}' with unsupported extensions: {unsupported_extensions}".format(
  280. dataset_path=dataset_description_path,
  281. resource_id=data_resource['resID'],
  282. res_format=res_format,
  283. unsupported_extensions=sorted(unsupported_extensions),
  284. ))
  285. error = True
  286. allowed_file_extensions.update(extensions)
  287. collection_dir = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
  288. is_empty = True
  289. count = 0
  290. for dirpath, dirnames, filenames in os.walk(collection_dir):
  291. for filename in filenames:
  292. is_empty = False
  293. filepath = os.path.join(dirpath, filename)
  294. file_extension = get_file_extension(filepath)
  295. if file_extension not in allowed_file_extensions:
  296. count += 1
  297. print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' with a file with unsupported file extension: {filepath}".format(
  298. dataset_path=dataset_description_path,
  299. resource_id=data_resource['resID'],
  300. filepath=filepath,
  301. ))
  302. error = True
  303. if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
  304. break
  305. if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
  306. break
  307. if is_empty:
  308. print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' without any files.".format(
  309. dataset_path=dataset_description_path,
  310. resource_id=data_resource['resID'],
  311. ))
  312. error = True
  313. return error
  314. def validate_multi_index(dataset_description_path, data_resource, multi_index_column):
  315. error = False
  316. suggested_target_columns = []
  317. for column_description in data_resource['columns']:
  318. if 'suggestedTarget' in column_description['role']:
  319. suggested_target_columns.append(column_description['colIndex'])
  320. data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
  321. data = read_csv(data_path)
  322. attribute_columns = [column_index for column_index in range(len(data.columns)) if column_index != multi_index_column and column_index not in suggested_target_columns]
  323. attributes = data.iloc[:, attribute_columns].set_index(data.iloc[:, multi_index_column])
  324. count = 0
  325. for group_name, group in attributes.groupby(level=0):
  326. # The first row in a group is not marked as a duplicate, so we add 1 to the number of duplicated rows.
  327. if group.duplicated(keep='first').sum() + 1 != len(group):
  328. count += 1
  329. print("ERROR: Dataset '{dataset_path}' has a multi-index resource '{resource_id}' with all attributes in rows not equal for index value '{value}'.".format(
  330. dataset_path=dataset_description_path,
  331. resource_id=data_resource['resID'],
  332. value=group_name,
  333. ))
  334. error = True
  335. if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
  336. break
  337. return error
  338. def validate_edgelist(dataset_description_path, data_resource):
  339. error = False
  340. found_source = False
  341. is_directed_source = None
  342. is_multi_source = None
  343. found_target = False
  344. is_directed_target = None
  345. is_multi_target = None
  346. for column_description in data_resource['columns']:
  347. if 'edgeSource' in column_description['role']:
  348. # We have to check this only once, either here or in the "edgeTarget" case; we check it here.
  349. if 'edgeTarget' in column_description['role']:
  350. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting source vs. target column roles.".format(
  351. dataset_path=dataset_description_path,
  352. resource_id=data_resource['resID'],
  353. ))
  354. error = True
  355. if found_source:
  356. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with multiple edge source columns.".format(
  357. dataset_path=dataset_description_path,
  358. resource_id=data_resource['resID'],
  359. ))
  360. error = True
  361. continue
  362. found_source = True
  363. if 'multiEdgeSource' in column_description['role']:
  364. if is_multi_source is None:
  365. is_multi_source = True
  366. elif is_multi_source != True:
  367. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  368. dataset_path=dataset_description_path,
  369. resource_id=data_resource['resID'],
  370. ))
  371. error = True
  372. if 'simpleEdgeSource' in column_description['role']:
  373. if is_multi_source is None:
  374. is_multi_source = False
  375. elif is_multi_source != False:
  376. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  377. dataset_path=dataset_description_path,
  378. resource_id=data_resource['resID'],
  379. ))
  380. error = True
  381. if is_multi_source is None:
  382. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing multi vs. simple column role.".format(
  383. dataset_path=dataset_description_path,
  384. resource_id=data_resource['resID'],
  385. ))
  386. error = True
  387. if 'directedEdgeSource' in column_description['role']:
  388. if is_directed_source is None:
  389. is_directed_source = True
  390. elif is_directed_source != True:
  391. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  392. dataset_path=dataset_description_path,
  393. resource_id=data_resource['resID'],
  394. ))
  395. error = True
  396. if 'undirectedEdgeSource' in column_description['role']:
  397. if is_directed_source is None:
  398. is_directed_source = False
  399. elif is_directed_source != False:
  400. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  401. dataset_path=dataset_description_path,
  402. resource_id=data_resource['resID'],
  403. ))
  404. error = True
  405. if is_directed_source is None:
  406. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing directed vs. undirected column role.".format(
  407. dataset_path=dataset_description_path,
  408. resource_id=data_resource['resID'],
  409. ))
  410. error = True
  411. if 'edgeTarget' in column_description['role']:
  412. if found_target:
  413. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with multiple edge target columns.".format(
  414. dataset_path=dataset_description_path,
  415. resource_id=data_resource['resID'],
  416. ))
  417. error = True
  418. continue
  419. found_target = True
  420. if 'multiEdgeTarget' in column_description['role']:
  421. if is_multi_target is None:
  422. is_multi_target = True
  423. elif is_multi_target != True:
  424. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  425. dataset_path=dataset_description_path,
  426. resource_id=data_resource['resID'],
  427. ))
  428. error = True
  429. if 'simpleEdgeTarget' in column_description['role']:
  430. if is_multi_target is None:
  431. is_multi_target = False
  432. elif is_multi_target != False:
  433. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  434. dataset_path=dataset_description_path,
  435. resource_id=data_resource['resID'],
  436. ))
  437. error = True
  438. if is_multi_target is None:
  439. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing multi vs. simple column role.".format(
  440. dataset_path=dataset_description_path,
  441. resource_id=data_resource['resID'],
  442. ))
  443. error = True
  444. if 'directedEdgeTarget' in column_description['role']:
  445. if is_directed_target is None:
  446. is_directed_target = True
  447. elif is_directed_target != True:
  448. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  449. dataset_path=dataset_description_path,
  450. resource_id=data_resource['resID'],
  451. ))
  452. error = True
  453. if 'undirectedEdgeTarget' in column_description['role']:
  454. if is_directed_target is None:
  455. is_directed_target = False
  456. elif is_directed_target != False:
  457. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  458. dataset_path=dataset_description_path,
  459. resource_id=data_resource['resID'],
  460. ))
  461. error = True
  462. if is_directed_target is None:
  463. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing directed vs. undirected column role.".format(
  464. dataset_path=dataset_description_path,
  465. resource_id=data_resource['resID'],
  466. ))
  467. error = True
  468. if not found_source:
  469. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing edge source column role.".format(
  470. dataset_path=dataset_description_path,
  471. resource_id=data_resource['resID'],
  472. ))
  473. error = True
  474. if not found_target:
  475. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing edge target column role.".format(
  476. dataset_path=dataset_description_path,
  477. resource_id=data_resource['resID'],
  478. ))
  479. error = True
  480. if found_source and found_target:
  481. if is_directed_source != is_directed_target:
  482. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  483. dataset_path=dataset_description_path,
  484. resource_id=data_resource['resID'],
  485. ))
  486. error = True
  487. if is_multi_source != is_multi_target:
  488. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  489. dataset_path=dataset_description_path,
  490. resource_id=data_resource['resID'],
  491. ))
  492. error = True
  493. return error
  494. def get_file_extension(path):
  495. extension = os.path.splitext(path)[1]
  496. if extension:
  497. # We remove leading dot as returned from "splitext".
  498. return extension[1:]
  499. else:
  500. raise ValueError(f"Cannot get file extension of '{path}'.")
  501. def validate_dataset(dataset_description_path, dataset_description):
  502. error = False
  503. for data_resource in dataset_description['dataResources']:
  504. if os.path.splitext(os.path.basename(data_resource['resPath']))[0] == 'learningData' and data_resource['resID'] != 'learningData':
  505. print("ERROR: Dataset '{dataset_path}' has a dataset entry point without 'learningData' as resource's ID, but '{resource_id}'.".format(
  506. dataset_path=dataset_description_path,
  507. resource_id=data_resource['resID'],
  508. ))
  509. error = True
  510. if data_resource['resID'] == 'learningData':
  511. if data_resource.get('isCollection', False):
  512. print("ERROR: Dataset '{dataset_path}' has a dataset entry point which is a collection.".format(
  513. dataset_path=dataset_description_path,
  514. ))
  515. error = True
  516. if 'columns' not in data_resource:
  517. print("ERROR: Dataset '{dataset_path}' has a dataset entry point without columns metadata.".format(
  518. dataset_path=dataset_description_path,
  519. ))
  520. error = True
  521. if 'columns' in data_resource:
  522. index_columns = []
  523. multi_index_columns = []
  524. key_columns = []
  525. edgelist_columns = []
  526. for column_description in data_resource['columns']:
  527. if 'index' in column_description['role']:
  528. index_columns.append(column_description['colIndex'])
  529. if 'multiIndex' in column_description['role']:
  530. multi_index_columns.append(column_description['colIndex'])
  531. if 'key' in column_description['role']:
  532. key_columns.append(column_description['colIndex'])
  533. if any(edgelist_column_role in column_description['role'] for edgelist_column_role in EDGELIST_COLUMN_ROLES):
  534. edgelist_columns.append(column_description['colIndex'])
  535. index_columns_set = set(index_columns)
  536. multi_index_columns_set = set(multi_index_columns)
  537. key_columns_set = set(key_columns)
  538. if index_columns_set & multi_index_columns_set:
  539. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being both index and multi-index at the same time: {index_columns}".format(
  540. dataset_path=dataset_description_path,
  541. resource_id=data_resource['resID'],
  542. index_columns=sorted(index_columns_set & multi_index_columns_set),
  543. ))
  544. error = True
  545. elif data_resource['resID'] == 'learningData' and len(index_columns) + len(multi_index_columns) == 0:
  546. print("ERROR: Dataset '{dataset_path}' has a dataset entry point with no index columns.".format(
  547. dataset_path=dataset_description_path,
  548. ))
  549. error = True
  550. elif len(index_columns) + len(multi_index_columns) > 1:
  551. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with multiple index columns: {index_columns}".format(
  552. dataset_path=dataset_description_path,
  553. resource_id=data_resource['resID'],
  554. index_columns=index_columns + multi_index_columns,
  555. ))
  556. error = True
  557. if index_columns_set & key_columns_set:
  558. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being both index and key at the same time: {index_columns}".format(
  559. dataset_path=dataset_description_path,
  560. resource_id=data_resource['resID'],
  561. index_columns=sorted(index_columns_set & key_columns_set),
  562. ))
  563. error = True
  564. if multi_index_columns_set & key_columns_set:
  565. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being both multi-index and key at the same time: {index_columns}".format(
  566. dataset_path=dataset_description_path,
  567. resource_id=data_resource['resID'],
  568. index_columns=sorted(multi_index_columns_set & key_columns_set),
  569. ))
  570. error = True
  571. if data_resource.get('isCollection', False):
  572. continue
  573. for column_index in index_columns:
  574. error = validate_column_values(dataset_description_path, data_resource, column_index, unique=True, no_missing=True) or error
  575. for column_index in multi_index_columns:
  576. error = validate_column_values(dataset_description_path, data_resource, column_index, unique=False, no_missing=True) or error
  577. for column_index in key_columns:
  578. error = validate_column_values(dataset_description_path, data_resource, column_index, unique=True, no_missing=False) or error
  579. for column_description in data_resource['columns']:
  580. if 'refersTo' in column_description and column_description['refersTo']['resObject'] == 'item':
  581. error = validate_files(dataset_description_path, data_resource, dataset_description, column_description['colIndex'], column_description['refersTo']['resID']) or error
  582. if edgelist_columns:
  583. error = validate_edgelist(dataset_description_path, data_resource) or error
  584. if len(multi_index_columns) == 1:
  585. error = validate_multi_index(dataset_description_path, data_resource, multi_index_columns[0]) or error
  586. for res_format in data_resource['resFormat'].keys():
  587. if res_format not in res_format_to_extensions:
  588. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with unsupported format: {res_format}".format(
  589. dataset_path=dataset_description_path,
  590. resource_id=data_resource['resID'],
  591. res_format=res_format,
  592. ))
  593. error = True
  594. if data_resource.get('isCollection', False):
  595. error = validate_collection(dataset_description_path, data_resource) or error
  596. else:
  597. if len(data_resource['resFormat']) == 1:
  598. file_extension = get_file_extension(data_resource['resPath'])
  599. # There should be only one resource format listed for non-collection resources.
  600. if file_extension not in list(data_resource['resFormat'].values())[0]:
  601. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with invalid resource path file extension: {file_extension}".format(
  602. dataset_path=dataset_description_path,
  603. resource_id=data_resource['resID'],
  604. file_extension=file_extension,
  605. ))
  606. error = True
  607. else:
  608. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with invalid number of listed formats: {count}".format(
  609. dataset_path=dataset_description_path,
  610. resource_id=data_resource['resID'],
  611. count=len(data_resource['resFormat']),
  612. ))
  613. error = True
  614. return error
  615. def validate_dataset_description(dataset_description_path, known_dataset_descriptions, *, strict_naming=True):
  616. print("Validating dataset '{dataset_description_path}'.".format(dataset_description_path=dataset_description_path))
  617. try:
  618. with open(dataset_description_path) as dataset_description_file:
  619. dataset_description = json.load(dataset_description_file)
  620. if not dataset_description_validator.validate(dataset_description):
  621. print("ERROR: Schema validation: {errors}".format(errors=dataset_description_validator.errors))
  622. return True
  623. dataset_id = dataset_description['about']['datasetID']
  624. # Handle a special case for SCORE dataset splits (those which have "targets.csv" file).
  625. # They are the same as TEST dataset splits, but we present them differently, so that
  626. # SCORE dataset splits have targets as part of data. Because of this we also update
  627. # corresponding dataset ID.
  628. # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176
  629. if os.path.exists(os.path.join(os.path.dirname(dataset_description_path), '..', 'targets.csv')) and dataset_id.endswith('_TEST'):
  630. dataset_id = dataset_id[:-5] + '_SCORE'
  631. if dataset_id in known_dataset_descriptions:
  632. print("ERROR: Duplicate dataset ID '{dataset_id}': '{first_path}' and '{second_path}'".format(
  633. dataset_id=dataset_id,
  634. first_path=known_dataset_descriptions[dataset_id]['path'],
  635. second_path=dataset_description_path,
  636. ))
  637. return True
  638. known_dataset_descriptions[dataset_id] = {
  639. 'path': dataset_description_path,
  640. 'description': dataset_description,
  641. }
  642. if validate_dataset_path(dataset_id, dataset_description_path, strict_naming=strict_naming):
  643. return True
  644. #if 'digest' not in dataset_description['about']:
  645. # print("ERROR: Dataset '{dataset_path}' missing digest.".format(dataset_path=dataset_description_path))
  646. # return True
  647. if validate_dataset(dataset_description_path, dataset_description):
  648. return True
  649. except Exception:
  650. print("ERROR: Unexpected exception:")
  651. traceback.print_exc()
  652. return True
  653. return False
  654. def validate_problem_description(problem_description_path, known_problem_descriptions):
  655. print("Validating problem '{problem_description_path}'.".format(problem_description_path=problem_description_path))
  656. try:
  657. with open(problem_description_path) as problem_description_file:
  658. problem_description = json.load(problem_description_file)
  659. if not problem_description_validator.validate(problem_description):
  660. print("ERROR: Schema validation: {errors}".format(errors=problem_description_validator.errors))
  661. return True
  662. problem_id = problem_description['about']['problemID']
  663. # Handle a special case for SCORE dataset splits (those which have "targets.csv" file).
  664. # They are the same as TEST dataset splits, but we present them differently, so that
  665. # SCORE dataset splits have targets as part of data. Because of this we also update
  666. # corresponding problem ID.
  667. # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176
  668. if os.path.exists(os.path.join(os.path.dirname(problem_description_path), '..', 'targets.csv')) and problem_id.endswith('_TEST'):
  669. problem_id = problem_id[:-5] + '_SCORE'
  670. # Also update dataset references.
  671. for data in problem_description.get('inputs', {}).get('data', []):
  672. if data['datasetID'].endswith('_TEST'):
  673. data['datasetID'] = data['datasetID'][:-5] + '_SCORE'
  674. # All problem descriptions for the same problem should be the same.
  675. if problem_id.endswith('_TRAIN') or problem_id.endswith('_TEST') or problem_id.endswith('_SCORE'):
  676. print("ERROR: Invalid problem ID '{problem_id}' in '{problem_description_path}'.".format(
  677. problem_id=problem_id,
  678. problem_description_path=problem_description_path,
  679. ))
  680. return True
  681. if problem_id in known_problem_descriptions:
  682. # Problem descriptions with same ID should have the same content.
  683. if problem_description == known_problem_descriptions[problem_id]['description']:
  684. known_problem_descriptions[problem_id]['paths'].append(problem_description_path)
  685. else:
  686. print("ERROR: Duplicate problem ID '{problem_id}', but different problem description: {first_paths} and '{second_path}'".format(
  687. problem_id=problem_id,
  688. first_paths=known_problem_descriptions[problem_id]['paths'],
  689. second_path=problem_description_path,
  690. ))
  691. return True
  692. else:
  693. known_problem_descriptions[problem_id] = {
  694. 'paths': [problem_description_path],
  695. 'description': problem_description,
  696. }
  697. if os.path.basename(problem_description_path) != 'problemDoc.json':
  698. print("ERROR: Problem description filename '{problem_description_path}' is not 'problemDoc.json'.".format(
  699. problem_description_path=problem_description_path,
  700. ))
  701. return True
  702. if validate_metrics(problem_description):
  703. return True
  704. if validate_keywords(problem_description):
  705. return True
  706. split_path = os.path.dirname(problem_description_path).split(os.sep)
  707. for split_directory in ['problem_TRAIN', 'problem_TEST', 'problem_SCORE']:
  708. if split_directory in split_path and 'datasetViewMaps' not in problem_description.get('inputs', {}).get('dataSplits', {}):
  709. print("ERROR: Problem '{problem_description_path}' is missing dataset view maps.".format(
  710. problem_description_path=problem_description_path,
  711. ))
  712. return True
  713. except Exception:
  714. print("ERROR: Unexpected exception:")
  715. traceback.print_exc()
  716. return True
  717. return False
  718. def validate_column_values(dataset_description_path, data_resource, column_index, *, unique, no_missing):
  719. error = False
  720. data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
  721. data = read_csv(data_path)
  722. column_values = data.iloc[:, column_index]
  723. # We assume missing values are represented as empty strings.
  724. column_values_without_missing = column_values[column_values != '']
  725. # There should not be any NA values left at this point anyway.
  726. value_counts = column_values_without_missing.value_counts(dropna=True)
  727. if unique and (value_counts > 1).sum():
  728. duplicate = list(value_counts[value_counts > 1].keys())
  729. if LIMIT_OUTPUT is not None:
  730. duplicate = duplicate[:LIMIT_OUTPUT]
  731. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with column {column_index} which should have unique values but it does not. Example duplicate values: {duplicate}".format(
  732. dataset_path=dataset_description_path,
  733. resource_id=data_resource['resID'],
  734. column_index=column_index,
  735. duplicate=duplicate,
  736. ))
  737. error = True
  738. if no_missing and len(column_values) != len(column_values_without_missing):
  739. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with column {column_index} which should have no missing values but it does have them.".format(
  740. dataset_path=dataset_description_path,
  741. resource_id=data_resource['resID'],
  742. column_index=column_index,
  743. ))
  744. error = True
  745. return error
  746. def validate_target_values(problem_paths, dataset_path, problem_description, data_resource, target):
  747. error = False
  748. data_path = os.path.join(os.path.dirname(dataset_path), data_resource['resPath'])
  749. data = read_csv(data_path)
  750. target_values = data.iloc[:, target['colIndex']]
  751. distinct_values = list(target_values.value_counts(dropna=False).keys())
  752. number_distinct_values = len(distinct_values)
  753. # We assume missing values are represented as empty strings.
  754. has_missing_values = '' in distinct_values
  755. if has_missing_values:
  756. # We do not count missing values as distinct values.
  757. number_distinct_values -= 1
  758. task_keywords = set(problem_description['about']['taskKeywords'])
  759. if 'binary' in task_keywords:
  760. if number_distinct_values != 2:
  761. print("ERROR: Problem {problem_paths} has 'binary' keyword, but target column does not have 2 distinct values, but {number_distinct_values}.".format(
  762. problem_paths=problem_paths,
  763. number_distinct_values=number_distinct_values,
  764. ))
  765. error = True
  766. elif 'multiClass' in task_keywords:
  767. if number_distinct_values < 3:
  768. print("ERROR: Problem {problem_paths} has 'multiClass' keyword, but target column does not have more than 2 distinct values, but {number_distinct_values}.".format(
  769. problem_paths=problem_paths,
  770. number_distinct_values=number_distinct_values,
  771. ))
  772. error = True
  773. for metric in problem_description.get('inputs', {}).get('performanceMetrics', []):
  774. if metric['metric'] in ['f1', 'precision', 'recall', 'jaccardSimilarityScore']:
  775. if number_distinct_values != 2:
  776. print("ERROR: Problem {problem_paths} uses '{metric}' metric, but target column does not have 2 distinct values, but {number_distinct_values}.".format(
  777. problem_paths=problem_paths,
  778. metric=metric['metric'],
  779. number_distinct_values=number_distinct_values,
  780. ))
  781. error = True
  782. if 'posLabel' in metric and metric['posLabel'] not in distinct_values:
  783. print("ERROR: Problem {problem_paths} provides 'posLabel' for metric '{metric}' with value '{value}', but possible values are: {distinct_values}".format(
  784. problem_paths=problem_paths,
  785. metric=metric['metric'],
  786. value=metric['posLabel'],
  787. distinct_values=sorted(distinct_values),
  788. ))
  789. error = True
  790. if has_missing_values and not task_keywords & {'semiSupervised', 'clustering'}:
  791. print("ERROR: Problem {problem_paths} has target column with missing values, but it not a semi-supervised or clustering task.".format(
  792. problem_paths=problem_paths,
  793. ))
  794. error = True
  795. if 'semiSupervised' in task_keywords and not has_missing_values:
  796. print("ERROR: Problem {problem_paths} is a semi-supervised task, but does not have a target column with missing values.".format(
  797. problem_paths=problem_paths,
  798. ))
  799. error = True
  800. return error
  801. def get_all_columns(dataset_path, resource_id, data_resource):
  802. data_path = os.path.join(os.path.dirname(dataset_path), data_resource['resPath'])
  803. data = read_csv(data_path)
  804. data_columns = [{
  805. 'colIndex': column_index,
  806. 'colName': column_name,
  807. 'colType': 'unknown',
  808. 'role': []
  809. } for column_index, column_name in enumerate(data.columns)]
  810. columns = data_resource.get('columns', None)
  811. if columns is None:
  812. return data_columns
  813. if 'columnsCount' in data_resource and data_resource['columnsCount'] != len(data_columns):
  814. raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' with incorrect columns count {columns_count} (correct {correct_count}).".format(
  815. dataset_path=dataset_path,
  816. resource_id=resource_id,
  817. columns_count=data_resource['columnsCount'],
  818. correct_count=len(data_columns),
  819. ))
  820. if len(columns) >= len(data_columns):
  821. columns_names = [{'colIndex': c['colIndex'], 'colName': c['colName']} for c in columns]
  822. data_columns_names = [{'colIndex': c['colIndex'], 'colName': c['colName']} for c in data_columns]
  823. if columns_names != data_columns_names:
  824. raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' where metadata columns do not match data columns.".format(
  825. dataset_path=dataset_path,
  826. resource_id=resource_id,
  827. ))
  828. return columns
  829. else:
  830. for column in columns:
  831. if column['colName'] != data_columns[column['colIndex']]['colName']:
  832. raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' where column name '{metadata_name}' in metadata does not match column name '{data_name}' in data.".format(
  833. dataset_path=dataset_path,
  834. resource_id=resource_id,
  835. metadata_name=column['colName'],
  836. data_name=data_columns[column['colIndex']]['colName'],
  837. ))
  838. data_columns[column['colIndex']] = column
  839. return data_columns
  840. def validate_target(problem_paths, dataset_path, problem_description, dataset_description, target, check_target_values):
  841. error = False
  842. try:
  843. for data_resource in dataset_description['dataResources']:
  844. if data_resource['resID'] == target['resID']:
  845. columns = get_all_columns(dataset_path, data_resource['resID'], data_resource)
  846. for column in columns:
  847. if target['colName'] == column['colName'] or target['colIndex'] == column['colIndex']:
  848. if not (target['colName'] == column['colName'] and target['colIndex'] == column['colIndex']):
  849. print("ERROR: Problem {problem_paths} has a target '{target_index}' which does not match a column '{column_index}' in dataset '{dataset_path}' fully.".format(
  850. problem_paths=problem_paths,
  851. target_index=target['targetIndex'],
  852. column_index=column['colIndex'],
  853. dataset_path=dataset_path,
  854. ))
  855. error = True
  856. if check_target_values:
  857. error = validate_target_values(problem_paths, dataset_path, problem_description, data_resource, target) or error
  858. break
  859. else:
  860. raise KeyError("Cannot find column with column name '{column_name}' or column index '{column_index}'.".format(
  861. column_name=target['colName'],
  862. column_index=target['colIndex'],
  863. ))
  864. break
  865. else:
  866. raise KeyError("Cannot find data resource with resource ID '{resource_id}'.".format(
  867. resource_id=target['resID'],
  868. ))
  869. except (IndexError, KeyError):
  870. print("ERROR: Problem {problem_paths} has target with index '{target_index}' which does not resolve.".format(
  871. problem_paths=problem_paths,
  872. target_index=target['targetIndex'],
  873. ))
  874. return True
  875. except ValueError as error:
  876. print("ERROR: {error}".format(
  877. error=error,
  878. ))
  879. return True
  880. return error
  881. def canonical_dataset_description(dataset_description):
  882. dataset_description = copy.deepcopy(dataset_description)
  883. del dataset_description['about']['datasetID']
  884. if 'digest' in dataset_description['about']:
  885. del dataset_description['about']['digest']
  886. return dataset_description
  887. def datasets_equal(first_dataset_path, second_dataset_path):
  888. if first_dataset_path == second_dataset_path:
  889. return True
  890. first_dataset_base_path = os.path.dirname(first_dataset_path)
  891. second_dataset_base_path = os.path.dirname(second_dataset_path)
  892. dir_comparison = deep_dircmp.DeepDirCmp(first_dataset_base_path, second_dataset_base_path, hide=[], ignore=[])
  893. different_files = dir_comparison.get_left_only_recursive() + dir_comparison.get_right_only_recursive() + dir_comparison.get_common_funny_recursive() + dir_comparison.get_diff_files_recursive()
  894. # This one can be different. And if it is different, we compare it elsewhere for allowed differences.
  895. if 'datasetDoc.json' in different_files:
  896. different_files.remove('datasetDoc.json')
  897. if different_files:
  898. print("ERROR: Dataset '{first_dataset_path}' and dataset '{second_dataset_path}' are not the same: {differences}".format(
  899. first_dataset_path=first_dataset_path,
  900. second_dataset_path=second_dataset_path,
  901. differences=different_files,
  902. ))
  903. return False
  904. return True
  905. def validate_dataset_reference(dataset_id, dataset_descriptions, targets, problem_description_value, check_target_values):
  906. error = False
  907. if dataset_id not in dataset_descriptions:
  908. print("ERROR: Problem {problem_paths} is referencing unknown dataset '{dataset_id}'.".format(
  909. problem_paths=problem_description_value['paths'],
  910. dataset_id=dataset_id,
  911. ))
  912. error = True
  913. else:
  914. dataset_description_value = dataset_descriptions[dataset_id]
  915. dataset_description = dataset_description_value['description']
  916. for i, target in enumerate(targets):
  917. if target['targetIndex'] != i:
  918. print("ERROR: Problem {problem_paths} has target with invalid target index '{target_index}'.".format(
  919. problem_paths=problem_description_value['paths'],
  920. target_index=target['targetIndex'],
  921. ))
  922. error = True
  923. error = validate_target(problem_description_value['paths'], dataset_description_value['path'], problem_description_value['description'], dataset_description, target, check_target_values) or error
  924. return error
  925. def map_dataset_id(dataset_id, dataset_view_map):
  926. for view_map in dataset_view_map:
  927. if view_map['from'] == dataset_id:
  928. return view_map['to']
  929. else:
  930. raise KeyError("Could not map '{dataset_id}' in dataset view map.".format(dataset_id=dataset_id))
  931. def validate(dataset_descriptions, problem_descriptions):
  932. print("Validating all datasets and problems.")
  933. error = False
  934. dataset_description_groups = collections.defaultdict(list)
  935. for problem_description_value in problem_descriptions.values():
  936. problem_description = problem_description_value['description']
  937. for data in problem_description.get('inputs', {}).get('data', []):
  938. error = validate_dataset_reference(data['datasetID'], dataset_descriptions, data.get('targets', []), problem_description_value, True) or error
  939. if 'datasetViewMaps' in problem_description.get('inputs', {}).get('dataSplits', {}):
  940. if {'train', 'test', 'score'} != set(problem_description['inputs']['dataSplits']['datasetViewMaps'].keys()):
  941. print("ERROR: Problem {problem_paths} has dataset view maps with invalid keys.".format(
  942. problem_paths=problem_description_value['paths'],
  943. ))
  944. error = True
  945. else:
  946. error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['train']), dataset_descriptions, data.get('targets', []), problem_description_value, True) or error
  947. # Test and score splits do not have all values, so we do not validate target values there.
  948. error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['test']), dataset_descriptions, data.get('targets', []), problem_description_value, False) or error
  949. error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['score']), dataset_descriptions, data.get('targets', []), problem_description_value, False) or error
  950. if 'clustering' in problem_description['about']['taskKeywords']:
  951. for data in problem_description.get('inputs', {}).get('data', []):
  952. for target in data.get('targets', []):
  953. if 'numClusters' not in target:
  954. print("ERROR: Problem {problem_paths} is a clustering problem but is missing 'numClusters' in target '{target_index}'.".format(
  955. problem_paths=problem_description_value['paths'],
  956. target_index=target['targetIndex'],
  957. ))
  958. error = True
  959. if 'dataSplits' in problem_description['inputs'] and set(problem_description['inputs']['dataSplits'].keys()) - {'datasetViewMaps'}:
  960. print("ERROR: Problem {problem_paths} is a clustering problem with data splitting configuration, but it should not have one.".format(
  961. problem_paths=problem_description_value['paths'],
  962. ))
  963. error = True
  964. for dataset_description_value in dataset_descriptions.values():
  965. dataset_description = dataset_description_value['description']
  966. dataset_id = dataset_description['about']['datasetID']
  967. for suffix in ['_TEST', '_TRAIN', '_SCORE']:
  968. if dataset_id.endswith(suffix):
  969. dataset_description_groups[dataset_id[:-len(suffix)]].append(dataset_description_value)
  970. break
  971. for problem_description_value in problem_descriptions.values():
  972. problem_description = problem_description_value['description']
  973. # If any clustering problem is using dataset splits, we validate those splits.
  974. if 'clustering' in problem_description['about']['taskKeywords']:
  975. for data in problem_description.get('inputs', {}).get('data', []):
  976. # We check this elsewhere.
  977. if data['datasetID'] not in dataset_descriptions:
  978. continue
  979. dataset_id = data['datasetID']
  980. for suffix in ['_TEST', '_TRAIN', '_SCORE']:
  981. if dataset_id.endswith(suffix):
  982. base_dataset_id = dataset_id[:-len(suffix)]
  983. break
  984. else:
  985. base_dataset_id = dataset_id
  986. # There should always be at least one dataset.
  987. datasets = dataset_description_groups[base_dataset_id]
  988. if len(datasets) > 1:
  989. first_dataset_path = datasets[0]['path']
  990. for second_dataset_value in datasets[1:]:
  991. second_dataset_path = second_dataset_value['path']
  992. if not datasets_equal(first_dataset_path, second_dataset_path):
  993. print("ERROR: Problem {problem_paths} is a clustering problem, but its data splits are not all the same, for example, {first_dataset_path} and {second_dataset_path}.".format(
  994. problem_paths=problem_description_value['paths'],
  995. first_dataset_path=first_dataset_path,
  996. second_dataset_path=second_dataset_path,
  997. ))
  998. error = True
  999. break
  1000. for dataset_description_group in dataset_description_groups.values():
  1001. first_dataset_description_value = dataset_description_group[0]
  1002. first_dataset_description = canonical_dataset_description(first_dataset_description_value['description'])
  1003. for dataset_description_value in dataset_description_group[1:]:
  1004. dataset_description = canonical_dataset_description(dataset_description_value['description'])
  1005. if first_dataset_description != dataset_description:
  1006. print("ERROR: Dataset '{first_dataset_path}' and dataset '{dataset_path}' are not the same.".format(
  1007. first_dataset_path=first_dataset_description_value['path'],
  1008. dataset_path=dataset_description_value['path'],
  1009. ))
  1010. error = True
  1011. return error
  1012. def search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, *, strict_naming=True):
  1013. error = False
  1014. datasets_directory = os.path.abspath(datasets_directory)
  1015. for dirpath, dirnames, filenames in os.walk(datasets_directory, followlinks=True):
  1016. if 'datasetDoc.json' in filenames:
  1017. # Do not traverse further (so that we do not parse any "datasetDoc.json" files
  1018. # which might exist among the raw data files).
  1019. dirnames[:] = []
  1020. dataset_description_path = os.path.join(dirpath, 'datasetDoc.json')
  1021. error = validate_dataset_description(dataset_description_path, known_dataset_descriptions, strict_naming=strict_naming) or error
  1022. if 'problemDoc.json' in filenames:
  1023. # We continue traversing further in this case.
  1024. problem_description_path = os.path.join(dirpath, 'problemDoc.json')
  1025. error = validate_problem_description(problem_description_path, known_problem_descriptions) or error
  1026. return error
  1027. def configure_parser(parser: argparse.ArgumentParser, *, skip_arguments=()):
  1028. if 'no_strict_naming' not in skip_arguments:
  1029. parser.add_argument(
  1030. '-n', '--no-strict-naming', default=True, action='store_false', dest='strict_naming',
  1031. help="do not require strict naming convention",
  1032. )
  1033. if 'directories' not in skip_arguments:
  1034. parser.add_argument(
  1035. 'directories', metavar='DIR', nargs='*', default=['.'],
  1036. help="path to a directory with datasets, default is current directory",
  1037. )
  1038. def handler(arguments):
  1039. error = False
  1040. known_dataset_descriptions = {}
  1041. known_problem_descriptions = {}
  1042. for datasets_directory in arguments.directories:
  1043. error = search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, strict_naming=arguments.strict_naming) or error
  1044. error = validate(known_dataset_descriptions, known_problem_descriptions) or error
  1045. if error:
  1046. print("There are ERRORS.")
  1047. sys.exit(1)
  1048. else:
  1049. print("There are no errors.")
  1050. def main(argv):
  1051. parser = argparse.ArgumentParser(description="Validate datasets.")
  1052. configure_parser(parser)
  1053. arguments = parser.parse_args(argv[1:])
  1054. handler(arguments)
  1055. if __name__ == '__main__':
  1056. main(sys.argv)

A full-stack automated machine learning system, mainly targeting anomaly detection on multivariate time-series data. TODS provides comprehensive modules for building machine-learning-based anomaly detection systems, including: data processing, time series processing, feature analysis, detection algorithms, and a reinforcement module. The functionality provided by these modules includes common data preprocessing, smoothing and transformation of time series data, feature extraction from the time or frequency domain, and a wide variety of detection algorithms.
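For reference, a minimal sketch of how this validator might be invoked programmatically, assuming validate.py is importable and the "data-supply" repository has been cloned next to it; the "datasets/" path is a placeholder:

import validate

# Equivalent to running "python3 validate.py --no-strict-naming datasets/" on the
# command line; main() parses the arguments, validates all datasets and problems
# found under the given directories, and exits with status 1 if any check fails.
validate.main(["validate.py", "--no-strict-naming", "datasets/"])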