diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py index 08256fe..19bdd68 100644 --- a/src/clinical_etl/CSVConvert.py +++ b/src/clinical_etl/CSVConvert.py @@ -32,6 +32,8 @@ def parse_args(): parser.add_argument('--manifest', type=str, required=True, help="Path to a manifest file describing the mapping. See README for more information") parser.add_argument('--test', action="store_true", help="Use exact template specified in manifest: do not remove extra lines") parser.add_argument('--verbose', '--v', action="store_true", help="Print extra information, useful for debugging and understanding how the code runs.") + parser.add_argument('--index', '--i', action="store_true", help="Output 'indexed' file, useful for debugging and seeing relationships.") + parser.add_argument('--minify', action="store_true", help="Remove white space and line breaks from json outputs to reduce file size. Less readable for humans.") args = parser.parse_args() return args @@ -619,7 +621,7 @@ def load_manifest(manifest_file): return result -def csv_convert(input_path, manifest_file, verbose=False): +def csv_convert(input_path, manifest_file, minify=False, index_output=False, verbose=False): mappings.VERBOSE = verbose # read manifest data print("Starting conversion") @@ -651,8 +653,12 @@ def csv_convert(input_path, manifest_file, verbose=False): print("Indexing data") mappings.INDEXED_DATA = process_data(raw_csv_dfs) - with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f: - json.dump(mappings.INDEXED_DATA, f, indent=4) + if index_output: + with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f: + if minify: + json.dump(mappings.INDEXED_DATA, f) + else: + json.dump(mappings.INDEXED_DATA, f, indent=4) # if verbose flag is set, warn if column name is present in multiple sheets: for col in mappings.INDEXED_DATA["columns"]: @@ -693,9 +699,12 @@ def csv_convert(input_path, manifest_file, verbose=False): if mappings._pop_from_stack() is not None: raise Exception( f"Stack not empty\n{mappings.IDENTIFIER_FIELD}: {mappings.IDENTIFIER}\n {mappings.INDEX_STACK}") - - with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f: - json.dump(mappings.INDEXED_DATA, f, indent=4) + if index_output: + with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f: + if minify: + json.dump(mappings.INDEXED_DATA, f) + else: + json.dump(mappings.INDEXED_DATA, f, indent=4) result_key = list(schema.validation_schema.keys()).pop(0) @@ -707,7 +716,10 @@ def csv_convert(input_path, manifest_file, verbose=False): if schema.katsu_sha is not None: result["katsu_sha"] = schema.katsu_sha with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion - json.dump(result, f, indent=4) + if minify: + json.dump(result, f) + else: + json.dump(result, f, indent=4) # add validation data: schema.validate_ingest_map(result) @@ -715,7 +727,10 @@ def csv_convert(input_path, manifest_file, verbose=False): result["validation_warnings"] = schema.validation_warnings result["statistics"] = schema.statistics with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion - json.dump(result, f, indent=4) + if minify: + json.dump(result, f) + else: + json.dump(result, f, indent=4) if len(result["validation_warnings"]) > 0: print( @@ -732,4 +747,4 @@ def csv_convert(input_path, manifest_file, verbose=False): args = parse_args() input_path = args.input manifest_file = args.manifest - csv_convert(input_path, manifest_file, verbose=args.verbose) + csv_convert(input_path, manifest_file, minify=args.minify, index_output=args.index, verbose=args.verbose)