Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add index and minify command line arguments #55

Merged
merged 5 commits into from
Mar 4, 2024
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions src/clinical_etl/CSVConvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def parse_args():
parser.add_argument('--manifest', type=str, required=True, help="Path to a manifest file describing the mapping. See README for more information")
parser.add_argument('--test', action="store_true", help="Use exact template specified in manifest: do not remove extra lines")
parser.add_argument('--verbose', '--v', action="store_true", help="Print extra information, useful for debugging and understanding how the code runs.")
parser.add_argument('--index', '--i', action="store_true", help="Output 'indexed' file, useful for debugging and seeing relationships.")
mshadbolt marked this conversation as resolved.
Show resolved Hide resolved
parser.add_argument('--minify', action="store_true", help="Remove white space and line breaks from json outputs to reduce file size. Less readable for humans.")
args = parser.parse_args()
return args

Expand Down Expand Up @@ -619,7 +621,7 @@ def load_manifest(manifest_file):
return result


def csv_convert(input_path, manifest_file, verbose=False):
def csv_convert(input_path, manifest_file, minify=False, index_output=False, verbose=False):
mappings.VERBOSE = verbose
# read manifest data
print("Starting conversion")
Expand Down Expand Up @@ -651,8 +653,12 @@ def csv_convert(input_path, manifest_file, verbose=False):

print("Indexing data")
mappings.INDEXED_DATA = process_data(raw_csv_dfs)
with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
json.dump(mappings.INDEXED_DATA, f, indent=4)
if index_output:
with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
if minify:
json.dump(mappings.INDEXED_DATA, f)
else:
json.dump(mappings.INDEXED_DATA, f, indent=4)

# if verbose flag is set, warn if column name is present in multiple sheets:
for col in mappings.INDEXED_DATA["columns"]:
Expand Down Expand Up @@ -693,9 +699,12 @@ def csv_convert(input_path, manifest_file, verbose=False):
if mappings._pop_from_stack() is not None:
raise Exception(
f"Stack not empty\n{mappings.IDENTIFIER_FIELD}: {mappings.IDENTIFIER}\n {mappings.INDEX_STACK}")

with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
json.dump(mappings.INDEXED_DATA, f, indent=4)
if index_output:
with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
if minify:
json.dump(mappings.INDEXED_DATA, f)
else:
json.dump(mappings.INDEXED_DATA, f, indent=4)

result_key = list(schema.validation_schema.keys()).pop(0)

Expand All @@ -707,15 +716,21 @@ def csv_convert(input_path, manifest_file, verbose=False):
if schema.katsu_sha is not None:
result["katsu_sha"] = schema.katsu_sha
with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion
json.dump(result, f, indent=4)
if minify:
json.dump(result, f)
else:
json.dump(result, f, indent=4)

# add validation data:
schema.validate_ingest_map(result)
result["validation_errors"] = schema.validation_errors
result["validation_warnings"] = schema.validation_warnings
result["statistics"] = schema.statistics
with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion
json.dump(result, f, indent=4)
if minify:
json.dump(result, f)
else:
json.dump(result, f, indent=4)

if len(result["validation_warnings"]) > 0:
print(
Expand All @@ -732,4 +747,4 @@ def csv_convert(input_path, manifest_file, verbose=False):
args = parse_args()
input_path = args.input
manifest_file = args.manifest
csv_convert(input_path, manifest_file, verbose=args.verbose)
csv_convert(input_path, manifest_file, minify=args.minify, index_output=args.index, verbose=args.verbose)
Loading