diff --git a/csv-gremlin/README.md b/csv-gremlin/README.md index 34d8d3e3..ac4ff600 100644 --- a/csv-gremlin/README.md +++ b/csv-gremlin/README.md @@ -93,9 +93,10 @@ Rows=13, IDs=5, Duplicate IDs=7, Vertices=5, Edges=0, Properties=17, Errors=0 The help can always be displayed using the `-h` or `--help` command line arguments. ``` -==>python3 csv-gremlin.py -h -usage: csv-gremlin.py [-h] [-v] [-vb VB] [-eb EB] [-java_dates] [-assume_utc] [-rows ROWS] [-all_errors] [-silent] - [-no_summary] [-double_suffix] [-escape_dollar] +$ python3 csv-gremlin.py -h +usage: csv-gremlin.py [-h] [-v] [-vb VB] [-eb EB] [-java_dates] [-assume_utc] + [-rows ROWS] [-all_errors] [-silent] [-no_summary] + [-double_suffix] [-skip_spaces] [-escape_dollar] csvfile positional arguments: @@ -106,18 +107,30 @@ optional arguments: -v, --version Display version information -vb VB Set the vertex batch size to use (default 10) -eb EB Set the edge batch size to use (default 10) - -java_dates Use Java style "new Date()" instead of "datetime()". This option can also be used to force date - validation. - -assume_utc If date fields do not contain timezone information, assume they are in UTC. By default local time - is assumed otherwise. This option only applies if java_dates is also specified. - -rows ROWS Specify the maximum number of rows to process. By default the whole file is processed - -all_errors Show all errors. By default processing stops after any error in the CSV is encountered. - -silent Enable silent mode. Only errors are reported. No Gremlin is generated. + -java_dates Use Java style "new Date()" instead of "datetime()". This + option can also be used to force date validation. + -assume_utc If date fields do not contain timezone information, assume + they are in UTC. By default local time is assumed otherwise. + This option only applies if java_dates is also specified. + -rows ROWS Specify the maximum number of rows to process. 
By default + the whole file is processed + -all_errors Show all errors. By default processing stops after any error + in the CSV is encountered. + -silent Enable silent mode. Only errors are reported. No Gremlin is + generated. -no_summary Do not show a summary report after processing. - -double_suffix Suffix all floats and doubles with a "d" such as 12.34d. This is helpful when using the Gremlin - Console or Groovy scripts as it will prevent floats and doubles automatically being created as - BigDecimal objects. - -escape_dollar For any dollar signs found convert them to an escaped form \$. This is needed if you are going to - load the generated Gremlin using a Groovy processor such as used by the Gremlin Console. In Groovy - strings, the $ sign is used for interpolation + -double_suffix Suffix all floats and doubles with a "d" such as 12.34d. + This is helpful when using the Gremlin Console or Groovy + scripts as it will prevent floats and doubles automatically + being created as BigDecimal objects. + -skip_spaces Skip any leading spaces in each column. By default this + setting is False and any leading spaces will be considered + part of the column header or data value. This setting does + not apply to values enclosed in quotes such as " abcd". + -escape_dollar For any dollar signs found convert them to an escaped form + \$. This is needed if you are going to load the generated + Gremlin using a Groovy processor such as used by the Gremlin + Console. 
In Groovy strings, the $ sign is used for + interpolation + ``` diff --git a/csv-gremlin/csv-gremlin.py b/csv-gremlin/csv-gremlin.py index c2a88e67..50bb3872 100644 --- a/csv-gremlin/csv-gremlin.py +++ b/csv-gremlin/csv-gremlin.py @@ -19,7 +19,7 @@ @license: Apache2 @contact: @krlawrence @deffield created: 2020-11-17 -@deffield lastUpdated: 2022-02-04 +@deffield lastUpdated: 2022-06-30 Overview -------- @@ -74,7 +74,8 @@ class NeptuneCSVReader: def __init__(self, vbatch=1, ebatch=1, java_dates=False, max_rows=sys.maxsize, assume_utc=False, stop_on_error=True, silent_mode=False, - escape_dollar=False, show_summary=True, double_suffix=False): + escape_dollar=False, show_summary=True, double_suffix=False, + skip_spaces=False): self.vertex_batch_size = vbatch self.edge_batch_size = ebatch @@ -96,6 +97,7 @@ def __init__(self, vbatch=1, ebatch=1, java_dates=False, max_rows=sys.maxsize, self.edge_count = 0 self.property_count = 0 self.verbose_summary = False + self.skip_spaces = skip_spaces def get_batch_sizes(self): return {'vbatch': self.vertex_batch_size, @@ -153,6 +155,12 @@ def set_double_suffix(self,suffix:bool): def get_double_suffix(self): return self.double_suffix + def set_skip_spaces(self,skip:bool): + self.skip_spaces = skip + + def get_skip_spaces(self): + return self.skip_spaces + def escape(self,string): escaped = string.replace('"','\\"') return escaped @@ -451,7 +459,7 @@ def process_csv_file(self,fname): self.property_count = 0 try: with open(fname, newline='') as csvfile: - reader = csv.DictReader(csvfile,escapechar="\\") + reader = csv.DictReader(csvfile, skipinitialspace=self.skip_spaces, escapechar="\\") if not '~id' in reader.fieldnames: self.print_error('The header row must include an ~id column') @@ -509,6 +517,13 @@ def process_csv_file(self,fname): help='Suffix all floats and doubles with a "d" such as 12.34d. 
This is helpful\ when using the Gremlin Console or Groovy scripts as it will prevent\ floats and doubles automatically being created as BigDecimal objects.') + parser.add_argument('-skip_spaces', action='store_true', + help='Skip any leading spaces in each column.\ + By defaut this setting is False and any leading spaces\ + will be considered part of the column header or data value.\ + This setting does not apply to values enclosed in quotes\ + such as " abcd".', + default=False) parser.add_argument('-escape_dollar', action='store_true', help='For any dollar signs found convert them to an escaped\ form \$. This is needed if you are going to load the\ @@ -527,4 +542,5 @@ def process_csv_file(self,fname): ncsv.set_escape_dollar(args.escape_dollar) ncsv.set_double_suffix(args.double_suffix) ncsv.set_show_summary(not args.no_summary) + ncsv.set_skip_spaces(args.skip_spaces) ncsv.process_csv_file(args.csvfile) diff --git a/csv-gremlin/test-files/header-with-spaces-edge.csv b/csv-gremlin/test-files/header-with-spaces-edge.csv new file mode 100644 index 00000000..b3605caa --- /dev/null +++ b/csv-gremlin/test-files/header-with-spaces-edge.csv @@ -0,0 +1,3 @@ + ~id, ~label, ~from, ~to +e1, likes, a1, a2 +e2, " likes ", a1, a2 diff --git a/csv-gremlin/test-files/header-with-spaces.csv b/csv-gremlin/test-files/header-with-spaces.csv new file mode 100644 index 00000000..59132a38 --- /dev/null +++ b/csv-gremlin/test-files/header-with-spaces.csv @@ -0,0 +1,4 @@ +~id, ~label, type +a1, animal, cat +a2, " animal ", cat +a3, animal , cat