Doubling of STATE_COUNT following dollar_quote feature merging #209

antoineB · 2023-10-26T11:35:47Z

As of 5660d80, state_count = 13456, roughly double what it was before, and the size of the binary is 1.8 MB (previously 1.2 MB).

Either

index 69d0a1d..2421aa2 100644
--- a/grammar.js
+++ b/grammar.js
@@ -1216,7 +1216,7 @@ module.exports = grammar({
         $.keyword_as,
         alias($._dollar_quoted_string_start_tag, $.dollar_quote),
         $._function_body_statement,
-        optional(';'),
+        ';',
         alias($._dollar_quoted_string_end_tag, $.dollar_quote),
       ),
     ),

Or

index 69d0a1d..e8faa61 100644
--- a/grammar.js
+++ b/grammar.js
@@ -1217,7 +1217,7 @@ module.exports = grammar({
         alias($._dollar_quoted_string_start_tag, $.dollar_quote),
         $._function_body_statement,
         optional(';'),
-        alias($._dollar_quoted_string_end_tag, $.dollar_quote),
+        $.dollar_quote,
       ),
     ),

Either change fixes the state_count explosion, but I don't know why.

The text was updated successfully, but these errors were encountered:

antoineB · 2023-10-29T13:11:54Z

I tried to stay as close as possible to what we have, but unfortunately it did not work. I suppose that is because $.dollar_quote and $._dollar_quoted_string_end_tag lex the same chars.

index 69d0a1d..606f042 100644
--- a/grammar.js
+++ b/grammar.js
@@ -21,6 +21,7 @@ module.exports = grammar({
     [$.between_expression, $.binary_expression],
     [$.time],
     [$.timestamp],
+    [$._function_body_rule_a, $._function_body_rule_b],
   ],
 
   precedences: $ => [
@@ -1212,13 +1213,39 @@ module.exports = grammar({
           $.literal
         ),
       ),
-      seq(
-        $.keyword_as,
-        alias($._dollar_quoted_string_start_tag, $.dollar_quote),
-        $._function_body_statement,
-        optional(';'),
-        alias($._dollar_quoted_string_end_tag, $.dollar_quote),
-      ),
+      // This should be one rule, but it is decomposed into 2 rules (A, B) to
+      // avoid an increase in the number of parser states.
+      //
+      // It will parse valid SQL, but rule B will not fail if there is an unbalanced
+      // dollar_quote or if there is another dollar_quote inside
+      // $._function_body_statement with the same tag as the external dollar_quote.
+      //
+      // In the long run the whole function body should be parsed by another
+      // parser based on the language specified before.
+      //
+      // Rule A
+      $._function_body_rule_a,
+      // Rule B
+      $._function_body_rule_b,
+    ),
+
+    _function_body_rule_a: $ => seq(
+      $.keyword_as,
+      alias($._dollar_quoted_string_start_tag, $.dollar_quote),
+      $._function_body_statement,
+      ';', // This should be optional(';'), but that would create a lot (~6000)
+      // of states from any ending of $.statement to
+      // $._dollar_quoted_string_start_tag.
+      alias($._dollar_quoted_string_end_tag, $.dollar_quote),
+    ),
+
+    _function_body_rule_b: $ => seq(
+      $.keyword_as,
+      $.dollar_quote,
+      $._function_body_statement,
+      // There is an optimisation in the compiler, so it doesn't generate a state
+      // for every possible ending of $.statement followed by $.dollar_quote.
+      $.dollar_quote,
     ),
 
     function_language: $ => seq(

antoineB · 2023-10-29T14:50:09Z

I forgot to capture one of the conflicting rules:

index 69d0a1d..e6e3e21 100644
--- a/grammar.js
+++ b/grammar.js
@@ -21,6 +21,7 @@ module.exports = grammar({
     [$.between_expression, $.binary_expression],
     [$.time],
     [$.timestamp],
+    [$._function_body_a, $._function_body_b],
   ],
 
   precedences: $ => [
@@ -1180,6 +1181,29 @@ module.exports = grammar({
         ),
         $.keyword_end,
       ),
+      seq(
+        $.keyword_as,
+        alias(
+          choice(
+            $._single_quote_string,
+            $._double_quote_string,
+          ),
+          $.literal
+        ),
+      ),
+      $._function_body_a,
+      $._function_body_b,
+    ),
+    // This should be one rule, but it is decomposed into 2 rules (A, B) to
+    // avoid an increase in the number of parser states.
+    //
+    // It will parse valid SQL, but rule B will not fail if there is an unbalanced
+    // dollar_quote or if there is another dollar_quote inside
+    // $._function_body_statement with the same tag as the external dollar_quote.
+    //
+    // In the long run the whole function body should be parsed by another
+    // parser based on the language specified before.
+    _function_body_a: $ => choice(
       seq(
         $.keyword_as,
         alias($._dollar_quoted_string_start_tag, $.dollar_quote),
@@ -1202,24 +1226,24 @@ module.exports = grammar({
         optional(';'),
         alias($._dollar_quoted_string_end_tag, $.dollar_quote),
       ),
-      seq(
-        $.keyword_as,
-        alias(
-          choice(
-            $._single_quote_string,
-            $._double_quote_string,
-          ),
-          $.literal
-        ),
-      ),
       seq(
         $.keyword_as,
         alias($._dollar_quoted_string_start_tag, $.dollar_quote),
         $._function_body_statement,
-        optional(';'),
+        ';', // This should be optional(';'), but that would create a lot (~6000)
+             // of states from any ending of $.statement to
+             // $._dollar_quoted_string_start_tag.
         alias($._dollar_quoted_string_end_tag, $.dollar_quote),
       ),
     ),
+    _function_body_b: $ => seq(
+      $.keyword_as,
+      $.dollar_quote,
+      $._function_body_statement,
+      // There is an optimisation in the compiler, so it doesn't generate a state
+      // for every possible ending of $.statement followed by $.dollar_quote.
+      $.dollar_quote,
+    ),
 
     function_language: $ => seq(
       $.keyword_language,

But that still doesn't work.
I suppose the tree-sitter compiler assumes, in order to do some optimisations, that 2 tokens ($._dollar_quoted_string_end_tag and $.dollar_quote in our case) can't match the same chars.

And the solution described in the docs (https://tree-sitter.github.io/tree-sitter/creating-parsers#other-external-scanner-details) like

  externals: $ => [
    $._dollar_quoted_string_start_tag,
    $._dollar_quote,
    $._dollar_quoted_string,
  ],

The two solutions for me are:

replace optional(';') with ';'
don't use $._dollar_quoted_string_start_tag and $._dollar_quoted_string_end_tag, and replace them with $.dollar_quote

antoineB mentioned this issue Oct 29, 2023

fix: Change the parser to require a semicolon in function_body of 1 s… #216

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Doubling of STATE_COUNT following dollar_quote feature merging #209

Doubling of STATE_COUNT following dollar_quote feature merging #209

antoineB commented Oct 26, 2023

antoineB commented Oct 29, 2023

antoineB commented Oct 29, 2023

Doubling of STATE_COUNT following dollar_quote feature merging #209

Doubling of STATE_COUNT following dollar_quote feature merging #209

Comments

antoineB commented Oct 26, 2023

antoineB commented Oct 29, 2023

antoineB commented Oct 29, 2023