Merge pull request #360 from mindsdb/error-info
Better error messaging
ea-rus authored Mar 14, 2024
2 parents ac6406b + edad4a8 commit 727cd00
Showing 8 changed files with 239 additions and 15 deletions.
26 changes: 26 additions & 0 deletions README.md
@@ -73,6 +73,32 @@ SLY does not support inheritance, therefore every dialect is described completel
- get_string - to return object as sql expression (or sub-expression)
- copy - to copy AST-tree to new object

### Error handling

To improve the user experience, a parsing error contains useful information about the location of the problem and possible ways to fix it.
1. It shows the location of the error when
   - a character cannot be tokenized (by the lexer)
   - a token is unexpected (by the parser)
2. It tries to suggest a correct token in place of (or right before) the error location. Possible suggestions:
   - A keyword is shown as is.
   - '[number]' - if a float or integer is expected
   - '[string]' - if a string is expected
   - '[identifier]' - if the name of an object is expected. For example, the identifiers are the bold words here:
     - "select **x** as **name** from **tbl1** where **col**=1"

How the suggestion works (see the sketch after this list):
It uses the next possible tokens defined by the syntax rules.
If the error is at the end of the query, it simply shows these tokens.
Otherwise:
- it replaces the bad token with a token from the list of possible tokens
- and tries to parse the query again; if there is no error:
  - the token is added to the suggestion list
- on a second iteration, the possible token is put before the bad token (instead of replacing it) and the same check is repeated.
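
A simplified sketch of that loop, assuming a flat list of `tokens`, the index of the bad token, a set of candidate tokens, and a `parse` callable (the real logic lives in `ErrorHandling.make_suggestion` in `mindsdb_sql/__init__.py` below):

```python
def suggest(tokens, bad_index, candidates, parse):
    """Return the candidate tokens that make the query parse again."""
    suggestions = []
    for candidate in candidates:
        # 1) replace the bad token with the candidate
        replaced = tokens[:bad_index] + [candidate] + tokens[bad_index + 1:]
        # 2) or insert the candidate right before the bad token
        inserted = tokens[:bad_index] + [candidate] + tokens[bad_index:]
        if parse(replaced) is not None or parse(inserted) is not None:
            suggestions.append(candidate)
    return suggestions
```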

Example:
![image](https://github.com/mindsdb/mindsdb_sql/assets/8502631/c4707087-ca6e-47f6-aaba-db3a641947a6)
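
A minimal usage sketch (the query and the printed text are only illustrative; `parse_sql` and `ParsingException` are the entry points touched by this change):

```python
from mindsdb_sql import parse_sql
from mindsdb_sql.exceptions import ParsingException

try:
    # a deliberately malformed query
    parse_sql("select from tbl1", dialect="mindsdb")
except ParsingException as e:
    # The message shows the offending line(s), a caret marker under the
    # error position and, when possible, a list of suggested tokens.
    print(e)
```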


# Planner


165 changes: 165 additions & 0 deletions mindsdb_sql/__init__.py
@@ -1,9 +1,166 @@
import re
from collections import defaultdict

from sly.lex import Token

from mindsdb_sql.exceptions import ParsingException
from mindsdb_sql.parser.ast import *


class ErrorHandling:
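"""Builds a human-readable parse error message (location and suggestions)
from the error info collected by the parser; used by parse_sql() below."""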

def __init__(self, lexer, parser):
self.parser = parser
self.lexer = lexer

def process(self, error_info):
self.tokens = [t for t in error_info['tokens'] if t is not None]
self.bad_token = error_info['bad_token']
self.expected_tokens = error_info['expected_tokens']

if len(self.tokens) == 0:
return 'Empty input'

# show error location
msgs = self.error_location()

# suggestion
suggestions = self.make_suggestion()

if suggestions:
prefix = 'Possible inputs: ' if len(suggestions) > 1 else 'Expected symbol: '
msgs.append(prefix + ', '.join([f'"{item}"' for item in suggestions]))
return '\n'.join(msgs)

def error_location(self):

# restore query text
lines_idx = defaultdict(str)

# used + unused tokens
for token in self.tokens:
if token is None:
continue
line = lines_idx[token.lineno]

if len(line) > token.index:
line = line[: token.index]
else:
line = line.ljust(token.index)

line += token.value
lines_idx[token.lineno] = line

msgs = []

# error message and location
if self.bad_token is None:
msgs.append('Syntax error, unexpected end of query:')
error_len = 1
# last line
error_line_num = list(lines_idx.keys())[-1]
error_index = len(lines_idx[error_line_num])
else:
msgs.append('Syntax error, unknown input:')
error_len = len(self.bad_token.value)
error_line_num = self.bad_token.lineno
error_index = self.bad_token.index

# shift line indexes (strips the accumulated padding from the beginning of each line)
lines = []
shift = 0
error_line = 0
for i, line_num in enumerate(lines_idx.keys()):
if line_num == error_line_num:
error_index -= shift
error_line = i

line = lines_idx[line_num]
lines.append(line[shift:])
shift = len(line)

# add source code
first_line = error_line - 2 if error_line > 1 else 0
for line in lines[first_line: error_line + 1]:
msgs.append('>' + line)

# error position
msgs.append('-' * (error_index + 1) + '^' * error_len)
return msgs

def make_suggestion(self):
if len(self.expected_tokens) == 0:
return []

# find error index
error_index = None
for i, token in enumerate(self.tokens):
if token is self.bad_token:
error_index = i

expected = {} # value: token

for token_name in self.expected_tokens:
value = getattr(self.lexer, token_name, None)
if token_name == 'ID':
# many other token types can also be matched as an identifier
expected = {'[identifier]': token_name}
break
elif token_name in ('FLOAT', 'INTEGER'):
expected['[number]'] = token_name

elif token_name in ('DQUOTE_STRING', 'QUOTE_STRING'):
expected['[string]'] = token_name

elif isinstance(value, str):
value = value.replace('\\b', '').replace('\\', '')

# only suggest plain values that don't contain regexp syntax
if '\\s' not in value and '|' not in value:
expected[value] = token_name

suggestions = []
if len(expected) == 1:
# use the single expected token as the suggestion
first_value = list(expected.keys())[0]
suggestions.append(first_value)

elif 1 < len(expected) < 20:
if self.bad_token is None:
# if this is the end of the query, just show the next expected keywords
return list(expected.keys())

# not every suggestion satisfies the rest of the query, so check whether it actually parses
for value, token_name in expected.items():
# make up a token
token = Token()
token.type = token_name
token.value = value
token.end = 0
token.index = 0
token.lineno = 0

# try to add token
tokens2 = self.tokens[:error_index] + [token] + self.tokens[error_index:]
if self.query_is_valid(tokens2):
suggestions.append(value)
continue

# try to replace token
tokens2 = self.tokens[:error_index - 1] + [token] + self.tokens[error_index:]
if self.query_is_valid(tokens2):
suggestions.append(value)
continue

return suggestions

def query_is_valid(self, tokens):
# try to parse list of tokens

ast = self.parser.parse(iter(tokens))
return ast is not None


def get_lexer_parser(dialect):
if dialect == 'sqlite':
from mindsdb_sql.parser.lexer import SQLLexer
@@ -29,4 +186,12 @@ def parse_sql(sql, dialect='mindsdb'):
lexer, parser = get_lexer_parser(dialect)
tokens = lexer.tokenize(sql)
ast = parser.parse(tokens)

if ast is None:

eh = ErrorHandling(lexer, parser)
message = eh.process(parser.error_info)

raise ParsingException(message)

return ast
23 changes: 23 additions & 0 deletions mindsdb_sql/parser/dialects/mindsdb/lexer.py
@@ -1,5 +1,6 @@
import re
from sly import Lexer
from sly.lex import LexError

"""
Unfortunately we can't inherit from base SQLLexer, because the order of rules is important.
@@ -355,3 +356,25 @@ def SYSTEM_VARIABLE(self, t):
t.value = t.value.strip('`')
return t

def error(self, t):

# split the text into lines and locate the error position
lines = []
shift = 0
error_line = 0
error_index = 0
for i, line in enumerate(self.text.split('\n')):
if 0 <= t.index - shift < len(line):
error_line = i
error_index = t.index - shift
lines.append(line)
shift += len(line) + 1

msgs = [f'Illegal character {t.value[0]!r}:']
# show the source code around the error
for line in lines[max(error_line - 1, 0): error_line + 1]:
msgs.append('>' + line)

msgs.append('-' * (error_index + 1) + '^')

raise LexError('\n'.join(msgs), t.value, self.index)
28 changes: 19 additions & 9 deletions mindsdb_sql/parser/dialects/mindsdb/parser.py
@@ -1219,8 +1219,8 @@ def result_columns(self, p):
'result_column quote_string')
def result_column(self, p):
col = p.result_column
if col.alias:
raise ParsingException(f'Attempt to provide two aliases for {str(col)}')
# if col.alias:
# raise ParsingException(f'Attempt to provide two aliases for {str(col)}')
if hasattr(p, 'dquote_string'):
alias = Identifier(p.dquote_string)
elif hasattr(p, 'quote_string'):
@@ -1599,7 +1599,6 @@ def function_name(self, p):
'COLUMNS',
'COMMIT',
'COMMITTED',
'CONCAT',
'DATASET',
'DATASETS',
'DATABASE',
@@ -1629,7 +1628,6 @@ def function_name(self, p):
'OFFSET',
'ONLY',
'OPEN',
'PARAMETER',
'PARAMETERS',
'PERSIST',
'PLUGINS',
@@ -1764,8 +1762,20 @@ def raw_query(self, p):
def empty(self, p):
pass

def error(self, p):
if p:
raise ParsingException(f"Syntax error at token {p.type}: \"{p.value}\"")
else:
raise ParsingException("Syntax error at EOF")
def error(self, p, expected_tokens=None):

if not hasattr(self, 'used_tokens'):
# fallback mode if the user has a different version of the sly module installed
if p:
raise ParsingException(f"Syntax error at token {p.type}: \"{p.value}\"")
else:
raise ParsingException("Syntax error at EOF")

# save error info for future usage
self.error_info = dict(
tokens=self.used_tokens.copy() + list(self.tokens),
bad_token=p,
expected_tokens=expected_tokens
)
# don't raise exception
return
2 changes: 0 additions & 2 deletions mindsdb_sql/parser/dialects/mysql/parser.py
@@ -972,7 +972,6 @@ def parameter(self, p):
'COLUMNS',
'COMMIT',
'COMMITTED',
'CONCAT',
'DATABASES',
'DATABASE',
'ENGINE',
@@ -992,7 +991,6 @@ def parameter(self, p):
'OFFSET',
'ONLY',
'OPEN',
'PARAMETER',
'PERSIST',
'PLUGINS',
'PRIVILEGES',
2 changes: 1 addition & 1 deletion mindsdb_sql/parser/parser.py
@@ -766,7 +766,7 @@ def dquote_string(self, p):
def empty(self, p):
pass

def error(self, p):
def error(self, p, expected_tokens=None):
if p:
raise ParsingException(f"Syntax error at token {p.type}: \"{p.value}\"")
else:
2 changes: 1 addition & 1 deletion sly/lex.py
@@ -31,7 +31,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__all__ = ['Lexer', 'LexerStateChange']
__all__ = ['Lexer', 'LexerStateChange', 'Token']

import re
import copy
6 changes: 4 additions & 2 deletions sly/yacc.py
@@ -2030,7 +2030,7 @@ def _build(cls, definitions):
# ----------------------------------------------------------------------
# Parsing Support. This is the parsing runtime that users use to
# ----------------------------------------------------------------------
def error(self, token):
def error(self, token, expected_tokens=None):
'''
Default error handling function. This may be subclassed.
'''
@@ -2076,6 +2076,7 @@ def parse(self, tokens):

# Set up the state and symbol stacks
self.tokens = tokens
self.used_tokens = []
self.statestack = statestack = [] # Stack of parsing states
self.symstack = symstack = [] # Stack of grammar symbols
pslice._stack = symstack # Associate the stack with the production
@@ -2096,6 +2097,7 @@ def parse(self, tokens):
if not lookahead:
if not lookaheadstack:
lookahead = next(tokens, None) # Get the next token
self.used_tokens.append(lookahead)
else:
lookahead = lookaheadstack.pop()
if not lookahead:
@@ -2187,7 +2189,7 @@ def parse(self, tokens):
else:
errtoken = lookahead

tok = self.error(errtoken)
tok = self.error(errtoken, expected_tokens=list(actions[self.state].keys()))
if tok:
# User must have done some kind of panic
# mode recovery on their own. The
