diff --git a/README.md b/README.md index 787c4386..bb82d6e9 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,32 @@ SLY does not support inheritance, therefore every dialect is described completel - get_string - to return object as sql expression (or sub-expression) - copy - to copy AST-tree to new object +### Error handling + +For better user experience parsing error contains useful information about problem location and possible solution to solve it. +1. it shows location of error if + - character isn't parsed (by lexer) + - token is unexpected (by parser) +2. it tries to propose correct token instead (or before) error location. Possible options + - Keyword will be showed as is. + - '[number]' - if float and integer is expected + - '[string]' - if string is expected + - '[identifier]' - if name of the objects is expected. For example, they are bold words here: + - "select **x** as **name** from **tbl1** where **col**=1" + +How suggestion works: +It uses next possible tokens defined by syntax rules. +If this is the end of the query: just shows these tokens. +Else: +- it tries to replace bad token with other token from list of possible tokens +- tries to parse query once again, if there is no error: + - add this token to suggestion list +- second iteration: put possible token before bad token (instead of replacement) and repeat the same operation. + +Example: +![image](https://github.com/mindsdb/mindsdb_sql/assets/8502631/c4707087-ca6e-47f6-aaba-db3a641947a6) + + # Planner diff --git a/mindsdb_sql/__init__.py b/mindsdb_sql/__init__.py index 342801e9..a963257e 100644 --- a/mindsdb_sql/__init__.py +++ b/mindsdb_sql/__init__.py @@ -1,9 +1,166 @@ import re +from collections import defaultdict + +from sly.lex import Token from mindsdb_sql.exceptions import ParsingException from mindsdb_sql.parser.ast import * +class ErrorHandling: + + def __init__(self, lexer, parser): + self.parser = parser + self.lexer = lexer + + def process(self, error_info): + self.tokens = [t for t in error_info['tokens'] if t is not None] + self.bad_token = error_info['bad_token'] + self.expected_tokens = error_info['expected_tokens'] + + if len(self.tokens) == 0: + return 'Empty input' + + # show error location + msgs = self.error_location() + + # suggestion + suggestions = self.make_suggestion() + + if suggestions: + prefix = 'Possible inputs: ' if len(suggestions) > 1 else 'Expected symbol: ' + msgs.append(prefix + ', '.join([f'"{item}"' for item in suggestions])) + return '\n'.join(msgs) + + def error_location(self): + + # restore query text + lines_idx = defaultdict(str) + + # used + unused tokens + for token in self.tokens: + if token is None: + continue + line = lines_idx[token.lineno] + + if len(line) > token.index: + line = line[: token.index] + else: + line = line.ljust(token.index) + + line += token.value + lines_idx[token.lineno] = line + + msgs = [] + + # error message and location + if self.bad_token is None: + msgs.append('Syntax error, unexpected end of query:') + error_len = 1 + # last line + error_line_num = list(lines_idx.keys())[-1] + error_index = len(lines_idx[error_line_num]) + else: + msgs.append('Syntax error, unknown input:') + error_len = len(self.bad_token.value) + error_line_num = self.bad_token.lineno + error_index = self.bad_token.index + + # shift lines indexes (it removes spaces from beginnings of the lines) + lines = [] + shift = 0 + error_line = 0 + for i, line_num in enumerate(lines_idx.keys()): + if line_num == error_line_num: + error_index -= shift + error_line = i + + line = lines_idx[line_num] + lines.append(line[shift:]) + shift = len(line) + + # add source code + first_line = error_line - 2 if error_line > 1 else 0 + for line in lines[first_line: error_line + 1]: + msgs.append('>' + line) + + # error position + msgs.append('-' * (error_index + 1) + '^' * error_len) + return msgs + + def make_suggestion(self): + if len(self.expected_tokens) == 0: + return [] + + # find error index + error_index = None + for i, token in enumerate(self.tokens): + if token is self.bad_token : + error_index = i + + expected = {} # value: token + + for token_name in self.expected_tokens: + value = getattr(self.lexer, token_name, None) + if token_name == 'ID': + # a lot of other tokens could be ID + expected = {'[identifier]': token_name} + break + elif token_name in ('FLOAT', 'INTEGER'): + expected['[number]'] = token_name + + elif token_name in ('DQUOTE_STRING', 'QUOTE_STRING'): + expected['[string]'] = token_name + + elif isinstance(value, str): + value = value.replace('\\b', '').replace('\\', '') + + # doesn't content regexp + if '\\s' not in value and '|' not in value: + expected[value] = token_name + + suggestions = [] + if len(expected) == 1: + # use only it + first_value = list(expected.keys())[0] + suggestions.append(first_value) + + elif 1 < len(expected) < 20: + if self.bad_token is None: + # if this is the end of query, just show next expected keywords + return list(expected.keys()) + + # not every suggestion satisfy the end of the query. we have to check if it works + for value, token_name in expected.items(): + # make up a token + token = Token() + token.type = token_name + token.value = value + token.end = 0 + token.index = 0 + token.lineno = 0 + + # try to add token + tokens2 = self.tokens[:error_index] + [token] + self.tokens[error_index:] + if self.query_is_valid(tokens2): + suggestions.append(value) + continue + + # try to replace token + tokens2 = self.tokens[:error_index - 1] + [token] + self.tokens[error_index:] + if self.query_is_valid(tokens2): + suggestions.append(value) + continue + + return suggestions + + def query_is_valid(self, tokens): + # try to parse list of tokens + + ast = self.parser.parse(iter(tokens)) + return ast is not None + + def get_lexer_parser(dialect): if dialect == 'sqlite': from mindsdb_sql.parser.lexer import SQLLexer @@ -29,4 +186,12 @@ def parse_sql(sql, dialect='mindsdb'): lexer, parser = get_lexer_parser(dialect) tokens = lexer.tokenize(sql) ast = parser.parse(tokens) + + if ast is None: + + eh = ErrorHandling(lexer, parser) + message = eh.process(parser.error_info) + + raise ParsingException(message) + return ast diff --git a/mindsdb_sql/parser/dialects/mindsdb/lexer.py b/mindsdb_sql/parser/dialects/mindsdb/lexer.py index 6ecc1f2c..5d2f725d 100644 --- a/mindsdb_sql/parser/dialects/mindsdb/lexer.py +++ b/mindsdb_sql/parser/dialects/mindsdb/lexer.py @@ -1,5 +1,6 @@ import re from sly import Lexer +from sly.lex import LexError """ Unfortunately we can't inherit from base SQLLexer, because the order of rules is important. @@ -355,3 +356,25 @@ def SYSTEM_VARIABLE(self, t): t.value = t.value.strip('`') return t + def error(self, t): + + # convert to lines + lines = [] + shift = 0 + error_line = 0 + error_index = 0 + for i, line in enumerate(self.text.split('\n')): + if 0 <= t.index - shift < len(line): + error_line = i + error_index = t.index - shift + lines.append(line) + shift += len(line) + 1 + + msgs = [f'Illegal character {t.value[0]!r}:'] + # show error code + for line in lines[error_line - 1: error_line + 1]: + msgs.append('>' + line) + + msgs.append('-' * (error_index + 1) + '^') + + raise LexError('\n'.join(msgs), t.value, self.index) diff --git a/mindsdb_sql/parser/dialects/mindsdb/parser.py b/mindsdb_sql/parser/dialects/mindsdb/parser.py index 9a656ecf..c8ac9524 100644 --- a/mindsdb_sql/parser/dialects/mindsdb/parser.py +++ b/mindsdb_sql/parser/dialects/mindsdb/parser.py @@ -1219,8 +1219,8 @@ def result_columns(self, p): 'result_column quote_string') def result_column(self, p): col = p.result_column - if col.alias: - raise ParsingException(f'Attempt to provide two aliases for {str(col)}') + # if col.alias: + # raise ParsingException(f'Attempt to provide two aliases for {str(col)}') if hasattr(p, 'dquote_string'): alias = Identifier(p.dquote_string) elif hasattr(p, 'quote_string'): @@ -1599,7 +1599,6 @@ def function_name(self, p): 'COLUMNS', 'COMMIT', 'COMMITTED', - 'CONCAT', 'DATASET', 'DATASETS', 'DATABASE', @@ -1629,7 +1628,6 @@ def function_name(self, p): 'OFFSET', 'ONLY', 'OPEN', - 'PARAMETER', 'PARAMETERS', 'PERSIST', 'PLUGINS', @@ -1764,8 +1762,20 @@ def raw_query(self, p): def empty(self, p): pass - def error(self, p): - if p: - raise ParsingException(f"Syntax error at token {p.type}: \"{p.value}\"") - else: - raise ParsingException("Syntax error at EOF") + def error(self, p, expected_tokens=None): + + if not hasattr(self, 'used_tokens'): + # failback mode if user has another sly version module installed + if p: + raise ParsingException(f"Syntax error at token {p.type}: \"{p.value}\"") + else: + raise ParsingException("Syntax error at EOF") + + # save error info for future usage + self.error_info = dict( + tokens=self.used_tokens.copy() + list(self.tokens), + bad_token=p, + expected_tokens=expected_tokens + ) + # don't raise exception + return diff --git a/mindsdb_sql/parser/dialects/mysql/parser.py b/mindsdb_sql/parser/dialects/mysql/parser.py index fde778b5..bb128562 100644 --- a/mindsdb_sql/parser/dialects/mysql/parser.py +++ b/mindsdb_sql/parser/dialects/mysql/parser.py @@ -972,7 +972,6 @@ def parameter(self, p): 'COLUMNS', 'COMMIT', 'COMMITTED', - 'CONCAT', 'DATABASES', 'DATABASE', 'ENGINE', @@ -992,7 +991,6 @@ def parameter(self, p): 'OFFSET', 'ONLY', 'OPEN', - 'PARAMETER', 'PERSIST', 'PLUGINS', 'PRIVILEGES', diff --git a/mindsdb_sql/parser/parser.py b/mindsdb_sql/parser/parser.py index 91c7c56a..18576c86 100644 --- a/mindsdb_sql/parser/parser.py +++ b/mindsdb_sql/parser/parser.py @@ -766,7 +766,7 @@ def dquote_string(self, p): def empty(self, p): pass - def error(self, p): + def error(self, p, expected_tokens=None): if p: raise ParsingException(f"Syntax error at token {p.type}: \"{p.value}\"") else: diff --git a/sly/lex.py b/sly/lex.py index 5bf473ca..2eb0af3e 100644 --- a/sly/lex.py +++ b/sly/lex.py @@ -31,7 +31,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ----------------------------------------------------------------------------- -__all__ = ['Lexer', 'LexerStateChange'] +__all__ = ['Lexer', 'LexerStateChange', 'Token'] import re import copy diff --git a/sly/yacc.py b/sly/yacc.py index 9b3b7dc6..aafd6af3 100644 --- a/sly/yacc.py +++ b/sly/yacc.py @@ -2030,7 +2030,7 @@ def _build(cls, definitions): # ---------------------------------------------------------------------- # Parsing Support. This is the parsing runtime that users use to # ---------------------------------------------------------------------- - def error(self, token): + def error(self, token, expected_tokens=None): ''' Default error handling function. This may be subclassed. ''' @@ -2076,6 +2076,7 @@ def parse(self, tokens): # Set up the state and symbol stacks self.tokens = tokens + self.used_tokens = [] self.statestack = statestack = [] # Stack of parsing states self.symstack = symstack = [] # Stack of grammar symbols pslice._stack = symstack # Associate the stack with the production @@ -2096,6 +2097,7 @@ def parse(self, tokens): if not lookahead: if not lookaheadstack: lookahead = next(tokens, None) # Get the next token + self.used_tokens.append(lookahead) else: lookahead = lookaheadstack.pop() if not lookahead: @@ -2187,7 +2189,7 @@ def parse(self, tokens): else: errtoken = lookahead - tok = self.error(errtoken) + tok = self.error(errtoken, expected_tokens=list(actions[self.state].keys())) if tok: # User must have done some kind of panic # mode recovery on their own. The