diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 36e6f8526..f54a888fa 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -44,7 +44,7 @@ jobs: echo "::set-output name=id::$MATRIX_ID" - name: Run tests run: | - pytest --cov=outlines + pytest -x --cov=outlines env: COVERAGE_FILE: .coverage.${{ steps.matrix-id.outputs.id }} - name: Upload coverage data diff --git a/benchmarks/bench_json_schema.py b/benchmarks/bench_json_schema.py index 62d9b3c1d..3a1f72cb6 100644 --- a/benchmarks/bench_json_schema.py +++ b/benchmarks/bench_json_schema.py @@ -1,6 +1,7 @@ +from outlines_core.fsm.json_schema import build_regex_from_schema + from outlines.caching import cache_disabled from outlines.fsm.guide import RegexGuide -from outlines.fsm.json_schema import build_regex_from_schema from .common import setup_tokenizer # noqa: E402 @@ -70,10 +71,6 @@ def setup(self, schema_name): self.tokenizer = setup_tokenizer() self.schema = schemas[schema_name] - @cache_disabled() - def time_json_schema_to_regex(self, schema_name): - build_regex_from_schema(self.schema) - @cache_disabled() def time_json_schema_to_fsm(self, schema_name): regex = build_regex_from_schema(self.schema) diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 0bab57923..578ee7626 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -1,90 +1,10 @@ import inspect import json -import re import warnings from enum import Enum -from typing import Callable, Optional, Tuple, Type, Union +from typing import Callable, Type, Union -from jsonschema.protocols import Validator from pydantic import BaseModel, create_model -from referencing import Registry, Resource -from referencing._core import Resolver -from referencing.jsonschema import DRAFT202012 - -# allow `\"`, `\\`, or any character which isn't a control sequence -STRING_INNER = r'([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])' -STRING = f'"{STRING_INNER}*"' - -INTEGER = r"(-)?(0|[1-9][0-9]*)" -NUMBER = rf"({INTEGER})(\.[0-9]+)?([eE][+-][0-9]+)?" -BOOLEAN = r"(true|false)" -NULL = r"null" -WHITESPACE = r"[ ]?" - -type_to_regex = { - "string": STRING, - "integer": INTEGER, - "number": NUMBER, - "boolean": BOOLEAN, - "null": NULL, -} - -DATE_TIME = r'"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?"' -DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"' -TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"' -UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"' - -format_to_regex = { - "uuid": UUID, - "date-time": DATE_TIME, - "date": DATE, - "time": TIME, -} - - -def build_regex_from_schema(schema: str, whitespace_pattern: Optional[str] = None): - """Turn a JSON schema into a regex that matches any JSON object that follows - this schema. - - JSON Schema is a declarative language that allows to annotate JSON documents - with types and descriptions. These schemas can be generated from any Python - datastructure that has type annotation: namedtuples, dataclasses, Pydantic - models. And by ensuring that the generation respects the schema we ensure - that the output can be parsed into these objects. - This function parses the provided schema and builds a generation schedule which - mixes deterministic generation (fixed strings), and sampling with constraints. - - Parameters - ---------- - schema - A string that represents a JSON Schema. - whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` - - Returns - ------- - A generation schedule. A list of strings that represent the JSON - schema's structure and regular expression that define the structure of - the fields. - - References - ---------- - .. [0] JSON Schema. https://json-schema.org/ - - """ - - schema = json.loads(schema) - Validator.check_schema(schema) - - # Build reference resolver - schema = Resource(contents=schema, specification=DRAFT202012) - uri = schema.id() if schema.id() is not None else "" - registry = Registry().with_resource(uri=uri, resource=schema) - resolver = registry.resolver() - - content = schema.contents - return to_regex(resolver, content, whitespace_pattern) def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str: @@ -120,413 +40,6 @@ def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) - return schema_str -def _get_num_items_pattern(min_items, max_items, whitespace_pattern): - # Helper function for arrays and objects - min_items = int(min_items or 0) - if max_items is None: - return rf"{{{max(min_items - 1, 0)},}}" - else: - max_items = int(max_items) - if max_items < 1: - return None - return rf"{{{max(min_items - 1, 0)},{max_items - 1}}}" - - -def validate_quantifiers( - min_bound: Optional[str], max_bound: Optional[str], start_offset: int = 0 -) -> Tuple[str, str]: - """ - Ensures that the bounds of a number are valid. Bounds are used as quantifiers in the regex. - - Parameters - ---------- - min_bound - The minimum value that the number can take. - max_bound - The maximum value that the number can take. - start_offset - Number of elements that are already present in the regex but still need to be counted. - ex: if the regex is already "(-)?(0|[1-9][0-9])", we will always have at least 1 digit, so the start_offset is 1. - - Returns - ------- - min_bound - The minimum value that the number can take. - max_bound - The maximum value that the number can take. - - Raises - ------ - ValueError - If the minimum bound is greater than the maximum bound. - - TypeError or ValueError - If the minimum bound is not an integer or None. - or - If the maximum bound is not an integer or None. - """ - min_bound = "" if min_bound is None else str(int(min_bound) - start_offset) - max_bound = "" if max_bound is None else str(int(max_bound) - start_offset) - if min_bound and max_bound: - if int(max_bound) < int(min_bound): - raise ValueError("max bound must be greater than or equal to min bound") - return min_bound, max_bound - - -def to_regex( - resolver: Resolver, instance: dict, whitespace_pattern: Optional[str] = None -): - """Translate a JSON Schema instance into a regex that validates the schema. - - Note - ---- - Many features of JSON schema are missing: - - Handle `additionalProperties` keyword - - Handle types defined as a list - - Handle constraints on numbers - - Handle special patterns: `date`, `uri`, etc. - - This does not support recursive definitions. - - Parameters - ---------- - resolver - An object that resolves references to other instances within a schema - instance - The instance to translate - whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` - """ - - # set whitespace pattern - if whitespace_pattern is None: - whitespace_pattern = WHITESPACE - - if instance == {}: - # JSON Schema Spec: Empty object means unconstrained, any json type is legal - types = [ - {"type": "boolean"}, - {"type": "null"}, - {"type": "number"}, - {"type": "integer"}, - {"type": "string"}, - {"type": "array"}, - {"type": "object"}, - ] - regexes = [to_regex(resolver, t, whitespace_pattern) for t in types] - regexes = [rf"({r})" for r in regexes] - return rf"{'|'.join(regexes)}" - - elif "properties" in instance: - regex = "" - regex += r"\{" - properties = instance["properties"] - required_properties = instance.get("required", []) - is_required = [item in required_properties for item in properties] - # If at least one property is required, we include the one in the lastest position - # without any comma. - # For each property before it (optional or required), we add with a comma after the property. - # For each property after it (optional), we add with a comma before the property. - if any(is_required): - last_required_pos = max([i for i, value in enumerate(is_required) if value]) - for i, (name, value) in enumerate(properties.items()): - subregex = f'{whitespace_pattern}"{re.escape(name)}"{whitespace_pattern}:{whitespace_pattern}' - subregex += to_regex(resolver, value, whitespace_pattern) - if i < last_required_pos: - subregex = f"{subregex}{whitespace_pattern}," - elif i > last_required_pos: - subregex = f"{whitespace_pattern},{subregex}" - regex += subregex if is_required[i] else f"({subregex})?" - # If no property is required, we have to create a possible pattern for each property in which - # it's the last one necessarilly present. Then, we add the others as optional before and after - # following the same strategy as described above. - # The whole block is made optional to allow the case in which no property is returned. - else: - property_subregexes = [] - for i, (name, value) in enumerate(properties.items()): - subregex = f'{whitespace_pattern}"{name}"{whitespace_pattern}:{whitespace_pattern}' - subregex += to_regex(resolver, value, whitespace_pattern) - property_subregexes.append(subregex) - possible_patterns = [] - for i in range(len(property_subregexes)): - pattern = "" - for subregex in property_subregexes[:i]: - pattern += f"({subregex}{whitespace_pattern},)?" - pattern += property_subregexes[i] - for subregex in property_subregexes[i + 1 :]: - pattern += f"({whitespace_pattern},{subregex})?" - possible_patterns.append(pattern) - regex += f"({'|'.join(possible_patterns)})?" - - regex += f"{whitespace_pattern}" + r"\}" - - return regex - - # To validate against allOf, the given data must be valid against all of the - # given subschemas. - elif "allOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["allOf"] - ] - subregexes_str = [f"{subregex}" for subregex in subregexes] - return rf"({''.join(subregexes_str)})" - - # To validate against `anyOf`, the given data must be valid against - # any (one or more) of the given subschemas. - elif "anyOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["anyOf"] - ] - return rf"({'|'.join(subregexes)})" - - # To validate against oneOf, the given data must be valid against exactly - # one of the given subschemas. - elif "oneOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["oneOf"] - ] - - xor_patterns = [f"(?:{subregex})" for subregex in subregexes] - - return rf"({'|'.join(xor_patterns)})" - - # Create pattern for Tuples, per JSON Schema spec, `prefixItems` determines types at each idx - elif "prefixItems" in instance: - element_patterns = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["prefixItems"] - ] - comma_split_pattern = rf"{whitespace_pattern},{whitespace_pattern}" - tuple_inner = comma_split_pattern.join(element_patterns) - return rf"\[{whitespace_pattern}{tuple_inner}{whitespace_pattern}\]" - - # The enum keyword is used to restrict a value to a fixed set of values. It - # must be an array with at least one element, where each element is unique. - elif "enum" in instance: - choices = [] - for choice in instance["enum"]: - if type(choice) in [int, float, bool, type(None), str]: - choices.append(re.escape(json.dumps(choice))) - elif isinstance(choice, dict): - choices.append(to_regex(resolver, choice, whitespace_pattern)) - else: - raise TypeError(f"Unsupported data type in enum: {type(choice)}") - return f"({'|'.join(choices)})" - - elif "const" in instance: - const = instance["const"] - if type(const) in [int, float, bool, type(None), str]: - const = re.escape(json.dumps(const)) - else: - raise TypeError(f"Unsupported data type in const: {type(const)}") - return const - - elif "$ref" in instance: - path = f"{instance['$ref']}" - instance = resolver.lookup(path).contents - return to_regex(resolver, instance, whitespace_pattern) - - # The type keyword may either be a string or an array: - # - If it's a string, it is the name of one of the basic types. - # - If it is an array, it must be an array of strings, where each string is - # the name of one of the basic types, and each element is unique. In this - # case, the JSON snippet is valid if it matches any of the given types. - elif "type" in instance: - instance_type = instance["type"] - if instance_type == "string": - if "maxLength" in instance or "minLength" in instance: - max_items = instance.get("maxLength", "") - min_items = instance.get("minLength", "") - try: - if int(max_items) < int(min_items): - raise ValueError( - "maxLength must be greater than or equal to minLength" - ) # FIXME this raises an error but is caught right away by the except (meant for int("") I assume) - except ValueError: - pass - return f'"{STRING_INNER}{{{min_items},{max_items}}}"' - elif "pattern" in instance: - pattern = instance["pattern"] - if pattern[0] == "^" and pattern[-1] == "$": - return rf'("{pattern[1:-1]}")' - else: - return rf'("{pattern}")' - elif "format" in instance: - format = instance["format"] - if format == "date-time": - return format_to_regex["date-time"] - elif format == "uuid": - return format_to_regex["uuid"] - elif format == "date": - return format_to_regex["date"] - elif format == "time": - return format_to_regex["time"] - else: - raise NotImplementedError( - f"Format {format} is not supported by Outlines" - ) - else: - return type_to_regex["string"] - - elif instance_type == "number": - bounds = { - "minDigitsInteger", - "maxDigitsInteger", - "minDigitsFraction", - "maxDigitsFraction", - "minDigitsExponent", - "maxDigitsExponent", - } - if bounds.intersection(set(instance.keys())): - min_digits_integer, max_digits_integer = validate_quantifiers( - instance.get("minDigitsInteger"), - instance.get("maxDigitsInteger"), - start_offset=1, - ) - min_digits_fraction, max_digits_fraction = validate_quantifiers( - instance.get("minDigitsFraction"), instance.get("maxDigitsFraction") - ) - min_digits_exponent, max_digits_exponent = validate_quantifiers( - instance.get("minDigitsExponent"), instance.get("maxDigitsExponent") - ) - integers_quantifier = ( - f"{{{min_digits_integer},{max_digits_integer}}}" - if min_digits_integer or max_digits_integer - else "*" - ) - fraction_quantifier = ( - f"{{{min_digits_fraction},{max_digits_fraction}}}" - if min_digits_fraction or max_digits_fraction - else "+" - ) - exponent_quantifier = ( - f"{{{min_digits_exponent},{max_digits_exponent}}}" - if min_digits_exponent or max_digits_exponent - else "+" - ) - return rf"((-)?(0|[1-9][0-9]{integers_quantifier}))(\.[0-9]{fraction_quantifier})?([eE][+-][0-9]{exponent_quantifier})?" - return type_to_regex["number"] - - elif instance_type == "integer": - if "minDigits" in instance or "maxDigits" in instance: - min_digits, max_digits = validate_quantifiers( - instance.get("minDigits"), instance.get("maxDigits"), start_offset=1 - ) - return rf"(-)?(0|[1-9][0-9]{{{min_digits},{max_digits}}})" - return type_to_regex["integer"] - - elif instance_type == "array": - num_repeats = _get_num_items_pattern( - instance.get("minItems"), instance.get("maxItems"), whitespace_pattern - ) - if num_repeats is None: - return rf"\[{whitespace_pattern}\]" - - allow_empty = "?" if int(instance.get("minItems", 0)) == 0 else "" - - if "items" in instance: - items_regex = to_regex(resolver, instance["items"], whitespace_pattern) - return rf"\[{whitespace_pattern}(({items_regex})(,{whitespace_pattern}({items_regex})){num_repeats}){allow_empty}{whitespace_pattern}\]" - else: - # Here we need to make the choice to exclude generating list of objects - # if the specification of the object is not given, even though a JSON - # object that contains an object here would be valid under the specification. - legal_types = [ - {"type": "boolean"}, - {"type": "null"}, - {"type": "number"}, - {"type": "integer"}, - {"type": "string"}, - ] - depth = instance.get("depth", 2) - if depth > 0: - legal_types.append({"type": "object", "depth": depth - 1}) - legal_types.append({"type": "array", "depth": depth - 1}) - - regexes = [ - to_regex(resolver, t, whitespace_pattern) for t in legal_types - ] - return rf"\[{whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}{allow_empty}{whitespace_pattern}\]" - - elif instance_type == "object": - # pattern for json object with values defined by instance["additionalProperties"] - # enforces value type constraints recursively, "minProperties", and "maxProperties" - # doesn't enforce "required", "dependencies", "propertyNames" "any/all/on Of" - num_repeats = _get_num_items_pattern( - instance.get("minProperties"), - instance.get("maxProperties"), - whitespace_pattern, - ) - if num_repeats is None: - return rf"\{{{whitespace_pattern}\}}" - - allow_empty = "?" if int(instance.get("minProperties", 0)) == 0 else "" - - additional_properties = instance.get("additionalProperties") - - if additional_properties is None or additional_properties is True: - # JSON Schema behavior: If the additionalProperties of an object is - # unset or True, it is unconstrained object. - # We handle this by setting additionalProperties to anyOf: {all types} - - legal_types = [ - {"type": "string"}, - {"type": "number"}, - {"type": "boolean"}, - {"type": "null"}, - ] - - # We set the object depth to 2 to keep the expression finite, but the "depth" - # key is not a true component of the JSON Schema specification. - depth = instance.get("depth", 2) - if depth > 0: - legal_types.append({"type": "object", "depth": depth - 1}) - legal_types.append({"type": "array", "depth": depth - 1}) - additional_properties = {"anyOf": legal_types} - - value_pattern = to_regex( - resolver, additional_properties, whitespace_pattern - ) - key_value_pattern = ( - f"{STRING}{whitespace_pattern}:{whitespace_pattern}{value_pattern}" - ) - key_value_successor_pattern = ( - f"{whitespace_pattern},{whitespace_pattern}{key_value_pattern}" - ) - multiple_key_value_pattern = f"({key_value_pattern}({key_value_successor_pattern}){num_repeats}){allow_empty}" - - return ( - r"\{" - + whitespace_pattern - + multiple_key_value_pattern - + whitespace_pattern - + r"\}" - ) - - elif instance_type == "boolean": - return type_to_regex["boolean"] - - elif instance_type == "null": - return type_to_regex["null"] - - elif isinstance(instance_type, list): - # Here we need to make the choice to exclude generating an object - # if the specification of the object is not give, even though a JSON - # object that contains an object here would be valid under the specification. - regexes = [ - to_regex(resolver, {"type": t}, whitespace_pattern) - for t in instance_type - if t != "object" - ] - return rf"({'|'.join(regexes)})" - - raise NotImplementedError( - f"""Could not translate the instance {instance} to a - regular expression. Make sure it is valid to the JSON Schema specification. If - it is, please open an issue on the Outlines repository""" - ) - - def get_schema_from_signature(fn: Callable) -> dict: """Turn a function signature into a JSON schema. @@ -561,8 +74,10 @@ def get_schema_from_enum(myenum: type[Enum]) -> dict: f"Your enum class {myenum.__name__} has 0 members. If you are working with an enum of functions, do not forget to register them as callable (using `partial` for instance)" ) choices = [ - get_schema_from_signature(elt.value.func) if callable(elt.value) else elt.value + get_schema_from_signature(elt.value.func) + if callable(elt.value) + else {"const": elt.value} for elt in myenum ] - schema = {"title": myenum.__name__, "enum": choices} + schema = {"title": myenum.__name__, "oneOf": choices} return schema diff --git a/outlines/generate/choice.py b/outlines/generate/choice.py index 75fc71271..afb998f52 100644 --- a/outlines/generate/choice.py +++ b/outlines/generate/choice.py @@ -4,7 +4,9 @@ from functools import singledispatch from typing import Callable, List, Union -from outlines.fsm.json_schema import build_regex_from_schema, get_schema_from_enum +from outlines_core.fsm.json_schema import build_regex_from_schema + +from outlines.fsm.json_schema import get_schema_from_enum from outlines.generate.api import SequenceGeneratorAdapter from outlines.models import OpenAI from outlines.samplers import Sampler, multinomial diff --git a/outlines/generate/json.py b/outlines/generate/json.py index 703447958..d098d920d 100644 --- a/outlines/generate/json.py +++ b/outlines/generate/json.py @@ -3,13 +3,10 @@ from functools import singledispatch from typing import Callable, Optional, Union +from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel -from outlines.fsm.json_schema import ( - build_regex_from_schema, - get_schema_from_enum, - get_schema_from_signature, -) +from outlines.fsm.json_schema import get_schema_from_enum, get_schema_from_signature from outlines.generate.api import SequenceGeneratorAdapter from outlines.models import OpenAI from outlines.samplers import Sampler, multinomial diff --git a/outlines/processors/base_logits_processor.py b/outlines/processors/base_logits_processor.py index eec7de121..44b55af2e 100644 --- a/outlines/processors/base_logits_processor.py +++ b/outlines/processors/base_logits_processor.py @@ -107,9 +107,12 @@ def _to_torch(tensor_like: Array) -> torch.Tensor: return torch.tensor(tensor_like) elif is_mlx_array_type(type(tensor_like)): - # mlx -> torch -> mlx conversion docs: - # https://ml-explore.github.io/mlx/build/html/usage/numpy.html - return torch.from_dlpack(tensor_like) + import mlx.core as mx + + # https://ml-explore.github.io/mlx/build/html/usage/numpy.html#pytorch + return torch.from_dlpack( + np.array(tensor_like.astype(mx.float32), copy=False) + ) elif is_jax_array_type(type(tensor_like)): import jax diff --git a/outlines/processors/structured.py b/outlines/processors/structured.py index d2bc15f77..64892b73f 100644 --- a/outlines/processors/structured.py +++ b/outlines/processors/structured.py @@ -27,10 +27,11 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union import torch +from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel from outlines.fsm.guide import CFGGuide, Guide, RegexGuide -from outlines.fsm.json_schema import build_regex_from_schema, convert_json_schema_to_str +from outlines.fsm.json_schema import convert_json_schema_to_str from .base_logits_processor import OutlinesLogitsProcessor diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index 6f0b59c50..23864e029 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -1,31 +1,14 @@ import json -import re from contextlib import nullcontext from enum import Enum from functools import partial -from typing import List, Literal, Union +from typing import List -import interegular import pytest -from pydantic import BaseModel, Field, constr +from outlines_core.fsm.json_schema import build_regex_from_schema +from pydantic import BaseModel, constr -from outlines.fsm.json_schema import ( - BOOLEAN, - DATE, - DATE_TIME, - INTEGER, - NULL, - NUMBER, - STRING, - STRING_INNER, - TIME, - UUID, - WHITESPACE, - build_regex_from_schema, - get_schema_from_enum, - get_schema_from_signature, - to_regex, -) +from outlines.fsm.json_schema import get_schema_from_enum, get_schema_from_signature def test_function_basic(): @@ -58,1011 +41,8 @@ class User(BaseModel): is_true: bool schema = json.dumps(User.model_json_schema()) - schedule = build_regex_from_schema(schema) - assert isinstance(schedule, str) - - -@pytest.mark.parametrize( - "pattern,does_match", - [ - ({"integer": "0"}, True), - ({"integer": "1"}, True), - ({"integer": "-1"}, True), - ({"integer": "01"}, False), - ({"integer": "1.3"}, False), - ({"integer": "t"}, False), - ], -) -def test_match_integer(pattern, does_match): - step = {"title": "Foo", "type": "integer"} - regex = to_regex(None, step) - assert regex == INTEGER - - value = pattern["integer"] - match = re.fullmatch(regex, value) - if does_match: - assert match[0] == value - assert match.span() == (0, len(value)) - else: - assert match is None - - -@pytest.mark.parametrize( - "pattern,does_match", - [ - ({"number": "1"}, True), - ({"number": "0"}, True), - ({"number": "01"}, False), - ({"number": ".3"}, False), - ({"number": "1.3"}, True), - ({"number": "-1.3"}, True), - ({"number": "1.3e9"}, False), - ({"number": "1.3e+9"}, True), - ], -) -def test_match_number(pattern, does_match): - step = {"title": "Foo", "type": "number"} - regex = to_regex(None, step) - assert regex == NUMBER - - value = pattern["number"] - match = re.fullmatch(regex, value) - if does_match: - assert match[0] == value - assert match.span() == (0, len(value)) - else: - assert match is None - - -@pytest.mark.parametrize( - "schema,regex,examples", - [ - # String - ( - {"title": "Foo", "type": "string"}, - STRING, - [ - ("unquotedstring", False), - ('"(parenthesized_string)"', True), - ('"malformed) parenthesis (((() string"', True), - ('"quoted_string"', True), - (r'"escape_\character"', False), - (r'"double_\\escape"', True), - (r'"\n"', False), - (r'"\\n"', True), - (r'"unescaped " quote"', False), - (r'"escaped \" quote"', True), - ], - ), - # String with maximum length - ( - {"title": "Foo", "type": "string", "maxLength": 3}, - f'"{STRING_INNER}{{,3}}"', - [('"ab"', True), ('"a""', False), ('"abcd"', False)], - ), - # String with minimum length - ( - {"title": "Foo", "type": "string", "minLength": 3}, - f'"{STRING_INNER}{{3,}}"', - [('"ab"', False), ('"abcd"', True), ('"abc""', False)], - ), - # String with both minimum and maximum length - ( - {"title": "Foo", "type": "string", "minLength": 3, "maxLength": 5}, - f'"{STRING_INNER}{{3,5}}"', - [('"ab"', False), ('"abcd"', True), ('"abcdef""', False)], - ), - # String defined by a regular expression - ( - {"title": "Foo", "type": "string", "pattern": r"^[a-z]$"}, - r'("[a-z]")', - [('"a"', True), ('"1"', False)], - ), - # Boolean - ( - {"title": "Foo", "type": "boolean"}, - BOOLEAN, - [ - ("true", True), - ("false", True), - ("null", False), - ("0", False), - ], - ), - # Null - ( - {"title": "Foo", "type": "null"}, - NULL, - [ - ("null", True), - ("true", False), - ("0", False), - ], - ), - # Const string - ( - {"title": "Foo", "const": "Marc", "type": "string"}, - '"Marc"', - [('"Marc"', True), ('"Jean"', False), ('"John"', False)], - ), - # Make sure strings are escaped with regex escaping - ( - {"title": "Foo", "const": ".*", "type": "string"}, - r'"\.\*"', - [('".*"', True), (r'"\s*"', False), (r'"\.\*"', False)], - ), - # Make sure strings are escaped with JSON escaping - ( - {"title": "Foo", "const": '"', "type": "string"}, - r'"\\""', - [('"\\""', True), ('"""', False)], - ), - # Const integer - ( - {"title": "Foo", "const": 0, "type": "integer"}, - "0", - [("0", True), ("1", False), ("a", False)], - ), - # Const float - ( - {"title": "Foo", "const": 0.2, "type": "float"}, - r"0\.2", - [("0.2", True), ("032", False)], - ), - # Const boolean - ( - {"title": "Foo", "const": True, "type": "boolean"}, - "true", - [("true", True), ("True", False)], - ), - # Const null - ( - {"title": "Foo", "const": None, "type": "null"}, - "null", - [("null", True), ("None", False), ("", False)], - ), - # Enum string - ( - {"title": "Foo", "enum": ["Marc", "Jean"], "type": "string"}, - '("Marc"|"Jean")', - [('"Marc"', True), ('"Jean"', True), ('"John"', False)], - ), - # Make sure strings are escaped with regex and JSON escaping - ( - {"title": "Foo", "enum": [".*", r"\s*"], "type": "string"}, - r'("\.\*"|"\\\\s\*")', - [('".*"', True), (r'"\\s*"', True), (r'"\.\*"', False)], - ), - # Enum integer - ( - {"title": "Foo", "enum": [0, 1], "type": "integer"}, - "(0|1)", - [("0", True), ("1", True), ("a", False)], - ), - # Enum mix of types - ( - { - "title": "Foo", - "enum": [ - 6, - 5.3, - "potato", - True, - None, - { - "properties": { - "a": {"title": "A", "type": "number"}, - "b": {"title": "B", "type": "number"}, - }, - "required": ["a", "b"], - "title": "add", - "type": "object", - }, - ], - }, - r'(6|5\.3|"potato"|true|null|\{[ ]?"a"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?[ ]?,[ ]?"b"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?[ ]?\})', - [ - ("6", True), - ("5.3", True), - ('"potato"', True), - ("true", True), - ("null", True), - ("523", False), - ("True", False), - ("None", False), - ('{"a": -1.0, "b": 1.1}', True), - ('{"a": "a", "b": 1.1}', False), - ], - ), - # integer - ( - { - "title": "Foo", - "type": "object", - "properties": {"count": {"title": "Count", "type": "integer"}}, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?\\}', - [('{ "count": 100 }', True)], - ), - # integer with minimum digits - ( - { - "title": "Foo", - "type": "object", - "properties": { - "count": {"title": "Count", "type": "integer", "minDigits": 3} - }, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?(-)?(0|[1-9][0-9]{2,})[ ]?\\}', - [('{ "count": 10 }', False), ('{ "count": 100 }', True)], - ), - # integer with maximum digits - ( - { - "title": "Foo", - "type": "object", - "properties": { - "count": {"title": "Count", "type": "integer", "maxDigits": 3} - }, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?(-)?(0|[1-9][0-9]{,2})[ ]?\\}', - [('{ "count": 100 }', True), ('{ "count": 1000 }', False)], - ), - # integer with minimum and maximum digits - ( - { - "title": "Foo", - "type": "object", - "properties": { - "count": { - "title": "Count", - "type": "integer", - "minDigits": 3, - "maxDigits": 5, - } - }, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?(-)?(0|[1-9][0-9]{2,4})[ ]?\\}', - [ - ('{ "count": 10 }', False), - ('{ "count": 100 }', True), - ('{ "count": 10000 }', True), - ('{ "count": 100000 }', False), - ], - ), - # number - ( - { - "title": "Foo", - "type": "object", - "properties": {"count": {"title": "Count", "type": "number"}}, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?[ ]?\\}', - [('{ "count": 100 }', True), ('{ "count": 100.5 }', True)], - ), - # number with min and max integer digits - ( - { - "title": "Foo", - "type": "object", - "properties": { - "count": { - "title": "Count", - "type": "number", - "minDigitsInteger": 3, - "maxDigitsInteger": 5, - } - }, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?((-)?(0|[1-9][0-9]{2,4}))(\\.[0-9]+)?([eE][+-][0-9]+)?[ ]?\\}', - [ - ('{ "count": 10.005 }', False), - ('{ "count": 100.005 }', True), - ('{ "count": 10000.005 }', True), - ('{ "count": 100000.005 }', False), - ], - ), - # number with min and max fraction digits - ( - { - "title": "Foo", - "type": "object", - "properties": { - "count": { - "title": "Count", - "type": "number", - "minDigitsFraction": 3, - "maxDigitsFraction": 5, - } - }, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\\.[0-9]{3,5})?([eE][+-][0-9]+)?[ ]?\\}', - [ - ('{ "count": 1.05 }', False), - ('{ "count": 1.005 }', True), - ('{ "count": 1.00005 }', True), - ('{ "count": 1.000005 }', False), - ], - ), - # number with min and max exponent digits - ( - { - "title": "Foo", - "type": "object", - "properties": { - "count": { - "title": "Count", - "type": "number", - "minDigitsExponent": 3, - "maxDigitsExponent": 5, - } - }, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]{3,5})?[ ]?\\}', - [ - ('{ "count": 1.05e1 }', False), - ('{ "count": 1.05e+001 }', True), - ('{ "count": 1.05e-00001 }', True), - ('{ "count": 1.05e0000001 }', False), - ], - ), - # number with min and max integer, fraction and exponent digits - ( - { - "title": "Foo", - "type": "object", - "properties": { - "count": { - "title": "Count", - "type": "number", - "minDigitsInteger": 3, - "maxDigitsInteger": 5, - "minDigitsFraction": 3, - "maxDigitsFraction": 5, - "minDigitsExponent": 3, - "maxDigitsExponent": 5, - } - }, - "required": ["count"], - }, - '\\{[ ]?"count"[ ]?:[ ]?((-)?(0|[1-9][0-9]{2,4}))(\\.[0-9]{3,5})?([eE][+-][0-9]{3,5})?[ ]?\\}', - [ - ('{ "count": 1.05e1 }', False), - ('{ "count": 100.005e+001 }', True), - ('{ "count": 10000.00005e-00001 }', True), - ('{ "count": 100000.000005e0000001 }', False), - ], - ), - # array - ( - {"title": "Foo", "type": "array", "items": {"type": "number"}}, - rf"\[{WHITESPACE}(({NUMBER})(,{WHITESPACE}({NUMBER})){{0,}})?{WHITESPACE}\]", - [("[1e+9,1.3]", True), ("[]", True), ("[1", False)], - ), - # array with a set length of 1 - ( - { - "title": "Foo", - "type": "array", - "items": {"type": "integer"}, - "minItems": 1, - "maxItems": 1, - }, - rf"\[{WHITESPACE}(({INTEGER})(,{WHITESPACE}({INTEGER})){{0,0}}){WHITESPACE}\]", - [("[1]", True), ("[1,2]", False), ('["a"]', False), ("[]", False)], - ), - # array with a set length greather than 1 - ( - { - "title": "Foo", - "type": "array", - "items": {"type": "integer"}, - "minItems": 3, - "maxItems": 3, - }, - rf"\[{WHITESPACE}(({INTEGER})(,{WHITESPACE}({INTEGER})){{2,2}}){WHITESPACE}\]", - [("[1]", False), ("[]", False), ("[1,2,3]", True), ("[1,2,3,4]", False)], - ), - # array with length 0 - ( - { - "title": "Foo", - "type": "array", - "items": {"type": "integer"}, - "minItems": 0, - "maxItems": 0, - }, - rf"\[{WHITESPACE}\]", - [("[1]", False), ("[]", True), ("[1,2,3]", False), ("[1,2,3,4]", False)], - ), - # object - ( - { - "title": "TestSchema", - "type": "object", - "properties": { - "test_dict": { - "title": "Test Dict", - "additionalProperties": {"type": "string"}, - "type": "object", - } - }, - "required": ["test_dict"], - }, - rf"""\{{{WHITESPACE}"test_dict"{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}({STRING}{WHITESPACE}:{WHITESPACE}{STRING}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{STRING}){{0,}})?{WHITESPACE}\}}{WHITESPACE}\}}""", - [ - ("""{ "test_dict":{"foo":"bar","baz": "bif"}}""", True), - ("""{ "test_dict":{"foo":"bar" }}""", True), - ("""{ "test_dict":{}}""", True), - ("""{ "WRONG_KEY":{}}""", False), - ("""{ "test_dict":{"wrong_type" 1}}""", False), - ], - ), - # object containing object - ( - { - "title": "TestSchema", - "type": "object", - "properties": { - "test_dict": { - "title": "Test Dict", - "additionalProperties": { - "additionalProperties": {"type": "integer"}, - "type": "object", - }, - "type": "object", - } - }, - "required": ["test_dict"], - }, - rf"""\{{{WHITESPACE}"test_dict"{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}({STRING}{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}({STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}){{0,}})?{WHITESPACE}\}}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}({STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}){{0,}})?{WHITESPACE}\}}){{0,}})?{WHITESPACE}\}}{WHITESPACE}\}}""", - [ - ( - """{"test_dict": {"foo": {"bar": 123, "apple": 99}, "baz": {"bif": 456}}}""", - True, - ), - ( - """{"test_dict": {"anykey": {"anykey": 123}, "anykey2": {"bif": 456}}}""", - True, - ), - ("""{"test_dict": {}}""", True), - ("""{"test_dict": {"dict of empty dicts are ok": {} }}""", True), - ( - """{"test_dict": {"anykey": {"ONLY Dict[Dict]": 123}, "No Dict[int]" 1: }}""", - False, - ), - ], - ), - # oneOf - ( - { - "title": "Foo", - "oneOf": [{"type": "string"}, {"type": "number"}, {"type": "boolean"}], - }, - rf'((?:"{STRING_INNER}*")|(?:{NUMBER})|(?:{BOOLEAN}))', - [ - ("12.3", True), - ("true", True), - ('"a"', True), - ("null", False), - ("", False), - ("12true", False), - ('1.3"a"', False), - ('12.3true"a"', False), - ], - ), - # anyOf - ( - { - "title": "Foo", - "anyOf": [{"type": "string"}, {"type": "integer"}], - }, - rf"({STRING}|{INTEGER})", - [("12", True), ('"a"', True), ('1"a"', False)], - ), - # allOf - ( - { - "title": "Foo", - "allOf": [{"type": "string"}, {"type": "integer"}], - }, - rf"({STRING}{INTEGER})", - [('"a"1', True), ('"a"', False), ('"1"', False)], - ), - # Tuple / prefixItems - ( - { - "title": "Foo", - "prefixItems": [{"type": "string"}, {"type": "integer"}], - }, - rf"\[{WHITESPACE}{STRING}{WHITESPACE},{WHITESPACE}{INTEGER}{WHITESPACE}\]", - [('["a", 1]', True), ('["a", 1, 1]', False), ("[]", False)], - ), - # Nested schema - ( - { - "title": "Bar", - "type": "object", - "properties": { - "fuzz": { - "title": "Foo", - "type": "object", - "properties": {"spam": {"title": "Spam", "type": "integer"}}, - "required": ["spam"], - } - }, - "required": ["fuzz"], - }, - f'\\{{[ ]?"fuzz"[ ]?:[ ]?\\{{[ ]?"spam"[ ]?:[ ]?{INTEGER}[ ]?\\}}[ ]?\\}}', - [('{ "fuzz": { "spam": 100 }}', True)], - ), - # Schema with a reference - ( - { - "title": "User", - "type": "object", - "properties": { - "user_id": {"title": "User Id", "type": "integer"}, - "name": {"title": "Name", "type": "string"}, - "a": {"$ref": "#/properties/name"}, - }, - "required": ["user_id", "name", "a"], - }, - f'\\{{[ ]?"user_id"[ ]?:[ ]?{INTEGER}[ ]?,[ ]?"name"[ ]?:[ ]?{STRING}[ ]?,[ ]?"a"[ ]?:[ ]?{STRING}[ ]?\\}}', - [('{"user_id": 100, "name": "John", "a": "Marc"}', True)], - ), - ( - { - "title": "User", - "type": "object", - "$defs": {"name": {"title": "Name2", "type": "string"}}, - "properties": { - "user_id": {"title": "User Id", "type": "integer"}, - "name": {"title": "Name", "type": "string"}, - "name2": {"$ref": "#/$defs/name"}, - }, - "required": ["user_id", "name", "name2"], - }, - f'\\{{[ ]?"user_id"[ ]?:[ ]?{INTEGER}[ ]?,[ ]?"name"[ ]?:[ ]?{STRING}[ ]?,[ ]?"name2"[ ]?:[ ]?{STRING}[ ]?\\}}', - [('{"user_id": 100, "name": "John", "name2": "Marc"}', True)], - ), - ( - { - "$id": "customer", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "Customer", - "type": "object", - "properties": { - "name": {"type": "string"}, - "last_name": {"type": "string"}, - "address": {"$ref": "customer#/$defs/address"}, - }, - "required": [ - "name", - "first_name", - "last_name", - "address", - "shipping_address", - "billing_address", - ], - "$defs": { - "address": { - "title": "Address", - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "city": {"type": "string"}, - }, - "required": ["street_address", "city", "state"], - "definitions": { - "state": { - "type": "object", - "title": "State", - "properties": {"name": {"type": "string"}}, - "required": ["name"], - } - }, - } - }, - }, - f'\\{{[ ]?"name"[ ]?:[ ]?{STRING}[ ]?,[ ]?"last_name"[ ]?:[ ]?{STRING}[ ]?,[ ]?"address"[ ]?:[ ]?\\{{[ ]?"city"[ ]?:[ ]?{STRING}[ ]?\\}}[ ]?\\}}', - [ - ( - '{"name": "John", "last_name": "Doe", "address": {"city": "Paris"}}', - True, - ) - ], - ), - # Optional properties - # Last required property in first position - ( - { - "properties": { - "name": {"type": "string"}, - "age": {"anyOf": [{"type": "integer"}, {"type": "null"}]}, - "weapon": {"anyOf": [{"type": "string"}, {"type": "null"}]}, - }, - "required": ["name"], - "title": "Character", - "type": "object", - }, - f'\\{{[ ]?"name"[ ]?:[ ]?{STRING}([ ]?,[ ]?"age"[ ]?:[ ]?({INTEGER}|null))?([ ]?,[ ]?"weapon"[ ]?:[ ]?({STRING}|null))?[ ]?\\}}', - [ - ('{ "name" : "Player" }', True), - ('{ "name" : "Player", "weapon" : "sword" }', True), - ('{ "age" : 10, "weapon" : "sword" }', False), - ], - ), - # Last required property in middle position - ( - { - "properties": { - "name": {"type": "string"}, - "age": {"anyOf": [{"type": "integer"}, {"type": "null"}]}, - "weapon": {"type": "string"}, - "strength": {"anyOf": [{"type": "integer"}, {"type": "null"}]}, - }, - "required": ["name", "weapon"], - "title": "Character", - "type": "object", - }, - f'\\{{[ ]?"name"[ ]?:[ ]?{STRING}[ ]?,([ ]?"age"[ ]?:[ ]?({INTEGER}|null)[ ]?,)?[ ]?"weapon"[ ]?:[ ]?{STRING}([ ]?,[ ]?"strength"[ ]?:[ ]?({INTEGER}|null))?[ ]?\\}}', - [ - ('{ "name" : "Player" , "weapon" : "sword" }', True), - ( - '{ "name" : "Player", "age" : 10, "weapon" : "sword" , "strength" : 10 }', - True, - ), - ('{ "weapon" : "sword" }', False), - ], - ), - # Last required property in last position - ( - { - "properties": { - "name": {"anyOf": [{"type": "string"}, {"type": "null"}]}, - "age": {"type": "integer"}, - "armor": {"type": "string"}, - "strength": {"anyOf": [{"type": "integer"}, {"type": "null"}]}, - "weapon": {"title": "Weapon", "type": "string"}, - }, - "required": ["age", "armor", "weapon"], - "title": "Character", - "type": "object", - }, - f'\\{{([ ]?"name"[ ]?:[ ]?({STRING}|null)[ ]?,)?[ ]?"age"[ ]?:[ ]?{INTEGER}[ ]?,[ ]?"armor"[ ]?:[ ]?{STRING}[ ]?,([ ]?"strength"[ ]?:[ ]?({INTEGER}|null)[ ]?,)?[ ]?"weapon"[ ]?:[ ]?{STRING}[ ]?\\}}', - [ - ( - '{ "name" : "Player", "age" : 10, "armor" : "plate", "strength" : 11, "weapon" : "sword" }', - True, - ), - ('{ "age" : 10, "armor" : "plate", "weapon" : "sword" }', True), - ( - '{ "name" : "Kahlhanbeh", "armor" : "plate", "weapon" : "sword" }', - False, - ), - ], - ), - # All properties are optional - ( - { - "properties": { - "name": {"anyOf": [{"type": "string"}, {"type": "null"}]}, - "age": {"anyOf": [{"type": "integer"}, {"type": "null"}]}, - "strength": {"anyOf": [{"type": "integer"}, {"type": "null"}]}, - }, - "title": "Character", - "type": "object", - }, - f'\\{{([ ]?"name"[ ]?:[ ]?({STRING}|null)([ ]?,[ ]?"age"[ ]?:[ ]?({INTEGER}|null))?([ ]?,[ ]?"strength"[ ]?:[ ]?({INTEGER}|null))?|([ ]?"name"[ ]?:[ ]?({STRING}|null)[ ]?,)?[ ]?"age"[ ]?:[ ]?({INTEGER}|null)([ ]?,[ ]?"strength"[ ]?:[ ]?({INTEGER}|null))?|([ ]?"name"[ ]?:[ ]?({STRING}|null)[ ]?,)?([ ]?"age"[ ]?:[ ]?({INTEGER}|null)[ ]?,)?[ ]?"strength"[ ]?:[ ]?({INTEGER}|null))?[ ]?\\}}', - [ - ('{ "name" : "Player" }', True), - ('{ "name" : "Player", "age" : 10, "strength" : 10 }', True), - ('{ "age" : 10, "strength" : 10 }', True), - ("{ }", True), - ], - ), - ], -) -def test_match(schema, regex, examples): - interegular.parse_pattern(regex) - schema = json.dumps(schema) - test_regex = build_regex_from_schema(schema) - assert test_regex == regex - - for string, does_match in examples: - match = re.fullmatch(test_regex, string) - if does_match: - if match is None: - raise ValueError(f"Expected match for '{string}'") - assert match[0] == string - assert match.span() == (0, len(string)) - else: - assert match is None - - -@pytest.mark.parametrize( - "schema,regex,examples", - [ - # UUID - ( - {"title": "Foo", "type": "string", "format": "uuid"}, - UUID, - [ - ("123e4567-e89b-12d3-a456-426614174000", False), - ('"123e4567-e89b-12d3-a456-426614174000"', True), - ('"123e4567-e89b-12d3-a456-42661417400"', False), - ('"123e4567-e89b-12d3-a456-42661417400g"', False), - ('"123e4567-e89b-12d3-a456-42661417400-"', False), - ('""', False), - ], - ), - # DATE-TIME - ( - {"title": "Foo", "type": "string", "format": "date-time"}, - DATE_TIME, - [ - ("2018-11-13T20:20:39Z", False), - ('"2018-11-13T20:20:39Z"', True), - ('"2016-09-18T17:34:02.666Z"', True), - ('"2008-05-11T15:30:00Z"', True), - ('"2021-01-01T00:00:00"', True), - ('"2022-01-10 07:19:30"', False), # missing T - ('"2022-12-10T10-04-29"', False), # incorrect separator - ('"2023-01-01"', False), - ], - ), - # DATE - ( - {"title": "Foo", "type": "string", "format": "date"}, - DATE, - [ - ("2018-11-13", False), - ('"2018-11-13"', True), - ('"2016-09-18"', True), - ('"2008-05-11"', True), - ('"2015-13-01"', False), # incorrect month - ('"2022-01"', False), # missing day - ('"2022/12/01"', False), # incorrect separator" - ], - ), - # TIME - ( - {"title": "Foo", "type": "string", "format": "time"}, - TIME, - [ - ("20:20:39Z", False), - ('"20:20:39Z"', True), - ('"15:30:00Z"', True), - ('"25:30:00"', False), # incorrect hour - ('"15:30"', False), # missing seconds - ('"15:30:00.000"', False), # missing Z - ('"15-30-00"', False), # incorrect separator - ('"15:30:00+01:00"', False), # incorrect separator - ], - ), - ], -) -def test_format(schema, regex, examples): - interegular.parse_pattern(regex) - schema = json.dumps(schema) - test_regex = build_regex_from_schema(schema) - assert test_regex == regex - - for string, does_match in examples: - match = re.fullmatch(test_regex, string) - if does_match: - assert match[0] == string - assert match.span() == (0, len(string)) - else: - assert match is None - - -@pytest.mark.parametrize( - "schema,examples", - [ - # NESTED UUID - ( - { - "title": "Foo", - "type": "object", - "properties": {"uuid": {"type": "string", "format": "uuid"}}, - }, - [ - ('{"uuid": "123e4567-e89b-12d3-a456-426614174000"}', True), - ('{"uuid":"123e4567-e89b-12d3-a456-42661417400"}', False), - ('{"uuid":"123e4567-e89b-12d3-a456-42661417400g"}', False), - ('{"uuid":"123e4567-e89b-12d3-a456-42661417400-"}', False), - ( - '{"uuid":123e4567-e89b-12d3-a456-426614174000}', - False, - ), # missing quotes for value - ('{"uuid":""}', False), - ], - ), - # NESTED DATE-TIME - ( - { - "title": "Foo", - "type": "object", - "properties": {"dateTime": {"type": "string", "format": "date-time"}}, - }, - [ - ('{"dateTime": "2018-11-13T20:20:39Z"}', True), - ('{"dateTime":"2016-09-18T17:34:02.666Z"}', True), - ('{"dateTime":"2008-05-11T15:30:00Z"}', True), - ('{"dateTime":"2021-01-01T00:00:00"}', True), - ('{"dateTime":"2022-01-10 07:19:30"}', False), # missing T - ('{"dateTime":"2022-12-10T10-04-29"}', False), # incorrect separator - ( - '{"dateTime":2018-11-13T20:20:39Z}', - False, - ), # missing quotes for value - ('{"dateTime":"2023-01-01"}', False), - ], - ), - # NESTED DATE - ( - { - "title": "Foo", - "type": "object", - "properties": {"date": {"type": "string", "format": "date"}}, - }, - [ - ('{"date": "2018-11-13"}', True), - ('{"date":"2016-09-18"}', True), - ('{"date":"2008-05-11"}', True), - ('{"date":"2015-13-01"}', False), # incorrect month - ('{"date":"2022-01"}', False), # missing day - ('{"date":"2022/12/01"}', False), # incorrect separator" - ('{"date":2018-11-13}', False), # missing quotes for value - ], - ), - # NESTED TIME - ( - { - "title": "Foo", - "type": "object", - "properties": {"time": {"type": "string", "format": "time"}}, - }, - [ - ('{"time": "20:20:39Z"}', True), - ('{"time":"15:30:00Z"}', True), - ('{"time":"25:30:00"}', False), # incorrect hour - ('{"time":"15:30"}', False), # missing seconds - ('{"time":"15:30:00.000"}', False), # missing Z - ('{"time":"15-30-00"}', False), # incorrect separator - ('{"time":"15:30:00+01:00"}', False), # incorrect separator - ('{"time":20:20:39Z}', False), # missing quotes for value - ], - ), - # Unconstrained Object - ( - { - "title": "Foo", - "type": "object", - }, - [ - ("{}", True), - ('{"a": 1, "b": null}', True), - ('{"a": {"z": {"g": 4}}, "b": null}', True), - ("1234", False), # not an object - ('["a", "a"]', False), # not an array - ], - ), - # Unconstrained Array - ( - { - "type": "array", - }, - [ - ("[1, {}, false]", True), - ("[{}]", True), - ('[{"a": {"z": "q"}, "b": null}]', True), - ('[{"a": [1, 2, true], "b": null}]', True), - ('[{"a": [1, 2, true], "b": {"a": "b"}}, 1, true, [1, [2]]]', True), - # too deep, default unconstrained depth limit = 2 - ( - '[{"a": [1, 2, true], "b": {"a": "b"}}, 1, true, [1, [2, [3]]]]', - False, - ), - ('[{"a": {"z": {"g": 4}}, "b": null}]', False), - ("[[[[1]]]]", False), - # not an array - ("{}", False), - ('{"a": 1, "b": null}', False), - ('{"a": {"z": {"g": 4}}, "b": null}', False), - ("1234", False), # not an array - ('{"a": "a"}', False), # not an array - ], - ), - # No schema / unconstrained value - ( - {}, - [ - ('"aaabbuecuh"', True), # string - ("5.554", True), # number - ("true", True), # boolean - ("null", True), # null - ("5999", True), # integer - ('["a", "b"]', True), # array - ('{"key": {"k2": "value"}}', True), # nested object - ("this isnt valid json", False), - ], - ), - ], -) -def test_format_without_regex(schema, examples): - schema = json.dumps(schema) - test_regex = build_regex_from_schema(schema) - for string, does_match in examples: - match = re.fullmatch(test_regex, string) - if does_match: - assert match[0] == string - assert match.span() == (0, len(string)) - else: - assert match is None - - -@pytest.mark.parametrize("whitespace_pattern", [None, r"[\n ]*", "abc"]) -def test_json_schema_custom_whitespace_pattern(whitespace_pattern): - """assert whitespace_pattern setting respected""" - - class MockModel(BaseModel): - foo: int - bar: str - - schema = json.dumps(MockModel.model_json_schema()) - - # assert any ws pattern can be used - if whitespace_pattern == "abc": - build_regex_from_schema(schema, whitespace_pattern) - return - - pattern = build_regex_from_schema(schema, whitespace_pattern) - - mock_result_mult_ws = ( - """{ "foo" : 4, \n\n\n "bar": "baz baz baz bar"\n\n}""" - ) - mock_result_maybe_ws = """{"foo" : 4 ,"bar":"baz baz baz bar"}""" - - match_default_ws = re.fullmatch(pattern, mock_result_maybe_ws) - if whitespace_pattern is None: - assert match_default_ws - else: - assert re.fullmatch(pattern, mock_result_mult_ws) - - -def test_one_of_doesnt_produce_illegal_lookaround(): - """Reproduces failure in https://github.com/dottxt-ai/outlines/issues/823""" - - class Cat(BaseModel): - pet_type: Literal["cat"] - meows: int - - class Dog(BaseModel): - pet_type: Literal["dog"] - barks: float - - class Model(BaseModel): - pet: Union[Cat, Dog] = Field(..., discriminator="pet_type") - n: int - - json_schema = Model.schema_json() - - json_schema = Model.schema_json() - pattern = build_regex_from_schema(json_schema, whitespace_pattern=None) - - # check if the pattern uses lookarounds incompatible with interegular.Pattern.to_fsm() - interegular.parse_pattern(pattern).to_fsm() + regex_str = build_regex_from_schema(schema) + assert isinstance(regex_str, str) def add(a: float, b: float) -> float: @@ -1089,8 +69,10 @@ class EmptyEnum(Enum): ) def test_enum_schema(enum, expectation): with expectation: - result = get_schema_from_enum(enum) - assert result["title"] == enum.__name__ - assert len(result["enum"]) == len(enum) - for elt in result["enum"]: + schema = get_schema_from_enum(enum) + regex_str = build_regex_from_schema(json.dumps(schema)) + assert isinstance(regex_str, str) + assert schema["title"] == enum.__name__ + assert len(schema["oneOf"]) == len(enum) + for elt in schema["oneOf"]: assert type(elt) in [int, float, bool, type(None), str, dict]