From d218336ce1cc3ddf0d2d4bac3a6618deb1d82cda Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 22 Sep 2024 20:42:50 +0200 Subject: [PATCH] refactor: simplify char class encoding (#101) --- src/__tests__/example-email.ts | 5 +- src/__tests__/example-hex-color.ts | 2 +- src/__tests__/example-js-number.ts | 3 +- src/__tests__/example-url-simple.ts | 7 +- src/constructs/__tests__/char-class.test.ts | 163 ++++++++++++++++++-- src/constructs/char-class.ts | 25 +-- src/constructs/char-escape.ts | 12 +- src/constructs/unicode.ts | 4 +- src/types.ts | 31 ++-- 9 files changed, 192 insertions(+), 60 deletions(-) diff --git a/src/__tests__/example-email.ts b/src/__tests__/example-email.ts index 8c25521..8a89b3e 100644 --- a/src/__tests__/example-email.ts +++ b/src/__tests__/example-email.ts @@ -12,7 +12,7 @@ import { test('example: email validation', () => { const usernameChars = charClass(charRange('a', 'z'), digit, anyOf('._%+-')); - const hostnameChars = charClass(charRange('a', 'z'), digit, anyOf('-.')); + const hostnameChars = charClass(charRange('a', 'z'), digit, anyOf('.-')); const domainChars = charRange('a', 'z'); const regex = buildRegExp( @@ -38,5 +38,6 @@ test('example: email validation', () => { expect(regex).not.toMatchString('a@gmail.c'); expect(regex).not.toMatchString('@gmail.com'); - expect(regex).toEqualRegex(/^[a-z\d._%+-]+@[a-z\d.-]+\.[a-z]{2,}$/i); + // eslint-disable-next-line no-useless-escape + expect(regex).toEqualRegex(/^[a-z\d._%+\-]+@[a-z\d.\-]+\.[a-z]{2,}$/i); }); diff --git a/src/__tests__/example-hex-color.ts b/src/__tests__/example-hex-color.ts index 3e63e93..8a62f3b 100644 --- a/src/__tests__/example-hex-color.ts +++ b/src/__tests__/example-hex-color.ts @@ -42,5 +42,5 @@ test('example: hex color validation', () => { expect(regex).not.toMatchString('#12345'); expect(regex).not.toMatchString('#1234567'); - expect(regex).toEqualRegex(/^#?(?:[a-f\d]{6}|[a-f\d]{3})$/i); + expect(regex).toEqualRegex(/^#?(?:[\da-f]{6}|[\da-f]{3})$/i); }); 
diff --git a/src/__tests__/example-js-number.ts b/src/__tests__/example-js-number.ts index 9c92ff0..a0da19a 100644 --- a/src/__tests__/example-js-number.ts +++ b/src/__tests__/example-js-number.ts @@ -47,5 +47,6 @@ test('example: validate JavaScript number', () => { expect(numberValidator).not.toMatchString('.1.1'); expect(numberValidator).not.toMatchString('.'); - expect(numberValidator).toEqualRegex(/^[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?$/); + // eslint-disable-next-line no-useless-escape + expect(numberValidator).toEqualRegex(/^[+\-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+\-]?\d+)?$/); }); diff --git a/src/__tests__/example-url-simple.ts b/src/__tests__/example-url-simple.ts index 14593ac..3914b21 100644 --- a/src/__tests__/example-url-simple.ts +++ b/src/__tests__/example-url-simple.ts @@ -16,11 +16,11 @@ import { test('example: simple url validation', () => { const protocol = [choiceOf('http', 'https'), '://']; const domainChars = charClass(charRange('a', 'z'), digit); - const domainCharsHypen = charClass(domainChars, anyOf('-')); + const domainCharsHyphen = charClass(domainChars, anyOf('-')); const domainSegment = choiceOf( domainChars, // single char - [domainChars, zeroOrMore(domainCharsHypen), domainChars], // multi char + [domainChars, zeroOrMore(domainCharsHyphen), domainChars], // multi char ); const regex = buildRegExp([ @@ -45,6 +45,7 @@ test('example: simple url validation', () => { expect(regex).not.toMatchString('@gmail.com'); expect(regex).toEqualRegex( - /^(?:(?:http|https):\/\/)?(?:(?:[a-z\d]|[a-z\d][a-z\d-]*[a-z\d])\.)+[a-z][a-z\d]+$/, + // eslint-disable-next-line no-useless-escape + /^(?:(?:http|https):\/\/)?(?:(?:[a-z\d]|[a-z\d][a-z\d\-]*[a-z\d])\.)+[a-z][a-z\d]+$/, ); }); diff --git a/src/constructs/__tests__/char-class.test.ts b/src/constructs/__tests__/char-class.test.ts index a7ace7f..203c96e 100644 --- a/src/constructs/__tests__/char-class.test.ts +++ b/src/constructs/__tests__/char-class.test.ts @@ -1,5 +1,7 @@ +/* eslint-disable 
no-useless-escape */ import { anyOf, + buildRegExp, charClass, charRange, digit, @@ -9,11 +11,16 @@ import { nonWord, oneOrMore, optional, + type RegexSequence, whitespace, word, zeroOrMore, } from '../..'; +function u(sequence: RegexSequence) { + return buildRegExp(sequence, { unicode: true }); +} + test('`charClass` base cases', () => { expect(charClass(charRange('a', 'z'))).toEqualRegex(/[a-z]/); expect(charClass(charRange('a', 'z'), charRange('A', 'Z'))).toEqualRegex(/[a-zA-Z]/); @@ -66,51 +73,181 @@ test('`charRange` throws on incorrect arguments', () => { ); }); -test('`anyOf` pattern', () => { +test('`anyOf` handles basic cases pattern', () => { + expect(anyOf('a')).toMatchString('a'); expect(anyOf('a')).toEqualRegex(/[a]/); + + expect(['x', anyOf('a'), 'x']).toMatchString('xax'); expect(['x', anyOf('a'), 'x']).toEqualRegex(/x[a]x/); + + expect(anyOf('ab')).toMatchString('a'); + expect(anyOf('ab')).toMatchString('b'); + expect(anyOf('ab')).not.toMatchString('c'); expect(anyOf('ab')).toEqualRegex(/[ab]/); + + expect(['x', anyOf('ab')]).toMatchString('xa'); + expect(['x', anyOf('ab')]).toMatchString('xb'); + expect(['x', anyOf('ab')]).not.toMatchString('x0'); expect(['x', anyOf('ab')]).toEqualRegex(/x[ab]/); + + expect(['x', anyOf('ab'), 'x']).toMatchString('xax'); + expect(['x', anyOf('ab'), 'x']).toMatchString('xbx'); + expect(['x', anyOf('ab'), 'x']).not.toMatchString('x0x'); expect(['x', anyOf('ab'), 'x']).toEqualRegex(/x[ab]x/); }); +test('`anyOf` throws on empty text', () => { + expect(() => anyOf('')).toThrowErrorMatchingInlineSnapshot(`"Expected at least one character"`); +}); + test('`anyOf` pattern with quantifiers', () => { expect(['x', oneOrMore(anyOf('abc')), 'x']).toEqualRegex(/x[abc]+x/); expect(['x', optional(anyOf('abc')), 'x']).toEqualRegex(/x[abc]?x/); expect(['x', zeroOrMore(anyOf('abc')), 'x']).toEqualRegex(/x[abc]*x/); }); -test('`anyOf` pattern escapes special characters', () => { - 
expect(anyOf('abc-+.]\\')).toEqualRegex(/[abc+.\]\\-]/); -}); +test('`anyOf` handles hyphens', () => { + expect(anyOf('^-')).toMatchString('^'); + expect(anyOf('^-')).toMatchString('-'); + expect(anyOf('^-')).not.toMatchString('a'); + expect(anyOf('^-')).toEqualRegex(/[\^\-]/); + + expect(anyOf('-^')).toMatchString('^'); + expect(anyOf('-^')).toMatchString('-'); + expect(anyOf('-^')).not.toMatchString('a'); + expect(anyOf('-^')).toEqualRegex(/[\-\^]/); -test('`anyOf` pattern moves hyphen to the last position', () => { - expect(anyOf('a-bc')).toEqualRegex(/[abc-]/); + expect(anyOf('-^a')).toMatchString('^'); + expect(anyOf('-^a')).toMatchString('-'); + expect(anyOf('-^a')).toMatchString('a'); + expect(anyOf('-^a')).not.toMatchString('b'); + expect(anyOf('-^a')).toEqualRegex(/[\-\^a]/); }); -test('`anyOf` pattern edge cases', () => { - expect(anyOf('^-')).toEqualRegex(/[\^-]/); - expect(anyOf('-^')).toEqualRegex(/[\^-]/); - expect(anyOf('-^a')).toEqualRegex(/[a^-]/); +test('`anyOf` handles hyphens in unicode mode', () => { + expect(u(anyOf('^-'))).toMatchString('^'); + expect(u(anyOf('^-'))).toMatchString('^'); + expect(u(anyOf('^-'))).toMatchString('-'); + expect(u(anyOf('^-'))).not.toMatchString('a'); + expect(u(anyOf('^-'))).toEqualRegex(/[\^\-]/u); + + expect(u(anyOf('-^'))).toMatchString('^'); + expect(u(anyOf('-^'))).toMatchString('-'); + expect(u(anyOf('-^'))).not.toMatchString('a'); + expect(u(anyOf('-^'))).toEqualRegex(/[\-\^]/u); + + expect(u(anyOf('-^a'))).toMatchString('^'); + expect(u(anyOf('-^a'))).toMatchString('-'); + expect(u(anyOf('-^a'))).toMatchString('a'); + expect(u(anyOf('-^a'))).not.toMatchString('b'); + expect(u(anyOf('-^a'))).toEqualRegex(/[\-\^a]/u); +}); +test('`anyOf` handles special chars', () => { + expect(anyOf('.')).toMatchString('.'); + expect(anyOf('.')).not.toMatchString('a'); expect(anyOf('.')).toEqualRegex(/[.]/); + + expect(anyOf('*')).toMatchString('*'); + expect(anyOf('*')).not.toMatchString('a'); 
expect(anyOf('*')).toEqualRegex(/[*]/); + + expect(anyOf('+')).toMatchString('+'); + expect(anyOf('+')).not.toMatchString('a'); expect(anyOf('+')).toEqualRegex(/[+]/); + + expect(anyOf('?')).toMatchString('?'); + expect(anyOf('?')).not.toMatchString('a'); expect(anyOf('?')).toEqualRegex(/[?]/); - expect(anyOf('^')).toEqualRegex(/[^]/); + + expect(anyOf('^')).toMatchString('^'); + expect(anyOf('^')).not.toMatchString('a'); + expect(anyOf('^')).toEqualRegex(/[\^]/); + + expect(anyOf('^0')).toMatchString('^'); + expect(anyOf('^0')).not.toMatchString('a'); + expect(anyOf('^0')).toEqualRegex(/[\^0]/); + + expect(anyOf('0^')).toMatchString('^'); + expect(anyOf('0^')).not.toMatchString('a'); + expect(anyOf('0^')).toEqualRegex(/[0\^]/); + + expect(anyOf('$')).toMatchString('$'); + expect(anyOf('$')).not.toMatchString('a'); expect(anyOf('$')).toEqualRegex(/[$]/); + + expect(anyOf('{')).toMatchString('{'); + expect(anyOf('{')).not.toMatchString('a'); expect(anyOf('{')).toEqualRegex(/[{]/); + + expect(anyOf('}')).toMatchString('}'); + expect(anyOf('}')).not.toMatchString('a'); expect(anyOf('}')).toEqualRegex(/[}]/); + + expect(anyOf('(')).toMatchString('('); + expect(anyOf('(')).not.toMatchString('a'); expect(anyOf('(')).toEqualRegex(/[(]/); + + expect(anyOf(')')).toMatchString(')'); + expect(anyOf(')')).not.toMatchString('a'); expect(anyOf(')')).toEqualRegex(/[)]/); + + expect(anyOf('|')).toMatchString('|'); + expect(anyOf('|')).not.toMatchString('a'); expect(anyOf('|')).toEqualRegex(/[|]/); + + expect(anyOf('[')).toMatchString('['); + expect(anyOf('[')).not.toMatchString('a'); expect(anyOf('[')).toEqualRegex(/[[]/); + + expect(anyOf(']')).toMatchString(']'); + expect(anyOf(']')).not.toMatchString('a'); expect(anyOf(']')).toEqualRegex(/[\]]/); + + expect(anyOf('\\')).toMatchString('\\'); + expect(anyOf('\\')).not.toMatchString('a'); expect(anyOf('\\')).toEqualRegex(/[\\]/); }); -test('`anyOf` throws on empty text', () => { - expect(() => 
anyOf('')).toThrowErrorMatchingInlineSnapshot(`"Expected at least one character"`); +test('`anyof` matches special characters', () => { + expect(anyOf('a')).toMatchString('a'); +}); + +test('`anyof` matches special characters in unicode mode', () => { + expect(u(anyOf('a'))).toMatchString('a'); + + expect(u(anyOf('.'))).toMatchString('.'); + expect(u(anyOf('.'))).not.toMatchString('a'); + expect(u(anyOf('*'))).toMatchString('*'); + expect(u(anyOf('*'))).not.toMatchString('a'); + expect(u(anyOf('+'))).toMatchString('+'); + expect(u(anyOf('+'))).not.toMatchString('a'); + expect(u(anyOf('?'))).toMatchString('?'); + expect(u(anyOf('?'))).not.toMatchString('a'); + expect(u(anyOf('^'))).toMatchString('^'); + expect(u(anyOf('^'))).not.toMatchString('a'); + expect(u(anyOf('^0'))).toMatchString('^'); + expect(u(anyOf('^0'))).not.toMatchString('a'); + expect(u(anyOf('0^'))).toMatchString('^'); + expect(u(anyOf('0^'))).not.toMatchString('a'); + expect(u(anyOf('$'))).toMatchString('$'); + expect(u(anyOf('$'))).not.toMatchString('a'); + expect(u(anyOf('{'))).toMatchString('{'); + expect(u(anyOf('{'))).not.toMatchString('a'); + expect(u(anyOf('}'))).toMatchString('}'); + expect(u(anyOf('}'))).not.toMatchString('a'); + expect(u(anyOf('('))).toMatchString('('); + expect(u(anyOf('('))).not.toMatchString('a'); + expect(u(anyOf(')'))).toMatchString(')'); + expect(u(anyOf(')'))).not.toMatchString('a'); + expect(u(anyOf('|'))).toMatchString('|'); + expect(u(anyOf('|'))).not.toMatchString('a'); + expect(u(anyOf('['))).toMatchString('['); + expect(u(anyOf('['))).not.toMatchString('a'); + expect(u(anyOf(']'))).toMatchString(']'); + expect(u(anyOf(']'))).not.toMatchString('a'); + expect(u(anyOf('\\'))).toMatchString('\\'); + expect(u(anyOf('\\'))).not.toMatchString('a'); }); test('`negated` character class pattern', () => { diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index c18be71..a2b475d 100644 --- a/src/constructs/char-class.ts +++ 
b/src/constructs/char-class.ts @@ -13,8 +13,7 @@ export function charClass(...elements: Array): } return { - chars: elements.map((c) => c.chars).flat(), - ranges: elements.map((c) => c.ranges ?? []).flat(), + elements: elements.map((c) => c.elements).flat(), encode: encodeCharClass, }; } @@ -36,8 +35,7 @@ export function charRange(start: string, end: string): CharacterClass { } return { - chars: [], - ranges: [{ start, end }], + elements: [`${start}-${end}`], encode: encodeCharClass, }; } @@ -52,7 +50,7 @@ export function anyOf(chars: string): CharacterClass { ensureText(chars); return { - chars: chars.split('').map(escapeChar), + elements: chars.split('').map(escapeChar), encode: encodeCharClass, }; } @@ -74,27 +72,16 @@ export const inverted = negated; /** Escape chars for usage inside char class */ function escapeChar(text: string): string { - return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string + // anyOf(']-\\^') + return text.replace(/[\]\-\\^]/g, '\\$&'); // "$&" is whole matched string } function encodeCharClass( this: CharacterClass | CharacterEscape, isNegated?: boolean, ): EncodedRegex { - // If passed characters includes hyphen (`-`) it need to be moved to - // first (or last) place in order to treat it as hyphen character and not a range. - // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types - const hyphen = this.chars.includes('-') ? '-' : ''; - const caret = this.chars.includes('^') ? '^' : ''; - const otherChars = this.chars.filter((c) => c !== '-' && c !== '^').join(''); - const ranges = this.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; - const negation = isNegated ? '^' : ''; - - let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; - if (pattern === '[^-]') pattern = '[\\^-]'; - return { precedence: 'atom', - pattern, + pattern: `[${isNegated ? 
'^' : ''}${this.elements.join('')}]`, }; } diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index 70456b0..ee2854b 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -15,7 +15,7 @@ export const any: EncodedRegex = { export const digit: CharacterEscape = { precedence: 'atom', pattern: '\\d', - chars: ['\\d'], + elements: ['\\d'], }; /** @@ -24,7 +24,7 @@ export const digit: CharacterEscape = { export const nonDigit: CharacterEscape = { precedence: 'atom', pattern: '\\D', - chars: ['\\D'], + elements: ['\\D'], }; /** @@ -33,7 +33,7 @@ export const nonDigit: CharacterEscape = { export const word: CharacterEscape = { precedence: 'atom', pattern: '\\w', - chars: ['\\w'], + elements: ['\\w'], }; /** @@ -42,7 +42,7 @@ export const word: CharacterEscape = { export const nonWord: CharacterEscape = { precedence: 'atom', pattern: '\\W', - chars: ['\\W'], + elements: ['\\W'], }; /** @@ -51,7 +51,7 @@ export const nonWord: CharacterEscape = { export const whitespace: CharacterEscape = { precedence: 'atom', pattern: '\\s', - chars: ['\\s'], + elements: ['\\s'], }; /** @@ -60,7 +60,7 @@ export const whitespace: CharacterEscape = { export const nonWhitespace: CharacterEscape = { precedence: 'atom', pattern: '\\S', - chars: ['\\S'], + elements: ['\\S'], }; /** diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts index c3874ea..9fd17a6 100644 --- a/src/constructs/unicode.ts +++ b/src/constructs/unicode.ts @@ -25,7 +25,7 @@ export function unicodeChar(codePoint: number): CharacterEscape { return { precedence: 'atom', pattern: escape, - chars: [escape], + elements: [escape], }; } @@ -52,6 +52,6 @@ export function unicodeProperty(property: string, value?: string): CharacterEsca return { precedence: 'atom', pattern: escape, - chars: [escape], + elements: [escape], }; } diff --git a/src/types.ts b/src/types.ts index f6fd401..4a4b056 100644 --- a/src/types.ts +++ b/src/types.ts @@ -18,33 +18,38 @@ export type 
RegexElement = RegexConstruct | RegExp | string; export type RegexConstruct = EncodedRegex | LazyEncodableRegex; /** - * Encoded regex pattern with information about its type (atom, sequence) + * Encoded regex pattern with information about its precedence (atom, sequence, disjunction) */ export interface EncodedRegex { precedence: EncodePrecedence; pattern: string; } +/** + * Precedence of given regex pattern. + */ export type EncodePrecedence = 'atom' | 'sequence' | 'disjunction'; -export interface CharacterEscape extends EncodedRegex { - // `CharacterClass` compatibility - chars: string[]; - ranges?: never; -} - +/** + * Regex pattern that can be encoded by calling the `encode` method. + */ export interface LazyEncodableRegex { encode: () => EncodedRegex; } -export interface CharacterClass extends LazyEncodableRegex { - chars: string[]; - ranges?: CharacterRange[]; +/** + * Character escape: `EncodedRegex` that can also be put into `charClass`. + */ +export interface CharacterEscape extends EncodedRegex { + elements: string[]; } -export interface CharacterRange { - start: string; - end: string; +/** + * Character class. + * Regex: `[...]` + */ +export interface CharacterClass extends LazyEncodableRegex { + elements: string[]; } /**