Skip to content

Commit

Permalink
refactor: simplify char class encoding (#101)
Browse files Browse the repository at this point in the history
  • Loading branch information
mdjastrzebski authored Sep 22, 2024
1 parent 16d9164 commit d218336
Show file tree
Hide file tree
Showing 9 changed files with 192 additions and 60 deletions.
5 changes: 3 additions & 2 deletions src/__tests__/example-email.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import {

test('example: email validation', () => {
const usernameChars = charClass(charRange('a', 'z'), digit, anyOf('._%+-'));
const hostnameChars = charClass(charRange('a', 'z'), digit, anyOf('-.'));
const hostnameChars = charClass(charRange('a', 'z'), digit, anyOf('.-'));
const domainChars = charRange('a', 'z');

const regex = buildRegExp(
Expand All @@ -38,5 +38,6 @@ test('example: email validation', () => {
expect(regex).not.toMatchString('[email protected]');
expect(regex).not.toMatchString('@gmail.com');

expect(regex).toEqualRegex(/^[a-z\d._%+-]+@[a-z\d.-]+\.[a-z]{2,}$/i);
// eslint-disable-next-line no-useless-escape
expect(regex).toEqualRegex(/^[a-z\d._%+\-]+@[a-z\d.\-]+\.[a-z]{2,}$/i);
});
2 changes: 1 addition & 1 deletion src/__tests__/example-hex-color.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,5 @@ test('example: hex color validation', () => {
expect(regex).not.toMatchString('#12345');
expect(regex).not.toMatchString('#1234567');

expect(regex).toEqualRegex(/^#?(?:[a-f\d]{6}|[a-f\d]{3})$/i);
expect(regex).toEqualRegex(/^#?(?:[\da-f]{6}|[\da-f]{3})$/i);
});
3 changes: 2 additions & 1 deletion src/__tests__/example-js-number.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,6 @@ test('example: validate JavaScript number', () => {
expect(numberValidator).not.toMatchString('.1.1');
expect(numberValidator).not.toMatchString('.');

expect(numberValidator).toEqualRegex(/^[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?$/);
// eslint-disable-next-line no-useless-escape
expect(numberValidator).toEqualRegex(/^[+\-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+\-]?\d+)?$/);
});
7 changes: 4 additions & 3 deletions src/__tests__/example-url-simple.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ import {
test('example: simple url validation', () => {
const protocol = [choiceOf('http', 'https'), '://'];
const domainChars = charClass(charRange('a', 'z'), digit);
const domainCharsHypen = charClass(domainChars, anyOf('-'));
const domainCharsHyphen = charClass(domainChars, anyOf('-'));

const domainSegment = choiceOf(
domainChars, // single char
[domainChars, zeroOrMore(domainCharsHypen), domainChars], // multi char
[domainChars, zeroOrMore(domainCharsHyphen), domainChars], // multi char
);

const regex = buildRegExp([
Expand All @@ -45,6 +45,7 @@ test('example: simple url validation', () => {
expect(regex).not.toMatchString('@gmail.com');

expect(regex).toEqualRegex(
/^(?:(?:http|https):\/\/)?(?:(?:[a-z\d]|[a-z\d][a-z\d-]*[a-z\d])\.)+[a-z][a-z\d]+$/,
// eslint-disable-next-line no-useless-escape
/^(?:(?:http|https):\/\/)?(?:(?:[a-z\d]|[a-z\d][a-z\d\-]*[a-z\d])\.)+[a-z][a-z\d]+$/,
);
});
163 changes: 150 additions & 13 deletions src/constructs/__tests__/char-class.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
/* eslint-disable no-useless-escape */
import {
anyOf,
buildRegExp,
charClass,
charRange,
digit,
Expand All @@ -9,11 +11,16 @@ import {
nonWord,
oneOrMore,
optional,
type RegexSequence,
whitespace,
word,
zeroOrMore,
} from '../..';

/** Test helper: compiles `sequence` into a RegExp with the unicode flag switched on. */
function u(sequence: RegexSequence) {
  const unicodeFlags = { unicode: true };
  return buildRegExp(sequence, unicodeFlags);
}

test('`charClass` base cases', () => {
expect(charClass(charRange('a', 'z'))).toEqualRegex(/[a-z]/);
expect(charClass(charRange('a', 'z'), charRange('A', 'Z'))).toEqualRegex(/[a-zA-Z]/);
Expand Down Expand Up @@ -66,51 +73,181 @@ test('`charRange` throws on incorrect arguments', () => {
);
});

// Covers one- and two-character classes, both standalone and embedded in a sequence.
test('`anyOf` handles basic cases', () => {
  expect(anyOf('a')).toMatchString('a');
  expect(anyOf('a')).toEqualRegex(/[a]/);

  expect(['x', anyOf('a'), 'x']).toMatchString('xax');
  expect(['x', anyOf('a'), 'x']).toEqualRegex(/x[a]x/);

  expect(anyOf('ab')).toMatchString('a');
  expect(anyOf('ab')).toMatchString('b');
  expect(anyOf('ab')).not.toMatchString('c');
  expect(anyOf('ab')).toEqualRegex(/[ab]/);

  expect(['x', anyOf('ab')]).toMatchString('xa');
  expect(['x', anyOf('ab')]).toMatchString('xb');
  expect(['x', anyOf('ab')]).not.toMatchString('x0');
  expect(['x', anyOf('ab')]).toEqualRegex(/x[ab]/);

  expect(['x', anyOf('ab'), 'x']).toMatchString('xax');
  expect(['x', anyOf('ab'), 'x']).toMatchString('xbx');
  expect(['x', anyOf('ab'), 'x']).not.toMatchString('x0x');
  expect(['x', anyOf('ab'), 'x']).toEqualRegex(/x[ab]x/);
});

// `anyOf('')` must be rejected with a descriptive error.
test('`anyOf` throws on empty text', () => {
expect(() => anyOf('')).toThrowErrorMatchingInlineSnapshot(`"Expected at least one character"`);
});

// A character class is quantified directly: the expected patterns contain no wrapping group.
test('`anyOf` pattern with quantifiers', () => {
expect(['x', oneOrMore(anyOf('abc')), 'x']).toEqualRegex(/x[abc]+x/);
expect(['x', optional(anyOf('abc')), 'x']).toEqualRegex(/x[abc]?x/);
expect(['x', zeroOrMore(anyOf('abc')), 'x']).toEqualRegex(/x[abc]*x/);
});

test('`anyOf` pattern escapes special characters', () => {
  // `-`, `]` and `\` are escaped where they appear (the hyphen is no longer moved to the end);
  // `+` and `.` need no escaping inside a character class.
  expect(anyOf('abc-+.]\\')).toEqualRegex(/[abc\-+.\]\\]/);
});

// Hyphens and carets are escaped in place, so their position in the input is preserved
// and neither can be misread as a range operator or a negation marker.
test('`anyOf` handles hyphens', () => {
  expect(anyOf('^-')).toMatchString('^');
  expect(anyOf('^-')).toMatchString('-');
  expect(anyOf('^-')).not.toMatchString('a');
  expect(anyOf('^-')).toEqualRegex(/[\^\-]/);

  expect(anyOf('-^')).toMatchString('^');
  expect(anyOf('-^')).toMatchString('-');
  expect(anyOf('-^')).not.toMatchString('a');
  expect(anyOf('-^')).toEqualRegex(/[\-\^]/);

  expect(anyOf('-^a')).toMatchString('^');
  expect(anyOf('-^a')).toMatchString('-');
  expect(anyOf('-^a')).toMatchString('a');
  expect(anyOf('-^a')).not.toMatchString('b');
  expect(anyOf('-^a')).toEqualRegex(/[\-\^a]/);

  // A hyphen between two characters stays where it is, escaped, instead of forming a range.
  expect(anyOf('a-bc')).toEqualRegex(/[a\-bc]/);
});

// Same hyphen/caret cases as the non-unicode test, compiled with the `u` flag, where
// unnecessary escapes are syntax errors; `\-` and `\^` must remain valid inside a class.
test('`anyOf` handles hyphens in unicode mode', () => {
  expect(u(anyOf('^-'))).toMatchString('^');
  expect(u(anyOf('^-'))).toMatchString('-');
  expect(u(anyOf('^-'))).not.toMatchString('a');
  expect(u(anyOf('^-'))).toEqualRegex(/[\^\-]/u);

  expect(u(anyOf('-^'))).toMatchString('^');
  expect(u(anyOf('-^'))).toMatchString('-');
  expect(u(anyOf('-^'))).not.toMatchString('a');
  expect(u(anyOf('-^'))).toEqualRegex(/[\-\^]/u);

  expect(u(anyOf('-^a'))).toMatchString('^');
  expect(u(anyOf('-^a'))).toMatchString('-');
  expect(u(anyOf('-^a'))).toMatchString('a');
  expect(u(anyOf('-^a'))).not.toMatchString('b');
  expect(u(anyOf('-^a'))).toEqualRegex(/[\-\^a]/u);
});

// Regex metacharacters are literal inside a character class; only `^`, `]` and `\`
// (plus `-`, covered by the hyphen tests) are expected to come out escaped.
test('`anyOf` handles special chars', () => {
  expect(anyOf('.')).toMatchString('.');
  expect(anyOf('.')).not.toMatchString('a');
  expect(anyOf('.')).toEqualRegex(/[.]/);

  expect(anyOf('*')).toMatchString('*');
  expect(anyOf('*')).not.toMatchString('a');
  expect(anyOf('*')).toEqualRegex(/[*]/);

  expect(anyOf('+')).toMatchString('+');
  expect(anyOf('+')).not.toMatchString('a');
  expect(anyOf('+')).toEqualRegex(/[+]/);

  expect(anyOf('?')).toMatchString('?');
  expect(anyOf('?')).not.toMatchString('a');
  expect(anyOf('?')).toEqualRegex(/[?]/);

  // A lone caret must be escaped: an unescaped `[^]` is a negated empty class.
  expect(anyOf('^')).toMatchString('^');
  expect(anyOf('^')).not.toMatchString('a');
  expect(anyOf('^')).toEqualRegex(/[\^]/);

  expect(anyOf('^0')).toMatchString('^');
  expect(anyOf('^0')).not.toMatchString('a');
  expect(anyOf('^0')).toEqualRegex(/[\^0]/);

  expect(anyOf('0^')).toMatchString('^');
  expect(anyOf('0^')).not.toMatchString('a');
  expect(anyOf('0^')).toEqualRegex(/[0\^]/);

  expect(anyOf('$')).toMatchString('$');
  expect(anyOf('$')).not.toMatchString('a');
  expect(anyOf('$')).toEqualRegex(/[$]/);

  expect(anyOf('{')).toMatchString('{');
  expect(anyOf('{')).not.toMatchString('a');
  expect(anyOf('{')).toEqualRegex(/[{]/);

  expect(anyOf('}')).toMatchString('}');
  expect(anyOf('}')).not.toMatchString('a');
  expect(anyOf('}')).toEqualRegex(/[}]/);

  expect(anyOf('(')).toMatchString('(');
  expect(anyOf('(')).not.toMatchString('a');
  expect(anyOf('(')).toEqualRegex(/[(]/);

  expect(anyOf(')')).toMatchString(')');
  expect(anyOf(')')).not.toMatchString('a');
  expect(anyOf(')')).toEqualRegex(/[)]/);

  expect(anyOf('|')).toMatchString('|');
  expect(anyOf('|')).not.toMatchString('a');
  expect(anyOf('|')).toEqualRegex(/[|]/);

  expect(anyOf('[')).toMatchString('[');
  expect(anyOf('[')).not.toMatchString('a');
  expect(anyOf('[')).toEqualRegex(/[[]/);

  expect(anyOf(']')).toMatchString(']');
  expect(anyOf(']')).not.toMatchString('a');
  expect(anyOf(']')).toEqualRegex(/[\]]/);

  expect(anyOf('\\')).toMatchString('\\');
  expect(anyOf('\\')).not.toMatchString('a');
  expect(anyOf('\\')).toEqualRegex(/[\\]/);
});

// NOTE(review): despite its title this test only checks a plain character; the special
// characters themselves are asserted in '`anyOf` handles special chars'.
test('`anyOf` matches special characters', () => {
  expect(anyOf('a')).toMatchString('a');
});

// Matching-only checks with the `u` flag: each special character must match itself and
// nothing else, and building the RegExp must not throw on an invalid escape.
test('`anyOf` matches special characters in unicode mode', () => {
  expect(u(anyOf('a'))).toMatchString('a');

  expect(u(anyOf('.'))).toMatchString('.');
  expect(u(anyOf('.'))).not.toMatchString('a');
  expect(u(anyOf('*'))).toMatchString('*');
  expect(u(anyOf('*'))).not.toMatchString('a');
  expect(u(anyOf('+'))).toMatchString('+');
  expect(u(anyOf('+'))).not.toMatchString('a');
  expect(u(anyOf('?'))).toMatchString('?');
  expect(u(anyOf('?'))).not.toMatchString('a');
  expect(u(anyOf('^'))).toMatchString('^');
  expect(u(anyOf('^'))).not.toMatchString('a');
  expect(u(anyOf('^0'))).toMatchString('^');
  expect(u(anyOf('^0'))).not.toMatchString('a');
  expect(u(anyOf('0^'))).toMatchString('^');
  expect(u(anyOf('0^'))).not.toMatchString('a');
  expect(u(anyOf('$'))).toMatchString('$');
  expect(u(anyOf('$'))).not.toMatchString('a');
  expect(u(anyOf('{'))).toMatchString('{');
  expect(u(anyOf('{'))).not.toMatchString('a');
  expect(u(anyOf('}'))).toMatchString('}');
  expect(u(anyOf('}'))).not.toMatchString('a');
  expect(u(anyOf('('))).toMatchString('(');
  expect(u(anyOf('('))).not.toMatchString('a');
  expect(u(anyOf(')'))).toMatchString(')');
  expect(u(anyOf(')'))).not.toMatchString('a');
  expect(u(anyOf('|'))).toMatchString('|');
  expect(u(anyOf('|'))).not.toMatchString('a');
  expect(u(anyOf('['))).toMatchString('[');
  expect(u(anyOf('['))).not.toMatchString('a');
  expect(u(anyOf(']'))).toMatchString(']');
  expect(u(anyOf(']'))).not.toMatchString('a');
  expect(u(anyOf('\\'))).toMatchString('\\');
  expect(u(anyOf('\\'))).not.toMatchString('a');
});

test('`negated` character class pattern', () => {
Expand Down
25 changes: 6 additions & 19 deletions src/constructs/char-class.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ export function charClass(...elements: Array<CharacterClass | CharacterEscape>):
}

return {
chars: elements.map((c) => c.chars).flat(),
ranges: elements.map((c) => c.ranges ?? []).flat(),
elements: elements.map((c) => c.elements).flat(),
encode: encodeCharClass,
};
}
Expand All @@ -36,8 +35,7 @@ export function charRange(start: string, end: string): CharacterClass {
}

return {
chars: [],
ranges: [{ start, end }],
elements: [`${start}-${end}`],
encode: encodeCharClass,
};
}
Expand All @@ -52,7 +50,7 @@ export function anyOf(chars: string): CharacterClass {
ensureText(chars);

return {
chars: chars.split('').map(escapeChar),
elements: chars.split('').map(escapeChar),
encode: encodeCharClass,
};
}
Expand All @@ -74,27 +72,16 @@ export const inverted = negated;

/**
 * Escapes text for literal use inside a character class (`[...]`).
 *
 * Only four characters are special inside a class: `]` (closes it), `\` (starts an
 * escape), `^` (negates when first) and `-` (forms a range between neighbours).
 * Each is prefixed with a backslash so it stays literal wherever it appears,
 * e.g. `anyOf(']-\\^')`. All four escapes are also valid under the `u` flag.
 *
 * @param text - Raw text (callers pass one character at a time).
 * @returns The text with every `]`, `-`, `\` and `^` backslash-escaped.
 */
function escapeChar(text: string): string {
  return text.replace(/[\]\-\\^]/g, '\\$&'); // "$&" is the whole matched string
}

/**
 * Encodes the receiver's `elements` as a single character-class atom.
 *
 * Elements are concatenated in their original order between `[` and `]`; no
 * reordering of hyphens or carets happens here, so every element must already be
 * valid class syntax (e.g. `a-z` from `charRange`, `\d` from an escape, `\-` from
 * `anyOf`).
 *
 * @param isNegated - When true, emits a negated class (`[^...]`).
 * @returns An encoded regex node with `atom` precedence.
 */
function encodeCharClass(
  this: CharacterClass | CharacterEscape,
  isNegated?: boolean,
): EncodedRegex {
  return {
    precedence: 'atom',
    pattern: `[${isNegated ? '^' : ''}${this.elements.join('')}]`,
  };
}
12 changes: 6 additions & 6 deletions src/constructs/char-escape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export const any: EncodedRegex = {
export const digit: CharacterEscape = {
precedence: 'atom',
// Pattern used when the escape stands alone, outside a character class.
pattern: '\\d',
// NOTE(review): `chars` and `elements` hold the same value; `chars` looks like the
// pre-refactor field name — confirm against the `CharacterEscape` type and drop it if stale.
chars: ['\\d'],
elements: ['\\d'],
};

/**
Expand All @@ -24,7 +24,7 @@ export const digit: CharacterEscape = {
export const nonDigit: CharacterEscape = {
precedence: 'atom',
// Pattern used when the escape stands alone, outside a character class.
pattern: '\\D',
// NOTE(review): `chars` and `elements` hold the same value; `chars` looks like the
// pre-refactor field name — confirm against the `CharacterEscape` type and drop it if stale.
chars: ['\\D'],
elements: ['\\D'],
};

/**
Expand All @@ -33,7 +33,7 @@ export const nonDigit: CharacterEscape = {
export const word: CharacterEscape = {
precedence: 'atom',
// Pattern used when the escape stands alone, outside a character class.
pattern: '\\w',
// NOTE(review): `chars` and `elements` hold the same value; `chars` looks like the
// pre-refactor field name — confirm against the `CharacterEscape` type and drop it if stale.
chars: ['\\w'],
elements: ['\\w'],
};

/**
Expand All @@ -42,7 +42,7 @@ export const word: CharacterEscape = {
export const nonWord: CharacterEscape = {
precedence: 'atom',
// Pattern used when the escape stands alone, outside a character class.
pattern: '\\W',
// NOTE(review): `chars` and `elements` hold the same value; `chars` looks like the
// pre-refactor field name — confirm against the `CharacterEscape` type and drop it if stale.
chars: ['\\W'],
elements: ['\\W'],
};

/**
Expand All @@ -51,7 +51,7 @@ export const nonWord: CharacterEscape = {
export const whitespace: CharacterEscape = {
precedence: 'atom',
// Pattern used when the escape stands alone, outside a character class.
pattern: '\\s',
// NOTE(review): `chars` and `elements` hold the same value; `chars` looks like the
// pre-refactor field name — confirm against the `CharacterEscape` type and drop it if stale.
chars: ['\\s'],
elements: ['\\s'],
};

/**
Expand All @@ -60,7 +60,7 @@ export const whitespace: CharacterEscape = {
export const nonWhitespace: CharacterEscape = {
precedence: 'atom',
// Pattern used when the escape stands alone, outside a character class.
pattern: '\\S',
// NOTE(review): `chars` and `elements` hold the same value; `chars` looks like the
// pre-refactor field name — confirm against the `CharacterEscape` type and drop it if stale.
chars: ['\\S'],
elements: ['\\S'],
};

/**
Expand Down
Loading

0 comments on commit d218336

Please sign in to comment.