Skip to content

Commit

Permalink
Implement namedGroups in stdregex
Browse files Browse the repository at this point in the history
  • Loading branch information
stephenamar-db committed Jan 3, 2025
1 parent 6aa8e1b commit ebd99a4
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 88 deletions.
19 changes: 18 additions & 1 deletion sjsonnet/src-js/sjsonnet/Platform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,27 @@ object Platform {
}

private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern]
private val namedGroupPattern = Pattern.compile("\\(\\?<(.+?)>.*?\\)")
private val namedGroupPatternReplace = Pattern.compile("(\\(\\?P<)(.+?>.*?\\))")

// Scala.js does not rely on RE2; see https://www.scala-js.org/doc/regular-expressions.html.
// Expect some differences in behavior compared to the JVM and Native implementations.
def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat))
/**
 * Returns the compiled [[Pattern]] for `pat`, compiling it on first use and caching it
 * under the original (unrewritten) pattern string.
 *
 * Before compilation every `(?P<name>` named-group opener is rewritten to `(?<name>`.
 * The rewrite is a literal substring replacement so that nested named groups such as
 * `(?P<a>(?P<b>x))` are all converted, and it is performed inside the cache loader so
 * that cache hits do not pay for it again.
 *
 * @param pat the regular expression source, possibly using `(?P<name>...)` groups
 * @return the cached, compiled pattern
 */
def getPatternFromCache(pat: String) : Pattern =
  regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat.replace("(?P<", "(?<")))


/**
 * Builds a map from each named capturing group in `pat` to its 1-based capturing-group
 * index, by scanning the pattern source text.
 *
 * Every capturing group (named or not) advances the index, so a named group that follows
 * other groups gets its real position rather than always 1. Escaped characters,
 * `\Q...\E` literal sections and character classes are skipped so that parentheses inside
 * them are not counted, and `(?<=` / `(?<!` lookbehinds are not mistaken for named groups.
 *
 * @param pat a compiled pattern whose `pattern()` uses the `(?<name>...)` syntax
 * @return group name -> 1-based group index; empty when the pattern has no named groups
 */
def getNamedGroupsMap(pat: Pattern): Map[String, Int] = {
  val src = pat.pattern()
  val len = src.length
  val namedGroups = Map.newBuilder[String, Int]
  var groupIndex = 0 // index of the most recently opened capturing group
  var classDepth = 0 // > 0 while inside a (possibly nested) character class
  var i = 0
  while (i < len) {
    val c = src.charAt(i)
    if (c == '\\') {
      if (i + 1 < len && src.charAt(i + 1) == 'Q') {
        // Literal section: nothing up to the closing \E is regex syntax.
        val end = src.indexOf("\\E", i + 2)
        i = if (end < 0) len else end + 2
      } else {
        // Escaped character: skip it so that "\(" and "\[" are not treated as syntax.
        i += 2
      }
    } else if (c == '[') {
      classDepth += 1
      i += 1
      // A ']' directly after '[' or '[^' is taken as a literal member of the class.
      if (i < len && src.charAt(i) == '^') i += 1
      if (i < len && src.charAt(i) == ']') i += 1
    } else if (c == ']') {
      if (classDepth > 0) classDepth -= 1
      i += 1
    } else if (c == '(' && classDepth == 0) {
      if (i + 1 < len && src.charAt(i + 1) == '?') {
        // "(?<" opens a named group unless it is a lookbehind ("(?<=" or "(?<!").
        val isNamed = i + 3 < len && src.charAt(i + 2) == '<' &&
          src.charAt(i + 3) != '=' && src.charAt(i + 3) != '!'
        val close = if (isNamed) src.indexOf('>', i + 3) else -1
        if (close >= 0) {
          groupIndex += 1
          namedGroups += src.substring(i + 3, close) -> groupIndex
          i = close + 1
        } else {
          // Non-capturing construct such as (?:...), (?=...), (?i): does not consume an index.
          i += 1
        }
      } else {
        // Plain capturing group.
        groupIndex += 1
        i += 1
      }
    } else {
      i += 1
    }
  }
  namedGroups.result()
}

def regexQuote(s: String): String = Pattern.quote(s)
}
14 changes: 13 additions & 1 deletion sjsonnet/src-jvm/sjsonnet/Platform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,19 @@ object Platform {
}

private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern]
private val dashPattern = getPatternFromCache("-")

/** Returns the compiled pattern for `pat`, compiling and caching it on first use. */
def getPatternFromCache(pat: String) : Pattern =
  regexCache.computeIfAbsent(pat, (source: String) => Pattern.compile(source))

def regexQuote(s: String): String = Pattern.quote(s)
// Converts the name -> group number map reported by `pat.namedGroups()` into an immutable Scala Map.
def getNamedGroupsMap(pat: Pattern): Map[String, Int] = pat.namedGroups().asScala.view.mapValues(_.intValue()).toMap

/**
 * Quotes `s` with `Pattern.quote` and then prefixes every '-' in the quoted text with a
 * backslash (presumably so the result can be embedded in a character class — confirm
 * against callers).
 */
def regexQuote(s: String): String = {
  val quoted = Pattern.quote(s)
  // replaceAll yields the quoted text unchanged when it contains no '-'.
  dashPattern.matcher(quoted).replaceAll("\\\\-")
}
}
17 changes: 14 additions & 3 deletions sjsonnet/src-native/sjsonnet/Platform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,20 @@ object Platform {
}

private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern]
// scala native is powered by RE2, per https://scala-native.org/en/latest/lib/javalib.html#regular-expressions-java-util-regexp
// It should perform similarly to the JVM implementation.
private val dashPattern = getPatternFromCache("-")

/** Returns the compiled pattern for `pat`, compiling and caching it on first use. */
def getPatternFromCache(pat: String) : Pattern =
  regexCache.computeIfAbsent(pat, (source: String) => Pattern.compile(source))

def regexQuote(s: String): String = Pattern.quote(s)
// Converts the name -> group number map exposed by the underlying RE2 object (`pat.re2.namedGroups`)
// into an immutable Scala Map.
def getNamedGroupsMap(pat: Pattern): Map[String, Int] = scala.jdk.javaapi.CollectionConverters.asScala(
pat.re2.namedGroups).view.mapValues(_.intValue()).toMap

/**
 * Quotes `s` with `Pattern.quote` and then prefixes every '-' in the quoted text with a
 * backslash (presumably so the result can be embedded in a character class — confirm
 * against callers).
 */
def regexQuote(s: String): String = {
  val quoted = Pattern.quote(s)
  // replaceAll yields the quoted text unchanged when it contains no '-'.
  dashPattern.matcher(quoted).replaceAll("\\\\-")
}
}
21 changes: 4 additions & 17 deletions sjsonnet/src/sjsonnet/Std.scala
Original file line number Diff line number Diff line change
Expand Up @@ -483,26 +483,14 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map.
}

private object StripUtils {
private val dashPattern = Platform.getPatternFromCache("-")

private def cleanupPattern(chars: String): String = {
val matcher = dashPattern.matcher(chars)
if (matcher.find()) {
matcher.replaceAll("") + "-"
} else {
chars
}
}

// Regex source matching one or more of the (quoted) `chars` at the start of the input.
private def getLeadingPattern(chars: String): String = s"^[${Platform.regexQuote(chars)}]+"

// Regex source matching one or more of the (quoted) `chars` at the end of the input.
private def getTrailingPattern(chars: String): String = s"[${Platform.regexQuote(chars)}]+$$"

def unspecializedStrip(str: String, chars: String, left: Boolean, right: Boolean): String = {
var s = str
val cleanedUpPattern = cleanupPattern(chars)
if (right) s = Platform.getPatternFromCache(getTrailingPattern(cleanedUpPattern)).matcher(s).replaceAll("")
if (left) s = Platform.getPatternFromCache(getLeadingPattern(cleanedUpPattern)).matcher(s).replaceAll("")
if (right) s = Platform.getPatternFromCache(getTrailingPattern(chars)).matcher(s).replaceAll("")
if (left) s = Platform.getPatternFromCache(getLeadingPattern(chars)).matcher(s).replaceAll("")
s
}

Expand All @@ -512,9 +500,8 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map.
right: Boolean,
functionName: String
) extends Val.Builtin1(functionName, "str") {
private[this] val cleanedUpPattern = cleanupPattern(chars)
private[this] val leftPattern = Platform.getPatternFromCache(getLeadingPattern(cleanedUpPattern))
private[this] val rightPattern = Platform.getPatternFromCache(getTrailingPattern(cleanedUpPattern))
private[this] val leftPattern = Platform.getPatternFromCache(getLeadingPattern(chars))
private[this] val rightPattern = Platform.getPatternFromCache(getTrailingPattern(chars))

def evalRhs(str: Val, ev: EvalScope, pos: Position): Val = {
var s = str.asString
Expand Down
98 changes: 45 additions & 53 deletions sjsonnet/src/sjsonnet/StdRegex.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,66 +4,58 @@ import sjsonnet.Expr.Member.Visibility
import sjsonnet.Val.Obj

object StdRegex {
/**
 * Searches `str` for every occurrence of `pattern` and packages the outcome as a Jsonnet value.
 *
 * Returns `Val.Null` when nothing matches. Otherwise returns an object with:
 *  - "string": the text of the first match;
 *  - "captures": the group texts of every match, appended in match order, with groups
 *    that did not participate represented as "";
 *  - "namedCaptures": each named group mapped to the element of "captures" at
 *    (group index - 1).
 *
 * @param pos     position whose `noOffset` is attached to every value created here
 * @param pattern regular expression source, compiled through the platform pattern cache
 * @param str     the text to search
 */
def regexPartialMatch(pos: Position, pattern: String, str: String): Val = {
  val compiled = Platform.getPatternFromCache(pattern)
  val m = compiled.matcher(str)
  val nGroups = m.groupCount()
  val collected = Array.newBuilder[Val]
  collected.sizeHint(nGroups)
  // Stays null until the first successful find(); doubles as the "did anything match" flag.
  var firstMatch: Val = null

  while (m.find()) {
    if (firstMatch == null) {
      val whole = m.group(0)
      firstMatch = Val.Str(pos.noOffset, if (whole == null) "" else whole)
    }
    var idx = 1
    while (idx <= nGroups) {
      val text = m.group(idx)
      collected += Val.Str(pos.noOffset, if (text == null) "" else text)
      idx += 1
    }
  }

  if (firstMatch == null) {
    Val.Null(pos.noOffset)
  } else {
    val allCaptures = collected.result()
    val named = Platform.getNamedGroupsMap(compiled).toSeq.map { case (name, groupIdx) =>
      name -> new Obj.ConstMember(true, Visibility.Normal, allCaptures(groupIdx - 1))
    }

    Val.Obj.mk(pos.noOffset,
      "string" -> new Obj.ConstMember(true, Visibility.Normal, firstMatch),
      "captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, allCaptures)),
      "namedCaptures" -> new Obj.ConstMember(true, Visibility.Normal, Val.Obj.mk(pos.noOffset, named: _*))
    )
  }
}

def functions: Map[String, Val.Builtin] = Map(
"regexPartialMatch" -> new Val.Builtin2("regexPartialMatch", "pattern", "str") {
override def evalRhs(pattern: Val, str: Val, ev: EvalScope, pos: Position): Val = {
val compiledPattern = Platform.getPatternFromCache(pattern.asString)
val matcher = compiledPattern.matcher(str.asString)
var returnStr: Val = null
val captures = Array.newBuilder[Val]
val groupCount = matcher.groupCount()
while (matcher.find()) {
if (returnStr == null) {
val m = matcher.group(0)
if (m != null) {
returnStr = Val.Str(pos.noOffset, matcher.group(0))
} else {
returnStr = Val.Null(pos.noOffset)
}
}
for (i <- 1 to groupCount) {
val m = matcher.group(i)
if (m == null) {
captures += Val.Null(pos.noOffset)
} else {
captures += Val.Str(pos.noOffset, m)
}
}
}
val result = captures.result()
Val.Obj.mk(pos.noOffset,
"string" -> new Obj.ConstMember(true, Visibility.Normal,
if (returnStr == null) Val.Null(pos.noOffset) else returnStr),
"captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, result))
)
regexPartialMatch(pos, pattern.asString, str.asString)
}
},
"regexFullMatch" -> new Val.Builtin2("regexFullMatch", "pattern", "str") {
override def evalRhs(pattern: Val, str: Val, ev: EvalScope, pos: Position): Val = {
val compiledPattern = Platform.getPatternFromCache(pattern.asString)
val matcher = compiledPattern.matcher(str.asString)
if (!matcher.matches()) {
Val.Obj.mk(pos.noOffset,
"string" -> new Obj.ConstMember(true, Visibility.Normal, Val.Null(pos.noOffset)),
"captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, Array.empty[Lazy]))
)
} else {
val captures = Array.newBuilder[Val]
val groupCount = matcher.groupCount()
for (i <- 0 to groupCount) {
val m = matcher.group(i)
if (m == null) {
captures += Val.Null(pos.noOffset)
} else {
captures += Val.Str(pos.noOffset, m)
}
}
val result = captures.result()
Val.Obj.mk(pos.noOffset,
"string" -> new Obj.ConstMember(true, Visibility.Normal, result.head),
"captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, result.drop(1)))
)
}
regexPartialMatch(pos, s"^${pattern.asString}$$", str.asString)
}
},
"regexGlobalReplace" -> new Val.Builtin3("regexGlobalReplace", "str", "pattern", "to") {
Expand Down
34 changes: 21 additions & 13 deletions sjsonnet/test/src/sjsonnet/StdRegexTests.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,36 @@ object StdRegexTests extends TestSuite {
test("std.native - regex") {
eval("""std.native("regexPartialMatch")("a(b)c", "cabc")""") ==> ujson.Obj(
"string" -> "abc",
"captures" -> ujson.Arr("b")
"captures" -> ujson.Arr("b"),
"namedCaptures" -> ujson.Obj()
)
eval("""std.native("regexPartialMatch")("a(b)c", "def")""") ==> ujson.Obj(
"string" -> ujson.Null,
"captures" -> ujson.Arr()
eval("""std.native("regexPartialMatch")("a(?P<foo>b)c", "cabc")""") ==> ujson.Obj(
"string" -> "abc",
"captures" -> ujson.Arr("b"),
"namedCaptures" -> ujson.Obj(
"foo" -> ujson.Str("b")
)
)
eval("""std.native("regexPartialMatch")("a(b)c", "def")""") ==> ujson.Null
eval("""std.native("regexPartialMatch")("a(b)c", "abcabc")""") ==> ujson.Obj(
"string" -> "abc",
"captures" -> ujson.Arr("b", "b")
"captures" -> ujson.Arr("b", "b"),
"namedCaptures" -> ujson.Obj()
)
eval("""std.native("regexFullMatch")("a(b)c", "abc")""") ==> ujson.Obj(
"string" -> "abc",
"captures" -> ujson.Arr("b")
)
eval("""std.native("regexFullMatch")("a(b)c", "cabc")""") ==> ujson.Obj(
"string" -> ujson.Null,
"captures" -> ujson.Arr()
"captures" -> ujson.Arr("b"),
"namedCaptures" -> ujson.Obj()
)
eval("""std.native("regexFullMatch")("a(b)c", "def")""") ==> ujson.Obj(
"string" -> ujson.Null,
"captures" -> ujson.Arr()
eval("""std.native("regexFullMatch")("a(?P<foo>b)c", "abc")""") ==> ujson.Obj(
"string" -> "abc",
"captures" -> ujson.Arr("b"),
"namedCaptures" -> ujson.Obj(
"foo" -> ujson.Str("b")
)
)
eval("""std.native("regexFullMatch")("a(b)c", "cabc")""") ==> ujson.Null
eval("""std.native("regexFullMatch")("a(b)c", "def")""") ==> ujson.Null
eval("""std.native("regexGlobalReplace")("abcbbb", "b", "d")""") ==> ujson.Str("adcddd")
eval("""std.native("regexReplace")("abcbbb", "b", "d")""") ==> ujson.Str("adcbbb")
eval("""std.native("regexQuoteMeta")("a.b")""") ==> ujson.Str(Platform.regexQuote("a.b"))
Expand Down

0 comments on commit ebd99a4

Please sign in to comment.