add with_ascii_uppercased and caseless_ascii_equals to Str

roc-lang · Jan 23, 2025 · fc0a9ec · fc0a9ec
1 parent 690e690
commit fc0a9ec
Show file tree

Hide file tree

Showing 49 changed files with 2,019 additions and 1,718 deletions.
diff --git a/crates/compiler/builtins/bitcode/src/main.zig b/crates/compiler/builtins/bitcode/src/main.zig
@@ -213,6 +213,8 @@ comptime {
     exportStrFn(str.strAllocationPtr, "allocation_ptr");
     exportStrFn(str.strReleaseExcessCapacity, "release_excess_capacity");
     exportStrFn(str.strWithAsciiLowercased, "with_ascii_lowercased");
+    exportStrFn(str.strWithAsciiUppercased, "with_ascii_uppercased");
+    exportStrFn(str.strCaselessAsciiEquals, "caseless_ascii_equals");
 
     for (INTEGERS) |T| {
         str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");

diff --git a/crates/compiler/builtins/bitcode/src/str.zig b/crates/compiler/builtins/bitcode/src/str.zig
@@ -2195,6 +2195,135 @@ test "withAsciiLowercased: seamless slice" {
     try expect(str_result.eq(expected));
 }
 
+// Str.with_ascii_uppercased
+//
+// Returns `string` with every ASCII lowercase letter replaced by its uppercase
+// counterpart. All other bytes, including the bytes of multi-byte UTF-8
+// sequences, are left untouched.
+//
+// When `string.isUnique()` holds, `string` itself is reused as the result.
+// Otherwise the bytes are copied with `RocStr.fromSlice` and the reference
+// passed in is released with `decref`.
+pub fn strWithAsciiUppercased(string: RocStr) callconv(.C) RocStr {
+    var new_str = if (string.isUnique())
+        string
+    else blk: {
+        // Copy first, release second: the bytes should not be read through a
+        // reference that has already been given up.
+        const copy = RocStr.fromSlice(string.asSlice());
+        string.decref();
+        break :blk copy;
+    };
+
+    // Uppercase byte by byte; `ascii.toUpper` only changes 'a'...'z'.
+    const new_str_bytes = new_str.asU8ptrMut()[0..new_str.len()];
+    for (new_str_bytes) |*c| {
+        c.* = ascii.toUpper(c.*);
+    }
+    return new_str;
+}
+
+test "withAsciiUppercased: small str" {
+    // Short enough to be stored as a small string; checked right below.
+    const input = RocStr.fromSlice("coffé");
+    try expect(input.isSmallStr());
+
+    const want = RocStr.fromSlice("COFFé");
+    defer want.decref();
+
+    const got = strWithAsciiUppercased(input);
+    defer got.decref();
+
+    // The result must still be a small string, and the non-ASCII `é` must be unchanged.
+    try expect(got.isSmallStr());
+    try expect(got.eq(want));
+}
+
+test "withAsciiUppercased: non small str" {
+    const input = RocStr.fromSlice("coffé coffé coffé coffé coffé coffé");
+    defer input.decref();
+    try expect(!input.isSmallStr());
+
+    const want = RocStr.fromSlice("COFFé COFFé COFFé COFFé COFFé COFFé");
+    defer want.decref();
+
+    // The result is not decref'd here; the only cleanup is the deferred decref of `input`.
+    const got = strWithAsciiUppercased(input);
+
+    try expect(!got.isSmallStr());
+    try expect(got.eq(want));
+}
+
+test "withAsciiUppercased: seamless slice" {
+    const backing = RocStr.fromSlice("coffé coffé coffé coffé coffé coffé");
+    // Start at byte 1 and take the remaining `len - 1` bytes.
+    const input = substringUnsafeC(backing, 1, backing.len() - 1);
+    defer input.decref();
+
+    try expect(input.isSeamlessSlice());
+
+    const want = RocStr.fromSlice("OFFé COFFé COFFé COFFé COFFé COFFé");
+    defer want.decref();
+
+    const got = strWithAsciiUppercased(input);
+
+    try expect(!got.isSmallStr());
+    try expect(got.eq(want));
+}
+
+// Str.caseless_ascii_equals
+//
+// Returns true when `self` and `other` are equal according to
+// `ascii.eqlIgnoreCase`, i.e. equal up to the case of ASCII letters.
+pub fn strCaselessAsciiEquals(self: RocStr, other: RocStr) callconv(.C) bool {
+    // Fast path: all three fields match, so the two values are the very same
+    // string and no byte comparison is needed.
+    const same_repr = self.bytes == other.bytes and
+        self.length == other.length and
+        self.capacity_or_alloc_ptr == other.capacity_or_alloc_ptr;
+
+    // `or` short-circuits, so the slices are only compared when the fast path misses.
+    return same_repr or ascii.eqlIgnoreCase(self.asSlice(), other.asSlice());
+}
+
+test "caselessAsciiEquals: same str" {
+    const subject = RocStr.fromSlice("coFféÉ");
+    defer subject.decref();
+
+    // A string compared with itself is always equal.
+    try expect(strCaselessAsciiEquals(subject, subject));
+}
+
+test "caselessAsciiEquals: differently capitalized non-ascii char" {
+    const lower = RocStr.fromSlice("coffé");
+    defer lower.decref();
+    try expect(lower.isSmallStr());
+
+    const upper_accent = RocStr.fromSlice("coffÉ");
+    defer upper_accent.decref();
+
+    // `é` and `É` are not ASCII, so a case difference between them is a real difference.
+    try expect(!strCaselessAsciiEquals(lower, upper_accent));
+}
+
+test "caselessAsciiEquals: small str" {
+    const lower = RocStr.fromSlice("coffé");
+    defer lower.decref();
+    try expect(lower.isSmallStr());
+
+    const upper = RocStr.fromSlice("COFFé");
+    defer upper.decref();
+
+    // Only the ASCII letters differ in case; the `é` is identical in both.
+    try expect(strCaselessAsciiEquals(lower, upper));
+}
+
+test "caselessAsciiEquals: non small str" {
+    const lower = RocStr.fromSlice("coffé coffé coffé coffé coffé coffé");
+    defer lower.decref();
+    try expect(!lower.isSmallStr());
+
+    const upper = RocStr.fromSlice("COFFé COFFé COFFé COFFé COFFé COFFé");
+    defer upper.decref();
+
+    // Same comparison as the small-string case, on strings that are not small strings.
+    try expect(strCaselessAsciiEquals(lower, upper));
+}
+
+test "caselessAsciiEquals: seamless slice" {
+    const backing = RocStr.fromSlice("coffé coffé coffé coffé coffé coffé");
+    // Start at byte 1 and take the remaining `len - 1` bytes.
+    const sliced = substringUnsafeC(backing, 1, backing.len() - 1);
+    defer sliced.decref();
+
+    try expect(sliced.isSeamlessSlice());
+
+    const upper = RocStr.fromSlice("OFFé COFFé COFFé COFFé COFFé COFFé");
+    defer upper.decref();
+
+    try expect(strCaselessAsciiEquals(sliced, upper));
+}
+
 // No-op with the C calling convention: accepts an optional byte pointer and ignores it.
 fn rcNone(_: ?[*]u8) callconv(.C) void {}
 
 fn decStr(ptr: ?[*]u8) callconv(.C) void {

diff --git a/crates/compiler/builtins/roc/Str.roc b/crates/compiler/builtins/roc/Str.roc
@@ -374,6 +374,8 @@ module [
     drop_prefix,
     drop_suffix,
     with_ascii_lowercased,
+    with_ascii_uppercased,
+    caseless_ascii_equals,
 ]
 
 import Bool exposing [Bool]
@@ -1348,7 +1350,71 @@ drop_suffix = |haystack, suffix|
 ## for Unicode capitalization that can be upgraded independently from the language's builtins.
 ##
 ## To do a case-insensitive comparison of the ASCII characters in a string,
-## use [`caseless_ascii_equals`](#caseless_ascii_equals).
+## use [Str.caseless_ascii_equals].
 with_ascii_lowercased : Str -> Str
 
-expect Str.with_ascii_lowercased("cOFFÉ") == "coffÉ"
+expect Str.with_ascii_lowercased("CAFÉ") == "cafÉ"
+
+## Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) uppercased.
+## Non-ASCII characters are left unmodified. For example:
+##
+## ```roc
+##  expect "café".with_ascii_uppercased() == "CAFé"
+## ```
+##
+## This function is useful for things like
+## [command-line options](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option)
+## and [environment variables](https://en.wikipedia.org/wiki/Environment_variable)
+## where you know in advance that you're dealing with a hardcoded string containing only ASCII characters.
+## It has better performance than uppercasing operations which take Unicode into account.
+##
+## That said, strings received from user input can always contain
+## non-ASCII Unicode characters, and uppercasing [Unicode](https://unicode.org)
+## works differently in different languages.
+## For example, the string `"i"` uppercases to `"I"` in English and to `"İ"`
+## (a [dotted I](https://en.wikipedia.org/wiki/%C4%B0)) in Turkish.
+## These rules can also change in each Unicode release,
+## so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization
+## that can be upgraded independently from the language's builtins.
+##
+## To do a case-insensitive comparison of the ASCII characters in a string,
+## use [Str.caseless_ascii_equals].
+with_ascii_uppercased : Str -> Str
+
+expect Str.with_ascii_uppercased("café") == "CAFé"
+
+## Returns `True` if all the [ASCII characters](https://en.wikipedia.org/wiki/ASCII) in the string are the same
+## when ignoring differences in capitalization.
+## Non-ASCII characters must all be exactly the same,
+## including capitalization. For example:
+##
+## ```roc
+##  expect "café".caseless_ascii_equals("CAFé")
+##
+##  expect !"café".caseless_ascii_equals("CAFÉ")
+## ```
+##
+## The first call returns `True` because all the ASCII characters are the same
+## when ignoring differences in capitalization, and the only non-ASCII character
+## (`é`) is the same in both strings. The second call returns `False` because
+## `é` and `É` are not ASCII characters, and they are different.
+##
+## This function is useful for things like [command-line options](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option)
+## and [environment variables](https://en.wikipedia.org/wiki/Environment_variable)
+## where you know in advance that you're dealing with a hardcoded string containing only ASCII characters.
+## It has better performance than lowercasing operations which take Unicode into account.
+##
+## That said, strings received from user input can always contain
+## non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works
+## differently in different languages. For example, the string `"I"` lowercases to `"i"`
+## in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I))
+## in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/),
+## so we have a separate [`unicode` package](https://github.com/roc-lang/unicode)
+## for Unicode capitalization that can be upgraded independently from the language's builtins.
+##
+## To convert a string's ASCII characters to uppercase or lowercase, use [Str.with_ascii_uppercased]
+## and [Str.with_ascii_lowercased].
+caseless_ascii_equals : Str, Str -> Bool
+
+expect Str.caseless_ascii_equals("café", "CAFé")
+expect !Str.caseless_ascii_equals("café", "CAFÉ")
diff --git a/crates/compiler/builtins/src/bitcode.rs b/crates/compiler/builtins/src/bitcode.rs
@@ -360,6 +360,8 @@ pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
 pub const STR_ALLOCATION_PTR: &str = "roc_builtins.str.allocation_ptr";
 pub const STR_RELEASE_EXCESS_CAPACITY: &str = "roc_builtins.str.release_excess_capacity";
 pub const STR_WITH_ASCII_LOWERCASED: &str = "roc_builtins.str.with_ascii_lowercased";
+pub const STR_WITH_ASCII_UPPERCASED: &str = "roc_builtins.str.with_ascii_uppercased";
+pub const STR_CASELESS_ASCII_EQUALS: &str = "roc_builtins.str.caseless_ascii_equals";
 
 pub const LIST_MAP: &str = "roc_builtins.list.map";
 pub const LIST_MAP2: &str = "roc_builtins.list.map2";

diff --git a/crates/compiler/can/src/builtins.rs b/crates/compiler/can/src/builtins.rs
@@ -132,6 +132,8 @@ map_symbol_to_lowlevel_and_arity! {
     StrWithCapacity; STR_WITH_CAPACITY; 1,
     StrReleaseExcessCapacity; STR_RELEASE_EXCESS_CAPACITY; 1,
     StrWithAsciiLowercased; STR_WITH_ASCII_LOWERCASED; 1,
+    StrWithAsciiUppercased; STR_WITH_ASCII_UPPERCASED; 1,
+    StrCaselessAsciiEquals; STR_CASELESS_ASCII_EQUALS; 2,
 
     ListLenUsize; LIST_LEN_USIZE; 1,
     ListLenU64; LIST_LEN_U64; 1,

diff --git a/crates/compiler/gen_dev/src/lib.rs b/crates/compiler/gen_dev/src/lib.rs
@@ -1726,6 +1726,20 @@ trait Backend<'a> {
                 arg_layouts,
                 ret_layout,
             ),
+            LowLevel::StrWithAsciiUppercased => self.build_fn_call(
+                sym,
+                bitcode::STR_WITH_ASCII_UPPERCASED.to_string(),
+                args,
+                arg_layouts,
+                ret_layout,
+            ),
+            LowLevel::StrCaselessAsciiEquals => self.build_fn_call(
+                sym,
+                bitcode::STR_CASELESS_ASCII_EQUALS.to_string(),
+                args,
+                arg_layouts,
+                ret_layout,
+            ),
             LowLevel::StrToNum => {
                 let number_layout = match self.interner().get_repr(*ret_layout) {
                     LayoutRepr::Struct(field_layouts) => field_layouts[0], // TODO: why is it sometimes a struct?

diff --git a/crates/compiler/gen_llvm/src/llvm/lowlevel.rs b/crates/compiler/gen_llvm/src/llvm/lowlevel.rs
@@ -626,6 +626,28 @@ pub(crate) fn run_low_level<'a, 'ctx>(
                 bitcode::STR_WITH_ASCII_LOWERCASED,
             )
         }
+        StrWithAsciiUppercased => {
+            arguments!(string);
+
+            call_str_bitcode_fn(
+                env,
+                &[string],
+                &[],
+                BitcodeReturns::Str,
+                bitcode::STR_WITH_ASCII_UPPERCASED,
+            )
+        }
+        StrCaselessAsciiEquals => {
+            arguments!(string1, string2);
+
+            call_str_bitcode_fn(
+                env,
+                &[string1, string2],
+                &[],
+                BitcodeReturns::Basic,
+                bitcode::STR_CASELESS_ASCII_EQUALS,
+            )
+        }
         ListConcat => {
             debug_assert_eq!(args.len(), 2);
 

diff --git a/crates/compiler/gen_wasm/src/low_level.rs b/crates/compiler/gen_wasm/src/low_level.rs
@@ -262,6 +262,12 @@ impl<'a> LowLevelCall<'a> {
             StrWithAsciiLowercased => {
                 self.load_args_and_call_zig(backend, bitcode::STR_WITH_ASCII_LOWERCASED)
             }
+            StrWithAsciiUppercased => {
+                self.load_args_and_call_zig(backend, bitcode::STR_WITH_ASCII_UPPERCASED)
+            }
+            StrCaselessAsciiEquals => {
+                self.load_args_and_call_zig(backend, bitcode::STR_CASELESS_ASCII_EQUALS)
+            }
 
             // List
             ListLenU64 => {

diff --git a/crates/compiler/module/src/low_level.rs b/crates/compiler/module/src/low_level.rs
@@ -28,6 +28,8 @@ pub enum LowLevel {
     StrWithCapacity,
     StrReleaseExcessCapacity,
     StrWithAsciiLowercased,
+    StrWithAsciiUppercased,
+    StrCaselessAsciiEquals,
     ListLenUsize,
     ListLenU64,
     ListWithCapacity,
@@ -269,6 +271,8 @@ map_symbol_to_lowlevel! {
     StrWithCapacity <= STR_WITH_CAPACITY;
     StrReleaseExcessCapacity <= STR_RELEASE_EXCESS_CAPACITY;
     StrWithAsciiLowercased <= STR_WITH_ASCII_LOWERCASED;
+    StrWithAsciiUppercased <= STR_WITH_ASCII_UPPERCASED;
+    StrCaselessAsciiEquals <= STR_CASELESS_ASCII_EQUALS;
     ListLenU64 <= LIST_LEN_U64;
     ListLenUsize <= LIST_LEN_USIZE;
     ListGetCapacity <= LIST_CAPACITY;

diff --git a/crates/compiler/module/src/symbol.rs b/crates/compiler/module/src/symbol.rs
@@ -1421,10 +1421,12 @@ define_builtins! {
         49 STR_DROP_PREFIX: "drop_prefix"
         50 STR_DROP_SUFFIX: "drop_suffix"
         51 STR_WITH_ASCII_LOWERCASED: "with_ascii_lowercased"
-        52 STR_FROM_UTF16: "from_utf16"
-        53 STR_FROM_UTF16_LOSSY: "from_utf16_lossy"
-        54 STR_FROM_UTF32: "from_utf32"
-        55 STR_FROM_UTF32_LOSSY: "from_utf32_lossy"
+        52 STR_WITH_ASCII_UPPERCASED: "with_ascii_uppercased"
+        53 STR_CASELESS_ASCII_EQUALS: "caseless_ascii_equals"
+        54 STR_FROM_UTF16: "from_utf16"
+        55 STR_FROM_UTF16_LOSSY: "from_utf16_lossy"
+        56 STR_FROM_UTF32: "from_utf32"
+        57 STR_FROM_UTF32_LOSSY: "from_utf32_lossy"
     }
     6 LIST: "List" => {
         0 LIST_LIST: "List" exposed_apply_type=true // the List.List type alias

diff --git a/crates/compiler/mono/src/drop_specialization.rs b/crates/compiler/mono/src/drop_specialization.rs
@@ -1550,6 +1550,8 @@ fn low_level_no_rc(lowlevel: &LowLevel) -> RC {
         StrJoinWith => RC::NoRc,
         ListSortWith => RC::Rc,
         StrWithAsciiLowercased => RC::Rc,
+        StrWithAsciiUppercased => RC::Rc,
+        StrCaselessAsciiEquals => RC::NoRc,
 
         ListAppendUnsafe
         | ListReserve

diff --git a/crates/compiler/mono/src/inc_dec.rs b/crates/compiler/mono/src/inc_dec.rs
@@ -1259,6 +1259,8 @@ pub(crate) fn lowlevel_borrow_signature(op: LowLevel) -> &'static [Ownership] {
         ListIncref => &[OWNED],
         ListDecref => &[OWNED],
         StrWithAsciiLowercased => &[OWNED],
+        StrWithAsciiUppercased => &[OWNED],
+        StrCaselessAsciiEquals => &[BORROWED, BORROWED],
 
         Eq | NotEq => &[BORROWED, BORROWED],
 

diff --git a/crates/compiler/solve/tests/solve_expr.rs b/crates/compiler/solve/tests/solve_expr.rs
@@ -3838,6 +3838,30 @@ mod solve_expr {
         );
     }
 
+    /// Checks via `infer_eq_without_problem` that `Str.with_ascii_uppercased`
+    /// is inferred as `Str -> Str`.
+    #[test]
+    fn str_with_ascii_uppercased() {
+        let source = indoc!(
+            r"
+            Str.with_ascii_uppercased
+            "
+        );
+        let expected_type = "Str -> Str";
+
+        infer_eq_without_problem(source, expected_type);
+    }
+
+    /// Checks via `infer_eq_without_problem` that `Str.caseless_ascii_equals`
+    /// is inferred as `Str, Str -> Bool`.
+    #[test]
+    fn str_caseless_ascii_equals() {
+        let source = indoc!(
+            r"
+            Str.caseless_ascii_equals
+            "
+        );
+        let expected_type = "Str, Str -> Bool";
+
+        infer_eq_without_problem(source, expected_type);
+    }
+
     #[test]
     fn list_take_first() {
         infer_eq_without_problem(