Skip to content

Commit

Permalink
add with_ascii_uppercased and caseless_ascii_equals to Str
Browse files Browse the repository at this point in the history
  • Loading branch information
HajagosNorbert committed Jan 23, 2025
1 parent 690e690 commit fc0a9ec
Show file tree
Hide file tree
Showing 49 changed files with 2,019 additions and 1,718 deletions.
2 changes: 2 additions & 0 deletions crates/compiler/builtins/bitcode/src/main.zig
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ comptime {
exportStrFn(str.strAllocationPtr, "allocation_ptr");
exportStrFn(str.strReleaseExcessCapacity, "release_excess_capacity");
exportStrFn(str.strWithAsciiLowercased, "with_ascii_lowercased");
exportStrFn(str.strWithAsciiUppercased, "with_ascii_uppercased");
exportStrFn(str.strCaselessAsciiEquals, "caseless_ascii_equals");

for (INTEGERS) |T| {
str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");
Expand Down
129 changes: 129 additions & 0 deletions crates/compiler/builtins/bitcode/src/str.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2195,6 +2195,135 @@ test "withAsciiLowercased: seamless slice" {
try expect(str_result.eq(expected));
}

/// Str.with_ascii_uppercased
///
/// Returns `string` with every ASCII lowercase letter replaced by its
/// uppercase counterpart. Bytes outside the ASCII letter range (including
/// every byte of a multi-byte UTF-8 sequence) are left untouched.
///
/// When `string.isUnique()` holds, the argument itself is mutated and
/// returned. Otherwise the bytes are copied into a fresh `RocStr` and the
/// reference held on the argument is released.
pub fn strWithAsciiUppercased(string: RocStr) callconv(.C) RocStr {
    const len = string.len();

    var new_str = if (string.isUnique())
        string
    else blk: {
        // Copy the bytes *before* releasing our reference, so the source
        // buffer is never read after it has been decref'd.
        const copy = RocStr.fromSlice(string.asSlice());
        string.decref();
        break :blk copy;
    };

    const new_str_bytes = new_str.asU8ptrMut()[0..len];
    for (new_str_bytes) |*c| {
        c.* = ascii.toUpper(c.*);
    }
    return new_str;
}

test "withAsciiUppercased: small str" {
    // The input must qualify as a small string for this case to be meaningful.
    const input = RocStr.fromSlice("coffé");
    try expect(input.isSmallStr());

    // Only the ASCII letters should change; the trailing "é" is kept as-is.
    const want = RocStr.fromSlice("COFFé");
    defer want.decref();

    const got = strWithAsciiUppercased(input);
    defer got.decref();

    // The result is still a small string and matches the expectation.
    try expect(got.isSmallStr());
    try expect(got.eq(want));
}

test "withAsciiUppercased: non small str" {
    // Long enough that it must not be a small string.
    const input = RocStr.fromSlice("coffé coffé coffé coffé coffé coffé");
    defer input.decref();
    try expect(!input.isSmallStr());

    // Every ASCII letter uppercased, every "é" untouched.
    const want = RocStr.fromSlice("COFFé COFFé COFFé COFFé COFFé COFFé");
    defer want.decref();

    const got = strWithAsciiUppercased(input);

    try expect(!got.isSmallStr());
    try expect(got.eq(want));
}

test "withAsciiUppercased: seamless slice" {
    // Build a seamless slice by dropping the first byte of a longer string.
    const backing = RocStr.fromSlice("coffé coffé coffé coffé coffé coffé");
    const input = substringUnsafeC(backing, 1, backing.len() - 1);
    defer input.decref();

    try expect(input.isSeamlessSlice());

    // Same text as the slice, with the ASCII letters uppercased.
    const want = RocStr.fromSlice("OFFé COFFé COFFé COFFé COFFé COFFé");
    defer want.decref();

    const got = strWithAsciiUppercased(input);

    try expect(!got.isSmallStr());
    try expect(got.eq(want));
}

/// Str.caseless_ascii_equals
///
/// Returns true when `self` and `other` hold the same bytes once differences
/// in ASCII letter case are ignored. Non-ASCII bytes must match exactly.
pub fn strCaselessAsciiEquals(self: RocStr, other: RocStr) callconv(.C) bool {
    // Fast path: if all three fields are identical, the two values are the
    // very same string and no byte-wise comparison is needed.
    const same_bytes = self.bytes == other.bytes;
    const same_length = self.length == other.length;
    const same_capacity = self.capacity_or_alloc_ptr == other.capacity_or_alloc_ptr;
    const identical = same_bytes and same_length and same_capacity;

    return identical or ascii.eqlIgnoreCase(self.asSlice(), other.asSlice());
}

test "caselessAsciiEquals: same str" {
    // A string compared against itself must always be reported as equal.
    const subject = RocStr.fromSlice("coFféÉ");
    defer subject.decref();

    try expect(strCaselessAsciiEquals(subject, subject));
}

test "caselessAsciiEquals: differently capitalized non-ascii char" {
    const lhs = RocStr.fromSlice("coffé");
    defer lhs.decref();
    try expect(lhs.isSmallStr());

    // Differs from `lhs` only in the case of the non-ASCII letter.
    const rhs = RocStr.fromSlice("coffÉ");
    defer rhs.decref();

    // "é" and "É" are not ASCII, so they must not be treated as equal.
    try expect(!strCaselessAsciiEquals(lhs, rhs));
}

test "caselessAsciiEquals: small str" {
    const lhs = RocStr.fromSlice("coffé");
    defer lhs.decref();
    try expect(lhs.isSmallStr());

    // Same text with only the ASCII letters in a different case.
    const rhs = RocStr.fromSlice("COFFé");
    defer rhs.decref();

    try expect(strCaselessAsciiEquals(lhs, rhs));
}

test "caselessAsciiEquals: non small str" {
    // Long enough that it must not be a small string.
    const lhs = RocStr.fromSlice("coffé coffé coffé coffé coffé coffé");
    defer lhs.decref();
    try expect(!lhs.isSmallStr());

    // Same text with only the ASCII letters in a different case.
    const rhs = RocStr.fromSlice("COFFé COFFé COFFé COFFé COFFé COFFé");
    defer rhs.decref();

    try expect(strCaselessAsciiEquals(lhs, rhs));
}

test "caselessAsciiEquals: seamless slice" {
    // Build a seamless slice by dropping the first byte of a longer string.
    const backing = RocStr.fromSlice("coffé coffé coffé coffé coffé coffé");
    const lhs = substringUnsafeC(backing, 1, backing.len() - 1);
    defer lhs.decref();

    try expect(lhs.isSeamlessSlice());

    // Same text as the slice, with the ASCII letters uppercased.
    const rhs = RocStr.fromSlice("OFFé COFFé COFFé COFFé COFFé COFFé");
    defer rhs.decref();

    try expect(strCaselessAsciiEquals(lhs, rhs));
}

// No-op callback with the C calling convention: accepts an optional byte
// pointer, ignores it, and returns nothing.
fn rcNone(_: ?[*]u8) callconv(.C) void {}

fn decStr(ptr: ?[*]u8) callconv(.C) void {
Expand Down
70 changes: 68 additions & 2 deletions crates/compiler/builtins/roc/Str.roc
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,8 @@ module [
drop_prefix,
drop_suffix,
with_ascii_lowercased,
with_ascii_uppercased,
caseless_ascii_equals,
]

import Bool exposing [Bool]
Expand Down Expand Up @@ -1348,7 +1350,71 @@ drop_suffix = |haystack, suffix|
## for Unicode capitalization that can be upgraded independently from the language's builtins.
##
## To do a case-insensitive comparison of the ASCII characters in a string,
## use [`caseless_ascii_equals`](#caseless_ascii_equals).
## use [Str.caseless_ascii_equals].
with_ascii_lowercased : Str -> Str

expect Str.with_ascii_lowercased("cOFFÉ") == "coffÉ"
expect Str.with_ascii_lowercased("CAFÉ") == "cafÉ"

## Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) uppercased.
## Non-ASCII characters are left unmodified. For example:
##
## ```roc
## expect "café".with_ascii_uppercased() == "CAFé"
## ```
##
## This function is useful for things like
## [command-line options](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option)
## and [environment variables](https://en.wikipedia.org/wiki/Environment_variable)
## where you know in advance that you're dealing with a hardcoded string containing only ASCII characters.
## It has better performance than uppercasing operations which take Unicode into account.
##
## That said, strings received from user input can always contain
## non-ASCII Unicode characters, and uppercasing [Unicode](https://unicode.org)
## works differently in different languages.
## For example, the string `"i"` uppercases to `"I"` in English and to `"İ"`
## (a [dotted I](https://en.wikipedia.org/wiki/%C4%B0)) in Turkish.
## These rules can also change in each Unicode release,
## so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization
## that can be upgraded independently from the language's builtins.
##
## To do a case-insensitive comparison of the ASCII characters in a string,
## use [Str.caseless_ascii_equals].
with_ascii_uppercased : Str -> Str

expect Str.with_ascii_uppercased("café") == "CAFé"

## Returns `True` if all the [ASCII characters](https://en.wikipedia.org/wiki/ASCII) in the string are the same
## when ignoring differences in capitalization.
## Non-ASCII characters must all be exactly the same,
## including capitalization. For example:
##
## ```roc
## expect "café".caseless_ascii_equals("CAFé")
##
## expect !"café".caseless_ascii_equals("CAFÉ")
## ```
##
## The first call returns `True` because all the ASCII characters are the same
## when ignoring differences in capitalization, and the only non-ASCII character
## (`é`) is the same in both strings. The second call returns `False` because
## `é` and `É` are not ASCII characters, and they are different.
##
## This function is useful for things like [command-line options](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option)
## and [environment variables](https://en.wikipedia.org/wiki/Environment_variable)
## where you know in advance that you're dealing with a hardcoded string containing only ASCII characters.
## It has better performance than lowercasing operations which take Unicode into account.
##
## That said, strings received from user input can always contain
## non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works
## differently in different languages. For example, the string `"I"` lowercases to `"i"`
## in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I))
## in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/),
## so we have a separate [`unicode` package](https://github.com/roc-lang/unicode)
## for Unicode capitalization that can be upgraded independently from the language's builtins.
##
## To convert a string's ASCII characters to uppercase or lowercase, use [Str.with_ascii_uppercased]
## and [Str.with_ascii_lowercased].
caseless_ascii_equals : Str, Str -> Bool

expect Str.caseless_ascii_equals("café", "CAFé")
expect !Str.caseless_ascii_equals("café", "CAFÉ")
2 changes: 2 additions & 0 deletions crates/compiler/builtins/src/bitcode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,8 @@ pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
pub const STR_ALLOCATION_PTR: &str = "roc_builtins.str.allocation_ptr";
pub const STR_RELEASE_EXCESS_CAPACITY: &str = "roc_builtins.str.release_excess_capacity";
pub const STR_WITH_ASCII_LOWERCASED: &str = "roc_builtins.str.with_ascii_lowercased";
pub const STR_WITH_ASCII_UPPERCASED: &str = "roc_builtins.str.with_ascii_uppercased";
pub const STR_CASELESS_ASCII_EQUALS: &str = "roc_builtins.str.caseless_ascii_equals";

pub const LIST_MAP: &str = "roc_builtins.list.map";
pub const LIST_MAP2: &str = "roc_builtins.list.map2";
Expand Down
2 changes: 2 additions & 0 deletions crates/compiler/can/src/builtins.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ map_symbol_to_lowlevel_and_arity! {
StrWithCapacity; STR_WITH_CAPACITY; 1,
StrReleaseExcessCapacity; STR_RELEASE_EXCESS_CAPACITY; 1,
StrWithAsciiLowercased; STR_WITH_ASCII_LOWERCASED; 1,
StrWithAsciiUppercased; STR_WITH_ASCII_UPPERCASED; 1,
StrCaselessAsciiEquals; STR_CASELESS_ASCII_EQUALS; 2,

ListLenUsize; LIST_LEN_USIZE; 1,
ListLenU64; LIST_LEN_U64; 1,
Expand Down
14 changes: 14 additions & 0 deletions crates/compiler/gen_dev/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1726,6 +1726,20 @@ trait Backend<'a> {
arg_layouts,
ret_layout,
),
LowLevel::StrWithAsciiUppercased => self.build_fn_call(
sym,
bitcode::STR_WITH_ASCII_UPPERCASED.to_string(),
args,
arg_layouts,
ret_layout,
),
LowLevel::StrCaselessAsciiEquals => self.build_fn_call(
sym,
bitcode::STR_CASELESS_ASCII_EQUALS.to_string(),
args,
arg_layouts,
ret_layout,
),
LowLevel::StrToNum => {
let number_layout = match self.interner().get_repr(*ret_layout) {
LayoutRepr::Struct(field_layouts) => field_layouts[0], // TODO: why is it sometimes a struct?
Expand Down
22 changes: 22 additions & 0 deletions crates/compiler/gen_llvm/src/llvm/lowlevel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,28 @@ pub(crate) fn run_low_level<'a, 'ctx>(
bitcode::STR_WITH_ASCII_LOWERCASED,
)
}
StrWithAsciiUppercased => {
arguments!(string);

call_str_bitcode_fn(
env,
&[string],
&[],
BitcodeReturns::Str,
bitcode::STR_WITH_ASCII_UPPERCASED,
)
}
StrCaselessAsciiEquals => {
arguments!(string1, string2);

call_str_bitcode_fn(
env,
&[string1, string2],
&[],
BitcodeReturns::Basic,
bitcode::STR_CASELESS_ASCII_EQUALS,
)
}
ListConcat => {
debug_assert_eq!(args.len(), 2);

Expand Down
6 changes: 6 additions & 0 deletions crates/compiler/gen_wasm/src/low_level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,12 @@ impl<'a> LowLevelCall<'a> {
StrWithAsciiLowercased => {
self.load_args_and_call_zig(backend, bitcode::STR_WITH_ASCII_LOWERCASED)
}
StrWithAsciiUppercased => {
self.load_args_and_call_zig(backend, bitcode::STR_WITH_ASCII_UPPERCASED)
}
StrCaselessAsciiEquals => {
self.load_args_and_call_zig(backend, bitcode::STR_CASELESS_ASCII_EQUALS)
}

// List
ListLenU64 => {
Expand Down
4 changes: 4 additions & 0 deletions crates/compiler/module/src/low_level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ pub enum LowLevel {
StrWithCapacity,
StrReleaseExcessCapacity,
StrWithAsciiLowercased,
StrWithAsciiUppercased,
StrCaselessAsciiEquals,
ListLenUsize,
ListLenU64,
ListWithCapacity,
Expand Down Expand Up @@ -269,6 +271,8 @@ map_symbol_to_lowlevel! {
StrWithCapacity <= STR_WITH_CAPACITY;
StrReleaseExcessCapacity <= STR_RELEASE_EXCESS_CAPACITY;
StrWithAsciiLowercased <= STR_WITH_ASCII_LOWERCASED;
StrWithAsciiUppercased <= STR_WITH_ASCII_UPPERCASED;
StrCaselessAsciiEquals <= STR_CASELESS_ASCII_EQUALS;
ListLenU64 <= LIST_LEN_U64;
ListLenUsize <= LIST_LEN_USIZE;
ListGetCapacity <= LIST_CAPACITY;
Expand Down
10 changes: 6 additions & 4 deletions crates/compiler/module/src/symbol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1421,10 +1421,12 @@ define_builtins! {
49 STR_DROP_PREFIX: "drop_prefix"
50 STR_DROP_SUFFIX: "drop_suffix"
51 STR_WITH_ASCII_LOWERCASED: "with_ascii_lowercased"
52 STR_FROM_UTF16: "from_utf16"
53 STR_FROM_UTF16_LOSSY: "from_utf16_lossy"
54 STR_FROM_UTF32: "from_utf32"
55 STR_FROM_UTF32_LOSSY: "from_utf32_lossy"
52 STR_WITH_ASCII_UPPERCASED: "with_ascii_uppercased"
53 STR_CASELESS_ASCII_EQUALS: "caseless_ascii_equals"
54 STR_FROM_UTF16: "from_utf16"
55 STR_FROM_UTF16_LOSSY: "from_utf16_lossy"
56 STR_FROM_UTF32: "from_utf32"
57 STR_FROM_UTF32_LOSSY: "from_utf32_lossy"
}
6 LIST: "List" => {
0 LIST_LIST: "List" exposed_apply_type=true // the List.List type alias
Expand Down
2 changes: 2 additions & 0 deletions crates/compiler/mono/src/drop_specialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1550,6 +1550,8 @@ fn low_level_no_rc(lowlevel: &LowLevel) -> RC {
StrJoinWith => RC::NoRc,
ListSortWith => RC::Rc,
StrWithAsciiLowercased => RC::Rc,
StrWithAsciiUppercased => RC::Rc,
StrCaselessAsciiEquals => RC::NoRc,

ListAppendUnsafe
| ListReserve
Expand Down
2 changes: 2 additions & 0 deletions crates/compiler/mono/src/inc_dec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1259,6 +1259,8 @@ pub(crate) fn lowlevel_borrow_signature(op: LowLevel) -> &'static [Ownership] {
ListIncref => &[OWNED],
ListDecref => &[OWNED],
StrWithAsciiLowercased => &[OWNED],
StrWithAsciiUppercased => &[OWNED],
StrCaselessAsciiEquals => &[BORROWED, BORROWED],

Eq | NotEq => &[BORROWED, BORROWED],

Expand Down
24 changes: 24 additions & 0 deletions crates/compiler/solve/tests/solve_expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3838,6 +3838,30 @@ mod solve_expr {
);
}

// Passes the bare `Str.with_ascii_uppercased` expression together with the
// expected type string "Str -> Str" to `infer_eq_without_problem`.
#[test]
fn str_with_ascii_uppercased() {
infer_eq_without_problem(
indoc!(
r"
Str.with_ascii_uppercased
"
),
"Str -> Str",
);
}

// Passes the bare `Str.caseless_ascii_equals` expression together with the
// expected type string "Str, Str -> Bool" to `infer_eq_without_problem`.
#[test]
fn str_caseless_ascii_equals() {
infer_eq_without_problem(
indoc!(
r"
Str.caseless_ascii_equals
"
),
"Str, Str -> Bool",
);
}

#[test]
fn list_take_first() {
infer_eq_without_problem(
Expand Down
Loading

0 comments on commit fc0a9ec

Please sign in to comment.