Remove Str.fromUtf8Range

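Seamless slices make this obsolete! To convert only part of a list, slice it first (e.g. with `List.sublist`) and pass the result to `Str.fromUtf8`.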
This commit is contained in:
Richard Feldman 2024-02-16 20:00:07 -05:00
parent 74e58d3d51
commit 37b154df4f
No known key found for this signature in database
GPG Key ID: F1F21AA5B1D9E43B
15 changed files with 94 additions and 219 deletions

View File

@ -1230,7 +1230,7 @@ fn lowlevel_spec<'a>(
builder.add_make_tuple(block, &[cell, bag])
}
StrFromUtf8Range => {
StrFromUtf8 => {
let list = env.symbols[&arguments[0]];
let cell = builder.add_get_tuple_field(block, list, LIST_CELL_INDEX)?;

View File

@ -196,7 +196,7 @@ comptime {
exportStrFn(str.getUnsafeC, "get_unsafe");
exportStrFn(str.reserveC, "reserve");
exportStrFn(str.strToUtf8C, "to_utf8");
exportStrFn(str.fromUtf8RangeC, "from_utf8_range");
exportStrFn(str.fromUtf8C, "from_utf8");
exportStrFn(str.repeatC, "repeat");
exportStrFn(str.strTrim, "trim");
exportStrFn(str.strTrimStart, "trim_start");

View File

@ -1511,33 +1511,19 @@ const FromUtf8Result = extern struct {
problem_code: Utf8ByteProblem,
};
const CountAndStart = extern struct {
count: usize,
start: usize,
};
pub fn fromUtf8RangeC(
pub fn fromUtf8C(
list: RocList,
start_u64: u64,
count_u64: u64,
update_mode: UpdateMode,
) callconv(.C) FromUtf8Result {
return fromUtf8Range(list, @intCast(start_u64), @intCast(count_u64), update_mode);
return fromUtf8(list, update_mode);
}
test "fromUtf8RangeC(\"hello\", 1, 3)" {
const original_bytes = "hello";
const list = RocList.fromSlice(u8, original_bytes[0..]);
const result = fromUtf8RangeC(list, 1, 3, UpdateMode.Immutable);
try expectEqual(result.is_ok, true);
result.string.decref();
}
pub fn fromUtf8Range(arg: RocList, start: usize, count: usize, update_mode: UpdateMode) FromUtf8Result {
if (arg.len() == 0 or count == 0) {
arg.decref(RocStr.alignment);
pub fn fromUtf8(
list: RocList,
update_mode: UpdateMode,
) FromUtf8Result {
if (list.len() == 0) {
list.decref(1); // Alignment 1 for List U8
return FromUtf8Result{
.is_ok = true,
.string = RocStr.empty(),
@ -1545,11 +1531,11 @@ pub fn fromUtf8Range(arg: RocList, start: usize, count: usize, update_mode: Upda
.problem_code = Utf8ByteProblem.InvalidStartByte,
};
}
const bytes = @as([*]const u8, @ptrCast(arg.bytes))[start .. start + count];
const bytes = @as([*]const u8, @ptrCast(list.bytes))[0..list.len()];
if (isValidUnicode(bytes)) {
// Make a seamless slice of the input.
const string = RocStr.fromSubListUnsafe(arg, start, count, update_mode);
const string = RocStr.fromSubListUnsafe(list, 0, list.len(), update_mode);
return FromUtf8Result{
.is_ok = true,
.string = string,
@ -1557,10 +1543,9 @@ pub fn fromUtf8Range(arg: RocList, start: usize, count: usize, update_mode: Upda
.problem_code = Utf8ByteProblem.InvalidStartByte,
};
} else {
const temp = errorToProblem(@as([*]u8, @ptrCast(arg.bytes)), arg.length);
const temp = errorToProblem(bytes);
// decref the list
arg.decref(RocStr.alignment);
list.decref(1); // Alignment 1 for List U8
return FromUtf8Result{
.is_ok = false,
@ -1571,11 +1556,12 @@ pub fn fromUtf8Range(arg: RocList, start: usize, count: usize, update_mode: Upda
}
}
fn errorToProblem(bytes: [*]u8, length: usize) struct { index: usize, problem: Utf8ByteProblem } {
fn errorToProblem(bytes: []const u8) struct { index: usize, problem: Utf8ByteProblem } {
const len = bytes.len;
var index: usize = 0;
while (index < length) {
const nextNumBytes = numberOfNextCodepointBytes(bytes, length, index) catch |err| {
while (index < len) {
const nextNumBytes = numberOfNextCodepointBytes(bytes, index) catch |err| {
switch (err) {
error.UnexpectedEof => {
return .{ .index = index, .problem = Utf8ByteProblem.UnexpectedEndOfSequence };
@ -1649,13 +1635,13 @@ const Utf8DecodeError = error{
// Essentially unicode.utf8ValidateSlice -> https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L156
// but only for the next codepoint from the index. Then we return the number of bytes of that codepoint.
// TODO: we only ever use the values 0-4, so can we use a smaller int type than `usize`?
pub fn numberOfNextCodepointBytes(ptr: [*]u8, len: usize, index: usize) Utf8DecodeError!usize {
const codepoint_len = try unicode.utf8ByteSequenceLength(ptr[index]);
pub fn numberOfNextCodepointBytes(bytes: []const u8, index: usize) Utf8DecodeError!usize {
const codepoint_len = try unicode.utf8ByteSequenceLength(bytes[index]);
const codepoint_end_index = index + codepoint_len;
if (codepoint_end_index > len) {
if (codepoint_end_index > bytes.len) {
return error.UnexpectedEof;
}
_ = try unicode.utf8Decode(ptr[index..codepoint_end_index]);
_ = try unicode.utf8Decode(bytes[index..codepoint_end_index]);
return codepoint_end_index - index;
}
@ -1671,11 +1657,11 @@ pub const Utf8ByteProblem = enum(u8) {
};
fn validateUtf8Bytes(bytes: [*]u8, length: usize) FromUtf8Result {
return fromUtf8Range(RocList{ .bytes = bytes, .length = length, .capacity_or_alloc_ptr = length }, 0, length, .Immutable);
return fromUtf8(RocList{ .bytes = bytes, .length = length, .capacity_or_alloc_ptr = length }, .Immutable);
}
fn validateUtf8BytesX(str: RocList) FromUtf8Result {
return fromUtf8Range(str, 0, str.len(), .Immutable);
return fromUtf8(str, .Immutable);
}
fn expectOk(result: FromUtf8Result) !void {
@ -1754,7 +1740,7 @@ fn expectErr(list: RocList, index: usize, err: Utf8DecodeError, problem: Utf8Byt
const str_ptr = @as([*]u8, @ptrCast(list.bytes));
const len = list.length;
try expectError(err, numberOfNextCodepointBytes(str_ptr, len, index));
try expectError(err, numberOfNextCodepointBytes(str_ptr[0..len], index));
try expectEqual(toErrUtf8ByteResponse(index, problem), validateUtf8Bytes(str_ptr, len));
}

View File

@ -338,7 +338,6 @@ interface Str
countUtf8Bytes,
toUtf8,
fromUtf8,
fromUtf8Range,
startsWith,
endsWith,
trim,
@ -541,7 +540,7 @@ toUtf8 : Str -> List U8
## ```
fromUtf8 : List U8 -> Result Str [BadUtf8 Utf8ByteProblem U64]
fromUtf8 = \bytes ->
result = fromUtf8RangeLowlevel bytes 0 (List.len bytes)
result = fromUtf8Lowlevel bytes
if result.cIsOk then
Ok result.bString
@ -554,29 +553,6 @@ expect (Str.fromUtf8 [240, 159, 144, 166]) == Ok "🐦"
expect (Str.fromUtf8 []) == Ok ""
expect (Str.fromUtf8 [255]) |> Result.isErr
## Encode part of a [List] of [U8] UTF-8 [code units](https://unicode.org/glossary/#code_unit)
## into a [Str]
## ```
## expect Str.fromUtf8Range [72, 105, 80, 103] { start : 0, count : 2 } == Ok "Hi"
## ```
fromUtf8Range : List U8, { start : U64, count : U64 } -> Result Str [BadUtf8 Utf8ByteProblem U64, OutOfBounds]
fromUtf8Range = \bytes, config ->
if Num.addSaturated config.start config.count <= List.len bytes then
result = fromUtf8RangeLowlevel bytes config.start config.count
if result.cIsOk then
Ok result.bString
else
Err (BadUtf8 result.dProblemCode result.aByteIndex)
else
Err OutOfBounds
expect (Str.fromUtf8Range [72, 105, 80, 103] { start: 0, count: 2 }) == Ok "Hi"
expect (Str.fromUtf8Range [233, 185, 143, 224, 174, 154, 224, 174, 191] { start: 3, count: 3 }) == Ok "ச"
expect (Str.fromUtf8Range [240, 159, 144, 166] { start: 0, count: 4 }) == Ok "🐦"
expect (Str.fromUtf8Range [] { start: 0, count: 0 }) == Ok ""
expect (Str.fromUtf8Range [72, 105, 80, 103] { start: 2, count: 3 }) |> Result.isErr
FromUtf8Result : {
aByteIndex : U64,
bString : Str,
@ -584,7 +560,7 @@ FromUtf8Result : {
dProblemCode : Utf8ByteProblem,
}
fromUtf8RangeLowlevel : List U8, U64, U64 -> FromUtf8Result
fromUtf8Lowlevel : List U8 -> FromUtf8Result
## Check if the given [Str] starts with a value.
## ```

View File

@ -354,7 +354,7 @@ pub const STR_TO_DECIMAL: &str = "roc_builtins.str.to_decimal";
pub const STR_EQUAL: &str = "roc_builtins.str.equal";
pub const STR_SUBSTRING_UNSAFE: &str = "roc_builtins.str.substring_unsafe";
pub const STR_TO_UTF8: &str = "roc_builtins.str.to_utf8";
pub const STR_FROM_UTF8_RANGE: &str = "roc_builtins.str.from_utf8_range";
pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8";
pub const STR_REPEAT: &str = "roc_builtins.str.repeat";
pub const STR_TRIM: &str = "roc_builtins.str.trim";
pub const STR_TRIM_START: &str = "roc_builtins.str.trim_start";

View File

@ -116,7 +116,7 @@ map_symbol_to_lowlevel_and_arity! {
StrEndsWith; STR_ENDS_WITH; 2,
StrSplit; STR_SPLIT; 2,
StrCountUtf8Bytes; STR_COUNT_UTF8_BYTES; 1,
StrFromUtf8Range; STR_FROM_UTF8_RANGE_LOWLEVEL; 3,
StrFromUtf8; STR_FROM_UTF8_LOWLEVEL; 1,
StrToUtf8; STR_TO_UTF8; 1,
StrRepeat; STR_REPEAT; 2,
StrTrim; STR_TRIM; 1,

View File

@ -1623,15 +1623,17 @@ trait Backend<'a> {
arg_layouts,
ret_layout,
),
LowLevel::StrFromUtf8Range => {
LowLevel::StrFromUtf8 => {
let update_mode = self.debug_symbol("update_mode");
// In dev builds, always use UpdateMode::Immutable
self.load_literal_i8(&update_mode, UpdateMode::Immutable as i8);
self.build_fn_call(
sym,
bitcode::STR_FROM_UTF8_RANGE.to_string(),
&[args[0], args[1], args[2], update_mode],
&[arg_layouts[0], arg_layouts[1], arg_layouts[2], Layout::U8],
bitcode::STR_FROM_UTF8.to_string(),
&[args[0], update_mode],
&[arg_layouts[0], Layout::U8],
ret_layout,
)
}

View File

@ -408,7 +408,7 @@ pub(crate) fn run_low_level<'a, 'ctx>(
&bitcode::STR_FROM_FLOAT[float_width],
)
}
StrFromUtf8Range => {
StrFromUtf8 => {
let result_type = env.module.get_struct_type("str.FromUtf8Result").unwrap();
let result_ptr = env
.builder
@ -417,7 +417,7 @@ pub(crate) fn run_low_level<'a, 'ctx>(
use roc_target::Architecture::*;
match env.target_info.architecture {
Aarch32 | X86_32 => {
arguments!(list, start, count);
arguments!(list);
let (a, b) = pass_list_or_string_to_zig_32bit(env, list.into_struct_value());
call_void_bitcode_fn(
@ -426,15 +426,13 @@ pub(crate) fn run_low_level<'a, 'ctx>(
result_ptr.into(),
a.into(),
b.into(),
start,
count,
pass_update_mode(env, update_mode),
],
bitcode::STR_FROM_UTF8_RANGE,
bitcode::STR_FROM_UTF8,
);
}
Aarch64 | X86_64 | Wasm32 => {
arguments!(_list, start, count);
arguments!(_list);
// we use the symbol here instead
let list = args[0];
@ -444,11 +442,9 @@ pub(crate) fn run_low_level<'a, 'ctx>(
&[
result_ptr.into(),
list_symbol_to_c_abi(env, scope, list).into(),
start,
count,
pass_update_mode(env, update_mode),
],
bitcode::STR_FROM_UTF8_RANGE,
bitcode::STR_FROM_UTF8,
);
}
}

View File

@ -225,15 +225,13 @@ impl<'a> LowLevelCall<'a> {
}
StrFromInt => self.num_to_str(backend),
StrFromFloat => self.num_to_str(backend),
StrFromUtf8Range => {
StrFromUtf8 => {
/*
Low-level op returns a struct with all the data for both Ok and Err.
Roc AST wrapper converts this to a tag union, with app-dependent tag IDs.
output: *FromUtf8Result i32
arg: RocList i32
start i64
count i64
update_mode: UpdateMode i32
*/
@ -245,7 +243,7 @@ impl<'a> LowLevelCall<'a> {
&WasmLayout::new(backend.layout_interner, self.ret_layout),
);
backend.code_builder.i32_const(UPDATE_MODE_IMMUTABLE);
backend.call_host_fn_after_loading_args(bitcode::STR_FROM_UTF8_RANGE);
backend.call_host_fn_after_loading_args(bitcode::STR_FROM_UTF8);
}
StrTrimStart => self.load_args_and_call_zig(backend, bitcode::STR_TRIM_START),
StrTrimEnd => self.load_args_and_call_zig(backend, bitcode::STR_TRIM_END),

View File

@ -13,7 +13,7 @@ pub enum LowLevel {
StrSplit,
StrCountUtf8Bytes,
StrFromInt,
StrFromUtf8Range,
StrFromUtf8,
StrToUtf8,
StrRepeat,
StrFromFloat,
@ -257,7 +257,7 @@ map_symbol_to_lowlevel! {
StrEndsWith <= STR_ENDS_WITH;
StrSplit <= STR_SPLIT;
StrCountUtf8Bytes <= STR_COUNT_UTF8_BYTES;
StrFromUtf8Range <= STR_FROM_UTF8_RANGE_LOWLEVEL;
StrFromUtf8 <= STR_FROM_UTF8_LOWLEVEL;
StrToUtf8 <= STR_TO_UTF8;
StrRepeat <= STR_REPEAT;
StrTrim <= STR_TRIM;

View File

@ -1328,7 +1328,7 @@ define_builtins! {
40 STR_WALK_UTF8_WITH_INDEX: "walkUtf8WithIndex"
41 STR_RESERVE: "reserve"
42 STR_TO_NUM: "strToNum"
43 STR_FROM_UTF8_RANGE_LOWLEVEL: "fromUtf8RangeLowlevel"
43 STR_FROM_UTF8_LOWLEVEL: "fromUtf8Lowlevel"
44 STR_CAPACITY: "capacity"
45 STR_REPLACE_EACH: "replaceEach"
46 STR_REPLACE_FIRST: "replaceFirst"

View File

@ -1596,7 +1596,7 @@ fn low_level_no_rc(lowlevel: &LowLevel) -> RC {
I128OfDec => RC::NoRc,
DictPseudoSeed => RC::NoRc,
StrStartsWith | StrEndsWith => RC::NoRc,
StrFromUtf8Range => RC::Rc,
StrFromUtf8 => RC::Rc,
StrToUtf8 => RC::Rc,
StrRepeat => RC::NoRc,
StrFromInt | StrFromFloat => RC::NoRc,

View File

@ -1350,7 +1350,7 @@ fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[Ownership] {
| NumCountOneBits
| I128OfDec => arena.alloc_slice_copy(&[irrelevant]),
StrStartsWith | StrEndsWith => arena.alloc_slice_copy(&[borrowed, borrowed]),
StrFromUtf8Range => arena.alloc_slice_copy(&[owned, irrelevant, irrelevant]),
StrFromUtf8 => arena.alloc_slice_copy(&[owned]),
StrToUtf8 => arena.alloc_slice_copy(&[owned]),
StrRepeat => arena.alloc_slice_copy(&[borrowed, irrelevant]),
StrFromInt | StrFromFloat => arena.alloc_slice_copy(&[irrelevant]),

View File

@ -907,12 +907,14 @@ fn str_to_utf8() {
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_range() {
fn str_from_utf8() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { count: 5, start: 0 } is
bytes =
Str.toUtf8 "hello"
when Str.fromUtf8 bytes is
Ok utf8String -> utf8String
_ -> ""
"#
@ -924,12 +926,15 @@ fn str_from_utf8_range() {
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_range_slice() {
fn str_from_utf8_slice() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { count: 4, start: 1 } is
bytes =
Str.toUtf8 "hello"
|> List.sublist { start: 1, len: 4 }
when Str.fromUtf8 bytes is
Ok utf8String -> utf8String
_ -> ""
"#
@ -941,12 +946,15 @@ fn str_from_utf8_range_slice() {
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_range_slice_not_end() {
fn str_from_utf8_slice_not_end() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { count: 3, start: 1 } is
bytes =
Str.toUtf8 "hello"
|> List.sublist { start: 1, len: 3 }
when Str.fromUtf8 bytes is
Ok utf8String -> utf8String
_ -> ""
"#
@ -958,14 +966,17 @@ fn str_from_utf8_range_slice_not_end() {
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_range_order_does_not_matter() {
fn str_from_utf8_order_does_not_matter() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { start: 1, count: 3 } is
bytes =
Str.toUtf8 "hello"
|> List.sublist { start: 1, len: 3 }
when Str.fromUtf8 bytes is
Ok utf8String -> utf8String
Err _ -> "Str.fromUtf8Range returned Err instead of Ok!"
Err _ -> "Str.fromUtf8 returned Err instead of Ok!"
"#
),
RocStr::from("ell"),
@ -973,60 +984,6 @@ fn str_from_utf8_range_order_does_not_matter() {
);
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_range_out_of_bounds_start_value() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { start: 7, count: 3 } is
Ok _ -> ""
Err (BadUtf8 _ _) -> ""
Err OutOfBounds -> "out of bounds"
"#
),
RocStr::from("out of bounds"),
RocStr
);
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_range_count_too_high() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { start: 0, count: 6 } is
Ok _ -> ""
Err (BadUtf8 _ _) -> ""
Err OutOfBounds -> "out of bounds"
"#
),
RocStr::from("out of bounds"),
RocStr
);
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_range_count_too_high_for_start() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { start: 4, count: 3 } is
Ok _ -> ""
Err (BadUtf8 _ _) -> ""
Err OutOfBounds -> "out of bounds"
"#
),
RocStr::from("out of bounds"),
RocStr
);
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_repeat_small_stays_small() {

View File

@ -683,12 +683,14 @@ fn str_to_utf8() {
}
#[test]
fn str_from_utf8_range() {
fn str_from_utf8() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { count: 5, start: 0 } is
bytes =
Str.toUtf8 "hello"
when Str.fromUtf8 bytes is
Ok utf8String -> utf8String
_ -> ""
"#
@ -699,12 +701,15 @@ fn str_from_utf8_range() {
}
#[test]
fn str_from_utf8_range_slice() {
fn str_from_utf8_slice() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { count: 4, start: 1 } is
bytes =
Str.toUtf8 "hello"
|> List.sublist { start: 1, len: 4 }
when Str.fromUtf8 bytes is
Ok utf8String -> utf8String
_ -> ""
"#
@ -715,12 +720,15 @@ fn str_from_utf8_range_slice() {
}
#[test]
fn str_from_utf8_range_slice_not_end() {
fn str_from_utf8_slice_not_end() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { count: 3, start: 1 } is
bytes =
Str.toUtf8 "hello"
|> List.sublist { start: 1, len: 3 }
when Str.fromUtf8 bytes is
Ok utf8String -> utf8String
_ -> ""
"#
@ -731,12 +739,15 @@ fn str_from_utf8_range_slice_not_end() {
}
#[test]
fn str_from_utf8_range_order_does_not_matter() {
fn str_from_utf8_order_does_not_matter() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { start: 1, count: 3 } is
bytes =
Str.toUtf8 "hello"
|> List.sublist { start: 1, len: 3 }
when Str.fromUtf8 bytes is
Ok utf8String -> utf8String
_ -> ""
"#
@ -746,57 +757,6 @@ fn str_from_utf8_range_order_does_not_matter() {
);
}
#[test]
fn str_from_utf8_range_out_of_bounds_start_value() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { start: 7, count: 3 } is
Ok _ -> ""
Err (BadUtf8 _ _) -> ""
Err OutOfBounds -> "out of bounds"
"#
),
RocStr::from("out of bounds"),
RocStr
);
}
#[test]
fn str_from_utf8_range_count_too_high() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { start: 0, count: 6 } is
Ok _ -> ""
Err (BadUtf8 _ _) -> ""
Err OutOfBounds -> "out of bounds"
"#
),
RocStr::from("out of bounds"),
RocStr
);
}
#[test]
fn str_from_utf8_range_count_too_high_for_start() {
assert_evals_to!(
indoc!(
r#"
bytes = Str.toUtf8 "hello"
when Str.fromUtf8Range bytes { start: 4, count: 3 } is
Ok _ -> ""
Err (BadUtf8 _ _) -> ""
Err OutOfBounds -> "out of bounds"
"#
),
RocStr::from("out of bounds"),
RocStr
);
}
#[test]
fn str_repeat_small() {
assert_evals_to!(