Data analysts should be able to use Text.location_of to find indexes within string using various matchers (#3324)

Implements https://www.pivotaltracker.com/n/projects/2539304/stories/181266029
This commit is contained in:
Radosław Waśko 2022-03-12 20:42:00 +01:00 committed by GitHub
parent 3ef18ab5b8
commit 247b284316
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 1237 additions and 110 deletions

View File

@ -63,6 +63,7 @@
- [Implemented `Bool.compare_to` method][3317] - [Implemented `Bool.compare_to` method][3317]
- [Implemented `Map.first`, `Map.last` functions. Expanded `Table.group_by` to - [Implemented `Map.first`, `Map.last` functions. Expanded `Table.group_by` to
also compute mode, percentile, minimum, maximum.][3318] also compute mode, percentile, minimum, maximum.][3318]
- [Implemented `Text.location_of` and `Text.location_of_all` methods.][3324]
[debug-shortcuts]: [debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -100,7 +101,8 @@
[3236]: https://github.com/enso-org/enso/pull/3236 [3236]: https://github.com/enso-org/enso/pull/3236
[3311]: https://github.com/enso-org/enso/pull/3311 [3311]: https://github.com/enso-org/enso/pull/3311
[3317]: https://github.com/enso-org/enso/pull/3317 [3317]: https://github.com/enso-org/enso/pull/3317
[3317]: https://github.com/enso-org/enso/pull/3318 [3318]: https://github.com/enso-org/enso/pull/3318
[3324]: https://github.com/enso-org/enso/pull/3324
#### Enso Compiler #### Enso Compiler

View File

@ -5,9 +5,11 @@ from Standard.Builtins import Text, Prim_Text_Helpers
import Standard.Base.Data.Text.Regex import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Matching_Mode
import Standard.Base.Data.Text.Case import Standard.Base.Data.Text.Case
import Standard.Base.Data.Text.Location import Standard.Base.Data.Text.Location
import Standard.Base.Data.Text.Line_Ending_Style import Standard.Base.Data.Text.Line_Ending_Style
from Standard.Base.Data.Text.Span as Span_Module import Span
import Standard.Base.Data.Text.Split_Kind import Standard.Base.Data.Text.Split_Kind
import Standard.Base.Data.Text.Text_Sub_Range import Standard.Base.Data.Text.Text_Sub_Range
import Standard.Base.Data.Locale import Standard.Base.Data.Locale
@ -15,6 +17,7 @@ import Standard.Base.Meta
from Standard.Builtins export Text from Standard.Builtins export Text
export Standard.Base.Data.Text.Matching_Mode
export Standard.Base.Data.Text.Case export Standard.Base.Data.Text.Case
export Standard.Base.Data.Text.Location export Standard.Base.Data.Text.Location
export Standard.Base.Data.Text.Split_Kind export Standard.Base.Data.Text.Split_Kind
@ -546,7 +549,7 @@ Text.== that = if Meta.is_same_object this Text then Meta.is_same_object that Te
(('É' . equals_ignore_case 'é') && ('é' . equals_ignore_case 'e\u0301')) == True (('É' . equals_ignore_case 'é') && ('é' . equals_ignore_case 'e\u0301')) == True
Text.equals_ignore_case : Text -> Locale -> Boolean Text.equals_ignore_case : Text -> Locale -> Boolean
Text.equals_ignore_case that locale=Locale.default = Text.equals_ignore_case that locale=Locale.default =
(this.to_case_insensitive_key locale) == (that.to_case_insensitive_key locale) Text_Utils.equals_ignore_case this that locale.java_locale
## ADVANCED ## ADVANCED
PRIVATE PRIVATE
@ -555,7 +558,7 @@ Text.equals_ignore_case that locale=Locale.default =
used to perform case-insensitive comparisons. used to perform case-insensitive comparisons.
Text.to_case_insensitive_key : Locale -> Text Text.to_case_insensitive_key : Locale -> Text
Text.to_case_insensitive_key locale=Locale.default = Text.to_case_insensitive_key locale=Locale.default =
this.to_case Case.Lower locale . to_case Case.Upper locale Text_Utils.case_insensitive_key this locale.java_locale
## Compare two texts to discover their ordering. ## Compare two texts to discover their ordering.
@ -895,7 +898,7 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
Text_Matcher case_sensitivity -> case case_sensitivity of Text_Matcher case_sensitivity -> case case_sensitivity of
True -> Text_Utils.contains this term True -> Text_Utils.contains this term
Case_Insensitive locale -> Case_Insensitive locale ->
Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale) Text_Utils.contains_case_insensitive this term locale.java_locale
Regex_Matcher _ _ _ _ _ -> Regex_Matcher _ _ _ _ _ ->
compiled_pattern = matcher.compile term compiled_pattern = matcher.compile term
match = compiled_pattern.match this Mode.First match = compiled_pattern.match this Mode.First
@ -952,27 +955,6 @@ Text.repeat count=1 =
https://www.pivotaltracker.com/story/show/181435598 https://www.pivotaltracker.com/story/show/181435598
0.up_to (count.max 0) . fold "" acc-> _-> acc + this 0.up_to (count.max 0) . fold "" acc-> _-> acc + this
## PRIVATE
Utility function taking a range pointing at grapheme clusters and converting to a range on the underlying code points
range_to_char_indices : Text -> Range -> Range ! Index_Out_Of_Bounds_Error
range_to_char_indices text range =
len = text.length
start = if range.start < 0 then range.start + len else range.start
end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end)
is_valid = (Range 0 len+1).contains
case (Pair (is_valid start) (is_valid end)) of
Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len)
Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len)
Pair True True ->
if start>=end then (Range 0 0) else
iterator = BreakIterator.getCharacterInstance
iterator.setText text
start_index = iterator.next start
end_index = iterator.next (end - start)
Range start_index end_index
## ALIAS first, last, left, right, mid, substring ## ALIAS first, last, left, right, mid, substring
Creates a new Text by selecting the specified range of the input. Creates a new Text by selecting the specified range of the input.
@ -1009,7 +991,7 @@ range_to_char_indices text range =
Text.take : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error Text.take : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
Text.take range = Text.take range =
char_range = case range of char_range = case range of
Range _ _ -> here.range_to_char_indices this range Range _ _ -> Span_Module.range_to_char_indices this range
_ -> range.to_char_range this _ -> range.to_char_range this
Text_Utils.substring this char_range.start char_range.end Text_Utils.substring this char_range.start char_range.end
@ -1049,7 +1031,7 @@ Text.take range =
Text.drop : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error Text.drop : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
Text.drop range = Text.drop range =
char_range = case range of char_range = case range of
Range _ _ -> here.range_to_char_indices this range Range _ _ -> Span_Module.range_to_char_indices this range
_ -> range.to_char_range this _ -> range.to_char_range this
if char_range.start == 0 then Text_Utils.drop_first this char_range.end else if char_range.start == 0 then Text_Utils.drop_first this char_range.end else
prefix = Text_Utils.substring this 0 char_range.start prefix = Text_Utils.substring this 0 char_range.start
@ -1184,3 +1166,204 @@ Text.trim where=Location.Both what=_.is_whitespace =
loop current break_iterator.previous loop current break_iterator.previous
if start_index >= end_index then "" else if start_index >= end_index then "" else
Text_Utils.substring this start_index end_index Text_Utils.substring this start_index end_index
## ALIAS find, index_of, position_of, span_of
Find the location of the `term` in the input.
Returns a Span representing the location at which the term was found, or
`Nothing` if the term was not found in the input.
Arguments:
- term: The term to find.
- mode: Specifies if the first or last occurrence of the term should be
returned if there are multiple occurrences within the input. The first
occurrence is returned by default.
- matcher: Specifies how the term is matched against the input:
- If a `Text_Matcher`, the text is compared using case-sensitivity rules
specified in the matcher.
- If a `Regex_Matcher`, the `term` is used as a regular expression and
matched using the associated options.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
> Example
Finding location of a substring.
"Hello World!".location_of "J" == Nothing
"Hello World!".location_of "o" == Span (Range 4 5) "Hello World!"
"Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 7 8) "Hello World!"
! Match Length
The function returns not only the index of the match but a `Span` instance
which contains both the start and end indices, allowing to determine the
length of the match. This is useful not only with regex matches (where a
regular expression can have matches of various lengths) but also for case
insensitive matching. In case insensitive mode, a single character can
match multiple characters, for example `ß` will match `ss` and `SS`, and
the ligature `ﬃ` will match `ffi` or `f` etc. Thus in case insensitive
mode, the length of the match can be shorter or longer than the term that
was being matched, so it is extremely important to not rely on the length
of the matched term when analysing the matches as they may have different
lengths.
> Example
Match length differences in case insensitive matching.
term = "straße"
text = "MONUMENTENSTRASSE 42"
match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new)
term.length == 6
match.length == 7
! Matching Grapheme Clusters
In case insensitive mode, a single character can match multiple characters,
for example `ß` will match `ss` and `SS`, and the ligature `ﬃ` will match
`ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to
match only a part of some single grapheme cluster, for example in the text
`ﬃa` the term `ia` will match just one-third of the first grapheme `ﬃ`.
Since we do not have the resolution to distinguish such partial matches
(as that would require non-integer indices), so a match which matched just
a part of some grapheme cluster is extended and treated as if it matched
the whole grapheme cluster.
> Example
Extending matches to full grapheme clusters.
ligatures = "ﬃﬄ"
ligatures.length == 2
term_1 = "IFF"
match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new)
term_1.length == 3
match_1.length == 2
term_2 = "ffiffl"
match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new)
term_2.length == 6
match_2.length == 2
# After being extended to full grapheme clusters, both terms "IFF" and "ffiffl" match the same span of grapheme clusters.
match_1 == match_2
Text.location_of : Text -> (Matching_Mode.First | Matching_Mode.Last) -> Matcher -> Span | Nothing
Text.location_of term="" mode=Matching_Mode.First matcher=Text_Matcher.new = case matcher of
Text_Matcher case_sensitive -> case case_sensitive of
True ->
# Locate the match in UTF-16 code-unit space via the Java helper, then
# translate its start index back into grapheme-cluster space.
codepoint_span = case mode of
Matching_Mode.First -> Text_Utils.span_of this term
Matching_Mode.Last -> Text_Utils.last_span_of this term
if codepoint_span.is_nothing then Nothing else
start = Text_Utils.utf16_index_to_grapheme_index this codepoint_span.start
## While the codepoint_span may have different code unit length
from our term, the `length` counted in grapheme clusters is
guaranteed to be the same.
end = start + term.length
Span (Range start end) this
Case_Insensitive locale -> case term.is_empty of
# An empty term trivially matches at the very start of the text in
# `First` mode and at its very end in `Last` mode.
True -> case mode of
Matching_Mode.First -> Span (Range 0 0) this
Matching_Mode.Last ->
end = this.length
Span (Range end end) this
False ->
search_for_last = case mode of
Matching_Mode.First -> False
Matching_Mode.Last -> True
# NOTE(review): the result is used directly as grapheme indices, so the
# Java helper presumably returns a grapheme-space span — confirm in Text_Utils.
case Text_Utils.span_of_case_insensitive this term locale.java_locale search_for_last of
Nothing -> Nothing
grapheme_span ->
Span (Range grapheme_span.start grapheme_span.end) this
Regex_Matcher _ _ _ _ _ -> case mode of
Matching_Mode.First ->
case matcher.compile term . match this Mode.First of
Nothing -> Nothing
match -> match.span 0 . to_grapheme_span
Matching_Mode.Last ->
# The regex engine only searches forwards, so for `Last` all matches
# are enumerated and the final one is taken.
case matcher.compile term . match this Mode.All of
Nothing -> Nothing
matches -> matches.last.span 0 . to_grapheme_span
## ALIAS find_all, index_of_all, position_of_all, span_of_all
Finds all the locations of the `term` in the input.
If not found, the function returns an empty Vector.
Arguments:
- term: The term to find.
- matcher: Specifies how the term is matched against the input:
- If a `Text_Matcher`, the text is compared using case-sensitivity rules
specified in the matcher.
- If a `Regex_Matcher`, the `term` is used as a regular expression and
matched using the associated options.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
> Example
Finding locations of all occurrences of a substring.
"Hello World!".location_of_all "J" == []
"Hello World!".location_of_all "o" . map .start == [4, 7]
! Match Length
The function returns not only the index of the match but a `Span` instance
which contains both the start and end indices, allowing to determine the
length of the match. This is useful not only with regex matches (where a
regular expression can have matches of various lengths) but also for case
insensitive matching. In case insensitive mode, a single character can
match multiple characters, for example `ß` will match `ss` and `SS`, and
the ligature `ﬃ` will match `ffi` or `f` etc. Thus in case insensitive
mode, the length of the match can be shorter or longer than the term that
was being matched, so it is extremely important to not rely on the length
of the matched term when analysing the matches as they may have different
lengths.
> Example
Match length differences in case insensitive matching.
term = "strasse"
text = "MONUMENTENSTRASSE ist eine große Straße."
match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new)
term.length == 7
match . map .length == [7, 6]
! Matching Grapheme Clusters
In case insensitive mode, a single character can match multiple characters,
for example `ß` will match `ss` and `SS`, and the ligature `ﬃ` will match
`ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to
match only a part of some single grapheme cluster, for example in the text
`ﬃa` the term `ia` will match just one-third of the first grapheme `ﬃ`.
Since we do not have the resolution to distinguish such partial matches
(as that would require non-integer indices), so a match which matched just
a part of some grapheme cluster is extended and treated as if it matched
the whole grapheme cluster.
> Example
Extending matches to full grapheme clusters.
ligatures = "ﬃﬄFFIFF"
ligatures.length == 7
match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new)
match_1 . map .length == [2, 3]
match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new)
match_2 . map .length == [2, 5]
Text.location_of_all : Text -> Matcher -> [Span]
Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
# An empty term matches at every position, including one past the last grapheme.
Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of
True ->
# Find all matches in UTF-16 code-unit space, then batch-convert the
# start indices back into grapheme-cluster space in a single call.
codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term
grapheme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
## While the codepoint_spans may have different code unit lengths
from our term, the `length` counted in grapheme clusters is
guaranteed to be the same.
offset = term.length
grapheme_ixes . map start->
end = start+offset
Span (Range start end) this
Case_Insensitive locale ->
grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
grapheme_spans.map grapheme_span->
Span (Range grapheme_span.start grapheme_span.end) this
Regex_Matcher _ _ _ _ _ ->
case matcher.compile term . match this Mode.All of
Nothing -> []
matches -> matches.map m-> m.span 0 . to_grapheme_span

View File

@ -0,0 +1,5 @@
## Matches the first (left-most) found instance.
type First
## Matches the last (right-most) found instance.
type Last

View File

@ -40,7 +40,7 @@ import Standard.Base.Data.Text.Regex.Engine
import Standard.Base.Data.Text.Regex.Option as Global_Option import Standard.Base.Data.Text.Regex.Option as Global_Option
import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Polyglot.Java as Java_Ext import Standard.Base.Polyglot.Java as Java_Ext
import Standard.Base.Data.Text.Span from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span
from Standard.Builtins import Java from Standard.Builtins import Java
@ -183,8 +183,13 @@ type Pattern
on the encoding, we normalize all input. on the encoding, we normalize all input.
build_matcher : Text -> Integer -> Integer -> Java_Matcher build_matcher : Text -> Integer -> Integer -> Java_Matcher
build_matcher input start end = build_matcher input start end =
normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else ## TODO [RW] Normalization had to be disabled - since start and end are
Text_Utils.normalize input in code unit space, normalization could shift these indices!
This should be addressed when reviewing
See: https://www.pivotaltracker.com/story/show/181524498
#normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else
# Text_Utils.normalize input
normalized_input = input
internal_matcher = this.internal_pattern.matcher normalized_input . region start end internal_matcher = this.internal_pattern.matcher normalized_input . region start end
if this.options.contains No_Anchoring_Bounds then if this.options.contains No_Anchoring_Bounds then
@ -262,7 +267,7 @@ type Pattern
internal_matcher = this.build_matcher input start end internal_matcher = this.build_matcher input start end
if internal_matcher . find start . not then Nothing else if internal_matcher . find start . not then Nothing else
Match internal_matcher start end Match internal_matcher start end input
Integer -> Integer ->
if mode < 0 then Panic.throw <| if mode < 0 then Panic.throw <|
Mode_Error "Cannot match a negative number of times." Mode_Error "Cannot match a negative number of times."
@ -272,13 +277,16 @@ type Pattern
go : Integer -> Integer -> Nothing go : Integer -> Integer -> Nothing
go offset remaining_count = go offset remaining_count =
should_continue = remaining_count > 0 should_continue = remaining_count > 0
if should_continue.not || (offset > end) then Nothing else if should_continue.not || (offset >= end) then Nothing else
internal_matcher = this.build_matcher input start end internal_matcher = this.build_matcher input start end
found = internal_matcher.find offset found = internal_matcher.find offset
if found.not then Nothing else if found.not then Nothing else
builder.append (Match internal_matcher start end) builder.append (Match internal_matcher start end input)
@Tail_Call go (internal_matcher.end 0) remaining_count-1 match_end = internal_matcher.end 0
# Ensure progress even if the match is an empty string.
new_offset = if match_end > offset then match_end else offset+1
@Tail_Call go new_offset remaining_count-1
go start mode go start mode
vector = builder.to_vector vector = builder.to_vector
@ -294,8 +302,11 @@ type Pattern
found = internal_matcher.find offset found = internal_matcher.find offset
if found.not then Nothing else if found.not then Nothing else
builder.append (Match internal_matcher start end) builder.append (Match internal_matcher start end input)
@Tail_Call go (internal_matcher.end 0) match_end = internal_matcher.end 0
# Ensure progress even if the match is an empty string.
new_offset = if match_end > offset then match_end else offset+1
@Tail_Call go new_offset
go start go start
vector = builder.to_vector vector = builder.to_vector
@ -304,7 +315,7 @@ type Pattern
Mode.Full -> Mode.Full ->
internal_matcher = this.build_matcher input start end internal_matcher = this.build_matcher input start end
if internal_matcher.matches.not then Nothing else if internal_matcher.matches.not then Nothing else
Match internal_matcher start end Match internal_matcher start end input
Mode.Bounded _ _ _ -> Panic.throw <| Mode.Bounded _ _ _ -> Panic.throw <|
Mode_Error "Modes cannot be recursive." Mode_Error "Modes cannot be recursive."
@ -312,7 +323,7 @@ type Pattern
Mode.Bounded start end sub_mode -> Mode.Bounded start end sub_mode ->
if start < end then do_match_mode sub_mode start end else if start < end then do_match_mode sub_mode start end else
Panic.throw Invalid_Bounds_Error Panic.throw Invalid_Bounds_Error
_ -> do_match_mode mode 0 input.length _ -> do_match_mode mode 0 (Text_Utils.char_length input)
## ADVANCED ## ADVANCED
@ -334,7 +345,7 @@ type Pattern
pattern.matches input pattern.matches input
matches : Text -> Boolean matches : Text -> Boolean
matches input = case this.match input mode=Mode.Full of matches input = case this.match input mode=Mode.Full of
Match _ _ _ -> True Match _ _ _ _ -> True
Vector.Vector _ -> True Vector.Vector _ -> True
_ -> False _ -> False
@ -405,7 +416,7 @@ type Pattern
find input mode=Mode.All = find input mode=Mode.All =
matches = this.match input mode matches = this.match input mode
case matches of case matches of
Match _ _ _ -> matches.group 0 Match _ _ _ _ -> matches.group 0
Vector.Vector _ -> matches.map (_.group 0) Vector.Vector _ -> matches.map (_.group 0)
_ -> matches _ -> matches
@ -548,7 +559,7 @@ type Pattern
internal_matcher.replaceAll replacement internal_matcher.replaceAll replacement
Mode.Full -> Mode.Full ->
case this.match input mode=Mode.Full of case this.match input mode=Mode.Full of
Match _ _ _ -> replacement Match _ _ _ _ -> replacement
Nothing -> input Nothing -> input
Mode.Bounded _ _ _ -> Panic.throw <| Mode.Bounded _ _ _ -> Panic.throw <|
Mode_Error "Modes cannot be recursive." Mode_Error "Modes cannot be recursive."
@ -556,7 +567,7 @@ type Pattern
case mode of case mode of
Mode.Bounded _ _ _ -> Panic.throw <| Mode.Bounded _ _ _ -> Panic.throw <|
Mode_Error "Bounded replacements are not well-formed." Mode_Error "Bounded replacements are not well-formed."
_ -> do_replace_mode mode 0 input.length _ -> do_replace_mode mode 0 (Text_Utils.char_length input)
## The default implementation of the `Data.Text.Regex.Engine.Match` interface. ## The default implementation of the `Data.Text.Regex.Engine.Match` interface.
type Match type Match
@ -570,7 +581,8 @@ type Match
match. match.
- region_start: The start of the region over which the match was made. - region_start: The start of the region over which the match was made.
- region_end: The end of the region over which the match was made. - region_end: The end of the region over which the match was made.
type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) - input: The input text that was being matched.
type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) (input : Text)
## Gets the text matched by the group with the provided identifier, or ## Gets the text matched by the group with the provided identifier, or
`Nothing` if the group did not participate in the match. If no such group `Nothing` if the group did not participate in the match. If no such group
@ -743,10 +755,10 @@ type Match
example_Span = example_Span =
match = Examples.match match = Examples.match
match.span 0 match.span 0
span : Integer | Text -> Span | Nothing ! Regex.No_Such_Group_Error span : Integer | Text -> Utf_16_Span | Nothing ! Regex.No_Such_Group_Error
span id = case this.group id of span id = case this.group id of
Nothing -> Nothing Nothing -> Nothing
_ -> Span.new (this.start id) (this.end id) (this.group 0) _ -> Utf_16_Span (Range (this.start id) (this.end id)) this.input
## Returns the start character index of the match's region. ## Returns the start character index of the match's region.

View File

@ -4,11 +4,13 @@
to matching on the `Full` content of the input text. to matching on the `Full` content of the input text.
from Standard.Base import all from Standard.Base import all
from Standard.Base.Data.Text.Matching_Mode import First
from Standard.Base.Data.Text.Matching_Mode export First
type Mode type Mode
## The regex will only match the first instance it finds. ## The regex will only match the first instance it finds.
type First First
## The regex will match up to some `Integer` number of instances. ## The regex will match up to some `Integer` number of instances.
Integer Integer

View File

@ -7,30 +7,14 @@
example_span = example_span =
text = "Hello!" text = "Hello!"
Span.new 0 3 text Span 0 3 text
from Standard.Base import all from Standard.Base import all
import Standard.Base.Data.Range from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error
## Construct a new `Span`. polyglot java import org.enso.base.Text_Utils
polyglot java import com.ibm.icu.text.BreakIterator
Arguments:
- start: The index of the first character included in the span.
- end: The index of the first character after `start` that is _not_ included
in the span.
- text: The `Text` over which the span exists. This is _optional_.
> Example
Creating a span over the first three characters of the text "hello!".
import Standard.Base.Data.Text.Span
example_span =
text = "Hello!"
Span.new 0 3 text
new : Integer -> Integer -> Text | Nothing -> Span
new start end text=Nothing = Span (start.up_to end) text
type Span type Span
@ -38,7 +22,7 @@ type Span
Arguments: Arguments:
- range: The range of characters over which the span exists. - range: The range of characters over which the span exists.
- text: The text over which the span exists. This is _optional_. - text: The text over which the span exists.
! What is a Character? ! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode A character is defined as an Extended Grapheme Cluster, see Unicode
@ -54,7 +38,7 @@ type Span
text = "Hello!" text = "Hello!"
range = 0.up_to 3 range = 0.up_to 3
Span.Span range text Span.Span range text
type Span (range : Range.Range) (text : (Text | Nothing) = Nothing) type Span (range : Range.Range) (text : Text)
## The index of the first character included in the span. ## The index of the first character included in the span.
@ -74,3 +58,112 @@ type Span
meaning in most text-processing applications. meaning in most text-processing applications.
end : Integer end : Integer
end = this.range.end end = this.range.end
## The length of the span in extended grapheme clusters.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
length : Integer
# Delegates to the length of the underlying grapheme-index `Range`.
length = this.range.length
## Converts the span of extended grapheme clusters to a corresponding span
of UTF-16 code units.
> Example
Find the span of code units corresponding to the span of extended grapheme clusters.
text = 'ae\u{301}fz'
(Span (Range 1 3) text).to_utf_16_span == (Utf_16_Span (Range 1 4) text)
to_utf_16_span : Utf_16_Span
to_utf_16_span =
# The grapheme-to-code-unit index conversion is delegated to the
# module-level `range_to_char_indices` helper.
Utf_16_Span (here.range_to_char_indices this.text this.range) this.text
type Utf_16_Span
## A representation of a span of UTF-16 code units in Enso's `Text` type.
Arguments:
- range: The range of code units over which the span exists.
- text: The text over which the span exists.
> Example
Creating a span over the first three code units of the text 'a\u{301}bc'.
import Standard.Base.Data.Text.Span
example_span =
text = 'a\u{301}bc'
Span.Utf_16_Span (Range 0 3) text
type Utf_16_Span (range : Range.Range) (text : Text)
## The index of the first code unit included in the span.
start : Integer
start = this.range.start
## The index of the first code unit after `start` that is _not_ included in
the span.
end : Integer
end = this.range.end
## The length of the span in UTF-16 code units.
length : Integer
length = this.range.length
## Returns a span of extended grapheme clusters which is the closest
approximation of this span of code units.
The resulting span is extended in such a way that every code unit that
was contained by the original span is also contained in a new span. Since
some grapheme clusters consist of multiple code units, after the span was
extended it may also contain code units which were not contained inside
of the original span.
> Example
Convert a codepoint span to graphemes and back.
text = 'a\u{301}e\u{302}o\u{303}'
span = Utf_16_Span (Range 1 5) text # The span contains the units [\u{301}, e, \u{302}, o].
extended = span.to_grapheme_span
extended == Span (Range 0 3) text # The span is extended to the whole string since it contained code units from every grapheme cluster.
extended.to_utf_16_span == Utf_16_Span (Range 0 6) text
to_grapheme_span : Span
# First validate that the span's indices lie within the associated text and
# are properly ordered; out-of-range or reversed spans are reported as errors.
to_grapheme_span = if (this.start < 0) || (this.end > Text_Utils.char_length this.text) then Error.throw (Illegal_State_Error "Utf_16_Span indices are out of range of the associated text.") else
if this.end < this.start then Error.throw (Illegal_State_Error "Utf_16_Span invariant violation: start <= end") else
case this.start == this.end of
True ->
# An empty code-unit span maps to an empty grapheme span at the
# corresponding grapheme position.
grapheme_ix = Text_Utils.utf16_index_to_grapheme_index this.text this.start
Span (Range grapheme_ix grapheme_ix) this.text
False ->
# Convert both boundary indices in a single batched call; `end - 1` is the
# last code unit actually contained in the span.
grapheme_ixes = Text_Utils.utf16_indices_to_grapheme_indices this.text [this.start, this.end - 1].to_array
grapheme_first = grapheme_ixes.at 0
grapheme_last = grapheme_ixes.at 1
## We find the grapheme index of the last code unit actually contained within our span and set the
end grapheme to the first grapheme after that. This ensures that if code units associated with
only a part of a grapheme were contained in our original span, the resulting span will be
extended to contain this whole grapheme.
grapheme_end = grapheme_last + 1
Span (Range grapheme_first grapheme_end) this.text
## PRIVATE
Utility function taking a range pointing at grapheme clusters and converting
to a range on the underlying code units.
range_to_char_indices : Text -> Range -> Range ! Index_Out_Of_Bounds_Error
range_to_char_indices text range =
len = text.length
# Negative indices count backwards from the end of the text.
start = if range.start < 0 then range.start + len else range.start
# An end of `Nothing` means "up to the end of the text".
end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end)
is_valid = (Range 0 len+1).contains
case (Pair (is_valid start) (is_valid end)) of
Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len)
Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len)
Pair True True ->
# An empty (or reversed) grapheme range maps to the empty code-unit range.
if start>=end then (Range 0 0) else
# Walk the ICU grapheme-cluster boundaries: advancing `start` boundaries
# yields the code-unit offset of the range's first grapheme, and advancing
# `end - start` more yields the offset just past its last grapheme.
iterator = BreakIterator.getCharacterInstance
iterator.setText text
start_index = iterator.next start
end_index = iterator.next (end - start)
Range start_index end_index

View File

@ -79,24 +79,24 @@ type Text_Sub_Range
Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text) Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text)
Before delimiter -> Before delimiter ->
if delimiter.is_empty then (Range 0 0) else if delimiter.is_empty then (Range 0 0) else
index = Text_Utils.index_of text delimiter span = Text_Utils.span_of text delimiter
if index == -1 then (Range 0 (Text_Utils.char_length text)) else if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
(Range 0 index) (Range 0 span.start)
Before_Last delimiter -> Before_Last delimiter ->
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
index = Text_Utils.last_index_of text delimiter span = Text_Utils.last_span_of text delimiter
if index == -1 then (Range 0 (Text_Utils.char_length text)) else if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
(Range 0 index) (Range 0 span.start)
After delimiter -> After delimiter ->
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
index = Text_Utils.index_of text delimiter span = Text_Utils.span_of text delimiter
if index == -1 then (Range 0 0) else if span.is_nothing then (Range 0 0) else
(Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text)) (Range span.end (Text_Utils.char_length text))
After_Last delimiter -> After_Last delimiter ->
if delimiter.is_empty then (Range 0 0) else if delimiter.is_empty then (Range 0 0) else
index = Text_Utils.last_index_of text delimiter span = Text_Utils.last_span_of text delimiter
if index == -1 then (Range 0 0) else if span.is_nothing then (Range 0 0) else
(Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text)) (Range span.end (Text_Utils.char_length text))
While predicate -> While predicate ->
indices = find_sub_range_end text _-> start-> end-> indices = find_sub_range_end text _-> start-> end->
predicate (Text_Utils.substring text start end) . not predicate (Text_Utils.substring text start end) . not

View File

@ -1,7 +1,7 @@
akka { akka {
loggers = ["akka.event.slf4j.Slf4jLogger"] loggers = ["akka.event.slf4j.Slf4jLogger"]
logging-filter = "akka.event.slf4j.Slf4jLoggingFilter" logging-filter = "akka.event.slf4j.Slf4jLoggingFilter"
version = "2.6.6" version = "2.6.18"
stdout-loglevel = "ERROR" stdout-loglevel = "ERROR"
} }

View File

@ -1,11 +1,19 @@
package org.enso.base; package org.enso.base;
import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.CaseMap.Fold;
import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.StringSearch; import com.ibm.icu.text.StringSearch;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.enso.base.text.CaseFoldedString;
import org.enso.base.text.GraphemeSpan;
import org.enso.base.text.Utf16Span;
/** Utils for standard library operations on Text. */ /** Utils for standard library operations on Text. */
public class Text_Utils { public class Text_Utils {
@ -117,6 +125,23 @@ public class Text_Utils {
} }
} }
/**
 * Checks whether two strings are equal up to Unicode canonicalization and ignoring case.
 *
 * <p>A non-{@code String} second argument is never considered equal.
 *
 * @param str1 the first string
 * @param str2 the second string
 * @param locale the locale to use for case folding; needed to correctly handle the Turkish 'i'
 * @return the result of comparison
 */
public static boolean equals_ignore_case(String str1, Object str2, Locale locale) {
  if (!(str2 instanceof String)) {
    return false;
  }
  // Fold both sides with the locale-appropriate algorithm, then compare their normalized forms.
  Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
  String folded1 = fold.apply(str1);
  String folded2 = fold.apply((String) str2);
  return compare_normalized(folded1, folded2) == 0;
}
/** /**
* Converts an array of codepoints into a string. * Converts an array of codepoints into a string.
* *
@ -176,6 +201,36 @@ public class Text_Utils {
return searcher.first() != StringSearch.DONE; return searcher.first() != StringSearch.DONE;
} }
/**
 * Checks if {@code substring} occurs within {@code string}, ignoring case differences according
 * to Unicode case folding.
 *
 * @param string the containing string.
 * @param substring the contained string.
 * @param locale the locale selecting the case folding algorithm; needed to correctly handle the
 *     Turkish 'i' characters.
 * @return whether {@code substring} is a substring of {@code string} up to case differences.
 */
public static boolean contains_case_insensitive(String string, String substring, Locale locale) {
  // {@code StringSearch} does not handle empty strings as we would want, so we need these special
  // cases.
  if (substring.isEmpty()) return true;
  if (string.isEmpty()) return false;

  Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
  StringSearch searcher = new StringSearch(fold.apply(substring), fold.apply(string));
  return searcher.first() != StringSearch.DONE;
}
/**
* Transforms the provided string into a form which can be used for case insensitive comparisons.
*
* @param string the string to transform
* @param locale the locale to use - needed to distinguish a special case when handling Turkish
* 'i' characters
* @return a transformed string that can be used for case insensitive comparisons
*/
public static String case_insensitive_key(String string, Locale locale) {
return CaseFoldedString.simpleFold(string, locale);
}
/** /**
* Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}. * Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}.
* *
@ -200,37 +255,215 @@ public class Text_Utils {
} }
/** /**
* Find the first index of needle in the haystack * Find the first occurrence of needle in the haystack
* *
* @param haystack the string to search * @param haystack the string to search
* @param needle the substring that is searched for * @param needle the substring that is searched for
* @return index of the first needle or -1 if not found. * @return a UTF-16 code unit span of the first needle or null if not found.
*/ */
public static long index_of(String haystack, String needle) { public static Utf16Span span_of(String haystack, String needle) {
if (needle.isEmpty()) return new Utf16Span(0, 0);
if (haystack.isEmpty()) return null;
StringSearch search = new StringSearch(needle, haystack); StringSearch search = new StringSearch(needle, haystack);
int pos = search.first(); int pos = search.first();
return pos == StringSearch.DONE ? -1 : pos; if (pos == StringSearch.DONE) return null;
return new Utf16Span(pos, pos + search.getMatchLength());
} }
/** /**
* Find the last index of needle in the haystack * Find the last occurrence of needle in the haystack
* *
* @param haystack the string to search * @param haystack the string to search
* @param needle the substring that is searched for * @param needle the substring that is searched for
* @return index of the last needle or -1 if not found. * @return a UTF-16 code unit span of the last needle or null if not found.
*/ */
public static long last_index_of(String haystack, String needle) { public static Utf16Span last_span_of(String haystack, String needle) {
if (needle.isEmpty()) {
int afterLast = haystack.length();
return new Utf16Span(afterLast, afterLast);
}
if (haystack.isEmpty()) return null;
StringSearch search = new StringSearch(needle, haystack); StringSearch search = new StringSearch(needle, haystack);
int pos = search.first(); int pos = search.last();
if (pos == StringSearch.DONE) return null;
return new Utf16Span(pos, pos + search.getMatchLength());
}
/**
 * Find spans of all occurrences of the needle within the haystack.
 *
 * @param haystack the string to search
 * @param needle the substring that is searched for; must not be empty
 * @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
 * @throws IllegalArgumentException if {@code needle} is empty
 */
public static List<Utf16Span> span_of_all(String haystack, String needle) {
  if (needle.isEmpty())
    // Fixed the message to reference this operation's actual name (was `index_of_all`).
    throw new IllegalArgumentException(
        "The operation `span_of_all` does not support searching for an empty term.");
  if (haystack.isEmpty()) return List.of();

  StringSearch search = new StringSearch(needle, haystack);
  ArrayList<Utf16Span> occurrences = new ArrayList<>();
  // StringSearch.next() returns an int position, or StringSearch.DONE when exhausted.
  int ix;
  while ((ix = search.next()) != StringSearch.DONE) {
    occurrences.add(new Utf16Span(ix, ix + search.getMatchLength()));
  }
  return occurrences;
}
/**
 * Converts a UTF-16 code unit index to index of the grapheme that this code unit belongs to.
 *
 * @param text the text associated with the index
 * @param codeunit_index the UTF-16 index
 * @return an index of an extended grapheme cluster that contains the code unit from the input
 * @throws IndexOutOfBoundsException if the index is outside of the range [0, text.length()]
 */
public static long utf16_index_to_grapheme_index(String text, long codeunit_index) {
  if (codeunit_index < 0 || codeunit_index > text.length()) {
    throw new IndexOutOfBoundsException(
        "Index " + codeunit_index + " is outside of the provided text.");
  }
  BreakIterator iter = BreakIterator.getCharacterInstance();
  iter.setText(text);
  // Advance one grapheme boundary at a time until the boundary passes the requested code unit.
  long graphemeIndex = 0;
  int boundary = iter.next();
  while (boundary != BreakIterator.DONE && boundary <= codeunit_index) {
    graphemeIndex++;
    boundary = iter.next();
  }
  return graphemeIndex;
}
/**
 * Converts a series of UTF-16 code unit indices to indices of graphemes that these code units
 * belong to.
 *
 * <p>For performance, it assumes that the provided indices are sorted in a non-decreasing order
 * (duplicate entries are permitted). Behaviour is unspecified if an unsorted list is provided.
 *
 * <p>The behaviour is unspecified if indices provided on the input are outside of the range [0,
 * text.length()].
 *
 * @param text the text associated with the indices
 * @param codeunit_indices the array of UTF-16 code unit indices, sorted in non-decreasing order
 * @return an array of grapheme indices corresponding to the UTF-16 units from the input
 */
public static long[] utf16_indices_to_grapheme_indices(String text, List<Long> codeunit_indices) {
  BreakIterator iter = BreakIterator.getCharacterInstance();
  iter.setText(text);
  long[] result = new long[codeunit_indices.size()];
  // Because the queries are sorted, a single forward pass over the grapheme boundaries suffices.
  int resultIx = 0;
  long graphemeIndex = 0;
  int boundary = iter.next();
  for (long codeunitIndex : codeunit_indices) {
    while (boundary != BreakIterator.DONE && boundary <= codeunitIndex) {
      graphemeIndex++;
      boundary = iter.next();
    }
    result[resultIx++] = graphemeIndex;
  }
  return result;
}
/**
 * Find the first or last occurrence of needle in the haystack.
 *
 * @param haystack the string to search
 * @param needle the substring that is searched for; must not be empty
 * @param locale the locale used for case-insensitive comparisons
 * @param searchForLast if set to true, will search for the last occurrence; otherwise searches
 *     for the first one
 * @return an extended-grapheme-cluster span of the first or last needle, or null if none found.
 * @throws IllegalArgumentException if {@code needle} is empty
 */
public static GraphemeSpan span_of_case_insensitive(
    String haystack, String needle, Locale locale, boolean searchForLast) {
  if (needle.isEmpty())
    throw new IllegalArgumentException(
        "The operation `span_of_case_insensitive` does not support searching for an empty term.");
  if (haystack.isEmpty()) return null;

  // Search happens in the case-folded space; the folded haystack keeps a mapping back to graphemes.
  CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
  String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
  StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString());

  int pos = searchForLast ? search.last() : search.first();
  if (pos == StringSearch.DONE) {
    return null;
  }
  return findExtendedSpan(foldedHaystack, pos, search.getMatchLength());
}
for (int next = search.next(); next != StringSearch.DONE; next = search.next()) { /**
pos = next; * Find all occurrences of needle in the haystack
*
* @param haystack the string to search
* @param needle the substring that is searched for
* @param locale the locale used for case-insensitive comparisons
* @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack
*/
public static List<GraphemeSpan> span_of_all_case_insensitive(
String haystack, String needle, Locale locale) {
if (needle.isEmpty())
throw new IllegalArgumentException(
"The operation `span_of_all_case_insensitive` does not support searching for an empty term.");
if (haystack.isEmpty()) return List.of();
CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString());
ArrayList<GraphemeSpan> result = new ArrayList<>();
int pos;
while ((pos = search.next()) != StringSearch.DONE) {
result.add(findExtendedSpan(foldedHaystack, pos, search.getMatchLength()));
} }
return pos; return result;
}
/**
* Finds the grapheme span corresponding to the found match indexed with code units.
*
* <p>It extends the found span to ensure that graphemes associated with all found code units are
* included in the resulting span. Thus, some additional code units which were not present in the
* original match may also be present due to the extension.
*
* <p>The extension to the left is trivial - we just find the grapheme associated with the first
* code unit and even if that code unit is not the first one of that grapheme, by returning it we
* correctly extend to the left. The extension to the right works by finding the index of the
* grapheme associated with the last code unit actually present in the span, then the end of the
* returned span is set to the next grapheme after it. This correctly handles the edge case where
* only a part of some grapheme was matched.
*
* @param string the folded string with which the positions are associated, containing a cache of
* position mappings
* @param position the position of the match (in code units)
* @param length the length of the match (in code units)
* @return a minimal {@code GraphemeSpan} which contains all code units from the match
*/
private static GraphemeSpan findExtendedSpan(CaseFoldedString string, int position, int length) {
int firstGrapheme = string.codeUnitToGraphemeIndex(position);
if (length == 0) {
return new GraphemeSpan(firstGrapheme, firstGrapheme);
} else {
int lastGrapheme = string.codeUnitToGraphemeIndex(position + length - 1);
int endGrapheme = lastGrapheme + 1;
return new GraphemeSpan(firstGrapheme, endGrapheme);
}
} }
/** /**

View File

@ -0,0 +1,135 @@
package org.enso.base.text;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.CaseMap.Fold;
import java.util.Locale;
/**
 * Represents a string transformed using Unicode Case Folding which can be used for case insensitive
 * comparisons.
 *
 * <p>It contains facilities for converting indices in the transformed string to corresponding
 * indices back in the original string.
 */
public class CaseFoldedString {
  private final String foldedString;

  /**
   * A mapping from code units in the transformed string to their corresponding graphemes in the
   * original string.
   *
   * <p>The mapping must be valid for indices from 0 to {@code foldedString.length()+1}
   * (inclusive).
   */
  private final int[] graphemeIndexMapping;

  /**
   * Constructs a new instance of the folded string.
   *
   * @param foldedString the string after applying the case folding transformation
   * @param graphemeIndexMapping a mapping created during the transformation which maps code units
   *     in the transformed string to their corresponding graphemes in the original string
   */
  private CaseFoldedString(String foldedString, int[] graphemeIndexMapping) {
    this.foldedString = foldedString;
    this.graphemeIndexMapping = graphemeIndexMapping;
  }

  /**
   * Maps a code unit in the folded string to the corresponding grapheme in the original string.
   *
   * @param codeunitIndex the index of the code unit in the folded string, valid indices range from
   *     0 to {@code getFoldedString().length()+1} (inclusive), allowing to also ask for the
   *     position of the end code unit which is located right after the end of the string - which
   *     should always map to the analogous end grapheme.
   * @return the index of the grapheme from the original string that after applying the
   *     transformation contains the requested code unit
   */
  public int codeUnitToGraphemeIndex(int codeunitIndex) {
    if (codeunitIndex < 0 || codeunitIndex > this.foldedString.length()) {
      throw new IndexOutOfBoundsException(codeunitIndex);
    }
    return graphemeIndexMapping[codeunitIndex];
  }

  /** Returns the transformed string. */
  public String getFoldedString() {
    return foldedString;
  }

  /**
   * Folds a string remembering the mapping from code units to its original grapheme cluster
   * indices.
   *
   * @param charSequence a sequence of UTF-16 characters to transform
   * @param locale the locale to use as a reference for case folding; it is needed because Turkish
   *     and Azerbaijani locales handle casing of the letter `i` in a different way than other
   *     locales
   * @return a {@code CaseFoldedString} instance which contains the transformed string and allows to
   *     map its code units to original grapheme clusters
   */
  public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {
    BreakIterator breakIterator = BreakIterator.getCharacterInstance();
    breakIterator.setText(charSequence);
    StringBuilder stringBuilder = new StringBuilder(charSequence.length());
    Fold foldAlgorithm = caseFoldAlgorithmForLocale(locale);
    IntArrayBuilder indexMapping = new IntArrayBuilder(charSequence.length() + 1);

    // We rely on the fact that ICU Case Folding is _not_ context-sensitive, i.e. the mapping of
    // each grapheme cluster is independent of surrounding ones. Regular casing is
    // context-sensitive.
    int current = breakIterator.current();
    int next;
    int graphemeIndex = 0;
    while ((next = breakIterator.next()) != BreakIterator.DONE) {
      CharSequence grapheme = new StringSlice(charSequence, current, next);
      String foldedGrapheme = foldAlgorithm.apply(grapheme);
      stringBuilder.append(foldedGrapheme);
      // Every code unit produced by folding this grapheme maps back to the same source grapheme.
      for (int i = 0; i < foldedGrapheme.length(); ++i) {
        indexMapping.add(graphemeIndex);
      }
      graphemeIndex++;
      current = next;
    }

    // The mapping should also be able to handle a {@code str.length()} query, so we add one more
    // element to the mapping pointing to a non-existent grapheme after the end of the text.
    indexMapping.add(graphemeIndex);
    return new CaseFoldedString(
        stringBuilder.toString(), indexMapping.unsafeGetStorageAndInvalidateTheBuilder());
  }

  /**
   * A helper function which folds the string without remembering the index mapping.
   *
   * <p>It should be used when the index mapping is not needed, as its implementation is much more
   * efficient.
   *
   * @param string a sequence of UTF-16 characters to transform
   * @param locale the locale to use as a reference for case folding; it is needed because Turkish
   *     and Azerbaijani locales handle casing of the letter `i` in a different way than the others
   * @return the folded string
   */
  public static String simpleFold(CharSequence string, Locale locale) {
    return caseFoldAlgorithmForLocale(locale).apply(string);
  }

  private static final Locale AZ_LOCALE = new Locale("az");
  private static final Locale TR_LOCALE = new Locale("tr");

  /**
   * Returns a case folding algorithm appropriate for the given locale.
   *
   * <p>The algorithm is locale-dependent because Turkish and Azerbaijani locales handle casing of
   * the letter `i` in a different way than other locales.
   */
  public static Fold caseFoldAlgorithmForLocale(Locale locale) {
    if (locale.equals(AZ_LOCALE) || locale.equals(TR_LOCALE)) {
      return CaseMap.fold().turkic();
    }
    return CaseMap.fold();
  }
}

View File

@ -0,0 +1,28 @@
package org.enso.base.text;
/**
 * Represents a span of characters (understood as extended grapheme clusters) within a Text.
 *
 * <p>The start index indicates the first grapheme of the span and the end index indicates the first
 * grapheme after the end of the span.
 *
 * <p>Represents an empty span if start and end indices are equal. Such an empty span refers to the
 * space just before the grapheme corresponding to index start.
 */
public class GraphemeSpan {
  public final long start, end;

  /**
   * Constructs a span of characters (understood as extended grapheme clusters).
   *
   * @param start index of the first extended grapheme cluster contained within the span (or
   *     location of the span if it is empty)
   * @param end index of the first extended grapheme cluster after start that is not contained
   *     within the span
   */
  public GraphemeSpan(long start, long end) {
    this.start = start;
    this.end = end;
  }

  /** Two spans are equal when both their boundaries coincide (value semantics). */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (!(obj instanceof GraphemeSpan)) return false;
    GraphemeSpan other = (GraphemeSpan) obj;
    return start == other.start && end == other.end;
  }

  @Override
  public int hashCode() {
    return 31 * Long.hashCode(start) + Long.hashCode(end);
  }

  @Override
  public String toString() {
    return "GraphemeSpan(" + start + ", " + end + ")";
  }
}

View File

@ -0,0 +1,65 @@
package org.enso.base.text;
/** A helper to efficiently build an array of unboxed integers of arbitrary length. */
public class IntArrayBuilder {
  private int[] storage;
  private int length;

  /**
   * Constructs an empty builder with a given initial capacity.
   *
   * @param initialCapacity the initial capacity of the builder, can be used to avoid expanding the
   *     storage if the amount of elements can be estimated in advance.
   */
  public IntArrayBuilder(int initialCapacity) {
    this.length = 0;
    this.storage = new int[initialCapacity];
  }

  /** Adds a new element to the array, expanding it if necessary. */
  public void add(int x) {
    if (length >= storage.length) {
      grow();
    }
    storage[length] = x;
    length += 1;
  }

  /**
   * Expands the storage to fit more elements.
   *
   * <p>The capacity grows by 50% (and always by at least one element), so that the amortized cost
   * of appending stays constant.
   */
  private void grow() {
    int newCapacity = Math.max(storage.length + storage.length / 2, storage.length + 1);
    int[] newStorage = new int[newCapacity];
    System.arraycopy(storage, 0, newStorage, 0, length);
    storage = newStorage;
  }

  /** Returns the amount of elements already added to the storage. */
  public int getLength() {
    return length;
  }

  /**
   * Returns the underlying storage of the builder.
   *
   * <p>This method avoids copying for performance so it should be used with care. The storage can
   * actually have more elements than were added, so the user should be careful to only query the
   * first {@code getLength()} elements. Querying other elements results in an unspecified result.
   *
   * <p>After calling this method, the builder is invalidated and cannot be used anymore. Any usage
   * of the builder afterwards will result in a {@code NullPointerException}.
   */
  public int[] unsafeGetStorageAndInvalidateTheBuilder() {
    int[] result = storage;
    storage = null;
    return result;
  }
}

View File

@ -0,0 +1,34 @@
package org.enso.base.text;
/** A read-only view of a fragment of another char sequence, avoiding any copying of the data. */
class StringSlice implements CharSequence {
  private final CharSequence source;
  private final int from;
  private final int to;

  /** Constructs a slice viewing {@code text} between {@code start} (inclusive) and {@code end}. */
  public StringSlice(CharSequence text, int start, int end) {
    this.source = text;
    this.from = start;
    this.to = end;
  }

  @Override
  public int length() {
    return to - from;
  }

  @Override
  public char charAt(int index) {
    // Indices are relative to the slice, so they are shifted by the slice's origin.
    return source.charAt(from + index);
  }

  @Override
  public CharSequence subSequence(int start, int end) {
    // A sub-slice is just another slice over the same backing sequence.
    return new StringSlice(source, from + start, from + end);
  }

  @Override
  public String toString() {
    return source.subSequence(from, to).toString();
  }
}

View File

@ -0,0 +1,18 @@
package org.enso.base.text;
/**
 * Represents a span of UTF-16 code units within a String.
 *
 * <p>The start index indicates the first code unit of the span and the end index indicates the
 * first code unit after the end of the span.
 */
public class Utf16Span {
  public final long start, end;

  /** Constructs a span of UTF-16 code units. */
  public Utf16Span(long start, long end) {
    this.start = start;
    this.end = end;
  }

  /** Two spans are equal when both their boundaries coincide (value semantics). */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (!(obj instanceof Utf16Span)) return false;
    Utf16Span other = (Utf16Span) obj;
    return start == other.start && end == other.end;
  }

  @Override
  public int hashCode() {
    return 31 * Long.hashCode(start) + Long.hashCode(end);
  }

  @Override
  public String toString() {
    return "Utf16Span(" + start + ", " + end + ")";
  }
}

View File

@ -6,7 +6,7 @@ import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Regex.Option as Global_Option import Standard.Base.Data.Text.Regex.Option as Global_Option
import Standard.Base.Data.Text.Span from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span
polyglot java import java.util.regex.Pattern as Java_Pattern polyglot java import java.util.regex.Pattern as Java_Pattern
@ -182,6 +182,22 @@ spec =
match.at 1 . group 0 . should_equal "ef" match.at 1 . group 0 . should_equal "ef"
match.at 2 . group 0 . should_equal "gh" match.at 2 . group 0 . should_equal "gh"
Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <|
pattern = engine.compile "" []
match_1 = pattern.match "" mode=Mode.All
match_1.length . should_equal 1
match_1.at 0 . start 0 . should_equal 0
match_1.at 0 . end 0 . should_equal 0
match_2 = pattern.match "ABC" mode=Mode.All
match_2.length . should_equal 4
match_2.at 0 . start 0 . should_equal 0
match_2.at 0 . end 0 . should_equal 0
match_2.at 1 . start 0 . should_equal 1
match_2.at 1 . end 0 . should_equal 1
match_2.at 3 . start 0 . should_equal 3
match_2.at 3 . end 0 . should_equal 3
Test.group "The default regex engine's Pattern.find" <| Test.group "The default regex engine's Pattern.find" <|
engine = Default_Engine.new engine = Default_Engine.new
@ -261,11 +277,23 @@ spec =
match.at 1 . should_equal "ef" match.at 1 . should_equal "ef"
match.at 2 . should_equal "gh" match.at 2 . should_equal "gh"
match_2 = pattern.find input mode=(Mode.Bounded 2 8 mode=10)
match_2.length . should_equal 3
match_2.at 0 . should_equal "cd"
match_2.at 1 . should_equal "ef"
match_2.at 2 . should_equal "gh"
match_3 = pattern.find input mode=(Mode.Bounded 2 8 mode=2)
match_3.length . should_equal 2
match_3.at 0 . should_equal "cd"
match_3.at 1 . should_equal "ef"
Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <| Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"] engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"] engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=5 . should_equal ["a", "1", "b", "2"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Mode.Bounded 1 3) . should_equal ["1", "b"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Mode.Bounded 1 3) . should_equal ["1", "b"]
@ -501,10 +529,10 @@ spec =
match . should_be_a Default_Engine.Match match . should_be_a Default_Engine.Match
Test.specify "should get the span of a group by index" <| Test.specify "should get the span of a group by index" <|
match.span 1 . should_equal (Span.new 0 6 input) match.span 1 . should_equal (Utf_16_Span (Range 0 6) input)
Test.specify "should get the span of a group by name" <| Test.specify "should get the span of a group by name" <|
match.span "letters" . should_equal (Span.new 6 18 input) match.span "letters" . should_equal (Utf_16_Span (Range 6 18) input)
Test.specify "should return Nothing if the group didn't match" <| Test.specify "should return Nothing if the group didn't match" <|
match.span 3 . should_equal Nothing match.span 3 . should_equal Nothing

View File

@ -26,3 +26,4 @@ spec =
pattern = "http://example.com" pattern = "http://example.com"
Regex.escape pattern . should_equal "\Qhttp://example.com\E" Regex.escape pattern . should_equal "\Qhttp://example.com\E"
main = Test.Suite.run_main here.spec

View File

@ -2,20 +2,36 @@
from Standard.Base import all from Standard.Base import all
import Standard.Test import Standard.Test
import Standard.Base.Data.Text.Span from Standard.Base.Data.Text.Span as Span_Module import Span, Utf_16_Span
spec = Test.group "Text.Span" <| spec = Test.group "Text.Span" <|
Test.specify "should be able to be created over a text" <| Test.specify "should be able to be created over a text" <|
text = "Hello!" text = "Hello!"
span = Span.new 0 3 text span = Span (Range 0 3) text
span.start . should_equal 0 span.start . should_equal 0
span.end . should_equal 3 span.end . should_equal 3
span.text . should_equal text span.text . should_equal text
Test.specify "should be able to be created without a text" <| Test.specify "should be able to be converted to code units" <|
span = Span.new 5 8 text = 'ae\u{301}fz'
span.start . should_equal 5 (Span (Range 1 3) text).to_utf_16_span . should_equal (Utf_16_Span (Range 1 4) text)
span.end . should_equal 8
span.text . should_equal Nothing
Test.specify "should expand to the associated grapheme clusters" <|
text = 'a\u{301}e\u{302}o\u{303}'
span = Utf_16_Span (Range 1 5) text
extended = span.to_grapheme_span
extended . should_equal (Span (Range 0 3) text)
extended.to_utf_16_span . should_equal (Utf_16_Span (Range 0 6) text)
Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
Utf_16_Span (Range 0 1) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
Utf_16_Span (Range 0 0) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
Utf_16_Span (Range 1 1) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
Utf_16_Span (Range 2 2) text . to_grapheme_span . should_equal (Span (Range 1 1) text)
Utf_16_Span (Range 0 4) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
main = Test.Suite.run_main here.spec

View File

@ -0,0 +1,61 @@
from Standard.Base import all
polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.text.CaseFoldedString
import Standard.Test
polyglot java import com.ibm.icu.text.BreakIterator
spec =
Test.group "Text_Utils" <|
kshi = '\u0915\u094D\u0937\u093F'
facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
text = "a"+kshi+facepalm+'e\u{301}Z'
codepoints_to_graphemes = _.flatten <| text.characters.map_with_index ix-> grapheme->
codepoints_count = grapheme.utf_16.length
Vector.new codepoints_count _->ix
Test.specify "should correctly translate an codepoint index to a grapheme index" <|
codepoints_to_graphemes . each_with_index codepoint_ix-> grapheme_ix->
found_grapheme_ix = Text_Utils.utf16_index_to_grapheme_index text codepoint_ix
found_grapheme_ix.should_equal grapheme_ix
Text_Utils.utf16_index_to_grapheme_index text text.utf_16.length . should_equal text.length
Text_Utils.utf16_index_to_grapheme_index "" 0 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index 'ą' 0 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index 'ą' 1 . should_equal 1
Text_Utils.utf16_index_to_grapheme_index "aB" 0 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index "aB" 1 . should_equal 1
Text_Utils.utf16_index_to_grapheme_index "aB" 2 . should_equal 2
Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 0 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 1 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 2 . should_equal 1
Test.specify "should correctly translate a series of codepoint indices to a grapheme indices in a batch" <|
translate_indices text ixes =
Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
codepoint_indices = Vector.new text.utf_16.length ix->ix
translate_indices text codepoint_indices . should_equal codepoints_to_graphemes
translate_indices "" [0] . should_equal [0]
translate_indices 'ą' [0, 1] . should_equal [0, 1]
translate_indices "aB" [0, 1, 2] . should_equal [0, 1, 2]
translate_indices 'a\u{301}' [0, 1, 2] . should_equal [0, 0, 1]
Test.specify "should correctly case-fold a string and translate codeunits to graphemes" <|
text = 'a\u{301}AZßffią'
folded = CaseFoldedString.fold text Locale.default.java_locale
folded.getFoldedString . should_equal 'a\u{301}azssffią'
codeunits = Vector.new folded.getFoldedString.utf_16.length+1 ix->ix
grapheme_ixes = codeunits.map ix->
folded.codeUnitToGraphemeIndex ix
grapheme_ixes . should_equal [0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 6]
Test.expect_panic_with (folded.codeUnitToGraphemeIndex -1) Polyglot_Error
Test.expect_panic_with (folded.codeUnitToGraphemeIndex folded.getFoldedString.utf_16.length+1) Polyglot_Error
main = Test.Suite.run_main here.spec

View File

@ -4,7 +4,10 @@ from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Locale import Standard.Base.Data.Locale
import Standard.Base.Data.Text.Split_Kind import Standard.Base.Data.Text.Split_Kind
from Standard.Base.Data.Text.Span as Span_Module import Span
from Standard.Base.Data.Text.Text_Sub_Range import all from Standard.Base.Data.Text.Text_Sub_Range import all
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Matching_Mode
import Standard.Test import Standard.Test
type Auto a type Auto a
@ -87,9 +90,8 @@ spec =
'e\u0301' . equals_ignore_case 'e\u0303' . should_be_false 'e\u0301' . equals_ignore_case 'e\u0303' . should_be_false
"I" . equals_ignore_case "i" . should_be_true "I" . equals_ignore_case "i" . should_be_true
"I" . equals_ignore_case "ı" . should_be_true
"İ" . equals_ignore_case "i" . should_be_false
"İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
"I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
"I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
"Kongressstraße"=="Kongressstrasse" . should_be_false "Kongressstraße"=="Kongressstrasse" . should_be_false
@ -199,15 +201,20 @@ spec =
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 6) . should_equal 'Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 6) . should_equal 'Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 5) . should_equal 'o\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 5) . should_equal 'o\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e\u{302}') . should_equal 'H' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e\u{302}') . should_equal 'H'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'ê') . should_equal 'H'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} W' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} W'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'ö') . should_equal 'He\u{302}llo\u{308} W'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{302}') . should_equal 'llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{302}') . should_equal 'llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'ê') . should_equal 'llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{308}') . should_equal '' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{308}') . should_equal ''
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e') . should_equal ''
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o\u{308}') . should_equal 'rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o\u{308}') . should_equal 'rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'ö') . should_equal 'rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o') . should_equal ''
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e\u{302}') . should_equal 'H' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e\u{302}') . should_equal 'H'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='ê') . should_equal 'H'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range 3 5) . should_equal 'lo\u{308}' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range 3 5) . should_equal 'lo\u{308}'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range -3 -1) . should_equal 'ld' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range -3 -1) . should_equal 'ld'
@ -232,6 +239,30 @@ spec =
'✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 Nothing) . should_equal '😙😉☺' '✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 Nothing) . should_equal '😙😉☺'
'✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 -1) . should_equal '😙😉' '✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 -1) . should_equal '😙😉'
Test.specify "take should correctly handle edge cases" <|
"".take First.new . should_equal ""
"".take Last.new . should_equal ""
"".take (After "a") . should_equal ""
"".take (After_Last "a") . should_equal ""
"".take (Before "a") . should_equal ""
"".take (Before_Last "a") . should_equal ""
"".take (After "") . should_equal ""
"".take (After_Last "") . should_equal ""
"".take (Before "") . should_equal ""
"".take (Before_Last "") . should_equal ""
"".take (While _->True) . should_equal ""
"".take (Range 0 0) . should_equal ""
'ABC\u{301}'.take (Range 0 0) . should_equal ""
'ABC\u{301}'.take (After "") . should_equal 'ABC\u{301}'
'ABC\u{301}'.take (After_Last "") . should_equal ""
'ABC\u{301}'.take (Before "") . should_equal ""
'ABC\u{301}'.take (Before_Last "") . should_equal 'ABC\u{301}'
Test.specify "drop should work as in the examples" <| Test.specify "drop should work as in the examples" <|
"Hello World!".drop First.new . should_equal "ello World!" "Hello World!".drop First.new . should_equal "ello World!"
"Hello World!".drop (First 5) . should_equal " World!" "Hello World!".drop (First 5) . should_equal " World!"
@ -269,15 +300,20 @@ spec =
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 6) . should_equal 'He\u{302}llo\u{308} ' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 6) . should_equal 'He\u{302}llo\u{308} '
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 5) . should_equal 'He\u{302}llo\u{308} W' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 5) . should_equal 'He\u{302}llo\u{308} W'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'ê') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e') . should_equal ''
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o\u{308}') . should_equal 'o\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o\u{308}') . should_equal 'o\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'ö') . should_equal 'o\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o') . should_equal ''
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{302}') . should_equal 'He\u{302}' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{302}') . should_equal 'He\u{302}'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'ê') . should_equal 'He\u{302}'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{308}' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{308}'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'ö') . should_equal 'He\u{302}llo\u{308} Wo\u{308}'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='ê') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e') . should_equal ''
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range 3 5) . should_equal 'He\u{302}l Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range 3 5) . should_equal 'He\u{302}l Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range -3 -1) . should_equal 'He\u{302}llo\u{308} Wo\u{308}r!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range -3 -1) . should_equal 'He\u{302}llo\u{308} Wo\u{308}r!'
@ -301,6 +337,30 @@ spec =
'✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 Nothing) . should_equal '✨🚀🚧😍😃😍😎' '✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 Nothing) . should_equal '✨🚀🚧😍😃😍😎'
'✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 -1) . should_equal '✨🚀🚧😍😃😍😎☺' '✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 -1) . should_equal '✨🚀🚧😍😃😍😎☺'
Test.specify "drop should correctly handle edge cases" <|
"".drop First.new . should_equal ""
"".drop Last.new . should_equal ""
"".drop (After "a") . should_equal ""
"".drop (After_Last "a") . should_equal ""
"".drop (Before "a") . should_equal ""
"".drop (Before_Last "a") . should_equal ""
"".drop (After "") . should_equal ""
"".drop (After_Last "") . should_equal ""
"".drop (Before "") . should_equal ""
"".drop (Before_Last "") . should_equal ""
"".drop (While _->True) . should_equal ""
"".drop (Range 0 0) . should_equal ""
'ABC\u{301}'.drop (Range 0 0) . should_equal 'ABC\u{301}'
'ABC\u{301}'.drop (After "") . should_equal ''
'ABC\u{301}'.drop (After_Last "") . should_equal 'ABC\u{301}'
'ABC\u{301}'.drop (Before "") . should_equal 'ABC\u{301}'
'ABC\u{301}'.drop (Before_Last "") . should_equal ''
Test.specify "should correctly convert character case" <| Test.specify "should correctly convert character case" <|
"FooBar Baz".to_case Case.Lower . should_equal "foobar baz" "FooBar Baz".to_case Case.Lower . should_equal "foobar baz"
"FooBar Baz".to_case Case.Upper . should_equal "FOOBAR BAZ" "FooBar Baz".to_case Case.Upper . should_equal "FOOBAR BAZ"
@ -465,10 +525,7 @@ spec =
## This shows what regex is doing by default and we cannot easily fix ## This shows what regex is doing by default and we cannot easily fix
that. that.
's\u{301}' . contains 's' (Regex_Matcher.new) . should_be_true 's\u{301}' . contains 's' (Regex_Matcher.new) . should_be_true
## This would normally be false, but we perform input normalization 'ś' . contains 's' (Regex_Matcher.new) . should_be_false
to get results that are consistent regardless of if the input was
normalized or not.
'ś' . contains 's' (Regex_Matcher.new) . should_be_true
's\u{301}' . contains 'ś' (Regex_Matcher.new) . should_be_true 's\u{301}' . contains 'ś' (Regex_Matcher.new) . should_be_true
'ś' . contains 's\u{301}' (Regex_Matcher.new) . should_be_true 'ś' . contains 's\u{301}' (Regex_Matcher.new) . should_be_true
@ -767,6 +824,157 @@ spec =
'✨🚀🚧'*2 . should_equal '✨🚀🚧✨🚀🚧' '✨🚀🚧'*2 . should_equal '✨🚀🚧✨🚀🚧'
Test.specify "location_of should work as shown in examples" <|
example_1 =
"Hello World!".location_of "J" == Nothing
"Hello World!".location_of "o" == Span (Range 4 5) "Hello World!"
"Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 4 5) "Hello World!"
example_2 =
term = "straße"
text = "MONUMENTENSTRASSE 42"
match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new)
term.length . should_equal 6
match.length . should_equal 7
example_3 =
ligatures = "ffiffl"
ligatures.length . should_equal 2
term_1 = "IFF"
match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new)
term_1.length . should_equal 3
match_1.length . should_equal 2
term_2 = "ffiffl"
match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new)
term_2.length . should_equal 6
match_2.length . should_equal 2
match_1 . should_equal match_2
example_4 =
"Hello World!".location_of_all "J" . should_equal []
"Hello World!".location_of_all "o" . map .start . should_equal [4, 7]
example_5 =
term = "strasse"
text = "MONUMENTENSTRASSE ist eine große Straße."
match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new)
term.length . should_equal 7
match . map .length . should_equal [7, 6]
example_6 =
ligatures = "ffifflFFIFF"
ligatures.length . should_equal 7
match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new)
match_1 . map .length . should_equal [2, 3]
match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new)
match_2 . map .length . should_equal [2, 5]
# Put them in blocks to avoid name clashes.
example_1
example_2
example_3
example_4
example_5
example_6
Test.specify "should allow to find location_of occurrences within a text" <|
"Hello World!".location_of_all "J" . should_equal []
"Hello World!".location_of_all "o" . map .start . should_equal [4, 7]
accents = 'a\u{301}e\u{301}o\u{301}'
accents.location_of accent_1 . should_equal (Span (Range 1 2) accents)
"".location_of "foo" . should_equal Nothing
"".location_of "foo" mode=Matching_Mode.Last . should_equal Nothing
"".location_of_all "foo" . should_equal []
"".location_of "" . should_equal (Span (Range 0 0) "")
"".location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
"".location_of_all "" . should_equal [Span (Range 0 0) ""]
abc = 'A\u{301}ßC'
abc.location_of "" . should_equal (Span (Range 0 0) abc)
abc.location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)
abc.location_of_all "" . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]
Test.specify "should allow case insensitive matching in location_of" <|
hello = "Hello WORLD!"
case_insensitive = Text_Matcher Case_Insensitive.new
hello.location_of "world" . should_equal Nothing
hello.location_of "world" matcher=case_insensitive . should_equal (Span (Range 6 11) hello)
hello.location_of "o" mode=Mode.First matcher=case_insensitive . should_equal (Span (Range 4 5) hello)
hello.location_of "o" mode=Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 7 8) hello)
accents = 'A\u{301}E\u{301}O\u{301}'
accents.location_of accent_1 matcher=case_insensitive . should_equal (Span (Range 1 2) accents)
"Strasse".location_of "ß" matcher=case_insensitive . should_equal (Span (Range 4 6) "Strasse")
"Monumentenstraße 42".location_of "STRASSE" matcher=case_insensitive . should_equal (Span (Range 10 16) "Monumentenstraße 42")
'\u0390'.location_of '\u03B9\u0308\u0301' matcher=case_insensitive . should_equal (Span (Range 0 1) '\u0390')
'ԵՒ'.location_of 'և' . should_equal Nothing
'ԵՒ'.location_of 'և' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ԵՒ')
'և'.location_of 'ԵՒ' matcher=case_insensitive . should_equal (Span (Range 0 1) 'և')
ligatures = 'ffafffiflffifflſtstZ'
ligatures.location_of 'FFI' matcher=case_insensitive . should_equal (Span (Range 3 5) ligatures)
ligatures.location_of 'FF' matcher=case_insensitive . should_equal (Span (Range 0 2) ligatures)
ligatures.location_of 'ff' matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 7 8) ligatures)
ligatures.location_of_all 'ff' . should_equal [Span (Range 0 2) ligatures]
ligatures.location_of_all 'FF' matcher=case_insensitive . should_equal [Span (Range 0 2) ligatures, Span (Range 3 4) ligatures, Span (Range 6 7) ligatures, Span (Range 7 8) ligatures]
ligatures.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 3 5) ligatures, Span (Range 6 7) ligatures]
'fffi'.location_of_all 'ff' matcher=case_insensitive . should_equal [Span (Range 0 2) 'fffi']
'fffi'.location_of_all 'ffi' . should_equal []
'fffi'.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 1 4) 'fffi']
'FFFI'.location_of 'ffi' matcher=case_insensitive . should_equal (Span (Range 1 4) 'FFFI')
'ffiffl'.location_of 'IF' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ffiffl')
'ffiffl'.location_of 'F' Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 1 2) 'ffiffl')
'ffiffl'.location_of_all 'F' matcher=case_insensitive . should_equal [Span (Range 0 1) 'ffiffl', Span (Range 0 1) 'ffiffl', Span (Range 1 2) 'ffiffl', Span (Range 1 2) 'ffiffl']
'aaffibb'.location_of_all 'af' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaffibb']
'aaffibb'.location_of_all 'affi' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaffibb']
'aaffibb'.location_of_all 'ib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaffibb']
'aaffibb'.location_of_all 'ffib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaffibb']
"".location_of "foo" matcher=case_insensitive . should_equal Nothing
"".location_of "foo" matcher=case_insensitive mode=Matching_Mode.Last . should_equal Nothing
"".location_of_all "foo" matcher=case_insensitive . should_equal []
"".location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) "")
"".location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
"".location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) ""]
abc = 'A\u{301}ßC'
abc.location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) abc)
abc.location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)
abc.location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]
Test.specify "should allow regexes in location_of" <|
hello = "Hello World!"
regex = Regex_Matcher.new
regex_insensitive = Regex_Matcher.new case_sensitive=Case_Insensitive.new
hello.location_of ".o" Matching_Mode.First matcher=regex . should_equal (Span (Range 3 5) hello)
hello.location_of ".o" Matching_Mode.Last matcher=regex . should_equal (Span (Range 6 8) hello)
hello.location_of_all ".o" matcher=regex . map .start . should_equal [3, 6]
"foobar".location_of "BAR" Mode.First matcher=regex_insensitive . should_equal (Span (Range 3 6) "foobar")
## Regex matching does not do case folding
"Strasse".location_of "ß" Mode.First matcher=regex_insensitive . should_equal Nothing
## But it should handle the Unicode normalization
accents = 'a\u{301}e\u{301}o\u{301}'
accents.location_of accent_1 Mode.First matcher=regex . should_equal (Span (Range 1 2) accents)
Test.specify "should correctly handle regex edge cases in location_of" pending="Figure out how to make Regex correctly handle empty patterns." <|
regex = Regex_Matcher.new
"".location_of "foo" matcher=regex . should_equal Nothing
"".location_of "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing
"".location_of_all "foo" matcher=regex . should_equal []
"".location_of "" matcher=regex . should_equal (Span (Range 0 0) "")
"".location_of_all "" matcher=regex . should_equal [Span (Range 0 0) ""]
"".location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
abc = 'A\u{301}ßC'
abc.location_of "" matcher=regex . should_equal (Span (Range 0 0) abc)
abc.location_of_all "" matcher=regex . should_equal [Span (Range 0 0) abc, Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]
abc.location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)
Test.group "Regex matching" <| Test.group "Regex matching" <|
Test.specify "should be possible on text" <| Test.specify "should be possible on text" <|
match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First

View File

@ -128,3 +128,4 @@ spec = Test.group "Examples" <|
match.groups.length . should_equal 5 match.groups.length . should_equal 5
match.named_groups.size . should_equal 2 match.named_groups.size . should_equal 2
main = Test.Suite.run_main here.spec

View File

@ -34,6 +34,7 @@ import project.Data.Text_Spec
import project.Data.Time.Spec as Time_Spec import project.Data.Time.Spec as Time_Spec
import project.Data.Vector_Spec import project.Data.Vector_Spec
import project.Data.Text.Regex_Spec import project.Data.Text.Regex_Spec
import project.Data.Text.Utils_Spec
import project.Data.Text.Default_Regex_Engine_Spec import project.Data.Text.Default_Regex_Engine_Spec
import project.Data.Text.Matching_Spec import project.Data.Text.Matching_Spec
import project.Data.Text.Span_Spec import project.Data.Text.Span_Spec
@ -87,6 +88,7 @@ main = Test.Suite.run_main <|
Runtime_Spec.spec Runtime_Spec.spec
Span_Spec.spec Span_Spec.spec
Stack_Traces_Spec.spec Stack_Traces_Spec.spec
Utils_Spec.spec
Text_Spec.spec Text_Spec.spec
Time_Spec.spec Time_Spec.spec
Uri_Spec.spec Uri_Spec.spec