Data analysts should be able to use `Text.location_of` to find indices within a string using various matchers (#3324)

Implements https://www.pivotaltracker.com/n/projects/2539304/stories/181266029

parent 3ef18ab5b8
commit 247b284316
@@ -63,6 +63,7 @@
- [Implemented `Bool.compare_to` method][3317]
- [Implemented `Map.first`, `Map.last` functions. Expanded `Table.group_by` to
  also compute mode, percentile, minimum, maximum.][3318]
- [Implemented `Text.location_of` and `Text.location_of_all` methods.][3324]

[debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug

@@ -100,7 +101,8 @@
[3236]: https://github.com/enso-org/enso/pull/3236
[3311]: https://github.com/enso-org/enso/pull/3311
[3317]: https://github.com/enso-org/enso/pull/3317
-[3317]: https://github.com/enso-org/enso/pull/3318
+[3318]: https://github.com/enso-org/enso/pull/3318
[3324]: https://github.com/enso-org/enso/pull/3324

#### Enso Compiler
@@ -5,9 +5,11 @@ from Standard.Builtins import Text, Prim_Text_Helpers

import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Matching_Mode
import Standard.Base.Data.Text.Case
import Standard.Base.Data.Text.Location
import Standard.Base.Data.Text.Line_Ending_Style
from Standard.Base.Data.Text.Span as Span_Module import Span
import Standard.Base.Data.Text.Split_Kind
import Standard.Base.Data.Text.Text_Sub_Range
import Standard.Base.Data.Locale

@@ -15,6 +17,7 @@ import Standard.Base.Meta

from Standard.Builtins export Text

export Standard.Base.Data.Text.Matching_Mode
export Standard.Base.Data.Text.Case
export Standard.Base.Data.Text.Location
export Standard.Base.Data.Text.Split_Kind

@@ -546,7 +549,7 @@ Text.== that = if Meta.is_same_object this Text then Meta.is_same_object that Te
         (('É' . equals_ignore_case 'é') && ('é' . equals_ignore_case 'e\u0301')) == True
Text.equals_ignore_case : Text -> Locale -> Boolean
Text.equals_ignore_case that locale=Locale.default =
-    (this.to_case_insensitive_key locale) == (that.to_case_insensitive_key locale)
+    Text_Utils.equals_ignore_case this that locale.java_locale

## ADVANCED
   PRIVATE

@@ -555,7 +558,7 @@ Text.equals_ignore_case that locale=Locale.default =
   used to perform case-insensitive comparisons.
Text.to_case_insensitive_key : Locale -> Text
Text.to_case_insensitive_key locale=Locale.default =
-    this.to_case Case.Lower locale . to_case Case.Upper locale
+    Text_Utils.case_insensitive_key this locale.java_locale

## Compare two texts to discover their ordering.

@@ -895,7 +898,7 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
    Text_Matcher case_sensitivity -> case case_sensitivity of
        True -> Text_Utils.contains this term
        Case_Insensitive locale ->
-            Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale)
+            Text_Utils.contains_case_insensitive this term locale.java_locale
    Regex_Matcher _ _ _ _ _ ->
        compiled_pattern = matcher.compile term
        match = compiled_pattern.match this Mode.First
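The hunks above replace the Enso-level lower/upper round trip with the ICU case-folding helpers added elsewhere in this commit (`Text_Utils.equals_ignore_case`, `contains_case_insensitive`, `case_insensitive_key`). As a rough, standalone Java sketch of what Unicode case folding does (illustrative only, not code from this change):

    import com.ibm.icu.text.CaseMap;

    public class FoldExample {
      public static void main(String[] args) {
        // Unicode case folding, the operation the new helpers delegate to.
        // Both spellings fold to the same key, so they compare as equal
        // in case-insensitive mode even though their lengths differ.
        CaseMap.Fold fold = CaseMap.fold();
        String a = fold.apply("Straße");   // "strasse"
        String b = fold.apply("STRASSE");  // "strasse"
        System.out.println(a.equals(b));   // true
      }
    }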
@@ -952,27 +955,6 @@ Text.repeat count=1 =
       https://www.pivotaltracker.com/story/show/181435598
    0.up_to (count.max 0) . fold "" acc-> _-> acc + this

-## PRIVATE
-   Utility function taking a range pointing at grapheme clusters and converting to a range on the underlying code points
-range_to_char_indices : Text -> Range -> Range ! Index_Out_Of_Bounds_Error
-range_to_char_indices text range =
-    len = text.length
-    start = if range.start < 0 then range.start + len else range.start
-    end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end)
-    is_valid = (Range 0 len+1).contains
-
-    case (Pair (is_valid start) (is_valid end)) of
-        Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len)
-        Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len)
-        Pair True True ->
-            if start>=end then (Range 0 0) else
-                iterator = BreakIterator.getCharacterInstance
-                iterator.setText text
-
-                start_index = iterator.next start
-                end_index = iterator.next (end - start)
-                Range start_index end_index
-
## ALIAS first, last, left, right, mid, substring
   Creates a new Text by selecting the specified range of the input.

@@ -1009,7 +991,7 @@ range_to_char_indices text range =
Text.take : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
Text.take range =
    char_range = case range of
-        Range _ _ -> here.range_to_char_indices this range
+        Range _ _ -> Span_Module.range_to_char_indices this range
        _ -> range.to_char_range this
    Text_Utils.substring this char_range.start char_range.end

@@ -1049,7 +1031,7 @@ Text.take range =
Text.drop : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
Text.drop range =
    char_range = case range of
-        Range _ _ -> here.range_to_char_indices this range
+        Range _ _ -> Span_Module.range_to_char_indices this range
        _ -> range.to_char_range this
    if char_range.start == 0 then Text_Utils.drop_first this char_range.end else
        prefix = Text_Utils.substring this 0 char_range.start
@@ -1184,3 +1166,204 @@ Text.trim where=Location.Both what=_.is_whitespace =
            loop current break_iterator.previous
    if start_index >= end_index then "" else
        Text_Utils.substring this start_index end_index

## ALIAS find, index_of, position_of, span_of
   Find the location of the `term` in the input.
   Returns a Span representing the location at which the term was found, or
   `Nothing` if the term was not found in the input.

   Arguments:
   - term: The term to find.
   - mode: Specifies if the first or last occurrence of the term should be
     returned if there are multiple occurrences within the input. The first
     occurrence is returned by default.
   - matcher: Specifies how the term is matched against the input:
     - If a `Text_Matcher`, the text is compared using the case-sensitivity
       rules specified in the matcher.
     - If a `Regex_Matcher`, the `term` is used as a regular expression and
       matched using the associated options.

   ! What is a Character?
     A character is defined as an Extended Grapheme Cluster, see Unicode
     Standard Annex 29. This is the smallest unit that still has semantic
     meaning in most text-processing applications.

   > Example
     Finding the location of a substring.

         "Hello World!".location_of "J" == Nothing
         "Hello World!".location_of "o" == Span (Range 4 5) "Hello World!"
         "Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 7 8) "Hello World!"

   ! Match Length
     The function returns not only the index of the match but a `Span` instance
     which contains both the start and end indices, so the length of the match
     can be determined. This is useful not only with regex matches (where a
     regular expression can have matches of various lengths) but also for case
     insensitive matching. In case insensitive mode, a single character can
     match multiple characters, for example `ß` will match `ss` and `SS`, and
     the ligature `ﬃ` will match `ffi` or `f` etc. Thus in case insensitive
     mode, the length of the match can be shorter or longer than the term that
     was being matched, so it is extremely important not to rely on the length
     of the matched term when analysing the matches, as they may have different
     lengths.

   > Example
     Match length differences in case insensitive matching.

         term = "straße"
         text = "MONUMENTENSTRASSE 42"
         match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new)
         term.length == 6
         match.length == 7

   ! Matching Grapheme Clusters
     In case insensitive mode, a single character can match multiple characters,
     for example `ß` will match `ss` and `SS`, and the ligature `ﬃ` will match
     `ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to
     match only a part of some single grapheme cluster, for example in the text
     `ﬃa` the term `ia` will match just one-third of the first grapheme `ﬃ`.
     Since we do not have the resolution to distinguish such partial matches
     (as that would require non-integer indices), a match which matched just
     a part of some grapheme cluster is extended and treated as if it matched
     the whole grapheme cluster.

   > Example
     Extending matches to full grapheme clusters.

         ligatures = "ﬃﬄ"
         ligatures.length == 2
         term_1 = "IFF"
         match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new)
         term_1.length == 3
         match_1.length == 2
         term_2 = "ffiffl"
         match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new)
         term_2.length == 6
         match_2.length == 2
         # After being extended to full grapheme clusters, both terms "IFF" and "ffiffl" match the same span of grapheme clusters.
         match_1 == match_2
Text.location_of : Text -> (Matching_Mode.First | Matching_Mode.Last) -> Matcher -> Span | Nothing
Text.location_of term="" mode=Matching_Mode.First matcher=Text_Matcher.new = case matcher of
    Text_Matcher case_sensitive -> case case_sensitive of
        True ->
            codepoint_span = case mode of
                Matching_Mode.First -> Text_Utils.span_of this term
                Matching_Mode.Last -> Text_Utils.last_span_of this term
            if codepoint_span.is_nothing then Nothing else
                start = Text_Utils.utf16_index_to_grapheme_index this codepoint_span.start
                ## While the codepoint_span may have different code unit length
                   from our term, the `length` counted in grapheme clusters is
                   guaranteed to be the same.
                end = start + term.length
                Span (Range start end) this
        Case_Insensitive locale -> case term.is_empty of
            True -> case mode of
                Matching_Mode.First -> Span (Range 0 0) this
                Matching_Mode.Last ->
                    end = this.length
                    Span (Range end end) this
            False ->
                search_for_last = case mode of
                    Matching_Mode.First -> False
                    Matching_Mode.Last -> True
                case Text_Utils.span_of_case_insensitive this term locale.java_locale search_for_last of
                    Nothing -> Nothing
                    grapheme_span ->
                        Span (Range grapheme_span.start grapheme_span.end) this
    Regex_Matcher _ _ _ _ _ -> case mode of
        Matching_Mode.First ->
            case matcher.compile term . match this Mode.First of
                Nothing -> Nothing
                match -> match.span 0 . to_grapheme_span
        Matching_Mode.Last ->
            case matcher.compile term . match this Mode.All of
                Nothing -> Nothing
                matches -> matches.last.span 0 . to_grapheme_span
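The documentation above defines a "character" as an extended grapheme cluster, while Java strings (and hence the underlying helpers) count UTF-16 code units. A small standalone sketch using the same ICU `BreakIterator` mechanism this commit relies on, showing how the two measures differ (illustrative only, not code from this change):

    import com.ibm.icu.text.BreakIterator;

    public class GraphemeCount {
      public static void main(String[] args) {
        // 'e' followed by a combining acute accent: one user-perceived
        // character, but two UTF-16 code units.
        String text = "He\u0301llo";
        System.out.println(text.length()); // 6 code units

        // Count extended grapheme clusters with ICU.
        BreakIterator it = BreakIterator.getCharacterInstance();
        it.setText(text);
        int graphemes = 0;
        while (it.next() != BreakIterator.DONE) {
          graphemes++;
        }
        System.out.println(graphemes); // 5 grapheme clusters
      }
    }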
## ALIAS find_all, index_of_all, position_of_all, span_of_all
   Finds all the locations of the `term` in the input.
   If not found, the function returns an empty Vector.

   Arguments:
   - term: The term to find.
   - matcher: Specifies how the term is matched against the input:
     - If a `Text_Matcher`, the text is compared using the case-sensitivity
       rules specified in the matcher.
     - If a `Regex_Matcher`, the `term` is used as a regular expression and
       matched using the associated options.

   ! What is a Character?
     A character is defined as an Extended Grapheme Cluster, see Unicode
     Standard Annex 29. This is the smallest unit that still has semantic
     meaning in most text-processing applications.

   > Example
     Finding the locations of all occurrences of a substring.

         "Hello World!".location_of_all "J" == []
         "Hello World!".location_of_all "o" . map .start == [4, 7]

   ! Match Length
     The function returns not only the index of the match but a `Span` instance
     which contains both the start and end indices, so the length of the match
     can be determined. This is useful not only with regex matches (where a
     regular expression can have matches of various lengths) but also for case
     insensitive matching. In case insensitive mode, a single character can
     match multiple characters, for example `ß` will match `ss` and `SS`, and
     the ligature `ﬃ` will match `ffi` or `f` etc. Thus in case insensitive
     mode, the length of the match can be shorter or longer than the term that
     was being matched, so it is extremely important not to rely on the length
     of the matched term when analysing the matches, as they may have different
     lengths.

   > Example
     Match length differences in case insensitive matching.

         term = "strasse"
         text = "MONUMENTENSTRASSE ist eine große Straße."
         match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new)
         term.length == 7
         match . map .length == [7, 6]

   ! Matching Grapheme Clusters
     In case insensitive mode, a single character can match multiple characters,
     for example `ß` will match `ss` and `SS`, and the ligature `ﬃ` will match
     `ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to
     match only a part of some single grapheme cluster, for example in the text
     `ﬃa` the term `ia` will match just one-third of the first grapheme `ﬃ`.
     Since we do not have the resolution to distinguish such partial matches
     (as that would require non-integer indices), a match which matched just
     a part of some grapheme cluster is extended and treated as if it matched
     the whole grapheme cluster.

   > Example
     Extending matches to full grapheme clusters.

         ligatures = "ﬃﬄFFIFF"
         ligatures.length == 7
         match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new)
         match_1 . map .length == [2, 3]
         match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new)
         match_2 . map .length == [2, 5]
Text.location_of_all : Text -> Matcher -> [Span]
Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
    Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of
        True ->
            codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term
            grapheme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
            ## While the codepoint_spans may have different code unit lengths
               from our term, the `length` counted in grapheme clusters is
               guaranteed to be the same.
            offset = term.length
            grapheme_ixes . map start->
                end = start+offset
                Span (Range start end) this
        Case_Insensitive locale ->
            grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
            grapheme_spans.map grapheme_span->
                Span (Range grapheme_span.start grapheme_span.end) this
    Regex_Matcher _ _ _ _ _ ->
        case matcher.compile term . match this Mode.All of
            Nothing -> []
            matches -> matches.map m-> m.span 0 . to_grapheme_span
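The Match Length notes above come down to the fact that the search runs over case-folded text, where the needle and the matched region can have different lengths. A standalone ICU sketch mirroring the straße/STRASSE numbers from the documentation (illustrative only; the real implementation is `Text_Utils.span_of_case_insensitive` below):

    import com.ibm.icu.text.CaseMap;
    import com.ibm.icu.text.StringSearch;

    public class CaseInsensitiveLength {
      public static void main(String[] args) {
        CaseMap.Fold fold = CaseMap.fold();
        String term = "straße";                        // 6 characters
        String text = "MONUMENTENSTRASSE 42";

        // Fold both sides, then search in the folded text.
        String foldedTerm = fold.apply(term);          // "strasse" - 7 characters
        String foldedText = fold.apply(text);          // "monumentenstrasse 42"

        StringSearch search = new StringSearch(foldedTerm, foldedText);
        int pos = search.first();
        System.out.println(pos);                       // 10
        System.out.println(search.getMatchLength());   // 7, not 6
      }
    }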
@@ -0,0 +1,5 @@
## Matches the first found instance.
type First

## Matches the last found instance.
type Last
@ -40,7 +40,7 @@ import Standard.Base.Data.Text.Regex.Engine
|
||||
import Standard.Base.Data.Text.Regex.Option as Global_Option
|
||||
import Standard.Base.Data.Text.Regex.Mode
|
||||
import Standard.Base.Polyglot.Java as Java_Ext
|
||||
import Standard.Base.Data.Text.Span
|
||||
from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span
|
||||
|
||||
from Standard.Builtins import Java
|
||||
|
||||
@ -183,8 +183,13 @@ type Pattern
|
||||
on the encoding, we normalize all input.
|
||||
build_matcher : Text -> Integer -> Integer -> Java_Matcher
|
||||
build_matcher input start end =
|
||||
normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else
|
||||
Text_Utils.normalize input
|
||||
## TODO [RW] Normalization had to be disabled - since start and end are
|
||||
in code unit space, normalization could shift these indices!
|
||||
This should be addressed when reviewing
|
||||
See: https://www.pivotaltracker.com/story/show/181524498
|
||||
#normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else
|
||||
# Text_Utils.normalize input
|
||||
normalized_input = input
|
||||
internal_matcher = this.internal_pattern.matcher normalized_input . region start end
|
||||
|
||||
if this.options.contains No_Anchoring_Bounds then
|
||||
@ -262,7 +267,7 @@ type Pattern
|
||||
internal_matcher = this.build_matcher input start end
|
||||
|
||||
if internal_matcher . find start . not then Nothing else
|
||||
Match internal_matcher start end
|
||||
Match internal_matcher start end input
|
||||
Integer ->
|
||||
if mode < 0 then Panic.throw <|
|
||||
Mode_Error "Cannot match a negative number of times."
|
||||
@ -272,13 +277,16 @@ type Pattern
|
||||
go : Integer -> Integer -> Nothing
|
||||
go offset remaining_count =
|
||||
should_continue = remaining_count > 0
|
||||
if should_continue.not || (offset > end) then Nothing else
|
||||
if should_continue.not || (offset >= end) then Nothing else
|
||||
internal_matcher = this.build_matcher input start end
|
||||
found = internal_matcher.find offset
|
||||
|
||||
if found.not then Nothing else
|
||||
builder.append (Match internal_matcher start end)
|
||||
@Tail_Call go (internal_matcher.end 0) remaining_count-1
|
||||
builder.append (Match internal_matcher start end input)
|
||||
match_end = internal_matcher.end 0
|
||||
# Ensure progress even if the match is an empty string.
|
||||
new_offset = if match_end > offset then match_end else offset+1
|
||||
@Tail_Call go new_offset remaining_count-1
|
||||
|
||||
go start mode
|
||||
vector = builder.to_vector
|
||||
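The loop above now appends the match and then advances to `new_offset`, stepping one code unit forward whenever the match was empty. A standalone `java.util.regex` sketch of why that manual step is needed — without it, `find` keeps returning the same zero-width match (this mirrors the idea of the fix, it is not the engine code itself):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class EmptyMatchProgress {
      public static void main(String[] args) {
        Matcher m = Pattern.compile("a*").matcher("bab");
        int offset = 0;
        while (offset <= "bab".length() && m.find(offset)) {
          System.out.println(m.start() + ".." + m.end()); // 0..0, 1..2, 2..2, 3..3
          // If the match was empty, m.end() == offset; reusing it as the next
          // offset would find the same empty match forever, so step past it.
          offset = (m.end() > offset) ? m.end() : offset + 1;
        }
      }
    }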
@ -294,8 +302,11 @@ type Pattern
|
||||
found = internal_matcher.find offset
|
||||
|
||||
if found.not then Nothing else
|
||||
builder.append (Match internal_matcher start end)
|
||||
@Tail_Call go (internal_matcher.end 0)
|
||||
builder.append (Match internal_matcher start end input)
|
||||
match_end = internal_matcher.end 0
|
||||
# Ensure progress even if the match is an empty string.
|
||||
new_offset = if match_end > offset then match_end else offset+1
|
||||
@Tail_Call go new_offset
|
||||
|
||||
go start
|
||||
vector = builder.to_vector
|
||||
@ -304,7 +315,7 @@ type Pattern
|
||||
Mode.Full ->
|
||||
internal_matcher = this.build_matcher input start end
|
||||
if internal_matcher.matches.not then Nothing else
|
||||
Match internal_matcher start end
|
||||
Match internal_matcher start end input
|
||||
Mode.Bounded _ _ _ -> Panic.throw <|
|
||||
Mode_Error "Modes cannot be recursive."
|
||||
|
||||
@ -312,7 +323,7 @@ type Pattern
|
||||
Mode.Bounded start end sub_mode ->
|
||||
if start < end then do_match_mode sub_mode start end else
|
||||
Panic.throw Invalid_Bounds_Error
|
||||
_ -> do_match_mode mode 0 input.length
|
||||
_ -> do_match_mode mode 0 (Text_Utils.char_length input)
|
||||
|
||||
## ADVANCED
|
||||
|
||||
@ -334,7 +345,7 @@ type Pattern
|
||||
pattern.matches input
|
||||
matches : Text -> Boolean
|
||||
matches input = case this.match input mode=Mode.Full of
|
||||
Match _ _ _ -> True
|
||||
Match _ _ _ _ -> True
|
||||
Vector.Vector _ -> True
|
||||
_ -> False
|
||||
|
||||
@ -405,7 +416,7 @@ type Pattern
|
||||
find input mode=Mode.All =
|
||||
matches = this.match input mode
|
||||
case matches of
|
||||
Match _ _ _ -> matches.group 0
|
||||
Match _ _ _ _ -> matches.group 0
|
||||
Vector.Vector _ -> matches.map (_.group 0)
|
||||
_ -> matches
|
||||
|
||||
@ -548,7 +559,7 @@ type Pattern
|
||||
internal_matcher.replaceAll replacement
|
||||
Mode.Full ->
|
||||
case this.match input mode=Mode.Full of
|
||||
Match _ _ _ -> replacement
|
||||
Match _ _ _ _ -> replacement
|
||||
Nothing -> input
|
||||
Mode.Bounded _ _ _ -> Panic.throw <|
|
||||
Mode_Error "Modes cannot be recursive."
|
||||
@ -556,7 +567,7 @@ type Pattern
|
||||
case mode of
|
||||
Mode.Bounded _ _ _ -> Panic.throw <|
|
||||
Mode_Error "Bounded replacements are not well-formed."
|
||||
_ -> do_replace_mode mode 0 input.length
|
||||
_ -> do_replace_mode mode 0 (Text_Utils.char_length input)
|
||||
|
||||
## The default implementation of the `Data.Text.Regex.Engine.Match` interface.
|
||||
type Match
|
||||
@ -570,7 +581,8 @@ type Match
|
||||
match.
|
||||
- region_start: The start of the region over which the match was made.
|
||||
- region_end: The end of the region over which the match was made.
|
||||
type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer)
|
||||
- input: The input text that was being matched.
|
||||
type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) (input : Text)
|
||||
|
||||
## Gets the text matched by the group with the provided identifier, or
|
||||
`Nothing` if the group did not participate in the match. If no such group
|
||||
@ -743,10 +755,10 @@ type Match
|
||||
example_Span =
|
||||
match = Examples.match
|
||||
match.span 0
|
||||
span : Integer | Text -> Span | Nothing ! Regex.No_Such_Group_Error
|
||||
span : Integer | Text -> Utf_16_Span | Nothing ! Regex.No_Such_Group_Error
|
||||
span id = case this.group id of
|
||||
Nothing -> Nothing
|
||||
_ -> Span.new (this.start id) (this.end id) (this.group 0)
|
||||
_ -> Utf_16_Span (Range (this.start id) (this.end id)) this.input
|
||||
|
||||
## Returns the start character index of the match's region.
|
||||
|
||||
|
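The `span` method above now returns a `Utf_16_Span`, and `Match` carries the original input, because the underlying Java regex engine reports offsets in UTF-16 code units rather than grapheme clusters. A small standalone sketch of that behaviour (assumed example, not part of the change):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class RegexCodeUnits {
      public static void main(String[] args) {
        // An accented "a" written as 'a' + combining acute, followed by "e":
        // the regex engine reports offsets in UTF-16 code units.
        String input = "a\u0301e";
        Matcher m = Pattern.compile("e").matcher(input);
        m.find();
        System.out.println(m.start()); // 2 - the third code unit...
        // ...but only the second grapheme cluster, which is why the span is
        // first exposed as a Utf_16_Span and then converted with to_grapheme_span.
      }
    }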
@@ -4,11 +4,13 @@
   to matching on the `Full` content of the input text.

from Standard.Base import all
+from Standard.Base.Data.Text.Matching_Mode import First
+from Standard.Base.Data.Text.Matching_Mode export First

type Mode

    ## The regex will only match the first instance it finds.
-    type First
+    First

    ## The regex will match up to some `Integer` number of instances.
    Integer
@ -7,30 +7,14 @@
|
||||
|
||||
example_span =
|
||||
text = "Hello!"
|
||||
Span.new 0 3 text
|
||||
Span 0 3 text
|
||||
|
||||
from Standard.Base import all
|
||||
|
||||
import Standard.Base.Data.Range
|
||||
from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error
|
||||
|
||||
## Construct a new `Span`.
|
||||
|
||||
Arguments:
|
||||
- start: The index of the first character included in the span.
|
||||
- end: The index of the first character after `start` that is _not_ included
|
||||
in the span.
|
||||
- text: The `Text` over which the span exists. This is _optional_.
|
||||
|
||||
> Example
|
||||
Creating a span over the first three characters of the text "hello!".
|
||||
|
||||
import Standard.Base.Data.Text.Span
|
||||
|
||||
example_span =
|
||||
text = "Hello!"
|
||||
Span.new 0 3 text
|
||||
new : Integer -> Integer -> Text | Nothing -> Span
|
||||
new start end text=Nothing = Span (start.up_to end) text
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
polyglot java import com.ibm.icu.text.BreakIterator
|
||||
|
||||
type Span
|
||||
|
||||
@ -38,7 +22,7 @@ type Span
|
||||
|
||||
Arguments:
|
||||
- range: The range of characters over which the span exists.
|
||||
- text: The text over which the span exists. This is _optional_.
|
||||
- text: The text over which the span exists.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
@ -54,7 +38,7 @@ type Span
|
||||
text = "Hello!"
|
||||
range = 0.up_to 3
|
||||
Span.Span range text
|
||||
type Span (range : Range.Range) (text : (Text | Nothing) = Nothing)
|
||||
type Span (range : Range.Range) (text : Text)
|
||||
|
||||
## The index of the first character included in the span.
|
||||
|
||||
@ -74,3 +58,112 @@ type Span
|
||||
meaning in most text-processing applications.
|
||||
end : Integer
|
||||
end = this.range.end
|
||||
|
||||
## The length of the span in extended grapheme clusters.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
length : Integer
|
||||
length = this.range.length
|
||||
|
||||
## Converts the span of extended grapheme clusters to a corresponding span
|
||||
of UTF-16 code units.
|
||||
|
||||
> Example
|
||||
Find the span of code units corresponding to the span of extended grapheme clusters.
|
||||
|
||||
text = 'ae\u{301}fz'
|
||||
(Span (Range 1 3) text).to_utf_16_span == (Utf_16_Span (Range 1 4) text)
|
||||
to_utf_16_span : Utf_16_Span
|
||||
to_utf_16_span =
|
||||
Utf_16_Span (here.range_to_char_indices this.text this.range) this.text
|
||||
|
||||
type Utf_16_Span
|
||||
|
||||
## A representation of a span of UTF-16 code units in Enso's `Text` type.
|
||||
|
||||
Arguments:
|
||||
- range: The range of code units over which the span exists.
|
||||
- text: The text over which the span exists.
|
||||
|
||||
> Example
|
||||
Creating a span over the first three code units of the text 'a\u{301}bc'.
|
||||
|
||||
import Standard.Base.Data.Text.Span
|
||||
|
||||
example_span =
|
||||
text = 'a\u{301}bc'
|
||||
Span.Utf_16_Span (Range 0 3) text
|
||||
type Utf_16_Span (range : Range.Range) (text : Text)
|
||||
|
||||
## The index of the first code unit included in the span.
|
||||
start : Integer
|
||||
start = this.range.start
|
||||
|
||||
## The index of the first code unit after `start` that is _not_ included in
|
||||
the span.
|
||||
end : Integer
|
||||
end = this.range.end
|
||||
|
||||
## The length of the span in UTF-16 code units.
|
||||
length : Integer
|
||||
length = this.range.length
|
||||
|
||||
## Returns a span of extended grapheme clusters which is the closest
|
||||
approximation of this span of code units.
|
||||
|
||||
The resulting span is extended in such a way that every code unit that
|
||||
was contained by the original span is also contained in the new span. Since
|
||||
some grapheme clusters consist of multiple code units, after the span was
|
||||
extended it may also contain code units which were not contained inside
|
||||
of the original span.
|
||||
|
||||
> Example
|
||||
Convert a codepoint span to graphemes and back.
|
||||
|
||||
text = 'a\u{301}e\u{302}o\u{303}'
|
||||
span = Utf_16_Span (Range 1 5) text # The span contains the units [\u{301}, e, \u{302}, o].
|
||||
extended = span.to_grapheme_span
|
||||
extended == Span (Range 0 3) text # The span is extended to the whole string since it contained code units from every grapheme cluster.
|
||||
extended.to_utf_16_span == Utf_16_Span (Range 0 6) text
|
||||
to_grapheme_span : Span
|
||||
to_grapheme_span = if (this.start < 0) || (this.end > Text_Utils.char_length this.text) then Error.throw (Illegal_State_Error "Utf_16_Span indices are out of range of the associated text.") else
|
||||
if this.end < this.start then Error.throw (Illegal_State_Error "Utf_16_Span invariant violation: start <= end") else
|
||||
case this.start == this.end of
|
||||
True ->
|
||||
grapheme_ix = Text_Utils.utf16_index_to_grapheme_index this.text this.start
|
||||
Span (Range grapheme_ix grapheme_ix) this.text
|
||||
False ->
|
||||
grapheme_ixes = Text_Utils.utf16_indices_to_grapheme_indices this.text [this.start, this.end - 1].to_array
|
||||
grapheme_first = grapheme_ixes.at 0
|
||||
grapheme_last = grapheme_ixes.at 1
|
||||
## We find the grapheme index of the last code unit actually contained within our span and set the
|
||||
end grapheme to the first grapheme after that. This ensures that if code units associated with
|
||||
only a part of a grapheme were contained in our original span, the resulting span will be
|
||||
extended to contain this whole grapheme.
|
||||
grapheme_end = grapheme_last + 1
|
||||
Span (Range grapheme_first grapheme_end) this.text
|
||||
|
||||
## PRIVATE
|
||||
Utility function taking a range pointing at grapheme clusters and converting
|
||||
to a range on the underlying code units.
|
||||
range_to_char_indices : Text -> Range -> Range ! Index_Out_Of_Bounds_Error
|
||||
range_to_char_indices text range =
|
||||
len = text.length
|
||||
start = if range.start < 0 then range.start + len else range.start
|
||||
end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end)
|
||||
is_valid = (Range 0 len+1).contains
|
||||
|
||||
case (Pair (is_valid start) (is_valid end)) of
|
||||
Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len)
|
||||
Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len)
|
||||
Pair True True ->
|
||||
if start>=end then (Range 0 0) else
|
||||
iterator = BreakIterator.getCharacterInstance
|
||||
iterator.setText text
|
||||
|
||||
start_index = iterator.next start
|
||||
end_index = iterator.next (end - start)
|
||||
Range start_index end_index
|
||||
|
@ -79,24 +79,24 @@ type Text_Sub_Range
|
||||
Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text)
|
||||
Before delimiter ->
|
||||
if delimiter.is_empty then (Range 0 0) else
|
||||
index = Text_Utils.index_of text delimiter
|
||||
if index == -1 then (Range 0 (Text_Utils.char_length text)) else
|
||||
(Range 0 index)
|
||||
span = Text_Utils.span_of text delimiter
|
||||
if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
|
||||
(Range 0 span.start)
|
||||
Before_Last delimiter ->
|
||||
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
|
||||
index = Text_Utils.last_index_of text delimiter
|
||||
if index == -1 then (Range 0 (Text_Utils.char_length text)) else
|
||||
(Range 0 index)
|
||||
span = Text_Utils.last_span_of text delimiter
|
||||
if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
|
||||
(Range 0 span.start)
|
||||
After delimiter ->
|
||||
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
|
||||
index = Text_Utils.index_of text delimiter
|
||||
if index == -1 then (Range 0 0) else
|
||||
(Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text))
|
||||
span = Text_Utils.span_of text delimiter
|
||||
if span.is_nothing then (Range 0 0) else
|
||||
(Range span.end (Text_Utils.char_length text))
|
||||
After_Last delimiter ->
|
||||
if delimiter.is_empty then (Range 0 0) else
|
||||
index = Text_Utils.last_index_of text delimiter
|
||||
if index == -1 then (Range 0 0) else
|
||||
(Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text))
|
||||
span = Text_Utils.last_span_of text delimiter
|
||||
if span.is_nothing then (Range 0 0) else
|
||||
(Range span.end (Text_Utils.char_length text))
|
||||
While predicate ->
|
||||
indices = find_sub_range_end text _-> start-> end->
|
||||
predicate (Text_Utils.substring text start end) . not
|
||||
|
@@ -1,7 +1,7 @@
akka {
  loggers = ["akka.event.slf4j.Slf4jLogger"]
  logging-filter = "akka.event.slf4j.Slf4jLoggingFilter"
-  version = "2.6.6"
+  version = "2.6.18"
  stdout-loglevel = "ERROR"
}
@@ -1,11 +1,19 @@
package org.enso.base;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.CaseMap.Fold;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.StringSearch;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.enso.base.text.CaseFoldedString;
import org.enso.base.text.GraphemeSpan;
import org.enso.base.text.Utf16Span;

/** Utils for standard library operations on Text. */
public class Text_Utils {

@@ -117,6 +125,23 @@ public class Text_Utils {
    }
  }

  /**
   * Checks whether two strings are equal up to Unicode canonicalization and ignoring case.
   *
   * @param str1 the first string
   * @param str2 the second string
   * @param locale the locale to use for case folding
   * @return the result of comparison
   */
  public static boolean equals_ignore_case(String str1, Object str2, Locale locale) {
    if (str2 instanceof String) {
      Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
      return compare_normalized(fold.apply(str1), fold.apply((String) str2)) == 0;
    } else {
      return false;
    }
  }

  /**
   * Converts an array of codepoints into a string.
   *

@@ -176,6 +201,36 @@ public class Text_Utils {
    return searcher.first() != StringSearch.DONE;
  }

  /**
   * Checks, ignoring case, if {@code substring} is a substring of {@code string}.
   *
   * @param string the containing string.
   * @param substring the contained string.
   * @param locale the locale used for case folding.
   * @return whether {@code substring} is a substring of {@code string}.
   */
  public static boolean contains_case_insensitive(String string, String substring, Locale locale) {
    // {@code StringSearch} does not handle empty strings as we would want, so we need these special
    // cases.
    if (substring.isEmpty()) return true;
    if (string.isEmpty()) return false;

    Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
    StringSearch searcher = new StringSearch(fold.apply(substring), fold.apply(string));
    return searcher.first() != StringSearch.DONE;
  }

  /**
   * Transforms the provided string into a form which can be used for case insensitive comparisons.
   *
   * @param string the string to transform
   * @param locale the locale to use - needed to distinguish a special case when handling Turkish
   *     'i' characters
   * @return a transformed string that can be used for case insensitive comparisons
   */
  public static String case_insensitive_key(String string, Locale locale) {
    return CaseFoldedString.simpleFold(string, locale);
  }

  /**
   * Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}.
   *
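A hypothetical usage sketch of the three helpers added above — the class and method signatures are taken from this diff, the values are illustrative:

    import java.util.Locale;
    import org.enso.base.Text_Utils;

    public class CaseInsensitiveHelpersDemo {
      public static void main(String[] args) {
        Locale locale = Locale.ROOT;
        // Equality and containment both go through locale-aware case folding.
        System.out.println(Text_Utils.equals_ignore_case("STRASSE", "straße", locale));                  // true
        System.out.println(Text_Utils.contains_case_insensitive("MONUMENTENSTRASSE", "straße", locale)); // true
        // The folded key can be computed once and reused for many comparisons.
        System.out.println(Text_Utils.case_insensitive_key("Straße", locale));                           // "strasse"
      }
    }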
@@ -200,37 +255,215 @@
  }

  /**
-   * Find the first index of needle in the haystack
+   * Find the first occurrence of needle in the haystack
   *
   * @param haystack the string to search
   * @param needle the substring that is searched for
-   * @return index of the first needle or -1 if not found.
+   * @return a UTF-16 code unit span of the first needle or null if not found.
   */
-  public static long index_of(String haystack, String needle) {
+  public static Utf16Span span_of(String haystack, String needle) {
+    if (needle.isEmpty()) return new Utf16Span(0, 0);
+    if (haystack.isEmpty()) return null;
+
    StringSearch search = new StringSearch(needle, haystack);
    int pos = search.first();
-    return pos == StringSearch.DONE ? -1 : pos;
+    if (pos == StringSearch.DONE) return null;
+    return new Utf16Span(pos, pos + search.getMatchLength());
  }

  /**
-   * Find the last index of needle in the haystack
+   * Find the last occurrence of needle in the haystack
   *
   * @param haystack the string to search
   * @param needle the substring that is searched for
-   * @return index of the last needle or -1 if not found.
+   * @return a UTF-16 code unit span of the last needle or null if not found.
   */
-  public static long last_index_of(String haystack, String needle) {
+  public static Utf16Span last_span_of(String haystack, String needle) {
+    if (needle.isEmpty()) {
+      int afterLast = haystack.length();
+      return new Utf16Span(afterLast, afterLast);
+    }
+    if (haystack.isEmpty()) return null;
+
    StringSearch search = new StringSearch(needle, haystack);
-    int pos = search.first();
+    int pos = search.last();
+    if (pos == StringSearch.DONE) return null;
+    return new Utf16Span(pos, pos + search.getMatchLength());
  }

  /**
   * Find spans of all occurrences of the needle within the haystack.
   *
   * @param haystack the string to search
   * @param needle the substring that is searched for
   * @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
   */
  public static List<Utf16Span> span_of_all(String haystack, String needle) {
    if (needle.isEmpty())
      throw new IllegalArgumentException(
          "The operation `index_of_all` does not support searching for an empty term.");
    if (haystack.isEmpty()) return List.of();

    StringSearch search = new StringSearch(needle, haystack);
    ArrayList<Utf16Span> occurrences = new ArrayList<>();
    long ix;
    while ((ix = search.next()) != StringSearch.DONE) {
      occurrences.add(new Utf16Span(ix, ix + search.getMatchLength()));
    }
    return occurrences;
  }

  /**
   * Converts a UTF-16 code unit index to the index of the grapheme that this code unit belongs to.
   *
   * @param text the text associated with the index
   * @param codeunit_index the UTF-16 index
   * @return an index of an extended grapheme cluster that contains the code unit from the input
   */
  public static long utf16_index_to_grapheme_index(String text, long codeunit_index) {
    BreakIterator breakIterator = BreakIterator.getCharacterInstance();
    breakIterator.setText(text);
    if (codeunit_index < 0 || codeunit_index > text.length()) {
      throw new IndexOutOfBoundsException(
          "Index " + codeunit_index + " is outside of the provided text.");
    }

    int grapheme_end = breakIterator.next();
    long grapheme_index = 0;

    while (grapheme_end <= codeunit_index && grapheme_end != BreakIterator.DONE) {
      grapheme_index++;
      grapheme_end = breakIterator.next();
    }
    return grapheme_index;
  }

  /**
   * Converts a series of UTF-16 code unit indices to indices of graphemes that these code units
   * belong to.
   *
   * <p>For performance, it assumes that the provided indices are sorted in a non-decreasing order
   * (duplicate entries are permitted). Behaviour is unspecified if an unsorted list is provided.
   *
   * <p>The behaviour is unspecified if indices provided on the input are outside of the range [0,
   * text.length()].
   *
   * @param text the text associated with the indices
   * @param codeunit_indices the array of UTF-16 code unit indices, sorted in non-decreasing order
   * @return an array of grapheme indices corresponding to the UTF-16 units from the input
   */
  public static long[] utf16_indices_to_grapheme_indices(String text, List<Long> codeunit_indices) {
    BreakIterator breakIterator = BreakIterator.getCharacterInstance();
    breakIterator.setText(text);

    int grapheme_end = breakIterator.next();
    long grapheme_index = 0;

    long[] result = new long[codeunit_indices.size()];
    int result_ix = 0;

    for (long codeunit_index : codeunit_indices) {
      while (grapheme_end <= codeunit_index && grapheme_end != BreakIterator.DONE) {
        grapheme_index++;
        grapheme_end = breakIterator.next();
      }
      result[result_ix++] = grapheme_index;
    }

    return result;
  }
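A hypothetical usage sketch of the index-mapping helpers defined above, using the signatures from this diff; the expected values follow from the grapheme boundaries of the example string:

    import java.util.List;
    import org.enso.base.Text_Utils;

    public class GraphemeIndexDemo {
      public static void main(String[] args) {
        // "a" + combining acute, then "b", "c": 4 code units, 3 grapheme clusters.
        String text = "a\u0301bc";
        System.out.println(Text_Utils.utf16_index_to_grapheme_index(text, 1)); // 0 - still inside "a\u0301"
        System.out.println(Text_Utils.utf16_index_to_grapheme_index(text, 2)); // 1 - the "b"
        // The batch variant assumes the indices are sorted in non-decreasing order.
        long[] graphemes = Text_Utils.utf16_indices_to_grapheme_indices(text, List.of(0L, 1L, 2L, 3L));
        // graphemes == [0, 0, 1, 2]
      }
    }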

  /**
   * Find the first or last occurrence of needle in the haystack.
   *
   * @param haystack the string to search
   * @param needle the substring that is searched for
   * @param locale the locale used for case-insensitive comparisons
   * @param searchForLast if set to true, will search for the last occurrence; otherwise searches
   *     for the first one
   * @return an extended-grapheme-cluster span of the first or last needle, or null if none found.
   */
  public static GraphemeSpan span_of_case_insensitive(
      String haystack, String needle, Locale locale, boolean searchForLast) {
    if (needle.isEmpty())
      throw new IllegalArgumentException(
          "The operation `span_of_case_insensitive` does not support searching for an empty term.");
    if (haystack.isEmpty()) return null;

    CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
    String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
    StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString());
    int pos;
    if (searchForLast) {
      pos = search.last();
    } else {
      pos = search.first();
    }
    if (pos == StringSearch.DONE) {
-      return -1;
+      return null;
    } else {
      return findExtendedSpan(foldedHaystack, pos, search.getMatchLength());
    }
  }

  /**
   * Find all occurrences of needle in the haystack.
   *
   * @param haystack the string to search
   * @param needle the substring that is searched for
   * @param locale the locale used for case-insensitive comparisons
   * @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack
   */
  public static List<GraphemeSpan> span_of_all_case_insensitive(
      String haystack, String needle, Locale locale) {
    if (needle.isEmpty())
      throw new IllegalArgumentException(
          "The operation `span_of_all_case_insensitive` does not support searching for an empty term.");
    if (haystack.isEmpty()) return List.of();

    CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
    String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);

    StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString());
    ArrayList<GraphemeSpan> result = new ArrayList<>();

    int pos;
    while ((pos = search.next()) != StringSearch.DONE) {
      result.add(findExtendedSpan(foldedHaystack, pos, search.getMatchLength()));
    }
-    for (int next = search.next(); next != StringSearch.DONE; next = search.next()) {
-      pos = next;
-    }
    return result;
  }
-    return pos;

  /**
   * Finds the grapheme span corresponding to the found match indexed with code units.
   *
   * <p>It extends the found span to ensure that graphemes associated with all found code units are
   * included in the resulting span. Thus, some additional code units which were not present in the
   * original match may also be present due to the extension.
   *
   * <p>The extension to the left is trivial - we just find the grapheme associated with the first
   * code unit and even if that code unit is not the first one of that grapheme, by returning it we
   * correctly extend to the left. The extension to the right works by finding the index of the
   * grapheme associated with the last code unit actually present in the span, then the end of the
   * returned span is set to the next grapheme after it. This correctly handles the edge case where
   * only a part of some grapheme was matched.
   *
   * @param string the folded string with which the positions are associated, containing a cache of
   *     position mappings
   * @param position the position of the match (in code units)
   * @param length the length of the match (in code units)
   * @return a minimal {@code GraphemeSpan} which contains all code units from the match
   */
  private static GraphemeSpan findExtendedSpan(CaseFoldedString string, int position, int length) {
    int firstGrapheme = string.codeUnitToGraphemeIndex(position);
    if (length == 0) {
      return new GraphemeSpan(firstGrapheme, firstGrapheme);
    } else {
      int lastGrapheme = string.codeUnitToGraphemeIndex(position + length - 1);
      int endGrapheme = lastGrapheme + 1;
      return new GraphemeSpan(firstGrapheme, endGrapheme);
    }
  }

  /**
@ -0,0 +1,135 @@
|
||||
package org.enso.base.text;
|
||||
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.CaseMap;
|
||||
import com.ibm.icu.text.CaseMap.Fold;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
|
||||
* Represents a string transformed using Unicode Case Folding which can be used for case insensitive
|
||||
* comparisons.
|
||||
*
|
||||
* <p>It contains facilities for converting indices in the transformed string to corresponding
|
||||
* indices back in the original string.
|
||||
*/
|
||||
public class CaseFoldedString {
|
||||
private final String foldedString;
|
||||
|
||||
/**
|
||||
* A mapping from code units in the transformed string to their corresponding graphemes in the
|
||||
* original string.
|
||||
*
|
||||
* <p>The mapping must be valid for indices from 0 to {@code foldedString.length()+1}
|
||||
* (inclusive).
|
||||
*/
|
||||
private final int[] graphemeIndexMapping;
|
||||
|
||||
/**
 * Constructs a new instance of the folded string.
 *
 * @param foldedString the string after applying the case folding transformation
 * @param graphemeIndexMapping a mapping created during the transformation which maps code units
 *     in the transformed string to their corresponding graphemes in the original string
 */
private CaseFoldedString(String foldedString, int[] graphemeIndexMapping) {
  this.foldedString = foldedString;
  this.graphemeIndexMapping = graphemeIndexMapping;
}
|
||||
|
||||
/**
|
||||
* Maps a code unit in the folded string to the corresponding grapheme in the original string.
|
||||
*
|
||||
* @param codeunitIndex the index of the code unit in the folded string, valid indices range from
|
||||
* 0 to {@code getFoldedString().length()+1} (inclusive); this also allows asking for the
|
||||
* position of the end code unit which is located right after the end of the string - which
|
||||
* should always map to the analogous end grapheme.
|
||||
* @return the index of the grapheme from the original string that after applying the
|
||||
* transformation contains the requested code unit
|
||||
*/
|
||||
public int codeUnitToGraphemeIndex(int codeunitIndex) {
|
||||
if (codeunitIndex < 0 || codeunitIndex > this.foldedString.length()) {
|
||||
throw new IndexOutOfBoundsException(codeunitIndex);
|
||||
}
|
||||
return graphemeIndexMapping[codeunitIndex];
|
||||
}
|
||||
|
||||
/** Returns the transformed string. */
|
||||
public String getFoldedString() {
|
||||
return foldedString;
|
||||
}
|
||||
|
||||
/**
|
||||
* Folds a string remembering the mapping from code units to its original grapheme cluster
|
||||
* indices.
|
||||
*
|
||||
* @param charSequence a sequence of UTF-16 characters to transform
|
||||
* @param locale the locale to use as a reference for case folding; it is needed because Turkish
|
||||
* and Azerbaijani locales handle casing of the letter `i` in a different way than other
|
||||
* locales
|
||||
* @return a {@code CaseFoldedString} instance which contains the transformed string and can
|
||||
* map its code units to original grapheme clusters
|
||||
*/
|
||||
public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {
|
||||
BreakIterator breakIterator = BreakIterator.getCharacterInstance();
|
||||
breakIterator.setText(charSequence);
|
||||
StringBuilder stringBuilder = new StringBuilder(charSequence.length());
|
||||
Fold foldAlgorithm = caseFoldAlgorithmForLocale(locale);
|
||||
IntArrayBuilder index_mapping = new IntArrayBuilder(charSequence.length() + 1);
|
||||
|
||||
// We rely on the fact that ICU Case Folding is _not_ context-sensitive, i.e. the mapping of
|
||||
// each grapheme cluster is independent of surrounding ones. Regular casing is
|
||||
// context-sensitive.
|
||||
int current = breakIterator.current();
|
||||
int next;
|
||||
int grapheme_index = 0;
|
||||
while ((next = breakIterator.next()) != BreakIterator.DONE) {
|
||||
CharSequence grapheme = new StringSlice(charSequence, current, next);
|
||||
String foldedGrapheme = foldAlgorithm.apply(grapheme);
|
||||
stringBuilder.append(foldedGrapheme);
|
||||
for (int i = 0; i < foldedGrapheme.length(); ++i) {
|
||||
index_mapping.add(grapheme_index);
|
||||
}
|
||||
|
||||
grapheme_index++;
|
||||
current = next;
|
||||
}
|
||||
|
||||
// The mapping should also be able to handle a {@code str.length()} query, so we add one more
|
||||
// element to the mapping pointing to a non-existent grapheme after the end of the text.
|
||||
index_mapping.add(grapheme_index);
|
||||
|
||||
return new CaseFoldedString(
|
||||
stringBuilder.toString(), index_mapping.unsafeGetStorageAndInvalidateTheBuilder());
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper function which folds the string without remembering the index mapping.
|
||||
*
|
||||
* <p>It should be used when the index mapping is not needed, as its implementation is much more
|
||||
* efficient.
|
||||
*
|
||||
* @param charSequence a sequence of UTF-16 characters to transform
|
||||
* @param locale the locale to use as a reference for case folding; it is needed because Turkish
|
||||
* and Azerbaijani locales handle casing of the letter `i` in a different way than the others
|
||||
* @return the folded string
|
||||
*/
|
||||
public static String simpleFold(CharSequence string, Locale locale) {
|
||||
return caseFoldAlgorithmForLocale(locale).apply(string);
|
||||
}
|
||||
|
||||
private static final Locale AZ_LOCALE = new Locale("az");
|
||||
private static final Locale TR_LOCALE = new Locale("tr");
|
||||
|
||||
/**
|
||||
* Returns a case folding algorithm appropriate for the given locale.
|
||||
*
|
||||
* <p>The algorithm is locale-dependent because Turkish and Azerbaijani locales handle casing of
|
||||
* the letter `i` in a different way than other locales.
|
||||
*/
|
||||
public static Fold caseFoldAlgorithmForLocale(Locale locale) {
|
||||
if (locale.equals(AZ_LOCALE) || locale.equals(TR_LOCALE)) {
|
||||
return CaseMap.fold().turkic();
|
||||
}
|
||||
return CaseMap.fold();
|
||||
}
|
||||
}
|
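A hypothetical usage sketch of `CaseFoldedString` as defined above; the signatures come from this diff and the expected values follow from the described code-unit-to-grapheme mapping:

    import java.util.Locale;
    import org.enso.base.text.CaseFoldedString;

    public class FoldedStringDemo {
      public static void main(String[] args) {
        // "Straße" folds to "strasse": the folded text is one code unit longer,
        // so positions found in it must be mapped back to the original string.
        CaseFoldedString folded = CaseFoldedString.fold("Straße!", Locale.ROOT);
        System.out.println(folded.getFoldedString());           // "strasse!"
        // Code units 4 and 5 of the folded text ("ss") both come from the
        // single grapheme "ß", which is grapheme 4 of the original.
        System.out.println(folded.codeUnitToGraphemeIndex(4));  // 4
        System.out.println(folded.codeUnitToGraphemeIndex(5));  // 4
        System.out.println(folded.codeUnitToGraphemeIndex(6));  // 5 - the "e"
      }
    }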
@ -0,0 +1,28 @@
|
||||
package org.enso.base.text;
|
||||
|
||||
/**
|
||||
* Represents a span of characters (understood as extended grapheme clusters) within a Text.
|
||||
*
|
||||
* <p>The start index indicates the first grapheme of the span and the end index indicates the first
|
||||
* grapheme after the end of the span.
|
||||
*
|
||||
* <p>Represents an empty span if start and end indices are equal. Such an empty span refers to the
|
||||
* space just before the grapheme corresponding to index start.
|
||||
*/
|
||||
public class GraphemeSpan {
|
||||
|
||||
public final long start, end;
|
||||
|
||||
/**
|
||||
* Constructs a span of characters (understood as extended grapheme clusters).
|
||||
*
|
||||
* @param start index of the first extended grapheme cluster contained within the span (or
|
||||
* location of the span if it is empty)
|
||||
* @param end index of the first extended grapheme cluster after start that is not contained
|
||||
* within the span
|
||||
*/
|
||||
public GraphemeSpan(long start, long end) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
}
|
@ -0,0 +1,65 @@
|
||||
package org.enso.base.text;
|
||||
|
||||
/** A helper to efficiently build an array of unboxed integers of arbitrary length. */
|
||||
public class IntArrayBuilder {
|
||||
private int[] storage;
|
||||
private int length;
|
||||
|
||||
/**
|
||||
* Constructs an empty builder with a given initial capacity.
|
||||
*
|
||||
* @param initialCapacity the initial capacity of the builder, can be used to avoid expanding the
|
||||
* storage if the amount of elements can be estimated in advance.
|
||||
*/
|
||||
public IntArrayBuilder(int initialCapacity) {
|
||||
length = 0;
|
||||
storage = new int[initialCapacity];
|
||||
}
|
||||
|
||||
/** Adds a new element to the array, expanding it if necessary. */
|
||||
public void add(int x) {
|
||||
if (length >= storage.length) {
|
||||
grow();
|
||||
}
|
||||
|
||||
storage[length++] = x;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expands the storage to fit more elements.
|
||||
*
|
||||
* <p>The storage grows by 50% and is always increased by at least one. The 50% growth is chosen
|
||||
* so that the amortized cost of adding a new element to the array stays constant.
|
||||
*/
|
||||
private void grow() {
|
||||
int newCapacity = storage.length + (storage.length / 2);
|
||||
if (newCapacity <= storage.length) {
|
||||
newCapacity = storage.length + 1;
|
||||
}
|
||||
|
||||
int[] newStorage = new int[newCapacity];
|
||||
System.arraycopy(this.storage, 0, newStorage, 0, length);
|
||||
this.storage = newStorage;
|
||||
}
|
||||
|
||||
/** Returns the number of elements already added to the storage. */
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the underlying storage of the builder.
|
||||
*
|
||||
* <p>This method avoids copying for performance so it should be used with care. The storage can
|
||||
* actually have more elements than were added, so the user should be careful to only query the
|
||||
* first {@code getLength()} elements. Querying other elements results in an unspecified result.
|
||||
*
|
||||
* <p>After calling this method, the builder is invalidated and cannot be used anymore. Any usage
|
||||
* of the builder afterwards will result in a {@code NullPointerException}.
|
||||
*/
|
||||
public int[] unsafeGetStorageAndInvalidateTheBuilder() {
|
||||
int[] tmp = storage;
|
||||
this.storage = null;
|
||||
return tmp;
|
||||
}
|
||||
}
|
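A hypothetical usage sketch of `IntArrayBuilder` as defined above (signatures from this diff):

    import org.enso.base.text.IntArrayBuilder;

    public class BuilderDemo {
      public static void main(String[] args) {
        // Collect unboxed ints without knowing the final size up front.
        IntArrayBuilder builder = new IntArrayBuilder(4);
        for (int i = 0; i < 10; i++) {
          builder.add(i * i);
        }
        int length = builder.getLength(); // 10
        // The returned array may be longer than `length`; only the first `length`
        // entries are meaningful, and the builder must not be used after this call.
        int[] storage = builder.unsafeGetStorageAndInvalidateTheBuilder();
        System.out.println(length + " values, storage size " + storage.length);
      }
    }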
@ -0,0 +1,34 @@
|
||||
package org.enso.base.text;
|
||||
|
||||
/** A char sequence which allows accessing a slice of another char sequence without copying. */
|
||||
class StringSlice implements CharSequence {
|
||||
private final CharSequence text;
|
||||
private final int subStart, subEnd;
|
||||
|
||||
/** Constructs a slice of the given text. */
|
||||
public StringSlice(CharSequence text, int start, int end) {
|
||||
this.text = text;
|
||||
this.subStart = start;
|
||||
this.subEnd = end;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return subEnd - subStart;
|
||||
}
|
||||
|
||||
@Override
|
||||
public char charAt(int index) {
|
||||
return text.charAt(subStart + index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharSequence subSequence(int start, int end) {
|
||||
return new StringSlice(text, subStart + start, subStart + end);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return text.subSequence(subStart, subEnd).toString();
|
||||
}
|
||||
}
|
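`StringSlice` is package-private, so this hypothetical usage sketch is placed in the same `org.enso.base.text` package (signatures from this diff; the demo class itself is not part of the change):

    package org.enso.base.text;

    public class SliceDemo {
      public static void main(String[] args) {
        String text = "Hello, World!";
        // A view of "World" that shares the original character data.
        CharSequence slice = new StringSlice(text, 7, 12);
        System.out.println(slice.length());    // 5
        System.out.println(slice.charAt(0));   // 'W'
        System.out.println(slice.toString());  // "World"
      }
    }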
@ -0,0 +1,18 @@
|
||||
package org.enso.base.text;
|
||||
|
||||
/**
|
||||
* Represents a span of UTF-16 code units within a String.
|
||||
*
|
||||
* <p>The start index indicates the first code unit of the span and the end index indicates the
|
||||
* first code unit after the end of the span.
|
||||
*/
|
||||
public class Utf16Span {
|
||||
|
||||
public final long start, end;
|
||||
|
||||
/** Constructs a span of UTF-16 code units. */
|
||||
public Utf16Span(long start, long end) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
}
|
@ -6,7 +6,7 @@ import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Regex.Option as Global_Option
import Standard.Base.Data.Text.Span
from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span

polyglot java import java.util.regex.Pattern as Java_Pattern

@ -182,6 +182,22 @@ spec =
match.at 1 . group 0 . should_equal "ef"
match.at 2 . group 0 . should_equal "gh"

Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <|
pattern = engine.compile "" []
match_1 = pattern.match "" mode=Mode.All
match_1.length . should_equal 1
match_1.at 0 . start 0 . should_equal 0
match_1.at 0 . end 0 . should_equal 0

match_2 = pattern.match "ABC" mode=Mode.All
match_2.length . should_equal 4
match_2.at 0 . start 0 . should_equal 0
match_2.at 0 . end 0 . should_equal 0
match_2.at 1 . start 0 . should_equal 1
match_2.at 1 . end 0 . should_equal 1
match_2.at 3 . start 0 . should_equal 3
match_2.at 3 . end 0 . should_equal 3

Test.group "The default regex engine's Pattern.find" <|
engine = Default_Engine.new
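The expected counts above follow directly from how java.util.regex (which backs the default engine, per the polyglot import) treats an empty pattern: it produces one zero-width match at every position, so a 3-character input yields 4 matches. A minimal, hedged illustration in plain Java:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class EmptyPatternExample {
      public static void main(String[] args) {
        Matcher matcher = Pattern.compile("").matcher("ABC");
        while (matcher.find()) {
          // Prints 0..0, 1..1, 2..2, 3..3 -- four zero-width matches for "ABC".
          System.out.println(matcher.start() + ".." + matcher.end());
        }
      }
    }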
@ -261,11 +277,23 @@ spec =
match.at 1 . should_equal "ef"
match.at 2 . should_equal "gh"

match_2 = pattern.find input mode=(Mode.Bounded 2 8 mode=10)
match_2.length . should_equal 3
match_2.at 0 . should_equal "cd"
match_2.at 1 . should_equal "ef"
match_2.at 2 . should_equal "gh"

match_3 = pattern.find input mode=(Mode.Bounded 2 8 mode=2)
match_3.length . should_equal 2
match_3.at 0 . should_equal "cd"
match_3.at 1 . should_equal "ef"

Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"]

engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=5 . should_equal ["a", "1", "b", "2"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"]
engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Mode.Bounded 1 3) . should_equal ["1", "b"]
@ -501,10 +529,10 @@ spec =
match . should_be_a Default_Engine.Match

Test.specify "should get the span of a group by index" <|
match.span 1 . should_equal (Span.new 0 6 input)
match.span 1 . should_equal (Utf_16_Span (Range 0 6) input)

Test.specify "should get the span of a group by name" <|
match.span "letters" . should_equal (Span.new 6 18 input)
match.span "letters" . should_equal (Utf_16_Span (Range 6 18) input)

Test.specify "should return Nothing if the group didn't match" <|
match.span 3 . should_equal Nothing

@ -26,3 +26,4 @@ spec =
pattern = "http://example.com"
Regex.escape pattern . should_equal "\Qhttp://example.com\E"

main = Test.Suite.run_main here.spec
@ -2,20 +2,36 @@
from Standard.Base import all
import Standard.Test

import Standard.Base.Data.Text.Span
from Standard.Base.Data.Text.Span as Span_Module import Span, Utf_16_Span

spec = Test.group "Text.Span" <|

Test.specify "should be able to be created over a text" <|
text = "Hello!"
span = Span.new 0 3 text
span = Span (Range 0 3) text
span.start . should_equal 0
span.end . should_equal 3
span.text . should_equal text

Test.specify "should be able to be created without a text" <|
span = Span.new 5 8
span.start . should_equal 5
span.end . should_equal 8
span.text . should_equal Nothing
Test.specify "should be able to be converted to code units" <|
text = 'ae\u{301}fz'
(Span (Range 1 3) text).to_utf_16_span . should_equal (Utf_16_Span (Range 1 4) text)

Test.specify "should expand to the associated grapheme clusters" <|
text = 'a\u{301}e\u{302}o\u{303}'
span = Utf_16_Span (Range 1 5) text
extended = span.to_grapheme_span
extended . should_equal (Span (Range 0 3) text)
extended.to_utf_16_span . should_equal (Utf_16_Span (Range 0 6) text)

Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
Utf_16_Span (Range 0 1) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
Utf_16_Span (Range 0 0) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
Utf_16_Span (Range 1 1) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
Utf_16_Span (Range 2 2) text . to_grapheme_span . should_equal (Span (Range 1 1) text)

Utf_16_Span (Range 0 4) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)

main = Test.Suite.run_main here.spec
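The to_grapheme_span cases above all amount to widening a UTF-16 range outwards to the nearest grapheme-cluster boundaries. A hedged Java sketch of that widening using ICU's BreakIterator; it illustrates the expected behaviour and is not the library's actual implementation:

    import com.ibm.icu.text.BreakIterator;

    public class GraphemeSpanExample {
      public static void main(String[] args) {
        String text = "a\u0301e\u0302o\u0303"; // three accented clusters, six code units
        int start = 1;
        int end = 5; // cuts through the first and the last cluster

        BreakIterator iterator = BreakIterator.getCharacterInstance();
        iterator.setText(text);
        int extendedStart = iterator.isBoundary(start) ? start : iterator.preceding(start);
        int extendedEnd = iterator.isBoundary(end) ? end : iterator.following(end);

        // Prints 0..6 -- the same widening as Utf_16_Span (Range 1 5) -> Span (Range 0 3) above.
        System.out.println(extendedStart + ".." + extendedEnd);
      }
    }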
61
test/Tests/src/Data/Text/Utils_Spec.enso
Normal file
@ -0,0 +1,61 @@
from Standard.Base import all

polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.text.CaseFoldedString

import Standard.Test

polyglot java import com.ibm.icu.text.BreakIterator
spec =
Test.group "Text_Utils" <|
kshi = '\u0915\u094D\u0937\u093F'
facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
text = "a"+kshi+facepalm+'e\u{301}Z'
codepoints_to_graphemes = _.flatten <| text.characters.map_with_index ix-> grapheme->
codepoints_count = grapheme.utf_16.length
Vector.new codepoints_count _->ix

Test.specify "should correctly translate a codepoint index to a grapheme index" <|
codepoints_to_graphemes . each_with_index codepoint_ix-> grapheme_ix->
found_grapheme_ix = Text_Utils.utf16_index_to_grapheme_index text codepoint_ix
found_grapheme_ix.should_equal grapheme_ix

Text_Utils.utf16_index_to_grapheme_index text text.utf_16.length . should_equal text.length
Text_Utils.utf16_index_to_grapheme_index "" 0 . should_equal 0

Text_Utils.utf16_index_to_grapheme_index 'ą' 0 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index 'ą' 1 . should_equal 1

Text_Utils.utf16_index_to_grapheme_index "aB" 0 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index "aB" 1 . should_equal 1
Text_Utils.utf16_index_to_grapheme_index "aB" 2 . should_equal 2

Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 0 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 1 . should_equal 0
Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 2 . should_equal 1

Test.specify "should correctly translate a series of codepoint indices to grapheme indices in a batch" <|
translate_indices text ixes =
Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
codepoint_indices = Vector.new text.utf_16.length ix->ix
translate_indices text codepoint_indices . should_equal codepoints_to_graphemes

translate_indices "" [0] . should_equal [0]
translate_indices 'ą' [0, 1] . should_equal [0, 1]
translate_indices "aB" [0, 1, 2] . should_equal [0, 1, 2]
translate_indices 'a\u{301}' [0, 1, 2] . should_equal [0, 0, 1]

Test.specify "should correctly case-fold a string and translate codeunits to graphemes" <|
text = 'a\u{301}AZßﬃą'
folded = CaseFoldedString.fold text Locale.default.java_locale
folded.getFoldedString . should_equal 'a\u{301}azssffią'

codeunits = Vector.new folded.getFoldedString.utf_16.length+1 ix->ix
grapheme_ixes = codeunits.map ix->
folded.codeUnitToGraphemeIndex ix
grapheme_ixes . should_equal [0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 6]

Test.expect_panic_with (folded.codeUnitToGraphemeIndex -1) Polyglot_Error
Test.expect_panic_with (folded.codeUnitToGraphemeIndex folded.getFoldedString.utf_16.length+1) Polyglot_Error

main = Test.Suite.run_main here.spec
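The index translation exercised by this spec maps a UTF-16 offset to the index of the grapheme cluster containing it. A hedged Java sketch of one way to compute that with ICU's BreakIterator (the class the spec imports); it mirrors the expected results above but is not the actual Text_Utils implementation:

    import com.ibm.icu.text.BreakIterator;

    public class GraphemeIndexExample {
      static long graphemeIndex(String text, int utf16Index) {
        BreakIterator iterator = BreakIterator.getCharacterInstance();
        iterator.setText(text);
        iterator.first();
        long graphemes = 0;
        while (true) {
          int boundary = iterator.next();
          if (boundary == BreakIterator.DONE || boundary > utf16Index) break;
          graphemes++;
        }
        return graphemes;
      }

      public static void main(String[] args) {
        System.out.println(graphemeIndex("a\u0301", 0)); // 0
        System.out.println(graphemeIndex("a\u0301", 1)); // 0 -- still inside the first cluster
        System.out.println(graphemeIndex("a\u0301", 2)); // 1 -- the end of the string
      }
    }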
@ -4,7 +4,10 @@ from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Locale
import Standard.Base.Data.Text.Split_Kind
from Standard.Base.Data.Text.Span as Span_Module import Span
from Standard.Base.Data.Text.Text_Sub_Range import all
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Matching_Mode
import Standard.Test

type Auto a
@ -87,9 +90,8 @@ spec =
'e\u0301' . equals_ignore_case 'e\u0303' . should_be_false

"I" . equals_ignore_case "i" . should_be_true
"I" . equals_ignore_case "ı" . should_be_true
"İ" . equals_ignore_case "i" . should_be_false
"İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
"I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
"I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false

"Kongressstraße"=="Kongressstrasse" . should_be_false
@ -199,15 +201,20 @@ spec =
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 6) . should_equal 'Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 5) . should_equal 'o\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e\u{302}') . should_equal 'H'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'ê') . should_equal 'H'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} W'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'ö') . should_equal 'He\u{302}llo\u{308} W'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{302}') . should_equal 'llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'ê') . should_equal 'llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{308}') . should_equal ''
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e') . should_equal ''
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o\u{308}') . should_equal 'rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'ö') . should_equal 'rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o') . should_equal ''
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e\u{302}') . should_equal 'H'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='ê') . should_equal 'H'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range 3 5) . should_equal 'lo\u{308}'
'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range -3 -1) . should_equal 'ld'
@ -232,6 +239,30 @@ spec =
'✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 Nothing) . should_equal '😙😉☺'
'✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 -1) . should_equal '😙😉'

Test.specify "take should correctly handle edge cases" <|
"".take First.new . should_equal ""
"".take Last.new . should_equal ""

"".take (After "a") . should_equal ""
"".take (After_Last "a") . should_equal ""
"".take (Before "a") . should_equal ""
"".take (Before_Last "a") . should_equal ""

"".take (After "") . should_equal ""
"".take (After_Last "") . should_equal ""
"".take (Before "") . should_equal ""
"".take (Before_Last "") . should_equal ""

"".take (While _->True) . should_equal ""

"".take (Range 0 0) . should_equal ""
'ABC\u{301}'.take (Range 0 0) . should_equal ""

'ABC\u{301}'.take (After "") . should_equal 'ABC\u{301}'
'ABC\u{301}'.take (After_Last "") . should_equal ""
'ABC\u{301}'.take (Before "") . should_equal ""
'ABC\u{301}'.take (Before_Last "") . should_equal 'ABC\u{301}'

Test.specify "drop should work as in the examples" <|
"Hello World!".drop First.new . should_equal "ello World!"
"Hello World!".drop (First 5) . should_equal " World!"
@ -269,15 +300,20 @@ spec =
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 6) . should_equal 'He\u{302}llo\u{308} '
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 5) . should_equal 'He\u{302}llo\u{308} W'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'ê') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e') . should_equal ''
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o\u{308}') . should_equal 'o\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'ö') . should_equal 'o\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o') . should_equal ''
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{302}') . should_equal 'He\u{302}'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'ê') . should_equal 'He\u{302}'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{308}'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'ö') . should_equal 'He\u{302}llo\u{308} Wo\u{308}'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='ê') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e') . should_equal ''
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range 3 5) . should_equal 'He\u{302}l Wo\u{308}rld!'
'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range -3 -1) . should_equal 'He\u{302}llo\u{308} Wo\u{308}r!'
@ -301,6 +337,30 @@ spec =
'✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 Nothing) . should_equal '✨🚀🚧😍😃😍😎'
'✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 -1) . should_equal '✨🚀🚧😍😃😍😎☺'

Test.specify "drop should correctly handle edge cases" <|
"".drop First.new . should_equal ""
"".drop Last.new . should_equal ""

"".drop (After "a") . should_equal ""
"".drop (After_Last "a") . should_equal ""
"".drop (Before "a") . should_equal ""
"".drop (Before_Last "a") . should_equal ""

"".drop (After "") . should_equal ""
"".drop (After_Last "") . should_equal ""
"".drop (Before "") . should_equal ""
"".drop (Before_Last "") . should_equal ""

"".drop (While _->True) . should_equal ""

"".drop (Range 0 0) . should_equal ""
'ABC\u{301}'.drop (Range 0 0) . should_equal 'ABC\u{301}'

'ABC\u{301}'.drop (After "") . should_equal ''
'ABC\u{301}'.drop (After_Last "") . should_equal 'ABC\u{301}'
'ABC\u{301}'.drop (Before "") . should_equal 'ABC\u{301}'
'ABC\u{301}'.drop (Before_Last "") . should_equal ''

Test.specify "should correctly convert character case" <|
"FooBar Baz".to_case Case.Lower . should_equal "foobar baz"
"FooBar Baz".to_case Case.Upper . should_equal "FOOBAR BAZ"
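The duplicated take/drop cases with 'ê' and 'e\u{302}' above rely on canonical equivalence: the precomposed and the decomposed spelling denote the same grapheme even though their code units differ. A short, hedged Java illustration with the standard Normalizer:

    import java.text.Normalizer;

    public class NormalizationExample {
      public static void main(String[] args) {
        String composed = "\u00EA";    // ê
        String decomposed = "e\u0302"; // e + combining circumflex
        System.out.println(composed.equals(decomposed)); // false -- different code units
        String normalized = Normalizer.normalize(decomposed, Normalizer.Form.NFC);
        System.out.println(normalized.equals(composed)); // true -- canonically equivalent
      }
    }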
@ -465,10 +525,7 @@ spec =
## This shows what regex is doing by default and we cannot easily fix
that.
's\u{301}' . contains 's' (Regex_Matcher.new) . should_be_true
## This would normally be false, but we perform input normalization
to get results that are consistent regardless of if the input was
normalized or not.
'ś' . contains 's' (Regex_Matcher.new) . should_be_true
'ś' . contains 's' (Regex_Matcher.new) . should_be_false
's\u{301}' . contains 'ś' (Regex_Matcher.new) . should_be_true
'ś' . contains 's\u{301}' (Regex_Matcher.new) . should_be_true

@ -767,6 +824,157 @@ spec =

'✨🚀🚧'*2 . should_equal '✨🚀🚧✨🚀🚧'

Test.specify "location_of should work as shown in examples" <|
example_1 =
"Hello World!".location_of "J" == Nothing
"Hello World!".location_of "o" == Span (Range 4 5) "Hello World!"
"Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 7 8) "Hello World!"

example_2 =
term = "straße"
text = "MONUMENTENSTRASSE 42"
match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new)
term.length . should_equal 6
match.length . should_equal 7

example_3 =
ligatures = "ﬃﬄ"
ligatures.length . should_equal 2
term_1 = "IFF"
match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new)
term_1.length . should_equal 3
match_1.length . should_equal 2
term_2 = "ffiffl"
match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new)
term_2.length . should_equal 6
match_2.length . should_equal 2
match_1 . should_equal match_2

example_4 =
"Hello World!".location_of_all "J" . should_equal []
"Hello World!".location_of_all "o" . map .start . should_equal [4, 7]

example_5 =
term = "strasse"
text = "MONUMENTENSTRASSE ist eine große Straße."
match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new)
term.length . should_equal 7
match . map .length . should_equal [7, 6]

example_6 =
ligatures = "ﬃﬄFFIFF"
ligatures.length . should_equal 7
match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new)
match_1 . map .length . should_equal [2, 3]
match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new)
match_2 . map .length . should_equal [2, 5]

# Put them in blocks to avoid name clashes.
example_1
example_2
example_3
example_4
example_5
example_6

Test.specify "should allow to find location_of occurrences within a text" <|
"Hello World!".location_of_all "J" . should_equal []
"Hello World!".location_of_all "o" . map .start . should_equal [4, 7]

accents = 'a\u{301}e\u{301}o\u{301}'
accents.location_of accent_1 . should_equal (Span (Range 1 2) accents)

"".location_of "foo" . should_equal Nothing
"".location_of "foo" mode=Matching_Mode.Last . should_equal Nothing
"".location_of_all "foo" . should_equal []
"".location_of "" . should_equal (Span (Range 0 0) "")
"".location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
"".location_of_all "" . should_equal [Span (Range 0 0) ""]
abc = 'A\u{301}ßC'
abc.location_of "" . should_equal (Span (Range 0 0) abc)
abc.location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)
abc.location_of_all "" . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]

Test.specify "should allow case insensitive matching in location_of" <|
hello = "Hello WORLD!"
case_insensitive = Text_Matcher Case_Insensitive.new
hello.location_of "world" . should_equal Nothing
hello.location_of "world" matcher=case_insensitive . should_equal (Span (Range 6 11) hello)

hello.location_of "o" mode=Mode.First matcher=case_insensitive . should_equal (Span (Range 4 5) hello)
hello.location_of "o" mode=Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 7 8) hello)

accents = 'A\u{301}E\u{301}O\u{301}'
accents.location_of accent_1 matcher=case_insensitive . should_equal (Span (Range 1 2) accents)

"Strasse".location_of "ß" matcher=case_insensitive . should_equal (Span (Range 4 6) "Strasse")
"Monumentenstraße 42".location_of "STRASSE" matcher=case_insensitive . should_equal (Span (Range 10 16) "Monumentenstraße 42")

'\u0390'.location_of '\u03B9\u0308\u0301' matcher=case_insensitive . should_equal (Span (Range 0 1) '\u0390')
'ԵՒ'.location_of 'և' . should_equal Nothing
'ԵՒ'.location_of 'և' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ԵՒ')
'և'.location_of 'ԵՒ' matcher=case_insensitive . should_equal (Span (Range 0 1) 'և')

ligatures = 'ffaﬀﬁﬂﬃﬄﬅstZ'
ligatures.location_of 'FFI' matcher=case_insensitive . should_equal (Span (Range 3 5) ligatures)
ligatures.location_of 'FF' matcher=case_insensitive . should_equal (Span (Range 0 2) ligatures)
ligatures.location_of 'ff' matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 7 8) ligatures)
ligatures.location_of_all 'ff' . should_equal [Span (Range 0 2) ligatures]
ligatures.location_of_all 'FF' matcher=case_insensitive . should_equal [Span (Range 0 2) ligatures, Span (Range 3 4) ligatures, Span (Range 6 7) ligatures, Span (Range 7 8) ligatures]
ligatures.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 3 5) ligatures, Span (Range 6 7) ligatures]
'fffi'.location_of_all 'ff' matcher=case_insensitive . should_equal [Span (Range 0 2) 'fffi']
'fffi'.location_of_all 'ﬃ' . should_equal []
'fffi'.location_of_all 'ﬃ' matcher=case_insensitive . should_equal [Span (Range 1 4) 'fffi']
'FFFI'.location_of 'ffi' matcher=case_insensitive . should_equal (Span (Range 1 4) 'FFFI')

'ﬃﬄ'.location_of 'IF' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ﬃﬄ')
'ﬃﬄ'.location_of 'F' Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 1 2) 'ﬃﬄ')
'ﬃﬄ'.location_of_all 'F' matcher=case_insensitive . should_equal [Span (Range 0 1) 'ﬃﬄ', Span (Range 0 1) 'ﬃﬄ', Span (Range 1 2) 'ﬃﬄ', Span (Range 1 2) 'ﬃﬄ']
'aaﬃbb'.location_of_all 'af' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaﬃbb']
'aaﬃbb'.location_of_all 'affi' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaﬃbb']
'aaﬃbb'.location_of_all 'ib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaﬃbb']
'aaﬃbb'.location_of_all 'ffib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaﬃbb']

"".location_of "foo" matcher=case_insensitive . should_equal Nothing
"".location_of "foo" matcher=case_insensitive mode=Matching_Mode.Last . should_equal Nothing
"".location_of_all "foo" matcher=case_insensitive . should_equal []
"".location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) "")
"".location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
"".location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) ""]
abc = 'A\u{301}ßC'
abc.location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) abc)
abc.location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)
abc.location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]

Test.specify "should allow regexes in location_of" <|
hello = "Hello World!"
regex = Regex_Matcher.new
regex_insensitive = Regex_Matcher.new case_sensitive=Case_Insensitive.new
hello.location_of ".o" Matching_Mode.First matcher=regex . should_equal (Span (Range 3 5) hello)
hello.location_of ".o" Matching_Mode.Last matcher=regex . should_equal (Span (Range 6 8) hello)
hello.location_of_all ".o" matcher=regex . map .start . should_equal [3, 6]

"foobar".location_of "BAR" Mode.First matcher=regex_insensitive . should_equal (Span (Range 3 6) "foobar")

## Regex matching does not do case folding
"Strasse".location_of "ß" Mode.First matcher=regex_insensitive . should_equal Nothing

## But it should handle the Unicode normalization
accents = 'a\u{301}e\u{301}o\u{301}'
accents.location_of accent_1 Mode.First matcher=regex . should_equal (Span (Range 1 2) accents)
Test.specify "should correctly handle regex edge cases in location_of" pending="Figure out how to make Regex correctly handle empty patterns." <|
regex = Regex_Matcher.new
"".location_of "foo" matcher=regex . should_equal Nothing
"".location_of "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing
"".location_of_all "foo" matcher=regex . should_equal []
"".location_of "" matcher=regex . should_equal (Span (Range 0 0) "")
"".location_of_all "" matcher=regex . should_equal [Span (Range 0 0) ""]
"".location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
abc = 'A\u{301}ßC'
abc.location_of "" matcher=regex . should_equal (Span (Range 0 0) abc)
abc.location_of_all "" matcher=regex . should_equal [Span (Range 0 0) abc, Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]
abc.location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)

Test.group "Regex matching" <|
Test.specify "should be possible on text" <|
match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First
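The length differences asserted in the location_of examples above (a 6-character "straße" matching a 7-character stretch of "MONUMENTENSTRASSE", ligatures matching their expansions) come from Unicode case folding, which can change a string's length. A hedged illustration with ICU's UCharacter.foldCase:

    import com.ibm.icu.lang.UCharacter;

    public class CaseFoldingExample {
      public static void main(String[] args) {
        System.out.println(UCharacter.foldCase("straße", true));            // strasse (one char longer)
        System.out.println(UCharacter.foldCase("MONUMENTENSTRASSE", true)); // monumentenstrasse
        System.out.println(UCharacter.foldCase("ﬃﬄ", true));                // ffiffl (2 chars become 6)
      }
    }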
@ -128,3 +128,4 @@ spec = Test.group "Examples" <|
match.groups.length . should_equal 5
match.named_groups.size . should_equal 2

main = Test.Suite.run_main here.spec

@ -34,6 +34,7 @@ import project.Data.Text_Spec
import project.Data.Time.Spec as Time_Spec
import project.Data.Vector_Spec
import project.Data.Text.Regex_Spec
import project.Data.Text.Utils_Spec
import project.Data.Text.Default_Regex_Engine_Spec
import project.Data.Text.Matching_Spec
import project.Data.Text.Span_Spec
@ -87,6 +88,7 @@ main = Test.Suite.run_main <|
Runtime_Spec.spec
Span_Spec.spec
Stack_Traces_Spec.spec
Utils_Spec.spec
Text_Spec.spec
Time_Spec.spec
Uri_Spec.spec