Data analysts should be able to use Text.location_of to find indexes within a string using various matchers (#3324)
Implements https://www.pivotaltracker.com/n/projects/2539304/stories/181266029
This commit is contained in: parent 3ef18ab5b8, commit 247b284316
@ -63,6 +63,7 @@
|
|||||||
- [Implemented `Bool.compare_to` method][3317]
|
- [Implemented `Bool.compare_to` method][3317]
|
||||||
- [Implemented `Map.first`, `Map.last` functions. Expanded `Table.group_by` to
|
- [Implemented `Map.first`, `Map.last` functions. Expanded `Table.group_by` to
|
||||||
also compute mode, percentile, minimum, maximum.][3318]
|
also compute mode, percentile, minimum, maximum.][3318]
|
||||||
|
- [Implemented `Text.location_of` and `Text.location_of_all` methods.][3324]
|
||||||
|
|
||||||
[debug-shortcuts]:
|
[debug-shortcuts]:
|
||||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||||
@ -100,7 +101,8 @@
|
|||||||
[3236]: https://github.com/enso-org/enso/pull/3236
|
[3236]: https://github.com/enso-org/enso/pull/3236
|
||||||
[3311]: https://github.com/enso-org/enso/pull/3311
|
[3311]: https://github.com/enso-org/enso/pull/3311
|
||||||
[3317]: https://github.com/enso-org/enso/pull/3317
|
[3317]: https://github.com/enso-org/enso/pull/3317
|
||||||
[3317]: https://github.com/enso-org/enso/pull/3318
|
[3318]: https://github.com/enso-org/enso/pull/3318
|
||||||
|
[3324]: https://github.com/enso-org/enso/pull/3324
|
||||||
|
|
||||||
#### Enso Compiler
|
#### Enso Compiler
|
||||||
|
|
||||||
|
@ -5,9 +5,11 @@ from Standard.Builtins import Text, Prim_Text_Helpers
|
|||||||
|
|
||||||
import Standard.Base.Data.Text.Regex
|
import Standard.Base.Data.Text.Regex
|
||||||
import Standard.Base.Data.Text.Regex.Mode
|
import Standard.Base.Data.Text.Regex.Mode
|
||||||
|
import Standard.Base.Data.Text.Matching_Mode
|
||||||
import Standard.Base.Data.Text.Case
|
import Standard.Base.Data.Text.Case
|
||||||
import Standard.Base.Data.Text.Location
|
import Standard.Base.Data.Text.Location
|
||||||
import Standard.Base.Data.Text.Line_Ending_Style
|
import Standard.Base.Data.Text.Line_Ending_Style
|
||||||
|
from Standard.Base.Data.Text.Span as Span_Module import Span
|
||||||
import Standard.Base.Data.Text.Split_Kind
|
import Standard.Base.Data.Text.Split_Kind
|
||||||
import Standard.Base.Data.Text.Text_Sub_Range
|
import Standard.Base.Data.Text.Text_Sub_Range
|
||||||
import Standard.Base.Data.Locale
|
import Standard.Base.Data.Locale
|
||||||
@ -15,6 +17,7 @@ import Standard.Base.Meta
|
|||||||
|
|
||||||
from Standard.Builtins export Text
|
from Standard.Builtins export Text
|
||||||
|
|
||||||
|
export Standard.Base.Data.Text.Matching_Mode
|
||||||
export Standard.Base.Data.Text.Case
|
export Standard.Base.Data.Text.Case
|
||||||
export Standard.Base.Data.Text.Location
|
export Standard.Base.Data.Text.Location
|
||||||
export Standard.Base.Data.Text.Split_Kind
|
export Standard.Base.Data.Text.Split_Kind
|
||||||
@ -546,7 +549,7 @@ Text.== that = if Meta.is_same_object this Text then Meta.is_same_object that Te
|
|||||||
(('É' . equals_ignore_case 'é') && ('é' . equals_ignore_case 'e\u0301')) == True
|
(('É' . equals_ignore_case 'é') && ('é' . equals_ignore_case 'e\u0301')) == True
|
||||||
Text.equals_ignore_case : Text -> Locale -> Boolean
|
Text.equals_ignore_case : Text -> Locale -> Boolean
|
||||||
Text.equals_ignore_case that locale=Locale.default =
|
Text.equals_ignore_case that locale=Locale.default =
|
||||||
(this.to_case_insensitive_key locale) == (that.to_case_insensitive_key locale)
|
Text_Utils.equals_ignore_case this that locale.java_locale
|
||||||
|
|
||||||
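For context, the new `Text_Utils.equals_ignore_case` helper shown later in this diff is built on ICU case folding. A minimal standalone sketch of the underlying idea, assuming only ICU4J on the classpath (the class and method names below are illustrative, not the library's own):

import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.Normalizer2;

public class CaseInsensitiveEquals {
    /** Case-insensitive comparison that also ignores canonical-equivalence differences (sketch). */
    public static boolean equalsIgnoreCase(String a, String b) {
        CaseMap.Fold fold = CaseMap.fold();             // Unicode full case folding
        Normalizer2 nfd = Normalizer2.getNFDInstance(); // canonical decomposition
        return nfd.normalize(fold.apply(a)).equals(nfd.normalize(fold.apply(b)));
    }

    public static void main(String[] args) {
        // Composed 'É' compared against 'e' + combining acute accent, ignoring case.
        System.out.println(equalsIgnoreCase("É", "e\u0301")); // expected: true
    }
}

The real helper additionally takes a `Locale` (to handle the Turkish `i`) and accepts a non-String second argument, as its Java source further down shows.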
## ADVANCED
|
## ADVANCED
|
||||||
PRIVATE
|
PRIVATE
|
||||||
@ -555,7 +558,7 @@ Text.equals_ignore_case that locale=Locale.default =
|
|||||||
used to perform case-insensitive comparisons.
|
used to perform case-insensitive comparisons.
|
||||||
Text.to_case_insensitive_key : Locale -> Text
|
Text.to_case_insensitive_key : Locale -> Text
|
||||||
Text.to_case_insensitive_key locale=Locale.default =
|
Text.to_case_insensitive_key locale=Locale.default =
|
||||||
this.to_case Case.Lower locale . to_case Case.Upper locale
|
Text_Utils.case_insensitive_key this locale.java_locale
|
||||||
|
|
||||||
## Compare two texts to discover their ordering.
|
## Compare two texts to discover their ordering.
|
||||||
|
|
||||||
@ -895,7 +898,7 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
|
|||||||
Text_Matcher case_sensitivity -> case case_sensitivity of
|
Text_Matcher case_sensitivity -> case case_sensitivity of
|
||||||
True -> Text_Utils.contains this term
|
True -> Text_Utils.contains this term
|
||||||
Case_Insensitive locale ->
|
Case_Insensitive locale ->
|
||||||
Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale)
|
Text_Utils.contains_case_insensitive this term locale.java_locale
|
||||||
Regex_Matcher _ _ _ _ _ ->
|
Regex_Matcher _ _ _ _ _ ->
|
||||||
compiled_pattern = matcher.compile term
|
compiled_pattern = matcher.compile term
|
||||||
match = compiled_pattern.match this Mode.First
|
match = compiled_pattern.match this Mode.First
|
||||||
@ -952,27 +955,6 @@ Text.repeat count=1 =
|
|||||||
https://www.pivotaltracker.com/story/show/181435598
|
https://www.pivotaltracker.com/story/show/181435598
|
||||||
0.up_to (count.max 0) . fold "" acc-> _-> acc + this
|
0.up_to (count.max 0) . fold "" acc-> _-> acc + this
|
||||||
|
|
||||||
## PRIVATE
|
|
||||||
Utility function taking a range pointing at grapheme clusters and converting to a range on the underlying code points
|
|
||||||
range_to_char_indices : Text -> Range -> Range ! Index_Out_Of_Bounds_Error
|
|
||||||
range_to_char_indices text range =
|
|
||||||
len = text.length
|
|
||||||
start = if range.start < 0 then range.start + len else range.start
|
|
||||||
end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end)
|
|
||||||
is_valid = (Range 0 len+1).contains
|
|
||||||
|
|
||||||
case (Pair (is_valid start) (is_valid end)) of
|
|
||||||
Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len)
|
|
||||||
Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len)
|
|
||||||
Pair True True ->
|
|
||||||
if start>=end then (Range 0 0) else
|
|
||||||
iterator = BreakIterator.getCharacterInstance
|
|
||||||
iterator.setText text
|
|
||||||
|
|
||||||
start_index = iterator.next start
|
|
||||||
end_index = iterator.next (end - start)
|
|
||||||
Range start_index end_index
|
|
||||||
|
|
||||||
## ALIAS first, last, left, right, mid, substring
|
## ALIAS first, last, left, right, mid, substring
|
||||||
Creates a new Text by selecting the specified range of the input.
|
Creates a new Text by selecting the specified range of the input.
|
||||||
|
|
||||||
@ -1009,7 +991,7 @@ range_to_char_indices text range =
|
|||||||
Text.take : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
|
Text.take : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
|
||||||
Text.take range =
|
Text.take range =
|
||||||
char_range = case range of
|
char_range = case range of
|
||||||
Range _ _ -> here.range_to_char_indices this range
|
Range _ _ -> Span_Module.range_to_char_indices this range
|
||||||
_ -> range.to_char_range this
|
_ -> range.to_char_range this
|
||||||
Text_Utils.substring this char_range.start char_range.end
|
Text_Utils.substring this char_range.start char_range.end
|
||||||
|
|
||||||
@ -1049,7 +1031,7 @@ Text.take range =
|
|||||||
Text.drop : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
|
Text.drop : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error
|
||||||
Text.drop range =
|
Text.drop range =
|
||||||
char_range = case range of
|
char_range = case range of
|
||||||
Range _ _ -> here.range_to_char_indices this range
|
Range _ _ -> Span_Module.range_to_char_indices this range
|
||||||
_ -> range.to_char_range this
|
_ -> range.to_char_range this
|
||||||
if char_range.start == 0 then Text_Utils.drop_first this char_range.end else
|
if char_range.start == 0 then Text_Utils.drop_first this char_range.end else
|
||||||
prefix = Text_Utils.substring this 0 char_range.start
|
prefix = Text_Utils.substring this 0 char_range.start
|
||||||
@ -1184,3 +1166,204 @@ Text.trim where=Location.Both what=_.is_whitespace =
|
|||||||
loop current break_iterator.previous
|
loop current break_iterator.previous
|
||||||
if start_index >= end_index then "" else
|
if start_index >= end_index then "" else
|
||||||
Text_Utils.substring this start_index end_index
|
Text_Utils.substring this start_index end_index
|
||||||
|
|
||||||
|
## ALIAS find, index_of, position_of, span_of
|
||||||
|
Find the location of the `term` in the input.
|
||||||
|
Returns a Span representing the location at which the term was found, or
|
||||||
|
`Nothing` if the term was not found in the input.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- term: The term to find.
|
||||||
|
- mode: Specifies if the first or last occurrence of the term should be
|
||||||
|
returned if there are multiple occurrences within the input. The first
|
||||||
|
occurrence is returned by default.
|
||||||
|
- matcher: Specifies how the term is matched against the input:
|
||||||
|
- If a `Text_Matcher`, the text is compared using the case-sensitivity rules
|
||||||
|
specified in the matcher.
|
||||||
|
- If a `Regex_Matcher`, the `term` is used as a regular expression and
|
||||||
|
matched using the associated options.
|
||||||
|
|
||||||
|
! What is a Character?
|
||||||
|
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||||
|
Standard Annex 29. This is the smallest unit that still has semantic
|
||||||
|
meaning in most text-processing applications.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Finding location of a substring.
|
||||||
|
|
||||||
|
"Hello World!".location_of "J" == Nothing
|
||||||
|
"Hello World!".location_of "o" == Span (Range 4 5) "Hello World!"
|
||||||
|
"Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 7 8) "Hello World!"
|
||||||
|
|
||||||
|
! Match Length
|
||||||
|
The function returns not only the index of the match but a `Span` instance
|
||||||
|
which contains both the start and end indices, allowing one to determine the
|
||||||
|
length of the match. This is useful not only with regex matches (where a
|
||||||
|
regular expression can have matches of various lengths) but also for case
|
||||||
|
insensitive matching. In case insensitive mode, a single character can
|
||||||
|
match multiple characters, for example `ß` will match `ss` and `SS`, and
|
||||||
|
the ligature `ﬃ` will match `ffi` or `f` etc. Thus in case insensitive
|
||||||
|
mode, the length of the match can be shorter or longer than the term that
|
||||||
|
was being matched, so it is extremely important to not rely on the length
|
||||||
|
of the matched term when analysing the matches as they may have different
|
||||||
|
lengths.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Match length differences in case insensitive matching.
|
||||||
|
|
||||||
|
term = "straße"
|
||||||
|
text = "MONUMENTENSTRASSE 42"
|
||||||
|
match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new)
|
||||||
|
term.length == 6
|
||||||
|
match.length == 7
|
||||||
|
|
||||||
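On the Java side, the case-insensitive search path case-folds both the haystack and the needle and runs an ICU `StringSearch` over the folded text; the match length it reports is measured in the text, not in the term, which is exactly why the lengths above differ. A rough standalone sketch of that idea (hypothetical demo code; the library's `span_of_case_insensitive` additionally maps the folded positions back to grapheme indices of the original string):

import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.StringSearch;

public class FoldedSearchDemo {
    public static void main(String[] args) {
        CaseMap.Fold fold = CaseMap.fold();
        String haystack = fold.apply("MONUMENTENSTRASSE 42"); // "monumentenstrasse 42"
        String needle = fold.apply("straße");                 // "strasse" - ß folds to "ss"
        StringSearch search = new StringSearch(needle, haystack);
        int pos = search.first();
        if (pos != StringSearch.DONE) {
            // Expected to print 10 and 7: the 6-character term matches a 7-character span.
            System.out.println(pos + " " + search.getMatchLength());
        }
    }
}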
|
! Matching Grapheme Clusters
|
||||||
|
In case insensitive mode, a single character can match multiple characters,
|
||||||
|
for example `ß` will match `ss` and `SS`, and the ligature `ﬃ` will match
|
||||||
|
`ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to
|
||||||
|
match only a part of some single grapheme cluster, for example in the text
|
||||||
|
`ﬃa` the term `ia` will match just one-third of the first grapheme `ﬃ`.
|
||||||
|
Since we do not have the resolution to distinguish such partial matches
|
||||||
|
(as that would require non-integer indices), a match which matched just
|
||||||
|
a part of some grapheme cluster is extended and treated as if it matched
|
||||||
|
the whole grapheme cluster.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Extending matches to full grapheme clusters.
|
||||||
|
|
||||||
|
ligatures = "ffiffl"
|
||||||
|
ligatures.length == 2
|
||||||
|
term_1 = "IFF"
|
||||||
|
match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new)
|
||||||
|
term_1.length == 3
|
||||||
|
match_1.length == 2
|
||||||
|
term_2 = "ffiffl"
|
||||||
|
match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new)
|
||||||
|
term_2.length == 6
|
||||||
|
match_2.length == 2
|
||||||
|
# After being extended to full grapheme clusters, both terms "IFF" and "ffiffl" match the same span of grapheme clusters.
|
||||||
|
match_1 == match_2
|
||||||
|
Text.location_of : Text -> (Matching_Mode.First | Matching_Mode.Last) -> Matcher -> Span | Nothing
|
||||||
|
Text.location_of term="" mode=Matching_Mode.First matcher=Text_Matcher.new = case matcher of
|
||||||
|
Text_Matcher case_sensitive -> case case_sensitive of
|
||||||
|
True ->
|
||||||
|
codepoint_span = case mode of
|
||||||
|
Matching_Mode.First -> Text_Utils.span_of this term
|
||||||
|
Matching_Mode.Last -> Text_Utils.last_span_of this term
|
||||||
|
if codepoint_span.is_nothing then Nothing else
|
||||||
|
start = Text_Utils.utf16_index_to_grapheme_index this codepoint_span.start
|
||||||
|
## While the codepoint_span may have different code unit length
|
||||||
|
from our term, the `length` counted in grapheme clusters is
|
||||||
|
guaranteed to be the same.
|
||||||
|
end = start + term.length
|
||||||
|
Span (Range start end) this
|
||||||
|
Case_Insensitive locale -> case term.is_empty of
|
||||||
|
True -> case mode of
|
||||||
|
Matching_Mode.First -> Span (Range 0 0) this
|
||||||
|
Matching_Mode.Last ->
|
||||||
|
end = this.length
|
||||||
|
Span (Range end end) this
|
||||||
|
False ->
|
||||||
|
search_for_last = case mode of
|
||||||
|
Matching_Mode.First -> False
|
||||||
|
Matching_Mode.Last -> True
|
||||||
|
case Text_Utils.span_of_case_insensitive this term locale.java_locale search_for_last of
|
||||||
|
Nothing -> Nothing
|
||||||
|
grapheme_span ->
|
||||||
|
Span (Range grapheme_span.start grapheme_span.end) this
|
||||||
|
Regex_Matcher _ _ _ _ _ -> case mode of
|
||||||
|
Matching_Mode.First ->
|
||||||
|
case matcher.compile term . match this Mode.First of
|
||||||
|
Nothing -> Nothing
|
||||||
|
match -> match.span 0 . to_grapheme_span
|
||||||
|
Matching_Mode.Last ->
|
||||||
|
case matcher.compile term . match this Mode.All of
|
||||||
|
Nothing -> Nothing
|
||||||
|
matches -> matches.last.span 0 . to_grapheme_span
|
||||||
|
|
||||||
|
## ALIAS find_all, index_of_all, position_of_all, span_of_all
|
||||||
|
Finds all the locations of the `term` in the input.
|
||||||
|
If not found, the function returns an empty Vector.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- term: The term to find.
|
||||||
|
- matcher: Specifies how the term is matched against the input:
|
||||||
|
- If a `Text_Matcher`, the text is compared using the case-sensitivity rules
|
||||||
|
specified in the matcher.
|
||||||
|
- If a `Regex_Matcher`, the `term` is used as a regular expression and
|
||||||
|
matched using the associated options.
|
||||||
|
|
||||||
|
! What is a Character?
|
||||||
|
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||||
|
Standard Annex 29. This is the smallest unit that still has semantic
|
||||||
|
meaning in most text-processing applications.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Finding locations of all occurrences of a substring.
|
||||||
|
|
||||||
|
"Hello World!".location_of_all "J" == []
|
||||||
|
"Hello World!".location_of_all "o" . map .start == [4, 7]
|
||||||
|
|
||||||
|
! Match Length
|
||||||
|
The function returns not only the index of the match but a `Span` instance
|
||||||
|
which contains both the start and end indices, allowing one to determine the
|
||||||
|
length of the match. This is useful not only with regex matches (where a
|
||||||
|
regular expression can have matches of various lengths) but also for case
|
||||||
|
insensitive matching. In case insensitive mode, a single character can
|
||||||
|
match multiple characters, for example `ß` will match `ss` and `SS`, and
|
||||||
|
the ligature `ﬃ` will match `ffi` or `f` etc. Thus in case insensitive
|
||||||
|
mode, the length of the match can be shorter or longer than the term that
|
||||||
|
was being matched, so it is extremely important to not rely on the length
|
||||||
|
of the matched term when analysing the matches as they may have different
|
||||||
|
lengths.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Match length differences in case insensitive matching.
|
||||||
|
|
||||||
|
term = "strasse"
|
||||||
|
text = "MONUMENTENSTRASSE ist eine große Straße."
|
||||||
|
match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new)
|
||||||
|
term.length == 7
|
||||||
|
match . map .length == [7, 6]
|
||||||
|
|
||||||
|
! Matching Grapheme Clusters
|
||||||
|
In case insensitive mode, a single character can match multiple characters,
|
||||||
|
for example `ß` will match `ss` and `SS`, and the ligature `ﬃ` will match
|
||||||
|
`ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to
|
||||||
|
match only a part of some single grapheme cluster, for example in the text
|
||||||
|
`ﬃa` the term `ia` will match just one-third of the first grapheme `ﬃ`.
|
||||||
|
Since we do not have the resolution to distinguish such partial matches
|
||||||
|
(as that would require non-integer indices), a match which matched just
|
||||||
|
a part of some grapheme cluster is extended and treated as if it matched
|
||||||
|
the whole grapheme cluster.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Extending matches to full grapheme clusters.
|
||||||
|
|
||||||
|
ligatures = "ffifflFFIFF"
|
||||||
|
ligatures.length == 7
|
||||||
|
match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new)
|
||||||
|
match_1 . map .length == [2, 3]
|
||||||
|
match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new)
|
||||||
|
match_2 . map .length == [2, 5]
|
||||||
|
Text.location_of_all : Text -> Matcher -> [Span]
|
||||||
|
Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
|
||||||
|
Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of
|
||||||
|
True ->
|
||||||
|
codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term
|
||||||
|
grapheme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
|
||||||
|
## While the codepoint_spans may have different code unit lengths
|
||||||
|
from our term, the `length` counted in grapheme clusters is
|
||||||
|
guaranteed to be the same.
|
||||||
|
offset = term.length
|
||||||
|
grapheme_ixes . map start->
|
||||||
|
end = start+offset
|
||||||
|
Span (Range start end) this
|
||||||
|
Case_Insensitive locale ->
|
||||||
|
grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
|
||||||
|
grapheme_spans.map grapheme_span->
|
||||||
|
Span (Range grapheme_span.start grapheme_span.end) this
|
||||||
|
Regex_Matcher _ _ _ _ _ ->
|
||||||
|
case matcher.compile term . match this Mode.All of
|
||||||
|
Nothing -> []
|
||||||
|
matches -> matches.map m-> m.span 0 . to_grapheme_span
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
## Matches the first found instance.
|
||||||
|
type First
|
||||||
|
|
||||||
|
## Matches the last found instance.
|
||||||
|
type Last
|
@ -40,7 +40,7 @@ import Standard.Base.Data.Text.Regex.Engine
|
|||||||
import Standard.Base.Data.Text.Regex.Option as Global_Option
|
import Standard.Base.Data.Text.Regex.Option as Global_Option
|
||||||
import Standard.Base.Data.Text.Regex.Mode
|
import Standard.Base.Data.Text.Regex.Mode
|
||||||
import Standard.Base.Polyglot.Java as Java_Ext
|
import Standard.Base.Polyglot.Java as Java_Ext
|
||||||
import Standard.Base.Data.Text.Span
|
from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span
|
||||||
|
|
||||||
from Standard.Builtins import Java
|
from Standard.Builtins import Java
|
||||||
|
|
||||||
@ -183,8 +183,13 @@ type Pattern
|
|||||||
on the encoding, we normalize all input.
|
on the encoding, we normalize all input.
|
||||||
build_matcher : Text -> Integer -> Integer -> Java_Matcher
|
build_matcher : Text -> Integer -> Integer -> Java_Matcher
|
||||||
build_matcher input start end =
|
build_matcher input start end =
|
||||||
normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else
|
## TODO [RW] Normalization had to be disabled - since start and end are
|
||||||
Text_Utils.normalize input
|
in code unit space, normalization could shift these indices!
|
||||||
|
This should be addressed when reviewing
|
||||||
|
See: https://www.pivotaltracker.com/story/show/181524498
|
||||||
|
#normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else
|
||||||
|
# Text_Utils.normalize input
|
||||||
|
normalized_input = input
|
||||||
internal_matcher = this.internal_pattern.matcher normalized_input . region start end
|
internal_matcher = this.internal_pattern.matcher normalized_input . region start end
|
||||||
|
|
||||||
if this.options.contains No_Anchoring_Bounds then
|
if this.options.contains No_Anchoring_Bounds then
|
||||||
@ -262,7 +267,7 @@ type Pattern
|
|||||||
internal_matcher = this.build_matcher input start end
|
internal_matcher = this.build_matcher input start end
|
||||||
|
|
||||||
if internal_matcher . find start . not then Nothing else
|
if internal_matcher . find start . not then Nothing else
|
||||||
Match internal_matcher start end
|
Match internal_matcher start end input
|
||||||
Integer ->
|
Integer ->
|
||||||
if mode < 0 then Panic.throw <|
|
if mode < 0 then Panic.throw <|
|
||||||
Mode_Error "Cannot match a negative number of times."
|
Mode_Error "Cannot match a negative number of times."
|
||||||
@ -272,13 +277,16 @@ type Pattern
|
|||||||
go : Integer -> Integer -> Nothing
|
go : Integer -> Integer -> Nothing
|
||||||
go offset remaining_count =
|
go offset remaining_count =
|
||||||
should_continue = remaining_count > 0
|
should_continue = remaining_count > 0
|
||||||
if should_continue.not || (offset > end) then Nothing else
|
if should_continue.not || (offset >= end) then Nothing else
|
||||||
internal_matcher = this.build_matcher input start end
|
internal_matcher = this.build_matcher input start end
|
||||||
found = internal_matcher.find offset
|
found = internal_matcher.find offset
|
||||||
|
|
||||||
if found.not then Nothing else
|
if found.not then Nothing else
|
||||||
builder.append (Match internal_matcher start end)
|
builder.append (Match internal_matcher start end input)
|
||||||
@Tail_Call go (internal_matcher.end 0) remaining_count-1
|
match_end = internal_matcher.end 0
|
||||||
|
# Ensure progress even if the match is an empty string.
|
||||||
|
new_offset = if match_end > offset then match_end else offset+1
|
||||||
|
@Tail_Call go new_offset remaining_count-1
|
||||||
|
|
||||||
go start mode
|
go start mode
|
||||||
vector = builder.to_vector
|
vector = builder.to_vector
|
||||||
@ -294,8 +302,11 @@ type Pattern
|
|||||||
found = internal_matcher.find offset
|
found = internal_matcher.find offset
|
||||||
|
|
||||||
if found.not then Nothing else
|
if found.not then Nothing else
|
||||||
builder.append (Match internal_matcher start end)
|
builder.append (Match internal_matcher start end input)
|
||||||
@Tail_Call go (internal_matcher.end 0)
|
match_end = internal_matcher.end 0
|
||||||
|
# Ensure progress even if the match is an empty string.
|
||||||
|
new_offset = if match_end > offset then match_end else offset+1
|
||||||
|
@Tail_Call go new_offset
|
||||||
|
|
||||||
go start
|
go start
|
||||||
vector = builder.to_vector
|
vector = builder.to_vector
|
||||||
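The change above guards against an infinite loop when the compiled pattern can match the empty string: the next search offset is bumped by one code unit whenever the previous match did not advance. The same guard expressed in plain `java.util.regex` terms (a hypothetical standalone sketch, not the library's code):

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FindAllDemo {
    /** Collects all matches, stepping forward by one position after an empty match. */
    static List<String> findAll(Pattern pattern, String input) {
        Matcher matcher = pattern.matcher(input);
        List<String> results = new ArrayList<>();
        int offset = 0;
        while (offset <= input.length() && matcher.find(offset)) {
            results.add(matcher.group());
            int matchEnd = matcher.end();
            // Ensure progress even if the match is an empty string.
            offset = matchEnd > offset ? matchEnd : offset + 1;
        }
        return results;
    }

    public static void main(String[] args) {
        // Without the offset bump, a pattern like "a*" would loop forever on "baa".
        System.out.println(findAll(Pattern.compile("a*"), "baa")); // prints [, aa, ]
    }
}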
@ -304,7 +315,7 @@ type Pattern
|
|||||||
Mode.Full ->
|
Mode.Full ->
|
||||||
internal_matcher = this.build_matcher input start end
|
internal_matcher = this.build_matcher input start end
|
||||||
if internal_matcher.matches.not then Nothing else
|
if internal_matcher.matches.not then Nothing else
|
||||||
Match internal_matcher start end
|
Match internal_matcher start end input
|
||||||
Mode.Bounded _ _ _ -> Panic.throw <|
|
Mode.Bounded _ _ _ -> Panic.throw <|
|
||||||
Mode_Error "Modes cannot be recursive."
|
Mode_Error "Modes cannot be recursive."
|
||||||
|
|
||||||
@ -312,7 +323,7 @@ type Pattern
|
|||||||
Mode.Bounded start end sub_mode ->
|
Mode.Bounded start end sub_mode ->
|
||||||
if start < end then do_match_mode sub_mode start end else
|
if start < end then do_match_mode sub_mode start end else
|
||||||
Panic.throw Invalid_Bounds_Error
|
Panic.throw Invalid_Bounds_Error
|
||||||
_ -> do_match_mode mode 0 input.length
|
_ -> do_match_mode mode 0 (Text_Utils.char_length input)
|
||||||
|
|
||||||
## ADVANCED
|
## ADVANCED
|
||||||
|
|
||||||
@ -334,7 +345,7 @@ type Pattern
|
|||||||
pattern.matches input
|
pattern.matches input
|
||||||
matches : Text -> Boolean
|
matches : Text -> Boolean
|
||||||
matches input = case this.match input mode=Mode.Full of
|
matches input = case this.match input mode=Mode.Full of
|
||||||
Match _ _ _ -> True
|
Match _ _ _ _ -> True
|
||||||
Vector.Vector _ -> True
|
Vector.Vector _ -> True
|
||||||
_ -> False
|
_ -> False
|
||||||
|
|
||||||
@ -405,7 +416,7 @@ type Pattern
|
|||||||
find input mode=Mode.All =
|
find input mode=Mode.All =
|
||||||
matches = this.match input mode
|
matches = this.match input mode
|
||||||
case matches of
|
case matches of
|
||||||
Match _ _ _ -> matches.group 0
|
Match _ _ _ _ -> matches.group 0
|
||||||
Vector.Vector _ -> matches.map (_.group 0)
|
Vector.Vector _ -> matches.map (_.group 0)
|
||||||
_ -> matches
|
_ -> matches
|
||||||
|
|
||||||
@ -548,7 +559,7 @@ type Pattern
|
|||||||
internal_matcher.replaceAll replacement
|
internal_matcher.replaceAll replacement
|
||||||
Mode.Full ->
|
Mode.Full ->
|
||||||
case this.match input mode=Mode.Full of
|
case this.match input mode=Mode.Full of
|
||||||
Match _ _ _ -> replacement
|
Match _ _ _ _ -> replacement
|
||||||
Nothing -> input
|
Nothing -> input
|
||||||
Mode.Bounded _ _ _ -> Panic.throw <|
|
Mode.Bounded _ _ _ -> Panic.throw <|
|
||||||
Mode_Error "Modes cannot be recursive."
|
Mode_Error "Modes cannot be recursive."
|
||||||
@ -556,7 +567,7 @@ type Pattern
|
|||||||
case mode of
|
case mode of
|
||||||
Mode.Bounded _ _ _ -> Panic.throw <|
|
Mode.Bounded _ _ _ -> Panic.throw <|
|
||||||
Mode_Error "Bounded replacements are not well-formed."
|
Mode_Error "Bounded replacements are not well-formed."
|
||||||
_ -> do_replace_mode mode 0 input.length
|
_ -> do_replace_mode mode 0 (Text_Utils.char_length input)
|
||||||
|
|
||||||
## The default implementation of the `Data.Text.Regex.Engine.Match` interface.
|
## The default implementation of the `Data.Text.Regex.Engine.Match` interface.
|
||||||
type Match
|
type Match
|
||||||
@ -570,7 +581,8 @@ type Match
|
|||||||
match.
|
match.
|
||||||
- region_start: The start of the region over which the match was made.
|
- region_start: The start of the region over which the match was made.
|
||||||
- region_end: The end of the region over which the match was made.
|
- region_end: The end of the region over which the match was made.
|
||||||
type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer)
|
- input: The input text that was being matched.
|
||||||
|
type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) (input : Text)
|
||||||
|
|
||||||
## Gets the text matched by the group with the provided identifier, or
|
## Gets the text matched by the group with the provided identifier, or
|
||||||
`Nothing` if the group did not participate in the match. If no such group
|
`Nothing` if the group did not participate in the match. If no such group
|
||||||
@ -743,10 +755,10 @@ type Match
|
|||||||
example_Span =
|
example_Span =
|
||||||
match = Examples.match
|
match = Examples.match
|
||||||
match.span 0
|
match.span 0
|
||||||
span : Integer | Text -> Span | Nothing ! Regex.No_Such_Group_Error
|
span : Integer | Text -> Utf_16_Span | Nothing ! Regex.No_Such_Group_Error
|
||||||
span id = case this.group id of
|
span id = case this.group id of
|
||||||
Nothing -> Nothing
|
Nothing -> Nothing
|
||||||
_ -> Span.new (this.start id) (this.end id) (this.group 0)
|
_ -> Utf_16_Span (Range (this.start id) (this.end id)) this.input
|
||||||
|
|
||||||
## Returns the start character index of the match's region.
|
## Returns the start character index of the match's region.
|
||||||
|
|
||||||
|
@ -4,11 +4,13 @@
|
|||||||
to matching on the `Full` content of the input text.
|
to matching on the `Full` content of the input text.
|
||||||
|
|
||||||
from Standard.Base import all
|
from Standard.Base import all
|
||||||
|
from Standard.Base.Data.Text.Matching_Mode import First
|
||||||
|
from Standard.Base.Data.Text.Matching_Mode export First
|
||||||
|
|
||||||
type Mode
|
type Mode
|
||||||
|
|
||||||
## The regex will only match the first instance it finds.
|
## The regex will only match the first instance it finds.
|
||||||
type First
|
First
|
||||||
|
|
||||||
## The regex will match up to some `Integer` number of instances.
|
## The regex will match up to some `Integer` number of instances.
|
||||||
Integer
|
Integer
|
||||||
|
@ -7,30 +7,14 @@
|
|||||||
|
|
||||||
example_span =
|
example_span =
|
||||||
text = "Hello!"
|
text = "Hello!"
|
||||||
Span.new 0 3 text
|
Span (0.up_to 3) text
|
||||||
|
|
||||||
from Standard.Base import all
|
from Standard.Base import all
|
||||||
|
|
||||||
import Standard.Base.Data.Range
|
from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error
|
||||||
|
|
||||||
## Construct a new `Span`.
|
polyglot java import org.enso.base.Text_Utils
|
||||||
|
polyglot java import com.ibm.icu.text.BreakIterator
|
||||||
Arguments:
|
|
||||||
- start: The index of the first character included in the span.
|
|
||||||
- end: The index of the first character after `start` that is _not_ included
|
|
||||||
in the span.
|
|
||||||
- text: The `Text` over which the span exists. This is _optional_.
|
|
||||||
|
|
||||||
> Example
|
|
||||||
Creating a span over the first three characters of the text "hello!".
|
|
||||||
|
|
||||||
import Standard.Base.Data.Text.Span
|
|
||||||
|
|
||||||
example_span =
|
|
||||||
text = "Hello!"
|
|
||||||
Span.new 0 3 text
|
|
||||||
new : Integer -> Integer -> Text | Nothing -> Span
|
|
||||||
new start end text=Nothing = Span (start.up_to end) text
|
|
||||||
|
|
||||||
type Span
|
type Span
|
||||||
|
|
||||||
@ -38,7 +22,7 @@ type Span
|
|||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
- range: The range of characters over which the span exists.
|
- range: The range of characters over which the span exists.
|
||||||
- text: The text over which the span exists. This is _optional_.
|
- text: The text over which the span exists.
|
||||||
|
|
||||||
! What is a Character?
|
! What is a Character?
|
||||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||||
@ -54,7 +38,7 @@ type Span
|
|||||||
text = "Hello!"
|
text = "Hello!"
|
||||||
range = 0.up_to 3
|
range = 0.up_to 3
|
||||||
Span.Span range text
|
Span.Span range text
|
||||||
type Span (range : Range.Range) (text : (Text | Nothing) = Nothing)
|
type Span (range : Range.Range) (text : Text)
|
||||||
|
|
||||||
## The index of the first character included in the span.
|
## The index of the first character included in the span.
|
||||||
|
|
||||||
@ -74,3 +58,112 @@ type Span
|
|||||||
meaning in most text-processing applications.
|
meaning in most text-processing applications.
|
||||||
end : Integer
|
end : Integer
|
||||||
end = this.range.end
|
end = this.range.end
|
||||||
|
|
||||||
|
## The length of the span in extended grapheme clusters.
|
||||||
|
|
||||||
|
! What is a Character?
|
||||||
|
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||||
|
Standard Annex 29. This is the smallest unit that still has semantic
|
||||||
|
meaning in most text-processing applications.
|
||||||
|
length : Integer
|
||||||
|
length = this.range.length
|
||||||
|
|
||||||
|
## Converts the span of extended grapheme clusters to a corresponding span
|
||||||
|
of UTF-16 code units.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Find the span of code units corresponding to the span of extended grapheme clusters.
|
||||||
|
|
||||||
|
text = 'ae\u{301}fz'
|
||||||
|
(Span (Range 1 3) text).to_utf_16_span == (Utf_16_Span (Range 1 4) text)
|
||||||
|
to_utf_16_span : Utf_16_Span
|
||||||
|
to_utf_16_span =
|
||||||
|
Utf_16_Span (here.range_to_char_indices this.text this.range) this.text
|
||||||
|
|
||||||
|
type Utf_16_Span
|
||||||
|
|
||||||
|
## A representation of a span of UTF-16 code units in Enso's `Text` type.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- range: The range of code units over which the span exists.
|
||||||
|
- text: The text over which the span exists.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Creating a span over the first three code units of the text 'a\u{301}bc'.
|
||||||
|
|
||||||
|
import Standard.Base.Data.Text.Span
|
||||||
|
|
||||||
|
example_span =
|
||||||
|
text = 'a\u{301}bc'
|
||||||
|
Span.Utf_16_Span (Range 0 3) text
|
||||||
|
type Utf_16_Span (range : Range.Range) (text : Text)
|
||||||
|
|
||||||
|
## The index of the first code unit included in the span.
|
||||||
|
start : Integer
|
||||||
|
start = this.range.start
|
||||||
|
|
||||||
|
## The index of the first code unit after `start` that is _not_ included in
|
||||||
|
the span.
|
||||||
|
end : Integer
|
||||||
|
end = this.range.end
|
||||||
|
|
||||||
|
## The length of the span in UTF-16 code units.
|
||||||
|
length : Integer
|
||||||
|
length = this.range.length
|
||||||
|
|
||||||
|
## Returns a span of extended grapheme clusters which is the closest
|
||||||
|
approximation of this span of code units.
|
||||||
|
|
||||||
|
The resulting span is extended in such a way that every code unit that
|
||||||
|
was contained by the original span is also contained in a new span. Since
|
||||||
|
some grapheme clusters consist of multiple code units, after the span was
|
||||||
|
extended it may also contain code units which were not contained inside
|
||||||
|
of the original span.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Convert a codepoint span to graphemes and back.
|
||||||
|
|
||||||
|
text = 'a\u{301}e\u{302}o\u{303}'
|
||||||
|
span = Utf_16_Span (Range 1 5) text # The span contains the units [\u{301}, e, \u{302}, o].
|
||||||
|
extended = span.to_grapheme_span
|
||||||
|
extended == Span (Range 0 3) text # The span is extended to the whole string since it contained code units from every grapheme cluster.
|
||||||
|
extended.to_utf_16_span == Utf_16_Span (Range 0 6) text
|
||||||
|
to_grapheme_span : Span
|
||||||
|
to_grapheme_span = if (this.start < 0) || (this.end > Text_Utils.char_length this.text) then Error.throw (Illegal_State_Error "Utf_16_Span indices are out of range of the associated text.") else
|
||||||
|
if this.end < this.start then Error.throw (Illegal_State_Error "Utf_16_Span invariant violation: start <= end") else
|
||||||
|
case this.start == this.end of
|
||||||
|
True ->
|
||||||
|
grapheme_ix = Text_Utils.utf16_index_to_grapheme_index this.text this.start
|
||||||
|
Span (Range grapheme_ix grapheme_ix) this.text
|
||||||
|
False ->
|
||||||
|
grapheme_ixes = Text_Utils.utf16_indices_to_grapheme_indices this.text [this.start, this.end - 1].to_array
|
||||||
|
grapheme_first = grapheme_ixes.at 0
|
||||||
|
grapheme_last = grapheme_ixes.at 1
|
||||||
|
## We find the grapheme index of the last code unit actually contained within our span and set the
|
||||||
|
end grapheme to the first grapheme after that. This ensures that if code units associated with
|
||||||
|
only a part of a grapheme were contained in our original span, the resulting span will be
|
||||||
|
extended to contain this whole grapheme.
|
||||||
|
grapheme_end = grapheme_last + 1
|
||||||
|
Span (Range grapheme_first grapheme_end) this.text
|
||||||
|
|
||||||
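The boundary arithmetic behind `to_grapheme_span` can also be expressed directly with an ICU `BreakIterator`. The sketch below is a hypothetical helper that returns code-unit indices snapped outward to grapheme cluster boundaries (the library goes one step further and converts them to grapheme indices): the start moves back to the boundary of the grapheme containing it, and the end moves forward to the first boundary at or after it.

import com.ibm.icu.text.BreakIterator;

public class GraphemeSnapDemo {
    /** Snaps a UTF-16 span [start, end) of `text` outward to grapheme cluster boundaries. */
    static int[] snapToGraphemes(String text, int start, int end) {
        BreakIterator it = BreakIterator.getCharacterInstance();
        it.setText(text);
        int newStart = it.isBoundary(start) ? start : it.preceding(start);
        int newEnd = end <= newStart ? newStart : (it.isBoundary(end) ? end : it.following(end));
        return new int[] {newStart, newEnd};
    }

    public static void main(String[] args) {
        String text = "a\u0301e\u0302o\u0303"; // three graphemes, six code units
        int[] snapped = snapToGraphemes(text, 1, 5);
        // Expected to print 0 and 6: the span grows to cover every partially-included grapheme,
        // matching the 'a\u{301}e\u{302}o\u{303}' example in the documentation above.
        System.out.println(snapped[0] + " " + snapped[1]);
    }
}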
|
## PRIVATE
|
||||||
|
Utility function taking a range pointing at grapheme clusters and converting
|
||||||
|
to a range on the underlying code units.
|
||||||
|
range_to_char_indices : Text -> Range -> Range ! Index_Out_Of_Bounds_Error
|
||||||
|
range_to_char_indices text range =
|
||||||
|
len = text.length
|
||||||
|
start = if range.start < 0 then range.start + len else range.start
|
||||||
|
end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end)
|
||||||
|
is_valid = (Range 0 len+1).contains
|
||||||
|
|
||||||
|
case (Pair (is_valid start) (is_valid end)) of
|
||||||
|
Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len)
|
||||||
|
Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len)
|
||||||
|
Pair True True ->
|
||||||
|
if start>=end then (Range 0 0) else
|
||||||
|
iterator = BreakIterator.getCharacterInstance
|
||||||
|
iterator.setText text
|
||||||
|
|
||||||
|
start_index = iterator.next start
|
||||||
|
end_index = iterator.next (end - start)
|
||||||
|
Range start_index end_index
|
||||||
|
@ -79,24 +79,24 @@ type Text_Sub_Range
|
|||||||
Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text)
|
Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text)
|
||||||
Before delimiter ->
|
Before delimiter ->
|
||||||
if delimiter.is_empty then (Range 0 0) else
|
if delimiter.is_empty then (Range 0 0) else
|
||||||
index = Text_Utils.index_of text delimiter
|
span = Text_Utils.span_of text delimiter
|
||||||
if index == -1 then (Range 0 (Text_Utils.char_length text)) else
|
if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
|
||||||
(Range 0 index)
|
(Range 0 span.start)
|
||||||
Before_Last delimiter ->
|
Before_Last delimiter ->
|
||||||
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
|
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
|
||||||
index = Text_Utils.last_index_of text delimiter
|
span = Text_Utils.last_span_of text delimiter
|
||||||
if index == -1 then (Range 0 (Text_Utils.char_length text)) else
|
if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else
|
||||||
(Range 0 index)
|
(Range 0 span.start)
|
||||||
After delimiter ->
|
After delimiter ->
|
||||||
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
|
if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else
|
||||||
index = Text_Utils.index_of text delimiter
|
span = Text_Utils.span_of text delimiter
|
||||||
if index == -1 then (Range 0 0) else
|
if span.is_nothing then (Range 0 0) else
|
||||||
(Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text))
|
(Range span.end (Text_Utils.char_length text))
|
||||||
After_Last delimiter ->
|
After_Last delimiter ->
|
||||||
if delimiter.is_empty then (Range 0 0) else
|
if delimiter.is_empty then (Range 0 0) else
|
||||||
index = Text_Utils.last_index_of text delimiter
|
span = Text_Utils.last_span_of text delimiter
|
||||||
if index == -1 then (Range 0 0) else
|
if span.is_nothing then (Range 0 0) else
|
||||||
(Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text))
|
(Range span.end (Text_Utils.char_length text))
|
||||||
While predicate ->
|
While predicate ->
|
||||||
indices = find_sub_range_end text _-> start-> end->
|
indices = find_sub_range_end text _-> start-> end->
|
||||||
predicate (Text_Utils.substring text start end) . not
|
predicate (Text_Utils.substring text start end) . not
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
akka {
|
akka {
|
||||||
loggers = ["akka.event.slf4j.Slf4jLogger"]
|
loggers = ["akka.event.slf4j.Slf4jLogger"]
|
||||||
logging-filter = "akka.event.slf4j.Slf4jLoggingFilter"
|
logging-filter = "akka.event.slf4j.Slf4jLoggingFilter"
|
||||||
version = "2.6.6"
|
version = "2.6.18"
|
||||||
stdout-loglevel = "ERROR"
|
stdout-loglevel = "ERROR"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,11 +1,19 @@
|
|||||||
package org.enso.base;
|
package org.enso.base;
|
||||||
|
|
||||||
import com.ibm.icu.lang.UCharacter;
|
import com.ibm.icu.lang.UCharacter;
|
||||||
|
import com.ibm.icu.text.BreakIterator;
|
||||||
|
import com.ibm.icu.text.CaseMap.Fold;
|
||||||
import com.ibm.icu.text.Normalizer;
|
import com.ibm.icu.text.Normalizer;
|
||||||
import com.ibm.icu.text.Normalizer2;
|
import com.ibm.icu.text.Normalizer2;
|
||||||
import com.ibm.icu.text.StringSearch;
|
import com.ibm.icu.text.StringSearch;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
import org.enso.base.text.CaseFoldedString;
|
||||||
|
import org.enso.base.text.GraphemeSpan;
|
||||||
|
import org.enso.base.text.Utf16Span;
|
||||||
|
|
||||||
/** Utils for standard library operations on Text. */
|
/** Utils for standard library operations on Text. */
|
||||||
public class Text_Utils {
|
public class Text_Utils {
|
||||||
@ -117,6 +125,23 @@ public class Text_Utils {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks whether two strings are equal up to Unicode canonicalization and ignoring case.
|
||||||
|
*
|
||||||
|
* @param str1 the first string
|
||||||
|
* @param str2 the second string
|
||||||
|
* @param locale the locale to use for case folding
|
||||||
|
* @return the result of comparison
|
||||||
|
*/
|
||||||
|
public static boolean equals_ignore_case(String str1, Object str2, Locale locale) {
|
||||||
|
if (str2 instanceof String) {
|
||||||
|
Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
|
||||||
|
return compare_normalized(fold.apply(str1), fold.apply((String) str2)) == 0;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts an array of codepoints into a string.
|
* Converts an array of codepoints into a string.
|
||||||
*
|
*
|
||||||
@ -176,6 +201,36 @@ public class Text_Utils {
|
|||||||
return searcher.first() != StringSearch.DONE;
|
return searcher.first() != StringSearch.DONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if {@code substring} is a substring of {@code string}, ignoring case.
|
||||||
|
*
|
||||||
|
* @param string the containing string.
|
||||||
|
* @param substring the contained string.
* @param locale the locale to use for case folding.
|
||||||
|
* @return whether {@code substring} occurs within {@code string} when compared case-insensitively.
|
||||||
|
*/
|
||||||
|
public static boolean contains_case_insensitive(String string, String substring, Locale locale) {
|
||||||
|
// {@code StringSearch} does not handle empty strings as we would want, so we need these special
|
||||||
|
// cases.
|
||||||
|
if (substring.isEmpty()) return true;
|
||||||
|
if (string.isEmpty()) return false;
|
||||||
|
|
||||||
|
Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
|
||||||
|
StringSearch searcher = new StringSearch(fold.apply(substring), fold.apply(string));
|
||||||
|
return searcher.first() != StringSearch.DONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Transforms the provided string into a form which can be used for case insensitive comparisons.
|
||||||
|
*
|
||||||
|
* @param string the string to transform
|
||||||
|
* @param locale the locale to use - needed to distinguish a special case when handling Turkish
|
||||||
|
* 'i' characters
|
||||||
|
* @return a transformed string that can be used for case insensitive comparisons
|
||||||
|
*/
|
||||||
|
public static String case_insensitive_key(String string, Locale locale) {
|
||||||
|
return CaseFoldedString.simpleFold(string, locale);
|
||||||
|
}
|
||||||
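A quick usage sketch of the two helpers above for the German sharp s, which expands under full case folding (expected results are stated in the comments, not copied from the library's tests):

import java.util.Locale;
import org.enso.base.Text_Utils;

public class FoldingDemo {
    public static void main(String[] args) {
        // "ß" case-folds to "ss", so the folded key of "Straße" should be "strasse"...
        System.out.println(Text_Utils.case_insensitive_key("Straße", Locale.ROOT));
        // ...and a case-insensitive containment check against "STRASSE" should succeed.
        System.out.println(Text_Utils.contains_case_insensitive("Straße 42", "STRASSE", Locale.ROOT));
    }
}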
|
|
||||||
/**
|
/**
|
||||||
* Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}.
|
* Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}.
|
||||||
*
|
*
|
||||||
@ -200,37 +255,215 @@ public class Text_Utils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find the first index of needle in the haystack
|
* Find the first occurrence of needle in the haystack
|
||||||
*
|
*
|
||||||
* @param haystack the string to search
|
* @param haystack the string to search
|
||||||
* @param needle the substring that is searched for
|
* @param needle the substring that is searched for
|
||||||
* @return index of the first needle or -1 if not found.
|
* @return a UTF-16 code unit span of the first needle or null if not found.
|
||||||
*/
|
*/
|
||||||
public static long index_of(String haystack, String needle) {
|
public static Utf16Span span_of(String haystack, String needle) {
|
||||||
|
if (needle.isEmpty()) return new Utf16Span(0, 0);
|
||||||
|
if (haystack.isEmpty()) return null;
|
||||||
|
|
||||||
StringSearch search = new StringSearch(needle, haystack);
|
StringSearch search = new StringSearch(needle, haystack);
|
||||||
int pos = search.first();
|
int pos = search.first();
|
||||||
return pos == StringSearch.DONE ? -1 : pos;
|
if (pos == StringSearch.DONE) return null;
|
||||||
|
return new Utf16Span(pos, pos + search.getMatchLength());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find the last index of needle in the haystack
|
* Find the last occurrence of needle in the haystack
|
||||||
*
|
*
|
||||||
* @param haystack the string to search
|
* @param haystack the string to search
|
||||||
* @param needle the substring that is searched for
|
* @param needle the substring that is searched for
|
||||||
* @return index of the last needle or -1 if not found.
|
* @return a UTF-16 code unit span of the last needle or null if not found.
|
||||||
*/
|
*/
|
||||||
public static long last_index_of(String haystack, String needle) {
|
public static Utf16Span last_span_of(String haystack, String needle) {
|
||||||
|
if (needle.isEmpty()) {
|
||||||
|
int afterLast = haystack.length();
|
||||||
|
return new Utf16Span(afterLast, afterLast);
|
||||||
|
}
|
||||||
|
if (haystack.isEmpty()) return null;
|
||||||
|
|
||||||
StringSearch search = new StringSearch(needle, haystack);
|
StringSearch search = new StringSearch(needle, haystack);
|
||||||
int pos = search.first();
|
int pos = search.last();
|
||||||
|
if (pos == StringSearch.DONE) return null;
|
||||||
|
return new Utf16Span(pos, pos + search.getMatchLength());
|
||||||
|
}
|
||||||
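As a usage sketch (assuming `Utf16Span` exposes public `start` and `end` fields, which is how the Enso code earlier in this diff reads it), the two helpers report the first and last occurrence as half-open UTF-16 code unit ranges:

import org.enso.base.Text_Utils;
import org.enso.base.text.Utf16Span;

public class SpanOfDemo {
    public static void main(String[] args) {
        Utf16Span first = Text_Utils.span_of("Hello World!", "o");
        Utf16Span last = Text_Utils.last_span_of("Hello World!", "o");
        // Expected: first covers [4, 5) and last covers [7, 8),
        // mirroring the `location_of` examples in the Enso documentation above.
        System.out.println(first.start + " " + first.end);
        System.out.println(last.start + " " + last.end);
        // A missing needle yields null instead of a span.
        System.out.println(Text_Utils.span_of("Hello World!", "J"));
    }
}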
|
|
||||||
|
/**
|
||||||
|
* Find spans of all occurrences of the needle within the haystack.
|
||||||
|
*
|
||||||
|
* @param haystack the string to search
|
||||||
|
* @param needle the substring that is searched for
|
||||||
|
* @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
|
||||||
|
*/
|
||||||
|
public static List<Utf16Span> span_of_all(String haystack, String needle) {
|
||||||
|
if (needle.isEmpty())
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"The operation `index_of_all` does not support searching for an empty term.");
|
||||||
|
if (haystack.isEmpty()) return List.of();
|
||||||
|
|
||||||
|
StringSearch search = new StringSearch(needle, haystack);
|
||||||
|
ArrayList<Utf16Span> occurrences = new ArrayList<>();
|
||||||
|
long ix;
|
||||||
|
while ((ix = search.next()) != StringSearch.DONE) {
|
||||||
|
occurrences.add(new Utf16Span(ix, ix + search.getMatchLength()));
|
||||||
|
}
|
||||||
|
return occurrences;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a UTF-16 code unit index to index of the grapheme that this code unit belongs to.
|
||||||
|
*
|
||||||
|
* @param text the text associated with the index
|
||||||
|
* @param codeunit_index the UTF-16 index
|
||||||
|
* @return an index of an extended grapheme cluster that contains the code unit from the input
|
||||||
|
*/
|
||||||
|
public static long utf16_index_to_grapheme_index(String text, long codeunit_index) {
|
||||||
|
BreakIterator breakIterator = BreakIterator.getCharacterInstance();
|
||||||
|
breakIterator.setText(text);
|
||||||
|
if (codeunit_index < 0 || codeunit_index > text.length()) {
|
||||||
|
throw new IndexOutOfBoundsException(
|
||||||
|
"Index " + codeunit_index + " is outside of the provided text.");
|
||||||
|
}
|
||||||
|
|
||||||
|
int grapheme_end = breakIterator.next();
|
||||||
|
long grapheme_index = 0;
|
||||||
|
|
||||||
|
while (grapheme_end <= codeunit_index && grapheme_end != BreakIterator.DONE) {
|
||||||
|
grapheme_index++;
|
||||||
|
grapheme_end = breakIterator.next();
|
||||||
|
}
|
||||||
|
return grapheme_index;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a series of UTF-16 code unit indices to indices of graphemes that these code units
|
||||||
|
* belong to.
|
||||||
|
*
|
||||||
|
* <p>For performance, it assumes that the provided indices are sorted in a non-decreasing order
|
||||||
|
* (duplicate entries are permitted). Behaviour is unspecified if an unsorted list is provided.
|
||||||
|
*
|
||||||
|
* <p>The behaviour is unspecified if indices provided on the input are outside of the range [0,
|
||||||
|
* text.length()].
|
||||||
|
*
|
||||||
|
* @param text the text associated with the indices
|
||||||
|
* @param codeunit_indices the array of UTF-16 code unit indices, sorted in non-decreasing order
|
||||||
|
* @return an array of grapheme indices corresponding to the UTF-16 units from the input
|
||||||
|
*/
|
||||||
|
public static long[] utf16_indices_to_grapheme_indices(String text, List<Long> codeunit_indices) {
|
||||||
|
BreakIterator breakIterator = BreakIterator.getCharacterInstance();
|
||||||
|
breakIterator.setText(text);
|
||||||
|
|
||||||
|
int grapheme_end = breakIterator.next();
|
||||||
|
long grapheme_index = 0;
|
||||||
|
|
||||||
|
long[] result = new long[codeunit_indices.size()];
|
||||||
|
int result_ix = 0;
|
||||||
|
|
||||||
|
for (long codeunit_index : codeunit_indices) {
|
||||||
|
while (grapheme_end <= codeunit_index && grapheme_end != BreakIterator.DONE) {
|
||||||
|
grapheme_index++;
|
||||||
|
grapheme_end = breakIterator.next();
|
||||||
|
}
|
||||||
|
result[result_ix++] = grapheme_index;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the first or last occurrence of needle in the haystack.
|
||||||
|
*
|
||||||
|
* @param haystack the string to search
|
||||||
|
* @param needle the substring that is searched for
|
||||||
|
* @param locale the locale used for case-insensitive comparisons
|
||||||
|
* @param searchForLast if set to true, will search for the last occurrence; otherwise searches
|
||||||
|
* for the first one
|
||||||
|
* @return an extended-grapheme-cluster span of the first or last needle, or null if none found.
|
||||||
|
*/
|
||||||
|
public static GraphemeSpan span_of_case_insensitive(
|
||||||
|
String haystack, String needle, Locale locale, boolean searchForLast) {
|
||||||
|
if (needle.isEmpty())
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"The operation `span_of_case_insensitive` does not support searching for an empty term.");
|
||||||
|
if (haystack.isEmpty()) return null;
|
||||||
|
|
||||||
|
CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
|
||||||
|
String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
|
||||||
|
StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString());
|
||||||
|
int pos;
|
||||||
|
if (searchForLast) {
|
||||||
|
pos = search.last();
|
||||||
|
} else {
|
||||||
|
pos = search.first();
|
||||||
|
}
|
||||||
if (pos == StringSearch.DONE) {
|
if (pos == StringSearch.DONE) {
|
||||||
return -1;
|
return null;
|
||||||
|
} else {
|
||||||
|
return findExtendedSpan(foldedHaystack, pos, search.getMatchLength());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find all occurrences of needle in the haystack
|
||||||
|
*
|
||||||
|
* @param haystack the string to search
|
||||||
|
* @param needle the substring that is searched for
|
||||||
|
* @param locale the locale used for case-insensitive comparisons
|
||||||
|
* @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack
|
||||||
|
*/
|
||||||
|
public static List<GraphemeSpan> span_of_all_case_insensitive(
|
||||||
|
String haystack, String needle, Locale locale) {
|
||||||
|
if (needle.isEmpty())
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"The operation `span_of_all_case_insensitive` does not support searching for an empty term.");
|
||||||
|
if (haystack.isEmpty()) return List.of();
|
||||||
|
|
||||||
|
CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
|
||||||
|
String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
|
||||||
|
|
||||||
|
StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString());
|
||||||
|
ArrayList<GraphemeSpan> result = new ArrayList<>();
|
||||||
|
|
||||||
|
int pos;
|
||||||
|
while ((pos = search.next()) != StringSearch.DONE) {
|
||||||
|
result.add(findExtendedSpan(foldedHaystack, pos, search.getMatchLength()));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int next = search.next(); next != StringSearch.DONE; next = search.next()) {
|
return result;
|
||||||
pos = next;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return pos;
|
/**
|
||||||
|
* Finds the grapheme span corresponding to the found match indexed with code units.
|
||||||
|
*
|
||||||
|
* <p>It extends the found span to ensure that graphemes associated with all found code units are
|
||||||
|
* included in the resulting span. Thus, some additional code units which were not present in the
|
||||||
|
* original match may also be present due to the extension.
|
||||||
|
*
|
||||||
|
* <p>The extension to the left is trivial - we just find the grapheme associated with the first
|
||||||
|
* code unit and even if that code unit is not the first one of that grapheme, by returning it we
|
||||||
|
* correctly extend to the left. The extension to the right works by finding the index of the
|
||||||
|
* grapheme associated with the last code unit actually present in the span, then the end of the
|
||||||
|
* returned span is set to the next grapheme after it. This correctly handles the edge case where
|
||||||
|
* only a part of some grapheme was matched.
|
||||||
|
*
|
||||||
|
* @param string the folded string with which the positions are associated, containing a cache of
|
||||||
|
* position mappings
|
||||||
|
* @param position the position of the match (in code units)
|
||||||
|
* @param length the length of the match (in code units)
|
||||||
|
* @return a minimal {@code GraphemeSpan} which contains all code units from the match
|
||||||
|
*/
|
||||||
|
private static GraphemeSpan findExtendedSpan(CaseFoldedString string, int position, int length) {
|
||||||
|
int firstGrapheme = string.codeUnitToGraphemeIndex(position);
|
||||||
|
if (length == 0) {
|
||||||
|
return new GraphemeSpan(firstGrapheme, firstGrapheme);
|
||||||
|
} else {
|
||||||
|
int lastGrapheme = string.codeUnitToGraphemeIndex(position + length - 1);
|
||||||
|
int endGrapheme = lastGrapheme + 1;
|
||||||
|
return new GraphemeSpan(firstGrapheme, endGrapheme);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -0,0 +1,135 @@
package org.enso.base.text;

import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.CaseMap.Fold;
import java.util.Locale;

/**
 * Represents a string transformed using Unicode Case Folding which can be used for case insensitive
 * comparisons.
 *
 * <p>It contains facilities for converting indices in the transformed string to corresponding
 * indices back in the original string.
 */
public class CaseFoldedString {
  private final String foldedString;

  /**
   * A mapping from code units in the transformed string to their corresponding graphemes in the
   * original string.
   *
   * <p>The mapping must be valid for indices from 0 to {@code foldedString.length()+1}
   * (inclusive).
   */
  private final int[] graphemeIndexMapping;

  /**
   * Constructs a new instance of the folded string.
   *
   * @param foldedString the string after applying the case folding transformation
   * @param graphemeIndexMapping a mapping created during the transformation which maps code units
   *     in the transformed string to their corresponding graphemes in the original string
   */
  private CaseFoldedString(String foldedString, int[] graphemeIndexMapping) {
    this.foldedString = foldedString;
    this.graphemeIndexMapping = graphemeIndexMapping;
  }

  /**
   * Maps a code unit in the folded string to the corresponding grapheme in the original string.
   *
   * @param codeunitIndex the index of the code unit in the folded string; valid indices range from
   *     0 to {@code getFoldedString().length()+1} (inclusive), allowing to also ask for the
   *     position of the end code unit which is located right after the end of the string - which
   *     should always map to the analogous end grapheme.
   * @return the index of the grapheme from the original string that after applying the
   *     transformation contains the requested code unit
   */
  public int codeUnitToGraphemeIndex(int codeunitIndex) {
    if (codeunitIndex < 0 || codeunitIndex > this.foldedString.length()) {
      throw new IndexOutOfBoundsException(codeunitIndex);
    }
    return graphemeIndexMapping[codeunitIndex];
  }

  /** Returns the transformed string. */
  public String getFoldedString() {
    return foldedString;
  }

  /**
   * Folds a string remembering the mapping from code units to its original grapheme cluster
   * indices.
   *
   * @param charSequence a sequence of UTF-16 characters to transform
   * @param locale the locale to use as a reference for case folding; it is needed because Turkish
   *     and Azerbaijani locales handle casing of the letter `i` in a different way than other
   *     locales
   * @return a {@code CaseFoldedString} instance which contains the transformed string and allows
   *     to map its code units to original grapheme clusters
   */
  public static CaseFoldedString fold(CharSequence charSequence, Locale locale) {
    BreakIterator breakIterator = BreakIterator.getCharacterInstance();
    breakIterator.setText(charSequence);
    StringBuilder stringBuilder = new StringBuilder(charSequence.length());
    Fold foldAlgorithm = caseFoldAlgorithmForLocale(locale);
    IntArrayBuilder index_mapping = new IntArrayBuilder(charSequence.length() + 1);

    // We rely on the fact that ICU Case Folding is _not_ context-sensitive, i.e. the mapping of
    // each grapheme cluster is independent of surrounding ones. Regular casing is
    // context-sensitive.
    int current = breakIterator.current();
    int next;
    int grapheme_index = 0;
    while ((next = breakIterator.next()) != BreakIterator.DONE) {
      CharSequence grapheme = new StringSlice(charSequence, current, next);
      String foldedGrapheme = foldAlgorithm.apply(grapheme);
      stringBuilder.append(foldedGrapheme);
      for (int i = 0; i < foldedGrapheme.length(); ++i) {
        index_mapping.add(grapheme_index);
      }

      grapheme_index++;
      current = next;
    }

    // The mapping should also be able to handle a {@code str.length()} query, so we add one more
    // element to the mapping pointing to a non-existent grapheme after the end of the text.
    index_mapping.add(grapheme_index);

    return new CaseFoldedString(
        stringBuilder.toString(), index_mapping.unsafeGetStorageAndInvalidateTheBuilder());
  }

  /**
   * A helper function which folds the string without remembering the index mapping.
   *
   * <p>It should be used when the index mapping is not needed, as its implementation is much more
   * efficient.
   *
   * @param string a sequence of UTF-16 characters to transform
   * @param locale the locale to use as a reference for case folding; it is needed because Turkish
   *     and Azerbaijani locales handle casing of the letter `i` in a different way than the others
   * @return the folded string
   */
  public static String simpleFold(CharSequence string, Locale locale) {
    return caseFoldAlgorithmForLocale(locale).apply(string);
  }

  private static final Locale AZ_LOCALE = new Locale("az");
  private static final Locale TR_LOCALE = new Locale("tr");

  /**
   * Returns a case folding algorithm appropriate for the given locale.
   *
   * <p>The algorithm is locale-dependent because Turkish and Azerbaijani locales handle casing of
   * the letter `i` in a different way than other locales.
   */
  public static Fold caseFoldAlgorithmForLocale(Locale locale) {
    if (locale.equals(AZ_LOCALE) || locale.equals(TR_LOCALE)) {
      return CaseMap.fold().turkic();
    }
    return CaseMap.fold();
  }
}
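As a quick sanity check of the index mapping, a hypothetical usage sketch (not part of the commit; it assumes ICU4J and this class are on the classpath): folding 'a' + combining acute accent + 'B' keeps the accented grapheme's two code units and lowercases the B, so code units 0 and 1 map back to grapheme 0, code unit 2 to grapheme 1, and the end position to grapheme 2.

import java.util.Locale;
import org.enso.base.text.CaseFoldedString;

class CaseFoldedStringExample {
  public static void main(String[] args) {
    // "a" + COMBINING ACUTE ACCENT is a single grapheme (two code units); "B" is a second grapheme.
    CaseFoldedString folded = CaseFoldedString.fold("a\u0301B", Locale.ROOT);
    System.out.println(folded.getFoldedString());          // "a\u0301b" - only the B changes
    System.out.println(folded.codeUnitToGraphemeIndex(0));  // 0
    System.out.println(folded.codeUnitToGraphemeIndex(1));  // 0 - still inside the accented grapheme
    System.out.println(folded.codeUnitToGraphemeIndex(2));  // 1 - the "b"
    System.out.println(folded.codeUnitToGraphemeIndex(3));  // 2 - the end-of-string position
  }
}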
@ -0,0 +1,28 @@
package org.enso.base.text;

/**
 * Represents a span of characters (understood as extended grapheme clusters) within a Text.
 *
 * <p>The start index indicates the first grapheme of the span and the end index indicates the
 * first grapheme after the end of the span.
 *
 * <p>Represents an empty span if start and end indices are equal. Such an empty span refers to the
 * space just before the grapheme corresponding to index start.
 */
public class GraphemeSpan {

  public final long start, end;

  /**
   * Constructs a span of characters (understood as extended grapheme clusters).
   *
   * @param start index of the first extended grapheme cluster contained within the span (or
   *     location of the span if it is empty)
   * @param end index of the first extended grapheme cluster after start that is not contained
   *     within the span
   */
  public GraphemeSpan(long start, long end) {
    this.start = start;
    this.end = end;
  }
}
@ -0,0 +1,65 @@
package org.enso.base.text;

/** A helper to efficiently build an array of unboxed integers of arbitrary length. */
public class IntArrayBuilder {
  private int[] storage;
  private int length;

  /**
   * Constructs an empty builder with a given initial capacity.
   *
   * @param initialCapacity the initial capacity of the builder, can be used to avoid expanding the
   *     storage if the amount of elements can be estimated in advance.
   */
  public IntArrayBuilder(int initialCapacity) {
    length = 0;
    storage = new int[initialCapacity];
  }

  /** Adds a new element to the array, expanding it if necessary. */
  public void add(int x) {
    if (length >= storage.length) {
      grow();
    }

    storage[length++] = x;
  }

  /**
   * Expands the storage to fit more elements.
   *
   * <p>The storage grows by 50% and is always increased by at least one. The 50% growth is chosen
   * so that the amortized cost of adding a new element to the array stays constant.
   */
  private void grow() {
    int newCapacity = storage.length + (storage.length / 2);
    if (newCapacity <= storage.length) {
      newCapacity = storage.length + 1;
    }

    int[] newStorage = new int[newCapacity];
    System.arraycopy(this.storage, 0, newStorage, 0, length);
    this.storage = newStorage;
  }

  /** Returns the amount of elements already added to the storage. */
  public int getLength() {
    return length;
  }

  /**
   * Returns the underlying storage of the builder.
   *
   * <p>This method avoids copying for performance so it should be used with care. The storage can
   * actually have more elements than were added, so the user should be careful to only query the
   * first {@code getLength()} elements. Querying other elements results in an unspecified result.
   *
   * <p>After calling this method, the builder is invalidated and cannot be used anymore. Any usage
   * of the builder afterwards will result in a {@code NullPointerException}.
   */
  public int[] unsafeGetStorageAndInvalidateTheBuilder() {
    int[] tmp = storage;
    this.storage = null;
    return tmp;
  }
}
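A brief hypothetical usage sketch (not from the commit) showing the intended lifecycle: add elements, read the length, then take the storage exactly once, after which the builder must not be used again.

import org.enso.base.text.IntArrayBuilder;

class IntArrayBuilderExample {
  public static void main(String[] args) {
    IntArrayBuilder builder = new IntArrayBuilder(2);
    for (int i = 0; i < 5; i++) {
      builder.add(i * 10); // the backing array grows by roughly 50% whenever it is full
    }
    int length = builder.getLength(); // 5
    int[] storage = builder.unsafeGetStorageAndInvalidateTheBuilder();
    // Only the first `length` entries are meaningful; the array itself may be longer.
    for (int i = 0; i < length; i++) {
      System.out.println(storage[i]); // 0, 10, 20, 30, 40
    }
  }
}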
@ -0,0 +1,34 @@
package org.enso.base.text;

/** A char sequence which allows to access a slice of another char sequence without copying. */
class StringSlice implements CharSequence {
  private final CharSequence text;
  private final int subStart, subEnd;

  /** Constructs a slice of the given text. */
  public StringSlice(CharSequence text, int start, int end) {
    this.text = text;
    this.subStart = start;
    this.subEnd = end;
  }

  @Override
  public int length() {
    return subEnd - subStart;
  }

  @Override
  public char charAt(int index) {
    return text.charAt(subStart + index);
  }

  @Override
  public CharSequence subSequence(int start, int end) {
    return new StringSlice(text, subStart + start, subStart + end);
  }

  @Override
  public String toString() {
    return text.subSequence(subStart, subEnd).toString();
  }
}
@ -0,0 +1,18 @@
package org.enso.base.text;

/**
 * Represents a span of UTF-16 code units within a String.
 *
 * <p>The start index indicates the first code unit of the span and the end index indicates the
 * first code unit after the end of the span.
 */
public class Utf16Span {

  public final long start, end;

  /** Constructs a span of UTF-16 code units. */
  public Utf16Span(long start, long end) {
    this.start = start;
    this.end = end;
  }
}
@ -6,7 +6,7 @@ import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Regex.Option as Global_Option
import Standard.Base.Data.Text.Span
from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span

polyglot java import java.util.regex.Pattern as Java_Pattern

@ -182,6 +182,22 @@ spec =
            match.at 1 . group 0 . should_equal "ef"
            match.at 2 . group 0 . should_equal "gh"

        Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <|
            pattern = engine.compile "" []
            match_1 = pattern.match "" mode=Mode.All
            match_1.length . should_equal 1
            match_1.at 0 . start 0 . should_equal 0
            match_1.at 0 . end 0 . should_equal 0

            match_2 = pattern.match "ABC" mode=Mode.All
            match_2.length . should_equal 4
            match_2.at 0 . start 0 . should_equal 0
            match_2.at 0 . end 0 . should_equal 0
            match_2.at 1 . start 0 . should_equal 1
            match_2.at 1 . end 0 . should_equal 1
            match_2.at 3 . start 0 . should_equal 3
            match_2.at 3 . end 0 . should_equal 3

    Test.group "The default regex engine's Pattern.find" <|
        engine = Default_Engine.new

@ -261,11 +277,23 @@ spec =
            match.at 1 . should_equal "ef"
            match.at 2 . should_equal "gh"

            match_2 = pattern.find input mode=(Mode.Bounded 2 8 mode=10)
            match_2.length . should_equal 3
            match_2.at 0 . should_equal "cd"
            match_2.at 1 . should_equal "ef"
            match_2.at 2 . should_equal "gh"

            match_3 = pattern.find input mode=(Mode.Bounded 2 8 mode=2)
            match_3.length . should_equal 2
            match_3.at 0 . should_equal "cd"
            match_3.at 1 . should_equal "ef"

        Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
            engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
            engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
            engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"]

            engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=5 . should_equal ["a", "1", "b", "2"]
            engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"]
            engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"]
            engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Mode.Bounded 1 3) . should_equal ["1", "b"]

@ -501,10 +529,10 @@ spec =
            match . should_be_a Default_Engine.Match

        Test.specify "should get the span of a group by index" <|
            match.span 1 . should_equal (Span.new 0 6 input)
            match.span 1 . should_equal (Utf_16_Span (Range 0 6) input)

        Test.specify "should get the span of a group by name" <|
            match.span "letters" . should_equal (Span.new 6 18 input)
            match.span "letters" . should_equal (Utf_16_Span (Range 6 18) input)

        Test.specify "should return Nothing if the group didn't match" <|
            match.span 3 . should_equal Nothing
@ -26,3 +26,4 @@ spec =
        pattern = "http://example.com"
        Regex.escape pattern . should_equal "\Qhttp://example.com\E"

main = Test.Suite.run_main here.spec
@ -2,20 +2,36 @@
from Standard.Base import all
import Standard.Test

import Standard.Base.Data.Text.Span
from Standard.Base.Data.Text.Span as Span_Module import Span, Utf_16_Span

spec = Test.group "Text.Span" <|

    Test.specify "should be able to be created over a text" <|
        text = "Hello!"
        span = Span.new 0 3 text
        span = Span (Range 0 3) text
        span.start . should_equal 0
        span.end . should_equal 3
        span.text . should_equal text

    Test.specify "should be able to be created without a text" <|
        span = Span.new 5 8
        span.start . should_equal 5
        span.end . should_equal 8
        span.text . should_equal Nothing

    Test.specify "should be able to be converted to code units" <|
        text = 'ae\u{301}fz'
        (Span (Range 1 3) text).to_utf_16_span . should_equal (Utf_16_Span (Range 1 4) text)

    Test.specify "should expand to the associated grapheme clusters" <|
        text = 'a\u{301}e\u{302}o\u{303}'
        span = Utf_16_Span (Range 1 5) text
        extended = span.to_grapheme_span
        extended . should_equal (Span (Range 0 3) text)
        extended.to_utf_16_span . should_equal (Utf_16_Span (Range 0 6) text)

        Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
        Utf_16_Span (Range 0 1) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
        Utf_16_Span (Range 0 0) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
        Utf_16_Span (Range 1 1) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
        Utf_16_Span (Range 2 2) text . to_grapheme_span . should_equal (Span (Range 1 1) text)

        Utf_16_Span (Range 0 4) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
        Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
        Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)

main = Test.Suite.run_main here.spec
test/Tests/src/Data/Text/Utils_Spec.enso (new file, 61 lines)
@ -0,0 +1,61 @@
from Standard.Base import all

polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.text.CaseFoldedString

import Standard.Test

polyglot java import com.ibm.icu.text.BreakIterator

spec =
    Test.group "Text_Utils" <|
        kshi = '\u0915\u094D\u0937\u093F'
        facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
        text = "a"+kshi+facepalm+'e\u{301}Z'
        codepoints_to_graphemes = _.flatten <| text.characters.map_with_index ix-> grapheme->
            codepoints_count = grapheme.utf_16.length
            Vector.new codepoints_count _->ix

        Test.specify "should correctly translate a codepoint index to a grapheme index" <|
            codepoints_to_graphemes . each_with_index codepoint_ix-> grapheme_ix->
                found_grapheme_ix = Text_Utils.utf16_index_to_grapheme_index text codepoint_ix
                found_grapheme_ix.should_equal grapheme_ix

            Text_Utils.utf16_index_to_grapheme_index text text.utf_16.length . should_equal text.length
            Text_Utils.utf16_index_to_grapheme_index "" 0 . should_equal 0

            Text_Utils.utf16_index_to_grapheme_index 'ą' 0 . should_equal 0
            Text_Utils.utf16_index_to_grapheme_index 'ą' 1 . should_equal 1

            Text_Utils.utf16_index_to_grapheme_index "aB" 0 . should_equal 0
            Text_Utils.utf16_index_to_grapheme_index "aB" 1 . should_equal 1
            Text_Utils.utf16_index_to_grapheme_index "aB" 2 . should_equal 2

            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 0 . should_equal 0
            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 1 . should_equal 0
            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 2 . should_equal 1

        Test.specify "should correctly translate a series of codepoint indices to grapheme indices in a batch" <|
            translate_indices text ixes =
                Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
            codepoint_indices = Vector.new text.utf_16.length ix->ix
            translate_indices text codepoint_indices . should_equal codepoints_to_graphemes

            translate_indices "" [0] . should_equal [0]
            translate_indices 'ą' [0, 1] . should_equal [0, 1]
            translate_indices "aB" [0, 1, 2] . should_equal [0, 1, 2]
            translate_indices 'a\u{301}' [0, 1, 2] . should_equal [0, 0, 1]

        Test.specify "should correctly case-fold a string and translate codeunits to graphemes" <|
            text = 'a\u{301}AZßffią'
            folded = CaseFoldedString.fold text Locale.default.java_locale
            folded.getFoldedString . should_equal 'a\u{301}azssffią'

            codeunits = Vector.new folded.getFoldedString.utf_16.length+1 ix->ix
            grapheme_ixes = codeunits.map ix->
                folded.codeUnitToGraphemeIndex ix
            grapheme_ixes . should_equal [0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 6]

            Test.expect_panic_with (folded.codeUnitToGraphemeIndex -1) Polyglot_Error
            Test.expect_panic_with (folded.codeUnitToGraphemeIndex folded.getFoldedString.utf_16.length+1) Polyglot_Error

main = Test.Suite.run_main here.spec
@ -4,7 +4,10 @@ from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Locale
import Standard.Base.Data.Text.Split_Kind
from Standard.Base.Data.Text.Span as Span_Module import Span
from Standard.Base.Data.Text.Text_Sub_Range import all
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Matching_Mode
import Standard.Test

type Auto a

@ -87,9 +90,8 @@ spec =
        'e\u0301' . equals_ignore_case 'e\u0303' . should_be_false

        "I" . equals_ignore_case "i" . should_be_true
        "I" . equals_ignore_case "ı" . should_be_true
        "İ" . equals_ignore_case "i" . should_be_false
        "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
        "I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
        "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false

        "Kongressstraße"=="Kongressstrasse" . should_be_false

@ -199,15 +201,20 @@ spec =
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 6) . should_equal 'Wo\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 5) . should_equal 'o\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e\u{302}') . should_equal 'H'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'ê') . should_equal 'H'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} W'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'ö') . should_equal 'He\u{302}llo\u{308} W'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{302}') . should_equal 'llo\u{308} Wo\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'ê') . should_equal 'llo\u{308} Wo\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{308}') . should_equal ''
        'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e') . should_equal ''
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o\u{308}') . should_equal 'rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'ö') . should_equal 'rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o') . should_equal ''
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e\u{302}') . should_equal 'H'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='ê') . should_equal 'H'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range 3 5) . should_equal 'lo\u{308}'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range -3 -1) . should_equal 'ld'

@ -232,6 +239,30 @@ spec =
        '✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 Nothing) . should_equal '😙😉☺'
        '✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 -1) . should_equal '😙😉'

    Test.specify "take should correctly handle edge cases" <|
        "".take First.new . should_equal ""
        "".take Last.new . should_equal ""

        "".take (After "a") . should_equal ""
        "".take (After_Last "a") . should_equal ""
        "".take (Before "a") . should_equal ""
        "".take (Before_Last "a") . should_equal ""

        "".take (After "") . should_equal ""
        "".take (After_Last "") . should_equal ""
        "".take (Before "") . should_equal ""
        "".take (Before_Last "") . should_equal ""

        "".take (While _->True) . should_equal ""

        "".take (Range 0 0) . should_equal ""
        'ABC\u{301}'.take (Range 0 0) . should_equal ""

        'ABC\u{301}'.take (After "") . should_equal 'ABC\u{301}'
        'ABC\u{301}'.take (After_Last "") . should_equal ""
        'ABC\u{301}'.take (Before "") . should_equal ""
        'ABC\u{301}'.take (Before_Last "") . should_equal 'ABC\u{301}'

    Test.specify "drop should work as in the examples" <|
        "Hello World!".drop First.new . should_equal "ello World!"
        "Hello World!".drop (First 5) . should_equal " World!"

@ -269,15 +300,20 @@ spec =
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 6) . should_equal 'He\u{302}llo\u{308} '
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 5) . should_equal 'He\u{302}llo\u{308} W'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'ê') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e') . should_equal ''
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o\u{308}') . should_equal 'o\u{308}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'ö') . should_equal 'o\u{308}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o') . should_equal ''
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{302}') . should_equal 'He\u{302}'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'ê') . should_equal 'He\u{302}'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{308}'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'ö') . should_equal 'He\u{302}llo\u{308} Wo\u{308}'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='ê') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e') . should_equal ''
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range 3 5) . should_equal 'He\u{302}l Wo\u{308}rld!'
        'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range -3 -1) . should_equal 'He\u{302}llo\u{308} Wo\u{308}r!'

@ -301,6 +337,30 @@ spec =
        '✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 Nothing) . should_equal '✨🚀🚧😍😃😍😎'
        '✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 -1) . should_equal '✨🚀🚧😍😃😍😎☺'

    Test.specify "drop should correctly handle edge cases" <|
        "".drop First.new . should_equal ""
        "".drop Last.new . should_equal ""

        "".drop (After "a") . should_equal ""
        "".drop (After_Last "a") . should_equal ""
        "".drop (Before "a") . should_equal ""
        "".drop (Before_Last "a") . should_equal ""

        "".drop (After "") . should_equal ""
        "".drop (After_Last "") . should_equal ""
        "".drop (Before "") . should_equal ""
        "".drop (Before_Last "") . should_equal ""

        "".drop (While _->True) . should_equal ""

        "".drop (Range 0 0) . should_equal ""
        'ABC\u{301}'.drop (Range 0 0) . should_equal 'ABC\u{301}'

        'ABC\u{301}'.drop (After "") . should_equal ''
        'ABC\u{301}'.drop (After_Last "") . should_equal 'ABC\u{301}'
        'ABC\u{301}'.drop (Before "") . should_equal 'ABC\u{301}'
        'ABC\u{301}'.drop (Before_Last "") . should_equal ''

    Test.specify "should correctly convert character case" <|
        "FooBar Baz".to_case Case.Lower . should_equal "foobar baz"
        "FooBar Baz".to_case Case.Upper . should_equal "FOOBAR BAZ"

@ -465,10 +525,7 @@ spec =
        ## This shows what regex is doing by default and we cannot easily fix
           that.
        's\u{301}' . contains 's' (Regex_Matcher.new) . should_be_true
        ## This would normally be false, but we perform input normalization
           to get results that are consistent regardless of if the input was
           normalized or not.
        'ś' . contains 's' (Regex_Matcher.new) . should_be_true
        'ś' . contains 's' (Regex_Matcher.new) . should_be_false
        's\u{301}' . contains 'ś' (Regex_Matcher.new) . should_be_true
        'ś' . contains 's\u{301}' (Regex_Matcher.new) . should_be_true
@ -767,6 +824,157 @@ spec =
        '✨🚀🚧'*2 . should_equal '✨🚀🚧✨🚀🚧'

    Test.specify "location_of should work as shown in examples" <|
        example_1 =
            "Hello World!".location_of "J" == Nothing
            "Hello World!".location_of "o" == Span (Range 4 5) "Hello World!"
            "Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 4 5) "Hello World!"

        example_2 =
            term = "straße"
            text = "MONUMENTENSTRASSE 42"
            match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new)
            term.length . should_equal 6
            match.length . should_equal 7

        example_3 =
            ligatures = "ffiffl"
            ligatures.length . should_equal 2
            term_1 = "IFF"
            match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new)
            term_1.length . should_equal 3
            match_1.length . should_equal 2
            term_2 = "ffiffl"
            match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new)
            term_2.length . should_equal 6
            match_2.length . should_equal 2
            match_1 . should_equal match_2

        example_4 =
            "Hello World!".location_of_all "J" . should_equal []
            "Hello World!".location_of_all "o" . map .start . should_equal [4, 7]

        example_5 =
            term = "strasse"
            text = "MONUMENTENSTRASSE ist eine große Straße."
            match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new)
            term.length . should_equal 7
            match . map .length . should_equal [7, 6]

        example_6 =
            ligatures = "ffifflFFIFF"
            ligatures.length . should_equal 7
            match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new)
            match_1 . map .length . should_equal [2, 3]
            match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new)
            match_2 . map .length . should_equal [2, 5]

        # Put them in blocks to avoid name clashes.
        example_1
        example_2
        example_3
        example_4
        example_5
        example_6

    Test.specify "should allow to find location_of occurrences within a text" <|
        "Hello World!".location_of_all "J" . should_equal []
        "Hello World!".location_of_all "o" . map .start . should_equal [4, 7]

        accents = 'a\u{301}e\u{301}o\u{301}'
        accents.location_of accent_1 . should_equal (Span (Range 1 2) accents)

        "".location_of "foo" . should_equal Nothing
        "".location_of "foo" mode=Matching_Mode.Last . should_equal Nothing
        "".location_of_all "foo" . should_equal []
        "".location_of "" . should_equal (Span (Range 0 0) "")
        "".location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
        "".location_of_all "" . should_equal [Span (Range 0 0) ""]
        abc = 'A\u{301}ßC'
        abc.location_of "" . should_equal (Span (Range 0 0) abc)
        abc.location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)
        abc.location_of_all "" . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]

    Test.specify "should allow case insensitive matching in location_of" <|
        hello = "Hello WORLD!"
        case_insensitive = Text_Matcher Case_Insensitive.new
        hello.location_of "world" . should_equal Nothing
        hello.location_of "world" matcher=case_insensitive . should_equal (Span (Range 6 11) hello)

        hello.location_of "o" mode=Mode.First matcher=case_insensitive . should_equal (Span (Range 4 5) hello)
        hello.location_of "o" mode=Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 7 8) hello)

        accents = 'A\u{301}E\u{301}O\u{301}'
        accents.location_of accent_1 matcher=case_insensitive . should_equal (Span (Range 1 2) accents)

        "Strasse".location_of "ß" matcher=case_insensitive . should_equal (Span (Range 4 6) "Strasse")
        "Monumentenstraße 42".location_of "STRASSE" matcher=case_insensitive . should_equal (Span (Range 10 16) "Monumentenstraße 42")

        '\u0390'.location_of '\u03B9\u0308\u0301' matcher=case_insensitive . should_equal (Span (Range 0 1) '\u0390')
        'ԵՒ'.location_of 'և' . should_equal Nothing
        'ԵՒ'.location_of 'և' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ԵՒ')
        'և'.location_of 'ԵՒ' matcher=case_insensitive . should_equal (Span (Range 0 1) 'և')

        ligatures = 'ffafffiflffifflſtstZ'
        ligatures.location_of 'FFI' matcher=case_insensitive . should_equal (Span (Range 3 5) ligatures)
        ligatures.location_of 'FF' matcher=case_insensitive . should_equal (Span (Range 0 2) ligatures)
        ligatures.location_of 'ff' matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 7 8) ligatures)
        ligatures.location_of_all 'ff' . should_equal [Span (Range 0 2) ligatures]
        ligatures.location_of_all 'FF' matcher=case_insensitive . should_equal [Span (Range 0 2) ligatures, Span (Range 3 4) ligatures, Span (Range 6 7) ligatures, Span (Range 7 8) ligatures]
        ligatures.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 3 5) ligatures, Span (Range 6 7) ligatures]
        'fffi'.location_of_all 'ff' matcher=case_insensitive . should_equal [Span (Range 0 2) 'fffi']
        'fffi'.location_of_all 'ffi' . should_equal []
        'fffi'.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 1 4) 'fffi']
        'FFFI'.location_of 'ffi' matcher=case_insensitive . should_equal (Span (Range 1 4) 'FFFI')

        'ffiffl'.location_of 'IF' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ffiffl')
        'ffiffl'.location_of 'F' Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 1 2) 'ffiffl')
        'ffiffl'.location_of_all 'F' matcher=case_insensitive . should_equal [Span (Range 0 1) 'ffiffl', Span (Range 0 1) 'ffiffl', Span (Range 1 2) 'ffiffl', Span (Range 1 2) 'ffiffl']
        'aaffibb'.location_of_all 'af' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaffibb']
        'aaffibb'.location_of_all 'affi' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaffibb']
        'aaffibb'.location_of_all 'ib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaffibb']
        'aaffibb'.location_of_all 'ffib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaffibb']

        "".location_of "foo" matcher=case_insensitive . should_equal Nothing
        "".location_of "foo" matcher=case_insensitive mode=Matching_Mode.Last . should_equal Nothing
        "".location_of_all "foo" matcher=case_insensitive . should_equal []
        "".location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) "")
        "".location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
        "".location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) ""]
        abc = 'A\u{301}ßC'
        abc.location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) abc)
        abc.location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)
        abc.location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]

    Test.specify "should allow regexes in location_of" <|
        hello = "Hello World!"
        regex = Regex_Matcher.new
        regex_insensitive = Regex_Matcher.new case_sensitive=Case_Insensitive.new
        hello.location_of ".o" Matching_Mode.First matcher=regex . should_equal (Span (Range 3 5) hello)
        hello.location_of ".o" Matching_Mode.Last matcher=regex . should_equal (Span (Range 6 8) hello)
        hello.location_of_all ".o" matcher=regex . map .start . should_equal [3, 6]

        "foobar".location_of "BAR" Mode.First matcher=regex_insensitive . should_equal (Span (Range 3 6) "foobar")

        ## Regex matching does not do case folding
        "Strasse".location_of "ß" Mode.First matcher=regex_insensitive . should_equal Nothing

        ## But it should handle the Unicode normalization
        accents = 'a\u{301}e\u{301}o\u{301}'
        accents.location_of accent_1 Mode.First matcher=regex . should_equal (Span (Range 1 2) accents)

    Test.specify "should correctly handle regex edge cases in location_of" pending="Figure out how to make Regex correctly handle empty patterns." <|
        regex = Regex_Matcher.new
        "".location_of "foo" matcher=regex . should_equal Nothing
        "".location_of "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing
        "".location_of_all "foo" matcher=regex . should_equal []
        "".location_of "" matcher=regex . should_equal (Span (Range 0 0) "")
        "".location_of_all "" matcher=regex . should_equal [Span (Range 0 0) ""]
        "".location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "")
        abc = 'A\u{301}ßC'
        abc.location_of "" matcher=regex . should_equal (Span (Range 0 0) abc)
        abc.location_of_all "" matcher=regex . should_equal [Span (Range 0 0) abc, Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc]
        abc.location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc)

    Test.group "Regex matching" <|
        Test.specify "should be possible on text" <|
            match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First
@ -128,3 +128,4 @@ spec = Test.group "Examples" <|
        match.groups.length . should_equal 5
        match.named_groups.size . should_equal 2

main = Test.Suite.run_main here.spec
@ -34,6 +34,7 @@ import project.Data.Text_Spec
import project.Data.Time.Spec as Time_Spec
import project.Data.Vector_Spec
import project.Data.Text.Regex_Spec
import project.Data.Text.Utils_Spec
import project.Data.Text.Default_Regex_Engine_Spec
import project.Data.Text.Matching_Spec
import project.Data.Text.Span_Spec

@ -87,6 +88,7 @@ main = Test.Suite.run_main <|
    Runtime_Spec.spec
    Span_Spec.spec
    Stack_Traces_Spec.spec
    Utils_Spec.spec
    Text_Spec.spec
    Time_Spec.spec
    Uri_Spec.spec