Add Table.parse_text_to_table to convert Text to a Table. (#6294)

This commit is contained in:
GregoryTravis 2023-04-21 13:43:19 -04:00 committed by GitHub
parent 8db2ad51a1
commit 22f820feb7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 176 additions and 9 deletions

View File

@ -387,6 +387,7 @@
- [Implemented `Table.split` and `Table.tokenize` for in-memory tables.][6233]
- [Added `trim` and `replace` to `Column`. Enhanced number parsing with support
for thousands and decimal point automatic detection.][6253]
- [Implemented `Text.parse_to_table`.][6294]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -586,6 +587,7 @@
[6218]: https://github.com/enso-org/enso/pull/6218
[6233]: https://github.com/enso-org/enso/pull/6233
[6253]: https://github.com/enso-org/enso/pull/6253
[6294]: https://github.com/enso-org/enso/pull/6294
#### Enso Compiler

View File

@ -63,7 +63,7 @@ type Pattern
Arguments:
- input: The text to match the pattern described by `self` against.
match_all : Text -> Vector Match ! Type_Error
match_all : Text -> Vector Match ! Type_Error | Illegal_Argument
match_all self input =
Helpers.expect_text input <|
pattern_is_empty = self.internal_regex_object.pattern == ''
@ -288,13 +288,9 @@ type Pattern
groups = self.internal_regex_object.groups
n = case groups of
# If Nothing, there are no named groups
Nothing -> Error.throw (No_Such_Group.Error name)
_ ->
qq = (read_group_map groups name)
case qq of
Nothing -> Nothing
n : Integer -> n
# If Nothing, there are no named groups
Nothing -> Error.throw (No_Such_Group.Error name)
_ -> read_group_map groups name
case n of
_ : Integer -> n
Nothing -> Error.throw (No_Such_Group.Error name)
@ -319,6 +315,12 @@ type Pattern
map = polyglot_map_to_map self.internal_regex_object.groups
map.keys
## Build the inverse of the named-group lookup: a map keyed by group number
   whose values are the corresponding group names. Groups that have no name
   do not appear in the result.
group_nums_to_names : Map Integer Text
group_nums_to_names self =
    names_to_nums = polyglot_map_to_map self.internal_regex_object.groups
    names_to_nums.transform name-> num-> [num, name]
## PRIVATE
Performs the regex match, and iterates through the results. Yields both

View File

@ -1,14 +1,15 @@
from Standard.Base import all
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Errors.Unimplemented.Unimplemented
import project.Data.Table.Table
import project.Data.Match_Columns.Match_Columns
import project.Delimited.Delimited_Format.Delimited_Format
import project.Errors.Invalid_JSON_Format
import project.Internal.Delimited_Reader
import project.Internal.Delimited_Writer
import project.Internal.Parse_To_Table
Table.from (that : Text) (format:Delimited_Format = Delimited_Format.Delimited '\t') (on_problems:Problem_Behavior=Report_Warning) =
case format of
@ -91,6 +92,30 @@ Table.from_objects value fields=Nothing =
_ : Array -> Table.from_objects (Vector.from_polyglot_array value) fields
_ -> Error.throw (Illegal_Argument.Error "Invalid value for Table.from_objects. Currently must be one of JS_Object, Vector, Array, Number, Boolean, Text and Nothing are supported (got "+(Meta.get_simple_type_name value)+").")
## Converts a Text into a Table using a regular expression pattern.
Each match becomes a row in the table.
If there are no marked groups, there will be a single column with the
whole content of the match. Otherwise, each group becomes a column
(with the column name taken from the group name if the group is named in the
regex).
Arguments:
- pattern: The pattern used to search within the text.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
- parse_values: Parse any values using the default value parser.
- on_problems: Specifies how problems encountered while parsing the values
are reported. Only relevant when `parse_values` is `True`.
? Column Names
If there are no marked groups, the new column will be named `Column`.
If the marked groups are named, the names will be used; otherwise the column
will be named `Column <N>` where `N` is the number of the marked group.
(Group 0 is not included.)
Text.parse_to_table : Text -> Text -> Case_Sensitivity -> Boolean -> Problem_Behavior -> Table ! Type_Error | Regex_Syntax_Error | Illegal_Argument
Text.parse_to_table self pattern="." case_sensitivity=Case_Sensitivity.Sensitive parse_values=True on_problems=Report_Warning =
Parse_To_Table.parse_text_to_table self pattern case_sensitivity parse_values on_problems
## PRIVATE
ADVANCED

View File

@ -0,0 +1,54 @@
from Standard.Base import all
import project.Data.Column.Column
import project.Data.Data_Formatter.Data_Formatter
import project.Data.Table.Table
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Pattern
import Standard.Base.Data.Text.Regex.Match.Match
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Errors.Problem_Behavior.Problem_Behavior
import Standard.Base.Function.Function
from project import Value_Type
from project.Errors import Duplicate_Output_Column_Names
from Standard.Base.Errors.Problem_Behavior.Problem_Behavior import all
## PRIVATE
   Builds a Table from the regular expression matches found in a Text; every
   match becomes one row. This is the implementation behind
   `Text.parse_to_table`.

   Arguments:
   - text: The text to search.
   - pattern_string: The regular expression that is compiled and matched
     against `text`.
   - case_sensitivity: Determines whether the pattern is compiled as
     case-insensitive.
   - parse_values: If `True`, `parse` is called on the resulting table.
   - on_problems: Passed on to `parse` when `parse_values` is `True`.
parse_text_to_table : Text -> Text -> Case_Sensitivity -> Boolean -> Problem_Behavior -> Table ! Type_Error | Regex_Syntax_Error | Illegal_Argument
parse_text_to_table text pattern_string="." case_sensitivity=Case_Sensitivity.Sensitive parse_values=True on_problems=Report_Warning =
    ignore_case = case_sensitivity.is_case_insensitive_in_memory
    regex = Regex.compile pattern_string case_insensitive=ignore_case
    found = regex.match_all text

    # A group count of 1 means the pattern has no marked groups: the table
    # then consists of a single column holding the whole text of each match.
    columns = if regex.group_count == 1 then [Column.from_vector "Column" (found.map (m-> m.text 0))] else
        namer = Unique_Name_Strategy.new

        # Reserve the names of the named groups up front so that they take
        # precedence over the auto-generated names.
        namer.mark_used regex.named_groups

        # One column per marked group. Group 0 is not included.
        names_by_number = regex.group_nums_to_names
        1.up_to regex.group_count . map group_num->
            values = found.map (m-> m.at group_num)
            name = case names_by_number.get group_num of
                Nothing -> namer.make_unique <| "Column " + group_num.to_text
                group_name -> group_name
            Column.from_vector name values

    unparsed = Table.new columns
    if parse_values then unparsed.parse on_problems=on_problems else unparsed

View File

@ -7,6 +7,7 @@ import project.In_Memory.Builders_Spec
import project.In_Memory.Column_Spec
import project.In_Memory.Common_Spec
import project.In_Memory.Join_Performance_Spec
import project.In_Memory.Parse_To_Table_Spec
import project.In_Memory.Split_Tokenize_Spec
import project.In_Memory.Table_Spec
import project.In_Memory.Table_Date_Spec
@ -24,5 +25,6 @@ spec =
Builders_Spec.spec
Join_Performance_Spec.spec
Split_Tokenize_Spec.spec
Parse_To_Table_Spec.spec
main = Test_Suite.run_main spec

View File

@ -0,0 +1,72 @@
from Standard.Base import all
import Standard.Base.Data.Text.Case_Sensitivity.Case_Sensitivity
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Table.Data.Table_Conversions
import Standard.Test.Extensions
from Standard.Table import Table
from Standard.Table.Data.Type.Value_Type import Bits, Value_Type
from Standard.Table.Errors import Invalid_Value_Type, Column_Count_Exceeded, Duplicate_Output_Column_Names, Missing_Input_Columns
from Standard.Test import Test, Test_Suite, Problems
from project.Util import all
## Specification for `Text.parse_to_table`: whole-match columns, marked and
   named groups, case-insensitive matching, value parsing on and off, and
   error reporting.
spec =
    Test.group "Text.parse_to_table" <|
        Test.specify "text_to_table" <|
            expected = Table.from_rows ["Column"]
                [["a"], ["ab12"], ["bt100"], ["c12"], ["d20"], ["q"]]
            actual = "a 7 ab12 bt100 c12d20q 12".parse_to_table "[a-z]+\d*"
            actual.should_equal_verbose expected

    Test.group "Text.parse_to_table with groups" <|
        Test.specify "with groups" <|
            expected = Table.from_rows ["Column 1", "Column 2"]
                [["ab", 12], ["bt", 100], ["c", 12], ["d", 20]]
            actual = "a 7 ab-12 bt-100 c-12d-20q q8 12".parse_to_table "([a-z]+)-(\d*)"
            actual.should_equal_verbose expected

        Test.specify "with named groups" <|
            expected = Table.from_rows ["letters", "Column 2"]
                [["ab", 12], ["bt", 100], ["c", 12], ["d", 20]]
            actual = "a 7 ab-12 bt-100 c-12d-20q q8 12".parse_to_table "(?<letters>[a-z]+)-(\d*)"
            actual.should_equal_verbose expected

    Test.group "Text.parse_to_table with case-insensitivity" <|
        Test.specify "case insensitivity" <|
            expected = Table.from_rows ["Column 1", "Column 2"]
                [["a", "B"], ["A", "b"], ["a", "b"], ["A", "B"]]
            actual = "xy aB Ab ab AB".parse_to_table "(a)(b)" case_sensitivity=Case_Sensitivity.Insensitive
            actual.should_equal_verbose expected

    Test.group "Text.parse_to_table parsing" <|
        Test.specify "parsing on" <|
            expected = Table.from_rows ["Column 1", "Column 2"]
                [["ab", 12], ["bt", 100], ["c", 12], ["d", 20]]
            actual = "a 7 ab-12 bt-100 c-12d-20q q8 12".parse_to_table "([a-z]+)-(\d*)"
            actual.should_equal_verbose expected
            actual.columns.map .value_type . should_equal [Value_Type.Char Nothing True, Value_Type.Integer Bits.Bits_64]

        # A leading zero ("012") keeps the second column as text.
        Test.specify "parsing on, with a mixed column" <|
            expected = Table.from_rows ["Column 1", "Column 2"]
                [["ab", "12"], ["bt", "100"], ["c", "012"], ["d", "20"]]
            actual = "a 7 ab-12 bt-100 c-012d-20q q8 12".parse_to_table "([a-z]+)-(\d*)"
            actual.should_equal_verbose expected
            actual.columns.map .value_type . should_equal [Value_Type.Char Nothing True, Value_Type.Char Nothing True]

        Test.specify "parsing off" <|
            expected = Table.from_rows ["Column 1", "Column 2"]
                [["ab", "12"], ["bt", "100"], ["c", "12"], ["d", "20"]]
            actual = "a 7 ab-12 bt-100 c-12d-20q q8 12".parse_to_table "([a-z]+)-(\d*)" parse_values=False
            actual.should_equal_verbose expected
            actual.columns.map .value_type . should_equal [Value_Type.Char Nothing True, Value_Type.Char Nothing True]

    Test.group "Text.parse_to_table errors" <|
        Test.specify "Regex_Syntax_Error" <|
            "abc".parse_to_table "(a)(?<<" . should_fail_with Regex_Syntax_Error

        Test.specify "empty pattern" <|
            "abc".parse_to_table "" . should_fail_with Illegal_Argument
# Runs this spec on its own as a test suite.
main = Test_Suite.run_main spec

View File

@ -26,6 +26,12 @@ spec =
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
Regex.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax (space in capture group name)" <|
Regex.compile "(?<dot s>..)" . should_fail_with Regex_Syntax_Error
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax (duplicate name)" <|
Regex.compile "(?<foo>.)(?<foo>.)" . should_fail_with Regex_Syntax_Error
Test.specify "should disallow empty patterns in `compile`" <|
Regex.compile "" . should_fail_with Illegal_Argument
@ -360,6 +366,10 @@ spec =
match = pattern.match input
match . should_be_a Match.Value
Test.specify "should provide access to info about group names" <|
pattern.named_groups.sort . should_equal ["empty", "letters"]
pattern.group_nums_to_names . should_equal <| Map.from_vector [[2, "letters"],[4, "empty"]]
Test.specify "should return the results of all named groups" <|
groups = match.named_groups
groups.keys.sort . should_equal ["empty", "letters"]