mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 22:10:15 +03:00
Add Table.parse_text_to_table to convert Text to a Table. (#6294)
This commit is contained in:
parent
8db2ad51a1
commit
22f820feb7
@ -387,6 +387,7 @@
|
||||
- [Implemented `Table.split` and `Table.tokenize` for in-memory tables.][6233]
|
||||
- [Added `trim` and `replace` to `Column`. Enhanced number parsing with support
|
||||
for thousands and decimal point automatic detection.][6253]
|
||||
- [Implemented `Text.parse_to_table`.][6294]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -586,6 +587,7 @@
|
||||
[6218]: https://github.com/enso-org/enso/pull/6218
|
||||
[6233]: https://github.com/enso-org/enso/pull/6233
|
||||
[6253]: https://github.com/enso-org/enso/pull/6253
|
||||
[6294]: https://github.com/enso-org/enso/pull/6294
|
||||
|
||||
#### Enso Compiler
|
||||
|
||||
|
@ -63,7 +63,7 @@ type Pattern
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match_all : Text -> Vector Match ! Type_Error
|
||||
match_all : Text -> Vector Match ! Type_Error | Illegal_Argument
|
||||
match_all self input =
|
||||
Helpers.expect_text input <|
|
||||
pattern_is_empty = self.internal_regex_object.pattern == ''
|
||||
@ -288,13 +288,9 @@ type Pattern
|
||||
groups = self.internal_regex_object.groups
|
||||
|
||||
n = case groups of
|
||||
# If Nothing, there are no named groups
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
_ ->
|
||||
qq = (read_group_map groups name)
|
||||
case qq of
|
||||
Nothing -> Nothing
|
||||
n : Integer -> n
|
||||
# If Nothing, there are no named groups
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
_ -> read_group_map groups name
|
||||
case n of
|
||||
_ : Integer -> n
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
@ -319,6 +315,12 @@ type Pattern
|
||||
map = polyglot_map_to_map self.internal_regex_object.groups
|
||||
map.keys
|
||||
|
||||
## Return a map from group number to group name. Only includes named groups.
|
||||
group_nums_to_names : Map Integer Text
|
||||
group_nums_to_names self =
|
||||
# The regex object's `groups` map is keyed by group name, with the group
# number as the value.
map = polyglot_map_to_map self.internal_regex_object.groups
|
||||
# Swap each (name, number) pair so the result is keyed by group number.
map.transform k-> v-> [v, k]
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Performs the regex match, and iterates through the results. Yields both
|
||||
|
@ -1,14 +1,15 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
|
||||
import Standard.Base.Errors.Common.Type_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
import Standard.Base.Errors.Unimplemented.Unimplemented
|
||||
|
||||
import project.Data.Table.Table
|
||||
import project.Data.Match_Columns.Match_Columns
|
||||
import project.Delimited.Delimited_Format.Delimited_Format
|
||||
import project.Errors.Invalid_JSON_Format
|
||||
import project.Internal.Delimited_Reader
|
||||
import project.Internal.Delimited_Writer
|
||||
import project.Internal.Parse_To_Table
|
||||
|
||||
Table.from (that : Text) (format:Delimited_Format = Delimited_Format.Delimited '\t') (on_problems:Problem_Behavior=Report_Warning) =
|
||||
case format of
|
||||
@ -91,6 +92,30 @@ Table.from_objects value fields=Nothing =
|
||||
_ : Array -> Table.from_objects (Vector.from_polyglot_array value) fields
|
||||
_ -> Error.throw (Illegal_Argument.Error "Invalid value for Table.from_objects. Currently must be one of JS_Object, Vector, Array, Number, Boolean, Text and Nothing are supported (got "+(Meta.get_simple_type_name value)+").")
|
||||
|
||||
## Converts a Text into a Table using a regular expression pattern.
|
||||
|
||||
Each match becomes a row in the table.
|
||||
|
||||
If there are no marked groups, there will be a single column with the
|
||||
whole content of the match. Otherwise, each group becomes a column
|
||||
(with the column name taken from the group name if the group is named in the
|
||||
regex).
|
||||
|
||||
Arguments:
|
||||
- pattern: The pattern used to search within the text.
|
||||
- case_sensitivity: Specifies if the text values should be compared case
|
||||
sensitively.
|
||||
|
||||
- parse_values: Parse any values using the default value parser.
- on_problems: Specifies how problems reported while parsing the values are
handled. Only used when `parse_values` is `True`.
|
||||
|
||||
? Column Names
|
||||
|
||||
If there are no marked groups, the new column will be named `Column`.
|
||||
If the marked groups are named, the names will be used; otherwise the column
|
||||
will be named `Column <N>` where `N` is the number of the marked group.
|
||||
(Group 0 is not included.)
|
||||
Text.parse_to_table : Text -> Text -> Case_Sensitivity -> Boolean -> Problem_Behavior -> Table ! Type_Error | Regex_Syntax_Error | Illegal_Argument
|
||||
Text.parse_to_table self pattern="." case_sensitivity=Case_Sensitivity.Sensitive parse_values=True on_problems=Report_Warning =
|
||||
Parse_To_Table.parse_text_to_table self pattern case_sensitivity parse_values on_problems
|
||||
|
||||
## PRIVATE
|
||||
ADVANCED
|
||||
|
@ -0,0 +1,54 @@
|
||||
from Standard.Base import all
|
||||
|
||||
import project.Data.Column.Column
|
||||
import project.Data.Data_Formatter.Data_Formatter
|
||||
import project.Data.Table.Table
|
||||
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
|
||||
import Standard.Base.Data.Text.Regex
|
||||
import Standard.Base.Data.Text.Regex.Pattern
|
||||
import Standard.Base.Data.Text.Regex.Match.Match
|
||||
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
|
||||
import Standard.Base.Errors.Common.Type_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
import Standard.Base.Errors.Problem_Behavior.Problem_Behavior
|
||||
import Standard.Base.Function.Function
|
||||
|
||||
from project import Value_Type
|
||||
from project.Errors import Duplicate_Output_Column_Names
|
||||
from Standard.Base.Errors.Problem_Behavior.Problem_Behavior import all
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Converts a Text into a Table using a regular expression pattern.
|
||||
|
||||
See `Text.parse_to_table`.

Arguments:
- text: The text to search for matches.
- pattern_string: The regular expression, compiled with `Regex.compile`.
- case_sensitivity: Determines whether the pattern is compiled as
case-insensitive.
- parse_values: If `True`, the resulting table is passed through
`Table.parse`.
- on_problems: Forwarded to `Table.parse`; unused when `parse_values` is
`False`.
|
||||
parse_text_to_table : Text -> Text -> Case_Sensitivity -> Boolean -> Problem_Behavior -> Table ! Type_Error | Regex_Syntax_Error | Illegal_Argument
|
||||
parse_text_to_table text pattern_string="." case_sensitivity=Case_Sensitivity.Sensitive parse_values=True on_problems=Report_Warning =
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
pattern = Regex.compile pattern_string case_insensitive=case_insensitive
|
||||
matches = pattern.match_all text
|
||||
|
||||
# NOTE(review): `group_count` appears to include group 0 (the whole match),
# so a count of 1 means the pattern has no marked groups - confirm.
columns = case pattern.group_count == 1 of
|
||||
True ->
|
||||
## No match groups; each row is a single value consisting of the
|
||||
entire match
|
||||
column_name = "Column"
|
||||
column_values = matches.map (match-> match.text 0)
|
||||
[Column.from_vector column_name column_values]
|
||||
False ->
|
||||
unique = Unique_Name_Strategy.new
|
||||
## Mark named groups used so they take precedence over
|
||||
auto-generated names.
|
||||
unique.mark_used pattern.named_groups
|
||||
## There are match groups; each one becomes a separate value in the
|
||||
row. Group 0 is not included.
|
||||
nums_to_names = pattern.group_nums_to_names
|
||||
# Start at 1 to skip group 0; one column is built per marked group.
1.up_to pattern.group_count . map group_num->
|
||||
column_values = matches.map (match-> match.at group_num)
|
||||
# Unnamed groups get a generated `Column <N>` name made unique against
# the names already marked as used; named groups keep their name as-is.
column_name = case nums_to_names.get group_num of
|
||||
_ : Nothing -> unique.make_unique <| "Column " + group_num.to_text
|
||||
name -> name
|
||||
Column.from_vector column_name column_values
|
||||
|
||||
table = Table.new columns
|
||||
# Parsing is optional; without it the built table is returned as-is.
if parse_values then table.parse on_problems=on_problems else table
|
@ -7,6 +7,7 @@ import project.In_Memory.Builders_Spec
|
||||
import project.In_Memory.Column_Spec
|
||||
import project.In_Memory.Common_Spec
|
||||
import project.In_Memory.Join_Performance_Spec
|
||||
import project.In_Memory.Parse_To_Table_Spec
|
||||
import project.In_Memory.Split_Tokenize_Spec
|
||||
import project.In_Memory.Table_Spec
|
||||
import project.In_Memory.Table_Date_Spec
|
||||
@ -24,5 +25,6 @@ spec =
|
||||
Builders_Spec.spec
|
||||
Join_Performance_Spec.spec
|
||||
Split_Tokenize_Spec.spec
|
||||
Parse_To_Table_Spec.spec
|
||||
|
||||
main = Test_Suite.run_main spec
|
||||
|
72
test/Table_Tests/src/In_Memory/Parse_To_Table_Spec.enso
Normal file
72
test/Table_Tests/src/In_Memory/Parse_To_Table_Spec.enso
Normal file
@ -0,0 +1,72 @@
|
||||
from Standard.Base import all
|
||||
|
||||
import Standard.Base.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
import Standard.Table.Data.Table_Conversions
|
||||
import Standard.Test.Extensions
|
||||
|
||||
from Standard.Table import Table
|
||||
from Standard.Table.Data.Type.Value_Type import Bits, Value_Type
|
||||
from Standard.Table.Errors import Invalid_Value_Type, Column_Count_Exceeded, Duplicate_Output_Column_Names, Missing_Input_Columns
|
||||
from Standard.Test import Test, Test_Suite, Problems
|
||||
from project.Util import all
|
||||
|
||||
# Specs for `Text.parse_to_table`: patterns without groups, with unnamed and
# named groups, case-insensitive matching, value parsing on/off, and the
# errors raised for invalid or empty patterns.
spec =
|
||||
Test.group "Text.parse_to_table" <|
|
||||
Test.specify "text_to_table" <|
|
||||
expected = Table.from_rows ["Column"]
|
||||
[["a"], ["ab12"], ["bt100"], ["c12"], ["d20"], ["q"]]
|
||||
actual = "a 7 ab12 bt100 c12d20q 12".parse_to_table "[a-z]+\d*"
|
||||
actual.should_equal_verbose expected
|
||||
|
||||
Test.group "Text.parse_to_table with groups" <|
|
||||
Test.specify "with groups" <|
|
||||
expected = Table.from_rows ["Column 1", "Column 2"]
|
||||
[["ab", 12], ["bt", 100], ["c", 12], ["d", 20]]
|
||||
actual = "a 7 ab-12 bt-100 c-12d-20q q8 12".parse_to_table "([a-z]+)-(\d*)"
|
||||
actual.should_equal_verbose expected
|
||||
|
||||
Test.specify "with named groups" <|
|
||||
expected = Table.from_rows ["letters", "Column 2"]
|
||||
[["ab", 12], ["bt", 100], ["c", 12], ["d", 20]]
|
||||
actual = "a 7 ab-12 bt-100 c-12d-20q q8 12".parse_to_table "(?<letters>[a-z]+)-(\d*)"
|
||||
actual.should_equal_verbose expected
|
||||
|
||||
Test.group "Text.parse_to_table with case-insensitivity" <|
|
||||
Test.specify "case insensitivity" <|
|
||||
expected = Table.from_rows ["Column 1", "Column 2"]
|
||||
[["a", "B"], ["A", "b"], ["a", "b"], ["A", "B"]]
|
||||
actual = "xy aB Ab ab AB".parse_to_table "(a)(b)" case_sensitivity=Case_Sensitivity.Insensitive
|
||||
actual.should_equal_verbose expected
|
||||
|
||||
Test.group "Text.parse_to_table parsing" <|
|
||||
Test.specify "parsing on" <|
|
||||
expected = Table.from_rows ["Column 1", "Column 2"]
|
||||
[["ab", 12], ["bt", 100], ["c", 12], ["d", 20]]
|
||||
actual = "a 7 ab-12 bt-100 c-12d-20q q8 12".parse_to_table "([a-z]+)-(\d*)"
|
||||
actual.should_equal_verbose expected
|
||||
actual.columns.map .value_type . should_equal [Value_Type.Char Nothing True, Value_Type.Integer Bits.Bits_64]
|
||||
|
||||
Test.specify "parsing on, with a mixed column" <|
|
||||
expected = Table.from_rows ["Column 1", "Column 2"]
|
||||
[["ab", "12"], ["bt", "100"], ["c", "012"], ["d", "20"]]
|
||||
actual = "a 7 ab-12 bt-100 c-012d-20q q8 12".parse_to_table "([a-z]+)-(\d*)"
|
||||
actual.should_equal_verbose expected
|
||||
actual.columns.map .value_type . should_equal [Value_Type.Char Nothing True, Value_Type.Char Nothing True]
|
||||
|
||||
Test.specify "parsing off" <|
|
||||
expected = Table.from_rows ["Column 1", "Column 2"]
|
||||
[["ab", "12"], ["bt", "100"], ["c", "12"], ["d", "20"]]
|
||||
actual = "a 7 ab-12 bt-100 c-12d-20q q8 12".parse_to_table "([a-z]+)-(\d*)" parse_values=False
|
||||
actual.should_equal_verbose expected
|
||||
actual.columns.map .value_type . should_equal [Value_Type.Char Nothing True, Value_Type.Char Nothing True]
|
||||
|
||||
Test.group "Text.parse_to_table errors" <|
|
||||
Test.specify "Regex_Syntax_Error" <|
|
||||
"abc".parse_to_table "(a)(?<<" . should_fail_with Regex_Syntax_Error
|
||||
|
||||
Test.specify "empty pattern" <|
|
||||
"abc".parse_to_table "" . should_fail_with Illegal_Argument
||||
|
||||
# Entry point: runs `spec` through `Test_Suite.run_main`.
main = Test_Suite.run_main spec
|
@ -26,6 +26,12 @@ spec =
|
||||
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
|
||||
Regex.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error
|
||||
|
||||
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax (space in capture group name)" <|
|
||||
Regex.compile "(?<dot s>..)" . should_fail_with Regex_Syntax_Error
|
||||
|
||||
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax (duplicate name)" <|
|
||||
Regex.compile "(?<foo>.)(?<foo>.)" . should_fail_with Regex_Syntax_Error
|
||||
|
||||
Test.specify "should disallow empty patterns in `compile`" <|
|
||||
Regex.compile "" . should_fail_with Illegal_Argument
|
||||
|
||||
@ -360,6 +366,10 @@ spec =
|
||||
match = pattern.match input
|
||||
match . should_be_a Match.Value
|
||||
|
||||
Test.specify "should provide access to info about group names" <|
|
||||
pattern.named_groups.sort . should_equal ["empty", "letters"]
|
||||
pattern.group_nums_to_names . should_equal <| Map.from_vector [[2, "letters"],[4, "empty"]]
|
||||
|
||||
Test.specify "should return the results of all named groups" <|
|
||||
groups = match.named_groups
|
||||
groups.keys.sort . should_equal ["empty", "letters"]
|
||||
|
Loading…
Reference in New Issue
Block a user