Support Previous_Value in fill_nothing and fill_empty (#8105)

- Adds `Previous_Value` to `fill_nothing` and `fill_empty`, as requested by #7192.
This commit is contained in:
Radosław Waśko 2023-10-20 15:18:53 +02:00 committed by GitHub
parent 1391dd93f4
commit 8172896065
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 300 additions and 47 deletions

View File

@ -587,6 +587,7 @@
deriving column values in the GUI.][8005]
- [Implemented `Table.expand_to_rows` for the in-memory backend.][8029]
- [Added XML support for `.to Table` and `.expand_column`.][8083]
- [Added `Previous_Value` option to `fill_nothing` and `fill_empty`.][8105]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -837,6 +838,7 @@
[8005]: https://github.com/enso-org/enso/pull/8005
[8029]: https://github.com/enso-org/enso/pull/8029
[8083]: https://github.com/enso-org/enso/pull/8083
[8105]: https://github.com/enso-org/enso/pull/8105
#### Enso Compiler

View File

@ -5,6 +5,7 @@ import Standard.Base.Internal.Rounding_Helpers
from Standard.Base.Widget_Helpers import make_regex_text_widget
import Standard.Table.Data.Column.Column as Materialized_Column
import Standard.Table.Data.Constants.Previous_Value
import Standard.Table.Data.Type.Enso_Types
import Standard.Table.Data.Type.Value_Type_Helpers
import Standard.Table.Internal.Column_Naming_Helper.Column_Naming_Helper
@ -1008,7 +1009,10 @@ type Column
Arguments:
- default: The value to replace missing values with. If this argument
is a column, the value from `default` at the corresponding position
will be used.
will be used. If this argument is `Previous_Value`, the missing values
will be replaced with the previous value in the column. Note that the
first rows may stay `Nothing` if they do not have a previous value to
use.
> Example
Fill missing values in a column with the value 20.5.
@ -1016,8 +1020,10 @@ type Column
import Standard.Examples
example_fill_nothing = Examples.decimal_column.fill_nothing 20.5
fill_nothing : Column | Any -> Column
@default (Widget_Helpers.make_fill_default_value_selector include_custom_text=False)
fill_nothing : Column | Previous_Value | Any -> Column
fill_nothing self default =
if Previous_Value == default then Error.throw (Unsupported_Database_Operation.Error "The Previous_Value argument is currently not supported in the database backend.") else
common_type = Value_Type_Helpers.find_common_type_for_arguments [self, default]
common_type.if_not_error <|
op_result = self.make_binary_op "FILL_NULL" default self.name
@ -1032,7 +1038,9 @@ type Column
Arguments:
- default: The value to replace empty values with. If this argument
is a column, the value from `default` at the corresponding position
will be used.
will be used. If this argument is `Previous_Value`, the empty values
will be replaced with the previous value in the column. Note that the
first rows may stay empty if they do not have a previous value to use.
> Example
Fill empty values in a column with the value "hello".
@ -1040,10 +1048,11 @@ type Column
import Standard.Examples
example_fill_empty = Examples.text_column_1.fill_empty "hello"
fill_empty : Column | Any -> Column
@default (Widget_Helpers.make_fill_default_value_selector include_custom_text=True)
fill_empty : Column | Previous_Value | Any -> Column
fill_empty self default =
Value_Type.expect_text self <|
Value_Type.expect_text default <|
if Previous_Value == default then Error.throw (Unsupported_Database_Operation.Error "The Previous_Value argument is currently not supported in the database backend.") else
Value_Type.expect_text self <| Value_Type.expect_text default <|
result = self.is_empty.iif default self
result.rename self.name

View File

@ -13,6 +13,7 @@ from Standard.Base.Widget_Helpers import make_delimiter_selector
import Standard.Table.Data.Calculations.Column_Operation.Column_Operation
import Standard.Table.Data.Column_Ref.Column_Ref
import Standard.Table.Data.Constants.Previous_Value
import Standard.Table.Data.Expression.Expression
import Standard.Table.Data.Expression.Expression_Error
import Standard.Table.Data.Join_Condition.Join_Condition
@ -2368,16 +2369,19 @@ type Table
match names, or a Vector of these.
- default: The value to replace missing values with. If this argument
is a column, the value from `default` at the corresponding position
will be used.
will be used. If this argument is `Previous_Value`, the missing values
will be replaced with the previous value in the column. Note that the
first rows may stay `Nothing` if they do not have a previous value to
use.
> Example
Fill missing values in two columns with the value 20.5.
fill_nothing = table.fill_nothing ["col0", "col1"] 20.5
@columns Widget_Helpers.make_column_name_vector_selector
@default Widget_Helpers.make_column_ref_by_name_selector
fill_nothing : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Column | Column_Ref | Any -> Table
fill_nothing self (columns : Vector | Text | Integer | Regex) (default : Column | Column_Ref | Any) =
@default (self -> Widget_Helpers.make_fill_default_value_selector column_source=self include_custom_text=False)
fill_nothing : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Column | Column_Ref | Previous_Value | Any -> Table
fill_nothing self (columns : Vector | Text | Integer | Regex) default =
resolved_default = (self:Table_Ref).resolve default
transformer col = col.fill_nothing resolved_default
Table_Helpers.replace_columns_with_transformed_columns self columns transformer
@ -2393,16 +2397,18 @@ type Table
match names, or a Vector of these.
- default: The value to replace empty values with. If this argument
is a column, the value from `default` at the corresponding position
will be used.
will be used. If this argument is `Previous_Value`, the empty values
will be replaced with the previous value in the column. Note that the
first rows may stay empty if they do not have a previous value to use.
> Example
Fill empty values in two columns with the value "hello".
fill_empty = table.fill_empty ["col0", "col1"] "hello"
@columns Widget_Helpers.make_column_name_vector_selector
@default Widget_Helpers.make_column_ref_or_text_value_selector
fill_empty : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Column | Column_Ref | Any -> Table
fill_empty self (columns : Vector | Text | Integer | Regex) (default : Column | Column_Ref | Any) =
@default (self -> Widget_Helpers.make_fill_default_value_selector column_source=self include_custom_text=True)
fill_empty : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Column | Column_Ref | Previous_Value | Any -> Table
fill_empty self (columns : Vector | Text | Integer | Regex) default =
resolved_default = (self:Table_Ref).resolve default
transformer col = col.fill_empty resolved_default
Table_Helpers.replace_columns_with_transformed_columns self columns transformer

View File

@ -11,6 +11,7 @@ import Standard.Base.Internal.Polyglot_Helpers
import Standard.Base.Internal.Rounding_Helpers
from Standard.Base.Widget_Helpers import make_regex_text_widget
import project.Data.Constants.Previous_Value
import project.Data.Data_Formatter.Data_Formatter
import project.Data.Table.Table
import project.Data.Type.Enso_Types
@ -1105,7 +1106,10 @@ type Column
Arguments:
- default: The value to replace missing values with. If this argument
is a column, the value from `default` at the corresponding position
will be used.
will be used. If this argument is `Previous_Value`, the missing values
will be replaced with the previous value in the column. Note that the
first rows may stay `Nothing` if they do not have a previous value to
use.
> Example
Fill missing values in a column with the value 20.5.
@ -1113,8 +1117,10 @@ type Column
import Standard.Examples
example_fill_nothing = Examples.decimal_column.fill_nothing 20.5
fill_nothing : Column | Any -> Column
@default (Widget_Helpers.make_fill_default_value_selector include_custom_text=False)
fill_nothing : Column | Previous_Value | Any -> Column
fill_nothing self default =
if Previous_Value == default then fill_previous self Nothing else
common_type = Value_Type_Helpers.find_common_type_for_arguments [self, default]
common_type.if_not_error <|
storage = self.java_column.getStorage
@ -1137,7 +1143,9 @@ type Column
Arguments:
- default: The value to replace empty values with. If this argument
is a column, the value from `default` at the corresponding position
will be used.
will be used. If this argument is `Previous_Value`, the empty values
will be replaced with the previous value in the column. Note that the
first rows may stay empty if they do not have a previous value to use.
> Example
Fill empty values in a column with the value "hello".
@ -1145,9 +1153,11 @@ type Column
import Standard.Examples
example_fill_empty = Examples.text_column_1.fill_empty "hello"
fill_empty : Column | Any -> Column
@default (Widget_Helpers.make_fill_default_value_selector include_custom_text=True)
fill_empty : Column | Previous_Value | Any -> Column
fill_empty self default =
Value_Type.expect_text self <|
if Previous_Value == default then fill_previous self self.is_empty else
Value_Type.expect_text default <|
result = self.is_empty.iif default self
result.rename self.name
@ -2527,6 +2537,19 @@ naming_helper = Column_Naming_Helper.in_memory
Resolves the default date period for `date_add` depending on the source column value type.
default_date_period column = if column.value_type.has_date then Date_Period.Day else Time_Period.Hour
## PRIVATE
   Replaces every missing entry of a column with the closest preceding
   non-missing entry.

   Arguments:
   - column: The column to fill.
   - is_missing: A boolean column marking which entries count as missing.
     When `Nothing` is passed, the storage falls back to its default notion
     of a missing value (`is_nothing`).
fill_previous column is_missing =
    # A `null` indicator tells the Java storage to use its own missing-value check.
    indicator_storage = case is_missing of
        Nothing -> Nothing
        _ -> is_missing.java_column.getStorage
    source_storage = column.java_column.getStorage
    filled_storage = source_storage.fillMissingFromPrevious indicator_storage
    Column.from_storage column.name filled_storage
## PRIVATE
Conversion method to a Column from a Vector.
Column.from (that:Vector) (name:Text="Vector") = Column.from_vector name that

View File

@ -0,0 +1,4 @@
## Indicates that the operation should use the previous non-missing value
when filling in missing values, for example in `fill_nothing` and
`fill_empty`.
type Previous_Value

View File

@ -18,6 +18,7 @@ import project.Data.Calculations.Column_Operation.Column_Operation
import project.Data.Column as Column_Module
import project.Data.Column.Column
import project.Data.Column_Ref.Column_Ref
import project.Data.Constants.Previous_Value
import project.Data.Data_Formatter.Data_Formatter
import project.Data.Expression.Expression
import project.Data.Expression.Expression_Error
@ -2371,16 +2372,19 @@ type Table
match names, or a Vector of these.
- default: The value to replace missing values with. If this argument
is a column, the value from `default` at the corresponding position
will be used.
will be used. If this argument is `Previous_Value`, the missing values
will be replaced with the previous value in the column. Note that the
first rows may stay `Nothing` if they do not have a previous value to
use.
> Example
Fill missing values in two columns with the value 20.5.
fill_nothing = table.fill_nothing ["col0", "col1"] 20.5
@columns Widget_Helpers.make_column_name_vector_selector
@default Widget_Helpers.make_column_ref_by_name_selector
fill_nothing : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Column | Column_Ref | Any -> Table
fill_nothing self (columns : Vector | Text | Integer | Regex) (default : Column | Column_Ref | Any) =
@default (self -> Widget_Helpers.make_fill_default_value_selector column_source=self include_custom_text=False)
fill_nothing : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Column | Column_Ref | Previous_Value | Any -> Table
fill_nothing self (columns : Vector | Text | Integer | Regex) default =
resolved_default = (self:Table_Ref).resolve default
transformer col = col.fill_nothing resolved_default
Table_Helpers.replace_columns_with_transformed_columns self columns transformer
@ -2396,16 +2400,18 @@ type Table
match names, or a Vector of these.
- default: The value to replace empty values with. If this argument
is a column, the value from `default` at the corresponding position
will be used.
will be used. If this argument is `Previous_Value`, the empty values
will be replaced with the previous value in the column. Note that the
first rows may stay empty if they do not have a previous value to use.
> Example
Fill empty values in two columns with the value "hello".
fill_empty = table.fill_empty ["col0", "col1"] "hello"
@columns Widget_Helpers.make_column_name_vector_selector
@default Widget_Helpers.make_column_ref_or_text_value_selector
fill_empty : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Column | Column_Ref | Any -> Table
fill_empty self (columns : Vector | Text | Integer | Regex) (default : Column | Column_Ref | Any) =
@default (self -> Widget_Helpers.make_fill_default_value_selector column_source=self include_custom_text=True)
fill_empty : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Column | Column_Ref | Previous_Value | Any -> Table
fill_empty self (columns : Vector | Text | Integer | Regex) default =
resolved_default = (self:Table_Ref).resolve default
transformer col = col.fill_empty resolved_default
Table_Helpers.replace_columns_with_transformed_columns self columns transformer

View File

@ -96,6 +96,16 @@ make_column_ref_or_text_value_selector table display=Display.Always =
custom_text_option = Option "'custom text'" "''"
Single_Choice values=(col_names_options+[custom_text_option]) display=display
## PRIVATE
   Builds a single-choice widget offering the possible `default` values for
   the fill operations: always a "Previous Value" entry, then one
   `Column_Ref.Name` entry per column of `column_source`, and finally an
   optional custom text entry.

   If `column_source` is Nothing, `Column_Ref` options will not be added.
make_fill_default_value_selector : Table | Nothing -> Boolean -> Display -> Widget
make_fill_default_value_selector column_source=Nothing include_custom_text=False display=Display.Always =
    to_column_ref_option name = Option name "(Column_Ref.Name "+name.pretty+")"
    previous_value_choices = [Option 'Previous Value' 'Previous_Value']
    column_ref_choices = if column_source.is_nothing then [] else
        column_source.column_names.map to_column_ref_option
    custom_text_choices = if include_custom_text then [Option "'custom text'" "''"] else []
    all_choices = previous_value_choices+column_ref_choices+custom_text_choices
    Single_Choice values=all_choices display=display
## PRIVATE
Make a filter condition selector.
make_filter_condition_selector : Table -> Display -> Widget

View File

@ -23,6 +23,7 @@ import project.Excel.Excel_Range.Excel_Range
import project.Excel.Excel_Section.Excel_Section
import project.Excel.Excel_Workbook.Excel_Workbook
import project.Extensions.Prefix_Name.Prefix_Name
from project.Data.Constants import all
from project.Delimited.Delimited_Format.Delimited_Format import Delimited
from project.Excel.Excel_Format.Excel_Format import Excel
from project.Excel.Excel_Section.Excel_Section import Cell_Range, Range_Names, Sheet_Names, Worksheet
@ -51,6 +52,7 @@ export project.Excel.Excel_Range.Excel_Range
export project.Excel.Excel_Section.Excel_Section
export project.Excel.Excel_Workbook.Excel_Workbook
export project.Extensions.Prefix_Name.Prefix_Name
from project.Data.Constants export all
from project.Delimited.Delimited_Format.Delimited_Format export Delimited
from project.Excel.Excel_Format.Excel_Format export Excel
from project.Excel.Excel_Section.Excel_Section export Cell_Range, Range_Names, Sheet_Names, Worksheet

View File

@ -141,6 +141,39 @@ public final class BoolStorage extends Storage<Boolean> {
}
}
/**
 * Builds a new storage in which every missing entry is replaced by the closest preceding
 * non-missing entry. Leading missing entries, which have nothing to copy from, stay missing.
 *
 * @param missingIndicator must be {@code null}; a custom indicator is rejected
 * @return a fresh, non-negated {@code BoolStorage} of the same size
 * @throws IllegalStateException if a custom missing indicator is supplied
 */
@Override
public Storage<?> fillMissingFromPrevious(BoolStorage missingIndicator) {
  if (missingIndicator != null) {
    throw new IllegalStateException(
        "Custom missing value semantics are not supported by BoolStorage.");
  }

  BitSet filledValues = new BitSet();
  BitSet stillMissing = new BitSet();
  Context ctx = Context.getCurrent();

  boolean carried = false;
  boolean seenAny = false;
  for (int row = 0; row < size; row++) {
    if (!isMissing.get(row)) {
      // getItem is used (rather than reading the raw bits) so the carried value is the logical one.
      carried = getItem(row);
      seenAny = true;
    }

    if (seenAny) {
      filledValues.set(row, carried);
    } else {
      stillMissing.set(row);
    }
    ctx.safepoint();
  }

  return new BoolStorage(filledValues, stillMissing, size, false);
}
@Override
public BoolStorage mask(BitSet mask, int cardinality) {
Context context = Context.getCurrent();

View File

@ -84,6 +84,12 @@ public class MixedStorageFacade extends Storage<Object> {
return underlyingStorage.runVectorizedZip(name, argument, problemAggregator);
}
/**
 * Delegates the fill to the wrapped storage and re-wraps the result in a new facade.
 *
 * @param missingIndicator passed through unchanged to the underlying storage
 * @return a {@code MixedStorageFacade} around the filled underlying storage
 */
@Override
public Storage<?> fillMissingFromPrevious(BoolStorage missingIndicator) {
  return new MixedStorageFacade(underlyingStorage.fillMissingFromPrevious(missingIndicator));
}
@Override
public Storage<Object> mask(BitSet mask, int cardinality) {
Storage<?> newStorage = underlyingStorage.mask(mask, cardinality);

View File

@ -184,6 +184,33 @@ public abstract class SpecializedStorage<T> extends Storage<T> {
return newInstance(newData, size + count);
}
/**
 * Builds a new storage in which every entry deemed missing is replaced by the closest preceding
 * entry that was not deemed missing. Entries with no such predecessor keep their original value.
 *
 * @param missingIndicator optional per-row flags saying which entries count as missing; when
 *     {@code null}, {@code isNa} decides
 * @return a new storage of the same size holding the filled data
 * @throws IllegalArgumentException if the indicator itself contains missing values
 */
@Override
public Storage<T> fillMissingFromPrevious(BoolStorage missingIndicator) {
  final boolean useCustomIndicator = missingIndicator != null;
  if (useCustomIndicator && missingIndicator.countMissing() > 0) {
    throw new IllegalArgumentException(
        "Missing indicator must not contain missing values itself.");
  }

  T[] filled = newUnderlyingArray(size);
  Context ctx = Context.getCurrent();

  T lastSeen = null;
  boolean seenAny = false;
  for (int row = 0; row < size; row++) {
    boolean missingHere = useCustomIndicator ? missingIndicator.getItem(row) : isNa(row);
    if (missingHere) {
      // Nothing to carry over yet: keep whatever was stored at this position.
      filled[row] = seenAny ? lastSeen : data[row];
    } else {
      lastSeen = data[row];
      seenAny = true;
      filled[row] = lastSeen;
    }
    ctx.safepoint();
  }

  return newInstance(filled, size);
}
@Override
public List<Object> toList() {
return new ReadOnlyList<>(this);

View File

@ -451,6 +451,18 @@ public abstract class Storage<T> {
return builder.seal();
}
/**
 * Fills missing values with the closest preceding non-missing value.
 *
 * <p>Entries that have no preceding non-missing value have nothing to be filled with;
 * implementations are expected to leave them as they were.
 *
 * @param missingIndicator Specifies which values should be considered missing. It can be used to
 *     implement custom missing value semantics, like `fill_empty`. It can be set to {@code null}
 *     to just rely on the default semantics of missing values. Some storages may not allow
 *     customizing the semantics.
 * @return a storage containing the filled values
 */
public abstract Storage<?> fillMissingFromPrevious(BoolStorage missingIndicator);
/**
* Return a new storage, containing only the items marked true in the mask.
*

View File

@ -181,6 +181,41 @@ public abstract class AbstractLongStorage extends NumericStorage<Long> {
return ops;
}
/**
 * Builds a new storage in which every missing entry is replaced by the closest preceding
 * non-missing entry. Leading missing entries, which have nothing to copy from, stay missing.
 *
 * @param missingIndicator must be {@code null}; a custom indicator is rejected
 * @return a {@code LongStorage} of the same size and type as this storage
 * @throws IllegalStateException if a custom missing indicator is supplied
 */
@Override
public AbstractLongStorage fillMissingFromPrevious(BoolStorage missingIndicator) {
  if (missingIndicator != null) {
    throw new IllegalStateException(
        "Custom missing value semantics are not supported by AbstractLongStorage.");
  }

  final int length = size();
  final long[] filled = new long[length];
  final BitSet stillMissing = new BitSet();
  final Context ctx = Context.getCurrent();

  long carried = 0;
  boolean seenAny = false;
  for (int row = 0; row < length; row++) {
    if (!isNa(row)) {
      carried = getItem(row);
      seenAny = true;
    }

    if (seenAny) {
      filled[row] = carried;
    } else {
      // No earlier value exists, so the slot keeps its default 0 and is flagged as missing.
      stillMissing.set(row);
    }
    ctx.safepoint();
  }

  return new LongStorage(filled, length, stillMissing, getType());
}
/**
* Return an instance of storage containing the same data but with a wider type.
*

View File

@ -213,6 +213,41 @@ public final class DoubleStorage extends NumericStorage<Double> implements Doubl
return super.fillMissing(arg, commonType, problemAggregator);
}
/**
 * Builds a new storage in which every missing entry is replaced by the closest preceding
 * non-missing entry. Leading missing entries, which have nothing to copy from, stay missing.
 *
 * <p>Values are copied as the raw {@code long} entries of {@code data}, without decoding them.
 *
 * @param missingIndicator must be {@code null}; a custom indicator is rejected
 * @return a new {@code DoubleStorage} of the same size
 * @throws IllegalStateException if a custom missing indicator is supplied
 */
@Override
public DoubleStorage fillMissingFromPrevious(BoolStorage missingIndicator) {
  if (missingIndicator != null) {
    throw new IllegalStateException(
        "Custom missing value semantics are not supported by DoubleStorage.");
  }

  final int length = size();
  final long[] filledRaw = new long[length];
  final BitSet stillMissing = new BitSet();
  final Context ctx = Context.getCurrent();

  long carriedRaw = 0;
  boolean seenAny = false;
  for (int row = 0; row < length; row++) {
    if (!isNa(row)) {
      carriedRaw = data[row];
      seenAny = true;
    }

    if (seenAny) {
      filledRaw[row] = carriedRaw;
    } else {
      // No earlier value exists, so the slot keeps its default 0 and is flagged as missing.
      stillMissing.set(row);
    }
    ctx.safepoint();
  }

  return new DoubleStorage(filledRaw, length, stillMissing);
}
@Override
public Storage<Double> mask(BitSet mask, int cardinality) {
BitSet newMissing = new BitSet();

View File

@ -1,6 +1,6 @@
from Standard.Base import all
from Standard.Table import Value_Type, Column_Ref
from Standard.Table import Value_Type, Column_Ref, Previous_Value
from Standard.Table.Data.Aggregate_Column.Aggregate_Column import Count_Distinct
from Standard.Table.Errors import all
@ -271,10 +271,47 @@ spec setup =
e.value_type.variable_length.should_be_true
Test.specify "should allow setting a default column by reference" <|
t = table_builder [["A", ["x", "", Nothing]], ["B", ["a", "b", "c"]]]
t = table_builder [["A", ["x", "", Nothing]], ["B", ["a", "b", "c"]], ["C", [Nothing, Nothing, "ZZZ"]], ["D", [Nothing, "2", "3"]]]
t1 = t.fill_nothing "A" (Column_Ref.Name "B")
t1.at "A" . to_vector . should_equal ["x", "", "c"]
t2 = t.fill_empty "A" (Column_Ref.Name "B")
t2.at "A" . to_vector . should_equal ["x", "b", "c"]
t3 = t.fill_nothing ["A", "C"] (Column_Ref.Name "D")
t3.at "A" . to_vector . should_equal ["x", "", "3"]
t3.at "B" . to_vector . should_equal ["a", "b", "c"]
t3.at "C" . to_vector . should_equal [Nothing, "2", "ZZZ"]
t3.at "D" . to_vector . should_equal [Nothing, "2", "3"]
if setup.is_database.not then Test.specify "should allow filling rows with previous value" <|
t = table_builder [["A", ["a", "", Nothing, Nothing, "", "b", "c", Nothing]]]
t1 = t.fill_nothing "A" Previous_Value
t1.at "A" . to_vector . should_equal ["a", "", "", "", "", "b", "c", "c"]
t2 = t.fill_empty "A" Previous_Value
t2.at "A" . to_vector . should_equal ["a", "a", "a", "a", "a", "b", "c", "c"]
# May still keep Nothing/empty if there is no previous value
t3 = table_builder [["B", [Nothing, Nothing, "", Nothing, "a", Nothing]]]
t3.fill_nothing "B" Previous_Value . at "B" . to_vector . should_equal [Nothing, Nothing, "", "", "a", "a"]
t3.fill_empty "B" Previous_Value . at "B" . to_vector . should_equal [Nothing, Nothing, "", Nothing, "a", "a"]
# Will work on multiple columns
t4 = table_builder [["A", ["a", "b", Nothing, Nothing]], ["B", [Nothing, 42, Nothing, 123]], ["C", ["", "foo", Nothing, "bar"]], ["D", [True, Nothing, False, Nothing]], ["E", [Nothing, Date.new 2001, Nothing, Nothing]], ["F", [2, Nothing, "a", Nothing]]]
t5 = t4.fill_nothing ["A", "B", "D", "E", "F"] Previous_Value
t5.at "A" . to_vector . should_equal ["a", "b", "b", "b"]
t5.at "B" . to_vector . should_equal [Nothing, 42, 42, 123]
t5.at "D" . to_vector . should_equal [True, True, False, False]
t5.at "E" . to_vector . should_equal [Nothing, Date.new 2001, Date.new 2001, Date.new 2001]
t5.at "F" . to_vector . should_equal [2, 2, "a", "a"]
# C is unchanged
t5.at "C" . to_vector . should_equal ["", "foo", Nothing, "bar"]
if setup.is_database then Test.specify "will for now report that Previous_Value is not supported" <|
t = table_builder [["A", ["a", "", Nothing, Nothing, "", "b", "c", Nothing]]]
t.fill_nothing "A" Previous_Value . should_fail_with Unsupported_Database_Operation
t.fill_empty "A" Previous_Value . should_fail_with Unsupported_Database_Operation

View File

@ -3,6 +3,7 @@ package org.enso.table_test_helpers;
import java.util.BitSet;
import java.util.List;
import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType;
@ -87,6 +88,11 @@ public class ExplodingStorage extends Storage<Long> {
return null;
}
// Stub required by the abstract Storage API: no filling is performed and null is returned,
// matching the neighbouring overrides visible in this test helper.
@Override
public Storage<?> fillMissingFromPrevious(BoolStorage missingIndicator) {
return null;
}
@Override
public Storage<Long> mask(BitSet mask, int cardinality) {
return null;