mirror of
https://github.com/enso-org/enso.git
synced 2024-11-23 08:08:34 +03:00
Table: Indexes & Joins (#1317)
This commit is contained in:
parent
07190a729c
commit
a40989e7c6
@ -66,11 +66,26 @@ type Json
|
|||||||
to_text : Text
|
to_text : Text
|
||||||
to_text = Internal.render_helper this
|
to_text = Internal.render_helper this
|
||||||
|
|
||||||
|
## Converts this JSON value into plain values by recursively stripping
   the `Json` constructors.

   Arrays and objects are unwrapped element by element (via `map`), while
   `Json.Null` becomes `Nothing`.
unwrap : Any
unwrap = case this of
    Json.Null -> Nothing
    Json.Boolean bool -> bool
    Json.Number num -> num
    Json.String text -> text
    Json.Array elems -> elems.map unwrap
    Json.Object fields -> fields.map unwrap
|
||||||
|
|
||||||
## A failure indicating malformed text input into the JSON parser.
|
## A failure indicating malformed text input into the JSON parser.
|
||||||
|
|
||||||
Check the `message` field for detailed information on the specific failure.
|
Check the `message` field for detailed information on the specific failure.
|
||||||
type Parse_Error message
|
type Parse_Error message
|
||||||
|
|
||||||
|
## Gets the value associated with the given key in this object. Returns
|
||||||
|
`Nothing` if the associated key is not defined.
|
||||||
|
Object.get : Text -> Json | Nothing
|
||||||
|
Object.get field = this.fields.get field
|
||||||
|
|
||||||
## Parses an RFC-8259 compliant JSON text into a `Json` structure.
|
## Parses an RFC-8259 compliant JSON text into a `Json` structure.
|
||||||
parse : Text -> Json ! Parse_Error
|
parse : Text -> Json ! Parse_Error
|
||||||
parse json_text =
|
parse json_text =
|
||||||
|
@ -200,6 +200,14 @@ type Vector
|
|||||||
0.up_to arr.length . each ix-> new_arr.set_at ix (function (arr.at ix))
|
0.up_to arr.length . each ix-> new_arr.set_at ix (function (arr.at ix))
|
||||||
Vector new_arr
|
Vector new_arr
|
||||||
|
|
||||||
|
## Builds a new vector by calling `function` for every position of this
   vector.

   `function` receives the index first and the element stored at that
   index second; its results make up the returned vector.
map_with_index : (Int -> Any -> Any) -> Vector
map_with_index function =
    here.new this.length (ix -> function ix (this.at ix))
|
||||||
|
|
||||||
## Applies a function to each element of the vector.
|
## Applies a function to each element of the vector.
|
||||||
|
|
||||||
Unlike `map`, this method does not return the individual results,
|
Unlike `map`, this method does not return the individual results,
|
||||||
|
@ -10,16 +10,20 @@ type Column
|
|||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
- show_rows: the number of initial rows that should be displayed.
|
- show_rows: the number of initial rows that should be displayed.
|
||||||
display : Integer -> Text
|
- format_terminal: whether ANSI-terminal formatting should be used
|
||||||
display show_rows=10 =
|
display : Integer -> Boolean -> Text
|
||||||
|
display show_rows=10 format_terminal=False =
|
||||||
java_col = this.java_column
|
java_col = this.java_column
|
||||||
|
index = java_col.getIndex []
|
||||||
col_name = java_col.getName []
|
col_name = java_col.getName []
|
||||||
storage = java_col.getStorage []
|
storage = java_col.getStorage []
|
||||||
num_rows = java_col.getSize []
|
num_rows = java_col.getSize []
|
||||||
display_rows = min num_rows show_rows
|
display_rows = min num_rows show_rows
|
||||||
items = Vector.new display_rows num->
|
items = Vector.new display_rows num->
|
||||||
[if storage.isNa [num] then "NA" else here.get_item_string storage num]
|
row = if storage.isNa [num] then "Nothing" else
|
||||||
table = Table.print_table [col_name] items
|
here.get_item_string storage num
|
||||||
|
[index.ilocString [num], row]
|
||||||
|
table = Table.print_table [index.getName [], col_name] items format_terminal
|
||||||
if num_rows - display_rows <= 0 then table else
|
if num_rows - display_rows <= 0 then table else
|
||||||
missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.'
|
missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.'
|
||||||
table + missing
|
table + missing
|
||||||
@ -29,7 +33,8 @@ type Column
|
|||||||
Arguments:
|
Arguments:
|
||||||
- show_rows: the number of initial rows that should be displayed.
|
- show_rows: the number of initial rows that should be displayed.
|
||||||
print show_rows=10 =
|
print show_rows=10 =
|
||||||
IO.println (this.display show_rows)
|
IO.println (this.display show_rows format_terminal=True)
|
||||||
|
IO.println ''
|
||||||
|
|
||||||
## Element-wise equality comparison. Returns a column with results of
|
## Element-wise equality comparison. Returns a column with results of
|
||||||
comparing this column's elements against `other`.
|
comparing this column's elements against `other`.
|
||||||
@ -47,12 +52,18 @@ type Column
|
|||||||
not =
|
not =
|
||||||
here.run_vectorized_op this "not" not Nothing
|
here.run_vectorized_op this "not" not Nothing
|
||||||
|
|
||||||
|
## Returns a column of booleans, with `True` items at the positions where
|
||||||
|
this column contains a `Nothing`.
|
||||||
|
is_missing : Column
|
||||||
|
is_missing = here.run_vectorized_op this "is_missing" (== Nothing) Nothing
|
||||||
|
|
||||||
## Applies `function` to each item in this column and returns the column
|
## Applies `function` to each item in this column and returns the column
|
||||||
of results.
|
of results.
|
||||||
map function =
|
map function =
|
||||||
storage = this.java_column.getStorage []
|
storage = this.java_column.getStorage []
|
||||||
|
index = this.java_column.getIndex []
|
||||||
new_st = storage.map [function]
|
new_st = storage.map [function]
|
||||||
col = Java_Column.new ["Result", new_st].to_array
|
col = Java_Column.new ["Result", index, new_st].to_array
|
||||||
Column col
|
Column col
|
||||||
|
|
||||||
## Returns a new column, containing the same elements as `this`, but with
|
## Returns a new column, containing the same elements as `this`, but with
|
||||||
@ -104,9 +115,10 @@ from_vector name items = Column (Java_Column.fromItems [name, items.to_array])
|
|||||||
## PRIVATE
|
## PRIVATE
|
||||||
run_vectorized_op column java_op_name fallback_method operand =
|
run_vectorized_op column java_op_name fallback_method operand =
|
||||||
storage = column.java_column.getStorage []
|
storage = column.java_column.getStorage []
|
||||||
|
ix = column.java_column.getIndex []
|
||||||
rs = if storage.isOpVectorized [java_op_name] then storage.runVectorizedOp [java_op_name, operand] else
|
rs = if storage.isOpVectorized [java_op_name] then storage.runVectorizedOp [java_op_name, operand] else
|
||||||
storage.map [fallback_method]
|
storage.map [fallback_method]
|
||||||
Column (Java_Column.new ["Result", rs].to_array)
|
Column (Java_Column.new ["Result", ix, rs].to_array)
|
||||||
|
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from Base import all
|
from Base import all
|
||||||
import Table.Io.Csv
|
import Table.Io.Csv
|
||||||
import Table.Data.Column
|
import Table.Data.Column
|
||||||
|
import Base.System.Platform
|
||||||
|
|
||||||
polyglot java import org.enso.table.data.table.Table as Java_Table
|
polyglot java import org.enso.table.data.table.Table as Java_Table
|
||||||
|
|
||||||
@ -12,17 +13,20 @@ type Table
|
|||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
- show_rows: the number of initial rows that should be displayed.
|
- show_rows: the number of initial rows that should be displayed.
|
||||||
display : Integer -> Text
|
- format_terminal: whether ANSI-terminal formatting should be used
|
||||||
display show_rows=10 =
|
display : Integer -> Boolean -> Text
|
||||||
|
display show_rows=10 format_terminal=False =
|
||||||
cols = Vector.Vector (this.java_table.getColumns [])
|
cols = Vector.Vector (this.java_table.getColumns [])
|
||||||
col_names = cols.map (_.getName [])
|
index = this.java_table.getIndex []
|
||||||
|
col_names = [index.getName[]] + cols.map (_.getName [])
|
||||||
col_vals = cols.map (_.getStorage [])
|
col_vals = cols.map (_.getStorage [])
|
||||||
num_rows = this.java_table.nrows []
|
num_rows = this.java_table.nrows []
|
||||||
display_rows = min num_rows show_rows
|
display_rows = min num_rows show_rows
|
||||||
rows = Vector.new display_rows row_num->
|
rows = Vector.new display_rows row_num->
|
||||||
col_vals.map col->
|
cols = col_vals.map col->
|
||||||
if col.isNa [row_num] then "NA" else Column.get_item_string col row_num
|
if col.isNa [row_num] then "Nothing" else Column.get_item_string col row_num
|
||||||
table = here.print_table col_names rows
|
[index.ilocString [row_num]] + cols
|
||||||
|
table = here.print_table col_names rows format_terminal
|
||||||
if num_rows - display_rows <= 0 then table else
|
if num_rows - display_rows <= 0 then table else
|
||||||
missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.'
|
missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.'
|
||||||
table + missing
|
table + missing
|
||||||
@ -32,7 +36,8 @@ type Table
|
|||||||
Arguments:
|
Arguments:
|
||||||
- show_rows: the number of initial rows that should be displayed.
|
- show_rows: the number of initial rows that should be displayed.
|
||||||
print show_rows=10 =
|
print show_rows=10 =
|
||||||
IO.println (this.display show_rows)
|
IO.println (this.display show_rows format_terminal=True)
|
||||||
|
IO.println ''
|
||||||
|
|
||||||
## Converts this table to a JSON structure.
|
## Converts this table to a JSON structure.
|
||||||
to_json : Json
|
to_json : Json
|
||||||
@ -67,6 +72,34 @@ type Table
|
|||||||
columns =
|
columns =
|
||||||
Vector.Vector (this.java_table.getColumns []) . map Column.Column
|
Vector.Vector (this.java_table.getColumns []) . map Column.Column
|
||||||
|
|
||||||
|
## Sets the index of this table, using the column with the provided name.
|
||||||
|
set_index : Text -> Table
|
||||||
|
set_index index =
|
||||||
|
Table (this.java_table.indexFromColumn [index])
|
||||||
|
|
||||||
|
## Selects a subset of columns from this table by name.
|
||||||
|
select columns = Table (this.java_table.selectColumns [columns.to_array])
|
||||||
|
|
||||||
|
## Efficiently joins two tables based on either the index or a key column.
|
||||||
|
|
||||||
|
The resulting table contains rows of `this` extended with rows of
|
||||||
|
`other` with matching indexes. If the index in `other` is not unique,
|
||||||
|
the corresponding rows of `this` will be duplicated in the result.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- other: the table being the right operand of this join operation.
|
||||||
|
- on: the column of `this` that should be used as the join key. If
|
||||||
|
this argument is not provided, the index of `this` will be used.
|
||||||
|
- drop_unmatched: whether the rows of `this` without corresponding
|
||||||
|
matches in `other` should be dropped from the result.
|
||||||
|
- left_suffix: a suffix that should be added to the columns of `this`
|
||||||
|
when there's a name conflict with a column of `other`.
|
||||||
|
- right_suffix: a suffix that should be added to the columns of `other`
|
||||||
|
when there's a name conflict with a column of `this`.
|
||||||
|
join : Table -> Text | Nothing -> Boolean -> Text -> Text -> Table
|
||||||
|
join other on=Nothing drop_unmatched=False left_suffix='_left' right_suffix='_right' =
|
||||||
|
Table (this.java_table.join [other.java_table, drop_unmatched, on, left_suffix, right_suffix])
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
from_columns cols = Table (Java_Table.new [cols.to_array].to_array)
|
from_columns cols = Table (Java_Table.new [cols.to_array].to_array)
|
||||||
|
|
||||||
@ -86,13 +119,22 @@ pad txt len =
|
|||||||
txt + (" ".repeat (len - true_len))
|
txt + (" ".repeat (len - true_len))
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
print_table header rows =
|
ansi_bold enabled txt = case Platform.os of
    ## Output formatting for Windows is not currently supported, so the
       text is returned untouched regardless of `enabled`.
    Platform.Windows -> txt
    _ ->
        if enabled then '\e[1m' + txt + '\e[m' else txt
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
print_table header rows format_term =
|
||||||
content_lengths = Vector.new header.length i->
|
content_lengths = Vector.new header.length i->
|
||||||
max_row = 0.up_to rows.length . fold 0 a-> j-> max a (rows.at j . at i . characters . length)
|
max_row = 0.up_to rows.length . fold 0 a-> j-> max a (rows.at j . at i . characters . length)
|
||||||
max max_row (header.at i . characters . length)
|
max max_row (header.at i . characters . length)
|
||||||
header_line = zip header content_lengths here.pad . join ' | '
|
header_line = zip header content_lengths here.pad . map (here.ansi_bold format_term) . join ' | '
|
||||||
divider = content_lengths . map (l -> "-".repeat l+2) . join '+'
|
divider = content_lengths . map (l -> "-".repeat l+2) . join '+'
|
||||||
row_lines = rows.map r->
|
row_lines = rows.map r->
|
||||||
x = zip r content_lengths here.pad . join ' | '
|
x = zip r content_lengths here.pad
|
||||||
" " + x
|
with_bold_ix = [here.ansi_bold format_term (x.at 0)] + x.drop_start 1
|
||||||
|
y = with_bold_ix . join ' | '
|
||||||
|
" " + y
|
||||||
([" " + header_line, divider] + row_lines).join '\n'
|
([" " + header_line, divider] + row_lines).join '\n'
|
||||||
|
@ -8,3 +8,25 @@ from Table.Io.Csv export all hiding Parser
|
|||||||
export Table.Data.Column
|
export Table.Data.Column
|
||||||
from Table.Data.Table export new
|
from Table.Data.Table export new
|
||||||
|
|
||||||
|
## Converts a JSON array into a dataframe, by looking up the requested keys
|
||||||
|
from each item.
|
||||||
|
|
||||||
|
It assumes the items are JSON objects containing the requested keys.
|
||||||
|
In case an item is not an object, or the request key does not exist, the
|
||||||
|
relevant values of the table will be set to `Nothing`.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- fields: a vector of texts representing the names of fields to look up.
|
||||||
|
Json.Array.to_table : Vector -> Table
|
||||||
|
Json.Array.to_table fields = case this of
|
||||||
|
Json.Array items ->
|
||||||
|
rows = items.map <| case _ of
|
||||||
|
Json.Object fs ->
|
||||||
|
fields.map n-> case fs.get n of
|
||||||
|
Nothing -> Nothing
|
||||||
|
js -> js.unwrap
|
||||||
|
_ -> Vector.fill fields.length Nothing
|
||||||
|
cols = fields.map_with_index i-> n->
|
||||||
|
[n, rows.map (_.at i)]
|
||||||
|
Table.new cols
|
||||||
|
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
package org.enso.table.data.column.storage;
|
package org.enso.table.data.column.storage;
|
||||||
|
|
||||||
|
import org.enso.table.data.index.Index;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
|
|
||||||
/**
|
/** A boolean column storage. */
|
||||||
* A boolean column storage.
|
|
||||||
*/
|
|
||||||
public class BoolStorage extends Storage {
|
public class BoolStorage extends Storage {
|
||||||
private final BitSet values;
|
private final BitSet values;
|
||||||
private final BitSet isMissing;
|
private final BitSet isMissing;
|
||||||
@ -19,7 +19,7 @@ public class BoolStorage extends Storage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long size() {
|
public int size() {
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -44,7 +44,7 @@ public class BoolStorage extends Storage {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isOpVectorized(String op) {
|
public boolean isOpVectorized(String op) {
|
||||||
return op.equals(Ops.EQ) || op.equals(Ops.NOT);
|
return op.equals(Ops.EQ) || op.equals(Ops.NOT) || op.equals(Ops.IS_MISSING);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -53,6 +53,8 @@ public class BoolStorage extends Storage {
|
|||||||
return runVectorizedEq(operand);
|
return runVectorizedEq(operand);
|
||||||
} else if (Ops.NOT.equals(name)) {
|
} else if (Ops.NOT.equals(name)) {
|
||||||
return new BoolStorage(values, isMissing, size, !negated);
|
return new BoolStorage(values, isMissing, size, !negated);
|
||||||
|
} else if (Ops.IS_MISSING.equals(name)) {
|
||||||
|
return new BoolStorage(isMissing, new BitSet(), size, false);
|
||||||
}
|
}
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
@ -98,6 +100,36 @@ public class BoolStorage extends Storage {
|
|||||||
return new BoolStorage(newValues, newMissing, cardinality, negated);
|
return new BoolStorage(newValues, newMissing, cardinality, negated);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Storage orderMask(int[] positions) {
|
||||||
|
BitSet newNa = new BitSet();
|
||||||
|
BitSet newVals = new BitSet();
|
||||||
|
for (int i = 0; i < positions.length; i++) {
|
||||||
|
if (positions[i] == Index.NOT_FOUND || isMissing.get(positions[i])) {
|
||||||
|
newNa.set(i);
|
||||||
|
} else if (values.get(positions[i])) {
|
||||||
|
values.set(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new BoolStorage(newVals, newNa, positions.length, negated);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Storage countMask(int[] counts, int total) {
|
||||||
|
BitSet newNa = new BitSet();
|
||||||
|
BitSet newVals = new BitSet();
|
||||||
|
int pos = 0;
|
||||||
|
for (int i = 0; i < counts.length; i++) {
|
||||||
|
if (isMissing.get(i)) {
|
||||||
|
newNa.set(pos, pos + counts[i]);
|
||||||
|
} else if (values.get(i)) {
|
||||||
|
newVals.set(pos, pos + counts[i]);
|
||||||
|
}
|
||||||
|
pos += counts[i];
|
||||||
|
}
|
||||||
|
return new BoolStorage(newVals, newNa, total, negated);
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isNegated() {
|
public boolean isNegated() {
|
||||||
return negated;
|
return negated;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package org.enso.table.data.column.storage;
|
package org.enso.table.data.column.storage;
|
||||||
|
|
||||||
|
import org.enso.table.data.index.Index;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
@ -24,7 +26,7 @@ public class DoubleStorage extends Storage {
|
|||||||
|
|
||||||
/** @inheritDoc */
|
/** @inheritDoc */
|
||||||
@Override
|
@Override
|
||||||
public long size() {
|
public int size() {
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -55,13 +57,15 @@ public class DoubleStorage extends Storage {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isOpVectorized(String op) {
|
public boolean isOpVectorized(String op) {
|
||||||
return op.equals("==");
|
return Ops.EQ.equals(op) || Ops.IS_MISSING.equals(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Storage runVectorizedOp(String name, Object operand) {
|
public Storage runVectorizedOp(String name, Object operand) {
|
||||||
if (name.equals("==")) {
|
if (name.equals(Ops.EQ)) {
|
||||||
return runVectorizedEq(operand);
|
return runVectorizedEq(operand);
|
||||||
|
} else if (name.equals(Ops.IS_MISSING)) {
|
||||||
|
return new BoolStorage(isMissing, new BitSet(), size, false);
|
||||||
}
|
}
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
@ -98,4 +102,36 @@ public class DoubleStorage extends Storage {
|
|||||||
}
|
}
|
||||||
return new DoubleStorage(newData, cardinality, newMissing);
|
return new DoubleStorage(newData, cardinality, newMissing);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Storage orderMask(int[] positions) {
|
||||||
|
long[] newData = new long[positions.length];
|
||||||
|
BitSet newMissing = new BitSet();
|
||||||
|
for (int i = 0; i < positions.length; i++) {
|
||||||
|
if (positions[i] == Index.NOT_FOUND || isMissing.get(positions[i])) {
|
||||||
|
newMissing.set(i);
|
||||||
|
} else {
|
||||||
|
newData[i] = data[positions[i]];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new DoubleStorage(newData, positions.length, newMissing);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Storage countMask(int[] counts, int total) {
|
||||||
|
long[] newData = new long[total];
|
||||||
|
BitSet newMissing = new BitSet();
|
||||||
|
int pos = 0;
|
||||||
|
for (int i = 0; i < counts.length; i++) {
|
||||||
|
if (isMissing.get(i)) {
|
||||||
|
newMissing.set(pos, pos + counts[i]);
|
||||||
|
pos += counts[i];
|
||||||
|
} else {
|
||||||
|
for (int j = 0; j < counts[i]; j++) {
|
||||||
|
newData[pos++] = data[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new DoubleStorage(newData, total, newMissing);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package org.enso.table.data.column.storage;
|
package org.enso.table.data.column.storage;
|
||||||
|
|
||||||
|
import org.enso.table.data.index.Index;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
|
|
||||||
/** A column storing 64-bit integers. */
|
/** A column storing 64-bit integers. */
|
||||||
@ -22,7 +24,7 @@ public class LongStorage extends Storage {
|
|||||||
|
|
||||||
/** @inheritDoc */
|
/** @inheritDoc */
|
||||||
@Override
|
@Override
|
||||||
public long size() {
|
public int size() {
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -53,13 +55,15 @@ public class LongStorage extends Storage {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isOpVectorized(String op) {
|
public boolean isOpVectorized(String op) {
|
||||||
return Ops.EQ.equals(op);
|
return Ops.EQ.equals(op) || Ops.IS_MISSING.equals(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Storage runVectorizedOp(String name, Object operand) {
|
public Storage runVectorizedOp(String name, Object operand) {
|
||||||
if (Ops.EQ.equals(name)) {
|
if (Ops.EQ.equals(name)) {
|
||||||
return runVectorizedEq(operand);
|
return runVectorizedEq(operand);
|
||||||
|
} else if (Ops.IS_MISSING.equals(name)) {
|
||||||
|
return new BoolStorage(isMissing, new BitSet(), size, false);
|
||||||
}
|
}
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
@ -94,4 +98,36 @@ public class LongStorage extends Storage {
|
|||||||
}
|
}
|
||||||
return new LongStorage(newData, cardinality, newMissing);
|
return new LongStorage(newData, cardinality, newMissing);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Storage orderMask(int[] positions) {
|
||||||
|
long[] newData = new long[positions.length];
|
||||||
|
BitSet newMissing = new BitSet();
|
||||||
|
for (int i = 0; i < positions.length; i++) {
|
||||||
|
if (positions[i] == Index.NOT_FOUND || isMissing.get(positions[i])) {
|
||||||
|
newMissing.set(i);
|
||||||
|
} else {
|
||||||
|
newData[i] = data[positions[i]];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new LongStorage(newData, positions.length, newMissing);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Storage countMask(int[] counts, int total) {
|
||||||
|
long[] newData = new long[total];
|
||||||
|
BitSet newMissing = new BitSet();
|
||||||
|
int pos = 0;
|
||||||
|
for (int i = 0; i < counts.length; i++) {
|
||||||
|
if (isMissing.get(i)) {
|
||||||
|
newMissing.set(pos, pos + counts[i]);
|
||||||
|
pos += counts[i];
|
||||||
|
} else {
|
||||||
|
for (int j = 0; j < counts[i]; j++) {
|
||||||
|
newData[pos++] = data[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new LongStorage(newData, total, newMissing);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@ package org.enso.table.data.column.storage;
|
|||||||
import org.enso.table.data.column.builder.object.BoolBuilder;
|
import org.enso.table.data.column.builder.object.BoolBuilder;
|
||||||
import org.enso.table.data.column.builder.object.Builder;
|
import org.enso.table.data.column.builder.object.Builder;
|
||||||
import org.enso.table.data.column.builder.object.InferredBuilder;
|
import org.enso.table.data.column.builder.object.InferredBuilder;
|
||||||
|
import org.enso.table.data.index.Index;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
@ -23,7 +24,7 @@ public class ObjectStorage extends Storage {
|
|||||||
|
|
||||||
/** @inheritDoc */
|
/** @inheritDoc */
|
||||||
@Override
|
@Override
|
||||||
public long size() {
|
public int size() {
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,14 +55,27 @@ public class ObjectStorage extends Storage {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isOpVectorized(String op) {
|
public boolean isOpVectorized(String op) {
|
||||||
return false;
|
return Ops.IS_MISSING.equals(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Storage runVectorizedOp(String name, Object operand) {
|
public Storage runVectorizedOp(String name, Object operand) {
|
||||||
|
if (Ops.IS_MISSING.equals(name)) {
|
||||||
|
return runIsMissing();
|
||||||
|
}
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private BoolStorage runIsMissing() {
|
||||||
|
BitSet vals = new BitSet();
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
if (data[i] == null) {
|
||||||
|
vals.set(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new BoolStorage(vals, new BitSet(), size, false);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ObjectStorage mask(BitSet mask, int cardinality) {
|
public ObjectStorage mask(BitSet mask, int cardinality) {
|
||||||
Object[] newData = new Object[cardinality];
|
Object[] newData = new Object[cardinality];
|
||||||
@ -74,7 +88,32 @@ public class ObjectStorage extends Storage {
|
|||||||
return new ObjectStorage(newData, cardinality);
|
return new ObjectStorage(newData, cardinality);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Object[] getData() {
|
@Override
|
||||||
|
public ObjectStorage orderMask(int[] positions) {
|
||||||
|
Object[] newData = new Object[positions.length];
|
||||||
|
for (int i = 0; i < positions.length; i++) {
|
||||||
|
if (positions[i] == Index.NOT_FOUND) {
|
||||||
|
newData[i] = null;
|
||||||
|
} else {
|
||||||
|
newData[i] = data[positions[i]];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new ObjectStorage(newData, positions.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ObjectStorage countMask(int[] counts, int total) {
|
||||||
|
Object[] newData = new Object[total];
|
||||||
|
int pos = 0;
|
||||||
|
for (int i = 0; i < counts.length; i++) {
|
||||||
|
for (int j = 0; j < counts[i]; j++) {
|
||||||
|
newData[pos++] = data[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new ObjectStorage(newData, total);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object[] getData() {
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,10 +6,12 @@ import org.enso.table.data.column.builder.object.InferredBuilder;
|
|||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
import org.graalvm.polyglot.Value;
|
||||||
|
|
||||||
/** An abstract representation of a data column. */
|
/** An abstract representation of a data column. */
|
||||||
public abstract class Storage {
|
public abstract class Storage {
|
||||||
/** @return the number of elements in this column (including NAs) */
|
/** @return the number of elements in this column (including NAs) */
|
||||||
public abstract long size();
|
public abstract int size();
|
||||||
|
|
||||||
/** @return the type tag of this column's storage. Must be one of {@link Type} */
|
/** @return the type tag of this column's storage. Must be one of {@link Type} */
|
||||||
public abstract long getType();
|
public abstract long getType();
|
||||||
@ -49,6 +51,7 @@ public abstract class Storage {
|
|||||||
public static final class Ops {
|
public static final class Ops {
|
||||||
public static final String EQ = "==";
|
public static final String EQ = "==";
|
||||||
public static final String NOT = "not";
|
public static final String NOT = "not";
|
||||||
|
public static final String IS_MISSING = "is_missing";
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -69,15 +72,6 @@ public abstract class Storage {
|
|||||||
*/
|
*/
|
||||||
public abstract Storage runVectorizedOp(String name, Object operand);
|
public abstract Storage runVectorizedOp(String name, Object operand);
|
||||||
|
|
||||||
/**
|
|
||||||
* Return a new storage, containing only the items marked true in the mask.
|
|
||||||
*
|
|
||||||
* @param mask the mask to use
|
|
||||||
* @param cardinality the number of true values in mask
|
|
||||||
* @return a new storage, masked with the given mask
|
|
||||||
*/
|
|
||||||
public abstract Storage mask(BitSet mask, int cardinality);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Runs a function on each non-missing element in this storage and gathers the results.
|
* Runs a function on each non-missing element in this storage and gathers the results.
|
||||||
*
|
*
|
||||||
@ -96,4 +90,38 @@ public abstract class Storage {
|
|||||||
}
|
}
|
||||||
return builder.seal();
|
return builder.seal();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a new storage, containing only the items marked true in the mask.
|
||||||
|
*
|
||||||
|
* @param mask the mask to use
|
||||||
|
* @param cardinality the number of true values in mask
|
||||||
|
* @return a new storage, masked with the given mask
|
||||||
|
*/
|
||||||
|
public abstract Storage mask(BitSet mask, int cardinality);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a new storage, ordered according to the rules specified in a mask. The resulting
|
||||||
|
* storage should contain the {@code positions[i]}-th element of the original storage at the i-th
|
||||||
|
* position. {@code positions[i]} may be equal to {@link
|
||||||
|
* org.enso.table.data.index.Index.NOT_FOUND}, in which case a missing value should be inserted at
|
||||||
|
* this position.
|
||||||
|
*
|
||||||
|
* @param positions an array specifying the ordering as described
|
||||||
|
* @return a storage resulting from applying the reordering rules
|
||||||
|
*/
|
||||||
|
public abstract Storage orderMask(int[] positions);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a new storage, resulting from applying the rules specified in a mask. The resulting
|
||||||
|
* storage should contain the elements of the original storage, in the same order. However, the
|
||||||
|
* number of consecutive copies of the i-th element of the original storage should be {@code
|
||||||
|
* counts[i]}.
|
||||||
|
*
|
||||||
|
* @param counts the mask specifying elements duplication
|
||||||
|
* @param total the sum of all elements in the mask, also interpreted as the size of the resulting
|
||||||
|
* storage
|
||||||
|
* @return the storage masked according to the specified rules
|
||||||
|
*/
|
||||||
|
public abstract Storage countMask(int[] counts, int total);
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package org.enso.table.data.column.storage;
|
package org.enso.table.data.column.storage;
|
||||||
|
|
||||||
|
import org.enso.table.data.index.Index;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
|
|
||||||
/** A column storing strings. */
|
/** A column storing strings. */
|
||||||
@ -29,7 +31,7 @@ public class StringStorage extends ObjectStorage {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isOpVectorized(String op) {
|
public boolean isOpVectorized(String op) {
|
||||||
return op.equals("==");
|
return op.equals("==") || super.isOpVectorized(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -37,7 +39,7 @@ public class StringStorage extends ObjectStorage {
|
|||||||
if (Ops.EQ.equals(name)) {
|
if (Ops.EQ.equals(name)) {
|
||||||
return runVectorizedEq(operand);
|
return runVectorizedEq(operand);
|
||||||
}
|
}
|
||||||
throw new UnsupportedOperationException();
|
return super.runVectorizedOp(name, operand);
|
||||||
}
|
}
|
||||||
|
|
||||||
public BoolStorage runVectorizedEq(Object that) {
|
public BoolStorage runVectorizedEq(Object that) {
|
||||||
@ -58,4 +60,16 @@ public class StringStorage extends ObjectStorage {
|
|||||||
ObjectStorage storage = super.mask(mask, cardinality);
|
ObjectStorage storage = super.mask(mask, cardinality);
|
||||||
return new StringStorage(storage.getData(), cardinality);
|
return new StringStorage(storage.getData(), cardinality);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public StringStorage orderMask(int[] positions) {
|
||||||
|
ObjectStorage storage = super.orderMask(positions);
|
||||||
|
return new StringStorage(storage.getData(), (int) storage.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public StringStorage countMask(int[] counts, int total) {
|
||||||
|
ObjectStorage storage = super.countMask(counts, total);
|
||||||
|
return new StringStorage(storage.getData(), total);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,53 @@
|
|||||||
|
package org.enso.table.data.index;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class DefaultIndex extends Index {
|
||||||
|
private final int size;
|
||||||
|
|
||||||
|
public DefaultIndex(int size) {
|
||||||
|
this.size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Integer iloc(int loc) {
|
||||||
|
return loc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Integer> loc(Object item) {
|
||||||
|
if (item instanceof Integer) {
|
||||||
|
if ((Integer) item < size) {
|
||||||
|
return Collections.singletonList((Integer) item);
|
||||||
|
}
|
||||||
|
} else if (item instanceof Long) {
|
||||||
|
long l = (Long) item;
|
||||||
|
if (l < size) {
|
||||||
|
return Collections.singletonList((int) l);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String ilocString(int loc) {
|
||||||
|
return String.valueOf(loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getName() {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Index mask(BitSet mask, int cardinality) {
|
||||||
|
return new DefaultIndex(cardinality);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Index countMask(int[] counts, int total) {
|
||||||
|
return new DefaultIndex(total);
|
||||||
|
}
|
||||||
|
}
|
92
table/src/main/java/org/enso/table/data/index/HashIndex.java
Normal file
92
table/src/main/java/org/enso/table/data/index/HashIndex.java
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
package org.enso.table.data.index;
|
||||||
|
|
||||||
|
import org.enso.table.data.column.storage.Storage;
|
||||||
|
import org.enso.table.data.column.storage.StringStorage;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
public class HashIndex extends Index {
|
||||||
|
private final Object[] items;
|
||||||
|
private final Map<Object, List<Integer>> locs;
|
||||||
|
private final String name;
|
||||||
|
private final int size;
|
||||||
|
|
||||||
|
private HashIndex(Object[] items, Map<Object, List<Integer>> locs, String name, int size) {
|
||||||
|
this.items = items;
|
||||||
|
this.locs = locs;
|
||||||
|
this.name = name;
|
||||||
|
this.size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
private HashIndex(String name, Object[] items, int size) {
|
||||||
|
Map<Object, List<Integer>> locations = new HashMap<>();
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
List<Integer> its = locations.computeIfAbsent(items[i], x -> new ArrayList<>());
|
||||||
|
its.add(i);
|
||||||
|
}
|
||||||
|
this.locs = locations;
|
||||||
|
this.items = items;
|
||||||
|
this.name = name;
|
||||||
|
this.size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static HashIndex fromStorage(String name, Storage storage) {
|
||||||
|
Object[] data = new Object[(int) storage.size()];
|
||||||
|
for (int i = 0; i < storage.size(); i++) {
|
||||||
|
data[i] = storage.getItemBoxed(i);
|
||||||
|
}
|
||||||
|
return new HashIndex(name, data, (int) storage.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object iloc(int i) {
|
||||||
|
return items[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Integer> loc(Object item) {
|
||||||
|
return locs.get(item);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String ilocString(int loc) {
|
||||||
|
return iloc(loc).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Index mask(BitSet mask, int cardinality) {
|
||||||
|
Map<Object, List<Integer>> newLocs = new HashMap<>();
|
||||||
|
for (Map.Entry<Object, List<Integer>> entry : locs.entrySet()) {
|
||||||
|
List<Integer> newIxes =
|
||||||
|
entry.getValue().stream().filter(mask::get).collect(Collectors.toList());
|
||||||
|
if (!newIxes.isEmpty()) {
|
||||||
|
newLocs.put(entry.getKey(), newIxes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Object[] newItems = new Object[cardinality];
|
||||||
|
int j = 0;
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
if (mask.get(i)) {
|
||||||
|
newItems[j++] = items[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new HashIndex(newItems, newLocs, name, cardinality);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Index countMask(int[] counts, int total) {
|
||||||
|
Object[] newItems = new Object[total];
|
||||||
|
int pos = 0;
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
for (int j = 0; j < counts[i]; j++) {
|
||||||
|
newItems[pos++] = items[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new HashIndex(name, newItems, total);
|
||||||
|
}
|
||||||
|
}
|
58
table/src/main/java/org/enso/table/data/index/Index.java
Normal file
58
table/src/main/java/org/enso/table/data/index/Index.java
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
package org.enso.table.data.index;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** A storage class for ordered multisets, addressable both by position and by item. */
public abstract class Index {
  /** The position value used to signal that no matching position exists. */
  public static final int NOT_FOUND = -1;

  /**
   * Looks up the element stored at a 0-based position.
   *
   * @param loc the position
   * @return the element at that position
   */
  public abstract Object iloc(int loc);

  /**
   * Renders the element stored at a position as a string.
   *
   * @param loc the position
   * @return a string representing the element at the given position
   */
  public abstract String ilocString(int loc);

  /**
   * Finds every position at which the given object occurs.
   *
   * @param item the item to look up
   * @return the list of all positions containing {@code item}; may be {@code null} when the item
   *     is not found
   */
  public abstract List<Integer> loc(Object item);

  /** @return the name of this index */
  public abstract String getName();

  /**
   * Builds a new index that keeps only the items whose positions are marked true in the mask.
   *
   * @param mask the mask to use
   * @param cardinality the number of true values in mask
   * @return a new index, masked with the given mask
   */
  public abstract Index mask(BitSet mask, int cardinality);

  /**
   * Builds a new index by repeating the items of this index. The items keep their original
   * order, and the i-th item of this index occurs {@code counts[i]} times in a row in the result.
   *
   * @param counts the mask specifying elements duplication
   * @param total the sum of all elements in the mask, also interpreted as the size of the resulting
   *     index
   * @return the index masked according to the specified rules
   */
  public abstract Index countMask(int[] counts, int total);
}
|
@ -2,6 +2,8 @@ package org.enso.table.data.table;
|
|||||||
|
|
||||||
import org.enso.table.data.column.builder.object.InferredBuilder;
|
import org.enso.table.data.column.builder.object.InferredBuilder;
|
||||||
import org.enso.table.data.column.storage.Storage;
|
import org.enso.table.data.column.storage.Storage;
|
||||||
|
import org.enso.table.data.index.Index;
|
||||||
|
import org.enso.table.data.index.DefaultIndex;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -10,6 +12,7 @@ import java.util.List;
|
|||||||
public class Column {
|
public class Column {
|
||||||
private final String name;
|
private final String name;
|
||||||
private final Storage storage;
|
private final Storage storage;
|
||||||
|
private final Index index;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new column.
|
* Creates a new column.
|
||||||
@ -17,9 +20,10 @@ public class Column {
|
|||||||
* @param name the column name
|
* @param name the column name
|
||||||
* @param storage the underlying storage
|
* @param storage the underlying storage
|
||||||
*/
|
*/
|
||||||
public Column(String name, Storage storage) {
|
public Column(String name, Index index, Storage storage) {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
this.storage = storage;
|
this.storage = storage;
|
||||||
|
this.index = index;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @return the column name */
|
/** @return the column name */
|
||||||
@ -44,8 +48,8 @@ public class Column {
|
|||||||
* @param cardinality the number of true values in mask
|
* @param cardinality the number of true values in mask
|
||||||
* @return a new column, masked with the given mask
|
* @return a new column, masked with the given mask
|
||||||
*/
|
*/
|
||||||
public Column mask(BitSet mask, int cardinality) {
|
public Column mask(Index maskedIndex, BitSet mask, int cardinality) {
|
||||||
return new Column(name, storage.mask(mask, cardinality));
|
return new Column(name, maskedIndex, storage.mask(mask, cardinality));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -55,7 +59,7 @@ public class Column {
|
|||||||
* @return a new column with the given name
|
* @return a new column with the given name
|
||||||
*/
|
*/
|
||||||
public Column rename(String name) {
|
public Column rename(String name) {
|
||||||
return new Column(name, storage);
|
return new Column(name, index, storage);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -70,6 +74,21 @@ public class Column {
|
|||||||
for (Object item : items) {
|
for (Object item : items) {
|
||||||
builder.append(item);
|
builder.append(item);
|
||||||
}
|
}
|
||||||
return new Column(name, builder.seal());
|
return new Column(name, new DefaultIndex(items.size()), builder.seal());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Changes the index of this column.
|
||||||
|
*
|
||||||
|
* @param ix the index to use
|
||||||
|
* @return a column indexed by {@code ix}
|
||||||
|
*/
|
||||||
|
public Column withIndex(Index ix) {
|
||||||
|
return new Column(name, ix, storage);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @return the index of this column */
|
||||||
|
public Index getIndex() {
|
||||||
|
return index;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,14 +1,21 @@
|
|||||||
package org.enso.table.data.table;
|
package org.enso.table.data.table;
|
||||||
|
|
||||||
import org.enso.table.data.column.storage.BoolStorage;
|
import org.enso.table.data.column.storage.BoolStorage;
|
||||||
|
import org.enso.table.data.column.storage.Storage;
|
||||||
|
import org.enso.table.data.index.Index;
|
||||||
|
import org.enso.table.data.index.DefaultIndex;
|
||||||
|
import org.enso.table.data.index.HashIndex;
|
||||||
|
import org.enso.table.error.NoSuchColumnException;
|
||||||
import org.enso.table.error.UnexpectedColumnTypeException;
|
import org.enso.table.error.UnexpectedColumnTypeException;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/** A representation of a table structure. */
|
/** A representation of a table structure. */
|
||||||
public class Table {
|
public class Table {
|
||||||
|
|
||||||
private final Column[] columns;
|
private final Column[] columns;
|
||||||
|
private final Index index;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new table
|
* Creates a new table
|
||||||
@ -16,7 +23,15 @@ public class Table {
|
|||||||
* @param columns the columns contained in this table.
|
* @param columns the columns contained in this table.
|
||||||
*/
|
*/
|
||||||
public Table(Column[] columns) {
|
public Table(Column[] columns) {
|
||||||
|
this(
|
||||||
|
columns,
|
||||||
|
new DefaultIndex(
|
||||||
|
(columns == null || columns.length == 0) ? 0 : (int) columns[0].getSize()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Table(Column[] columns, Index index) {
|
||||||
this.columns = columns;
|
this.columns = columns;
|
||||||
|
this.index = index;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @return the number of rows in this table */
|
/** @return the number of rows in this table */
|
||||||
@ -68,10 +83,11 @@ public class Table {
|
|||||||
mask.andNot(storage.getIsMissing());
|
mask.andNot(storage.getIsMissing());
|
||||||
int cardinality = mask.cardinality();
|
int cardinality = mask.cardinality();
|
||||||
Column[] newColumns = new Column[columns.length];
|
Column[] newColumns = new Column[columns.length];
|
||||||
|
Index newIx = index.mask(mask, cardinality);
|
||||||
for (int i = 0; i < columns.length; i++) {
|
for (int i = 0; i < columns.length; i++) {
|
||||||
newColumns[i] = columns[i].mask(mask, cardinality);
|
newColumns[i] = columns[i].mask(newIx, mask, cardinality);
|
||||||
}
|
}
|
||||||
return new Table(newColumns);
|
return new Table(newColumns, newIx);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -99,13 +115,136 @@ public class Table {
|
|||||||
Column[] newCols = new Column[columns.length];
|
Column[] newCols = new Column[columns.length];
|
||||||
System.arraycopy(columns, 0, newCols, 0, columns.length);
|
System.arraycopy(columns, 0, newCols, 0, columns.length);
|
||||||
newCols[ix] = newCol;
|
newCols[ix] = newCol;
|
||||||
return new Table(newCols);
|
return new Table(newCols, index);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Table addColumn(Column newColumn) {
|
private Table addColumn(Column newColumn) {
|
||||||
Column[] newCols = new Column[columns.length + 1];
|
Column[] newCols = new Column[columns.length + 1];
|
||||||
System.arraycopy(columns, 0, newCols, 0, columns.length);
|
System.arraycopy(columns, 0, newCols, 0, columns.length);
|
||||||
newCols[columns.length] = newColumn;
|
newCols[columns.length] = newColumn;
|
||||||
return new Table(newCols);
|
return new Table(newCols, index);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the index of this table.
|
||||||
|
*
|
||||||
|
* @return the index of this table
|
||||||
|
*/
|
||||||
|
public Index getIndex() {
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reindexes this table by using values from the column with the given name.
|
||||||
|
*
|
||||||
|
* @param name the column name to use as index
|
||||||
|
* @return a table indexed by the proper column
|
||||||
|
*/
|
||||||
|
public Table indexFromColumn(String name) {
|
||||||
|
Column col = getColumnByName(name);
|
||||||
|
if (col == null) throw new NoSuchColumnException(name);
|
||||||
|
Storage storage = col.getStorage();
|
||||||
|
Index ix = HashIndex.fromStorage(col.getName(), storage);
|
||||||
|
Column[] newColumns = new Column[columns.length - 1];
|
||||||
|
int j = 0;
|
||||||
|
for (int i = 0; i < columns.length; i++) {
|
||||||
|
if (!columns[i].getName().equals(name)) {
|
||||||
|
newColumns[j++] = columns[i].withIndex(ix);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new Table(newColumns, ix);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selects a subset of columns of this table, by names.
|
||||||
|
*
|
||||||
|
* @param colNames the column names to select
|
||||||
|
* @return a table containing only selected columns
|
||||||
|
*/
|
||||||
|
public Table selectColumns(List<String> colNames) {
|
||||||
|
Column[] newCols =
|
||||||
|
colNames.stream()
|
||||||
|
.map(this::getColumnByName)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.toArray(Column[]::new);
|
||||||
|
return new Table(newCols, index);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Joins this table with another, by combining rows from this with rows of other with a matching
|
||||||
|
* index.
|
||||||
|
*
|
||||||
|
* @param other the table being joined with
|
||||||
|
* @param dropUnmatched whether the rows containing unmatched values in this should be dropped
|
||||||
|
* @param on a column name in this that should be used as the join key. If this is null, index is
|
||||||
|
* used instead
|
||||||
|
* @param lsuffix the suffix to add to names of columns of this in case there's a name conflict
|
||||||
|
* @param rsuffix the suffix to add to names of columns of other in case there's a name conflict
|
||||||
|
* @return the result of performing the join
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public Table join(Table other, boolean dropUnmatched, String on, String lsuffix, String rsuffix) {
|
||||||
|
int s = (int) nrows();
|
||||||
|
List<Integer>[] matches = new List[s];
|
||||||
|
if (on == null) {
|
||||||
|
for (int i = 0; i < s; i++) {
|
||||||
|
matches[i] = other.index.loc(index.iloc(i));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Storage onS = getColumnByName(on).getStorage();
|
||||||
|
for (int i = 0; i < s; i++) {
|
||||||
|
matches[i] = other.index.loc(onS.getItemBoxed(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int outSize = 0;
|
||||||
|
int[] countMask = new int[s];
|
||||||
|
for (int i = 0; i < s; i++) {
|
||||||
|
if (matches[i] == null) {
|
||||||
|
countMask[i] = dropUnmatched ? 0 : 1;
|
||||||
|
} else {
|
||||||
|
countMask[i] = matches[i].size();
|
||||||
|
}
|
||||||
|
outSize += countMask[i];
|
||||||
|
}
|
||||||
|
int[] orderMask = new int[outSize];
|
||||||
|
int orderMaskPosition = 0;
|
||||||
|
for (int i = 0; i < s; i++) {
|
||||||
|
if (matches[i] == null) {
|
||||||
|
if (!dropUnmatched) {
|
||||||
|
orderMask[orderMaskPosition++] = Index.NOT_FOUND;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (Integer x : matches[i]) {
|
||||||
|
orderMask[orderMaskPosition++] = x;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Column[] newColumns = new Column[this.columns.length + other.columns.length];
|
||||||
|
Index newIndex = index.countMask(countMask, outSize);
|
||||||
|
Set<String> lnames =
|
||||||
|
Arrays.stream(this.columns).map(Column::getName).collect(Collectors.toSet());
|
||||||
|
Set<String> rnames =
|
||||||
|
Arrays.stream(other.columns).map(Column::getName).collect(Collectors.toSet());
|
||||||
|
for (int i = 0; i < columns.length; i++) {
|
||||||
|
Column original = columns[i];
|
||||||
|
newColumns[i] =
|
||||||
|
new Column(
|
||||||
|
suffixIfNecessary(rnames, original.getName(), lsuffix),
|
||||||
|
newIndex,
|
||||||
|
original.getStorage().countMask(countMask, outSize));
|
||||||
|
}
|
||||||
|
for (int i = 0; i < other.columns.length; i++) {
|
||||||
|
Column original = other.columns[i];
|
||||||
|
newColumns[i + columns.length] =
|
||||||
|
new Column(
|
||||||
|
suffixIfNecessary(lnames, original.getName(), rsuffix),
|
||||||
|
newIndex,
|
||||||
|
original.getStorage().orderMask(orderMask));
|
||||||
|
}
|
||||||
|
return new Table(newColumns, newIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String suffixIfNecessary(Set<String> names, String name, String suffix) {
|
||||||
|
return names.contains(name) ? name + suffix : name;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,16 @@
|
|||||||
|
package org.enso.table.error;
|
||||||
|
|
||||||
|
/** Signals that a column lookup used a name that matches no column. */
public class NoSuchColumnException extends RuntimeException {
  private final String name;

  /**
   * Creates a new instance of this error, with a message mentioning the offending name.
   *
   * @param name the column name that could not be found
   */
  public NoSuchColumnException(String name) {
    super(String.format("The column with name %s does not exist.", name));
    this.name = name;
  }
}
|
@ -5,6 +5,7 @@ import com.univocity.parsers.csv.CsvParserSettings;
|
|||||||
import org.enso.table.data.column.storage.Storage;
|
import org.enso.table.data.column.storage.Storage;
|
||||||
import org.enso.table.data.column.builder.string.StorageBuilder;
|
import org.enso.table.data.column.builder.string.StorageBuilder;
|
||||||
import org.enso.table.data.column.builder.string.PrimInferredStorageBuilder;
|
import org.enso.table.data.column.builder.string.PrimInferredStorageBuilder;
|
||||||
|
import org.enso.table.data.index.DefaultIndex;
|
||||||
import org.enso.table.data.table.Column;
|
import org.enso.table.data.table.Column;
|
||||||
import org.enso.table.data.table.Table;
|
import org.enso.table.data.table.Table;
|
||||||
|
|
||||||
@ -68,7 +69,7 @@ public class Parser {
|
|||||||
for (int i = 0; i < builders.length; i++) {
|
for (int i = 0; i < builders.length; i++) {
|
||||||
String name = header != null ? header[i] : unnamedColumnPrefix + i;
|
String name = header != null ? header[i] : unnamedColumnPrefix + i;
|
||||||
Storage col = builders[i].seal();
|
Storage col = builders[i].seal();
|
||||||
columns[i] = new Column(name, col);
|
columns[i] = new Column(name, new DefaultIndex(col.size()), col);
|
||||||
}
|
}
|
||||||
return new Table(columns);
|
return new Table(columns);
|
||||||
}
|
}
|
||||||
|
@ -65,7 +65,18 @@ spec =
|
|||||||
|
|
||||||
t.to_json.should_equal expected
|
t.to_json.should_equal expected
|
||||||
|
|
||||||
describe "Mapping operations" <|
|
describe "JSON construction" <|
|
||||||
|
it "should allow converting a JSON array into a table" <|
|
||||||
|
r_1 = Json.from_pairs [['foo', 20], ['bar', 'baz'], ['baz', False]]
|
||||||
|
r_2 = Json.from_pairs [['bar', 'xyz'], ['baz', True]]
|
||||||
|
r_3 = Json.from_pairs [['baz', False], ['foo', 13]]
|
||||||
|
t = [r_1, r_2, r_3].to_json.to_table ['foo', 'bar', 'baz']
|
||||||
|
t.columns.map name . should_equal ['foo', 'bar', 'baz']
|
||||||
|
t.at 'foo' . to_vector . should_equal [20, Nothing, 13]
|
||||||
|
t.at 'bar' . to_vector . should_equal ['baz', 'xyz', Nothing]
|
||||||
|
t.at 'baz' . to_vector . should_equal [False, True, False]
|
||||||
|
|
||||||
|
describe "Mapping Operations" <|
|
||||||
it "should allow mapping a function over a column" <|
|
it "should allow mapping a function over a column" <|
|
||||||
c_str = Column.from_vector 'x' ['a', 'b', Nothing, 'b']
|
c_str = Column.from_vector 'x' ['a', 'b', Nothing, 'b']
|
||||||
c_str.map (+ "x") . to_vector . should_equal ['ax', 'bx', Nothing, 'bx']
|
c_str.map (+ "x") . to_vector . should_equal ['ax', 'bx', Nothing, 'bx']
|
||||||
@ -90,7 +101,7 @@ spec =
|
|||||||
c_any = Column.from_vector 'x' [My 1 6, My 6 3, My 2 5, My 3 4, My 200 300]
|
c_any = Column.from_vector 'x' [My 1 6, My 6 3, My 2 5, My 3 4, My 200 300]
|
||||||
(c_any == My 7 0).to_vector.should_equal [True, False, True, True, False]
|
(c_any == My 7 0).to_vector.should_equal [True, False, True, True, False]
|
||||||
|
|
||||||
describe "Masking tables" <|
|
describe "Masking Tables" <|
|
||||||
it "should allow selecting table rows based on a boolean column" <|
|
it "should allow selecting table rows based on a boolean column" <|
|
||||||
df = (Enso_Project.data / "simple_empty.csv").read_csv
|
df = (Enso_Project.data / "simple_empty.csv").read_csv
|
||||||
r = df.where (Column.from_vector 'x' [True, False, False, True])
|
r = df.where (Column.from_vector 'x' [True, False, False, True])
|
||||||
@ -103,3 +114,47 @@ spec =
|
|||||||
r.at "a" . to_vector . should_equal [4]
|
r.at "a" . to_vector . should_equal [4]
|
||||||
r.at "b" . to_vector . should_equal [Nothing]
|
r.at "b" . to_vector . should_equal [Nothing]
|
||||||
r.at "c" . to_vector . should_equal [6]
|
r.at "c" . to_vector . should_equal [6]
|
||||||
|
|
||||||
|
describe "Joining Tables" <|
|
||||||
|
a_0 = ['x', [0, 1, 7, 3, 6]]
|
||||||
|
a_1 = ['y', ["foo", "bar", "baz", "spam", "eggs"]]
|
||||||
|
a = Table.new [a_0, a_1]
|
||||||
|
b_0 = ['w', [6, 3, 5, 5, 3, 3]]
|
||||||
|
b_1 = ['z', ["foo", "foo", "bar", "spam", "bar", "eggs"]]
|
||||||
|
b = Table.new [b_0, b_1]
|
||||||
|
|
||||||
|
it "should allow joining tables index-on-index" <|
|
||||||
|
r_1 = a.set_index 'x' . join (b.set_index 'w')
|
||||||
|
r_1.at 'y' . to_vector . should_equal ['foo', 'bar', 'baz', 'spam', 'spam', 'spam', 'eggs']
|
||||||
|
r_1.at 'z' . to_vector . should_equal [Nothing, Nothing, Nothing, 'foo', 'bar', 'eggs', 'foo']
|
||||||
|
r_2 = a.set_index 'y' . join (b.set_index 'z') drop_unmatched=True
|
||||||
|
r_2.at 'x' . to_vector . should_equal [0, 0, 1, 1, 3, 6]
|
||||||
|
r_2.at 'w' . to_vector . should_equal [6, 3, 5, 3, 5, 3]
|
||||||
|
|
||||||
|
it "should allow joining tables column-on-index" <|
|
||||||
|
r_1 = a.join (b.set_index 'w') on='x'
|
||||||
|
r_1.at 'y' . to_vector . should_equal ['foo', 'bar', 'baz', 'spam', 'spam', 'spam', 'eggs']
|
||||||
|
r_1.at 'z' . to_vector . should_equal [Nothing, Nothing, Nothing, 'foo', 'bar', 'eggs', 'foo']
|
||||||
|
r_2 = a.join (b.set_index 'z') drop_unmatched=True on='y'
|
||||||
|
r_2.at 'x' . to_vector . should_equal [0, 0, 1, 1, 3, 6]
|
||||||
|
r_2.at 'w' . to_vector . should_equal [6, 3, 5, 3, 5, 3]
|
||||||
|
|
||||||
|
it "should allow joining tables on the default index" <|
|
||||||
|
x = Table.new [['x', [1, 2, 4, 6]]]
|
||||||
|
y = Table.new [['y', [8, 9]]]
|
||||||
|
r_1 = x.join y
|
||||||
|
r_1.at 'x' . to_vector . should_equal [1, 2, 4, 6]
|
||||||
|
r_1.at 'y' . to_vector . should_equal [8, 9, Nothing, Nothing]
|
||||||
|
|
||||||
|
r_2 = x.join y drop_unmatched=True
|
||||||
|
r_2.at 'x' . to_vector . should_equal [1, 2]
|
||||||
|
r_2.at 'y' . to_vector . should_equal [8, 9]
|
||||||
|
|
||||||
|
it "should append suffixes to disambiguate column names" <|
|
||||||
|
x = Table.new [['x', [1, 2, 4, 6]]]
|
||||||
|
y = Table.new [['x', [8, 9]]]
|
||||||
|
r_1 = x.join y
|
||||||
|
r_1.columns.map name . should_equal ['x_left', 'x_right']
|
||||||
|
|
||||||
|
r_2 = x.join y left_suffix='_old' right_suffix='_new'
|
||||||
|
r_2.columns.map name . should_equal ['x_old', 'x_new']
|
||||||
|
Loading…
Reference in New Issue
Block a user