Table: Indexes & Joins (#1317)

This commit is contained in:
Marcin Kostrzewa 2020-11-30 16:21:55 +01:00 committed by GitHub
parent 07190a729c
commit a40989e7c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 773 additions and 56 deletions

View File

@ -66,11 +66,26 @@ type Json
to_text : Text to_text : Text
to_text = Internal.render_helper this to_text = Internal.render_helper this
## Recursively unwraps the JSON value into primitive values.
   Arrays and objects are unwrapped element by element through `map`,
   `Null` becomes `Nothing`, and booleans, numbers and strings are replaced
   by the value they wrap.
unwrap : Any
unwrap = case this of
Json.Array its -> its.map unwrap
Json.Boolean b -> b
Json.Number n -> n
Json.String t -> t
Json.Null -> Nothing
Json.Object f -> f.map unwrap
## A failure indicating malformed text input into the JSON parser. ## A failure indicating malformed text input into the JSON parser.
Check the `message` field for detailed information on the specific failure. Check the `message` field for detailed information on the specific failure.
type Parse_Error message type Parse_Error message
## Gets the value associated with the given key in this object. Returns
   `Nothing` if the associated key is not defined.
   Arguments:
   - field: the key to look up among the fields of this object.
Object.get : Text -> Json | Nothing
Object.get field = this.fields.get field
## Parses an RFC-8259 compliant JSON text into a `Json` structure. ## Parses an RFC-8259 compliant JSON text into a `Json` structure.
parse : Text -> Json ! Parse_Error parse : Text -> Json ! Parse_Error
parse json_text = parse json_text =

View File

@ -200,6 +200,14 @@ type Vector
0.up_to arr.length . each ix-> new_arr.set_at ix (function (arr.at ix)) 0.up_to arr.length . each ix-> new_arr.set_at ix (function (arr.at ix))
Vector new_arr Vector new_arr
## Applies a function to each element of the vector, returning the vector
   of results.
   The function is called with both the element index as well as the
   element itself.
   Arguments:
   - function: called as `function index element` for every position of
     this vector; its result becomes the element at that position of the
     returned vector.
map_with_index : (Int -> Any -> Any) -> Vector
map_with_index function = here.new this.length i-> function i (this.at i)
## Applies a function to each element of the vector. ## Applies a function to each element of the vector.
Unlike `map`, this method does not return the individual results, Unlike `map`, this method does not return the individual results,

View File

@ -10,16 +10,20 @@ type Column
Arguments: Arguments:
- show_rows: the number of initial rows that should be displayed. - show_rows: the number of initial rows that should be displayed.
display : Integer -> Text - format_terminal: whether ANSI-terminal formatting should be used
display show_rows=10 = display : Integer -> Boolean -> Text
display show_rows=10 format_terminal=False =
java_col = this.java_column java_col = this.java_column
index = java_col.getIndex []
col_name = java_col.getName [] col_name = java_col.getName []
storage = java_col.getStorage [] storage = java_col.getStorage []
num_rows = java_col.getSize [] num_rows = java_col.getSize []
display_rows = min num_rows show_rows display_rows = min num_rows show_rows
items = Vector.new display_rows num-> items = Vector.new display_rows num->
[if storage.isNa [num] then "NA" else here.get_item_string storage num] row = if storage.isNa [num] then "Nothing" else
table = Table.print_table [col_name] items here.get_item_string storage num
[index.ilocString [num], row]
table = Table.print_table [index.getName [], col_name] items format_terminal
if num_rows - display_rows <= 0 then table else if num_rows - display_rows <= 0 then table else
missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.' missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.'
table + missing table + missing
@ -29,7 +33,8 @@ type Column
Arguments: Arguments:
- show_rows: the number of initial rows that should be displayed. - show_rows: the number of initial rows that should be displayed.
print show_rows=10 = print show_rows=10 =
IO.println (this.display show_rows) IO.println (this.display show_rows format_terminal=True)
IO.println ''
## Element-wise equality comparison. Returns a column with results of ## Element-wise equality comparison. Returns a column with results of
comparing this column's elements against `other`. comparing this column's elements against `other`.
@ -47,12 +52,18 @@ type Column
not = not =
here.run_vectorized_op this "not" not Nothing here.run_vectorized_op this "not" not Nothing
## Returns a column of booleans, with `True` items at the positions where
   this column contains a `Nothing`.
   The work is delegated to `run_vectorized_op` with the `"is_missing"`
   operation name, passing a comparison against `Nothing` as the fallback
   method.
is_missing : Column
is_missing = here.run_vectorized_op this "is_missing" (== Nothing) Nothing
## Applies `function` to each item in this column and returns the column ## Applies `function` to each item in this column and returns the column
of results. of results.
map function = map function =
storage = this.java_column.getStorage [] storage = this.java_column.getStorage []
index = this.java_column.getIndex []
new_st = storage.map [function] new_st = storage.map [function]
col = Java_Column.new ["Result", new_st].to_array col = Java_Column.new ["Result", index, new_st].to_array
Column col Column col
## Returns a new column, containing the same elements as `this`, but with ## Returns a new column, containing the same elements as `this`, but with
@ -104,9 +115,10 @@ from_vector name items = Column (Java_Column.fromItems [name, items.to_array])
## PRIVATE ## PRIVATE
run_vectorized_op column java_op_name fallback_method operand = run_vectorized_op column java_op_name fallback_method operand =
storage = column.java_column.getStorage [] storage = column.java_column.getStorage []
ix = column.java_column.getIndex []
rs = if storage.isOpVectorized [java_op_name] then storage.runVectorizedOp [java_op_name, operand] else rs = if storage.isOpVectorized [java_op_name] then storage.runVectorizedOp [java_op_name, operand] else
storage.map [fallback_method] storage.map [fallback_method]
Column (Java_Column.new ["Result", rs].to_array) Column (Java_Column.new ["Result", ix, rs].to_array)
## PRIVATE ## PRIVATE

View File

@ -1,6 +1,7 @@
from Base import all from Base import all
import Table.Io.Csv import Table.Io.Csv
import Table.Data.Column import Table.Data.Column
import Base.System.Platform
polyglot java import org.enso.table.data.table.Table as Java_Table polyglot java import org.enso.table.data.table.Table as Java_Table
@ -12,17 +13,20 @@ type Table
Arguments: Arguments:
- show_rows: the number of initial rows that should be displayed. - show_rows: the number of initial rows that should be displayed.
display : Integer -> Text - format_terminal: whether ANSI-terminal formatting should be used
display show_rows=10 = display : Integer -> Boolean -> Text
display show_rows=10 format_terminal=False =
cols = Vector.Vector (this.java_table.getColumns []) cols = Vector.Vector (this.java_table.getColumns [])
col_names = cols.map (_.getName []) index = this.java_table.getIndex []
col_names = [index.getName[]] + cols.map (_.getName [])
col_vals = cols.map (_.getStorage []) col_vals = cols.map (_.getStorage [])
num_rows = this.java_table.nrows [] num_rows = this.java_table.nrows []
display_rows = min num_rows show_rows display_rows = min num_rows show_rows
rows = Vector.new display_rows row_num-> rows = Vector.new display_rows row_num->
col_vals.map col-> cols = col_vals.map col->
if col.isNa [row_num] then "NA" else Column.get_item_string col row_num if col.isNa [row_num] then "Nothing" else Column.get_item_string col row_num
table = here.print_table col_names rows [index.ilocString [row_num]] + cols
table = here.print_table col_names rows format_terminal
if num_rows - display_rows <= 0 then table else if num_rows - display_rows <= 0 then table else
missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.' missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.'
table + missing table + missing
@ -32,7 +36,8 @@ type Table
Arguments: Arguments:
- show_rows: the number of initial rows that should be displayed. - show_rows: the number of initial rows that should be displayed.
print show_rows=10 = print show_rows=10 =
IO.println (this.display show_rows) IO.println (this.display show_rows format_terminal=True)
IO.println ''
## Converts this table to a JSON structure. ## Converts this table to a JSON structure.
to_json : Json to_json : Json
@ -67,6 +72,34 @@ type Table
columns = columns =
Vector.Vector (this.java_table.getColumns []) . map Column.Column Vector.Vector (this.java_table.getColumns []) . map Column.Column
## Sets the index of this table, using the column with the provided name.
   The result is a new `Table` wrapping whatever the underlying Java
   table's `indexFromColumn` returns.
   Arguments:
   - index: the name of the column that should become the index.
set_index : Text -> Table
set_index index =
Table (this.java_table.indexFromColumn [index])
## Selects a subset of columns from this table by name.
   Arguments:
   - columns: the names of the columns to keep. It is converted with
     `to_array` before being handed to the Java table, so it is presumably
     a vector of texts (to be confirmed against callers).
select columns = Table (this.java_table.selectColumns [columns.to_array])
## Efficiently joins two tables based on either the index or a key column.
   The resulting table contains rows of `this` extended with rows of
   `other` with matching indexes. If the index in `other` is not unique,
   the corresponding rows of `this` will be duplicated in the result.
   Arguments:
   - other: the table being the right operand of this join operation.
   - on: the column of `this` that should be used as the join key. If
     this argument is not provided (it defaults to `Nothing`), the index
     of `this` will be used.
   - drop_unmatched: whether the rows of `this` without corresponding
     matches in `other` should be dropped from the result. Defaults to
     `False`.
   - left_suffix: a suffix that should be added to the columns of `this`
     when there's a name conflict with a column of `other`. Defaults to
     `'_left'`.
   - right_suffix: a suffix that should be added to the columns of `other`
     when there's a name conflict with a column of `this`. Defaults to
     `'_right'`.
join : Table -> Text | Nothing -> Boolean -> Text -> Text -> Table
join other on=Nothing drop_unmatched=False left_suffix='_left' right_suffix='_right' =
Table (this.java_table.join [other.java_table, drop_unmatched, on, left_suffix, right_suffix])
## PRIVATE ## PRIVATE
from_columns cols = Table (Java_Table.new [cols.to_array].to_array) from_columns cols = Table (Java_Table.new [cols.to_array].to_array)
@ -86,13 +119,22 @@ pad txt len =
txt + (" ".repeat (len - true_len)) txt + (" ".repeat (len - true_len))
## PRIVATE ## PRIVATE
print_table header rows = ansi_bold enabled txt =
case Platform.os of
## Output formatting for Windows is not currently supported.
Platform.Windows -> txt
_ -> if enabled then '\e[1m' + txt + '\e[m' else txt
## PRIVATE
print_table header rows format_term =
content_lengths = Vector.new header.length i-> content_lengths = Vector.new header.length i->
max_row = 0.up_to rows.length . fold 0 a-> j-> max a (rows.at j . at i . characters . length) max_row = 0.up_to rows.length . fold 0 a-> j-> max a (rows.at j . at i . characters . length)
max max_row (header.at i . characters . length) max max_row (header.at i . characters . length)
header_line = zip header content_lengths here.pad . join ' | ' header_line = zip header content_lengths here.pad . map (here.ansi_bold format_term) . join ' | '
divider = content_lengths . map (l -> "-".repeat l+2) . join '+' divider = content_lengths . map (l -> "-".repeat l+2) . join '+'
row_lines = rows.map r-> row_lines = rows.map r->
x = zip r content_lengths here.pad . join ' | ' x = zip r content_lengths here.pad
" " + x with_bold_ix = [here.ansi_bold format_term (x.at 0)] + x.drop_start 1
y = with_bold_ix . join ' | '
" " + y
([" " + header_line, divider] + row_lines).join '\n' ([" " + header_line, divider] + row_lines).join '\n'

View File

@ -8,3 +8,25 @@ from Table.Io.Csv export all hiding Parser
export Table.Data.Column export Table.Data.Column
from Table.Data.Table export new from Table.Data.Table export new
## Converts a JSON array into a dataframe, by looking up the requested keys
   from each item.
   It assumes the items are JSON objects containing the requested keys.
   In case an item is not an object, or the requested key does not exist,
   the relevant values of the table will be set to `Nothing`.
   Values that are found are converted with `unwrap` before being stored.
   The resulting table has one column per requested field, named after
   that field.
   Arguments:
   - fields: a vector of texts representing the names of fields to look up.
Json.Array.to_table : Vector -> Table
Json.Array.to_table fields = case this of
Json.Array items ->
rows = items.map <| case _ of
Json.Object fs ->
fields.map n-> case fs.get n of
Nothing -> Nothing
js -> js.unwrap
_ -> Vector.fill fields.length Nothing
cols = fields.map_with_index i-> n->
[n, rows.map (_.at i)]
Table.new cols

View File

@ -1,10 +1,10 @@
package org.enso.table.data.column.storage; package org.enso.table.data.column.storage;
import org.enso.table.data.index.Index;
import java.util.BitSet; import java.util.BitSet;
/** /** A boolean column storage. */
* A boolean column storage.
*/
public class BoolStorage extends Storage { public class BoolStorage extends Storage {
private final BitSet values; private final BitSet values;
private final BitSet isMissing; private final BitSet isMissing;
@ -19,7 +19,7 @@ public class BoolStorage extends Storage {
} }
@Override @Override
public long size() { public int size() {
return size; return size;
} }
@ -44,7 +44,7 @@ public class BoolStorage extends Storage {
@Override @Override
public boolean isOpVectorized(String op) { public boolean isOpVectorized(String op) {
return op.equals(Ops.EQ) || op.equals(Ops.NOT); return op.equals(Ops.EQ) || op.equals(Ops.NOT) || op.equals(Ops.IS_MISSING);
} }
@Override @Override
@ -53,6 +53,8 @@ public class BoolStorage extends Storage {
return runVectorizedEq(operand); return runVectorizedEq(operand);
} else if (Ops.NOT.equals(name)) { } else if (Ops.NOT.equals(name)) {
return new BoolStorage(values, isMissing, size, !negated); return new BoolStorage(values, isMissing, size, !negated);
} else if (Ops.IS_MISSING.equals(name)) {
return new BoolStorage(isMissing, new BitSet(), size, false);
} }
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@ -98,6 +100,36 @@ public class BoolStorage extends Storage {
return new BoolStorage(newValues, newMissing, cardinality, negated); return new BoolStorage(newValues, newMissing, cardinality, negated);
} }
/**
 * Builds a new storage whose i-th element is the {@code positions[i]}-th element of this
 * storage.
 *
 * <p>A position equal to {@link Index#NOT_FOUND}, or one that points at a missing element of
 * this storage, produces a missing value in the result. This storage itself is not modified.
 *
 * @param positions for every element of the result, the position in this storage to read from
 * @return the reordered storage, holding {@code positions.length} elements
 */
@Override
public Storage orderMask(int[] positions) {
  BitSet newNa = new BitSet();
  BitSet newVals = new BitSet();
  for (int i = 0; i < positions.length; i++) {
    int position = positions[i];
    // NOT_FOUND is tested first so the bit sets are never queried with the sentinel value.
    if (position == Index.NOT_FOUND || isMissing.get(position)) {
      newNa.set(i);
    } else if (values.get(position)) {
      // The bit belongs to the result; writing it into `values` would corrupt this storage
      // and leave the result without any set values.
      newVals.set(i);
    }
  }
  // The negation flag is carried over unchanged, so raw bits can be copied as they are.
  return new BoolStorage(newVals, newNa, positions.length, negated);
}
/**
 * Builds a new storage in which the i-th element of this storage is repeated {@code counts[i]}
 * times in a row.
 *
 * @param counts the number of consecutive copies to emit for every element
 * @param total the size of the resulting storage
 * @return the expanded storage
 */
@Override
public Storage countMask(int[] counts, int total) {
  BitSet expandedMissing = new BitSet();
  BitSet expandedValues = new BitSet();
  int start = 0;
  for (int source = 0; source < counts.length; source++) {
    int end = start + counts[source];
    if (isMissing.get(source)) {
      expandedMissing.set(start, end);
    } else if (values.get(source)) {
      expandedValues.set(start, end);
    }
    // Unset, non-missing elements need no bits; only the write position moves on.
    start = end;
  }
  return new BoolStorage(expandedValues, expandedMissing, total, negated);
}
public boolean isNegated() { public boolean isNegated() {
return negated; return negated;
} }

View File

@ -1,5 +1,7 @@
package org.enso.table.data.column.storage; package org.enso.table.data.column.storage;
import org.enso.table.data.index.Index;
import java.util.BitSet; import java.util.BitSet;
import java.util.function.Function; import java.util.function.Function;
@ -24,7 +26,7 @@ public class DoubleStorage extends Storage {
/** @inheritDoc */ /** @inheritDoc */
@Override @Override
public long size() { public int size() {
return size; return size;
} }
@ -55,13 +57,15 @@ public class DoubleStorage extends Storage {
@Override @Override
public boolean isOpVectorized(String op) { public boolean isOpVectorized(String op) {
return op.equals("=="); return Ops.EQ.equals(op) || Ops.IS_MISSING.equals(op);
} }
@Override @Override
public Storage runVectorizedOp(String name, Object operand) { public Storage runVectorizedOp(String name, Object operand) {
if (name.equals("==")) { if (name.equals(Ops.EQ)) {
return runVectorizedEq(operand); return runVectorizedEq(operand);
} else if (name.equals(Ops.IS_MISSING)) {
return new BoolStorage(isMissing, new BitSet(), size, false);
} }
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@ -98,4 +102,36 @@ public class DoubleStorage extends Storage {
} }
return new DoubleStorage(newData, cardinality, newMissing); return new DoubleStorage(newData, cardinality, newMissing);
} }
/**
 * Builds a new storage whose i-th element is the {@code positions[i]}-th element of this
 * storage; {@link Index#NOT_FOUND} and missing source elements yield missing values.
 *
 * @param positions for every element of the result, the position in this storage to read from
 * @return the reordered storage, holding {@code positions.length} elements
 */
@Override
public Storage orderMask(int[] positions) {
  int length = positions.length;
  long[] reordered = new long[length];
  BitSet reorderedMissing = new BitSet();
  for (int target = 0; target < length; target++) {
    int source = positions[target];
    boolean absent = source == Index.NOT_FOUND || isMissing.get(source);
    if (absent) {
      reorderedMissing.set(target);
      continue;
    }
    reordered[target] = data[source];
  }
  return new DoubleStorage(reordered, length, reorderedMissing);
}
/**
 * Builds a new storage in which the i-th element of this storage is repeated {@code counts[i]}
 * times in a row.
 *
 * @param counts the number of consecutive copies to emit for every element
 * @param total the size of the resulting storage
 * @return the expanded storage
 */
@Override
public Storage countMask(int[] counts, int total) {
  long[] expanded = new long[total];
  BitSet expandedMissing = new BitSet();
  int cursor = 0;
  for (int source = 0; source < counts.length; source++) {
    int copies = counts[source];
    if (!isMissing.get(source)) {
      // Present value: write it `copies` times, advancing the cursor with every write.
      for (int end = cursor + copies; cursor < end; cursor++) {
        expanded[cursor] = data[source];
      }
    } else {
      expandedMissing.set(cursor, cursor + copies);
      cursor += copies;
    }
  }
  return new DoubleStorage(expanded, total, expandedMissing);
}
} }

View File

@ -1,5 +1,7 @@
package org.enso.table.data.column.storage; package org.enso.table.data.column.storage;
import org.enso.table.data.index.Index;
import java.util.BitSet; import java.util.BitSet;
/** A column storing 64-bit integers. */ /** A column storing 64-bit integers. */
@ -22,7 +24,7 @@ public class LongStorage extends Storage {
/** @inheritDoc */ /** @inheritDoc */
@Override @Override
public long size() { public int size() {
return size; return size;
} }
@ -53,13 +55,15 @@ public class LongStorage extends Storage {
@Override @Override
public boolean isOpVectorized(String op) { public boolean isOpVectorized(String op) {
return Ops.EQ.equals(op); return Ops.EQ.equals(op) || Ops.IS_MISSING.equals(op);
} }
@Override @Override
public Storage runVectorizedOp(String name, Object operand) { public Storage runVectorizedOp(String name, Object operand) {
if (Ops.EQ.equals(name)) { if (Ops.EQ.equals(name)) {
return runVectorizedEq(operand); return runVectorizedEq(operand);
} else if (Ops.IS_MISSING.equals(name)) {
return new BoolStorage(isMissing, new BitSet(), size, false);
} }
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@ -94,4 +98,36 @@ public class LongStorage extends Storage {
} }
return new LongStorage(newData, cardinality, newMissing); return new LongStorage(newData, cardinality, newMissing);
} }
/**
 * Builds a new storage whose i-th element is the {@code positions[i]}-th element of this
 * storage; {@link Index#NOT_FOUND} and missing source elements yield missing values.
 *
 * @param positions for every element of the result, the position in this storage to read from
 * @return the reordered storage, holding {@code positions.length} elements
 */
@Override
public Storage orderMask(int[] positions) {
  final int count = positions.length;
  final long[] picked = new long[count];
  final BitSet pickedMissing = new BitSet();
  int slot = 0;
  while (slot < count) {
    final int from = positions[slot];
    if (from != Index.NOT_FOUND && !isMissing.get(from)) {
      picked[slot] = data[from];
    } else {
      pickedMissing.set(slot);
    }
    slot++;
  }
  return new LongStorage(picked, count, pickedMissing);
}
/**
 * Builds a new storage in which the i-th element of this storage is repeated {@code counts[i]}
 * times in a row.
 *
 * @param counts the number of consecutive copies to emit for every element
 * @param total the size of the resulting storage
 * @return the expanded storage
 */
@Override
public Storage countMask(int[] counts, int total) {
  final long[] repeated = new long[total];
  final BitSet repeatedMissing = new BitSet();
  int write = 0;
  for (int read = 0; read < counts.length; read++) {
    final int times = counts[read];
    if (isMissing.get(read)) {
      // A missing element expands into a run of missing slots.
      repeatedMissing.set(write, write + times);
      write += times;
      continue;
    }
    final int stop = write + times;
    while (write < stop) {
      repeated[write++] = data[read];
    }
  }
  return new LongStorage(repeated, total, repeatedMissing);
}
} }

View File

@ -3,6 +3,7 @@ package org.enso.table.data.column.storage;
import org.enso.table.data.column.builder.object.BoolBuilder; import org.enso.table.data.column.builder.object.BoolBuilder;
import org.enso.table.data.column.builder.object.Builder; import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.InferredBuilder; import org.enso.table.data.column.builder.object.InferredBuilder;
import org.enso.table.data.index.Index;
import java.util.BitSet; import java.util.BitSet;
import java.util.function.Function; import java.util.function.Function;
@ -23,7 +24,7 @@ public class ObjectStorage extends Storage {
/** @inheritDoc */ /** @inheritDoc */
@Override @Override
public long size() { public int size() {
return size; return size;
} }
@ -54,14 +55,27 @@ public class ObjectStorage extends Storage {
@Override @Override
public boolean isOpVectorized(String op) { public boolean isOpVectorized(String op) {
return false; return Ops.IS_MISSING.equals(op);
} }
@Override @Override
public Storage runVectorizedOp(String name, Object operand) { public Storage runVectorizedOp(String name, Object operand) {
if (Ops.IS_MISSING.equals(name)) {
return runIsMissing();
}
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
/**
 * Computes the result of the {@code is_missing} operation.
 *
 * @return a boolean storage that is true exactly at the positions holding {@code null}
 */
private BoolStorage runIsMissing() {
  BitSet nullPositions = new BitSet();
  int position = 0;
  while (position < size) {
    if (data[position] == null) {
      nullPositions.set(position);
    }
    position++;
  }
  // The result itself has no missing entries, hence the empty second bit set.
  return new BoolStorage(nullPositions, new BitSet(), size, false);
}
@Override @Override
public ObjectStorage mask(BitSet mask, int cardinality) { public ObjectStorage mask(BitSet mask, int cardinality) {
Object[] newData = new Object[cardinality]; Object[] newData = new Object[cardinality];
@ -74,7 +88,32 @@ public class ObjectStorage extends Storage {
return new ObjectStorage(newData, cardinality); return new ObjectStorage(newData, cardinality);
} }
protected Object[] getData() { @Override
public ObjectStorage orderMask(int[] positions) {
  // The i-th result slot receives the element found at positions[i]; the NOT_FOUND sentinel
  // stands for "no such element" and produces null.
  int length = positions.length;
  Object[] reordered = new Object[length];
  for (int target = 0; target < length; target++) {
    int source = positions[target];
    reordered[target] = source == Index.NOT_FOUND ? null : data[source];
  }
  return new ObjectStorage(reordered, length);
}
/**
 * Builds a new storage in which the i-th element of this storage is repeated {@code counts[i]}
 * times in a row.
 *
 * @param counts the number of consecutive copies to emit for every element
 * @param total the size of the resulting storage
 * @return the expanded storage
 */
@Override
public ObjectStorage countMask(int[] counts, int total) {
  Object[] expanded = new Object[total];
  int cursor = 0;
  int source = 0;
  for (int copies : counts) {
    for (int end = cursor + copies; cursor < end; cursor++) {
      expanded[cursor] = data[source];
    }
    source++;
  }
  return new ObjectStorage(expanded, total);
}
public Object[] getData() {
return data; return data;
} }
} }

View File

@ -6,10 +6,12 @@ import org.enso.table.data.column.builder.object.InferredBuilder;
import java.util.BitSet; import java.util.BitSet;
import java.util.function.Function; import java.util.function.Function;
import org.graalvm.polyglot.Value;
/** An abstract representation of a data column. */ /** An abstract representation of a data column. */
public abstract class Storage { public abstract class Storage {
/** @return the number of elements in this column (including NAs) */ /** @return the number of elements in this column (including NAs) */
public abstract long size(); public abstract int size();
/** @return the type tag of this column's storage. Must be one of {@link Type} */ /** @return the type tag of this column's storage. Must be one of {@link Type} */
public abstract long getType(); public abstract long getType();
@ -49,6 +51,7 @@ public abstract class Storage {
public static final class Ops { public static final class Ops {
public static final String EQ = "=="; public static final String EQ = "==";
public static final String NOT = "not"; public static final String NOT = "not";
public static final String IS_MISSING = "is_missing";
} }
/** /**
@ -69,15 +72,6 @@ public abstract class Storage {
*/ */
public abstract Storage runVectorizedOp(String name, Object operand); public abstract Storage runVectorizedOp(String name, Object operand);
/**
* Return a new storage, containing only the items marked true in the mask.
*
* @param mask the mask to use
* @param cardinality the number of true values in mask
* @return a new storage, masked with the given mask
*/
public abstract Storage mask(BitSet mask, int cardinality);
/** /**
* Runs a function on each non-missing element in this storage and gathers the results. * Runs a function on each non-missing element in this storage and gathers the results.
* *
@ -96,4 +90,38 @@ public abstract class Storage {
} }
return builder.seal(); return builder.seal();
} }
/**
* Return a new storage, containing only the items marked true in the mask.
*
* @param mask the mask to use
* @param cardinality the number of true values in mask; the implementations use it as the size
*     of the resulting storage
* @return a new storage, masked with the given mask
*/
public abstract Storage mask(BitSet mask, int cardinality);
/**
* Returns a new storage, ordered according to the rules specified in a mask. The resulting
* storage should contain the {@code positions[i]}-th element of the original storage at the i-th
* position. {@code positions[i]} may be equal to {@link
* org.enso.table.data.index.Index#NOT_FOUND}, in which case a missing value should be inserted at
* this position.
*
* @param positions an array specifying the ordering as described; its length is the size of the
*     resulting storage
* @return a storage resulting from applying the reordering rules
*/
public abstract Storage orderMask(int[] positions);
/**
* Returns a new storage, resulting from applying the rules specified in a mask. The resulting
* storage should contain the elements of the original storage, in the same order. However, the
* number of consecutive copies of the i-th element of the original storage should be {@code
* counts[i]}. A count of zero therefore leaves the element out of the result.
*
* @param counts the mask specifying elements duplication
* @param total the sum of all elements in the mask, also interpreted as the size of the resulting
*     storage
* @return the storage masked according to the specified rules
*/
public abstract Storage countMask(int[] counts, int total);
} }

View File

@ -1,5 +1,7 @@
package org.enso.table.data.column.storage; package org.enso.table.data.column.storage;
import org.enso.table.data.index.Index;
import java.util.BitSet; import java.util.BitSet;
/** A column storing strings. */ /** A column storing strings. */
@ -29,7 +31,7 @@ public class StringStorage extends ObjectStorage {
@Override @Override
public boolean isOpVectorized(String op) { public boolean isOpVectorized(String op) {
return op.equals("=="); return op.equals("==") || super.isOpVectorized(op);
} }
@Override @Override
@ -37,7 +39,7 @@ public class StringStorage extends ObjectStorage {
if (Ops.EQ.equals(name)) { if (Ops.EQ.equals(name)) {
return runVectorizedEq(operand); return runVectorizedEq(operand);
} }
throw new UnsupportedOperationException(); return super.runVectorizedOp(name, operand);
} }
public BoolStorage runVectorizedEq(Object that) { public BoolStorage runVectorizedEq(Object that) {
@ -58,4 +60,16 @@ public class StringStorage extends ObjectStorage {
ObjectStorage storage = super.mask(mask, cardinality); ObjectStorage storage = super.mask(mask, cardinality);
return new StringStorage(storage.getData(), cardinality); return new StringStorage(storage.getData(), cardinality);
} }
/**
 * Reorders this storage like {@link ObjectStorage#orderMask(int[])}, re-wrapping the result so
 * that it stays a string storage.
 *
 * @param positions for every element of the result, the position in this storage to read from
 * @return the reordered string storage
 */
@Override
public StringStorage orderMask(int[] positions) {
  ObjectStorage reordered = super.orderMask(positions);
  Object[] items = reordered.getData();
  return new StringStorage(items, reordered.size());
}
/**
 * Expands this storage like {@link ObjectStorage#countMask(int[], int)}, re-wrapping the result
 * so that it stays a string storage.
 *
 * @param counts the number of consecutive copies to emit for every element
 * @param total the size of the resulting storage
 * @return the expanded string storage
 */
@Override
public StringStorage countMask(int[] counts, int total) {
  Object[] expanded = super.countMask(counts, total).getData();
  return new StringStorage(expanded, total);
}
} }

View File

@ -0,0 +1,53 @@
package org.enso.table.data.index;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;
/**
 * The default index: every row is labelled with its own 0-based position, so no items have to be
 * stored besides the number of rows.
 */
public class DefaultIndex extends Index {
  private final int size;

  /**
   * Creates a default index.
   *
   * @param size the number of positions covered by this index
   */
  public DefaultIndex(int size) {
    this.size = size;
  }

  /**
   * Returns the label at the given position, which is the position itself.
   *
   * @param loc the position
   * @return {@code loc}
   */
  @Override
  public Integer iloc(int loc) {
    return loc;
  }

  /**
   * Looks up the positions labelled with the given item.
   *
   * @param item the label to look up; only {@link Integer} and {@link Long} values can match
   * @return a single-element list holding the position, or {@code null} when {@code item} is not
   *     an integral number within {@code [0, size)}
   */
  @Override
  public List<Integer> loc(Object item) {
    long position;
    if (item instanceof Integer) {
      position = (Integer) item;
    } else if (item instanceof Long) {
      position = (Long) item;
    } else {
      return null;
    }
    // Negative labels never exist in this index; reporting them as found would hand callers a
    // position that is not a valid row (and -1 collides with Index.NOT_FOUND).
    if (position < 0 || position >= size) {
      return null;
    }
    return Collections.singletonList((int) position);
  }

  @Override
  public String ilocString(int loc) {
    return String.valueOf(loc);
  }

  /** @return the empty string, as the default index is unnamed */
  @Override
  public String getName() {
    return "";
  }

  /** Masking only changes the number of rows, so a fresh default index of that size suffices. */
  @Override
  public Index mask(BitSet mask, int cardinality) {
    return new DefaultIndex(cardinality);
  }

  /** Duplicating rows only changes the number of rows; labels are renumbered from zero. */
  @Override
  public Index countMask(int[] counts, int total) {
    return new DefaultIndex(total);
  }
}

View File

@ -0,0 +1,92 @@
package org.enso.table.data.index;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import java.util.*;
import java.util.stream.Collectors;
/**
 * An index holding arbitrary items, with a hash map from every item to the list of positions at
 * which it occurs.
 */
public class HashIndex extends Index {
  private final Object[] items;
  private final Map<Object, List<Integer>> locs;
  private final String name;
  private final int size;

  private HashIndex(Object[] items, Map<Object, List<Integer>> locs, String name, int size) {
    this.items = items;
    this.locs = locs;
    this.name = name;
    this.size = size;
  }

  /** Creates an index over the first {@code size} items, computing the lookup table. */
  private HashIndex(String name, Object[] items, int size) {
    this(items, buildLocations(items, size), name, size);
  }

  /** Maps every item among the first {@code size} ones to its positions, in ascending order. */
  private static Map<Object, List<Integer>> buildLocations(Object[] items, int size) {
    Map<Object, List<Integer>> locations = new HashMap<>();
    for (int i = 0; i < size; i++) {
      locations.computeIfAbsent(items[i], x -> new ArrayList<>()).add(i);
    }
    return locations;
  }

  /**
   * Creates an index from the items of a storage.
   *
   * @param name the name of the index
   * @param storage the storage providing the (boxed) items
   * @return an index containing all items of {@code storage}, in order
   */
  public static HashIndex fromStorage(String name, Storage storage) {
    int length = storage.size();
    Object[] data = new Object[length];
    for (int i = 0; i < length; i++) {
      data[i] = storage.getItemBoxed(i);
    }
    return new HashIndex(name, data, length);
  }

  @Override
  public Object iloc(int i) {
    return items[i];
  }

  @Override
  public List<Integer> loc(Object item) {
    return locs.get(item);
  }

  @Override
  public String ilocString(int loc) {
    // String.valueOf tolerates a null item, where calling toString() on it would throw.
    return String.valueOf(iloc(loc));
  }

  @Override
  public String getName() {
    return name;
  }

  @Override
  public Index mask(BitSet mask, int cardinality) {
    Object[] newItems = new Object[cardinality];
    int j = 0;
    for (int i = 0; i < size; i++) {
      if (mask.get(i)) {
        newItems[j++] = items[i];
      }
    }
    // Dropping rows shifts the positions of all later items, so the lookup table is rebuilt from
    // the compacted items rather than filtered from the old one (which would keep stale,
    // pre-mask positions).
    return new HashIndex(name, newItems, cardinality);
  }

  @Override
  public Index countMask(int[] counts, int total) {
    Object[] newItems = new Object[total];
    int pos = 0;
    for (int i = 0; i < size; i++) {
      for (int j = 0; j < counts[i]; j++) {
        newItems[pos++] = items[i];
      }
    }
    return new HashIndex(name, newItems, total);
  }
}

View File

@ -0,0 +1,58 @@
package org.enso.table.data.index;
import java.util.BitSet;
import java.util.List;
/** A storage class for ordered multisets. */
public abstract class Index {
/**
* Position value standing for "no such element". Storages' {@code orderMask} implementations
* insert a missing value wherever this position is requested.
*/
public static final int NOT_FOUND = -1;
/**
* Returns the element at a given (0-based) position.
*
* @param loc the position
* @return the corresponding element
*/
public abstract Object iloc(int loc);
/**
* Returns a string representation of the item at a given position.
*
* @param loc the position
* @return a string representing the element at the given position
*/
public abstract String ilocString(int loc);
/**
* Returns the list of positions where the given object is contained. The result may be null if
* the item is not found.
*
* @param item the item to lookup
* @return the list of all positions containing {@code item}, or null
*/
public abstract List<Integer> loc(Object item);
/** @return the name of this index */
public abstract String getName();
/**
* Return a new index, containing only the items marked true in the mask.
*
* @param mask the mask to use
* @param cardinality the number of true values in mask
* @return a new index, masked with the given mask
*/
public abstract Index mask(BitSet mask, int cardinality);
/**
* Returns a new index, resulting from applying the rules specified in a mask. The resulting index
* should contain the elements of the original index, in the same order. However, the number of
* consecutive copies of the i-th element of the original index should be {@code counts[i]}.
*
* @param counts the mask specifying elements duplication
* @param total the sum of all elements in the mask, also interpreted as the size of the resulting
*     index
* @return the index masked according to the specified rules
*/
public abstract Index countMask(int[] counts, int total);
}

View File

@ -2,6 +2,8 @@ package org.enso.table.data.table;
import org.enso.table.data.column.builder.object.InferredBuilder; import org.enso.table.data.column.builder.object.InferredBuilder;
import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.index.Index;
import org.enso.table.data.index.DefaultIndex;
import java.util.BitSet; import java.util.BitSet;
import java.util.List; import java.util.List;
@ -10,6 +12,7 @@ import java.util.List;
public class Column { public class Column {
private final String name; private final String name;
private final Storage storage; private final Storage storage;
private final Index index;
/** /**
* Creates a new column. * Creates a new column.
@ -17,9 +20,10 @@ public class Column {
* @param name the column name * @param name the column name
* @param storage the underlying storage * @param storage the underlying storage
*/ */
public Column(String name, Storage storage) { public Column(String name, Index index, Storage storage) {
this.name = name; this.name = name;
this.storage = storage; this.storage = storage;
this.index = index;
} }
/** @return the column name */ /** @return the column name */
@ -44,8 +48,8 @@ public class Column {
* @param cardinality the number of true values in mask * @param cardinality the number of true values in mask
* @return a new column, masked with the given mask * @return a new column, masked with the given mask
*/ */
public Column mask(BitSet mask, int cardinality) { public Column mask(Index maskedIndex, BitSet mask, int cardinality) {
return new Column(name, storage.mask(mask, cardinality)); return new Column(name, maskedIndex, storage.mask(mask, cardinality));
} }
/** /**
@ -55,7 +59,7 @@ public class Column {
* @return a new column with the given name * @return a new column with the given name
*/ */
public Column rename(String name) { public Column rename(String name) {
return new Column(name, storage); return new Column(name, index, storage);
} }
/** /**
@ -70,6 +74,21 @@ public class Column {
for (Object item : items) { for (Object item : items) {
builder.append(item); builder.append(item);
} }
return new Column(name, builder.seal()); return new Column(name, new DefaultIndex(items.size()), builder.seal());
}
/**
 * Produces a new column that shares this column's name and storage but uses a different index.
 *
 * @param ix the index the resulting column should use
 * @return a column with the same name and storage, indexed by {@code ix}
 */
public Column withIndex(Index ix) {
  Column reindexed = new Column(this.name, ix, this.storage);
  return reindexed;
}
/**
 * Returns the index this column was constructed with.
 *
 * @return the index of this column
 */
public Index getIndex() {
return index;
} }
} }

View File

@ -1,14 +1,21 @@
package org.enso.table.data.table; package org.enso.table.data.table;
import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.index.Index;
import org.enso.table.data.index.DefaultIndex;
import org.enso.table.data.index.HashIndex;
import org.enso.table.error.NoSuchColumnException;
import org.enso.table.error.UnexpectedColumnTypeException; import org.enso.table.error.UnexpectedColumnTypeException;
import java.util.BitSet; import java.util.*;
import java.util.stream.Collectors;
/** A representation of a table structure. */ /** A representation of a table structure. */
public class Table { public class Table {
private final Column[] columns; private final Column[] columns;
private final Index index;
/** /**
* Creates a new table * Creates a new table
@ -16,7 +23,15 @@ public class Table {
* @param columns the columns contained in this table. * @param columns the columns contained in this table.
*/ */
public Table(Column[] columns) { public Table(Column[] columns) {
this(
columns,
new DefaultIndex(
(columns == null || columns.length == 0) ? 0 : (int) columns[0].getSize()));
}
/**
 * Creates a new table with an explicitly provided index.
 *
 * @param columns the columns contained in this table
 * @param index the index to store for this table
 */
private Table(Column[] columns, Index index) {
this.columns = columns; this.columns = columns;
this.index = index;
} }
/** @return the number of rows in this table */ /** @return the number of rows in this table */
@ -68,10 +83,11 @@ public class Table {
mask.andNot(storage.getIsMissing()); mask.andNot(storage.getIsMissing());
int cardinality = mask.cardinality(); int cardinality = mask.cardinality();
Column[] newColumns = new Column[columns.length]; Column[] newColumns = new Column[columns.length];
Index newIx = index.mask(mask, cardinality);
for (int i = 0; i < columns.length; i++) { for (int i = 0; i < columns.length; i++) {
newColumns[i] = columns[i].mask(mask, cardinality); newColumns[i] = columns[i].mask(newIx, mask, cardinality);
} }
return new Table(newColumns); return new Table(newColumns, newIx);
} }
/** /**
@ -99,13 +115,136 @@ public class Table {
Column[] newCols = new Column[columns.length]; Column[] newCols = new Column[columns.length];
System.arraycopy(columns, 0, newCols, 0, columns.length); System.arraycopy(columns, 0, newCols, 0, columns.length);
newCols[ix] = newCol; newCols[ix] = newCol;
return new Table(newCols); return new Table(newCols, index);
} }
private Table addColumn(Column newColumn) { private Table addColumn(Column newColumn) {
Column[] newCols = new Column[columns.length + 1]; Column[] newCols = new Column[columns.length + 1];
System.arraycopy(columns, 0, newCols, 0, columns.length); System.arraycopy(columns, 0, newCols, 0, columns.length);
newCols[columns.length] = newColumn; newCols[columns.length] = newColumn;
return new Table(newCols); return new Table(newCols, index);
}
/**
 * Gives access to the index of this table.
 *
 * @return the index this table holds
 */
public Index getIndex() {
  return this.index;
}
/**
 * Reindexes this table by using values from the column with the given name.
 *
 * <p>The columns carrying the given name are removed from the result, and every remaining column
 * is re-attached to the newly built index.
 *
 * @param name the column name to use as index
 * @return a table indexed by the proper column
 * @throws NoSuchColumnException if no column with the given name exists
 */
public Table indexFromColumn(String name) {
  Column col = getColumnByName(name);
  if (col == null) throw new NoSuchColumnException(name);
  Index ix = HashIndex.fromStorage(col.getName(), col.getStorage());
  // Build an exactly-sized array. Pre-allocating `columns.length - 1` slots and filling them by
  // name would leave trailing null entries whenever more than one column is called `name`.
  Column[] newColumns =
      Arrays.stream(columns)
          .filter(c -> !c.getName().equals(name))
          .map(c -> c.withIndex(ix))
          .toArray(Column[]::new);
  return new Table(newColumns, ix);
}
/**
 * Builds a table made of the columns of this table that are listed in {@code colNames}.
 *
 * <p>Columns appear in the order in which their names are listed. A name for which the column
 * lookup yields {@code null} is skipped. The index of this table is reused for the result.
 *
 * @param colNames the column names to select
 * @return a table containing only selected columns
 */
public Table selectColumns(List<String> colNames) {
  List<Column> selected = new ArrayList<>();
  for (String colName : colNames) {
    Column found = getColumnByName(colName);
    if (found != null) {
      selected.add(found);
    }
  }
  return new Table(selected.toArray(new Column[0]), index);
}
/**
 * Joins this table with another, by combining rows from this with rows of other with a matching
 * index.
 *
 * <p>Each row of this table is emitted once for every matching position in the index of {@code
 * other}. A row with no match is either dropped or emitted once, depending on {@code
 * dropUnmatched}; in the latter case the positions taken from {@code other} are marked with
 * {@code Index.NOT_FOUND}.
 *
 * @param other the table being joined with
 * @param dropUnmatched whether the rows containing unmatched values in this should be dropped
 * @param on a column name in this that should be used as the join key. If this is null, index is
 *     used instead
 * @param lsuffix the suffix to add to names of columns of this in case there's a name conflict
 * @param rsuffix the suffix to add to names of columns of other in case there's a name conflict
 * @return the result of performing the join
 * @throws NoSuchColumnException if {@code on} is not null and names no column of this table
 */
@SuppressWarnings("unchecked")
public Table join(Table other, boolean dropUnmatched, String on, String lsuffix, String rsuffix) {
  int s = (int) nrows();
  // matches[i] holds the positions in other's index matching row i of this table, or null when
  // the lookup found nothing.
  List<Integer>[] matches = new List[s];
  if (on == null) {
    for (int i = 0; i < s; i++) {
      matches[i] = other.index.loc(index.iloc(i));
    }
  } else {
    Column onColumn = getColumnByName(on);
    // Fail with the same error as indexFromColumn instead of a bare NullPointerException.
    if (onColumn == null) throw new NoSuchColumnException(on);
    Storage onS = onColumn.getStorage();
    for (int i = 0; i < s; i++) {
      matches[i] = other.index.loc(onS.getItemBoxed(i));
    }
  }

  // countMask[i] is the number of output rows produced by row i of this table.
  int outSize = 0;
  int[] countMask = new int[s];
  for (int i = 0; i < s; i++) {
    if (matches[i] == null) {
      countMask[i] = dropUnmatched ? 0 : 1;
    } else {
      countMask[i] = matches[i].size();
    }
    outSize += countMask[i];
  }

  // orderMask[k] is the position in other that supplies output row k, or Index.NOT_FOUND for a
  // kept row that has no match.
  int[] orderMask = new int[outSize];
  int orderMaskPosition = 0;
  for (int i = 0; i < s; i++) {
    if (matches[i] == null) {
      if (!dropUnmatched) {
        orderMask[orderMaskPosition++] = Index.NOT_FOUND;
      }
    } else {
      for (Integer x : matches[i]) {
        orderMask[orderMaskPosition++] = x;
      }
    }
  }

  Column[] newColumns = new Column[this.columns.length + other.columns.length];
  Index newIndex = index.countMask(countMask, outSize);
  Set<String> lnames =
      Arrays.stream(this.columns).map(Column::getName).collect(Collectors.toSet());
  Set<String> rnames =
      Arrays.stream(other.columns).map(Column::getName).collect(Collectors.toSet());
  // Columns of this table come first, each row repeated according to countMask.
  for (int i = 0; i < columns.length; i++) {
    Column original = columns[i];
    newColumns[i] =
        new Column(
            suffixIfNecessary(rnames, original.getName(), lsuffix),
            newIndex,
            original.getStorage().countMask(countMask, outSize));
  }
  // Columns of other follow, with their rows picked according to orderMask.
  for (int i = 0; i < other.columns.length; i++) {
    Column original = other.columns[i];
    newColumns[i + columns.length] =
        new Column(
            suffixIfNecessary(lnames, original.getName(), rsuffix),
            newIndex,
            original.getStorage().orderMask(orderMask));
  }
  return new Table(newColumns, newIndex);
}
/**
 * Appends a suffix to a name, but only when that name is contained in the given set of names.
 *
 * @param names the set of names to check {@code name} against
 * @param name the name to possibly extend
 * @param suffix the text appended to {@code name} when {@code names} contains it
 * @return {@code name + suffix} if {@code names} contains {@code name}, {@code name} otherwise
 */
private String suffixIfNecessary(Set<String> names, String name, String suffix) {
return names.contains(name) ? name + suffix : name;
} }
} }

View File

@ -0,0 +1,16 @@
package org.enso.table.error;
/** An exception thrown when a column is looked up by a non-existent name. */
public class NoSuchColumnException extends RuntimeException {
  private final String name;

  /**
   * Creates a new instance of this error.
   *
   * @param name the column name
   */
  public NoSuchColumnException(String name) {
    super("The column with name " + name + " does not exist.");
    this.name = name;
  }

  /**
   * Returns the column name this error was created with, so that handlers do not have to parse
   * it back out of the message.
   *
   * @return the column name whose lookup failed
   */
  public String getName() {
    return name;
  }
}

View File

@ -5,6 +5,7 @@ import com.univocity.parsers.csv.CsvParserSettings;
import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.builder.string.StorageBuilder; import org.enso.table.data.column.builder.string.StorageBuilder;
import org.enso.table.data.column.builder.string.PrimInferredStorageBuilder; import org.enso.table.data.column.builder.string.PrimInferredStorageBuilder;
import org.enso.table.data.index.DefaultIndex;
import org.enso.table.data.table.Column; import org.enso.table.data.table.Column;
import org.enso.table.data.table.Table; import org.enso.table.data.table.Table;
@ -68,7 +69,7 @@ public class Parser {
for (int i = 0; i < builders.length; i++) { for (int i = 0; i < builders.length; i++) {
String name = header != null ? header[i] : unnamedColumnPrefix + i; String name = header != null ? header[i] : unnamedColumnPrefix + i;
Storage col = builders[i].seal(); Storage col = builders[i].seal();
columns[i] = new Column(name, col); columns[i] = new Column(name, new DefaultIndex(col.size()), col);
} }
return new Table(columns); return new Table(columns);
} }

View File

@ -65,7 +65,18 @@ spec =
t.to_json.should_equal expected t.to_json.should_equal expected
describe "Mapping operations" <| describe "JSON construction" <|
it "should allow converting a JSON array into a table" <|
r_1 = Json.from_pairs [['foo', 20], ['bar', 'baz'], ['baz', False]]
r_2 = Json.from_pairs [['bar', 'xyz'], ['baz', True]]
r_3 = Json.from_pairs [['baz', False], ['foo', 13]]
t = [r_1, r_2, r_3].to_json.to_table ['foo', 'bar', 'baz']
t.columns.map name . should_equal ['foo', 'bar', 'baz']
t.at 'foo' . to_vector . should_equal [20, Nothing, 13]
t.at 'bar' . to_vector . should_equal ['baz', 'xyz', Nothing]
t.at 'baz' . to_vector . should_equal [False, True, False]
describe "Mapping Operations" <|
it "should allow mapping a function over a column" <| it "should allow mapping a function over a column" <|
c_str = Column.from_vector 'x' ['a', 'b', Nothing, 'b'] c_str = Column.from_vector 'x' ['a', 'b', Nothing, 'b']
c_str.map (+ "x") . to_vector . should_equal ['ax', 'bx', Nothing, 'bx'] c_str.map (+ "x") . to_vector . should_equal ['ax', 'bx', Nothing, 'bx']
@ -90,7 +101,7 @@ spec =
c_any = Column.from_vector 'x' [My 1 6, My 6 3, My 2 5, My 3 4, My 200 300] c_any = Column.from_vector 'x' [My 1 6, My 6 3, My 2 5, My 3 4, My 200 300]
(c_any == My 7 0).to_vector.should_equal [True, False, True, True, False] (c_any == My 7 0).to_vector.should_equal [True, False, True, True, False]
describe "Masking tables" <| describe "Masking Tables" <|
it "should allow selecting table rows based on a boolean column" <| it "should allow selecting table rows based on a boolean column" <|
df = (Enso_Project.data / "simple_empty.csv").read_csv df = (Enso_Project.data / "simple_empty.csv").read_csv
r = df.where (Column.from_vector 'x' [True, False, False, True]) r = df.where (Column.from_vector 'x' [True, False, False, True])
@ -103,3 +114,47 @@ spec =
r.at "a" . to_vector . should_equal [4] r.at "a" . to_vector . should_equal [4]
r.at "b" . to_vector . should_equal [Nothing] r.at "b" . to_vector . should_equal [Nothing]
r.at "c" . to_vector . should_equal [6] r.at "c" . to_vector . should_equal [6]
describe "Joining Tables" <|
a_0 = ['x', [0, 1, 7, 3, 6]]
a_1 = ['y', ["foo", "bar", "baz", "spam", "eggs"]]
a = Table.new [a_0, a_1]
b_0 = ['w', [6, 3, 5, 5, 3, 3]]
b_1 = ['z', ["foo", "foo", "bar", "spam", "bar", "eggs"]]
b = Table.new [b_0, b_1]
it "should allow joining tables index-on-index" <|
r_1 = a.set_index 'x' . join (b.set_index 'w')
r_1.at 'y' . to_vector . should_equal ['foo', 'bar', 'baz', 'spam', 'spam', 'spam', 'eggs']
r_1.at 'z' . to_vector . should_equal [Nothing, Nothing, Nothing, 'foo', 'bar', 'eggs', 'foo']
r_2 = a.set_index 'y' . join (b.set_index 'z') drop_unmatched=True
r_2.at 'x' . to_vector . should_equal [0, 0, 1, 1, 3, 6]
r_2.at 'w' . to_vector . should_equal [6, 3, 5, 3, 5, 3]
it "should allow joining tables column-on-index" <|
r_1 = a.join (b.set_index 'w') on='x'
r_1.at 'y' . to_vector . should_equal ['foo', 'bar', 'baz', 'spam', 'spam', 'spam', 'eggs']
r_1.at 'z' . to_vector . should_equal [Nothing, Nothing, Nothing, 'foo', 'bar', 'eggs', 'foo']
r_2 = a.join (b.set_index 'z') drop_unmatched=True on='y'
r_2.at 'x' . to_vector . should_equal [0, 0, 1, 1, 3, 6]
r_2.at 'w' . to_vector . should_equal [6, 3, 5, 3, 5, 3]
it "should allow joining tables on the default index" <|
x = Table.new [['x', [1, 2, 4, 6]]]
y = Table.new [['y', [8, 9]]]
r_1 = x.join y
r_1.at 'x' . to_vector . should_equal [1, 2, 4, 6]
r_1.at 'y' . to_vector . should_equal [8, 9, Nothing, Nothing]
r_2 = x.join y drop_unmatched=True
r_2.at 'x' . to_vector . should_equal [1, 2]
r_2.at 'y' . to_vector . should_equal [8, 9]
it "should append suffixes to disambiguate column names" <|
x = Table.new [['x', [1, 2, 4, 6]]]
y = Table.new [['x', [8, 9]]]
r_1 = x.join y
r_1.columns.map name . should_equal ['x_left', 'x_right']
r_2 = x.join y left_suffix='_old' right_suffix='_new'
r_2.columns.map name . should_equal ['x_old', 'x_new']