diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml index ceb751b9a31..773569ee04e 100644 --- a/.github/workflows/scala.yml +++ b/.github/workflows/scala.yml @@ -268,6 +268,17 @@ jobs: run: | $ENGINE_DIST_DIR/bin/enso.bat --run test/Tests + - name: Test Tables Library (Unix) + shell: bash + if: runner.os != 'Windows' + run: | + $ENGINE_DIST_DIR/bin/enso --run test/Table_Tests + - name: Test Tables Library (Windows) + shell: bash + if: runner.os == 'Windows' + run: | + $ENGINE_DIST_DIR/bin/enso.bat --run test/Table_Tests + # Publish - name: Publish the Engine Distribution Artifact uses: actions/upload-artifact@v2 diff --git a/build.sbt b/build.sbt index f5f19883b35..9c2f976d429 100644 --- a/build.sbt +++ b/build.sbt @@ -59,6 +59,11 @@ GatherLicenses.distributions := Seq( "std-lib-Base", file("distribution/std-lib/Base/THIRD-PARTY"), Distribution.sbtProjects(`std-bits`) + ), + Distribution( + "std-lib-Table", + file("distribution/std-lib/Table/THIRD-PARTY"), + Distribution.sbtProjects(`table`) ) ) GatherLicenses.licenseConfigurations := Set("compile") @@ -996,6 +1001,7 @@ lazy val runtime = (project in file("engine/runtime")) .settings( (Runtime / compile) := (Runtime / compile) .dependsOn(`std-bits` / Compile / packageBin) + .dependsOn(table / Compile / packageBin) .value ) .settings( @@ -1214,6 +1220,7 @@ lazy val `runtime-version-manager-test` = project val `std-lib-root` = file("distribution/std-lib/") val `std-lib-polyglot-root` = `std-lib-root` / "Base" / "polyglot" / "java" +val `table-polyglot-root` = `std-lib-root` / "Table" / "polyglot" / "java" lazy val `std-bits` = project .in(file("std-bits")) @@ -1237,6 +1244,28 @@ lazy val `std-bits` = project }.value ) +lazy val `table` = project + .in(file("table")) + .settings( + autoScalaLibrary := false, + Compile / packageBin / artifactPath := + `table-polyglot-root` / "table.jar", + libraryDependencies ++= Seq( + "com.univocity" % "univocity-parsers" % "2.9.0" + ), + Compile / packageBin := 
Def.task { + val result = (Compile / packageBin).value + StdBits + .copyDependencies( + `table-polyglot-root`, + "table.jar", + ignoreScalaLibrary = true + ) + .value + result + }.value + ) + /* Note [HTTPS in the Launcher] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * The launcher uses Apache HttpClient for making web requests. It does not use diff --git a/distribution/std-lib/Base/src/Data/Json.enso b/distribution/std-lib/Base/src/Data/Json.enso index 2c042637fb7..c03f502f88a 100644 --- a/distribution/std-lib/Base/src/Data/Json.enso +++ b/distribution/std-lib/Base/src/Data/Json.enso @@ -117,3 +117,25 @@ Any.to_json = instead define their own `to_json` implementations. Meta.Polyglot _ -> Null Meta.Primitive _ -> Null + +## Method used by object builders to convert a value into a valid JSON key. +Text.to_json_key : Text +Text.to_json_key = this + +## A smart constructor, building an object representation based on a vector + of key-value pairs. + + All values used as keys must define a `to_json_key : Text` method. + + > Example + The following code: + Json.from_pairs [["foo", 533], ["bar", False]] + Returns a JSON object, that after serialization becomes: + { "foo": 533, "bar": false } +from_pairs : Vector -> Object +from_pairs contents = + fs = contents.fold Map.empty map-> kv_pair-> + key = kv_pair . at 0 . to_json_key + val = kv_pair . at 1 . to_json + map.insert key val + Object fs diff --git a/distribution/std-lib/Base/src/Data/Map.enso b/distribution/std-lib/Base/src/Data/Map.enso index d9b76f1b438..17b012d635d 100644 --- a/distribution/std-lib/Base/src/Data/Map.enso +++ b/distribution/std-lib/Base/src/Data/Map.enso @@ -77,3 +77,6 @@ type Map empty : Map empty = Tip +## Returns a single-element map with the given key and value present. 
+singleton : Any -> Any -> Map +singleton key value = Bin 1 key value Tip Tip diff --git a/distribution/std-lib/Base/src/Data/Text/Extensions.enso b/distribution/std-lib/Base/src/Data/Text/Extensions.enso index 76b3150a5d7..5952fa2c544 100644 --- a/distribution/std-lib/Base/src/Data/Text/Extensions.enso +++ b/distribution/std-lib/Base/src/Data/Text/Extensions.enso @@ -143,3 +143,9 @@ Text.contains sequence = Text_Utils.contains [this, sequence] ## Text to JSON conversion. Text.to_json : Json.String Text.to_json = Json.String this + +## Takes a non-negative integer and returns a new text, consisting of `count` + concatenated copies of `this`. +Text.repeat : Integer -> Text +Text.repeat count = + 0.upto count . fold "" acc-> _-> acc + this diff --git a/distribution/std-lib/Base/src/System/File.enso b/distribution/std-lib/Base/src/System/File.enso index 6d42ac08e75..a0cd50d1dd0 100644 --- a/distribution/std-lib/Base/src/System/File.enso +++ b/distribution/std-lib/Base/src/System/File.enso @@ -6,6 +6,7 @@ export Base.System.File.Option polyglot java import java.nio.file.NoSuchFileException polyglot java import java.nio.file.AccessDeniedException polyglot java import java.io.IOException +polyglot java import java.io.InputStream as Java_Input_Stream type File_Error type No_Such_File_Error file @@ -106,6 +107,13 @@ type Input_Stream close : Unit close = Managed_Resource.finalize this.stream_resource + ## Exposes operations on the underlying Java input stream. + + Useful when integrating with polyglot functions requiring an + `InputStream` as an argument. 
+ with_java_stream : (Java_Input_Stream -> Any) -> Any + with_java_stream f = Managed_Resource.with this.stream_resource f + type File type File prim_file diff --git a/distribution/std-lib/Table/THIRD-PARTY/NOTICE b/distribution/std-lib/Table/THIRD-PARTY/NOTICE new file mode 100644 index 00000000000..34afedd8862 --- /dev/null +++ b/distribution/std-lib/Table/THIRD-PARTY/NOTICE @@ -0,0 +1,7 @@ +Enso +Copyright 2020 New Byte Order sp. z o. o. + +'univocity-parsers', licensed under the Apache 2, is distributed with the std-lib-Table. +The license file can be found at `licenses/APACHE2.0`. +Copyright notices related to this dependency can be found in the directory `com.univocity.univocity-parsers-2.9.0`. + diff --git a/distribution/std-lib/Table/THIRD-PARTY/com.univocity.univocity-parsers-2.9.0/NOTICES b/distribution/std-lib/Table/THIRD-PARTY/com.univocity.univocity-parsers-2.9.0/NOTICES new file mode 100644 index 00000000000..95947ed986c --- /dev/null +++ b/distribution/std-lib/Table/THIRD-PARTY/com.univocity.univocity-parsers-2.9.0/NOTICES @@ -0,0 +1,15 @@ +Copyright 2017 Univocity Software Pty Ltd + +Copyright 2016 Univocity Software Pty Ltd + +Copyright 2018 Univocity Software Pty Ltd + +Copyright (c) 2018. Univocity Software Pty Ltd + +Copyright 2019 Univocity Software Pty Ltd + +Copyright 2014 Univocity Software Pty Ltd + +Copyright 2015 Univocity Software Pty Ltd + +Copyright (c) 2015. Univocity Software Pty Ltd diff --git a/distribution/std-lib/Table/THIRD-PARTY/licenses/APACHE2.0 b/distribution/std-lib/Table/THIRD-PARTY/licenses/APACHE2.0 new file mode 100644 index 00000000000..261eeb9e9f8 --- /dev/null +++ b/distribution/std-lib/Table/THIRD-PARTY/licenses/APACHE2.0 @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/distribution/std-lib/Table/package.yaml b/distribution/std-lib/Table/package.yaml new file mode 100644 index 00000000000..b6539c0c51b --- /dev/null +++ b/distribution/std-lib/Table/package.yaml @@ -0,0 +1,6 @@ +license: APLv2 +name: Table +enso-version: default +version: "0.0.1" +author: "Enso Team " +maintainer: "Enso Team " diff --git a/distribution/std-lib/Table/src/Io/Csv.enso b/distribution/std-lib/Table/src/Io/Csv.enso new file mode 100644 index 00000000000..49bab419b5a --- /dev/null +++ b/distribution/std-lib/Table/src/Io/Csv.enso @@ -0,0 +1,21 @@ +from Base import all +import Table.Table + +polyglot java import org.enso.table.format.csv.Parser + +## Reads the contents of `this` and parses them as a CSV dataframe. 
+ + Arguments: + - has_header: Specifies whether the first line of the file should be + interpreted as a header, containing storage names. If set to `False`, + storage names will be automatically generated. + - prefix: text that should be prepended to automatically generated storage + names. For example, if `prefix` is set to `X`, the columns will be named + `X0`, `X1`, etc. This argument has no effect if the storage name is + inferred from the CSV header row or set manually. +File.File.read_csv : Boolean -> Text -> Table +File.File.read_csv has_header=True prefix='C' = + parser_inst = Parser.create [has_header, prefix] + this.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + Table.Table (parser_inst.parse [java_stream]) diff --git a/distribution/std-lib/Table/src/Main.enso b/distribution/std-lib/Table/src/Main.enso new file mode 100644 index 00000000000..b91c0011be7 --- /dev/null +++ b/distribution/std-lib/Table/src/Main.enso @@ -0,0 +1,5 @@ +from Base import all + +import Table.Io.Csv + +from Table.Io.Csv export all hiding Parser diff --git a/distribution/std-lib/Table/src/Table.enso b/distribution/std-lib/Table/src/Table.enso new file mode 100644 index 00000000000..9b1fd3c3467 --- /dev/null +++ b/distribution/std-lib/Table/src/Table.enso @@ -0,0 +1,95 @@ +from Base import all +import Table.Io.Csv + +## Represents a column-oriented table data structure. +type Table + type Table java_table + + ## Returns a text containing an ASCII-art table displaying this data. + + Arguments: + - show_rows: the number of initial rows that should be displayed.
+ display : Integer -> Text + display show_rows=10 = + cols = Vector (this.java_table.getColumns []) + col_names = cols.map (_.getName []) + col_vals = cols.map (_.getStorage []) + num_rows = this.java_table.nrows [] + display_rows = min num_rows show_rows + rows = Vector.new display_rows row_num-> + col_vals.map col-> + if col.isNa [row_num] then "NA" else here.get_item_string col row_num + table = here.print_table col_names rows + if num_rows - display_rows <= 0 then table else + missing = '\n\u2026 and ' + (num_rows - display_rows).to_text + ' hidden rows.' + table + missing + + ## Converts this table to a JSON structure. + to_json : Json + to_json = + col_jsons = Vector (this.java_table.getColumns []) . map here.column_to_json + cols_json = Json.Array col_jsons + fields = Map.singleton "columns" cols_json + Json.Object fields + +## PRIVATE + + Keep this in sync with `org.enso.table.data.column.Storage.Type.LONG` +storage_type_long = 1 + +## PRIVATE + + Keep this in sync with `org.enso.table.data.column.Storage.Type.DOUBLE` +storage_type_double = 2 + +## PRIVATE + + Keep this in sync with `org.enso.table.data.column.Storage.Type.STRING` +storage_type_string = 3 + +## PRIVATE +string_storage_to_json storage = + Vector.new (storage.size []) ix-> + if storage.isNa [ix] then Json.Null else + Json.String (storage.getItem [ix]) + +## PRIVATE +numeric_storage_to_json storage = + Vector.new (storage.size []) ix-> + if storage.isNa [ix] then Json.Null else + Json.Number (storage.getItem [ix]) + +## PRIVATE +column_to_json col = + name = col.getName [] + storage = col.getStorage [] + storage_type = storage.getType [] + storage_jsons = if storage_type == Storage_Type_String then here.string_storage_to_json storage else + here.numeric_storage_to_json storage + fields = Map.singleton "name" (Json.String name) .
insert "data" (Json.Array storage_jsons) + Json.Object fields + + +## PRIVATE +get_item_string column ix = + tp = column.getType [] + if tp == Storage_Type_String then column.getItem [ix] else + column.getItem [ix] . to_text + +## PRIVATE +pad txt len = + true_len = txt.characters.length + txt + (" ".repeat (len - true_len)) + +## PRIVATE +print_table header rows = + content_lengths = Vector.new header.length i-> + max_row = 0.upto rows.length . fold 0 a-> j-> max a (rows.at j . at i . characters . length) + max max_row (header.at i . characters . length) + header_line = zip header content_lengths here.pad . join ' | ' + divider = content_lengths . map (l -> "-".repeat l+2) . join '+' + row_lines = rows.map r-> + x = zip r content_lengths here.pad . join ' | ' + " " + x + ([" " + header_line, divider] + row_lines).join '\n' + diff --git a/table/src/main/java/org/enso/table/data/column/DoubleStorage.java b/table/src/main/java/org/enso/table/data/column/DoubleStorage.java new file mode 100644 index 00000000000..038c3b5ffb3 --- /dev/null +++ b/table/src/main/java/org/enso/table/data/column/DoubleStorage.java @@ -0,0 +1,48 @@ +package org.enso.table.data.column; + +import java.util.BitSet; + +/** A column containing floating point numbers. */ +public class DoubleStorage extends Storage { + private final long[] data; + private final BitSet isMissing; + private final int size; + + /** + * @param data the underlying data; each element holds the bits of a double, as decoded by {@link Double#longBitsToDouble(long)} + * @param size the number of items stored + * @param isMissing a bit set denoting at index {@code i} whether or not the value at index {@code + * i} is missing. + */ + public DoubleStorage(long[] data, int size, BitSet isMissing) { + this.data = data; + this.isMissing = isMissing; + this.size = size; + } + + /** {@inheritDoc} */ + @Override + public long size() { + return size; + } + + /** + * @param idx an index + * @return the data item contained at the given index.
+ */ + public double getItem(long idx) { + return Double.longBitsToDouble(data[(int) idx]); + } + + /** {@inheritDoc} */ + @Override + public long getType() { + return Type.DOUBLE; + } + + /** {@inheritDoc} */ + @Override + public boolean isNa(long idx) { + return isMissing.get((int) idx); + } +} diff --git a/table/src/main/java/org/enso/table/data/column/LongStorage.java b/table/src/main/java/org/enso/table/data/column/LongStorage.java new file mode 100644 index 00000000000..adb0b730285 --- /dev/null +++ b/table/src/main/java/org/enso/table/data/column/LongStorage.java @@ -0,0 +1,48 @@ +package org.enso.table.data.column; + +import java.util.BitSet; + +/** A column storing 64-bit integers. */ +public class LongStorage extends Storage { + private final long[] data; + private final BitSet isMissing; + private final int size; + + /** + * @param data the underlying data + * @param size the number of items stored + * @param isMissing a bit set denoting at index {@code i} whether or not the value at index {@code + * i} is missing. + */ + public LongStorage(long[] data, int size, BitSet isMissing) { + this.data = data; + this.isMissing = isMissing; + this.size = size; + } + + /** {@inheritDoc} */ + @Override + public long size() { + return size; + } + + /** + * @param idx an index + * @return the data item contained at the given index. + */ + public long getItem(long idx) { + return data[(int) idx]; + } + + /** {@inheritDoc} */ + @Override + public long getType() { + return Type.LONG; + } + + /** {@inheritDoc} */ + @Override + public boolean isNa(long idx) { + return isMissing.get((int) idx); + } +} diff --git a/table/src/main/java/org/enso/table/data/column/Storage.java b/table/src/main/java/org/enso/table/data/column/Storage.java new file mode 100644 index 00000000000..4fa0b7703ec --- /dev/null +++ b/table/src/main/java/org/enso/table/data/column/Storage.java @@ -0,0 +1,31 @@ +package org.enso.table.data.column; + +/** An abstract representation of a data column.
*/ +public abstract class Storage { + /** @return the number of elements in this column (including NAs) */ + public abstract long size(); + + /** @return the type tag of this column's storage. Must be one of {@link Type} */ + public abstract long getType(); + + /** + * Checks whether the value at {@code idx} is missing. + * + * @param idx the index to check. + * @return whether or not the value is missing. + */ + public abstract boolean isNa(long idx); + + /** + * Enumerating possible storage types. + * + * <p>Keep in sync with variables in {@code Table.Table}. These variables are copied between Enso + * and Java code, in order to make them trivially constant on the Enso side, without invoking the + * polyglot machinery to access them. + */ + public static final class Type { + public static final long LONG = 1; + public static final long DOUBLE = 2; + public static final long STRING = 3; + } +} diff --git a/table/src/main/java/org/enso/table/data/column/StringStorage.java b/table/src/main/java/org/enso/table/data/column/StringStorage.java new file mode 100644 index 00000000000..c8ee9648e85 --- /dev/null +++ b/table/src/main/java/org/enso/table/data/column/StringStorage.java @@ -0,0 +1,42 @@ +package org.enso.table.data.column; + +/** A column storing strings. */ +public class StringStorage extends Storage { + private final String[] data; + private final int size; + + /** + * @param data the underlying data; {@code null} entries denote missing values + * @param size the number of items stored + */ + public StringStorage(String[] data, int size) { + this.data = data; + this.size = size; + } + + /** {@inheritDoc} */ + @Override + public long size() { + return size; + } + + /** + * @param idx an index + * @return the data item contained at the given index.
+ */ + public String getItem(long idx) { + return data[(int) idx]; + } + + /** {@inheritDoc} */ + @Override + public long getType() { + return Type.STRING; + } + + /** {@inheritDoc} */ + @Override + public boolean isNa(long idx) { + return data[(int) idx] == null; + } +} diff --git a/table/src/main/java/org/enso/table/data/column/builder/PrimInferredStorageBuilder.java b/table/src/main/java/org/enso/table/data/column/builder/PrimInferredStorageBuilder.java new file mode 100644 index 00000000000..fc9fd780f6a --- /dev/null +++ b/table/src/main/java/org/enso/table/data/column/builder/PrimInferredStorageBuilder.java @@ -0,0 +1,119 @@ +package org.enso.table.data.column.builder; + +import org.enso.table.data.column.DoubleStorage; +import org.enso.table.data.column.LongStorage; +import org.enso.table.data.column.Storage; + +import java.util.BitSet; + +/** + * A column builder for numeric types. Tries to interpret all data as 64-bit integers. If that + * becomes impossible, retypes itself to store 64-bit floats. When even that fails, falls back to a + * {@link StringStorageBuilder}.
+ */ +public class PrimInferredStorageBuilder extends StorageBuilder { + private enum Type { + LONG, + DOUBLE + } + + private int size = 0; + private long[] data = new long[64]; + private String[] rawData = new String[64]; + private final BitSet isMissing = new BitSet(); + private Type type = Type.LONG; + + /** {@inheritDoc} */ + @Override + public StorageBuilder parseAndAppend(String value) { + if (value == null) { + ensureAppendable(); + isMissing.set(size); + size++; + return this; + } + switch (type) { + case LONG: + return appendLong(value); + case DOUBLE: + return appendDouble(value); + default: + throw new IllegalStateException(); + } + } + + private StorageBuilder appendLong(String value) { + try { + long l = Long.parseLong(value); + ensureAppendable(); + rawData[size] = value; + data[size] = l; + size++; + return this; + } catch (NumberFormatException ignored) { + return failedLong(value); + } + } + + private StorageBuilder appendDouble(String value) { + try { + double d = Double.parseDouble(value); + ensureAppendable(); + data[size] = Double.doubleToRawLongBits(d); + rawData[size] = value; + size++; + return this; + } catch (NumberFormatException ignored) { + return failedDouble(value); + } + } + + private StorageBuilder failedLong(String value) { + try { + double d = Double.parseDouble(value); + retypeToDouble(); + ensureAppendable(); + data[size] = Double.doubleToRawLongBits(d); + rawData[size] = value; + size++; + return this; + } catch (NumberFormatException ignored) { + return failedDouble(value); + } + } + + private StorageBuilder failedDouble(String value) { + StringStorageBuilder newBuilder = new StringStorageBuilder(rawData, size); + newBuilder.parseAndAppend(value); + return newBuilder; + } + + private void retypeToDouble() { + for (int i = 0; i < size; i++) { + data[i] = Double.doubleToRawLongBits(data[i]); /* the stored long is implicitly widened to double, then re-encoded as bits */ + } + type = Type.DOUBLE; + } + + // TODO[MK] Consider storing data `rawData` in non-linear storage to avoid reallocations.
+ private void ensureAppendable() { + if (size >= data.length) { + long[] newData = new long[2 * data.length]; + String[] newRawData = new String[2 * data.length]; + System.arraycopy(data, 0, newData, 0, data.length); + System.arraycopy(rawData, 0, newRawData, 0, rawData.length); + data = newData; + rawData = newRawData; + } + } + + /** {@inheritDoc} */ + @Override + public Storage seal() { + if (type == Type.LONG) { + return new LongStorage(data, size, isMissing); + } else { + return new DoubleStorage(data, size, isMissing); + } + } +} diff --git a/table/src/main/java/org/enso/table/data/column/builder/StorageBuilder.java b/table/src/main/java/org/enso/table/data/column/builder/StorageBuilder.java new file mode 100644 index 00000000000..3ecc295e393 --- /dev/null +++ b/table/src/main/java/org/enso/table/data/column/builder/StorageBuilder.java @@ -0,0 +1,23 @@ +package org.enso.table.data.column.builder; + +import org.enso.table.data.column.Storage; + +/** A builder used by the parser to add items into a column. */ +public abstract class StorageBuilder { + /** + * Called by the parser to notify the builder about the next value being appended. The value is + * passed in a String form and the builder is responsible for parsing it into its own format. The + * value may be null, in which case it should be considered missing. + * + * @param value the value to parse and append + * @return a storage builder instance to use for future calls + */ + public abstract StorageBuilder parseAndAppend(String value); + + /** + * Closes the storage builder and returns a fully parsed column. + * + * @return the storage resulting from this builder's operation.
+   */
+  public abstract Storage seal();
+}
diff --git a/table/src/main/java/org/enso/table/data/column/builder/StringStorageBuilder.java b/table/src/main/java/org/enso/table/data/column/builder/StringStorageBuilder.java
new file mode 100644
index 00000000000..36e530ee523
--- /dev/null
+++ b/table/src/main/java/org/enso/table/data/column/builder/StringStorageBuilder.java
@@ -0,0 +1,53 @@
+package org.enso.table.data.column.builder;
+
+import org.enso.table.data.column.Storage;
+import org.enso.table.data.column.StringStorage;
+
+import java.util.Arrays;
+
+/** A column builder appending all the values passed to it in an unchanged form. */
+public class StringStorageBuilder extends StorageBuilder {
+
+  private String[] data;
+  private int size;
+
+  /**
+   * Creates a new builder from given partial data. Useful for other builders when a type transition
+   * is required. The array is adopted without copying and is replaced once it fills up.
+   *
+   * @param data the initial data storage
+   * @param size the number of already filled elements
+   */
+  public StringStorageBuilder(String[] data, int size) {
+    this.data = data;
+    this.size = size;
+  }
+
+  /** Creates an empty builder. */
+  public StringStorageBuilder() {
+    data = new String[64];
+    size = 0;
+  }
+
+  /** {@inheritDoc} Values, including null, are stored as given and this builder is returned. */
+  @Override
+  public StorageBuilder parseAndAppend(String value) {
+    ensureAppendable();
+    data[size++] = value;
+    return this;
+  }
+
+  /** Grows the backing array when it has no free slot left for the next value. */
+  private void ensureAppendable() {
+    if (size >= data.length) {
+      // Doubling alone cannot grow an empty array handed to the two-argument constructor.
+      data = Arrays.copyOf(data, Math.max(2 * data.length, size + 1));
+    }
+  }
+
+  /** {@inheritDoc} The backing array and the element count are handed to a StringStorage. */
+  @Override
+  public Storage seal() {
+    return new StringStorage(data, size);
+  }
+}
diff --git a/table/src/main/java/org/enso/table/data/table/Column.java b/table/src/main/java/org/enso/table/data/table/Column.java
new file mode 100644
index 00000000000..da9985c8fe8
--- /dev/null
+++ b/table/src/main/java/org/enso/table/data/table/Column.java
@@ -0,0 +1,30 @@
+package org.enso.table.data.table;
+
+import org.enso.table.data.column.Storage;
+
+/** A representation of a column. Consists of a column name and the underlying storage. */
+public class Column {
+  private final String name;
+  private final Storage storage;
+
+  /**
+   * Creates a new column.
+   *
+   * @param name the column name
+   * @param storage the underlying storage
+   */
+  public Column(String name, Storage storage) {
+    this.name = name;
+    this.storage = storage;
+  }
+
+  /** @return the column name */
+  public String getName() {
+    return name;
+  }
+
+  /** @return the underlying storage */
+  public Storage getStorage() {
+    return storage;
+  }
+}
diff --git a/table/src/main/java/org/enso/table/data/table/Table.java b/table/src/main/java/org/enso/table/data/table/Table.java
new file mode 100644
index 00000000000..909c6bd2dea
--- /dev/null
+++ b/table/src/main/java/org/enso/table/data/table/Table.java
@@ -0,0 +1,30 @@
+package org.enso.table.data.table;
+
+/** A representation of a table structure. */
+public class Table {
+
+  private final Column[] columns;
+
+  /**
+   * Creates a new table
+   *
+   * @param columns the columns contained in this table.
+   */
+  public Table(Column[] columns) {
+    this.columns = columns;
+  }
+
+  /** @return the number of rows in this table */
+  public long nrows() {
+    if (columns == null || columns.length == 0) {
+      return 0;
+    } else {
+      return columns[0].getStorage().size();
+    }
+  }
+
+  /** @return the columns of this table */
+  public Column[] getColumns() {
+    return columns;
+  }
+}
diff --git a/table/src/main/java/org/enso/table/format/csv/Parser.java b/table/src/main/java/org/enso/table/format/csv/Parser.java
new file mode 100644
index 00000000000..c79277f62ec
--- /dev/null
+++ b/table/src/main/java/org/enso/table/format/csv/Parser.java
@@ -0,0 +1,100 @@
+package org.enso.table.format.csv;
+
+import com.univocity.parsers.csv.CsvParser;
+import com.univocity.parsers.csv.CsvParserSettings;
+import org.enso.table.data.column.Storage;
+import org.enso.table.data.column.builder.StorageBuilder;
+import org.enso.table.data.column.builder.PrimInferredStorageBuilder;
+import org.enso.table.data.table.Column;
+import org.enso.table.data.table.Table;
+
+import java.io.InputStream;
+
+/** A CSV parser. */
+public class Parser {
+  private final boolean hasHeader;
+  private final String unnamedColumnPrefix;
+
+  private Parser(boolean hasHeader, String unnamedColumnPrefix) {
+    this.hasHeader = hasHeader;
+    this.unnamedColumnPrefix = unnamedColumnPrefix;
+  }
+
+  /**
+   * Creates a new parser with given parameters.
+   *
+   * @param hasHeader whether or not the first line of the file should be used as a header line
+   * @param unnamedColumnPrefix the string to prepend to column index for columns with unknown name.
+   * @return a CSV parser
+   */
+  public static Parser create(boolean hasHeader, String unnamedColumnPrefix) {
+    return new Parser(hasHeader, unnamedColumnPrefix);
+  }
+
+  /**
+   * Parses the given input stream into a Table.
+   *
+   * <p>The column count is the header length when a header is available, otherwise the length of
+   * the first row. Shorter rows are padded with missing values and surplus cells are ignored.
+   * Empty cells are treated as missing.
+   *
+   * @param inputStream the input stream to parse
+   * @return a table corresponding to the contents of the stream
+   */
+  public Table parse(InputStream inputStream) {
+    CsvParserSettings settings = new CsvParserSettings();
+    settings.setHeaderExtractionEnabled(hasHeader);
+    settings.detectFormatAutomatically();
+    CsvParser parser = new CsvParser(settings);
+    parser.beginParsing(inputStream);
+    String[] header = parser.getContext().headers();
+    String[] row = parser.parseNext();
+    if (header == null && row == null) {
+      // No header and no data: there is nothing to derive the column count from.
+      return new Table(new Column[0]);
+    }
+    StorageBuilder[] builders = initBuilders(header != null ? header.length : row.length);
+    for (; row != null; row = parser.parseNext()) {
+      appendRow(builders, row);
+    }
+    Column[] columns = new Column[builders.length];
+    for (int i = 0; i < builders.length; i++) {
+      Storage col = builders[i].seal();
+      columns[i] = new Column(columnName(header, i), col);
+    }
+    return new Table(columns);
+  }
+
+  /** Feeds one row to the builders, keeping whichever builder each append call returns. */
+  private void appendRow(StorageBuilder[] builders, String[] row) {
+    for (int i = 0; i < builders.length; i++) {
+      // A row shorter than the table is padded with missing values instead of failing.
+      String cell = i < row.length ? row[i] : null;
+      builders[i] = builders[i].parseAndAppend(handleNa(cell));
+    }
+  }
+
+  /** Returns the header name of a column, or the prefix plus index when the header has none. */
+  private String columnName(String[] header, int index) {
+    if (header != null && header[index] != null) {
+      return header[index];
+    }
+    return unnamedColumnPrefix + index;
+  }
+
+  private StorageBuilder[] initBuilders(int count) {
+    StorageBuilder[] res = new StorageBuilder[count];
+    for (int i = 0; i < count; i++) {
+      res[i] = new PrimInferredStorageBuilder();
+    }
+    return res;
+  }
+
+  /** Maps null and empty cells to null, which the builders treat as a missing value. */
+  private String handleNa(String raw) {
+    if (raw == null || raw.isEmpty()) {
+      return null;
+    }
+    return raw;
+  }
+}
diff --git a/test/Table_Tests/data/simple_empty.csv b/test/Table_Tests/data/simple_empty.csv
new file mode 100644
index 00000000000..f93618d5d4d
--- /dev/null
+++ b/test/Table_Tests/data/simple_empty.csv
@@ -0,0 +1,5 @@
+a,b,c
+1,2,
+4,,6
+7,8,9
+10,11,12
diff --git a/test/Table_Tests/data/varied_column.csv b/test/Table_Tests/data/varied_column.csv
new file mode 100644
index 00000000000..28a9458e78a
--- /dev/null
+++ b/test/Table_Tests/data/varied_column.csv
@@ -0,0 +1,8 @@
+2005-02-25,2005-02-25,1,1,1,1
+2005-02-28,2005-02-28,2,2,2,2
+4,2005-03-01,3,3,3,3
+2005-03-02,,4,4,4,4
+,2005-03-03,5,5,5,5
+2005-03-04,2005-03-04,,6,6.25,6.25
+2005-03-07,2005-03-07,7,7,7,7
+2005-03-08,2005-03-08,8,8,8,osiem
diff --git a/test/Table_Tests/package.yaml b/test/Table_Tests/package.yaml
new file mode 100644
index 00000000000..de5a67f65f0
--- /dev/null
+++ b/test/Table_Tests/package.yaml
@@ -0,0 +1,6 @@
+name: Table_Tests
+version: 0.0.1
+enso-version: default
+license: MIT
+author: enso-dev@enso.org
+maintainer: enso-dev@enso.org
diff --git a/test/Table_Tests/src/Main.enso b/test/Table_Tests/src/Main.enso
new file mode 100644
index 00000000000..2429a47f856
--- /dev/null
+++ b/test/Table_Tests/src/Main.enso
@@ -0,0 +1,6 @@
+import Test
+
+import Table_Tests.Table_Spec
+
+main = Test.Suite.runMain <|
+    Table_Spec.spec
diff --git a/test/Table_Tests/src/Table_Spec.enso b/test/Table_Tests/src/Table_Spec.enso
new file mode 100644
index 00000000000..9308a1e8b7c
--- /dev/null
+++ 
b/test/Table_Tests/src/Table_Spec.enso
@@ -0,0 +1,39 @@
+from Base import all
+from Table import all
+import Test
+
+## Checks CSV reading end to end: each case reads a file from the project's
+   `data` directory with `read_csv` and compares the table's JSON form with an
+   expected object built column by column through `Json.from_pairs`.
+spec = describe "Tables" <|
+    it "should parse a simple numeric table and convert it to JSON" <|
+        simple_empty = (Enso_Project.data / "simple_empty.csv") . read_csv
+        c_1_data = [1, 4, 7, 10]
+        c_2_data = [2, Json.Null, 8, 11]
+        c_3_data = [Json.Null, 6, 9, 12]
+
+        c_1 = Json.from_pairs [["name", "a"], ["data", c_1_data]]
+        c_2 = Json.from_pairs [["name", "b"], ["data", c_2_data]]
+        c_3 = Json.from_pairs [["name", "c"], ["data", c_3_data]]
+
+        expected = Json.from_pairs [["columns", [c_1, c_2, c_3]]]
+
+        simple_empty.to_json.should equal expected
+    it "should correctly infer types of varied-type columns" <|
+        varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False
+        c_1_data = ["2005-02-25", "2005-02-28", "4", "2005-03-02", Json.Null, "2005-03-04", "2005-03-07", "2005-03-08"]
+        c_2_data = ["2005-02-25", "2005-02-28", "2005-03-01", Json.Null, "2005-03-03", "2005-03-04", "2005-03-07", "2005-03-08"]
+        c_3_data = [1, 2, 3, 4, 5, Json.Null, 7, 8]
+        c_4_data = [1, 2, 3, 4, 5, 6, 7, 8]
+        c_5_data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.25, 7.0, 8.0]
+        c_6_data = ['1', '2', '3', '4', '5', '6.25', '7', 'osiem']
+
+        c_1 = Json.from_pairs [["name", "C0"], ["data", c_1_data]]
+        c_2 = Json.from_pairs [["name", "C1"], ["data", c_2_data]]
+        c_3 = Json.from_pairs [["name", "C2"], ["data", c_3_data]]
+        c_4 = Json.from_pairs [["name", "C3"], ["data", c_4_data]]
+        c_5 = Json.from_pairs [["name", "C4"], ["data", c_5_data]]
+        c_6 = Json.from_pairs [["name", "C5"], ["data", c_6_data]]
+
+        expected = Json.from_pairs [["columns", [c_1, c_2, c_3, c_4, c_5, c_6]]]
+        varied_column.to_json.should equal expected
diff --git a/tools/legal-review/std-lib-Table/com.univocity.univocity-parsers-2.9.0/copyright-keep b/tools/legal-review/std-lib-Table/com.univocity.univocity-parsers-2.9.0/copyright-keep
new file mode 100644
index 00000000000..894ba097fb1
--- 
/dev/null +++ b/tools/legal-review/std-lib-Table/com.univocity.univocity-parsers-2.9.0/copyright-keep @@ -0,0 +1,16 @@ +Copyright (c) 2015. Univocity Software Pty Ltd +Copyright (c) 2018. Univocity Software Pty Ltd +Copyright 2014 Univocity Software Pty Ltd +Copyright 2015 Univocity Software Pty Ltd +Copyright 2016 Univocity Software Pty Ltd +Copyright 2017 Univocity Software Pty Ltd +Copyright 2018 Univocity Software Pty Ltd +Copyright 2019 Univocity Software Pty Ltd +Copyright (c) 2015. Univocity Software Pty Ltd +Copyright (c) 2018. Univocity Software Pty Ltd +Copyright 2014 Univocity Software Pty Ltd +Copyright 2015 Univocity Software Pty Ltd +Copyright 2016 Univocity Software Pty Ltd +Copyright 2017 Univocity Software Pty Ltd +Copyright 2018 Univocity Software Pty Ltd +Copyright 2019 Univocity Software Pty Ltd diff --git a/tools/legal-review/std-lib-Table/report-state b/tools/legal-review/std-lib-Table/report-state new file mode 100644 index 00000000000..dad5c28cfab --- /dev/null +++ b/tools/legal-review/std-lib-Table/report-state @@ -0,0 +1,3 @@ +B0D696DAAB04B954B3E2B2259CA8B8E9A9632DE3FBCB462074D2C2C61382A211 +F365EBA0EFA7B64DA286C399792A4121F9F386B125CFD02C186936A3AB7B8E71 +0 diff --git a/tools/legal-review/std-lib-Table/reviewed-licenses/Apache_2 b/tools/legal-review/std-lib-Table/reviewed-licenses/Apache_2 new file mode 100644 index 00000000000..ff46ef6ff41 --- /dev/null +++ b/tools/legal-review/std-lib-Table/reviewed-licenses/Apache_2 @@ -0,0 +1 @@ +tools/legal-review/license-texts/APACHE2.0