Remove duplicate Line_Ending_Style and update defaults (#3597)

Implements https://www.pivotaltracker.com/story/show/182749831
This commit is contained in:
Radosław Waśko 2022-07-27 11:43:51 +02:00 committed by GitHub
parent c6d0843a2c
commit ee91656f30
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 430 additions and 96 deletions

View File

@ -644,6 +644,19 @@ type File
resource = Managed_Resource.register stream close_stream
Output_Stream self resource
## PRIVATE
Reads last `n` bytes from the file (or less if the file is too small) and
returns a vector of bytes.
read_last_bytes : Integer -> Vector ! File_Error
read_last_bytes n =
handle_java_exceptions self <|
Vector.Vector (self.read_last_bytes_builtin n)
## PRIVATE
read_last_bytes_builtin : Integer -> Array
read_last_bytes_builtin n = @Builtin_Method "File.read_last_bytes_builtin"
## Lists files contained in the directory denoted by this file.
Arguments:

View File

@ -112,8 +112,8 @@ type Delimited
`Nothing` or dropping the excess columns) or dropped.
- line_endings: Sets the line ending style to use. Defaults to `Infer` -
when reading a file or appending to an existing file, the line endings
are detected from file contents; when writing a new file, the OS
defaults are used.
are detected from file contents; when writing a new file in `Infer`
mode the `Unix` line endings are used.
- comment_character: Sets the character which indicates the start of a
comment within a delimited file. Any line that begins with the comment
character is skipped. The comment character is treated as any other

View File

@ -1,29 +0,0 @@
from Standard.Base import Nothing
from Standard.Table.IO.File_Format import Infer
## Specifies what line endings to use in a file format.
type Line_Ending_Style
## The line ending style is chosen automatically.
When reading a file or appending to an existing file, the line endings
are detected from file contents. When writing a new file, the OS defaults
are used.
Infer
## The UNIX line endings.
type Unix_Line_Endings
## The Windows line endings.
type Windows_Line_Endings
## The classic Mac OS line endings. Used for legacy applications, as modern
Mac OS uses the UNIX line endings.
type Classic_Mac_Line_Endings
## PRIVATE
line_separator_sequence : Line_Ending_Style -> Text
line_separator_sequence line_endings = case line_endings of
Unix_Line_Endings -> '\n'
Windows_Line_Endings -> '\r\n'
Classic_Mac_Line_Endings -> '\r'
Infer -> Nothing

View File

@ -1,15 +1,16 @@
from Standard.Base import all
import Standard.Table
import Standard.Base.Data.Statistics
import Standard.Base.Error.Common as Errors
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Ignore
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Ignore, Report_Error
from Standard.Table.Errors as Table_Errors import Duplicate_Output_Column_Names, Invalid_Output_Column_Names, Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
from Standard.Table.IO.File_Format import Infer
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
import Standard.Table.IO.Quote_Style
from Standard.Table.IO.Line_Ending_Style import line_separator_sequence
polyglot java import org.enso.base.encoding.NewlineDetector
polyglot java import org.enso.table.read.DelimitedReader
polyglot java import org.enso.table.read.ParsingFailedException
polyglot java import org.enso.table.parsing.problems.InvalidRow
@ -19,7 +20,6 @@ polyglot java import org.enso.table.util.problems.DuplicateNames
polyglot java import org.enso.table.util.problems.InvalidNames
polyglot java import java.io.IOException
polyglot java import com.univocity.parsers.common.TextParsingException
polyglot java import org.enso.base.Encoding_Utils
polyglot java import java.io.InputStream
polyglot java import java.io.Reader
polyglot java import java.io.StringReader
@ -97,7 +97,7 @@ read_from_reader format java_reader on_problems max_columns=4096 =
on_problems.attach_problems_after (Table.Table result_with_problems.value) parsing_problems
## PRIVATE
prepare_delimited_reader java_reader format max_columns on_problems =
prepare_delimited_reader java_reader format max_columns on_problems newline_override=Nothing =
java_headers = case format.headers of
True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
Infer -> DelimitedReader.HeaderBehavior.INFER
@ -106,7 +106,7 @@ prepare_delimited_reader java_reader format max_columns on_problems =
Nothing -> -1
Integer -> format.row_limit
_ -> Error.throw (Illegal_Argument_Error "`row_limit` should be Integer or Nothing.")
warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
warnings_as_errors = on_problems == Report_Error
quote_characters = case format.quote_style of
Quote_Style.No_Quotes -> Pair Nothing Nothing
Quote_Style.With_Quotes _ quote quote_escape -> Pair quote quote_escape
@ -120,7 +120,9 @@ prepare_delimited_reader java_reader format max_columns on_problems =
cell_type_guesser = if format.headers != Infer then Nothing else
formatter = format.value_formatter.if_nothing Data_Formatter
TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
newline = line_separator_sequence format.line_endings
newline = newline_override.if_nothing <| case format.line_endings of
Infer -> Nothing
endings -> endings.to_text
DelimitedReader.new java_reader format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows newline format.comment_character warnings_as_errors
translate_reader_problem problem =
@ -147,11 +149,18 @@ type Detected_Headers
## Indicates that the file exists but no headers have been found, so only positional column matching is possible.
type No_Headers (column_count : Integer)
## PRIVATE
An internal type representing metadata describing the format of a specific
Delimited file.
type Detected_File_Metadata
type Detected_File_Metadata (headers : Detected_Headers) (line_separator : Text|Nothing)
## PRIVATE
An internal type representing metadata describing the format of a specific
Delimited file.
Arguments:
- headers: headers present in the file.
- line_separator: line separator detected from file contents, if any.
- ends_with_newline: specifies if the last line ends with a line
separator that is consistent with the detected one.
- has_any_content: specifies if the file contains any content.
type Detected_File_Metadata (headers : Detected_Headers) (line_separator : Text|Nothing) (ends_with_newline : Boolean) (has_any_content : Boolean)
## PRIVATE
Reads the beginning of the file to detect the existing headers and column
@ -160,13 +169,18 @@ detect_metadata : File -> File_Format.Delimited -> Detected_Headers
detect_metadata file format =
on_problems = Ignore
result = handle_io_exception file <| Illegal_Argument_Error.handle_java_exception <| handle_parsing_failure <| handle_parsing_exception <|
trailing_line_separator = newline_at_eof file format.encoding
has_trailing_line_separator = trailing_line_separator.is_nothing.not
file.with_input_stream [File.Option.Read] stream->
stream.with_stream_decoder format.encoding on_problems java_reader->
## We don't need to close this one, as closing the parent stream
will suffice.
newline_detecting_reader = NewlineDetector.new java_reader
## We use the default `max_columns` setting. If we want to be able to
read files with unlimited column limits (risking OutOfMemory
exceptions), we can catch the exception indicating the limit has been
reached and restart parsing with an increased limit.
reader = prepare_delimited_reader java_reader format max_columns=default_max_columns on_problems
reader = prepare_delimited_reader newline_detecting_reader format max_columns=default_max_columns on_problems newline_override=trailing_line_separator
defined_columns = reader.getDefinedColumnNames
headers = case defined_columns of
Nothing ->
@ -174,9 +188,34 @@ detect_metadata file format =
if column_count == 0 then Nothing else
No_Headers column_count
_ -> Existing_Headers (Vector.Vector defined_columns)
line_separator = reader.getEffectiveLineSeparator
Detected_File_Metadata headers line_separator
result.catch File.File_Not_Found (_->(Detected_File_Metadata Nothing Nothing))
line_separator_from_parser = reader.getEffectiveLineSeparator
has_seen_newline = newline_detecting_reader.newlineEncountered
## If the parser has seen a newline, we can trust that it
detected the newline correctly. However if it has not, we
cannot trust it as it tends to just fall back to the system
default which is wrong. Thus we return the trailing line
separator (which may be `Nothing`).
effective_line_separator = case has_seen_newline of
True -> line_separator_from_parser
False -> trailing_line_separator
has_any_content = reader.getVisitedCharactersCount > 0
Detected_File_Metadata headers effective_line_separator has_trailing_line_separator has_any_content
result.catch File.File_Not_Found (_->(Detected_File_Metadata Nothing Nothing False False))
## PRIVATE
   Checks if the file has a newline at the end.

   Arguments:
   - file: the file whose trailing bytes are inspected.
   - encoding: the encoding used to encode the candidate newline sequences
     for the byte-level comparison.

   Returns the newline sequence if found, `Nothing` otherwise.
   `'\r\n'` is checked before `'\n'` and `'\r'` so that a Windows ending is
   not mistaken for a bare Unix or legacy-Mac one.
newline_at_eof : File -> Encoding -> Text|Nothing
newline_at_eof file encoding =
    newlines = ['\r\n', '\n', '\r']
    newline_bytes = newlines.map (x-> x.bytes encoding Report_Error)
    # Only the last `most_bytes` bytes of the file are read - enough to match
    # the longest candidate sequence.
    most_bytes = newline_bytes.map .length . compute Statistics.Maximum
    file_last_bytes = file.read_last_bytes most_bytes
    result = newlines.zip newline_bytes . find pair->
        bytes = pair.second
        bytes == (file_last_bytes.take_end bytes.length)
    # If no candidate matched, `find` yields an error which is mapped to `Nothing`.
    result.first . catch Nothing
## PRIVATE
handle_parsing_failure =

View File

@ -11,7 +11,7 @@ from Standard.Table.IO.File_Format import Infer
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
import Standard.Table.Data.Storage
import Standard.Table.IO.Quote_Style
from Standard.Table.IO.Line_Ending_Style import line_separator_sequence
import Standard.Base.Data.Text.Line_Ending_Style
from Standard.Table.Internal.Delimited_Reader import Existing_Headers, No_Headers
import Standard.Table.Data.Match_Columns
@ -54,16 +54,17 @@ append_to_file table format file match_columns on_problems =
inferring_format = format.with_line_endings Infer
metadata = Delimited_Reader.detect_metadata file inferring_format
preexisting_headers = metadata.headers
case format.line_endings of
Infer -> Nothing
effective_line_separator = case format.line_endings of
Infer -> metadata.line_separator.if_nothing default_line_separator_for_writing.to_text
other_ending_style ->
selected_separator = line_separator_sequence other_ending_style
selected_separator = other_ending_style.to_text
existing_separator = metadata.line_separator
if selected_separator != existing_separator then
if existing_separator.is_nothing.not && (selected_separator != existing_separator) then
Panic.throw <| Illegal_Argument_Error <|
# Ensure that these are properly escaped once `to_text` meaning is changed.
"The explicitly provided line endings (" + selected_separator.to_text + ") do not match the line endings in the file (" + existing_separator.to_text + ")."
effective_line_separator = metadata.line_separator
other_ending_style.to_text
reordered_java_table = case preexisting_headers of
Nothing -> table.java_table
Existing_Headers column_names -> case match_columns of
@ -82,8 +83,10 @@ append_to_file table format file match_columns on_problems =
amended_format = case writing_new_file && (should_write_headers format.headers) of
True -> format.with_headers
False -> format.without_headers
needs_leading_newline =
metadata.has_any_content && metadata.ends_with_newline.not
Existing_File_Behavior.Append.write file stream->
write_to_stream reordered_table amended_format stream on_problems related_file=file separator_override=effective_line_separator
write_to_stream reordered_table amended_format stream on_problems related_file=file separator_override=effective_line_separator needs_leading_newline=needs_leading_newline
## PRIVATE
Returns a Text value representing the table in the delimited format.
@ -108,14 +111,14 @@ write_text table format =
or `Nothing`. It is used for more detailed error reporting.
- separator_override: An optional override for the line separator to use
instead of the one from `format`.
write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Text | Nothing -> Any
write_to_stream table format stream on_problems related_file=Nothing separator_override=Nothing =
write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Text | Nothing -> Boolean -> Any
write_to_stream table format stream on_problems related_file=Nothing separator_override=Nothing needs_leading_newline=False =
handle_io_exception ~action = Panic.catch IOException action caught_panic->
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)
handle_io_exception <|
stream.with_stream_encoder format.encoding on_problems reporting_stream_encoder->
write_to_writer table format reporting_stream_encoder separator_override=separator_override
write_to_writer table format reporting_stream_encoder separator_override=separator_override needs_leading_newline=needs_leading_newline
## PRIVATE
Writes data to the provided `Writer` according to the provided format.
@ -129,8 +132,8 @@ write_to_stream table format stream on_problems related_file=Nothing separator_o
- java_writer: A Java `Writer` to which characters will be written.
- separator_override: An optional override for the line separator to use
instead of the one from `format`.
write_to_writer : Table -> File_Format.Delimited -> Writer -> Text | Nothing -> Any
write_to_writer table format java_writer separator_override=Nothing =
write_to_writer : Table -> File_Format.Delimited -> Writer -> Text | Nothing -> Boolean -> Any
write_to_writer table format java_writer separator_override=Nothing needs_leading_newline=False =
column_formatters = Panic.recover Illegal_Argument_Error <| case format.value_formatter of
Nothing -> table.columns.map column-> case column.storage_type of
Storage.Text -> TextFormatter.new
@ -148,9 +151,12 @@ write_to_writer table format java_writer separator_override=Nothing =
Quote_Style.With_Quotes _ quote quote_escape -> Pair quote quote_escape
write_headers = should_write_headers format.headers
newline = separator_override.if_nothing <|
separator_from_format = line_separator_sequence format.line_endings
separator_from_format.if_nothing System.default_line_separator
writer = DelimitedWriter.new java_writer column_formatters.to_array format.delimiter newline quote_characters.first quote_characters.second quote_behavior write_headers
case format.line_endings of
Infer -> default_line_separator_for_writing.to_text
endings -> endings.to_text
if needs_leading_newline then
java_writer.write newline
writer = DelimitedWriter.new java_writer column_formatters.to_array format.delimiter newline quote_characters.first quote_characters.second format.comment_character quote_behavior write_headers
writer.write table.java_table
## PRIVATE
@ -158,3 +164,8 @@ should_write_headers headers = case headers of
True -> True
Infer -> True
False -> False
## PRIVATE
The default line separator used for newly written delimited files, if no
specific separator has been set.
default_line_separator_for_writing = Line_Ending_Style.Unix

View File

@ -733,8 +733,8 @@ run_spec ~behavior =
case ex of
Failure _ -> ex
Finished_With_Error err stack_trace_text ->
Failure ("An unexpected error was returned: " + err.to_text + '\n' + stack_trace_text)
_ -> Failure ("An unexpected panic was thrown: " + ex.to_text + '\n' + maybeExc.get_stack_trace_text)
Failure ("An unexpected error was returned: " + err.to_display_text + '\n' + stack_trace_text)
_ -> Failure ("An unexpected panic was thrown: " + ex.to_display_text + '\n' + maybeExc.get_stack_trace_text)
result
## PRIVATE

View File

@ -0,0 +1,41 @@
package org.enso.interpreter.runtime.data;

import com.oracle.truffle.api.interop.InteropLibrary;
import com.oracle.truffle.api.interop.TruffleObject;
import com.oracle.truffle.api.library.ExportLibrary;
import com.oracle.truffle.api.library.ExportMessage;

import java.nio.ByteBuffer;

/**
 * A read-only Truffle array view over the remaining bytes of a {@link ByteBuffer}.
 *
 * <p>The bytes between the buffer's position and limit are exposed as interop array elements,
 * each widened to a {@code long}.
 */
@ExportLibrary(InteropLibrary.class)
public final class ArrayOverBuffer implements TruffleObject {
  private final ByteBuffer backingBuffer;

  private ArrayOverBuffer(ByteBuffer backingBuffer) {
    this.backingBuffer = backingBuffer;
  }

  /** Wraps the given buffer without copying its contents. */
  public static ArrayOverBuffer wrapBuffer(ByteBuffer buffer) {
    return new ArrayOverBuffer(buffer);
  }

  @ExportMessage
  boolean hasArrayElements() {
    return true;
  }

  @ExportMessage
  long getArraySize() {
    return backingBuffer.remaining();
  }

  @ExportMessage
  boolean isArrayElementReadable(long index) {
    return index >= 0 && index < getArraySize();
  }

  @ExportMessage
  Object readArrayElement(long index) {
    int offset = backingBuffer.position() + Math.toIntExact(index);
    return (long) backingBuffer.get(offset);
  }
}

View File

@ -17,8 +17,11 @@ import org.enso.interpreter.runtime.library.dispatch.MethodDispatchLibrary;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.CopyOption;
import java.nio.file.OpenOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermission;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
@ -51,6 +54,23 @@ public class EnsoFile implements TruffleObject {
return this.truffleFile.newInputStream(opts);
}
@Builtin.Method(name = "read_last_bytes_builtin")
@Builtin.WrapException(from = IOException.class, to = PolyglotError.class, propagate = true)
public ArrayOverBuffer readLastBytes(long n) throws IOException {
try (SeekableByteChannel channel =
this.truffleFile.newByteChannel(Set.of(StandardOpenOption.READ))) {
int bytesToRead = Math.toIntExact(Math.min(channel.size(), n));
channel.position(channel.size() - bytesToRead);
ByteBuffer buffer = ByteBuffer.allocate(bytesToRead);
while (buffer.hasRemaining()) {
channel.read(buffer);
}
buffer.flip();
return ArrayOverBuffer.wrapBuffer(buffer);
}
}
@Builtin.Method(name = "resolve")
@Builtin.Specialize
public EnsoFile resolve(String subPath) {

View File

@ -41,6 +41,7 @@ import org.enso.polyglot.data.TypeGraph;
UnresolvedConversion.class,
UnresolvedSymbol.class,
Array.class,
ArrayOverBuffer.class,
EnsoBigInteger.class,
ManagedResource.class,
ModuleScope.class,
@ -128,7 +129,7 @@ public class Types {
return Constants.UNRESOLVED_SYMBOL;
} else if (TypesGen.isManagedResource(value)) {
return ConstantsGen.MANAGED_RESOURCE;
} else if (TypesGen.isArray(value)) {
} else if (TypesGen.isArray(value) || TypesGen.isArrayOverBuffer(value)) {
return ConstantsGen.ARRAY;
} else if (TypesGen.isModuleScope(value)) {
return Constants.MODULE_SCOPE;

View File

@ -40,6 +40,7 @@ public record TypeWithKind(String baseType, TypeKind kind) {
List.of(
"org.enso.interpreter.runtime.callable.atom.Atom",
"org.enso.interpreter.runtime.data.Array",
"org.enso.interpreter.runtime.data.ArrayOverBuffer",
"org.enso.interpreter.runtime.data.EnsoFile",
"org.enso.interpreter.runtime.data.EnsoDate",
"org.enso.interpreter.runtime.data.ManagedResource",

View File

@ -0,0 +1,41 @@
package org.enso.base.encoding;

import java.io.IOException;
import java.io.Reader;

/** A reader that wraps another reader and checks if a newline character has been encountered. */
public class NewlineDetector extends Reader {
  private final Reader source;
  private boolean sawNewline = false;

  public NewlineDetector(Reader underlying) {
    this.source = underlying;
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    int count = source.read(cbuf, off, len);
    // Scan the freshly read characters only until the first newline is seen;
    // once detected the flag is latched and further scanning is skipped.
    int position = off;
    int end = off + count;
    while (!sawNewline && position < end) {
      char current = cbuf[position];
      if (current == '\n' || current == '\r') {
        sawNewline = true;
      }
      position++;
    }
    return count;
  }

  @Override
  public void close() throws IOException {
    source.close();
  }

  /** Checks if a newline character has been encountered within data that has been read so far. */
  public boolean newlineEncountered() {
    return sawNewline;
  }
}

View File

@ -41,7 +41,7 @@ public class DelimitedReader {
private final DatatypeParser valueParser;
private final TypeInferringParser cellTypeGuesser;
private final boolean keepInvalidRows;
private final String newlineSetting;
private String newlineSetting;
private final boolean warningsAsErrors;
private final NoOpProblemAggregator noOpProblemAggregator = new NoOpProblemAggregator();
private long invalidRowsCount = 0;
@ -353,18 +353,22 @@ public class DelimitedReader {
return effectiveColumnNames.length;
}
/** Returns the line separator used in the file.
/** Returns the line separator.
*
* If a specific separator is set at construction, it is just returned. If it
* was set to null, the separator inferred from the file contents is returned.
* If it was provided explicitly at construction, the selected separator is used.
* If the initial separator was set to {@code null}, the reader tries to detect
* the separator from file contents.
*/
public String getEffectiveLineSeparator() {
if (newlineSetting != null) {
return newlineSetting;
} else {
if (newlineSetting == null) {
ensureHeadersDetected();
return parser.getDetectedFormat().getLineSeparatorString();
}
return newlineSetting;
}
  /**
   * Returns how many characters of the input the parser has visited so far.
   *
   * <p>Forces header detection first, so at least the beginning of the input has been consumed
   * before the count is taken; callers use a count of 0 to detect an empty file.
   */
  public long getVisitedCharactersCount() {
    ensureHeadersDetected();
    return parser.getContext().currentChar();
  }
private void ensureHeadersDetected() {
@ -375,6 +379,12 @@ public class DelimitedReader {
private void detectHeaders() {
Row firstRow = loadNextRow();
// Resolve the newline separator:
if (newlineSetting == null) {
newlineSetting = parser.getDetectedFormat().getLineSeparatorString();
}
if (firstRow == null) {
effectiveColumnNames = new String[0];
headerProblems = Collections.emptyList();
@ -389,7 +399,7 @@ public class DelimitedReader {
case INFER -> {
Row secondRow = loadNextRow();
if (secondRow == null) {
/** If there is only one row in the file, we generate the headers and
/* If there is only one row in the file, we generate the headers and
* stop further processing (as nothing more to process). */
headerNames = generateDefaultHeaders(expectedColumnCount);
pendingRows.add(firstRow);

View File

@ -21,6 +21,8 @@ public class DelimitedWriter {
private final char quoteChar;
private final char quoteEscapeChar;
private final char commentChar;
private final String quoteReplacement;
private final String quoteEscapeReplacement;
@ -35,6 +37,7 @@ public class DelimitedWriter {
String newline,
String quote,
String quoteEscape,
String comment,
WriteQuoteBehavior writeQuoteBehavior,
boolean writeHeaders) {
this.newline = newline;
@ -97,6 +100,17 @@ public class DelimitedWriter {
quoteEscapeReplacement = null;
}
if (comment != null) {
if (comment.length() != 1) {
throw new IllegalArgumentException(
"The comment character must consist of exactly 1 codepoint.");
}
commentChar = comment.charAt(0);
} else {
commentChar = '\0';
}
this.writeQuoteBehavior = writeQuoteBehavior;
this.writeHeaders = writeHeaders;
emptyValue = this.quote + "" + this.quote;
@ -172,7 +186,11 @@ public class DelimitedWriter {
boolean containsQuote = value.indexOf(quoteChar) >= 0;
boolean containsQuoteEscape = quoteEscape != null && value.indexOf(quoteEscapeChar) >= 0;
boolean shouldQuote =
wantsQuoting || containsQuote || containsQuoteEscape || value.indexOf(delimiter) >= 0;
wantsQuoting
|| containsQuote
|| containsQuoteEscape
|| value.indexOf(delimiter) >= 0
|| value.indexOf(commentChar) >= 0;
if (!shouldQuote) {
return value;
}

View File

@ -9,7 +9,7 @@ import Standard.Table.IO.File_Read
from Standard.Table.IO.File_Format import Delimited
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
import Standard.Table.IO.Quote_Style
from Standard.Table.IO.Line_Ending_Style import all
import Standard.Base.Data.Text.Line_Ending_Style
import Standard.Test
import Standard.Test.Problems
@ -144,17 +144,17 @@ spec =
['a', 'b', 'c\nd', 'e', 'f\n1', 2, 3].map_with_index i-> v->
["Column_" + (i+1).to_text, [v]]
File.read file format . should_equal reference_table
File.read file (format.with_line_endings Unix_Line_Endings) . should_equal reference_table
File.read file (format.with_line_endings Classic_Mac_Line_Endings) . should_equal collapsed_table
File.read file (format.with_line_endings Windows_Line_Endings) . should_equal collapsed_table
File.read file (format.with_line_endings Line_Ending_Style.Unix) . should_equal reference_table
File.read file (format.with_line_endings Line_Ending_Style.Mac_Legacy) . should_equal collapsed_table
File.read file (format.with_line_endings Line_Ending_Style.Windows) . should_equal collapsed_table
file.delete
file_2 = enso_project.data / "transient" / "crlf.csv"
lines.join '\r\n' . write file_2
File.read file_2 (format.with_line_endings Windows_Line_Endings) . should_equal reference_table
File.read file_2 (format.with_line_endings Line_Ending_Style.Windows) . should_equal reference_table
# For some reason loading the CRLF file in Unix mode trims the CR characters. We may want to revisit this at some point.
table = File.read file_2 (format.with_line_endings Unix_Line_Endings)
table = File.read file_2 (format.with_line_endings Line_Ending_Style.Unix)
table . should_equal reference_table
file_2.delete
@ -399,6 +399,6 @@ spec =
Delimited ',' . with_comments . should_equal (Delimited ',' comment_character='#')
Delimited ',' . with_comments ';' . should_equal (Delimited ',' comment_character=';')
Delimited ',' comment_character='#' . without_comments . should_equal (Delimited ',' comment_character=Nothing)
Delimited ',' . with_line_endings Unix_Line_Endings . should_equal (Delimited ',' line_endings=Unix_Line_Endings)
Delimited ',' . with_line_endings Line_Ending_Style.Unix . should_equal (Delimited ',' line_endings=Line_Ending_Style.Unix)
main = Test.Suite.run_main spec

View File

@ -2,6 +2,7 @@ from Standard.Base import all
from Standard.Base.Error.Problem_Behavior import all
import Standard.Base.System.File.Existing_File_Behavior
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
import Standard.Base.Data.Text.Line_Ending_Style
import Standard.Base.Data.Time.Date
import Standard.Base.Data.Time.Time_Of_Day
import Standard.Base.System
@ -13,9 +14,9 @@ import Standard.Table.IO.File_Read
from Standard.Table.IO.File_Format import Delimited
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
import Standard.Table.IO.Quote_Style
from Standard.Table.IO.Line_Ending_Style import all
import Standard.Table.Data.Match_Columns
import Standard.Table.Data.Column_Name_Mapping
from Standard.Table.Data.Column_Selector as Column_Selector_Module import By_Name
from Standard.Table.Errors as Table_Errors import Column_Count_Mismatch, Column_Name_Mismatch
import Standard.Test
@ -29,9 +30,13 @@ type My_Type
to_text : Text
to_text = "[[[My Type :: " + self.x.to_text + "]]]"
default_line_endings_for_new_files = Line_Ending_Style.Unix
join_lines lines trailing_newline=True =
eol = default_line_endings_for_new_files.to_text
if trailing_newline then lines.join eol suffix=eol else lines.join eol
spec =
line_ending_pairs = [[Unix_Line_Endings, '\n'], [Windows_Line_Endings, '\r\n'], [Classic_Mac_Line_Endings, '\r']]
system_separator = if System.os == "windows" then '\r\n' else '\n'
line_ending_pairs = [[Line_Ending_Style.Unix, '\n'], [Line_Ending_Style.Windows, '\r\n'], [Line_Ending_Style.Mac_Legacy, '\r']]
Test.group "Delimited File Writing" <|
Test.specify "should correctly write a simple table" <|
table = Table.new [["A", [1,2,3]], ["B", [1.0,1.5,2.2]], ["C", ["x","y","z"]], ["D", ["a", 2, My_Type 10]]]
@ -131,6 +136,27 @@ spec =
text.should_equal expected_text
file.delete
Test.specify "should quote values containing the comment symbol if comments are enabled" <|
table = Table.new [["#", ['b', 'x', '#']], ["B", [Nothing,"#","abc"]]]
file = (enso_project.data / "transient" / "comments.csv")
file.delete_if_exists
table.write file on_problems=Report_Error . should_succeed
expected_text = join_lines ['#,B','b,', 'x,#', '#,abc']
text = File.read_text file
text.should_equal expected_text
file.delete
format = File_Format.Delimited ',' . with_comments
table.write file format on_problems=Report_Error . should_succeed
expected_text_2 = normalize_lines <| """
"#",B
b,
x,"#"
"#",abc
text_2 = File.read_text file
text_2.should_equal expected_text_2
file.delete
Test.specify 'should not quote values if quoting is disabled' <|
format = File_Format.Delimited "," value_formatter=(Data_Formatter decimal_point=",") . without_quotes
table = Table.new [['The Column "Name"', ["foo","'bar'",'"baz"', 'one, two, three']], ["Hello, Column?", [1.0, 1000000.5, 2.2, -1.5]]]
@ -187,8 +213,7 @@ spec =
text = File.read_text file encoding=Encoding.ascii
text.should_equal expected_text
result . should_equal Nothing
sep_length = System.default_line_separator.codepoints.length
positions = [6 + sep_length, 7 + sep_length, 13 + 2*sep_length]
positions = [7, 8, 15]
msg = "Encoding issues at codepoints " +
positions.map .to_text . join separator=", " suffix="."
Warning.get_all result . map .value . should_equal [Encoding_Error msg]
@ -238,6 +263,17 @@ spec =
got_table.should_equal table
file.delete
Test.specify "should correctly append to a file with a missing newline at EOF" <|
table = Table.new [["A", [1,2,3]], ["B", [1.0,1.5,2.2]], ["C", ["x","y","z"]]]
file = (enso_project.data / "transient" / "append_missing_newline.csv")
file.delete_if_exists
'A,B,C\r0,0,0'.write file
table.write file on_existing_file=Existing_File_Behavior.Append on_problems=Report_Error . should_succeed
text = File.read_text file
expected_lines = ["A,B,C", "0,0,0", "1,1.0,x", "2,1.5,y", "3,2.2,z"]
text.should_equal (expected_lines.join '\r' suffix='\r')
file.delete
Test.specify "should append to a file, matching columns by name (headers=Infer)" <|
existing_table = Table.new [["A", [1,2]], ["B", [1.0,1.5]], ["C", ["x","y"]]]
appending_table = Table.new [["B", [33,44]], ["A", [Nothing, 0]], ["C", ["a","BB"]]]
@ -358,7 +394,7 @@ spec =
text.should_equal (expected_lines.join separator suffix=separator)
file.delete
Test.specify "should use the system default line ending style when appending to an empty or nonexistent file" <|
Test.specify "should use Unix line ending style when appending to an empty or nonexistent file" <|
empty_file = (enso_project.data / "transient" / "empty.csv")
"".write empty_file
nonexistent_file = (enso_project.data / "transient" / "nonexistent.csv")
@ -369,7 +405,7 @@ spec =
table_to_append.write empty_file on_existing_file=Existing_File_Behavior.Append on_problems=Report_Error . should_succeed
expected_lines = ["a,d", "x,z", "y,w"]
expected_text = (expected_lines.join system_separator suffix=system_separator)
expected_text = join_lines expected_lines
File.read_text empty_file . should_equal expected_text
File.read_text nonexistent_file . should_equal expected_text
@ -389,13 +425,130 @@ spec =
text.should_equal expected_text
file.delete
Test.specify "should use the existing line ending style when appending to a file consisting of only comments missing last EOL" <|
initial_lines = ["# comment 1", "# comment 2 without EOL"]
table_to_append = Table.new [["a", ["x", "y"]], ["b", ["z", "w"]]]
expected_lines = initial_lines + ["a,b", "x,z", "y,w"]
file = (enso_project.data / "transient" / "endings_comments_only.csv")
line_ending_pairs.each setting->
separator=setting.second
file.delete_if_exists
(initial_lines.join separator).write file
format = File_Format.Delimited ',' . with_comments
table_to_append.write file format on_existing_file=Existing_File_Behavior.Append on_problems=Report_Error . should_succeed
text = File.read_text file
expected_text = expected_lines.join separator suffix=separator
text.should_equal expected_text
file.delete
Test.specify "should correctly handle append edge cases" <|
table = Table.new [["a", [1, 2]]]
file = (enso_project.data / "transient" / "append_edge_cases.csv")
file.delete_if_exists
format = File_Format.Delimited ',' . without_headers
# A long line but without a trailing newline
base_line = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-ABC"
expected_lines_1 = [base_line, "1", "2"]
# 1 character with trailing newline
line_ending_pairs.each setting->
separator=setting.second
(base_line+separator).write file
table.write file format on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position on_problems=Report_Error . should_succeed
text = File.read_text file
expected_text = expected_lines_1.join separator suffix=separator
text.should_equal expected_text
file.delete
base_line.write file
table.write file format on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position on_problems=Report_Error . should_succeed
File.read_text file . should_equal <| normalize_lines base_line+'\n1\n2\n'
file.delete
# 1 character without trailing newline
"#".write file
table.write file format on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position on_problems=Report_Error . should_succeed
File.read_text file . should_equal <| normalize_lines '#\n1\n2\n'
file.delete
"#".write file
table.write file format.with_comments on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position on_problems=Report_Error . should_succeed
File.read_text file . should_equal <| normalize_lines '#\n1\n2\n'
file.delete
expected_lines_2 = ["#", "1", "2"]
# 1 character with trailing newline
line_ending_pairs.each setting->
[format.with_comments, format].each format->
separator=setting.second
("#"+separator).write file
table.write file format on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position on_problems=Report_Error . should_succeed
text = File.read_text file
expected_text = expected_lines_2.join separator suffix=separator
text.should_equal expected_text
file.delete
["B", "#"].each middle_line->
expected_lines_3 = ["A", middle_line, "1", "2"]
[format.with_comments, format].each format->
# 2 lines without trailing newline
line_ending_pairs.each setting->
separator=setting.second
("A"+separator+middle_line).write file
table.write file format on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position on_problems=Report_Error . should_succeed
text = File.read_text file
expected_text = expected_lines_3.join separator suffix=separator
text.should_equal expected_text
file.delete
# 2 lines with trailing newline
line_ending_pairs.each setting->
separator=setting.second
("A"+separator+middle_line+separator).write file
table.write file format on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position on_problems=Report_Error . should_succeed
text = File.read_text file
expected_text = expected_lines_3.join separator suffix=separator
text.should_equal expected_text
file.delete
Test.specify "should use the existing line ending style when appending to a file consisting of only one comment with EOL" <|
initial_line = "# comment 1 with EOL"
table_to_append = Table.new [["a", ["x", "y"]], ["b", ["z", "w"]]]
expected_lines = [initial_line] + ["a,b", "x,z", "y,w"]
file = (enso_project.data / "transient" / "endings_comments_only.csv")
line_ending_pairs.each setting->
separator=setting.second
file.delete_if_exists
(initial_line+separator).write file
format = File_Format.Delimited ',' . with_comments
table_to_append.write file format on_existing_file=Existing_File_Behavior.Append on_problems=Report_Error . should_succeed
text = File.read_text file
expected_text = expected_lines.join separator suffix=separator
text.should_equal expected_text
file.delete
Test.specify "should use the Unix line ending style when appending to a file consisting of only one comment and missing the EOL" <|
initial_lines = ["# comment 1 without EOL"]
table_to_append = Table.new [["a", ["x", "y"]], ["b", ["z", "w"]]]
expected_lines = initial_lines + ["a,b", "x,z", "y,w"]
file = (enso_project.data / "transient" / "endings_comments_only.csv")
file.delete_if_exists
(join_lines initial_lines trailing_newline=False).write file
format = File_Format.Delimited ',' . with_comments
table_to_append.write file format on_existing_file=Existing_File_Behavior.Append on_problems=Report_Error . should_succeed
text = File.read_text file
expected_text = join_lines expected_lines
text.should_equal expected_text
file.delete
Test.specify "should fail if explicitly provided line endings do not match line endings in the file when appending" <|
initial_table = Table.new [["a", [1, 2]]]
table_to_append = Table.new [["a", ["x", "y"]]]
file = (enso_project.data / "transient" / "endings_mismatch.csv")
file.delete_if_exists
initial_table.write file (File_Format.Delimited ',' line_endings=Classic_Mac_Line_Endings)
result = table_to_append.write file (File_Format.Delimited ',' line_endings=Unix_Line_Endings) on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position
initial_table.write file (File_Format.Delimited ',' line_endings=Line_Ending_Style.Mac_Legacy)
result = table_to_append.write file (File_Format.Delimited ',' line_endings=Line_Ending_Style.Unix) on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position
result . should_fail_with Illegal_Argument_Error
result.catch.message . should_equal "The explicitly provided line endings ('\n') do not match the line endings in the file ('\r')."
file.delete

View File

@ -9,6 +9,7 @@ import project.Delimited_Write_Spec
import project.Excel_Spec
import project.Json_Spec
import project.Table_Spec
import project.Table_Date_Spec
import project.Aggregate_Column_Spec
import project.Aggregate_Spec
@ -20,6 +21,7 @@ in_memory_spec =
Excel_Spec.spec
Json_Spec.spec
Table_Spec.spec
Table_Date_Spec.spec
Aggregate_Column_Spec.spec
Aggregate_Spec.spec

View File

@ -1,14 +1,14 @@
from Standard.Base import all
import Standard.Base.Data.Time.Date
import Standard.Base.Data.Text.Line_Ending_Style
import Standard.Table
import Standard.Table.Data.Column
import Standard.Table.Io.File_Format
import Standard.Table.IO.File_Format
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
from Standard.Table.Io.Line_Ending_Style import Unix_Line_Endings
import Standard.Test
import project.Util
from project.Util import all
spec =
c_number = ["Number", [71, 72, 73, 74, 75, 76, 77]]
@ -40,15 +40,16 @@ spec =
Test.group "Should be able to serialise a table with Dates to Text" <|
Test.specify "should serialise back to input" <|
expected_text = (enso_project.data / "prime_ministers.csv").read_text
delimited = Text.from expected format=(File_Format.Delimited "," line_endings=Unix_Line_Endings)
expected_text = normalize_lines <|
(enso_project.data / "prime_ministers.csv").read_text
delimited = Text.from expected format=(File_Format.Delimited "," line_endings=Line_Ending_Style.Unix)
delimited.should_equal expected_text
Test.specify "should serialise dates with format" <|
test_table = Table.new [c_from]
expected_text = 'From\n04.05.1979\n28.11.1990\n02.05.1997\n27.06.2007\n11.05.2010\n13.07.2016\n24.07.2019\n'
data_formatter = Data_Formatter . with_datetime_formats date_formats=["dd.MM.yyyy"]
delimited = Text.from test_table format=(File_Format.Delimited "," value_formatter=data_formatter line_endings=Unix_Line_Endings)
delimited = Text.from test_table format=(File_Format.Delimited "," value_formatter=data_formatter line_endings=Line_Ending_Style.Unix)
delimited.should_equal expected_text
main = Test.Suite.run_main spec

View File

@ -1,6 +1,7 @@
from Standard.Base import all
import Standard.Base.System
import Standard.Base.Data.Text.Line_Ending_Style
import Standard.Table
import Standard.Table.Data.Column
import Standard.Test
@ -16,7 +17,7 @@ Column.Column.should_equal expected =
self.length.should_equal expected.length
self.to_vector.should_equal expected.to_vector
normalize_lines string line_separator=System.default_line_separator newline_at_end=True =
normalize_lines string line_separator=Line_Ending_Style.Unix.to_text newline_at_end=True =
case newline_at_end of
True -> string.lines.join line_separator suffix=line_separator
False -> string.lines.join line_separator

View File

@ -93,6 +93,17 @@ spec =
contents = File.read_bytes full_path
contents.take_start 6 . should_equal [67, 117, 112, 99, 97, 107]
Test.specify "should allow to read last n bytes from a file" <|
file = enso_project.data / "transient" / "bytes.txt"
data = [1, 0, 0, 1, 2, 100, 20]
data.write_bytes file
file.read_last_bytes 0 . should_equal []
file.read_last_bytes 1 . should_equal [20]
file.read_last_bytes 2 . should_equal [100, 20]
file.read_last_bytes 5 . should_equal [0, 1, 2, 100, 20]
file.read_last_bytes 1000 . should_equal data
file.delete
Test.specify "should handle exceptions when reading a non-existent file" <|
file = File.new "does_not_exist.txt"
File.read_bytes "does_not_exist.txt" . should_fail_with File.File_Not_Found