Add a lazy input reader for flexer (#1014)

This commit is contained in:
Josef 2020-07-21 17:25:02 +02:00 committed by GitHub
parent f1e18973e8
commit 9fcd4e2f3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 605 additions and 14 deletions

View File

@ -13,6 +13,7 @@ members = [
"lib/rust/lexer/definition",
"lib/rust/lexer/generation",
"lib/rust/lexer/tests",
"lib/rust/lazy-reader",
]
[profile.dev]

View File

@ -23,26 +23,66 @@ project is going to use, as well as backing formats for the stream.
## Reader Functionality
The reader trait needs to have the following functionality:
The reader has the following functionality:
- It must read its input _lazily_, not requiring the entire input to be in
memory.
- It should provide the interface to `next_character`, returning rust-native
UTF-8, and hence abstract away the various underlying encodings.
- It reads its input _lazily_, not requiring the entire input to be in memory.
- It provides the interface to `next_character`, returning rust-native UTF-32,
and abstracts away the various underlying encodings.
- It allows bookmarking the character that was last read, and returning to it
later by calling `rewind`.
## Provided Readers
## Reader Structure
The parser implementation currently provides the following reader utilities to
clients.
The lazy reader consists of the following parts:
### UTF-8 Reader
### Read
The `Read` trait is similar to `std::io::Read`, but supports different encodings
than just `&[u8]`. It provides the interface
`fn read(&mut self, buffer:&mut [Self::Item]) -> usize` that fills the provided
buffer with the data that is being read.
Any structure that implements `std::io::Read` also implements `Read<Item=u8>`.
### Decoder
The `Decoder` trait is an interface for reading a single character from an
underlying buffer `fn decode(words:&[Self::Word]) -> Char`. The type of buffer
depends on the type of the underlying encoding so that, e.g., UTF-32 can use
`&[char]` directly.
#### Example Usage
To put things into perspective, this is how the reader is constructed from a
file and a string.
```rust
let string = "Hello, World!";
let byte_reader = Reader::new(string.as_bytes(), DecoderUTF8(), 0);
let file_reader = Reader::new(File::open("foo.txt")?, DecoderUTF8(), 0);
```
## Provided Encodings
The decoders currently provide the following input encodings.
### UTF-8
Rust natively uses UTF-8 encoding for its strings. In order for the IDE to make
use of the parser, it must provide a simple rust-native reader.
use of the parser, a simple rust-native UTF-8 encoding is provided.
### UTF-16 Reader
### UTF-16
As the JVM as a platform makes use of UTF-16 for encoding its strings, we need
to provide a reader that will let JVM clients of the parser provide the source
code in a streaming fashion without needing to re-encode it prior to passing it
to the parser.
to have a reader that lets JVM clients of the parser provide the source code in
a streaming fashion without needing to re-encode it prior to passing it to the
parser.
### UTF-32
Rust also uses UTF-32 encoding for its characters. Therefore, this encoding is
required in order to support inputs as `&[char]`.
### Benchmarks
7/17/2020: The reader throughput is around 1e+8 chars/s (or 1e-8 secs/char).

View File

@ -0,0 +1,24 @@
[package]
name = "lazy_reader"
version = "0.1.0"
authors = ["Enso Team <enso-dev@enso.org>"]
edition = "2018"
description = "An efficient buffered reader."
readme = "README.md"
homepage = "https://github.com/enso-org/enso/lib/rust/lazy-reader"
repository = "https://github.com/enso-org/enso"
license-file = "../../../LICENSE"
keywords = ["read", "UTF"]
publish = false
[lib]
name = "lazy_reader"
crate-type = ["cdylib", "rlib"]
test = true
bench = true
[dependencies]
itertools = "0.8"

View File

@ -0,0 +1,181 @@
#![allow(unsafe_code)]
//! This module exports various UTF decoders for decoding UTF32 characters.
use std::fmt::Debug;
// ===============
// === Decoder ===
// ===============
/// Trait for decoding UTF32 characters.
///
/// An implementor describes one input encoding: the unit (`Word`) the encoded input is stored
/// in, the largest number of such units a single symbol may occupy, and how to decode the symbol
/// at the start of a slice of units.
pub trait Decoder {
    /// The input of the decoder, i.e. the unit in which the encoded input is stored.
    type Word : Default + Copy + Debug;

    /// The maximum amount of words needed to decode one symbol.
    const MAX_CODEPOINT_LEN: usize;

    /// Decodes the first symbol from the slice and returns it with its length (in words).
    ///
    /// This function can panic if `words.len() < MAX_CODEPOINT_LEN`.
    fn decode(words:&[Self::Word]) -> Char;
}
// === Char ===
/// The result of `decoder.decode`.
///
/// Couples the decoded character (if any) with the number of input words it occupied.
#[derive(Debug,Clone,Copy)]
pub struct Char {
    /// The decoded character.
    pub char: Option<char>,
    /// The number of words read.
    pub size: usize,
}
// =====================
// === UTF-8 Decoder ===
// =====================
/// Decoder for UTF-8.
///
/// For more info on UTF-8 and the algorithm used see [UTF-8](https://en.wikipedia.org/wiki/UTF-8).
#[derive(Debug,Copy,Clone)]
pub struct DecoderUTF8();

// === Trait Impls ===

impl Decoder for DecoderUTF8 {
    type Word = u8;

    const MAX_CODEPOINT_LEN: usize = 4;

    /// Decodes the UTF-8 symbol that starts at `words[0]`.
    ///
    /// The returned `size` is the sequence length announced by the lead byte (`1` for ASCII and
    /// for bytes that cannot start a symbol). The returned `char` is `None` when the sequence is
    /// malformed: the lead byte is a continuation byte, one of the following bytes is not a
    /// continuation byte, or the assembled value is not a valid `char`. Overlong encodings are not
    /// detected.
    ///
    /// # Panics
    ///
    /// Panics if `words` is shorter than the sequence length announced by the lead byte (this
    /// cannot happen when `words.len() >= MAX_CODEPOINT_LEN`).
    fn decode(words: &[u8]) -> Char {
        /// Selects the two marker bits of a byte.
        const MARKER_MASK: u8 = 0b_1100_0000;
        /// Marker bits of a continuation byte (`10xxxxxx`).
        const CONTINUATION: u8 = 0b_1000_0000;

        let lead = words[0];
        // The inverted upper nibble tells how many leading one bits the lead byte has:
        // `1111_0xxx` => 4 words, `1110_xxxx` => 3 words, `110x_xxxx` => 2 words, otherwise 1.
        let size = match !lead >> 4 {
            0     => 4,
            1     => 3,
            2 | 3 => 2,
            _     => 1,
        };
        // Shifting left and back right clears the `size` topmost (marker) bits of the lead byte.
        let mut code = (lead << size >> size) as u32;
        // A byte of the form `10xxxxxx` may only continue a symbol, it can never start one.
        let mut valid = (lead & MARKER_MASK) != CONTINUATION;
        for word in &words[1..size] {
            // Every trailing byte has to carry the continuation marker, otherwise the sequence
            // is truncated or corrupted and must not be reported as a character.
            valid &= (word & MARKER_MASK) == CONTINUATION;
            code = code << 6 | (word & 0b_0011_1111) as u32;
        }
        let char = if valid { std::char::from_u32(code) } else { None };
        Char{char,size}
    }
}
// ======================
// === UTF-16 Decoder ===
// ======================
/// Decoder for UTF-16.
///
/// For more info on UTF-16 and the algorithm used see [UTF-16](https://en.wikipedia.org/wiki/UTF-16).
#[derive(Debug,Copy,Clone)]
pub struct DecoderUTF16();

// === Trait Impls ===

impl Decoder for DecoderUTF16 {
    type Word = u16;

    const MAX_CODEPOINT_LEN: usize = 2;

    /// Decodes the UTF-16 symbol that starts at `words[0]`.
    ///
    /// A word outside of the surrogate range `0xD800..=0xDFFF` is a complete symbol of size `1`.
    /// Otherwise two words are consumed and the returned `char` is `Some` only when they form a
    /// proper surrogate pair (high surrogate followed by low surrogate); unpaired or swapped
    /// surrogates yield `None` with size `2`.
    ///
    /// # Panics
    ///
    /// Panics if `words` is empty, or if `words[0]` is a surrogate and `words.len() < 2`.
    fn decode(words: &[u16]) -> Char {
        let first = words[0];
        if first < 0xD800 || 0xDFFF < first {
            // SAFETY: `first` lies outside of the surrogate range `0xD800..=0xDFFF`, and every
            // other 16-bit value is a valid Unicode scalar value.
            let char = Some(unsafe{std::char::from_u32_unchecked(first as u32)});
            return Char{char,size:1};
        }
        let second  = words[1];
        // Both halves are range-checked before subtracting, so the subtractions below can never
        // underflow on malformed input.
        let is_pair = first < 0xDC00 && 0xDC00 <= second && second <= 0xDFFF;
        let char    = if is_pair {
            let high = (first  - 0xD800) as u32;
            let low  = (second - 0xDC00) as u32;
            std::char::from_u32((high << 10 | low) + 0x1_0000)
        } else {
            None
        };
        Char{char, size:2}
    }
}
// ======================
// === UTF-32 Decoder ===
// ======================
/// Trivial decoder for UTF-32 (`char`).
///
/// The input words already are characters, so decoding is a plain copy of the first word.
#[derive(Debug,Copy,Clone)]
pub struct DecoderUTF32();

// === Trait Impls ===

impl Decoder for DecoderUTF32 {
    type Word = char;

    const MAX_CODEPOINT_LEN: usize = 1;

    /// Returns the first word as the decoded character; the size is always one word.
    ///
    /// Panics if `words` is empty.
    fn decode(words: &[char]) -> Char {
        let symbol = words[0];
        Char{char:Some(symbol), size:1}
    }
}
// =============
// === Tests ===
// =============
#[cfg(test)]
mod tests {
    use super::*;
    use itertools::Itertools;

    /// Sample text containing one-, three- and four-byte UTF-8 symbols.
    const INPUT: &str = "a.b^c! #𤭢界んにち𤭢#𤭢";

    /// Decoding the UTF-8 bytes symbol by symbol reproduces the text.
    #[test]
    fn test_utf8() {
        let mut rest    = INPUT.as_bytes();
        let mut decoded = String::new();
        while !rest.is_empty() {
            let Char{char,size} = DecoderUTF8::decode(rest);
            decoded.push(char.unwrap());
            rest = &rest[size..];
        }
        assert_eq!(decoded, INPUT);
    }

    /// Decoding the UTF-16 words symbol by symbol reproduces the text.
    #[test]
    fn test_utf16() {
        let encoded     = INPUT.encode_utf16().collect_vec();
        let mut rest    = &encoded[..];
        let mut decoded = String::new();
        while !rest.is_empty() {
            let Char{char,size} = DecoderUTF16::decode(rest);
            decoded.push(char.unwrap());
            rest = &rest[size..];
        }
        assert_eq!(decoded, INPUT);
    }

    /// Decoding a slice of `char`s symbol by symbol reproduces the slice.
    #[test]
    fn test_utf32() {
        let expected    = INPUT.chars().collect_vec();
        let mut rest    = &expected[..];
        let mut decoded = vec![];
        while !rest.is_empty() {
            let Char{char,size} = DecoderUTF32::decode(rest);
            decoded.push(char.unwrap());
            rest = &rest[size..];
        }
        assert_eq!(decoded, expected);
    }
}

View File

@ -0,0 +1,345 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This module exports a reader that is able to process large textual inputs in constant memory.
pub mod decoder;
use decoder::Decoder;
// ============
// === Read ===
// ============
/// Trait for reading input data into a buffer.
///
/// Compared to `std::io::Read` this reader supports multiple input encodings.
pub trait Read {
    /// The type of the data in the buffer.
    type Item;

    /// Fills the buffer and returns amount of elements read.
    ///
    /// In case it isn't possible to fill the whole buffer (e.g. if the end of input or an error
    /// is encountered), the buffer will be filled with all the data read before that point.
    fn read(&mut self, buffer:&mut [Self::Item]) -> usize;
}


// === Trait Impls ===

impl<R:std::io::Read> Read for R {
    type Item = u8;

    /// Keeps reading from the underlying `std::io::Read` until `buffer` is full, the input ends
    /// (a read of zero bytes), or a non-recoverable error occurs.
    ///
    /// Returns the number of bytes written to the front of `buffer`.
    fn read(&mut self, mut buffer:&mut [u8]) -> usize {
        let length = buffer.len();
        while !buffer.is_empty() {
            // The call is fully qualified to make it obvious that this is the `std::io::Read`
            // method and not a recursive call into this very function.
            match <R as std::io::Read>::read(self, buffer) {
                Ok(0) => break,
                Ok(n) => buffer = &mut buffer[n..],
                // An interrupted read transferred no data and is meant to be retried; giving up
                // here would silently truncate the input.
                Err(ref error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(_) => break,
            }
        }
        length - buffer.len()
    }
}
// ==================
// === LazyReader ===
// ==================

/// Set of errors returned by lazy reader.
#[derive(Debug,Clone,Copy,PartialEq,Eq)]
pub enum Error {
    /// End Of Input.
    EOF,
    /// Couldn't decode character.
    InvalidChar,
}
/// Strongly typed identifier of `Bookmark`
///
/// The wrapped value is the index of a bookmark slot inside the reader.
#[derive(Debug,Clone,Copy)]
pub struct BookmarkId {
    #[allow(missing_docs)]
    id: usize
}

impl BookmarkId {
    /// Creates an identifier of the bookmark slot with the given index.
    ///
    /// The field is private, so this constructor is the only way for code outside of this module
    /// to obtain an identifier.
    pub fn new(id:usize) -> Self {
        BookmarkId{id}
    }
}
/// Bookmarks a specific character in buffer, so that `LazyReader` can return to it when needed.
#[derive(Debug,Clone,Copy)]
pub struct Bookmark {
    /// The position of bookmarked character in `reader.buffer`.
    ///
    /// The value is an index into the buffer, counted in words.
    offset: usize,
}
/// The default size of buffer.
///
/// The size is counted in decoder words, as the buffer is allocated with this many
/// `D::Word` elements.
pub const BUFFER_SIZE: usize = 32768;
/// A buffered reader able to efficiently read big inputs in constant memory.
///
/// It supports various encodings via `Decoder` and also bookmarks which allow it to return
/// back to a character at specific offset.
///
/// Note that the second type parameter is named `Read`, which shadows the `Read` trait inside of
/// this definition; it is the type of the underlying input source.
#[derive(Debug,Clone)]
pub struct Reader<D:Decoder,Read> {
    /// The reader that holds the input.
    pub reader: Read,
    /// The buffer that stores the input data.
    pub buffer: Vec<D::Word>,
    /// The buffer offset of the current element read.
    pub offset: usize,
    /// The number of elements stored in buffer.
    pub length: usize,
    /// Flag that is true iff the reader was just rewinded and no new chars were read.
    pub rewinded: bool,
    /// Bookmarks allow reader to return to a character at specific offset.
    pub bookmark: Vec<Bookmark>,
    /// The last character read.
    pub character: decoder::Char,
}
impl<D:Decoder,R: Read<Item=D::Word>> Reader<D,R> {
    /// Returns new instance of `Reader`.
    ///
    /// The buffer is allocated with `BUFFER_SIZE` words and immediately filled from `reader`.
    /// `bookmarks` is the number of bookmark slots to allocate; every slot initially points at
    /// offset `0`. The `_decoder` value is only used to select the decoder type `D`.
    pub fn new(reader:R, _decoder:D, bookmarks:usize) -> Self {
        let mut reader = Reader::<D,R> {
            reader,
            buffer    : vec![D::Word::default(); BUFFER_SIZE],
            offset    : 0,
            length    : 0,
            rewinded  : false,
            bookmark  : vec![Bookmark{offset:0};bookmarks],
            character : decoder::Char{char:None, size:0},
        };
        reader.length = reader.reader.read(&mut reader.buffer[..]);
        reader
    }

    /// Bookmarks the current character, so that the reader can return to it later with `rewind()`.
    ///
    /// The stored offset is the start of the character that was read last. Panics if
    /// `bookmark` does not refer to an existing bookmark slot.
    pub fn bookmark(&mut self, bookmark:BookmarkId) {
        self.bookmark[bookmark.id].offset = self.offset - self.character.size;
    }

    /// Returns to the bookmarked character.
    ///
    /// The bookmarked character is decoded again (its result is discarded) so that it becomes
    /// the last character read, and the `rewinded` flag is set. Panics if `bookmark` does not
    /// refer to an existing bookmark slot.
    pub fn rewind(&mut self, bookmark:BookmarkId) {
        self.offset = self.bookmark[bookmark.id].offset;
        let _ = self.next_char();
        self.rewinded = true;
    }

    /// How many words could be rewinded
    ///
    /// This is the distance from the oldest bookmark to the end of the buffer, or
    /// `D::MAX_CODEPOINT_LEN` when the reader has no bookmark slots.
    fn max_possible_rewind_len(&self) -> usize {
        let oldest = self.bookmark.iter().map(|b| b.offset).min();
        oldest.map_or(D::MAX_CODEPOINT_LEN, |offset| self.buffer.len() - offset)
    }

    /// Decrease the offset of all bookmarks.
    pub fn decrease_offset(&mut self, off:usize) {
        for bookmark in self.bookmark.iter_mut() {
            bookmark.offset -= off
        }
    }

    /// Fill the buffer with words from input.
    ///
    /// The tail of the buffer that may still be needed (everything from the oldest bookmark on,
    /// or the last `D::MAX_CODEPOINT_LEN` words when there are no bookmarks) is moved to the
    /// front, bookmarks are shifted accordingly, and the freed space is refilled from the input.
    ///
    /// # Panics
    ///
    /// Panics if the oldest bookmark points at the very start of the buffer, because then
    /// nothing can be discarded to make room for new input.
    pub fn fill(&mut self) {
        let len   = self.buffer.len();
        // Words at the end of the buffer that were not consumed yet.
        let words = len - self.offset;
        // Words at the end of the buffer that have to survive the refill.
        let keep  = self.max_possible_rewind_len();
        if keep == len {
            panic!("Rewind won't be possible. Buffer is too small.")
        }
        let shift = len - keep;
        self.decrease_offset(shift);
        // `copy_within` handles overlapping ranges, which occur as soon as more than half of the
        // buffer has to be kept.
        self.buffer.copy_within(shift.., 0);
        // Only words below `self.length` hold real input. After a short read the kept tail
        // reaches past it, and those stale words must not be counted as readable data.
        let valid   = self.length.saturating_sub(shift);
        self.length = valid + self.reader.read(&mut self.buffer[valid..]);
        self.offset = keep - words;
    }

    /// Is the reader empty.
    ///
    /// True when the buffer is only partially filled and every stored word was consumed.
    pub fn empty(&self) -> bool {
        self.length < self.buffer.len() && self.length <= self.offset
    }

    /// Reads the next char from input.
    ///
    /// Returns `Error::EOF` when the input is exhausted and `Error::InvalidChar` when the decoder
    /// could not produce a character; in the latter case the offset is still advanced by the
    /// size reported by the decoder.
    pub fn next_char(&mut self) -> Result<char,Error> {
        if self.empty() { return Err(Error::EOF) }

        // Refill early enough that the decoder always sees `MAX_CODEPOINT_LEN` words.
        if self.offset >= self.buffer.len() - D::MAX_CODEPOINT_LEN {
            self.fill();
        }

        self.character = D::decode(&self.buffer[self.offset..]);
        self.rewinded  = false;
        self.offset    = self.offset + self.character.size;

        match self.character.char {
            Some(char) => Ok(char),
            None       => Err(Error::InvalidChar)
        }
    }
}
// === Trait Impls ===
impl From<Error> for u32 {
fn from(error: Error) -> Self {
match error {
Error::EOF => u32::max_value(),
Error::InvalidChar => u32::max_value() - 1,
}
}
}
// =============
// === Tests ===
// =============
#[cfg(test)]
mod tests {
    extern crate test;

    use super::*;
    use decoder::*;

    use test::Bencher;



    // ================
    // === Repeater ===
    // ================

    /// Struct that holds state of `Reader` that repeats an input n times.
    #[derive(Debug, Clone)]
    struct Repeat<T> {
        /// The input to be repeated.
        buffer: Vec<T>,
        /// The offset (within `buffer`) of the next element to be handed out.
        offset: usize,
        /// How many more times the input should be repeated.
        repeat: usize,
    }

    /// Creates a reader that repeats an input n times.
    fn repeat<T:Copy>(input:Vec<T>, repeat:usize) -> impl Read<Item=T> {
        Repeat { buffer:input, repeat, offset: 0 }
    }


    // === Trait Impls ===

    impl<T:Copy> Read for Repeat<T> {
        type Item = T;

        /// Copies the repeated input into `buffer` and returns the number of elements written.
        fn read(&mut self, mut buffer:&mut [Self::Item]) -> usize {
            // No repetitions left, nothing can be read.
            if self.repeat == 0 { return 0 }

            let len  = self.buffer.len();
            let read = buffer.len();

            // The request fits into the rest of the current repetition.
            if read < len - self.offset {
                buffer.copy_from_slice(&self.buffer[self.offset..self.offset + read]);
                self.offset += read;
                return read
            }

            // Finish the current repetition first.
            buffer[..len - self.offset].copy_from_slice(&self.buffer[self.offset..]);
            buffer = &mut buffer[len - self.offset..];

            // Then copy as many whole repetitions as fit into the rest of the buffer and as are
            // still available. The extra `1` accounts for the repetition finished above.
            let repeat = std::cmp::min(buffer.len() / len, self.repeat - 1);
            self.repeat = self.repeat - repeat - 1;
            for _ in 0..repeat {
                buffer[..len].copy_from_slice(&self.buffer[..]);
                buffer = &mut buffer[len..];
            }

            // The input is exhausted; report only what was actually written.
            if self.repeat == 0 {
                return len - self.offset + repeat * len
            }

            // Fill the remaining space with the beginning of the next repetition and remember
            // where to continue.
            buffer.copy_from_slice(&self.buffer[..buffer.len()]);
            self.offset = buffer.len();
            read
        }
    }



    // =============
    // === Tests ===
    // =============

    /// The destination buffer is smaller than the repeated input.
    #[test]
    fn test_repeater_with_small_buffer() {
        let mut repeater = repeat(vec![1, 2, 3], 1);
        let mut buffer = [0; 2];
        assert_eq!(repeater.read(&mut buffer), 2);
        assert_eq!(&buffer, &[1, 2]);
        // Only the first element is overwritten by the second read.
        assert_eq!(repeater.read(&mut buffer), 1);
        assert_eq!(&buffer, &[3, 2])
    }

    /// The destination buffer spans several repetitions of the input.
    #[test]
    fn test_repeater_with_big_buffer() {
        let mut repeater = repeat(vec![1, 2], 3);
        let mut buffer = [0; 5];
        assert_eq!(repeater.read(&mut buffer), 5);
        assert_eq!(&buffer, &[1, 2, 1, 2, 1]);
        // Only the first element is overwritten by the second read.
        assert_eq!(repeater.read(&mut buffer), 1);
        assert_eq!(&buffer, &[2, 2, 1, 2, 1])
    }

    /// The whole input fits into the buffer, so the reader never has to refill it.
    #[test]
    fn test_reader_small_input() {
        let str = "a.b^c! #𤭢界んにち𤭢#𤭢";
        let mut reader = Reader::new(str.as_bytes(), DecoderUTF8(), 0);
        let mut result = String::from("");
        while let Ok(char) = reader.next_char() {
            result.push(char);
        }
        assert_eq!(&result, str);
    }

    /// The input is larger than `BUFFER_SIZE`, so the reader has to refill its buffer.
    #[test]
    fn test_reader_big_input() {
        let str = "a.b^c! #𤭢界んにち𤭢#𤭢".repeat(10_000);
        let mut reader = Reader::new(str.as_bytes(), DecoderUTF8(), 0);
        let mut result = String::from("");
        while let Ok(char) = reader.next_char() {
            result.push(char);
        }
        assert_eq!(&result, &str);
        assert_eq!(reader.bookmark.len(), 0);
        assert_eq!(reader.buffer.len(), BUFFER_SIZE);
    }

    /// Measures reading a repeated ASCII input one character at a time.
    #[bench]
    fn bench_reader(bencher:&mut Bencher) {
        let run = || {
            let str = repeat("Hello, World!".as_bytes().to_vec(), 10_000_000);
            let mut reader = Reader::new(str, DecoderUTF8(), 0);
            let mut count = 0;
            while reader.next_char() != Err(Error::EOF) {
                count += 1;
            }
            count
        };
        bencher.iter(run);
    }
}