Add a lazy input reader for flexer (#1014)

2024-09-11 13:15:52 +03:00 · 2020-07-21 17:25:02 +02:00 · 2020-07-21 17:25:02 +02:00 · 9fcd4e2f3a
commit 9fcd4e2f3a
parent f1e18973e8
5 changed files with 605 additions and 14 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -13,6 +13,7 @@ members = [
    "lib/rust/lexer/definition",
    "lib/rust/lexer/generation",
    "lib/rust/lexer/tests",
+    "lib/rust/lazy-reader",
 ]

 [profile.dev]
--- a/docs/parser/reader.md
+++ b/docs/parser/reader.md
@ -23,26 +23,66 @@ project is going to use, as well as backing formats for the stream.

 ## Reader Functionality

-The reader trait needs to have the following functionality:
+The reader has the following functionality:

- It must read its input _lazily_, not requiring the entire input to be in
-  memory.
- It should provide the interface to `next_character`, returning rust-native
-  UTF-8, and hence abstract away the various underlying encodings.
+- It reads its input _lazily_, not requiring the entire input to be in memory.
+- It provides the interface to `next_character`, returning rust-native UTF-32,
+  and abstracts away the various underlying encodings.
+- It allows bookmarking the character that was last read, and returning to it
+  later by calling `rewind`.

-## Provided Readers
+## Reader Structure

-The parser implementation currently provides the following reader utilities to
-clients.
+The lazy reader consists of the following parts:

-### UTF-8 Reader
+### Read
+
+The `Read` trait is similar to `std::io::Read`, but supports different encodings
+than just `&[u8]`. It provides the interface
+`fn read(&mut self, buffer:&mut [Self::Item]) -> usize` that fills the provided
+buffer with the data that is being read.
+
+Any structure that implements `std::io::Read` also implements `Read<Item=u8>`.
+
+### Decoder
+
+The `Decoder` trait is an interface for reading a single character from an
+underlying buffer: `fn decode(words:&[Self::Word]) -> Char`. The type of buffer
+depends on the type of the underlying encoding so that e.g. UTF-32 can use
+`&[char]` directly.
+
+#### Example Usage
+
+To put things into perspective, this is how the reader is constructed from a
+file and a string.
+
+```rust
+let string      = "Hello, World!";
+let byte_reader = Reader::new(string.as_bytes(), DecoderUTF8(), 0);
+let file_reader = Reader::new(File::open("foo.txt")?, DecoderUTF8(), 0);
+```
+
+## Provided Encodings
+
+The decoders currently provide the following input encodings.
+
+### UTF-8

 Rust natively uses UTF-8 encoding for its strings. In order for the IDE to make
-use of the parser, it must provide a simple rust-native reader.
+use of the parser, a simple rust-native UTF-8 encoding is provided.

-### UTF-16 Reader
+### UTF-16

 As the JVM as a platform makes use of UTF-16 for encoding its strings, we need
-to provide a reader that will let JVM clients of the parser provide the source
-code in a streaming fashion without needing to re-encode it prior to passing it
-to the parser.
+to have a reader that lets JVM clients of the parser provide the source code in
+a streaming fashion without needing to re-encode it prior to passing it to the
+parser.
+
+### UTF-32
+
+Rust also uses UTF-32 encoding for its characters. Therefore, this encoding is
+required in order to support inputs as `&[char]`.
+
+### Benchmarks
+
+7/17/2020: The reader throughput is around 1e+8 chars/s (or 1e-8 secs/char).
--- a/lib/rust/lazy-reader/Cargo.toml
+++ b/lib/rust/lazy-reader/Cargo.toml
@ -0,0 +1,24 @@
+[package]
+name    = "lazy_reader"
+version = "0.1.0"
+authors = ["Enso Team <enso-dev@enso.org>"]
+edition = "2018"
+
+description  = "An efficient buffered reader."
+readme       = "README.md"
+homepage     = "https://github.com/enso-org/enso/lib/rust/lazy-reader"
+repository   = "https://github.com/enso-org/enso"
+license-file = "../../../LICENSE"
+
+keywords   = ["read", "UTF"]
+
+publish = false
+
+[lib]
+name       = "lazy_reader"
+crate-type = ["cdylib", "rlib"]
+test       = true
+bench      = true
+
+[dependencies]
+itertools = "0.8"
--- a/lib/rust/lazy-reader/src/decoder.rs
+++ b/lib/rust/lazy-reader/src/decoder.rs
@ -0,0 +1,181 @@
+#![allow(unsafe_code)]
+
+//! This module exports decoders that turn UTF-8, UTF-16 and UTF-32 input into `char`s.
+
+use std::fmt::Debug;
+
+
+
+// ===============
+// === Decoder ===
+// ===============
+
+/// Trait for decoding words of a specific encoding into UTF-32 characters (`char`).
+pub trait Decoder {
+    /// The type of a single unit (word) of the encoded input.
+    type Word : Default + Copy + Debug;
+    /// The maximum amount of words needed to decode one symbol.
+    const MAX_CODEPOINT_LEN: usize;
+
+    /// Decodes the first symbol from the slice and returns it with its length (in words).
+    ///
+    /// This function can panic if `words.len() < MAX_CODEPOINT_LEN`.
+    fn decode(words:&[Self::Word]) -> Char;
+}
+
+
+// === Char ===
+
+/// The result of `Decoder::decode`.
+#[derive(Debug,Clone,Copy)]
+pub struct Char {
+    /// The decoded character, or `None` if the decoded value is not a valid `char`.
+    pub char: Option<char>,
+    /// The number of words read.
+    pub size: usize,
+}
+
+
+
+// =====================
+// === UTF-8 Decoder ===
+// =====================
+
+/// Decoder for UTF-8.
+///
+/// For more info on UTF-8 and the algorithm used see [UTF-8](https://en.wikipedia.org/wiki/UTF-8).
+#[derive(Debug,Copy,Clone)]
+pub struct DecoderUTF8();
+
+
+// === Trait Impls ===
+
+impl Decoder for DecoderUTF8 {
+    type Word = u8;
+
+    const MAX_CODEPOINT_LEN: usize = 4; // A UTF-8 sequence is at most four bytes long.
+
+    fn decode(words: &[u8]) -> Char {
+        let size = match !words[0] >> 4 { // The inverted high nibble of the lead byte.
+            0     => 4, // Lead byte 0b1111_xxxx.
+            1     => 3, // Lead byte 0b1110_xxxx.
+            2 | 3 => 2, // Lead byte 0b110x_xxxx.
+            _     => 1, // 0b0xxx_xxxx (ASCII) or 0b10xx_xxxx (stray continuation byte).
+        };
+
+        let mut char = (words[0] << size >> size) as u32; // Clears the top `size` bits.
+        for word in &words[1..size] { // Panics if fewer than `size` words are available.
+            char = char << 6 | (word & 0b_0011_1111) as u32; // Appends the low six bits.
+        }
+
+        Char{char:std::char::from_u32(char),size} // `None` if the value is not a valid `char`.
+    }
+}
+
+
+
+// ======================
+// === UTF-16 Decoder ===
+// ======================
+
+/// Decoder for UTF-16.
+///
+/// For more info on UTF-16 and the algorithm used see [UTF-16](https://en.wikipedia.org/wiki/UTF-16).
+#[derive(Debug,Copy,Clone)]
+pub struct DecoderUTF16();
+
+
+// === Trait Impls ===
+
+impl Decoder for DecoderUTF16 {
+    type Word = u16;
+
+    const MAX_CODEPOINT_LEN: usize = 2; // A symbol is one word or a surrogate pair.
+
+    fn decode(words: &[u16]) -> Char {
+        if words[0] < 0xD800 || 0xDFFF < words[0] { // Not a surrogate: a valid `char`.
+            let char = Some(unsafe{std::char::from_u32_unchecked(words[0] as u32)}); // SAFETY: see above.
+            return Char{char,size:1};
+        } // Surrogate pair; NOTE(review): `words[1]` is not checked to be a low surrogate.
+        let char = (((words[0] - 0xD800) as u32) << 10 | (words[1] - 0xDC00) as u32) + 0x1_0000;
+
+        Char{char:std::char::from_u32(char), size:2} // `None` if the value is not a valid `char`.
+    }
+}
+
+
+
+// ======================
+// === UTF-32 Decoder ===
+// ======================
+
+/// Trivial decoder for UTF-32 (`char`), where every word already is a decoded character.
+#[derive(Debug,Copy,Clone)]
+pub struct DecoderUTF32();
+
+
+// === Trait Impls ===
+
+impl Decoder for DecoderUTF32 {
+    type Word = char;
+
+    const MAX_CODEPOINT_LEN: usize = 1;
+
+    fn decode(words: &[char]) -> Char {
+        Char{char:Some(words[0]), size:1} // Panics on an empty slice.
+    }
+}
+
+
+
+// =============
+// === Tests ===
+// =============
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use itertools::Itertools;
+
+
+
+    #[test]
+    fn test_utf8() {
+        let string  = "a.b^c! #𤭢界んにち𤭢#𤭢"; // Contains 1-, 3- and 4-byte characters.
+        let mut buf = string.as_bytes();
+        let mut str = String::from("");
+        while !buf.is_empty() {
+            let char = DecoderUTF8::decode(buf);
+            str.push(char.char.unwrap());
+            buf = &buf[char.size..]; // Skip the words consumed by the decoder.
+        }
+        assert_eq!(str, string);
+    }
+
+    #[test]
+    fn test_utf16() {
+        let string  = "a.b^c! #𤭢界んにち𤭢#𤭢"; // `𤭢` is encoded as a surrogate pair.
+        let buffer  = string.encode_utf16().collect_vec();
+        let mut buf = &buffer[..];
+        let mut str = String::from("");
+        while !buf.is_empty() {
+            let char = DecoderUTF16::decode(buf);
+            str.push(char.char.unwrap());
+            buf = &buf[char.size..]; // Skip the words consumed by the decoder.
+        }
+        assert_eq!(str, string);
+    }
+
+    #[test]
+    fn test_utf32() {
+        let string  = "a.b^c! #𤭢界んにち𤭢#𤭢".chars().collect_vec();
+        let mut buf = &string[..];
+        let mut str = vec![];
+        while !buf.is_empty() {
+            let char = DecoderUTF32::decode(buf);
+            str.push(char.char.unwrap());
+            buf = &buf[char.size..]; // Always advances by exactly one word.
+        }
+        assert_eq!(str, string);
+    }
+}
--- a/lib/rust/lazy-reader/src/lib.rs
+++ b/lib/rust/lazy-reader/src/lib.rs
@ -0,0 +1,345 @@
+#![feature(test)]
+#![deny(unconditional_recursion)]
+#![warn(missing_copy_implementations)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![warn(trivial_casts)]
+#![warn(trivial_numeric_casts)]
+#![warn(unsafe_code)]
+#![warn(unused_import_braces)]
+
+//! This module exports a reader that is able to process large textual inputs in constant memory.
+
+pub mod decoder;
+
+use decoder::Decoder;
+
+
+
+// ============
+// === Read ===
+// ============
+
+/// Trait for reading input data into a buffer.
+///
+/// Compared to `std::io::Read` this reader supports multiple input encodings.
+pub trait Read {
+    /// The type of the data in the buffer.
+    type Item;
+
+    /// Fills the buffer and returns the number of elements read.
+    ///
+    /// In case it isn't possible to fill the whole buffer (e.g. if an error like EOF is encountered),
+    /// the buffer will be filled with all the data read before encountering such error.
+    fn read(&mut self, buffer:&mut [Self::Item]) -> usize;
+}
+
+
+// === Trait Impls ===
+
+impl<R:std::io::Read> Read for R {
+    type Item = u8;
+
+    fn read(&mut self, mut buffer:&mut [u8]) -> usize {
+        let length = buffer.len();
+        while !buffer.is_empty() {
+            match self.read(buffer) { // Returns a `Result`, so this is `std::io::Read::read`.
+                Err(_) => break, // The error is dropped; only the bytes read so far are reported.
+                Ok (0) => break, // Nothing more to read.
+                Ok (n) => {
+                    buffer = &mut buffer[n..]; // Keep filling the rest of the buffer.
+                }
+            }
+        }
+        length - buffer.len() // The number of bytes written into the buffer.
+    }
+}
+
+
+
+// ==============
+// === Reader ===
+// ==============
+
+/// Set of errors returned by `Reader`.
+#[derive(Debug,Clone,Copy,PartialEq,Eq)]
+pub enum Error {
+    /// End Of Input.
+    EOF,
+    /// Couldn't decode character.
+    InvalidChar,
+}
+
+/// Strongly typed identifier of a `Bookmark` (an index into `Reader::bookmark`).
+#[derive(Debug,Clone,Copy)]
+pub struct BookmarkId {
+    #[allow(missing_docs)]
+    id: usize // NOTE(review): private, and no constructor is visible in this file.
+}
+
+/// Bookmarks a specific character in buffer, so that `Reader` can return to it when needed.
+#[derive(Debug,Clone,Copy)]
+pub struct Bookmark {
+    /// The position of bookmarked character in `reader.buffer`.
+    offset: usize,
+}
+
+/// The size (in words) of the buffer allocated by `Reader::new`.
+pub const BUFFER_SIZE: usize = 32768;
+
+/// A buffered reader able to efficiently read big inputs in constant memory.
+///
+/// It supports various encodings via `Decoder` and also bookmarks which allow it to return
+/// back to a character at a specific offset.
+#[derive(Debug,Clone)]
+pub struct Reader<D:Decoder,Read> { // NOTE: the type parameter `Read` shadows the `Read` trait.
+    /// The reader that holds the input.
+    pub reader: Read,
+    /// The buffer that stores the input data.
+    pub buffer: Vec<D::Word>,
+    /// The buffer offset of the next word to be decoded.
+    pub offset: usize,
+    /// The number of words of input currently stored in `buffer`.
+    pub length: usize,
+    /// Flag that is true iff the reader was just rewound and no new chars were read.
+    pub rewinded: bool,
+    /// Bookmarks allow reader to return to a character at specific offset.
+    pub bookmark: Vec<Bookmark>,
+    /// The last character read.
+    pub character: decoder::Char,
+}
+
+impl<D:Decoder,R: Read<Item=D::Word>> Reader<D,R> {
+    /// Creates a `Reader` with `bookmarks` bookmarks and reads the first chunk of input.
+    pub fn new(reader:R, _decoder:D, bookmarks:usize) -> Self {
+        let mut reader = Reader::<D,R> {
+            reader,
+            buffer    : vec![D::Word::default(); BUFFER_SIZE],
+            offset    : 0,
+            length    : 0,
+            rewinded  : false,
+            bookmark  : vec![Bookmark{offset:0};bookmarks],
+            character : decoder::Char{char:None, size:0},
+        };
+        reader.length = reader.reader.read(&mut reader.buffer[..]);
+        reader
+    }
+
+    /// Bookmarks the current character, so that the reader can return to it later with `rewind()`.
+    pub fn bookmark(&mut self, bookmark:BookmarkId) {
+        self.bookmark[bookmark.id].offset = self.offset - self.character.size; // Start of the last char.
+    }
+
+    /// Returns to the bookmarked character and reads it again.
+    pub fn rewind(&mut self, bookmark:BookmarkId) {
+        self.offset = self.bookmark[bookmark.id].offset;
+        let _ = self.next_char(); // Re-reads the bookmarked character; the result is ignored.
+        self.rewinded = true;
+    }
+
+    /// How many words at the end of the buffer have to be kept to allow rewinding to any bookmark.
+    fn max_possible_rewind_len(&self) -> usize {
+        if let Some(offset) = self.bookmark.iter().map(|b| b.offset).min() {
+            return self.buffer.len() - offset // Everything from the earliest bookmark onwards.
+        }
+        D::MAX_CODEPOINT_LEN // Without bookmarks, keep just enough words for one symbol.
+    }
+
+    /// Decreases the offset of all bookmarks by `off`.
+    pub fn decrease_offset(&mut self, off:usize) {
+        for bookmark in self.bookmark.iter_mut() {
+            bookmark.offset -= off
+        }
+    }
+
+    /// Moves the words that are still needed to the front of the buffer and refills the rest.
+    pub fn fill(&mut self) {
+        let len     = self.buffer.len();
+        let words   = len - self.offset; // The number of not yet decoded words.
+        self.offset = self.max_possible_rewind_len(); // The number of trailing words to keep.
+        if self.offset == len { // The whole buffer would have to be kept.
+            panic!("Rewind won't be possible. Buffer is too small.")
+        }
+
+        self.decrease_offset(len - self.offset); // Bookmarks move together with the kept words.
+        for i in 1..=self.offset { // Copies the last `self.offset` words to the buffer start.
+            self.buffer[self.offset - i] = self.buffer[len - i];
+        }
+        self.length = self.offset + self.reader.read(&mut self.buffer[self.offset..]);
+        self.offset = self.offset - words; // Points at the first not yet decoded word again.
+    }
+
+    /// Is the reader empty, i.e. the buffer is not full and all of its words were already read.
+    pub fn empty(&self) -> bool {
+        self.length < self.buffer.len() && self.length <= self.offset
+    }
+
+    /// Reads the next char from input, or returns `Error::EOF` / `Error::InvalidChar` on failure.
+    pub fn next_char(&mut self) -> Result<char,Error> {
+        if self.empty() { return Err(Error::EOF) }
+
+        if self.offset >= self.buffer.len() - D::MAX_CODEPOINT_LEN { // A symbol may not fit anymore.
+            self.fill();
+        }
+
+        self.character = D::decode(&self.buffer[self.offset..]);
+        self.rewinded  = false;
+        self.offset    = self.offset + self.character.size;
+
+        match self.character.char {
+            Some(char) => Ok(char),
+            None       => Err(Error::InvalidChar)
+        }
+    }
+}
+
+
+// === Trait Impls ===
+
+impl From<Error> for u32 { // Encodes the errors as the two largest `u32` values.
+    fn from(error: Error) -> Self {
+        match error {
+            Error::EOF         => u32::max_value(),
+            Error::InvalidChar => u32::max_value() - 1,
+        }
+    }
+}
+
+
+
+// =============
+// === Tests ===
+// =============
+
+#[cfg(test)]
+mod tests {
+    extern crate test;
+
+    use super::*;
+    use decoder::*;
+
+    use test::Bencher;
+
+
+
+    // ================
+    // === Repeater ===
+    // ================
+
+    /// State of a `Read` implementation that repeats an input n times.
+    #[derive(Debug, Clone)]
+    struct Repeat<T> {
+        /// The input to be repeated.
+        buffer: Vec<T>,
+        /// The offset of the next element to be read from `buffer`.
+        offset: usize,
+        /// How many more times the input should be repeated.
+        repeat: usize,
+    }
+
+    /// Creates a reader that repeats an input n times.
+    fn repeat<T:Copy>(input:Vec<T>, repeat:usize) -> impl Read<Item=T> {
+        Repeat { buffer:input, repeat, offset: 0 }
+    }
+
+
+    // === Trait Impls ===
+
+    impl<T:Copy> Read for Repeat<T> {
+        type Item = T;
+
+        fn read(&mut self, mut buffer:&mut [Self::Item]) -> usize {
+            if self.repeat == 0 { return 0 }
+
+            let len  = self.buffer.len();
+            let read = buffer.len();
+
+            if read < len - self.offset { // The request fits into the current repetition.
+                buffer.copy_from_slice(&self.buffer[self.offset..self.offset + read]);
+                self.offset += read;
+                return read
+            }
+
+            buffer[..len - self.offset].copy_from_slice(&self.buffer[self.offset..]); // Finish it.
+            buffer = &mut buffer[len - self.offset..];
+
+            let repeat  = std::cmp::min(buffer.len() / len, self.repeat - 1); // Whole copies that fit.
+            self.repeat = self.repeat - repeat - 1;
+            for _ in 0..repeat {
+                buffer[..len].copy_from_slice(&self.buffer[..]);
+                buffer = &mut buffer[len..];
+            }
+
+            if self.repeat == 0 { // Input exhausted: report a partial read.
+                return len - self.offset + repeat * len
+            }
+            buffer.copy_from_slice(&self.buffer[..buffer.len()]); // Start of the next repetition.
+            self.offset = buffer.len();
+            read
+        }
+    }
+
+
+
+    // =============
+    // === Tests ===
+    // =============
+
+    #[test]
+    fn test_repeater_with_small_buffer() {
+        let mut repeater = repeat(vec![1, 2, 3], 1);
+        let mut buffer = [0; 2];
+        assert_eq!(repeater.read(&mut buffer), 2);
+        assert_eq!(&buffer, &[1, 2]);
+        assert_eq!(repeater.read(&mut buffer), 1);
+        assert_eq!(&buffer, &[3, 2]) // Only the first element was overwritten.
+    }
+
+    #[test]
+    fn test_repeater_with_big_buffer() {
+        let mut repeater = repeat(vec![1, 2], 3);
+        let mut buffer = [0; 5];
+        assert_eq!(repeater.read(&mut buffer), 5);
+        assert_eq!(&buffer, &[1, 2, 1, 2, 1]);
+        assert_eq!(repeater.read(&mut buffer), 1);
+        assert_eq!(&buffer, &[2, 2, 1, 2, 1]) // Only the first element was overwritten.
+    }
+
+    #[test]
+    fn test_reader_small_input() {
+        let     str    = "a.b^c! #𤭢界んにち𤭢#𤭢";
+        let mut reader = Reader::new(str.as_bytes(), DecoderUTF8(), 0);
+        let mut result = String::from("");
+        while let Ok(char) = reader.next_char() {
+            result.push(char);
+        }
+        assert_eq!(&result, str);
+    }
+
+    #[test]
+    fn test_reader_big_input() {
+        let     str    = "a.b^c! #𤭢界んにち𤭢#𤭢".repeat(10_000); // Larger than `BUFFER_SIZE`.
+        let mut reader = Reader::new(str.as_bytes(), DecoderUTF8(), 0);
+        let mut result = String::from("");
+        while let Ok(char) = reader.next_char() {
+            result.push(char);
+        }
+        assert_eq!(&result, &str);
+        assert_eq!(reader.bookmark.len(), 0);
+        assert_eq!(reader.buffer.len(), BUFFER_SIZE); // The buffer did not grow.
+    }
+
+    #[bench]
+    fn bench_reader(bencher:&mut Bencher) {
+        let run = || {
+            let     str    = repeat("Hello, World!".as_bytes().to_vec(), 10_000_000);
+            let mut reader = Reader::new(str, DecoderUTF8(), 0);
+            let mut count  = 0;
+            while reader.next_char() != Err(Error::EOF) { // Counts every char until the input ends.
+                count += 1;
+            }
+            count
+        };
+        bencher.iter(run);
+    }
+}