mirror of
https://github.com/hasktorch/tokenizers.git
synced 2024-09-11 05:15:46 +03:00
Update training to include new lines
This commit is contained in:
parent
a1891387ed
commit
805dc58949
@ -411,17 +411,25 @@ impl Tokenizer {
|
||||
let mut words = HashMap::new();
|
||||
|
||||
let file: std::fs::File = File::open(file)?;
|
||||
let file = BufReader::new(file);
|
||||
let mut file = BufReader::new(file);
|
||||
|
||||
for line in file.lines() {
|
||||
let line = line?;
|
||||
let normalized = self.normalize(&line)?;
|
||||
let mut buf = String::new();
|
||||
loop {
|
||||
buf.clear();
|
||||
// We read new lines using this API instead of the Lines Iterator
|
||||
// on purpose. We want to keep the `\n` and potential `\r` between each line
|
||||
match file.read_line(&mut buf)? {
|
||||
0 => break,
|
||||
_ => {
|
||||
let normalized = self.normalize(&buf)?;
|
||||
let pre_tokenized = self.pre_tokenize(normalized.get())?;
|
||||
trainer.process_tokens(
|
||||
&mut words,
|
||||
pre_tokenized.into_iter().map(|(t, _)| t).collect(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(words)
|
||||
})
|
||||
|
Loading…
Reference in New Issue
Block a user