mirror of
https://github.com/hasktorch/tokenizers.git
synced 2024-09-11 05:15:46 +03:00
Update training to include new lines
This commit is contained in:
parent
a1891387ed
commit
805dc58949
@ -411,17 +411,25 @@ impl Tokenizer {
|
||||
let mut words = HashMap::new();
|
||||
|
||||
let file: std::fs::File = File::open(file)?;
|
||||
let file = BufReader::new(file);
|
||||
let mut file = BufReader::new(file);
|
||||
|
||||
for line in file.lines() {
|
||||
let line = line?;
|
||||
let normalized = self.normalize(&line)?;
|
||||
let mut buf = String::new();
|
||||
loop {
|
||||
buf.clear();
|
||||
// We read new lines using this API instead of the Lines Iterator
|
||||
// on purpose. We want to keep the `\n` and potential `\r` between each line
|
||||
match file.read_line(&mut buf)? {
|
||||
0 => break,
|
||||
_ => {
|
||||
let normalized = self.normalize(&buf)?;
|
||||
let pre_tokenized = self.pre_tokenize(normalized.get())?;
|
||||
trainer.process_tokens(
|
||||
&mut words,
|
||||
pre_tokenized.into_iter().map(|(t, _)| t).collect(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(words)
|
||||
})
|
||||
|
Loading…
Reference in New Issue
Block a user