From f024350deed5a72d517763c77e23b62b21edb2a4 Mon Sep 17 00:00:00 2001 From: Guillaume B Date: Sat, 26 Jun 2021 11:07:17 +0200 Subject: [PATCH] Fixed various documentation typos --- CHANGELOG.md | 1 + README.md | 3 +- examples/translation_m2m100.rs | 2 +- src/bart/bart_model.rs | 8 ++--- src/common/resources.rs | 2 +- src/gpt_neo/gpt_neo_model.rs | 4 +-- src/lib.rs | 1 + src/m2m_100/m2m_100_model.rs | 11 +++--- src/m2m_100/mod.rs | 58 ++++++++++++++++++++++++++++++ src/marian/marian_model.rs | 4 +-- src/mbart/mbart_model.rs | 8 ++--- src/mbart/mod.rs | 4 +-- src/mobilebert/mobilebert_model.rs | 2 +- src/pegasus/pegasus_model.rs | 6 ++-- src/pipelines/conversation.rs | 4 +-- src/pipelines/translation.rs | 2 +- src/prophetnet/prophetnet_model.rs | 6 ++-- src/t5/t5_model.rs | 4 +-- 18 files changed, 96 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2847b1..3582cf1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. 
The format - (BREAKING) Support for `prefix_allowed_tokens_fn` argument for generation, allowing users to control the generation via custom functions - (BREAKING) Support for `forced_bos_token_id` argument for generation, allowing users to force a given BOS token for generation (useful for MBart/M2M-class models) - Addition of the MBart Language model and support for text generation / direct translation between 50 language +- Addition of the M2M100 Language model and support for text generation / direct translation between 100 languages ## Changed - Updated GPT2 architecture to re-use embeddings for the output projection layer (resulting in smaller model weights files and memory footprint) diff --git a/README.md b/README.md index a04c929..14b370c 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ GPT-Neo| | | |✅ | | | | BART|✅| | |✅ |✅| | | Marian| | | | | |✅| | MBart|✅| | |✅ | | | | +M2M100| | | |✅ | | | | Electra | |✅| | | | |✅| ALBERT |✅|✅|✅| | | |✅| T5 | | | |✅ |✅|✅| | @@ -62,7 +63,7 @@ Pegasus| | | | |✅| | | ## Getting started This library relies on the [tch](https://github.com/LaurentMazare/tch-rs) crate for bindings to the C++ Libtorch API. -The libtorch library is required can be downloaded either automatically or manually. The following provides a reference on how to set-up yoru environment +The libtorch library is required and can be downloaded either automatically or manually. The following provides a reference on how to set-up your environment to use these bindings, please refer to the [tch](https://github.com/LaurentMazare/tch-rs) for detailed information or support. Furthermore, this library relies on a cache folder for downloading pre-trained models. diff --git a/examples/translation_m2m100.rs b/examples/translation_m2m100.rs index 97ee4d0..feae4d1 100644 --- a/examples/translation_m2m100.rs +++ b/examples/translation_m2m100.rs @@ -1,4 +1,4 @@ -// Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 
+// Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. // Copyright 2019 Guillaume Becquin // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/bart/bart_model.rs b/src/bart/bart_model.rs index ba0b20e..566c3e2 100644 --- a/src/bart/bart_model.rs +++ b/src/bart/bart_model.rs @@ -388,7 +388,7 @@ impl BartModel { /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. @@ -551,7 +551,7 @@ impl BartForConditionalGeneration { /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. 
Positions with a mask with value 0 will be masked. /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. /// @@ -745,7 +745,7 @@ impl BartForSequenceClassification { /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. 
- /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. /// @@ -844,7 +844,7 @@ impl LMHeadModel for BartForConditionalGeneration { /// # Arguments /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`) - /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing th elast calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. + /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. /// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1 /// * `input_embeds` - Unused for BART /// * `token_type_ids` - Unused for BART diff --git a/src/common/resources.rs b/src/common/resources.rs index 65b8228..aea64d7 100644 --- a/src/common/resources.rs +++ b/src/common/resources.rs @@ -115,7 +115,7 @@ impl RemoteResource { } /// Creates a new RemoteResource from an URL and local name. 
Will define a local path pointing to - /// ~/.cache/.rusbert/model_name. Note that this does not download the resource (only declares + /// ~/.cache/.rustbert/model_name. Note that this does not download the resource (only declares /// the remote and local locations) /// /// # Arguments diff --git a/src/gpt_neo/gpt_neo_model.rs b/src/gpt_neo/gpt_neo_model.rs index cc600b7..9a44c3b 100644 --- a/src/gpt_neo/gpt_neo_model.rs +++ b/src/gpt_neo/gpt_neo_model.rs @@ -263,7 +263,7 @@ impl GptNeoModel { /// - `hidden_states` - `Tensor` of shape (*batch size*, *sequence_length*, *hidden_size*) representing the activations of the last hidden state /// - `next_cache` - `Option>>` of length *n_layer* containing the past content for the the attention layers /// - `all_hidden_states` - `Option>` of length *n_layer + 1* with shape (*batch size*, *sequence_length*, *hidden_size*) - /// - `all_attentions` - `Option>` of length *n_layer* containign the attention weights for each layer + /// - `all_attentions` - `Option>` of length *n_layer* containing the attention weights for each layer /// /// # Example /// @@ -504,7 +504,7 @@ impl GptNeoForCausalLM { /// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position /// - `next_cache` - `Option>>` of length *n_layer* containing the past content for the the attention layers /// - `all_hidden_states` - `Option>` of length *n_layer + 1* with shape (*batch size*, *sequence_length*, *hidden_size*) - /// - `all_attentions` - `Option>` of length *n_layer* containign the attention weights for each layer + /// - `all_attentions` - `Option>` of length *n_layer* containing the attention weights for each layer /// /// # Example /// diff --git a/src/lib.rs b/src/lib.rs index 5065f3b..4cdf480 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -59,6 +59,7 @@ //! BART|✅| | |✅ |✅| | | //! Marian| | | | | |✅| | //! MBart|✅| | |✅ | | | | +//! M2M100| | | |✅ | | | | //! 
Electra | |✅| | | | |✅| //! ALBERT |✅|✅|✅| | | |✅| //! T5 | | | |✅ |✅|✅| | diff --git a/src/m2m_100/m2m_100_model.rs b/src/m2m_100/m2m_100_model.rs index daa28f3..94b53ad 100644 --- a/src/m2m_100/m2m_100_model.rs +++ b/src/m2m_100/m2m_100_model.rs @@ -41,7 +41,7 @@ pub struct M2M100ConfigResources; /// # M2M100 Pretrained model vocab files pub struct M2M100VocabResources; -/// # M2M100 Pretrained model ,erges files +/// # M2M100 Pretrained model merges files pub struct M2M100MergesResources; impl M2M100ModelResources { @@ -168,7 +168,7 @@ impl M2M100Model { /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. 
@@ -320,7 +320,8 @@ impl M2M100ForConditionalGeneration { /// let device = Device::Cpu; /// let p = nn::VarStore::new(device); /// let config = M2M100Config::from_file(config_path); - /// let m2m100: M2M100ForConditionalGeneration = M2M100ForConditionalGeneration::new(&p.root(), &config); + /// let m2m100: M2M100ForConditionalGeneration = + /// M2M100ForConditionalGeneration::new(&p.root(), &config); /// ``` pub fn new<'p, P>(p: P, config: &M2M100Config) -> M2M100ForConditionalGeneration where @@ -338,7 +339,7 @@ impl M2M100ForConditionalGeneration { /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. 
/// @@ -432,7 +433,7 @@ impl LMHeadModel for M2M100ForConditionalGeneration { /// # Arguments /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`) - /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing th elast calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. + /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. /// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1 /// * `input_embeds` - Unused for M2M100 /// * `token_type_ids` - Unused for M2M100 diff --git a/src/m2m_100/mod.rs b/src/m2m_100/mod.rs index 741096c..e3fa63b 100644 --- a/src/m2m_100/mod.rs +++ b/src/m2m_100/mod.rs @@ -1,3 +1,61 @@ +//! # M2M-100 (Fan et al.) +//! +//! Implementation of the M2M-100 language model ([Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) Fan, Bhosale, Schwenk, Ma, El-Kishky, Goyal, Baines, Celebi, Wenzel, Chaudhary, Goyal, Birch, Liptchinsky, Edunov, Grave, Auli, Joulin, 2020). +//! The base model is implemented in the `m2m_100::M2M100Model` struct. The model also includes a language model head: `m2m_100::M2M100ForConditionalGeneration` +//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information). +//! This model allows for direct translation between 100 languages. +//! The translation capabilities are illustrated in `examples/translation_m2m100`, run with `cargo run --example translation_m2m100`. +//! 
+//! # Model set-up and pre-trained weights loading +//! +//! All models expect the following resources: +//! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers) +//! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format. +//! - `M2M100Tokenizer` using a `vocab.json` vocabulary and a `spiece.model` SentencePiece BPE model +//! Pretrained models are available and can be downloaded using RemoteResources. +//! +//! ```no_run +//! # fn main() -> anyhow::Result<()> { +//! # +//! use tch::{nn, Device}; +//! # use std::path::PathBuf; +//! use rust_bert::m2m_100::{M2M100Config, M2M100Model}; +//! use rust_bert::resources::{LocalResource, Resource}; +//! use rust_bert::Config; +//! use rust_tokenizers::tokenizer::M2M100Tokenizer; +//! +//! let config_resource = Resource::Local(LocalResource { +//! local_path: PathBuf::from("path/to/config.json"), +//! }); +//! let vocab_resource = Resource::Local(LocalResource { +//! local_path: PathBuf::from("path/to/vocab.json"), +//! }); +//! let merges_resource = Resource::Local(LocalResource { +//! local_path: PathBuf::from("path/to/spiece.model"), +//! }); +//! let weights_resource = Resource::Local(LocalResource { +//! local_path: PathBuf::from("path/to/model.ot"), +//! }); +//! let config_path = config_resource.get_local_path()?; +//! let vocab_path = vocab_resource.get_local_path()?; +//! let merges_path = merges_resource.get_local_path()?; +//! let weights_path = weights_resource.get_local_path()?; +//! +//! let device = Device::cuda_if_available(); +//! let mut vs = nn::VarStore::new(device); +//! let tokenizer: M2M100Tokenizer = M2M100Tokenizer::from_file( +//! vocab_path.to_str().unwrap(), +//! merges_path.to_str().unwrap(), +//! false, +//! 
)?; +//! let config = M2M100Config::from_file(config_path); +//! let m2m100_model = M2M100Model::new(&vs.root(), &config); +//! vs.load(weights_path)?; +//! +//! # Ok(()) +//! # } +//! ``` + mod attention; mod decoder; mod embeddings; diff --git a/src/marian/marian_model.rs b/src/marian/marian_model.rs index 4a72b42..8ffb00d 100644 --- a/src/marian/marian_model.rs +++ b/src/marian/marian_model.rs @@ -546,7 +546,7 @@ impl MarianForConditionalGeneration { /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. 
/// @@ -648,7 +648,7 @@ impl LMHeadModel for MarianForConditionalGeneration { /// * `position_ids` - Unused for BART /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. /// /// diff --git a/src/mbart/mbart_model.rs b/src/mbart/mbart_model.rs index 3d6dfcb..75f8310 100644 --- a/src/mbart/mbart_model.rs +++ b/src/mbart/mbart_model.rs @@ -244,7 +244,7 @@ impl MBartModel { /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. 
initialized with a BOS token) /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. @@ -420,7 +420,7 @@ impl MBartForConditionalGeneration { /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. 
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. /// @@ -568,7 +568,7 @@ impl MBartForSequenceClassification { /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. /// @@ -667,7 +667,7 @@ impl LMHeadModel for MBartForConditionalGeneration { /// # Arguments /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`) - /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing th elast calculated key and value pairs for the decoder. 
This avoids recomputing attention weights at past positions and speeds up decoding. + /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. /// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1 /// * `input_embeds` - Unused for BART /// * `token_type_ids` - Unused for BART diff --git a/src/mbart/mod.rs b/src/mbart/mod.rs index 021bfd9..ab65fda 100644 --- a/src/mbart/mod.rs +++ b/src/mbart/mod.rs @@ -6,7 +6,7 @@ //! //! # Model set-up and pre-trained weights loading //! -//! The summarization capabilities are illustrated in `examples/translation_mbart`, run with `cargo run --example translation_mbart`. +//! The translation capabilities are illustrated in `examples/translation_mbart`, run with `cargo run --example translation_mbart`. //! All models expect the following resources: //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers) //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format. @@ -41,7 +41,7 @@ //! let tokenizer: MBart50Tokenizer = //! MBart50Tokenizer::from_file(vocab_path.to_str().unwrap(), false)?; //! let config = MBartConfig::from_file(config_path); -//! let bart_model = MBartModel::new(&vs.root(), &config); +//! let mbart_model = MBartModel::new(&vs.root(), &config); //! vs.load(weights_path)?; //! //! 
# Ok(()) diff --git a/src/mobilebert/mobilebert_model.rs b/src/mobilebert/mobilebert_model.rs index 1f071b3..0d481b3 100644 --- a/src/mobilebert/mobilebert_model.rs +++ b/src/mobilebert/mobilebert_model.rs @@ -319,7 +319,7 @@ impl MobileBertModel { /// /// * `p` - Variable store path for the root of the MobileBERT model /// * `config` - `MobileBertConfig` object defining the model architecture and decoder status - /// * `add_poling_layer` - boolean flag indicating if a pooling layer shuld be added after the encoder + /// * `add_pooling_layer` - boolean flag indicating if a pooling layer should be added after the encoder /// /// # Example /// diff --git a/src/pegasus/pegasus_model.rs b/src/pegasus/pegasus_model.rs index 8e0e4e1..aab9381 100644 --- a/src/pegasus/pegasus_model.rs +++ b/src/pegasus/pegasus_model.rs @@ -156,7 +156,7 @@ impl PegasusModel { /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. 
When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. @@ -329,7 +329,7 @@ impl PegasusForConditionalGeneration { /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*). /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. /// @@ -437,7 +437,7 @@ impl LMHeadModel for PegasusForConditionalGeneration { /// # Arguments /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). 
If None, pre-computed embeddings must be provided (see `input_embeds`) - /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing th elast calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. + /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. /// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1 /// * `input_embeds` - Unused for Pegasus /// * `token_type_ids` - Unused for Pegasus diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index a3cad2e..354bbfb 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -622,7 +622,7 @@ impl ConversationManager { /// /// # Returns /// - /// * `Option` deregistered conversation + /// * `Option` de-registered conversation /// /// # Example /// @@ -643,7 +643,7 @@ impl ConversationManager { /// /// # Returns /// - /// * `HashMap` deregistered conversations + /// * `HashMap` de-registered conversations /// /// # Example /// diff --git a/src/pipelines/translation.rs b/src/pipelines/translation.rs index 220b11e..59da3a8 100644 --- a/src/pipelines/translation.rs +++ b/src/pipelines/translation.rs @@ -428,7 +428,7 @@ pub struct TranslationConfig { } impl TranslationConfig { - /// Create a new `TranslationCondiguration` from an available language. + /// Create a new `TranslationConfig` from an available language. 
/// /// # Arguments /// diff --git a/src/prophetnet/prophetnet_model.rs b/src/prophetnet/prophetnet_model.rs index 3b60978..be0bf31 100644 --- a/src/prophetnet/prophetnet_model.rs +++ b/src/prophetnet/prophetnet_model.rs @@ -190,7 +190,7 @@ impl ProphetNetModel { /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). This or `input_embeds` must be provided. /// * `attention_mask` - Optional attention mask of shape (*batch size*, *sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `input_embeds` - Optional input tensor of shape (*batch size*, *sequence_length*, *embeddings dimension*). This or `input_ids` must be provided. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `encoder_hidden_states` - Optional tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) corresponding to pre-calculated encoder hidden states (useful for conditional generation) /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. @@ -393,7 +393,7 @@ impl ProphetNetForConditionalGeneration { /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). This or `input_embeds` must be provided. 
/// * `attention_mask` - Optional attention mask of shape (*batch size*, *sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `input_embeds` - Optional input tensor of shape (*batch size*, *sequence_length*, *embeddings dimension*). This or `input_ids` must be provided. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `encoder_hidden_states` - Optional tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) corresponding to pre-calculated encoder hidden states (useful for conditional generation) /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks. @@ -693,7 +693,7 @@ impl ProphetNetForCausalGeneration { /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). This or `input_embeds` must be provided. /// * `attention_mask` - Optional attention mask of shape (*batch size*, *sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked. /// * `input_embeds` - Optional input tensor of shape (*batch size*, *sequence_length*, *embeddings dimension*). This or `input_ids` must be provided. - /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. 
initialiazed with a BOS token) + /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token) /// * `old_layer_states` - Optional Vector `Option<Vec<Option<&LayerState>, Option<&LayerState>>>` of length *n_layer* containing tuples with the past keys and values for both the self attention and the encoder cross attention of each layer of the decoder. /// * `decoder_input_embeds` - Optional input tensor of shape (*batch size*, *target_sequence_length*, *embeddings dimension*). This or `decoder_input_ids` must be provided. /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. diff --git a/src/t5/t5_model.rs b/src/t5/t5_model.rs index 3d817da..b44a2ae 100644 --- a/src/t5/t5_model.rs +++ b/src/t5/t5_model.rs @@ -450,7 +450,7 @@ impl T5ForConditionalGeneration { /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked. /// * `input_embeds` - Optional input tensor of shape (*batch size*, *source_sequence_length*, *embeddings dimension*). This or `input_ids` must be provided. /// * `decoder_input_embeds` - Optional input tensor of shape (*batch size*, *target_sequence_length*, *embeddings dimension*). This or `decoder_input_ids` must be provided. - /// * `old_layer_states` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing th elast calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. + /// * `old_layer_states` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. 
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference. /// /// # Returns @@ -558,7 +558,7 @@ impl LMHeadModel for T5ForConditionalGeneration { /// # Arguments /// /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`) - /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing th elast calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. + /// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding. /// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1 /// * `input_embeds` - Unused for T5 /// * `token_type_ids` - Unused for T5