Generation traits simplification (#339)

* - Remove the LMHeadModel trait (its functionality is integrated into PrivateLanguageGenerator)
- Simplify the PrivateLanguageGenerator trait definition (it no longer needs to be parameterized over types implementing the `LMHeadModel`, `Vocab` and `Tokenizer` traits; see the summary sketch after these notes)

* - Removed duplicated BART code, updated the docs

* - Fixed incorrect order of generation arguments in BART-based models

* - Updated changelog

* Fixed Clippy warning
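For reference, the net shape of the change for each generator is sketched below for BART (the other architectures in this commit follow the same pattern; bodies are elided and only the impl headers from the diff are shown):

```rust
// Before: the model carried the LM head forward pass, and the generator impls
// were generic over the model, vocab and tokenizer types.
impl LMHeadModel for BartForConditionalGeneration { /* forward_t lived here */ }
impl PrivateLanguageGenerator<BartForConditionalGeneration, RobertaVocab, RobertaTokenizer>
    for BartGenerator { /* get_model(), tokenizer accessors, generation hooks */ }
impl LanguageGenerator<BartForConditionalGeneration, RobertaVocab, RobertaTokenizer>
    for BartGenerator {}

// After: LMHeadModel is gone, the traits take no type parameters, and the
// generator provides forward_t itself, delegating to its owned model.
impl PrivateLanguageGenerator for BartGenerator { /* forward_t + tokenizer accessors + hooks */ }
impl LanguageGenerator for BartGenerator {}
```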
guillaume-be 2023-03-17 16:21:37 +00:00 committed by GitHub
parent c448862185
commit b05ec7b24f
35 changed files with 718 additions and 1527 deletions


@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file. The format
## Changed
- Bumped the tokenizers dependency from 7.x to 8.x, exposing additional options for special token mapping and adding the NLLBTokenizer.
- (BREAKING) Simplified the generation traits (removal of LMHeadModel and elimination of unnecessary specification for LanguageGenerator)
## Fixed
- MIN/MAX computation for float-like (was set to infinity instead of min/max)


@ -70,7 +70,7 @@ features = ["doc-only"]
[dependencies]
rust_tokenizers = "8.0.0"
tch = "~0.10.1"
tch = "~0.10"
serde_json = "1"
serde = { version = "1", features = ["derive"] }
ordered-float = "3"
@ -88,6 +88,6 @@ anyhow = "1"
csv = "1"
criterion = "0.4"
tokio = { version = "1.24", features = ["sync", "rt-multi-thread", "macros"] }
torch-sys = "=0.10.0"
torch-sys = "=0.10"
tempfile = "3"
itertools = "0.10"


@ -44,7 +44,6 @@ use tch::Device;
/// ```
///
/// [gpt-j-6B-float16]: https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16
///
fn main() -> anyhow::Result<()> {
// Resources paths


@ -21,12 +21,10 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::{RobertaTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::RobertaVocab;
use rust_tokenizers::tokenizer::TruncationStrategy;
use serde::{Deserialize, Serialize};
use std::borrow::Borrow;
use std::collections::HashMap;
@ -826,7 +824,7 @@ impl BartForSequenceClassification {
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = BartConfig::from_file(config_path);
/// # let bart_model: BartForSequenceClassification = BartForSequenceClassification::new(&vs.root(), &config).unwrap();;
/// # let bart_model: BartForSequenceClassification = BartForSequenceClassification::new(&vs.root(), &config).unwrap();
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
@ -891,110 +889,6 @@ impl BartForSequenceClassification {
}
}
impl LMHeadModel for BartForConditionalGeneration {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Unused for BART
/// * `token_type_ids` - Unused for BART
/// * `position_ids` - Unused for BART
/// * `encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *hidden_size*). When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
/// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `BartCache` made of `Option<Vec<(Option<Vec<&LayerState, &LayerState>>)>>` of length *n_layer* containing the encoder past keys and values for
/// both the self attention and the encoder cross attention of each layer of the decoder.
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::pipelines::generation_utils::LMHeadModel;
/// use rust_bert::bart::{BartForConditionalGeneration, BartConfig};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = BartConfig::from_file(config_path);
/// # let bart_model: BartForConditionalGeneration = BartForConditionalGeneration::new(&vs.root(), &config);
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
/// let encoder_attention_mask = Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
/// let decoder_attention_mask = Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
///
/// let model_output = no_grad(|| {
/// bart_model
/// .forward_t(Some(&input_tensor),
/// Some(&encoder_attention_mask),
/// None,
/// Some(&target_tensor),
/// Some(&decoder_attention_mask),
/// None,
/// false)
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids,
encoder_outputs,
None,
cached_layer_states,
train,
),
Cache::None => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids,
encoder_outputs,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with BART Model".into(),
));
}
};
let lm_logits = base_model_output
.decoder_output
.linear::<Tensor>(&self.base_model.embeddings.ws, None);
Ok(LMModelOutput {
lm_logits,
cache: Cache::BARTCache(base_model_output.cache),
})
}
}
/// Container holding a BART model output. The decoder output may hold the hidden state of
/// the last layer of the decoder, or may hold logits for a custom head module after the
/// decoder (e.g. for classification or language modeling tasks)
@ -1143,12 +1037,7 @@ impl BartGenerator {
}
}
impl PrivateLanguageGenerator<BartForConditionalGeneration, RobertaVocab, RobertaTokenizer>
for BartGenerator
{
fn get_model(&self) -> &BartForConditionalGeneration {
&self.model
}
impl PrivateLanguageGenerator for BartGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -1183,6 +1072,51 @@ impl PrivateLanguageGenerator<BartForConditionalGeneration, RobertaVocab, Robert
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
cached_layer_states,
train,
),
Cache::None => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with BART Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.decoder_output,
cache: Cache::BARTCache(base_model_output.cache),
})
}
fn prepare_scores_for_generation(
&self,
scores: &mut Tensor,
@ -1203,7 +1137,7 @@ impl PrivateLanguageGenerator<BartForConditionalGeneration, RobertaVocab, Robert
}
fn encode(&self, input_ids: &Tensor, attention_mask: Option<&Tensor>) -> Option<Tensor> {
Some(self.get_model().encode(input_ids, attention_mask))
Some(self.model.encode(input_ids, attention_mask))
}
fn prepare_inputs_for_generation<'a>(
@ -1312,10 +1246,7 @@ impl PrivateLanguageGenerator<BartForConditionalGeneration, RobertaVocab, Robert
}
}
impl LanguageGenerator<BartForConditionalGeneration, RobertaVocab, RobertaTokenizer>
for BartGenerator
{
}
impl LanguageGenerator for BartGenerator {}
#[cfg(test)]
mod test {


@ -2,7 +2,7 @@
//!
//! Implementation of the BART language model ([BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) Lewis, Liu, Goyal, Ghazvininejad, Mohamed, Levy, Stoyanov, Zettlemoyer, 2019).
//! The base model is implemented in the `bart_model::BartModel` struct. The model also includes a language model head: `bart_model::BartForConditionalGeneration`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!


@ -20,12 +20,8 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::Gpt2Tokenizer;
use rust_tokenizers::vocab::Gpt2Vocab;
use serde::{Deserialize, Serialize};
use std::borrow::{Borrow, BorrowMut};
use tch::kind::Kind::Int64;
@ -529,118 +525,26 @@ impl GPT2LMHeadModel {
GPT2LMHeadModel { transformer }
}
}
impl LMHeadModel for GPT2LMHeadModel {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of size *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*). When provided, these are concatenated with the current input keys and values.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
/// * `token_type_ids` - Optional token type ids used to indicate the portion of the input the token belongs to. If not None, token type embeddings will be added to the token and position embeddings.
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented starting from the length of the past input.
/// * `_encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*). Unused for GPT2
/// * `_decoder_input_ids` - Optional tensor of shape (*batch size*, *target_sequence_length*). Unused for GPT2
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `Gpt2Cache` made of `Option<Vec<Tensor>>` of length *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*)
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::gpt2::{GPT2LMHeadModel, Gpt2Config};
/// use rust_bert::pipelines::generation_utils::{Cache, LMHeadModel};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = Gpt2Config::from_file(config_path);
/// # let mut gpt2_model: GPT2LMHeadModel = GPT2LMHeadModel::new(&vs.root(), &config);
/// let (batch_size, sequence_length, past_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
/// let mut past: Vec<Tensor> = Vec::with_capacity(config.n_layer as usize);
/// for _ in 0..config.n_layer as usize {
/// past.push(Tensor::rand(
/// &[
/// 2,
/// batch_size,
/// config.n_head,
/// past_sequence_length,
/// config.n_embd / config.n_head,
/// ],
/// (Double, device),
/// ))
/// }
/// let attention_mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
/// let token_type_ids = Tensor::ones(&[batch_size, sequence_length], (Int64, device));
/// let position_ids = Tensor::arange(sequence_length, (Int64, device))
/// .expand(&[batch_size, sequence_length], true);
///
/// let model_output = no_grad(|| {
/// gpt2_model
/// .forward_t(
/// Some(&input_tensor),
/// Cache::GPT2Cache(Some(past)),
/// Some(&attention_mask),
/// Some(&token_type_ids),
/// Some(&position_ids),
/// None,
/// None,
/// None,
/// false,
/// )
/// .unwrap()
/// });
/// ```
fn forward_t(
pub fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
layer_past: Option<&Vec<Tensor>>,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match layer_past {
Cache::GPT2Cache(layer_past) => self.transformer.forward_t(
input_ids,
layer_past.as_ref(),
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
),
Cache::None => self.transformer.forward_t(
input_ids,
None,
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with GPT2 Model".into(),
));
}
}?;
let base_model_output = self.transformer.forward_t(
input_ids,
layer_past,
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
)?;
let lm_logits = base_model_output
.output
@ -769,10 +673,7 @@ impl GPT2Generator {
}
}
impl PrivateLanguageGenerator<GPT2LMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for GPT2Generator {
fn get_model(&self) -> &GPT2LMHeadModel {
&self.model
}
impl PrivateLanguageGenerator for GPT2Generator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -807,6 +708,43 @@ impl PrivateLanguageGenerator<GPT2LMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for GPT
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
match layer_past {
Cache::GPT2Cache(layer_past) => self.model.forward_t(
input_ids,
layer_past.as_ref(),
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
),
Cache::None => self.model.forward_t(
input_ids,
None,
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
),
_ => Err(RustBertError::ValueError(
"Cache not compatible with GPT2 Model".into(),
)),
}
}
fn prepare_inputs_for_generation<'a>(
&self,
input_ids: Tensor,
@ -875,4 +813,4 @@ impl PrivateLanguageGenerator<GPT2LMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for GPT
}
}
impl LanguageGenerator<GPT2LMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for GPT2Generator {}
impl LanguageGenerator for GPT2Generator {}


@ -2,7 +2,7 @@
//!
//! Implementation of the GPT2 language model ([Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) Radford, Wu, Child, Luan, Amodei, Sutskever 2019).
//! The base model is implemented in the `gpt2_model::Gpt2Model` struct. The model also includes a language model head: `gpt2_model::GPT2LMHeadModel`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!


@ -20,12 +20,8 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::Gpt2Tokenizer;
use rust_tokenizers::vocab::Gpt2Vocab;
use serde::{Deserialize, Serialize};
use std::borrow::{Borrow, BorrowMut};
use tch::nn::{embedding, Linear};
@ -46,7 +42,7 @@ pub struct GptJMergesResources;
/// Model weights for Rust are not available out of the box for GPT-J but can be created
/// simply with the following command:
///
/// ```
/// ```ignore
/// python utils/convert_model.py path/to/gpt_j/pytorch_model.bin
/// ```
///
@ -57,7 +53,6 @@ pub struct GptJMergesResources;
///
/// [gpt-j-6B]: https://huggingface.co/EleutherAI/gpt-j-6B/tree/main
/// [gpt-j-6B-float16]:https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16
///
impl GptJModelResources {
pub const GPT_J_TINY_RANDOM: (&'static str, &'static str) = (
"gpt-j-tiny-random/model",
@ -335,7 +330,7 @@ impl GptJModel {
/// gpt_j_model
/// .forward_t(
/// Some(&input_tensor),
/// Some(&past),
/// Some(past),
/// Some(&attention_mask),
/// Some(&token_type_ids),
/// None,
@ -450,7 +445,7 @@ impl GptJLMHeadModel {
/// # Example
///
/// ```no_run
/// use rust_bert::gpt_j::{GptJLMHeadModel, GptJConfig};
/// use rust_bert::gpt_j::{GptJConfig, GptJLMHeadModel};
/// use rust_bert::Config;
/// use std::path::Path;
/// use tch::{nn, Device};
@ -483,82 +478,8 @@ impl GptJLMHeadModel {
lm_head,
}
}
}
impl LMHeadModel for GptJLMHeadModel {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of size *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*). When provided, these are concatenated with the current input keys and values.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
/// * `token_type_ids` - Optional token type ids used to indicate the portion of the input the token belongs to. If not None, token type embeddings will be added to the token and position embeddings.
/// * `_position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented starting from the length of the past input.
/// * `_encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*). Unused for GPT-J
/// * `_decoder_input_ids` - Optional tensor of shape (*batch size*, *target_sequence_length*). Unused for GPT_J
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `GptJCache` made of `Option<Vec<Tensor>>` of length *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*)
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::gpt_j::{GptJLMHeadModel, GptJConfig};
/// use rust_bert::pipelines::generation_utils::{Cache, LMHeadModel};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = GptJConfig::from_file(config_path);
/// # let mut gpt_j_model: GptJLMHeadModel = GptJLMHeadModel::new(&vs.root(), &config);
/// let (batch_size, sequence_length, past_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
/// let mut past: Vec<Tensor> = Vec::with_capacity(config.n_layer as usize);
/// for _ in 0..config.n_layer as usize {
/// past.push(Tensor::rand(
/// &[
/// 2,
/// batch_size,
/// config.n_head,
/// past_sequence_length,
/// config.n_embd / config.n_head,
/// ],
/// (Double, device),
/// ))
/// }
/// let attention_mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
/// let token_type_ids = Tensor::ones(&[batch_size, sequence_length], (Int64, device));
/// let position_ids = Tensor::arange(sequence_length, (Int64, device))
/// .expand(&[batch_size, sequence_length], true);
///
/// let model_output = no_grad(|| {
/// gpt_j_model
/// .forward_t(
/// Some(&input_tensor),
/// Cache::GPTJCache(Some(past)),
/// Some(&attention_mask),
/// Some(&token_type_ids),
/// None,
/// None,
/// None,
/// None,
/// false,
/// )
/// .unwrap()
/// });
/// ```
fn forward_t(
pub fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
@ -648,7 +569,7 @@ impl GptJGenerator {
/// use rust_bert::pipelines::generation_utils::GenerateConfig;
///
/// let generate_config = GenerateConfig {
/// max_length: 30,
/// max_length: Some(30),
/// do_sample: true,
/// num_beams: 5,
/// temperature: 1.1,
@ -728,10 +649,7 @@ impl GptJGenerator {
}
}
impl PrivateLanguageGenerator<GptJLMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for GptJGenerator {
fn get_model(&self) -> &GptJLMHeadModel {
&self.model
}
impl PrivateLanguageGenerator for GptJGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -766,6 +684,52 @@ impl PrivateLanguageGenerator<GptJLMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for Gpt
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match layer_past {
Cache::GPTJCache(layer_past) => self.model.transformer.forward_t(
input_ids,
layer_past,
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
),
Cache::None => self.model.transformer.forward_t(
input_ids,
None,
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with GPT-J Model".into(),
));
}
}?;
let lm_logits = base_model_output.output.apply(&self.model.lm_head);
Ok(LMModelOutput {
lm_logits,
cache: Cache::GPTJCache(base_model_output.cache),
})
}
fn prepare_inputs_for_generation<'a>(
&self,
input_ids: Tensor,
@ -833,4 +797,4 @@ impl PrivateLanguageGenerator<GptJLMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for Gpt
}
}
impl LanguageGenerator<GptJLMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for GptJGenerator {}
impl LanguageGenerator for GptJGenerator {}


@ -9,7 +9,7 @@
//! #
//! use tch::{nn, Device};
//! # use std::path::PathBuf;
//! use rust_bert::gpt_j::{GptJLMHeadModel, GptJConfig};
//! use rust_bert::gpt_j::{GptJConfig, GptJLMHeadModel};
//! use rust_bert::resources::{LocalResource, ResourceProvider};
//! use rust_bert::Config;
//! use rust_tokenizers::tokenizer::Gpt2Tokenizer;


@ -18,12 +18,8 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::{Activation, Config, RustBertError};
use rust_tokenizers::tokenizer::Gpt2Tokenizer;
use rust_tokenizers::vocab::Gpt2Vocab;
use serde::{Deserialize, Serialize};
use std::borrow::{Borrow, BorrowMut};
use tch::{nn, Kind, Tensor};
@ -570,52 +566,6 @@ impl GptNeoForCausalLM {
}
}
impl LMHeadModel for GptNeoForCausalLM {
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match layer_past {
Cache::GPTNeoCache(layer_past) => self.forward_t(
input_ids,
input_embeds,
token_type_ids,
position_ids,
layer_past,
attention_mask,
train,
),
Cache::None => self.forward_t(
input_ids,
input_embeds,
token_type_ids,
position_ids,
None,
attention_mask,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with GPT-Neo Model".into(),
));
}
}?;
Ok(LMModelOutput {
lm_logits: base_model_output.lm_logits,
cache: Cache::GPTNeoCache(base_model_output.next_cache),
})
}
}
/// Container for the GPT-Neo model output.
pub struct GptNeoModelOutput {
/// Last hidden states from the model
@ -743,10 +693,7 @@ impl GptNeoGenerator {
}
}
impl PrivateLanguageGenerator<GptNeoForCausalLM, Gpt2Vocab, Gpt2Tokenizer> for GptNeoGenerator {
fn get_model(&self) -> &GptNeoForCausalLM {
&self.model
}
impl PrivateLanguageGenerator for GptNeoGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -777,10 +724,54 @@ impl PrivateLanguageGenerator<GptNeoForCausalLM, Gpt2Vocab, Gpt2Tokenizer> for G
fn get_decoder_start_id(&self) -> Option<i64> {
self.decoder_start_id
}
fn get_max_positions_embeddings(&self) -> i64 {
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match layer_past {
Cache::GPTNeoCache(layer_past) => self.model.forward_t(
input_ids,
input_embeds,
token_type_ids,
position_ids,
layer_past,
attention_mask,
train,
),
Cache::None => self.model.forward_t(
input_ids,
input_embeds,
token_type_ids,
position_ids,
None,
attention_mask,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with GPT-Neo Model".into(),
));
}
}?;
Ok(LMModelOutput {
lm_logits: base_model_output.lm_logits,
cache: Cache::GPTNeoCache(base_model_output.next_cache),
})
}
fn prepare_inputs_for_generation<'a>(
&self,
input_ids: Tensor,
@ -851,4 +842,4 @@ impl PrivateLanguageGenerator<GptNeoForCausalLM, Gpt2Vocab, Gpt2Tokenizer> for G
}
}
impl LanguageGenerator<GptNeoForCausalLM, Gpt2Vocab, Gpt2Tokenizer> for GptNeoGenerator {}
impl LanguageGenerator for GptNeoGenerator {}

View File

@ -16,13 +16,10 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::t5::{FeedForwardProj, T5Config, T5ModelOutput, TaskSpecificParams};
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::{T5Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::T5Vocab;
use rust_tokenizers::tokenizer::TruncationStrategy;
use serde::{Deserialize, Serialize};
use std::borrow::Borrow;
use tch::nn::{embedding, LinearConfig};
@ -548,124 +545,6 @@ impl LongT5ForConditionalGeneration {
}
}
impl LMHeadModel for LongT5ForConditionalGeneration {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `cache` - `Cache` object containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Unused for LongT5
/// * `token_type_ids` - Unused for LongT5
/// * `position_ids` - Unused for LongT5
/// * `encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *hidden_size*). When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
/// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*).
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `T5Cache` made of `Option<Vec<(Option<Vec<&LayerState, &LayerState>>)>>` of length *n_layer* containing the encoder past keys and values for
/// both the self attention and the encoder cross attention of each layer of the decoder.
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::longt5::{LongT5Config, LongT5ForConditionalGeneration};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = LongT5Config::from_file(config_path);
/// # let longt5_model: LongT5ForConditionalGeneration = LongT5ForConditionalGeneration::new(&vs.root(), &config);
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
/// let encoder_attention_mask =
/// Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
/// let decoder_attention_mask =
/// Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
///
/// let model_output = no_grad(|| {
/// longt5_model.forward_t(
/// Some(&input_tensor),
/// Some(&encoder_attention_mask),
/// None,
/// Some(&target_tensor),
/// Some(&decoder_attention_mask),
/// None,
/// None,
/// None,
/// false,
/// )
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::LongT5Cache(cached_layer_states) => self.base_model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
None,
cached_layer_states,
train,
)?,
Cache::None => self.base_model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
None,
None,
train,
)?,
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with LongT5 Model".into(),
));
}
};
let lm_logits = if self.tie_word_embeddings {
base_model_output
.decoder_output
.linear::<Tensor>(&self.base_model.embeddings.ws, None)
* (self.model_dim.powf(-0.5))
} else {
base_model_output
.decoder_output
.apply(self.lm_head.as_ref().unwrap())
};
Ok(LMModelOutput {
lm_logits,
cache: Cache::LongT5Cache(base_model_output.next_cache),
})
}
}
/// Container holding a LongT5 model output.
pub type LongT5ModelOutput = T5ModelOutput;
@ -742,12 +621,7 @@ impl LongT5Generator {
}
}
impl PrivateLanguageGenerator<LongT5ForConditionalGeneration, T5Vocab, T5Tokenizer>
for LongT5Generator
{
fn get_model(&self) -> &LongT5ForConditionalGeneration {
&self.model
}
impl PrivateLanguageGenerator for LongT5Generator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -782,8 +656,56 @@ impl PrivateLanguageGenerator<LongT5ForConditionalGeneration, T5Vocab, T5Tokeniz
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::LongT5Cache(cached_layer_states) => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
None,
cached_layer_states,
train,
)?,
Cache::None => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
None,
None,
train,
)?,
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with LongT5 Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.decoder_output,
cache: Cache::LongT5Cache(base_model_output.next_cache),
})
}
fn encode(&self, input_ids: &Tensor, attention_mask: Option<&Tensor>) -> Option<Tensor> {
Some(self.get_model().encode(input_ids, attention_mask))
Some(self.model.encode(input_ids, attention_mask))
}
fn prepare_inputs_for_generation<'a>(
@ -891,4 +813,4 @@ impl PrivateLanguageGenerator<LongT5ForConditionalGeneration, T5Vocab, T5Tokeniz
}
}
impl LanguageGenerator<LongT5ForConditionalGeneration, T5Vocab, T5Tokenizer> for LongT5Generator {}
impl LanguageGenerator for LongT5Generator {}


@ -2,7 +2,7 @@
//!
//! Implementation of the LongT5 language model ([LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) Guo, Ainslie, Uthus, Ontanon, Ni, Sung, Yang, 2021).
//! The base model is implemented in the `longt5_model::LongT5Model` struct. This model includes a language model head: `longt5_model::LongT5ForConditionalGeneration`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!


@ -18,13 +18,10 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::pipelines::translation::Language;
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::{M2M100Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::M2M100Vocab;
use rust_tokenizers::tokenizer::TruncationStrategy;
use std::borrow::Borrow;
use tch::nn::{embedding, EmbeddingConfig};
use tch::{nn, Kind, Tensor};
@ -458,109 +455,6 @@ impl M2M100ForConditionalGeneration {
}
}
impl LMHeadModel for M2M100ForConditionalGeneration {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Unused for M2M100
/// * `token_type_ids` - Unused for M2M100
/// * `position_ids` - Unused for M2M100
/// * `encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *hidden_size*). When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
/// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `BARTCache` made of `Option<Vec<(Option<Vec<&LayerState, &LayerState>>)>>` of length *n_layer* containing the encoder past keys and values for
/// both the self attention and the encoder cross attention of each layer of the decoder.
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::pipelines::generation_utils::LMHeadModel;
/// use rust_bert::m2m_100::{M2M100ForConditionalGeneration, M2M100Config};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = M2M100Config::from_file(config_path);
/// # let m2m100_model: M2M100ForConditionalGeneration = M2M100ForConditionalGeneration::new(&vs.root(), &config);
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
/// let encoder_attention_mask = Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
/// let decoder_attention_mask = Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
///
/// let model_output = no_grad(|| {
/// m2m100_model
/// .forward_t(Some(&input_tensor),
/// Some(&encoder_attention_mask),
/// None,
/// Some(&target_tensor),
/// Some(&decoder_attention_mask),
/// None,
/// false)
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids,
encoder_outputs,
None,
cached_layer_states,
train,
),
Cache::None => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids,
encoder_outputs,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with M2M100 Model".into(),
));
}
};
let lm_logits = base_model_output
.decoder_output
.linear::<Tensor>(&self.base_model.embeddings.ws, None);
Ok(LMModelOutput {
lm_logits,
cache: Cache::BARTCache(base_model_output.cache),
})
}
}
/// # Language generation model based on the M2M100 architecture
pub struct M2M100Generator {
model: M2M100ForConditionalGeneration,
@ -689,12 +583,7 @@ impl M2M100Generator {
}
}
impl PrivateLanguageGenerator<M2M100ForConditionalGeneration, M2M100Vocab, M2M100Tokenizer>
for M2M100Generator
{
fn get_model(&self) -> &M2M100ForConditionalGeneration {
&self.model
}
impl PrivateLanguageGenerator for M2M100Generator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -725,11 +614,55 @@ impl PrivateLanguageGenerator<M2M100ForConditionalGeneration, M2M100Vocab, M2M10
fn get_decoder_start_id(&self) -> Option<i64> {
self.decoder_start_id
}
fn get_max_positions_embeddings(&self) -> i64 {
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
cached_layer_states,
train,
),
Cache::None => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with M2M100 Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.decoder_output,
cache: Cache::BARTCache(base_model_output.cache),
})
}
fn prepare_scores_for_generation(
&self,
scores: &mut Tensor,
@ -747,7 +680,7 @@ impl PrivateLanguageGenerator<M2M100ForConditionalGeneration, M2M100Vocab, M2M10
}
fn encode(&self, input_ids: &Tensor, attention_mask: Option<&Tensor>) -> Option<Tensor> {
Some(self.get_model().encode(input_ids, attention_mask))
Some(self.model.encode(input_ids, attention_mask))
}
fn prepare_inputs_for_generation<'a>(
@ -856,10 +789,7 @@ impl PrivateLanguageGenerator<M2M100ForConditionalGeneration, M2M100Vocab, M2M10
}
}
impl LanguageGenerator<M2M100ForConditionalGeneration, M2M100Vocab, M2M100Tokenizer>
for M2M100Generator
{
}
impl LanguageGenerator for M2M100Generator {}
#[cfg(test)]
mod test {
@ -882,7 +812,7 @@ mod test {
// Set-up masked LM model
let device = Device::cuda_if_available();
let vs = tch::nn::VarStore::new(device);
let vs = nn::VarStore::new(device);
let config = M2M100Config::from_file(config_path);
let _: Box<dyn Send> = Box::new(M2M100Model::new(vs.root(), &config));


@ -2,7 +2,7 @@
//!
//! Implementation of the M2M-100 language model ([Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) Fan, Bhosale, Schwenk, Ma, El-Kishky, Goyal, Baines, Celebi, Wenzel, Chaudhary, Goyal, Birch, Liptchinsky, Edunov, Grave, Auli, Joulin, 2020).
//! The base model is implemented in the `m2m_100::M2M100Model` struct. The model also includes a language model head: `m2m_100::M2M100ForConditionalGeneration`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//! This model allows for direct translation between 100 languages.
//! The translation capabilities are illustrated in `examples/translation_m2m100`, run with `cargo run --example translation_m2m100`.
//!


@ -16,13 +16,10 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::pipelines::translation::Language;
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::{MarianTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::MarianVocab;
use rust_tokenizers::tokenizer::TruncationStrategy;
use std::borrow::Borrow;
use tch::nn::Init;
use tch::{nn, Kind, Tensor};
@ -653,7 +650,8 @@ impl MarianForConditionalGeneration {
let lm_logits = base_model_output
.decoder_output
.linear::<Tensor>(&self.base_model.embeddings.ws, None);
.linear::<Tensor>(&self.base_model.embeddings.ws, None)
+ &self.final_logits_bias;
BartModelOutput {
decoder_output: lm_logits,
..base_model_output
@ -673,114 +671,6 @@ impl MarianForConditionalGeneration {
}
}
impl LMHeadModel for MarianForConditionalGeneration {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Unused for BART
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Unused for BART
/// * `token_type_ids` - Unused for BART
/// * `position_ids` - Unused for BART
/// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
/// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
/// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `BartCache` made of `Option<Vec<(Option<Vec<&LayerState, &LayerState>>)>>` of length *n_layer* containing the encoder past keys and values for
/// both the self attention and the encoder cross attention of each layer of the decoder.
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::bart::BartConfig;
/// use rust_bert::marian::MarianForConditionalGeneration;
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = BartConfig::from_file(config_path);
/// # let marian_model = MarianForConditionalGeneration::new(&vs.root(), &config);
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
/// let encoder_attention_mask =
/// Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
/// let decoder_attention_mask =
/// Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
///
/// let model_output = no_grad(|| {
/// marian_model.forward_t(
/// Some(&input_tensor),
/// Some(&encoder_attention_mask),
/// None,
/// Some(&target_tensor),
/// Some(&decoder_attention_mask),
/// None,
/// false,
/// )
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids,
encoder_outputs,
None,
cached_layer_states,
train,
),
Cache::None => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids,
encoder_outputs,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with Marian Model".into(),
));
}
};
let lm_logits = base_model_output
.decoder_output
.linear::<Tensor>(&self.base_model.embeddings.ws, None)
+ &self.final_logits_bias;
Ok(LMModelOutput {
lm_logits,
cache: Cache::BARTCache(base_model_output.cache),
})
}
}
/// # Language generation model based on the Marian architecture for machine translation
pub struct MarianGenerator {
model: MarianForConditionalGeneration,
@ -914,12 +804,7 @@ impl MarianGenerator {
}
}
impl PrivateLanguageGenerator<MarianForConditionalGeneration, MarianVocab, MarianTokenizer>
for MarianGenerator
{
fn get_model(&self) -> &MarianForConditionalGeneration {
&self.model
}
impl PrivateLanguageGenerator for MarianGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -954,6 +839,50 @@ impl PrivateLanguageGenerator<MarianForConditionalGeneration, MarianVocab, Maria
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
cached_layer_states,
train,
),
Cache::None => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with Marian Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.decoder_output,
cache: Cache::BARTCache(base_model_output.cache),
})
}
fn prepare_scores_for_generation(
&self,
scores: &mut Tensor,
@ -976,7 +905,7 @@ impl PrivateLanguageGenerator<MarianForConditionalGeneration, MarianVocab, Maria
}
fn encode(&self, input_ids: &Tensor, attention_mask: Option<&Tensor>) -> Option<Tensor> {
Some(self.get_model().encode(input_ids, attention_mask))
Some(self.model.encode(input_ids, attention_mask))
}
fn prepare_inputs_for_generation<'a>(
@ -1086,7 +1015,4 @@ impl PrivateLanguageGenerator<MarianForConditionalGeneration, MarianVocab, Maria
}
}
impl LanguageGenerator<MarianForConditionalGeneration, MarianVocab, MarianTokenizer>
for MarianGenerator
{
}
impl LanguageGenerator for MarianGenerator {}


@ -2,7 +2,7 @@
//!
//! Implementation of the Marian language model ([Marian: Fast Neural Machine Translation in {C++}](http://www.aclweb.org/anthology/P18-4020) Junczys-Dowmunt, Grundkiewicz, Dwojak, Hoang, Heafield, Neckermann, Seide, Germann, Fikri Aji, Bogoychev, Martins, Birch, 2018).
//! The base model is implemented in the `bart_model::BartModel` struct. This model includes a language model head: `marian_model::MarianForConditionalGeneration`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!


@ -19,13 +19,10 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::pipelines::translation::Language;
use crate::{Activation, Config, RustBertError};
use rust_tokenizers::tokenizer::{MBart50Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::MBart50Vocab;
use rust_tokenizers::tokenizer::TruncationStrategy;
use serde::{Deserialize, Serialize};
use std::borrow::Borrow;
use std::collections::HashMap;
@ -717,111 +714,6 @@ impl MBartForSequenceClassification {
}
}
impl LMHeadModel for MBartForConditionalGeneration {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Unused for MBart
/// * `token_type_ids` - Unused for MBart
/// * `position_ids` - Unused for MBart
/// * `encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *hidden_size*). When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
/// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `BartCache` made of `Option<Vec<(Option<Vec<&LayerState, &LayerState>>)>>` of length *n_layer* containing the encoder past keys and values for
/// both the self attention and the encoder cross attention of each layer of the decoder.
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::pipelines::generation_utils::LMHeadModel;
/// use rust_bert::mbart::{MBartForConditionalGeneration, MBartConfig};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = MBartConfig::from_file(config_path);
/// # let mbart_model: MBartForConditionalGeneration = MBartForConditionalGeneration::new(&vs.root(), &config);
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
/// let encoder_attention_mask = Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
/// let decoder_attention_mask = Tensor::ones(&[batch_size, target_sequence_length], (Int64, device));
///
/// let model_output = no_grad(|| {
/// mbart_model
/// .forward_t(Some(&input_tensor),
/// Some(&encoder_attention_mask),
/// None,
/// Some(&target_tensor),
/// Some(&decoder_attention_mask),
/// None,
/// false)
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids,
encoder_outputs,
None,
cached_layer_states,
train,
),
Cache::None => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids,
encoder_outputs,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with MBART Model".into(),
));
}
};
let lm_logits = base_model_output
.decoder_output
.linear::<Tensor>(&self.base_model.embeddings.ws, None)
+ &self.final_logits_bias;
Ok(LMModelOutput {
lm_logits,
cache: Cache::BARTCache(base_model_output.cache),
})
}
}
/// Container holding a MBART model output
pub type MBartModelOutput = BartModelOutput;
@ -944,12 +836,7 @@ impl MBartGenerator {
}
}
impl PrivateLanguageGenerator<MBartForConditionalGeneration, MBart50Vocab, MBart50Tokenizer>
for MBartGenerator
{
fn get_model(&self) -> &MBartForConditionalGeneration {
&self.model
}
impl PrivateLanguageGenerator for MBartGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -981,6 +868,51 @@ impl PrivateLanguageGenerator<MBartForConditionalGeneration, MBart50Vocab, MBart
self.decoder_start_id
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
cached_layer_states,
train,
),
Cache::None => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with MBART Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.decoder_output,
cache: Cache::BARTCache(base_model_output.cache),
})
}
fn get_max_positions_embeddings(&self) -> i64 {
self.max_position_embeddings
}
@ -1002,7 +934,7 @@ impl PrivateLanguageGenerator<MBartForConditionalGeneration, MBart50Vocab, MBart
}
fn encode(&self, input_ids: &Tensor, attention_mask: Option<&Tensor>) -> Option<Tensor> {
Some(self.get_model().encode(input_ids, attention_mask))
Some(self.model.encode(input_ids, attention_mask))
}
fn prepare_inputs_for_generation<'a>(
@ -1111,10 +1043,7 @@ impl PrivateLanguageGenerator<MBartForConditionalGeneration, MBart50Vocab, MBart
}
}
impl LanguageGenerator<MBartForConditionalGeneration, MBart50Vocab, MBart50Tokenizer>
for MBartGenerator
{
}
impl LanguageGenerator for MBartGenerator {}
#[cfg(test)]
mod test {


@ -2,7 +2,7 @@
//!
//! Implementation of the MBart language model ([Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) Liu, Gu, Goyal, Li, Edunov, Ghazvininejad, Lewis, Zettlemoyer, 2020).
//! The base model is implemented in the `mbart_model::MBartModel` struct. The model also includes a language model head: `mbart_model::MBartForConditionalGeneration`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!
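As a rough sketch of driving MBart generation through the simplified trait (not code touched by this diff): the `MBartGenerator::new(GenerateConfig)` constructor shape and the `rust_bert::mbart` re-export are assumptions based on the generator pattern visible in this commit, and the default resources would have to be swapped for MBart50 ones before this could actually run.

```rust
use rust_bert::mbart::MBartGenerator;
use rust_bert::pipelines::generation_utils::{GenerateConfig, LanguageGenerator};

fn mbart_generation_sketch() -> anyhow::Result<()> {
    // NOTE: GenerateConfig::default() points at GPT-2 resources; supply MBart50
    // model/config/vocab resources here for a real run.
    let generate_config = GenerateConfig {
        num_beams: 4,
        early_stopping: true,
        ..Default::default()
    };
    let generator = MBartGenerator::new(generate_config)?;
    // `generate` now comes from the plain LanguageGenerator trait, with no
    // model/vocab/tokenizer type parameters to spell out.
    let _output = generator.generate(Some(&["UN Chief says there is no military solution in Syria."]), None);
    Ok(())
}
```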


@ -2,7 +2,7 @@
//!
//! Implementation of the GPT2 language model ([Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) Radford, Narasimhan, Salimans, Sutskever 2018).
//! The base model is implemented in the `openai_gpt_model::OpenAiGptModel` struct. The model also includes a language model head: `openai_gpt_model::OpenAIGPTLMHeadModel`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!
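A hedged sketch of OpenAI GPT text generation through the high-level pipeline, mirroring the integration tests updated later in this commit; only the `model_type` override is shown, and the model, config, vocab and merges resource overrides a real run needs are deliberately elided.

```rust
use rust_bert::pipelines::common::ModelType;
use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};

fn openai_gpt_generation_sketch() -> anyhow::Result<()> {
    // NOTE: the default TextGenerationConfig points at GPT-2 resources; an actual
    // OpenAI GPT run must also override the model/config/vocab/merges resources.
    let config = TextGenerationConfig {
        model_type: ModelType::OpenAiGpt,
        ..Default::default()
    };
    let model = TextGenerationModel::new(config)?;
    let _output = model.generate(&["It was a sunny day and"], None);
    Ok(())
}
```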


@ -19,12 +19,8 @@ use crate::gpt2::Gpt2Config;
use crate::openai_gpt::transformer::Block;
use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::PrivateLanguageGenerator;
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::OpenAiGptTokenizer;
use rust_tokenizers::vocab::OpenAiGptVocab;
use std::borrow::{Borrow, BorrowMut};
use tch::kind::Kind::Int64;
use tch::nn::embedding;
@ -326,9 +322,7 @@ impl OpenAIGPTLMHeadModel {
lm_head,
}
}
}
impl LMHeadModel for OpenAIGPTLMHeadModel {
/// Forward pass through the model
///
/// # Arguments
@ -362,7 +356,7 @@ impl LMHeadModel for OpenAIGPTLMHeadModel {
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::gpt2::Gpt2Config;
/// use rust_bert::openai_gpt::OpenAIGPTLMHeadModel;
/// use rust_bert::pipelines::generation_utils::{LMHeadModel, Cache};
/// use rust_bert::pipelines::generation_utils::Cache;
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
@ -388,7 +382,7 @@ impl LMHeadModel for OpenAIGPTLMHeadModel {
/// false).unwrap()
/// });
/// ```
fn forward_t(
pub fn forward_t(
&self,
input_ids: Option<&Tensor>,
_layer_past: Cache,
@ -531,12 +525,7 @@ impl OpenAIGenerator {
}
}
impl PrivateLanguageGenerator<OpenAIGPTLMHeadModel, OpenAiGptVocab, OpenAiGptTokenizer>
for OpenAIGenerator
{
fn get_model(&self) -> &OpenAIGPTLMHeadModel {
&self.model
}
impl PrivateLanguageGenerator for OpenAIGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -570,9 +559,31 @@ impl PrivateLanguageGenerator<OpenAIGPTLMHeadModel, OpenAiGptVocab, OpenAiGptTok
fn get_max_positions_embeddings(&self) -> i64 {
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
_layer_past: Cache,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
self.model.forward_t(
input_ids,
_layer_past,
attention_mask,
token_type_ids,
position_ids,
input_embeds,
_encoder_outputs,
_decoder_input_ids,
train,
)
}
}
impl LanguageGenerator<OpenAIGPTLMHeadModel, OpenAiGptVocab, OpenAiGptTokenizer>
for OpenAIGenerator
{
}
impl LanguageGenerator for OpenAIGenerator {}


@ -2,7 +2,7 @@
//!
//! Implementation of the Pegasus language model ([PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) Zhang, Zhao, Saleh, Liu, 2019).
//! The base model is implemented in the `pegasus_model::PegasusModel` struct and leverages an implementation that is broadly similar to BART. The model also includes a language model head: `pegasus_model::PegasusForConditionalGeneration`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!
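For reference, summarization models such as Pegasus are usually consumed through the summarization pipeline rather than by driving the generator directly; this sketch uses the pipeline defaults (a BART checkpoint) and assumes a Pegasus setup differs only in the resources supplied through the SummarizationConfig.

```rust
use rust_bert::pipelines::summarization::SummarizationModel;

fn summarization_sketch() -> anyhow::Result<()> {
    // Default configuration loads a BART CNN/DailyMail checkpoint; a Pegasus-based
    // pipeline follows the same pattern with Pegasus resources in the config.
    let model = SummarizationModel::new(Default::default())?;
    let input = ["The tower is 324 metres tall, about the same height as an 81-storey building, \
                  and is the tallest structure in Paris."];
    let _summaries = model.summarize(&input);
    Ok(())
}
```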


@ -20,12 +20,9 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::{PegasusTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::PegasusVocab;
use rust_tokenizers::tokenizer::TruncationStrategy;
use std::borrow::Borrow;
use tch::nn::{embedding, EmbeddingConfig, Init};
use tch::{nn, Tensor};
@ -430,120 +427,6 @@ impl PegasusForConditionalGeneration {
}
}
impl LMHeadModel for PegasusForConditionalGeneration {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions have value 1. If None, set to 1.
/// * `input_embeds` - Unused for Pegasus
/// * `token_type_ids` - Unused for Pegasus
/// * `position_ids` - Unused for Pegasus
/// * `encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *hidden_size*). When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
/// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `BartCache` made of `Option<Vec<(Option<Vec<&LayerState, &LayerState>>)>>` of length *n_layer* containing the encoder past keys and values for
/// both the self attention and the encoder cross attention of each layer of the decoder.
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::pipelines::generation_utils::LMHeadModel;
/// use rust_bert::pegasus::{PegasusForConditionalGeneration, PegasusConfig};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = PegasusConfig::from_file(config_path);
/// # let pegasus_model: PegasusForConditionalGeneration = PegasusForConditionalGeneration::new(&vs.root(), &config);
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
/// let encoder_attention_mask = Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
/// let decoder_attention_mask = Tensor::ones(&[batch_size, target_sequence_length], (Int64, device));
///
/// let model_output = no_grad(|| {
/// pegasus_model
/// .forward_t(Some(&input_tensor),
/// Some(&encoder_attention_mask),
/// None,
/// Some(&target_tensor),
/// Some(&decoder_attention_mask),
/// None,
/// false)
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids.ok_or_else(|| {
RustBertError::ValueError(
"Decoder input ids must be provided for Pegasus language models"
.to_string(),
)
})?,
encoder_outputs,
None,
cached_layer_states,
train,
),
Cache::None => self.base_model.forward_t(
input_ids,
attention_mask,
decoder_input_ids.ok_or_else(|| {
RustBertError::ValueError(
"Decoder input ids must be provided for Pegasus language models"
.to_string(),
)
})?,
encoder_outputs,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with Pegasus Model".into(),
));
}
};
let lm_logits = base_model_output
.decoder_output
.linear::<Tensor>(&self.base_model.embeddings.ws, None)
+ &self.final_logits_bias;
Ok(LMModelOutput {
lm_logits,
cache: Cache::BARTCache(base_model_output.cache),
})
}
}
/// # Language generation model based on the Pegasus architecture
pub struct PegasusConditionalGenerator {
model: PegasusForConditionalGeneration,
@ -666,12 +549,7 @@ impl PegasusConditionalGenerator {
}
}
impl PrivateLanguageGenerator<PegasusForConditionalGeneration, PegasusVocab, PegasusTokenizer>
for PegasusConditionalGenerator
{
fn get_model(&self) -> &PegasusForConditionalGeneration {
&self.model
}
impl PrivateLanguageGenerator for PegasusConditionalGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -706,6 +584,50 @@ impl PrivateLanguageGenerator<PegasusForConditionalGeneration, PegasusVocab, Peg
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::BARTCache(cached_layer_states) => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
cached_layer_states,
train,
),
Cache::None => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with Pegasus Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.decoder_output,
cache: Cache::BARTCache(base_model_output.cache),
})
}
fn prepare_scores_for_generation(
&self,
scores: &mut Tensor,
@ -721,7 +643,7 @@ impl PrivateLanguageGenerator<PegasusForConditionalGeneration, PegasusVocab, Peg
}
fn encode(&self, input_ids: &Tensor, attention_mask: Option<&Tensor>) -> Option<Tensor> {
Some(self.get_model().encode(input_ids, attention_mask))
Some(self.model.encode(input_ids, attention_mask))
}
fn prepare_inputs_for_generation<'a>(
@ -833,10 +755,7 @@ impl PrivateLanguageGenerator<PegasusForConditionalGeneration, PegasusVocab, Peg
}
}
impl LanguageGenerator<PegasusForConditionalGeneration, PegasusVocab, PegasusTokenizer>
for PegasusConditionalGenerator
{
}
impl LanguageGenerator for PegasusConditionalGenerator {}
/// Container holding a Pegasus model output. The decoder output may hold the hidden state of
/// the last layer of the decoder, or may hold logits for a custom head module after the


@ -66,13 +66,10 @@
//! # ;
//! ```
use rust_tokenizers::tokenizer::Tokenizer;
use rust_tokenizers::vocab::Vocab;
use tch::kind::Kind::Int64;
use tch::{no_grad, Device, Tensor};
use crate::bart::LayerState as BartLayerState;
use crate::common::error::RustBertError;
use crate::common::resources::ResourceProvider;
use crate::gpt_j::LayerState as GPTJLayerState;
use crate::gpt_neo::LayerState as GPTNeoLayerState;
@ -234,18 +231,18 @@ pub(crate) mod private_generation_utils {
use std::collections::HashMap;
use std::mem;
use rust_tokenizers::tokenizer::{truncate_sequences, Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::Vocab;
use rust_tokenizers::tokenizer::{truncate_sequences, TruncationStrategy};
use rust_tokenizers::TokenIdsWithOffsets;
use tch::{nn, Device, Kind, Tensor};
use crate::pipelines::common::TokenizerOption;
use crate::pipelines::generation_utils::{
BeamHypotheses, Cache, GenerateConfig, LMHeadModel, PrefixAllowedFunction,
BeamHypotheses, Cache, GenerateConfig, LMModelOutput, PrefixAllowedFunction,
};
use super::ordered_float::OrderedFloat;
use crate::common::kind::get_positive_infinity;
use crate::RustBertError;
pub struct InternalGenerateOptions<'a> {
pub min_length: i64,
@ -283,8 +280,7 @@ pub(crate) mod private_generation_utils {
pub token_scores: Option<Vec<Vec<f64>>>,
}
pub trait PrivateLanguageGenerator<T: LMHeadModel, V: Vocab, U: Tokenizer<V>> {
fn get_model(&self) -> &T;
pub trait PrivateLanguageGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption;
fn get_var_store(&self) -> &nn::VarStore;
fn get_var_store_mut(&mut self) -> &mut nn::VarStore;
@ -297,6 +293,19 @@ pub(crate) mod private_generation_utils {
fn get_decoder_start_id(&self) -> Option<i64>;
fn get_max_positions_embeddings(&self) -> i64;
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError>;
fn prepare_scores_for_generation(
&self,
_scores: &mut Tensor,
@ -778,7 +787,6 @@ pub(crate) mod private_generation_utils {
attention_mask.copy(),
);
let temp = self
.get_model()
.forward_t(
prepared_input.prepared_input.as_ref(),
prepared_input.prepared_past,
@ -1054,7 +1062,6 @@ pub(crate) mod private_generation_utils {
attention_mask.copy(),
);
let temp = self
.get_model()
.forward_t(
prepared_input.prepared_input.as_ref(),
prepared_input.prepared_past,
@ -1590,9 +1597,7 @@ macro_rules! unpack_config {
/// # Common trait for text generation models.
/// Main API for text generation
pub trait LanguageGenerator<T: LMHeadModel, V: Vocab, U: Tokenizer<V>>:
PrivateLanguageGenerator<T, V, U>
{
pub trait LanguageGenerator: PrivateLanguageGenerator {
/// Generate text based on a vector of prompt texts.
///
/// # Arguments
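To make the effect of the simplification concrete, a small sketch of downstream code (not from this commit): a helper that previously needed a `LanguageGenerator<GPT2LMHeadModel, Gpt2Vocab, Gpt2Tokenizer>` bound can now accept any generator behind a single, non-generic bound. The default GenerateConfig resolves to GPT-2 resources, so this particular example should be runnable as written, though the exact `generate` return type is version dependent and is therefore left unused.

```rust
use rust_bert::gpt2::GPT2Generator;
use rust_bert::pipelines::generation_utils::{GenerateConfig, LanguageGenerator};

// Before this change the bound would have read
// `G: LanguageGenerator<GPT2LMHeadModel, Gpt2Vocab, Gpt2Tokenizer>`.
fn run_prompts<G: LanguageGenerator>(generator: &G, prompts: &[&str]) {
    let _output = generator.generate(Some(prompts), None);
}

fn main() -> anyhow::Result<()> {
    // Downloads the default GPT-2 resources on first use.
    let generator = GPT2Generator::new(GenerateConfig::default())?;
    run_prompts(&generator, &["The quick brown fox"]);
    Ok(())
}
```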
@ -2255,93 +2260,6 @@ impl BeamHypotheses {
}
}
/// # Language Model trait
/// Shared trait between language generation models (e.g. GPT2, GPT, BART) used in language generation pipelines.
pub trait LMHeadModel {
/// Forward pass through the model. Example provided for GPT2.
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of size *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*). When provided, these are concatenated with the current input keys and values.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions have value 1. If None, set to 1.
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
/// * `token_type_ids` - Optional token type ids used to indicate the portion of the input the token belongs to. If not None, token type embeddings will be added to the token and position embeddings.
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented starting from the length of the past input.
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
/// # Returns
///
/// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// * `past` - `Option<Vec<Tensor>>` of length *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*)
/// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
/// * `attentions` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::gpt2::{GPT2LMHeadModel, Gpt2Config};
/// use rust_bert::pipelines::generation_utils::{Cache, LMHeadModel};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = Gpt2Config::from_file(config_path);
/// # let mut gpt2_model: GPT2LMHeadModel = GPT2LMHeadModel::new(&vs.root(), &config);
/// let (batch_size, sequence_length, past_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
/// let mut past: Vec<Tensor> = Vec::with_capacity(config.n_layer as usize);
/// for _ in 0..config.n_layer as usize {
/// past.push(Tensor::rand(
/// &[
/// 2,
/// batch_size,
/// config.n_head,
/// past_sequence_length,
/// config.n_embd / config.n_head,
/// ],
/// (Double, device),
/// ))
/// }
/// let attention_mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
/// let token_type_ids = Tensor::ones(&[batch_size, sequence_length], (Int64, device));
/// let position_ids = Tensor::arange(sequence_length, (Int64, device))
/// .expand(&[batch_size, sequence_length], true);
///
/// let model_output = no_grad(|| {
/// gpt2_model
/// .forward_t(
/// Some(&input_tensor),
/// Cache::GPT2Cache(Some(past)),
/// Some(&attention_mask),
/// Some(&token_type_ids),
/// Some(&position_ids),
/// None,
/// None,
/// None,
/// false,
/// )
/// .unwrap()
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError>;
}
/// Container holding a language model output for generation tasks
pub struct LMModelOutput {
/// Logits for each vocab item and position


@ -2,7 +2,7 @@
//!
//! Implementation of the ProphetNet language model ([ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) Qi, Yan, Gong, Liu, Duan, Chen, Zhang, Zhou, 2020).
//! The base model is implemented in the `prophetnet_model::ProphetNetModel` struct. Two language model heads have also been implemented:
//! - Conditional language generation (encoder-decoder architecture): `prophetnet_model::ProphetNetForConditionalGeneration` implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information)
//! - Conditional language generation (encoder-decoder architecture): `prophetnet_model::ProphetNetForConditionalGeneration` implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information)
//! - Causal language generation (decoder architecture): `prophetnet_model::ProphetNetForCausalGeneration`
//!
//! # Model set-up and pre-trained weights loading
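A hedged sketch of conditional generation with ProphetNet through the simplified trait; the `ProphetNetConditionalGenerator::new(GenerateConfig)` constructor and module path are assumed from the common generator pattern in this commit, and real ProphetNet resources would have to replace the defaults.

```rust
use rust_bert::pipelines::generation_utils::{GenerateConfig, LanguageGenerator};
use rust_bert::prophetnet::ProphetNetConditionalGenerator;

fn prophetnet_generation_sketch() -> anyhow::Result<()> {
    // NOTE: GenerateConfig::default() points at GPT-2 resources; replace them with
    // ProphetNet resources for a real run.
    let config = GenerateConfig {
        num_beams: 4,
        no_repeat_ngram_size: 3,
        ..Default::default()
    };
    let generator = ProphetNetConditionalGenerator::new(config)?;
    let _summary = generator.generate(Some(&["Article text to condense into a short summary."]), None);
    Ok(())
}
```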


@ -13,8 +13,7 @@
use std::borrow::Borrow;
use std::collections::HashMap;
use rust_tokenizers::tokenizer::{ProphetNetTokenizer, TruncationStrategy};
use rust_tokenizers::vocab::ProphetNetVocab;
use rust_tokenizers::tokenizer::TruncationStrategy;
use serde::{Deserialize, Serialize};
use tch::{nn, Kind, Tensor};
@ -22,9 +21,7 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::prophetnet::attention::LayerState;
use crate::prophetnet::decoder::ProphetNetDecoder;
use crate::prophetnet::encoder::ProphetNetEncoder;
@ -585,56 +582,6 @@ impl ProphetNetForConditionalGeneration {
}
}
impl LMHeadModel for ProphetNetForConditionalGeneration {
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::ProphetNetCache(cached_layer_states) => self.forward_t(
input_ids,
attention_mask,
input_embeds,
decoder_input_ids,
None,
encoder_outputs,
cached_layer_states,
None,
train,
)?,
Cache::None => self.forward_t(
input_ids,
attention_mask,
input_embeds,
decoder_input_ids,
None,
encoder_outputs,
None,
None,
train,
)?,
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with ProphetNet Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.logits,
cache: Cache::ProphetNetCache(base_model_output.next_decoder_cache),
})
}
}
/// # ProphetNet Model for causal generation
/// ProphetNet decoder with a vocabulary decoding head
/// It is made of the following blocks:
@ -992,16 +939,7 @@ impl ProphetNetConditionalGenerator {
}
}
impl
PrivateLanguageGenerator<
ProphetNetForConditionalGeneration,
ProphetNetVocab,
ProphetNetTokenizer,
> for ProphetNetConditionalGenerator
{
fn get_model(&self) -> &ProphetNetForConditionalGeneration {
&self.model
}
impl PrivateLanguageGenerator for ProphetNetConditionalGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -1036,9 +974,57 @@ impl
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::ProphetNetCache(cached_layer_states) => self.model.forward_t(
input_ids,
attention_mask,
input_embeds,
decoder_input_ids,
None,
encoder_outputs,
cached_layer_states,
None,
train,
)?,
Cache::None => self.model.forward_t(
input_ids,
attention_mask,
input_embeds,
decoder_input_ids,
None,
encoder_outputs,
None,
None,
train,
)?,
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with ProphetNet Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.logits,
cache: Cache::ProphetNetCache(base_model_output.next_decoder_cache),
})
}
fn encode(&self, input_ids: &Tensor, attention_mask: Option<&Tensor>) -> Option<Tensor> {
Some(
self.get_model()
self.model
.encode(Some(input_ids), attention_mask, None)
.unwrap(),
)
@ -1150,7 +1136,4 @@ impl
}
}
impl LanguageGenerator<ProphetNetForConditionalGeneration, ProphetNetVocab, ProphetNetTokenizer>
for ProphetNetConditionalGenerator
{
}
impl LanguageGenerator for ProphetNetConditionalGenerator {}


@ -2,7 +2,7 @@
//!
//! Implementation of the Reformer language model ([Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) Kitaev, kaiser, Levskaya, 2020).
//! The base model is implemented in the `reformer_model::ReformerModel` struct. The model also includes a language model head: `reformer_model::ReformerModelWithLMHead`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!
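A brief, hedged sketch of sampling-based generation with the Reformer generator; as with the other sketches, the constructor shape is assumed from the common generator pattern and the default resources stand in for actual Reformer weights.

```rust
use rust_bert::pipelines::generation_utils::{GenerateConfig, LanguageGenerator};
use rust_bert::reformer::ReformerGenerator;

fn reformer_generation_sketch() -> anyhow::Result<()> {
    // NOTE: GenerateConfig::default() points at GPT-2 resources; replace them with
    // Reformer resources (e.g. the crime-and-punishment checkpoint) for a real run.
    let config = GenerateConfig {
        do_sample: true,
        top_p: 0.95,
        ..Default::default()
    };
    let generator = ReformerGenerator::new(config)?;
    let _output = generator.generate(Some(&["It was a dark and stormy night."]), None);
    Ok(())
}
```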


@ -14,8 +14,6 @@
use std::borrow::Borrow;
use std::collections::HashMap;
use rust_tokenizers::tokenizer::ReformerTokenizer;
use rust_tokenizers::vocab::ReformerVocab;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use tch::{nn, Device, Kind, Tensor};
@ -27,9 +25,7 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::reformer::attention::{AttentionType, LayerState};
use crate::reformer::attention_utils::{get_least_common_mult_chunk_len, get_min_chunk_len};
use crate::reformer::embeddings::ReformerEmbeddings;
@ -649,44 +645,6 @@ impl ReformerModelWithLMHead {
}
}
impl LMHeadModel for ReformerModelWithLMHead {
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let output = match cache {
Cache::ReformerCache(cached_layer_states) => self.forward_t(
input_ids,
None,
None,
attention_mask,
None,
cached_layer_states,
train,
),
Cache::None => self.forward_t(input_ids, None, None, attention_mask, None, None, train),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with Reformer Model".into(),
));
}
}?;
Ok(LMModelOutput {
lm_logits: output.logits,
cache: Cache::ReformerCache(output.next_cache),
})
}
}
pub struct ReformerClassificationHead {
dense: nn::Linear,
dropout: Dropout,
@ -1119,12 +1077,7 @@ impl ReformerGenerator {
}
}
impl PrivateLanguageGenerator<ReformerModelWithLMHead, ReformerVocab, ReformerTokenizer>
for ReformerGenerator
{
fn get_model(&self) -> &ReformerModelWithLMHead {
&self.model
}
impl PrivateLanguageGenerator for ReformerGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -1159,6 +1112,45 @@ impl PrivateLanguageGenerator<ReformerModelWithLMHead, ReformerVocab, ReformerTo
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let output = match cache {
Cache::ReformerCache(cached_layer_states) => self.model.forward_t(
input_ids,
None,
None,
attention_mask,
None,
cached_layer_states,
train,
),
Cache::None => {
self.model
.forward_t(input_ids, None, None, attention_mask, None, None, train)
}
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with Reformer Model".into(),
));
}
}?;
Ok(LMModelOutput {
lm_logits: output.logits,
cache: Cache::ReformerCache(output.next_cache),
})
}
fn prepare_inputs_for_generation<'a>(
&self,
input_ids: Tensor,
@ -1213,7 +1205,4 @@ impl PrivateLanguageGenerator<ReformerModelWithLMHead, ReformerVocab, ReformerTo
}
}
impl LanguageGenerator<ReformerModelWithLMHead, ReformerVocab, ReformerTokenizer>
for ReformerGenerator
{
}
impl LanguageGenerator for ReformerGenerator {}


@ -2,7 +2,7 @@
//!
//! Implementation of the T5 language model ([Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) Raffel, Shazeer, Roberts, Lee, Narang, Matena, Zhou, Li, Liu, 2019).
//! The base model is implemented in the `t5_model::T5Model` struct. This model includes a language model head: `t5_model::T5ForConditionalGeneration`
//! implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information).
//! implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information).
//!
//! # Model set-up and pre-trained weights loading
//!
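A hedged sketch of T5 generation, where the task is selected through a text prefix on the prompt; the `T5Generator::new(GenerateConfig)` constructor and re-export path are assumptions, and T5 resources would need to be supplied in place of the defaults.

```rust
use rust_bert::pipelines::generation_utils::{GenerateConfig, LanguageGenerator};
use rust_bert::t5::T5Generator;

fn t5_generation_sketch() -> anyhow::Result<()> {
    // NOTE: GenerateConfig::default() points at GPT-2 resources; replace them with
    // T5 resources for a real run.
    let generator = T5Generator::new(GenerateConfig::default())?;
    // T5 selects its task through a textual prefix on the prompt itself.
    let _output = generator.generate(
        Some(&["translate English to German: The house is wonderful."]),
        None,
    );
    Ok(())
}
```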


@ -12,8 +12,7 @@
use std::borrow::Borrow;
use rust_tokenizers::tokenizer::{T5Tokenizer, TruncationStrategy};
use rust_tokenizers::vocab::T5Vocab;
use rust_tokenizers::tokenizer::TruncationStrategy;
use serde::{Deserialize, Serialize};
use tch::nn::{embedding, LinearConfig};
use tch::{nn, Tensor};
@ -22,9 +21,7 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::pipelines::translation::Language;
use crate::t5::attention::LayerState;
use crate::t5::encoder::T5Stack;
@ -622,124 +619,6 @@ impl T5ForConditionalGeneration {
}
}
impl LMHeadModel for T5ForConditionalGeneration {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of length `num_layers` containing tuples of optional `LayerStates` containing the last calculated key and value pairs for the decoder. This avoids recomputing attention weights at past positions and speeds up decoding.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions have value 1. If None, set to 1.
/// * `input_embeds` - Unused for T5
/// * `token_type_ids` - Unused for T5
/// * `position_ids` - Unused for T5
/// * `encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *hidden_size*). When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
/// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*).
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `T5Cache` made of `Option<Vec<(Option<Vec<&LayerState, &LayerState>>)>>` of length *n_layer* containing the encoder past keys and values for
/// both the self attention and the encoder cross attention of each layer of the decoder.
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::t5::{T5Config, T5ForConditionalGeneration};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = T5Config::from_file(config_path);
/// # let t5_model: T5ForConditionalGeneration = T5ForConditionalGeneration::new(&vs.root(), &config);
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
/// let encoder_attention_mask =
/// Tensor::ones(&[batch_size, source_sequence_length], (Int64, device));
/// let decoder_attention_mask =
/// Tensor::ones(&[batch_size, target_sequence_length], (Int64, device));
///
/// let model_output = no_grad(|| {
/// t5_model.forward_t(
/// Some(&input_tensor),
/// Some(&encoder_attention_mask),
/// None,
/// Some(&target_tensor),
/// Some(&decoder_attention_mask),
/// None,
/// None,
/// None,
/// false,
/// )
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::T5Cache(cached_layer_states) => self.base_model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
None,
cached_layer_states,
train,
),
Cache::None => self.base_model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with T5 Model".into(),
));
}
};
let lm_logits = if self.tie_word_embeddings {
base_model_output
.decoder_output
.linear::<Tensor>(&self.base_model.embeddings.ws, None)
* (self.model_dim.powf(-0.5))
} else {
base_model_output
.decoder_output
.apply(self.lm_head.as_ref().unwrap())
};
Ok(LMModelOutput {
lm_logits,
cache: Cache::T5Cache(base_model_output.next_cache),
})
}
}
/// # T5 for sentence embeddings
/// Transformer usable in [`SentenceEmbeddingsModel`](crate::pipelines::sentence_embeddings::SentenceEmbeddingsModel).
pub struct T5ForSentenceEmbeddings {
@ -912,10 +791,7 @@ impl T5Generator {
}
}
impl PrivateLanguageGenerator<T5ForConditionalGeneration, T5Vocab, T5Tokenizer> for T5Generator {
fn get_model(&self) -> &T5ForConditionalGeneration {
&self.model
}
impl PrivateLanguageGenerator for T5Generator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -949,9 +825,55 @@ impl PrivateLanguageGenerator<T5ForConditionalGeneration, T5Vocab, T5Tokenizer>
fn get_max_positions_embeddings(&self) -> i64 {
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
cache: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match cache {
Cache::T5Cache(cached_layer_states) => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
None,
cached_layer_states,
train,
),
Cache::None => self.model.forward_t(
input_ids,
attention_mask,
encoder_outputs,
decoder_input_ids,
None,
None,
None,
None,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with T5 Model".into(),
));
}
};
Ok(LMModelOutput {
lm_logits: base_model_output.decoder_output,
cache: Cache::T5Cache(base_model_output.next_cache),
})
}
fn encode(&self, input_ids: &Tensor, attention_mask: Option<&Tensor>) -> Option<Tensor> {
Some(self.get_model().encode(input_ids, attention_mask))
Some(self.model.encode(input_ids, attention_mask))
}
fn prepare_inputs_for_generation<'a>(
@ -1059,4 +981,4 @@ impl PrivateLanguageGenerator<T5ForConditionalGeneration, T5Vocab, T5Tokenizer>
}
}
impl LanguageGenerator<T5ForConditionalGeneration, T5Vocab, T5Tokenizer> for T5Generator {}
impl LanguageGenerator for T5Generator {}


@ -2,7 +2,7 @@
//!
//! Implementation of the XLNet language model ([Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) Yang, Dai, Yang, Carbonell, Salakhutdinov, Le, 2019).
//! The base model is implemented in the `xlnet_model::XLNetModel` struct. Several language model heads have also been implemented, including:
//! - Language generation: `xlnet_model::XLNetLMHeadModel` implementing the common `generation_utils::LMHeadModel` trait shared between the models used for generation (see `pipelines` for more information)
//! - Language generation: `xlnet_model::XLNetLMHeadModel` implementing the common `generation_utils::LanguageGenerator` trait shared between the models used for generation (see `pipelines` for more information)
//! - Multiple choices: `xlnet_model:XLNetForMultipleChoice`
//! - Question answering: `xlnet_model::XLNetForQuestionAnswering`
//! - Sequence classification: `xlnet_model::XLNetForSequenceClassification`
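A short, hedged sketch of free-form generation with the XLNet generator through the simplified trait; the constructor shape and module path are assumed, and the default resources are placeholders for actual XLNet weights.

```rust
use rust_bert::pipelines::generation_utils::{GenerateConfig, LanguageGenerator};
use rust_bert::xlnet::XLNetGenerator;

fn xlnet_generation_sketch() -> anyhow::Result<()> {
    // NOTE: GenerateConfig::default() points at GPT-2 resources; replace them with
    // XLNet resources for a real run.
    let config = GenerateConfig {
        do_sample: false,
        num_beams: 3,
        ..Default::default()
    };
    let generator = XLNetGenerator::new(config)?;
    let _output = generator.generate(Some(&["Once upon a time,"]), None);
    Ok(())
}
```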


@ -19,14 +19,10 @@ use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::pipelines::generation_utils::{Cache, GenerateConfig, LMModelOutput, LanguageGenerator};
use crate::xlnet::attention::LayerState;
use crate::xlnet::encoder::XLNetLayer;
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::XLNetTokenizer;
use rust_tokenizers::vocab::XLNetVocab;
use serde::{Deserialize, Serialize};
use std::borrow::{Borrow, BorrowMut};
use std::collections::HashMap;
@ -791,102 +787,6 @@ impl XLNetLMHeadModel {
}
}
impl LMHeadModel for XLNetLMHeadModel {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). This or `input_embeds` must be provided.
/// * `attention_mask` - Optional attention mask of shape (*batch size*, *sequence_length*) for the encoder positions. Positions with a mask value of 0 will be masked.
/// * `perm_mask` - Optional tensor of shape (*batch size*, *sequence_length*, *sequence_length*). Mask to indicate the attention pattern for each input token (only used for pre-training over permutations, rather than simple token masking).
/// * `target_mapping` - Optional tensor of shape (*batch size*, *num_tokens*, *sequence_length*) indicating the position of the masked words to predict.
/// * `token_type_ids` - Optional tensor (*batch size*, *sequence_length*) indicating the sentence ID of the token (0: first sentence, 1: second sentence).
/// * `input_embeds` - Optional input tensor of shape (*batch size*, *sequence_length*, *embeddings dimension*). This or `input_ids` must be provided.
/// * `old_layer_states` - Optional vector of length `num_layers` containing optional `LayerStates` containing the last calculated content for the attention layers. This avoids recomputing attention weights at past positions and speeds up decoding.
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `XLNetCache` made of `Option<Vec<Option<LayerState>>>` of length *n_layers* and shape (*past_sequence_length*, *batch size*, *hidden_size*) containing the previous content
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad, Kind};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::xlnet::{XLNetConfig, XLNetLMHeadModel};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = XLNetConfig::from_file(config_path);
/// # let xlnet_model: XLNetLMHeadModel = XLNetLMHeadModel::new(&vs.root(), &config);
/// let (batch_size, sequence_length) = (64, 128);
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
/// let attention_mask = Tensor::ones(&[batch_size, sequence_length], (Int64, device));
/// let target_tensor = Tensor::ones(&[batch_size, sequence_length], (Int64, device));
/// let target_mapping = Tensor::zeros(&[64, 1, 128], (Kind::Float, device));
/// let _ = target_mapping.narrow(2, 3, 1).fill_(1.0);
///
/// let model_output = no_grad(|| {
/// xlnet_model.forward_t(
/// Some(&input_tensor),
/// Some(&attention_mask),
/// None,
/// Some(&target_mapping),
/// None,
/// None,
/// None,
/// false,
/// )
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
match layer_past {
Cache::XLNetCache(layer_past) => self.forward_t(
input_ids,
None,
layer_past,
attention_mask,
// For XLNet the decoder_input_ids are used as a placeholder for the target mapping
decoder_input_ids,
None,
None,
train,
),
Cache::None => self.forward_t(
input_ids,
None,
None,
attention_mask,
// For XLNet the decoder_input_ids are used as a placeholder for the target mapping
decoder_input_ids,
None,
None,
train,
),
_ => Err(RustBertError::ValueError(
"Cache not compatible with XLNet Model".into(),
)),
}
}
}
/// # XLNetForSequenceClassification
/// XLNet model with a classification head for sequence classification tasks
/// It is made of the following blocks:
@ -1684,10 +1584,7 @@ impl XLNetGenerator {
}
}
impl PrivateLanguageGenerator<XLNetLMHeadModel, XLNetVocab, XLNetTokenizer> for XLNetGenerator {
fn get_model(&self) -> &XLNetLMHeadModel {
&self.model
}
impl PrivateLanguageGenerator for XLNetGenerator {
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
@ -1723,6 +1620,47 @@ impl PrivateLanguageGenerator<XLNetLMHeadModel, XLNetVocab, XLNetTokenizer> for
self.max_position_embeddings
}
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
_token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
_input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
match layer_past {
Cache::XLNetCache(layer_past) => self.model.forward_t(
input_ids,
None,
layer_past,
attention_mask,
// For XLNet the decoder_input_ids are used as a placeholder for the target mapping
decoder_input_ids,
None,
None,
train,
),
Cache::None => self.model.forward_t(
input_ids,
None,
None,
attention_mask,
// For XLNet the decoder_input_ids are used as a placeholder for the target mapping
decoder_input_ids,
None,
None,
train,
),
_ => Err(RustBertError::ValueError(
"Cache not compatible with XLNet Model".into(),
)),
}
}
fn prepare_inputs_for_generation<'a>(
&self,
input_ids: Tensor,
@ -1842,4 +1780,4 @@ impl PrivateLanguageGenerator<XLNetLMHeadModel, XLNetVocab, XLNetTokenizer> for
}
}
impl LanguageGenerator<XLNetLMHeadModel, XLNetVocab, XLNetTokenizer> for XLNetGenerator {}
impl LanguageGenerator for XLNetGenerator {}


@ -2,7 +2,7 @@ use rust_bert::gpt2::{
GPT2LMHeadModel, Gpt2Config, Gpt2ConfigResources, Gpt2MergesResources, Gpt2ModelResources,
Gpt2VocabResources,
};
use rust_bert::pipelines::generation_utils::{Cache, LMHeadModel};
use rust_bert::pipelines::generation_utils::Cache;
use rust_bert::resources::{RemoteResource, ResourceProvider};
use rust_bert::Config;
use rust_tokenizers::tokenizer::{Gpt2Tokenizer, Tokenizer, TruncationStrategy};
@ -61,17 +61,7 @@ fn distilgpt2_lm_model() -> anyhow::Result<()> {
// Forward pass
let model_output = gpt2_model
.forward_t(
Some(&input_tensor),
Cache::None,
None,
None,
None,
None,
None,
None,
false,
)
.forward_t(Some(&input_tensor), None, None, None, None, None, false)
.unwrap();
let next_word_id = model_output


@ -7,7 +7,7 @@ use rust_bert::pipelines::conversation::{
ConversationConfig, ConversationManager, ConversationModel,
};
use rust_bert::pipelines::generation_utils::{
Cache, GenerateConfig, GenerateOptions, LMHeadModel, LanguageGenerator,
Cache, GenerateConfig, GenerateOptions, LanguageGenerator,
};
use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};
use rust_bert::resources::{RemoteResource, ResourceProvider};
@ -59,17 +59,7 @@ fn gpt2_lm_model() -> anyhow::Result<()> {
// Forward pass
let model_output = gpt2_model
.forward_t(
Some(&input_tensor),
Cache::None,
None,
None,
None,
None,
None,
None,
false,
)
.forward_t(Some(&input_tensor), None, None, None, None, None, false)
.unwrap();
let next_word_id = model_output


@ -2,7 +2,7 @@ use rust_bert::gpt_j::{
GptJConfig, GptJConfigResources, GptJLMHeadModel, GptJMergesResources, GptJModelResources,
GptJVocabResources,
};
use rust_bert::pipelines::generation_utils::{Cache, LMHeadModel};
use rust_bert::pipelines::generation_utils::Cache;
use rust_bert::resources::{RemoteResource, ResourceProvider};
use rust_bert::Config;
use rust_tokenizers::tokenizer::{Gpt2Tokenizer, Tokenizer};


@ -3,7 +3,7 @@ use rust_bert::openai_gpt::{
OpenAiGptModelResources, OpenAiGptVocabResources,
};
use rust_bert::pipelines::common::ModelType;
use rust_bert::pipelines::generation_utils::{Cache, LMHeadModel};
use rust_bert::pipelines::generation_utils::Cache;
use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};
use rust_bert::resources::{RemoteResource, ResourceProvider};
use rust_bert::Config;