Merge pull request #54 from guillaume-be/albert_implementation

Albert implementation
2024-10-05 16:47:24 +03:00 · 2020-06-22 21:36:09 +02:00 · 2020-06-22 21:36:09 +02:00 · 0624a5368c
commit 0624a5368c
parent 7a86436f38 b5b6f68410
14 changed files with 1747 additions and 27 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -8,7 +8,7 @@ repository = "https://github.com/guillaume-be/rust-bert"
 documentation = "https://docs.rs/rust-bert"
 license = "Apache-2.0"
 readme = "README.md"
-keywords = ["nlp", "deep-learning", "machine-learning", "bert", "transformers"]
+keywords = ["nlp", "deep-learning", "machine-learning", "bert", "transformers", "summarization", "translation", "NER", "classification", "language", "sentiment-analysis", "question-answering"]

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

@ -30,7 +30,7 @@ all-tests = []
 features = [ "doc-only" ]

 [dependencies]
-rust_tokenizers = "~3.1.2"
+rust_tokenizers = "~3.1.4"
 tch = "~0.1.7"
 serde_json = "1.0.51"
 serde = {version = "1.0.106", features = ["derive"]}
--- a/README.md
+++ b/README.md
@ -10,17 +10,17 @@ This repository exposes the model base architecture, task-specific heads (see be

 The following models are currently implemented:

- | |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|
-:-----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:----:
-Masked LM|✅ |✅ |✅ | | | |✅| |
-Sequence classification|✅ |✅ |✅| | | | | |
-Token classification|✅ |✅ | ✅| | | |✅| |
-Question answering|✅ |✅ |✅| | | | | |
-Multiple choices| |✅ |✅| | | | | |
-Next token prediction| | | |✅|✅|✅| | |
-Natural Language Generation| | | |✅|✅|✅| | |
-Summarization | | | | | |✅| | |
-Translation | | | | | |✅| |✅ |
+ | |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|**ALBERT**|
+:-----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:----:|:----:
+Masked LM|✅ |✅ |✅ | | | |✅| |✅ |
+Sequence classification|✅ |✅ |✅| | | | | |✅ |
+Token classification|✅ |✅ | ✅| | | |✅| |✅ |
+Question answering|✅ |✅ |✅| | | | | |✅ |
+Multiple choices| |✅ |✅| | | | | |✅ |
+Next token prediction| | | |✅|✅|✅| | | |
+Natural Language Generation| | | |✅|✅|✅| | | |
+Summarization | | | | | |✅| | | |
+Translation | | | | | |✅| |✅ | |

 ## Ready-to-use pipelines

--- a/examples/albert.rs
+++ b/examples/albert.rs
@ -0,0 +1,77 @@
+// Copyright 2018 Google AI and Google Brain team.
+// Copyright 2020-present, the HuggingFace Inc. team.
+// Copyright 2020 Guillaume Becquin
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+extern crate failure;
+
+use tch::{Device, nn, Tensor, no_grad};
+use rust_tokenizers::{AlbertTokenizer, TruncationStrategy, Tokenizer, Vocab};
+use rust_bert::Config;
+use rust_bert::resources::{Resource, download_resource, RemoteResource};
+use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM, AlbertConfigResources, AlbertVocabResources, AlbertModelResources};
+
+
+fn main() -> failure::Fallible<()> {
+    // Remote resources for ALBERT base v2: configuration, vocabulary and weights
+    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
+    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
+    let weights_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2));
+    let config_path = download_resource(&config_resource)?;
+    let vocab_path = download_resource(&vocab_resource)?;
+    let weights_path = download_resource(&weights_resource)?;
+
+    // Set up the masked LM model on CPU, then load the weights into its variable store
+    let device = Device::Cpu;
+    let mut vs = nn::VarStore::new(device);
+    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
+    let config = AlbertConfig::from_file(config_path);
+    let albert_model = AlbertForMaskedLM::new(&vs.root(), &config);
+    vs.load(weights_path)?;
+
+    // Encode the two sentences and right-pad every sequence with id 0 up to the longest one
+    let sentences = ["Looks like one [MASK] is missing", "It was a very nice and [MASK] day"];
+    let encoded = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
+    let longest = encoded.iter().map(|entry| entry.token_ids.len()).max().unwrap();
+    let mut rows = Vec::with_capacity(encoded.len());
+    for entry in encoded.iter() {
+        let mut ids = entry.token_ids.clone();
+        ids.resize(longest, 0);
+        rows.push(Tensor::of_slice(&ids));
+    }
+    let input_tensor = Tensor::stack(rows.as_slice(), 0).to(device);
+
+    // Forward pass without gradient tracking; only the prediction scores are used
+    let (output, _, _) = no_grad(|| {
+        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
+    });
+    println!("{:?}", output.double_value(&[0, 0, 0]));
+
+    // Highest-scoring vocabulary id at the masked position of each sentence (positions 4 and 7)
+    let token_id_1 = output.get(0).get(4).argmax(0, false).int64_value(&[]);
+    let token_id_2 = output.get(1).get(7).argmax(0, false).int64_value(&[]);
+    let word_1 = tokenizer.vocab().id_to_token(&token_id_1);
+    let word_2 = tokenizer.vocab().id_to_token(&token_id_2);
+
+    println!("{} - {}", token_id_1, word_1); // Outputs "_them" : "Looks like one [them] is missing"
+    println!("{} - {}", token_id_2, word_2); // Outputs "_enjoyable" : "It was a very nice and [enjoyable] day"
+
+    Ok(())
+}
--- a/examples/download_all_dependencies.rs
+++ b/examples/download_all_dependencies.rs
@ -8,6 +8,7 @@ use rust_bert::bert::{BertConfigResources, BertVocabResources, BertModelResource
 use rust_bert::bart::{BartConfigResources, BartVocabResources, BartMergesResources, BartModelResources};
 use rust_bert::resources::{Resource, download_resource, RemoteResource};
 use rust_bert::electra::{ElectraConfigResources, ElectraVocabResources, ElectraModelResources};
+use rust_bert::albert::{AlbertConfigResources, AlbertVocabResources, AlbertModelResources};

 /// This example downloads and caches all dependencies used in model tests. This allows for safe
 /// multi threaded testing (two test using the same resource would otherwise download the file to
@ -169,6 +170,17 @@ fn download_electra_discriminator() -> failure::Fallible<()> {
    Ok(())
 }

+fn download_albert_base_v2() -> failure::Fallible<()> {
+    // Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
+    let resources = [
+        Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2)),
+        Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2)),
+        Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2)),
+    ];
+    // Download in the order config -> vocab -> weights; the first failure is returned to the caller
+    for resource in resources.iter() {
+        download_resource(resource)?;
+    }
+    Ok(())
+}
+
 fn main() -> failure::Fallible<()> {
    let _ = download_distil_gpt2();
    let _ = download_distilbert_sst2();
@ -183,6 +195,7 @@ fn main() -> failure::Fallible<()> {
    let _ = download_bart_cnn();
    let _ = download_electra_generator();
    let _ = download_electra_discriminator();
+    let _ = download_albert_base_v2();

    Ok(())
 }
--- a/src/albert/albert.rs
+++ b/src/albert/albert.rs
@ -0,0 +1,830 @@
+// Copyright 2018 Google AI and Google Brain team.
+// Copyright 2020-present, the HuggingFace Inc. team.
+// Copyright 2020 Guillaume Becquin
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+use std::collections::HashMap;
+use crate::Config;
+use serde::{Deserialize, Serialize};
+use crate::albert::embeddings::AlbertEmbeddings;
+use crate::albert::encoder::AlbertTransformer;
+use tch::{nn, Tensor, Kind};
+use crate::common::activations::{_tanh, _gelu_new, _gelu, _relu, _mish};
+use tch::nn::Module;
+use crate::common::dropout::Dropout;
+
+/// # ALBERT Pretrained model weight files
+pub struct AlbertModelResources;
+
+/// # ALBERT Pretrained model config files
+pub struct AlbertConfigResources;
+
+/// # ALBERT Pretrained model vocab files
+pub struct AlbertVocabResources;
+
+impl AlbertModelResources {
+    /// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format. The second element is the download URL of the weights; the first is presumably the local cache path (confirm against `RemoteResource`).
+    pub const ALBERT_BASE_V2: (&'static str, &'static str) = ("albert-base-v2/model.ot", "https://cdn.huggingface.co/albert-base-v2/rust_model.ot");
+}
+
+impl AlbertConfigResources {
+    /// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format. The second element is the download URL of the configuration; the first is presumably the local cache path (confirm against `RemoteResource`).
+    pub const ALBERT_BASE_V2: (&'static str, &'static str) = ("albert-base-v2/config.json", "https://cdn.huggingface.co/albert-base-v2-config.json");
+}
+
+impl AlbertVocabResources {
+    /// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format. The second element is the download URL of the SentencePiece model; the first is presumably the local cache path (confirm against `RemoteResource`).
+    pub const ALBERT_BASE_V2: (&'static str, &'static str) = ("albert-base-v2/spiece.model", "https://cdn.huggingface.co/albert-base-v2-spiece.model");
+}
+
+
+#[allow(non_camel_case_types)] // variants keep the lowercase names serde (de)serializes them under
+#[derive(Clone, Debug, Serialize, Deserialize)]
+/// # Activation function used in the attention layer and masked language model head
+pub enum Activation {
+    /// Gaussian Error Linear Unit ([Hendrycks et al., 2016,](https://arxiv.org/abs/1606.08415)), `_gelu_new` implementation
+    gelu_new,
+    /// Gaussian Error Linear Unit ([Hendrycks et al., 2016,](https://arxiv.org/abs/1606.08415)), `_gelu` implementation
+    gelu,
+    /// Rectified Linear Unit
+    relu,
+    /// Mish ([Misra, 2019](https://arxiv.org/abs/1908.08681))
+    mish,
+}
+
+
+#[derive(Debug, Serialize, Deserialize)]
+/// # ALBERT model configuration
+/// Defines the ALBERT model architecture (e.g. number of layers, hidden layer size, label mapping...)
+pub struct AlbertConfig {
+    pub hidden_act: Activation, // selects the activation of the masked LM head
+    pub attention_probs_dropout_prob: f64,
+    pub classifier_dropout_prob: Option<f64>, // sequence classification dropout; 0.1 is used when absent
+    pub bos_token_id: i64,
+    pub eos_token_id: i64,
+    pub down_scale_factor: i64,
+    pub embedding_size: i64, // width of the masked LM head's layer norm and decoder input
+    pub gap_size: i64,
+    pub hidden_dropout_prob: f64, // dropout probability of the token classification head
+    pub hidden_size: i64, // input and output width of the pooler; input width of the classifiers
+    pub initializer_range: f32,
+    pub inner_group_num: i64,
+    pub intermediate_size: i64,
+    pub layer_norm_eps: Option<f64>, // the masked LM head falls back to 1e-12 when absent
+    pub max_position_embeddings: i64,
+    pub net_structure_type: i64,
+    pub num_attention_heads: i64,
+    pub num_hidden_groups: i64,
+    pub num_hidden_layers: i64,
+    pub num_memory_blocks: i64,
+    pub pad_token_id: i64,
+    pub type_vocab_size: i64,
+    pub vocab_size: i64, // output width of the masked LM decoder
+    pub output_attentions: Option<bool>,
+    pub output_hidden_states: Option<bool>,
+    pub is_decoder: Option<bool>,
+    pub id2label: Option<HashMap<i64, String>>, // its length sets the number of labels of the classification heads
+    pub label2id: Option<HashMap<String, i64>>,
+}
+
+impl Config<AlbertConfig> for AlbertConfig {} // empty impl: relies entirely on the trait's provided methods
+
+/// # ALBERT Base model
+/// Base architecture for ALBERT models. Task-specific models will be built from this common base model
+/// It is made of the following blocks:
+/// - `embeddings`: `token`, `position` and `segment_id` embeddings
+/// - `encoder`: Encoder (transformer) made of a vector of layers. Each layer is made of a self-attention layer, an intermediate (linear) and output (linear + layer norm) layers. Note that the weights are shared across layers, allowing for a reduction in the model memory footprint.
+/// - `pooler`: linear layer applied to the first element of the sequence (the *[CLS]* token position)
+/// - `pooler_activation`: Tanh activation function for the pooling layer
+pub struct AlbertModel {
+    embeddings: AlbertEmbeddings,
+    encoder: AlbertTransformer,
+    pooler: nn::Linear,
+    pooler_activation: Box<dyn Fn(&Tensor) -> Tensor>,
+}
+
+impl AlbertModel {
+    /// Build a new `AlbertModel`
+    ///
+    /// # Arguments
+    ///
+    /// * `p` - Variable store path for the root of the ALBERT model
+    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use tch::{nn, Device};
+    /// use rust_bert::Config;
+    /// use std::path::Path;
+    /// use rust_bert::albert::{AlbertConfig, AlbertModel};
+    ///
+    /// let config_path = Path::new("path/to/config.json");
+    /// let device = Device::Cpu;
+    /// let p = nn::VarStore::new(device);
+    /// let config = AlbertConfig::from_file(config_path);
+    /// let albert: AlbertModel = AlbertModel::new(&(&p.root() / "albert"), &config);
+    /// ```
+    ///
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertModel {
+        // The pooler maps hidden_size -> hidden_size
+        let pooler_width = config.hidden_size;
+
+        // Fields are built in declaration order: embeddings, encoder, pooler
+        AlbertModel {
+            embeddings: AlbertEmbeddings::new(&(p / "embeddings"), config),
+            encoder: AlbertTransformer::new(&(p / "encoder"), config),
+            pooler: nn::linear(&(p / "pooler"), pooler_width, pooler_width, Default::default()),
+            pooler_activation: Box::new(_tanh),
+        }
+    }
+
+    /// Forward pass through the model
+    ///
+    /// # Arguments
+    ///
+    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
+    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions value 1. If None, all positions are set to 1
+    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
+    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
+    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
+    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
+    ///
+    /// # Returns
+    ///
+    /// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *hidden_size*)
+    /// * `pooled_output` - `Tensor` of shape (*batch size*, *hidden_size*)
+    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    ///# use tch::{nn, Device, Tensor, no_grad};
+    ///# use rust_bert::Config;
+    ///# use std::path::Path;
+    ///# use tch::kind::Kind::Int64;
+    /// use rust_bert::albert::{AlbertConfig, AlbertModel};
+    ///# let config_path = Path::new("path/to/config.json");
+    ///# let device = Device::Cpu;
+    ///# let vs = nn::VarStore::new(device);
+    ///# let config = AlbertConfig::from_file(config_path);
+    ///# let albert_model: AlbertModel = AlbertModel::new(&vs.root(), &config);
+    ///  let (batch_size, sequence_length) = (64, 128);
+    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
+    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
+    ///
+    ///  let (output, pooled_output, all_hidden_states, all_attentions) = no_grad(|| {
+    ///    albert_model
+    ///         .forward_t(Some(input_tensor),
+    ///                    Some(mask),
+    ///                    Some(token_type_ids),
+    ///                    Some(position_ids),
+    ///                    None,
+    ///                    false).unwrap()
+    ///    });
+    ///
+    /// ```
+    ///
+    pub fn forward_t(
+        &self,
+        input_ids: Option<Tensor>,
+        mask: Option<Tensor>,
+        token_type_ids: Option<Tensor>,
+        position_ids: Option<Tensor>,
+        input_embeds: Option<Tensor>,
+        train: bool,
+    ) -> Result<(Tensor, Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>), &'static str> {
+        // Exactly one of `input_ids` / `input_embeds` must be provided; it fixes the
+        // (batch, sequence) shape and the device used for the default mask.
+        let (input_shape, device) = match (&input_ids, &input_embeds) {
+            (Some(_), Some(_)) => return Err("Only one of input ids or input embeddings may be set"),
+            (Some(ids), None) => (ids.size(), ids.device()),
+            (None, Some(embeds)) => (vec![embeds.size()[0], embeds.size()[1]], embeds.device()),
+            (None, None) => return Err("At least one of input ids or input embeddings must be set"),
+        };
+
+        // Without an explicit mask every position is attended to
+        let mask = mask.unwrap_or_else(|| Tensor::ones(&input_shape, (Kind::Int64, device)));
+
+        // Insert two singleton dimensions, then turn the 1/0 mask into an additive one:
+        // 0 where the mask is 1, -10000 where it is 0
+        let broadcast_mask = mask.unsqueeze(1).unsqueeze(2);
+        let additive_mask: Tensor = (broadcast_mask.ones_like() - broadcast_mask) * -10000.0;
+
+        // Errors from the embedding layer are forwarded unchanged
+        let embedding_output = self
+            .embeddings
+            .forward_t(input_ids, token_type_ids, position_ids, input_embeds, train)?;
+
+        let (hidden_state, all_hidden_states, all_attentions) =
+            self.encoder.forward_t(&embedding_output, Some(additive_mask), train);
+
+        // Pool from the first element of the sequence dimension
+        let first_token = hidden_state.select(1, 0);
+        let pooled_output = (self.pooler_activation)(&self.pooler.forward(&first_token));
+
+        Ok((hidden_state, pooled_output, all_hidden_states, all_attentions))
+    }
+}
+
+/// # ALBERT masked language model head
+/// Maps hidden states to vocabulary scores. It is made of the following blocks:
+/// - `dense`: linear layer from `hidden_size` to `embedding_size`
+/// - `activation`: activation function selected by the configuration's `hidden_act`
+/// - `layer_norm`: layer normalization over `embedding_size`
+/// - `decoder`: linear layer from `embedding_size` to `vocab_size`
+pub struct AlbertMLMHead {
+    layer_norm: nn::LayerNorm,
+    dense: nn::Linear,
+    decoder: nn::Linear,
+    activation: Box<dyn Fn(&Tensor) -> Tensor>,
+}
+
+impl AlbertMLMHead {
+    /// Build a new `AlbertMLMHead`
+    ///
+    /// # Arguments
+    ///
+    /// * `p` - Variable store path for the root of the prediction head
+    /// * `config` - `AlbertConfig` object providing the layer sizes, activation and layer norm epsilon
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertMLMHead {
+        // Epsilon falls back to 1e-12 when the configuration does not provide one
+        let layer_norm_config = nn::LayerNormConfig {
+            eps: config.layer_norm_eps.unwrap_or(1e-12),
+            ..Default::default()
+        };
+        let layer_norm = nn::layer_norm(&(p / "LayerNorm"), vec![config.embedding_size], layer_norm_config);
+        let dense = nn::linear(&(p / "dense"), config.hidden_size, config.embedding_size, Default::default());
+        let decoder = nn::linear(&(p / "decoder"), config.embedding_size, config.vocab_size, Default::default());
+
+        let activation = Box::new(match &config.hidden_act {
+            Activation::gelu_new => _gelu_new,
+            Activation::gelu => _gelu,
+            Activation::relu => _relu,
+            Activation::mish => _mish
+        });
+
+        AlbertMLMHead { layer_norm, dense, decoder, activation }
+    }
+
+    /// Forward pass through the head: dense, activation, layer norm, then decoder
+    ///
+    /// # Arguments
+    ///
+    /// * `hidden_states` - Reference to the hidden states the head is applied to
+    ///
+    /// # Returns
+    ///
+    /// * `Tensor` holding the decoder output
+    pub fn forward(&self, hidden_states: &Tensor) -> Tensor {
+        let projected = hidden_states.apply(&self.dense);
+        let activated: Tensor = (self.activation)(&projected);
+        let normalized = activated.apply(&self.layer_norm);
+        normalized.apply(&self.decoder)
+    }
+}
+
+/// # ALBERT for masked language model
+/// Base ALBERT model with a masked language model head to predict missing tokens, for example `"Looks like one [MASK] is missing" -> "person"`
+/// It is made of the following blocks:
+/// - `albert`: Base AlbertModel, built under the `albert` sub-path of the variable store
+/// - `predictions`: ALBERT MLM prediction head, built under the `predictions` sub-path of the variable store
+pub struct AlbertForMaskedLM {
+    albert: AlbertModel,
+    predictions: AlbertMLMHead,
+}
+
+impl AlbertForMaskedLM {
+    /// Build a new `AlbertForMaskedLM`
+    ///
+    /// # Arguments
+    ///
+    /// * `p` - Variable store path for the root of the ALBERT model
+    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use tch::{nn, Device};
+    /// use rust_bert::Config;
+    /// use std::path::Path;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM};
+    ///
+    /// let config_path = Path::new("path/to/config.json");
+    /// let device = Device::Cpu;
+    /// let p = nn::VarStore::new(device);
+    /// let config = AlbertConfig::from_file(config_path);
+    /// let albert: AlbertForMaskedLM = AlbertForMaskedLM::new(&p.root(), &config);
+    /// ```
+    ///
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForMaskedLM {
+        // Base model first, then the prediction head, each under its own sub-path
+        AlbertForMaskedLM {
+            albert: AlbertModel::new(&(p / "albert"), config),
+            predictions: AlbertMLMHead::new(&(p / "predictions"), config),
+        }
+    }
+
+    /// Forward pass through the model
+    ///
+    /// # Arguments
+    ///
+    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
+    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions value 1. If None, all positions are set to 1
+    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
+    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
+    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
+    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
+    ///
+    /// # Returns
+    ///
+    /// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*)
+    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    ///# use tch::{nn, Device, Tensor, no_grad};
+    ///# use rust_bert::Config;
+    ///# use std::path::Path;
+    ///# use tch::kind::Kind::Int64;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM};
+    ///# let config_path = Path::new("path/to/config.json");
+    ///# let device = Device::Cpu;
+    ///# let vs = nn::VarStore::new(device);
+    ///# let config = AlbertConfig::from_file(config_path);
+    ///# let albert_model: AlbertForMaskedLM = AlbertForMaskedLM::new(&vs.root(), &config);
+    ///  let (batch_size, sequence_length) = (64, 128);
+    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
+    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
+    ///
+    ///  let (output, all_hidden_states, all_attentions) = no_grad(|| {
+    ///    albert_model
+    ///         .forward_t(Some(input_tensor),
+    ///                    Some(mask),
+    ///                    Some(token_type_ids),
+    ///                    Some(position_ids),
+    ///                    None,
+    ///                    false)
+    ///    });
+    ///
+    /// ```
+    ///
+    pub fn forward_t(
+        &self,
+        input_ids: Option<Tensor>,
+        mask: Option<Tensor>,
+        token_type_ids: Option<Tensor>,
+        position_ids: Option<Tensor>,
+        input_embeds: Option<Tensor>,
+        train: bool,
+    ) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
+        // Panics if the base model returns an error (e.g. both or neither of
+        // `input_ids` and `input_embeds` were provided)
+        let base_result = self.albert.forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train);
+        let (sequence_output, _, hidden_states, attentions) = base_result.unwrap();
+
+        // The pooled output is ignored; the head scores every position of the sequence
+        (self.predictions.forward(&sequence_output), hidden_states, attentions)
+    }
+}
+
+/// # ALBERT for sequence classification
+/// Base ALBERT model with a classifier head to perform sentence or document-level classification
+/// It is made of the following blocks:
+/// - `albert`: Base AlbertModel
+/// - `dropout`: Dropout layer applied to the pooled output (probability `classifier_dropout_prob`, 0.1 if not set)
+/// - `classifier`: linear layer for classification, from `hidden_size` to the number of labels
+pub struct AlbertForSequenceClassification {
+    albert: AlbertModel,
+    dropout: Dropout,
+    classifier: nn::Linear,
+}
+
+impl AlbertForSequenceClassification {
+    /// Build a new `AlbertForSequenceClassification`
+    ///
+    /// # Arguments
+    ///
+    /// * `p` - Variable store path for the root of the ALBERT model
+    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use tch::{nn, Device};
+    /// use rust_bert::Config;
+    /// use std::path::Path;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForSequenceClassification};
+    ///
+    /// let config_path = Path::new("path/to/config.json");
+    /// let device = Device::Cpu;
+    /// let p = nn::VarStore::new(device);
+    /// let config = AlbertConfig::from_file(config_path);
+    /// let albert: AlbertForSequenceClassification = AlbertForSequenceClassification::new(&p.root(), &config);
+    /// ```
+    ///
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForSequenceClassification {
+        let albert = AlbertModel::new(&(p / "albert"), config);
+
+        // Dropout probability falls back to 0.1 when the configuration leaves it unset
+        let dropout = Dropout::new(config.classifier_dropout_prob.unwrap_or(0.1));
+
+        // One classifier output per entry of `id2label`; panics if the mapping is missing
+        let label_count = config
+            .id2label
+            .as_ref()
+            .map(|labels| labels.len() as i64)
+            .expect("num_labels not provided in configuration");
+        let classifier = nn::linear(&(p / "classifier"), config.hidden_size, label_count, Default::default());
+
+        AlbertForSequenceClassification { albert, dropout, classifier }
+    }
+
+    /// Forward pass through the model
+    ///
+    /// # Arguments
+    ///
+    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
+    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions value 1. If None, all positions are set to 1
+    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
+    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
+    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
+    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
+    ///
+    /// # Returns
+    ///
+    /// * `output` - `Tensor` of shape (*batch size*, *num_labels*)
+    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    ///# use tch::{nn, Device, Tensor, no_grad};
+    ///# use rust_bert::Config;
+    ///# use std::path::Path;
+    ///# use tch::kind::Kind::Int64;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForSequenceClassification};
+    ///# let config_path = Path::new("path/to/config.json");
+    ///# let device = Device::Cpu;
+    ///# let vs = nn::VarStore::new(device);
+    ///# let config = AlbertConfig::from_file(config_path);
+    ///# let albert_model: AlbertForSequenceClassification = AlbertForSequenceClassification::new(&vs.root(), &config);
+    ///  let (batch_size, sequence_length) = (64, 128);
+    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
+    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
+    ///
+    ///  let (output, all_hidden_states, all_attentions) = no_grad(|| {
+    ///    albert_model
+    ///         .forward_t(Some(input_tensor),
+    ///                    Some(mask),
+    ///                    Some(token_type_ids),
+    ///                    Some(position_ids),
+    ///                    None,
+    ///                    false)
+    ///    });
+    ///
+    /// ```
+    ///
+    pub fn forward_t(
+        &self,
+        input_ids: Option<Tensor>,
+        mask: Option<Tensor>,
+        token_type_ids: Option<Tensor>,
+        position_ids: Option<Tensor>,
+        input_embeds: Option<Tensor>,
+        train: bool,
+    ) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
+        // Panics if the base model returns an error (e.g. both or neither of
+        // `input_ids` and `input_embeds` were provided)
+        let base_result = self.albert.forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train);
+        let (_, pooled, hidden_states, attentions) = base_result.unwrap();
+
+        // Classification uses the pooled output only; dropout is applied according to `train`
+        let regularized = pooled.apply_t(&self.dropout, train);
+        (regularized.apply(&self.classifier), hidden_states, attentions)
+    }
+}
+
+/// # ALBERT for token classification (e.g. NER, POS)
+/// Token-level classifier predicting a label for each token provided. Note that because of SentencePiece tokenization, the labels predicted are
+/// not necessarily aligned with words in the sentence.
+/// It is made of the following blocks:
+/// - `albert`: Base AlbertModel
+/// - `dropout`: Dropout to apply on the encoder last hidden states (probability `hidden_dropout_prob`)
+/// - `classifier`: Linear layer for token classification, from `hidden_size` to the number of labels
+pub struct AlbertForTokenClassification {
+    albert: AlbertModel,
+    dropout: Dropout,
+    classifier: nn::Linear,
+}
+
+impl AlbertForTokenClassification {
+    /// Build a new `AlbertForTokenClassification`
+    ///
+    /// # Arguments
+    ///
+    /// * `p` - Variable store path for the root of the ALBERT model
+    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use tch::{nn, Device};
+    /// use rust_bert::Config;
+    /// use std::path::Path;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForTokenClassification};
+    ///
+    /// let config_path = Path::new("path/to/config.json");
+    /// let device = Device::Cpu;
+    /// let p = nn::VarStore::new(device);
+    /// let config = AlbertConfig::from_file(config_path);
+    /// let albert: AlbertForTokenClassification = AlbertForTokenClassification::new(&p.root(), &config);
+    /// ```
+    ///
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForTokenClassification {
+        let albert_path = p / "albert";
+        let classifier_path = p / "classifier";
+
+        let albert = AlbertModel::new(&albert_path, config);
+        let dropout = Dropout::new(config.hidden_dropout_prob);
+
+        // The classifier has one output per entry of `id2label`; construction
+        // panics if the configuration does not provide that mapping.
+        let label_map = config.id2label.as_ref().expect("num_labels not provided in configuration");
+        let num_labels = label_map.len() as i64;
+        let classifier = nn::linear(&classifier_path, config.hidden_size, num_labels, Default::default());
+
+        AlbertForTokenClassification { albert, dropout, classifier }
+    }
+
+    /// Forward pass through the model
+    ///
+    /// # Arguments
+    ///
+    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
+    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
+    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
+    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
+    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
+    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
+    ///
+    /// # Returns
+    ///
+    /// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *num_labels*) containing the logits for each of the input tokens and classes
+    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    ///# use tch::{nn, Device, Tensor, no_grad};
+    ///# use rust_bert::Config;
+    ///# use std::path::Path;
+    ///# use tch::kind::Kind::Int64;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForTokenClassification};
+    ///# let config_path = Path::new("path/to/config.json");
+    ///# let device = Device::Cpu;
+    ///# let vs = nn::VarStore::new(device);
+    ///# let config = AlbertConfig::from_file(config_path);
+    ///# let albert_model: AlbertForTokenClassification = AlbertForTokenClassification::new(&vs.root(), &config);
+    ///  let (batch_size, sequence_length) = (64, 128);
+    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
+    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
+    ///
+    ///  let (output, all_hidden_states, all_attentions) = no_grad(|| {
+    ///    albert_model
+    ///         .forward_t(Some(input_tensor),
+    ///                    Some(mask),
+    ///                    Some(token_type_ids),
+    ///                    Some(position_ids),
+    ///                    None,
+    ///                    false)
+    ///    });
+    ///
+    /// ```
+    ///
+    pub fn forward_t(&self,
+                     input_ids: Option<Tensor>,
+                     mask: Option<Tensor>,
+                     token_type_ids: Option<Tensor>,
+                     position_ids: Option<Tensor>,
+                     input_embeds: Option<Tensor>,
+                     train: bool) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
+        // The first output of the base model (the sequence output) is classified;
+        // the second output is not used by this head.
+        let (sequence_output, _, all_hidden_states, all_attentions) = self
+            .albert
+            .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
+            .unwrap();
+
+        // Dropout is only active when `train` is true.
+        let regularized = sequence_output.apply_t(&self.dropout, train);
+        let logits = regularized.apply(&self.classifier);
+
+        (logits, all_hidden_states, all_attentions)
+    }
+}
+
+/// # ALBERT for question answering
+/// Extractive question-answering model based on an ALBERT language model. Identifies the segment of a context that answers a provided question.
+/// Please note that a significant amount of pre- and post-processing is required to perform end-to-end question answering.
+/// See the question answering pipeline (also provided in this crate) for more details.
+/// It is made of the following blocks:
+/// - `albert`: Base AlbertModel
+/// - `qa_outputs`: Linear layer for question answering. It has two outputs per token, which the forward pass splits into start and end logits.
+pub struct AlbertForQuestionAnswering {
+    albert: AlbertModel,
+    qa_outputs: nn::Linear,
+}
+
+impl AlbertForQuestionAnswering {
+    /// Build a new `AlbertForQuestionAnswering`
+    ///
+    /// # Arguments
+    ///
+    /// * `p` - Variable store path for the root of the ALBERT model
+    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use tch::{nn, Device};
+    /// use rust_bert::Config;
+    /// use std::path::Path;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForQuestionAnswering};
+    ///
+    /// let config_path = Path::new("path/to/config.json");
+    /// let device = Device::Cpu;
+    /// let p = nn::VarStore::new(device);
+    /// let config = AlbertConfig::from_file(config_path);
+    /// let albert: AlbertForQuestionAnswering = AlbertForQuestionAnswering::new(&p.root(), &config);
+    /// ```
+    ///
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForQuestionAnswering {
+        let albert_path = p / "albert";
+        let qa_path = p / "qa_outputs";
+
+        let albert = AlbertModel::new(&albert_path, config);
+
+        // Two outputs per token: one start logit and one end logit.
+        let span_outputs = 2;
+        let qa_outputs = nn::linear(&qa_path, config.hidden_size, span_outputs, Default::default());
+
+        AlbertForQuestionAnswering { albert, qa_outputs }
+    }
+
+    /// Forward pass through the model
+    ///
+    /// # Arguments
+    ///
+    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
+    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
+    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
+    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
+    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
+    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
+    ///
+    /// # Returns
+    ///
+    /// * `start_scores` - `Tensor` of shape (*batch size*, *sequence_length*) containing the logits for start of the answer
+    /// * `end_scores` - `Tensor` of shape (*batch size*, *sequence_length*) containing the logits for end of the answer
+    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    ///# use tch::{nn, Device, Tensor, no_grad};
+    ///# use rust_bert::Config;
+    ///# use std::path::Path;
+    ///# use tch::kind::Kind::Int64;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForQuestionAnswering};
+    ///# let config_path = Path::new("path/to/config.json");
+    ///# let device = Device::Cpu;
+    ///# let vs = nn::VarStore::new(device);
+    ///# let config = AlbertConfig::from_file(config_path);
+    ///# let albert_model: AlbertForQuestionAnswering = AlbertForQuestionAnswering::new(&vs.root(), &config);
+    ///  let (batch_size, sequence_length) = (64, 128);
+    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
+    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
+    ///
+    ///  let (start_logits, end_logits, all_hidden_states, all_attentions) = no_grad(|| {
+    ///    albert_model
+    ///         .forward_t(Some(input_tensor),
+    ///                    Some(mask),
+    ///                    Some(token_type_ids),
+    ///                    Some(position_ids),
+    ///                    None,
+    ///                    false)
+    ///    });
+    ///
+    /// ```
+    ///
+    pub fn forward_t(&self,
+                     input_ids: Option<Tensor>,
+                     mask: Option<Tensor>,
+                     token_type_ids: Option<Tensor>,
+                     position_ids: Option<Tensor>,
+                     input_embeds: Option<Tensor>,
+                     train: bool) -> (Tensor, Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
+        let (sequence_output, _, all_hidden_states, all_attentions) = self
+            .albert
+            .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
+            .unwrap();
+
+        // The QA head emits two values per position. Split them into chunks of
+        // size 1 along the last dimension, then drop that size-1 dimension.
+        let span_logits = sequence_output.apply(&self.qa_outputs).split(1, -1);
+        let start_logits = span_logits[0].squeeze1(-1);
+        let end_logits = span_logits[1].squeeze1(-1);
+
+        (start_logits, end_logits, all_hidden_states, all_attentions)
+    }
+}
+
+/// # ALBERT for multiple choices
+/// Multiple choices model using an ALBERT base model and a linear classifier.
+/// Input should be in the form `[CLS] Context [SEP] Possible choice [SEP]`. The choice is made along the batch axis,
+/// assuming all elements of the batch are alternatives to be chosen from for a given context.
+/// It is made of the following blocks:
+/// - `albert`: Base AlbertModel
+/// - `dropout`: Dropout for hidden states output
+/// - `classifier`: Linear layer for multiple choices. It has a single output (one score per alternative).
+pub struct AlbertForMultipleChoice {
+    albert: AlbertModel,
+    dropout: Dropout,
+    classifier: nn::Linear,
+}
+
+impl AlbertForMultipleChoice {
+    /// Build a new `AlbertForMultipleChoice`
+    ///
+    /// # Arguments
+    ///
+    /// * `p` - Variable store path for the root of the ALBERT model
+    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use tch::{nn, Device};
+    /// use rust_bert::Config;
+    /// use std::path::Path;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForMultipleChoice};
+    ///
+    /// let config_path = Path::new("path/to/config.json");
+    /// let device = Device::Cpu;
+    /// let p = nn::VarStore::new(device);
+    /// let config = AlbertConfig::from_file(config_path);
+    /// let albert: AlbertForMultipleChoice = AlbertForMultipleChoice::new(&p.root(), &config);
+    /// ```
+    ///
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForMultipleChoice {
+        let albert_path = p / "albert";
+        let classifier_path = p / "classifier";
+
+        let albert = AlbertModel::new(&albert_path, config);
+        let dropout = Dropout::new(config.hidden_dropout_prob);
+
+        // A single score is produced for each alternative.
+        let scores_per_choice = 1;
+        let classifier = nn::linear(&classifier_path, config.hidden_size, scores_per_choice, Default::default());
+
+        AlbertForMultipleChoice { albert, dropout, classifier }
+    }
+
+    /// Forward pass through the model
+    ///
+    /// # Arguments
+    ///
+    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
+    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
+    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
+    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
+    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
+    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
+    ///
+    /// # Returns
+    ///
+    /// * `output` - `Tensor` of shape (*1*, *batch size*) containing the logits for each of the alternatives given
+    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    ///# use tch::{nn, Device, Tensor, no_grad};
+    ///# use rust_bert::Config;
+    ///# use std::path::Path;
+    ///# use tch::kind::Kind::Int64;
+    /// use rust_bert::albert::{AlbertConfig, AlbertForMultipleChoice};
+    ///# let config_path = Path::new("path/to/config.json");
+    ///# let device = Device::Cpu;
+    ///# let vs = nn::VarStore::new(device);
+    ///# let config = AlbertConfig::from_file(config_path);
+    ///# let albert_model: AlbertForMultipleChoice = AlbertForMultipleChoice::new(&vs.root(), &config);
+    ///  let (batch_size, sequence_length) = (64, 128);
+    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
+    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
+    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
+    ///
+    ///  let (output, all_hidden_states, all_attentions) = no_grad(|| {
+    ///    albert_model
+    ///         .forward_t(Some(input_tensor),
+    ///                    Some(mask),
+    ///                    Some(token_type_ids),
+    ///                    Some(position_ids),
+    ///                    None,
+    ///                    false).unwrap()
+    ///    });
+    ///
+    /// ```
+    ///
+    pub fn forward_t(&self,
+                     input_ids: Option<Tensor>,
+                     mask: Option<Tensor>,
+                     token_type_ids: Option<Tensor>,
+                     position_ids: Option<Tensor>,
+                     input_embeds: Option<Tensor>,
+                     train: bool) -> Result<(Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>), &'static str> {
+        // Flatten the inputs so that the base model sees one alternative per row and
+        // work out how many alternatives there are per context.
+        //
+        // With the documented layout (`(batch size, sequence_length)` ids or
+        // `(batch size, sequence_length, hidden_size)` embeddings) the alternatives
+        // lie along the batch axis, so their number is dimension 0. Inputs carrying
+        // an explicit choice axis in dimension 1 (one extra leading dimension) keep
+        // using dimension 1 as the number of choices.
+        let (input_ids, input_embeds, num_choices) = match (&input_ids, &input_embeds) {
+            (Some(_), Some(_)) => { return Err("Only one of input ids or input embeddings may be set"); }
+            (Some(input_value), None) => {
+                let size = input_value.size();
+                let num_choices = if size.len() > 2 { size[1] } else { size[0] };
+                (Some(input_value.view((-1, *size.last().unwrap()))), None, num_choices)
+            }
+            (None, Some(embeds)) => {
+                let size = embeds.size();
+                let rank = size.len();
+                let num_choices = if rank > 3 { size[1] } else { size[0] };
+                // Keep the two trailing dimensions and merge every leading dimension.
+                (None, Some(embeds.view((-1, size[rank - 2], size[rank - 1]))), num_choices)
+            }
+            (None, None) => { return Err("At least one of input ids or input embeddings must be set"); }
+        };
+
+        // The auxiliary inputs are flattened the same way: the last dimension is
+        // kept and every leading dimension is merged.
+        let mask = mask.map(|value| value.view((-1, *value.size().last().unwrap())));
+        let token_type_ids = token_type_ids.map(|value| value.view((-1, *value.size().last().unwrap())));
+        let position_ids = position_ids.map(|value| value.view((-1, *value.size().last().unwrap())));
+
+        let (_, pooled_output, all_hidden_states, all_attentions) = self
+            .albert
+            .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
+            .unwrap();
+
+        // One score per flattened row, regrouped so that each output row holds the
+        // scores of all alternatives of one context.
+        let logits = pooled_output
+            .apply_t(&self.dropout, train)
+            .apply(&self.classifier)
+            .view((-1, num_choices));
+
+        Ok((logits, all_hidden_states, all_attentions))
+    }
+}
--- a/src/albert/attention.rs
+++ b/src/albert/attention.rs
@ -0,0 +1,107 @@
+// Copyright 2018 Google AI and Google Brain team.
+// Copyright 2020-present, the HuggingFace Inc. team.
+// Copyright 2020 Guillaume Becquin
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::common::dropout::Dropout;
+use tch::{nn, Tensor};
+use crate::albert::AlbertConfig;
+use tch::kind::Kind::Float;
+
+/// # Self-attention block of the ALBERT model
+/// Multi-head self-attention followed by an output projection, a residual connection and layer normalization.
+/// - `num_attention_heads`, `hidden_size`: copied from the configuration
+/// - `attention_head_size`: `hidden_size / num_attention_heads`
+/// - `dropout`: Dropout built from `attention_probs_dropout_prob`; applied to the attention weights and to the projected context
+/// - `output_attentions`: whether the forward pass also returns the attention weights
+/// - `query`, `key`, `value`, `dense`: Linear layers of shape (`hidden_size`, `hidden_size`)
+/// - `layer_norm`: Layer normalization over `hidden_size`
+#[derive(Debug)]
+pub struct AlbertSelfAttention {
+    num_attention_heads: i64,
+    attention_head_size: i64,
+    hidden_size: i64,
+    dropout: Dropout,
+    output_attentions: bool,
+    query: nn::Linear,
+    key: nn::Linear,
+    value: nn::Linear,
+    dense: nn::Linear,
+    layer_norm: nn::LayerNorm,
+}
+
+impl AlbertSelfAttention {
+    pub fn new(p: nn::Path, config: &AlbertConfig) -> AlbertSelfAttention {
+        assert_eq!(config.hidden_size % config.num_attention_heads, 0, "Hidden size not a multiple of the number of attention heads");
+
+        let hidden_size = config.hidden_size;
+        let num_attention_heads = config.num_attention_heads;
+
+        // The four projections are all square (hidden_size x hidden_size).
+        let square_projection = |name: &str| nn::linear(&p / name, hidden_size, hidden_size, Default::default());
+        let query = square_projection("query");
+        let key = square_projection("key");
+        let value = square_projection("value");
+        let dense = square_projection("dense");
+
+        let dropout = Dropout::new(config.attention_probs_dropout_prob);
+
+        // Optional configuration entries fall back to `false` / `1e-12`.
+        let output_attentions = config.output_attentions.unwrap_or(false);
+        let layer_norm_config = nn::LayerNormConfig {
+            eps: config.layer_norm_eps.unwrap_or(1e-12),
+            ..Default::default()
+        };
+        let layer_norm = nn::layer_norm(&p / "LayerNorm", vec![hidden_size], layer_norm_config);
+
+        AlbertSelfAttention {
+            num_attention_heads,
+            attention_head_size: hidden_size / num_attention_heads,
+            hidden_size,
+            dropout,
+            output_attentions,
+            query,
+            key,
+            value,
+            dense,
+            layer_norm,
+        }
+    }
+
+    fn split_heads(&self, x: Tensor, bs: i64, dim_per_head: i64) -> Tensor {
+        // Reshape to (bs, -1, num_attention_heads, dim_per_head), then swap
+        // dimensions 1 and 2 so that the head dimension comes second.
+        let per_head = x.view((bs, -1, self.num_attention_heads, dim_per_head));
+        per_head.transpose(1, 2)
+    }
+
+    pub fn forward_t(&self,
+                     input_ids: &Tensor,
+                     mask: &Option<Tensor>,
+                     train: bool) -> (Tensor, Option<Tensor>) {
+        let batch_size = *input_ids.size().first().unwrap();
+
+        // Project the input and split the result into heads.
+        let project = |layer: &nn::Linear| self.split_heads(input_ids.apply(layer), batch_size, self.attention_head_size);
+        let key_layer = project(&self.key);
+        let value_layer = project(&self.value);
+        let query_layer = project(&self.query);
+
+        // Scale the queries by sqrt(attention_head_size) before the dot product.
+        let scale = (self.attention_head_size as f64).sqrt();
+        let query_layer: Tensor = query_layer / scale;
+
+        // The mask, when given, is added to the raw scores.
+        let raw_scores = query_layer.matmul(&key_layer.transpose(-1, -2));
+        let scores = match mask {
+            Some(additive_mask) => raw_scores + additive_mask,
+            None => raw_scores,
+        };
+
+        let weights = scores.softmax(-1, Float).apply_t(&self.dropout, train);
+        let context = weights.matmul(&value_layer).transpose(1, 2).contiguous();
+
+        // Apply the output projection directly on the per-head context: the dense
+        // weights are viewed as (num_attention_heads, attention_head_size, hidden_size)
+        // and contracted over the head and per-head dimensions.
+        let dense_weights = self.dense.ws
+            .transpose(0, 1)
+            .view((self.num_attention_heads, self.attention_head_size, self.hidden_size));
+        let projected: Tensor = Tensor::einsum("bfnd,ndh->bfh", &[context, dense_weights]) + &self.dense.bs;
+
+        // Residual connection followed by layer normalization.
+        let output = (input_ids + projected.apply_t(&self.dropout, train)).apply(&self.layer_norm);
+
+        let returned_weights = if self.output_attentions { Some(weights) } else { None };
+        (output, returned_weights)
+    }
+}
--- a/src/albert/embeddings.rs
+++ b/src/albert/embeddings.rs
@ -0,0 +1,102 @@
+// Copyright 2018 Google AI and Google Brain team.
+// Copyright 2020-present, the HuggingFace Inc. team.
+// Copyright 2020 Guillaume Becquin
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use tch::{nn, Tensor, Kind};
+use crate::common::dropout::Dropout;
+use crate::albert::AlbertConfig;
+use tch::nn::{EmbeddingConfig, embedding};
+
+/// # Embeddings implementation for Albert model
+/// Sums the word, position and token type embeddings (all of size `embedding_size`),
+/// then applies layer normalization and dropout.
+#[derive(Debug)]
+pub struct AlbertEmbeddings {
+    word_embeddings: nn::Embedding,
+    position_embeddings: nn::Embedding,
+    token_type_embeddings: nn::Embedding,
+    layer_norm: nn::LayerNorm,
+    dropout: Dropout,
+}
+
+impl AlbertEmbeddings {
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertEmbeddings {
+        let embedding_size = config.embedding_size;
+
+        // Only the word embedding table is configured with a padding index.
+        let word_embeddings: nn::Embedding = embedding(
+            p / "word_embeddings",
+            config.vocab_size,
+            embedding_size,
+            EmbeddingConfig { padding_idx: config.pad_token_id, ..Default::default() },
+        );
+        let position_embeddings: nn::Embedding = embedding(
+            p / "position_embeddings",
+            config.max_position_embeddings,
+            embedding_size,
+            Default::default(),
+        );
+        let token_type_embeddings: nn::Embedding = embedding(
+            p / "token_type_embeddings",
+            config.type_vocab_size,
+            embedding_size,
+            Default::default(),
+        );
+
+        // The layer norm epsilon falls back to 1e-12 when the configuration omits it.
+        let layer_norm_config = nn::LayerNormConfig {
+            eps: config.layer_norm_eps.unwrap_or(1e-12),
+            ..Default::default()
+        };
+        let layer_norm: nn::LayerNorm = nn::layer_norm(p / "LayerNorm", vec![embedding_size], layer_norm_config);
+        let dropout: Dropout = Dropout::new(config.hidden_dropout_prob);
+
+        AlbertEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout }
+    }
+
+    /// Compute the input embeddings of the model.
+    ///
+    /// Exactly one of `input_ids` and `input_embeds` must be provided. Missing
+    /// `position_ids` default to `0..sequence_length` for every row and missing
+    /// `token_type_ids` default to zeros. The word (or pre-computed), position and
+    /// token type embeddings are summed, layer-normalized and passed through dropout
+    /// (active only when `train` is true).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if both or neither of `input_ids` and `input_embeds` are set.
+    pub fn forward_t(&self,
+                     input_ids: Option<Tensor>,
+                     token_type_ids: Option<Tensor>,
+                     position_ids: Option<Tensor>,
+                     input_embeds: Option<Tensor>,
+                     train: bool) -> Result<Tensor, &'static str> {
+        let (input_embeddings, input_shape) = match (input_ids, input_embeds) {
+            (Some(_), Some(_)) => { return Err("Only one of input ids or input embeddings may be set"); }
+            (Some(input_value), None) => (input_value.apply_t(&self.word_embeddings, train), input_value.size()),
+            (None, Some(embeds)) => {
+                // Keep only the first two dimensions of the pre-computed embeddings.
+                let size = vec!(embeds.size()[0], embeds.size()[1]);
+                (embeds, size)
+            }
+            // Neither input was given: report that, rather than the "only one" message.
+            (None, None) => { return Err("At least one of input ids or input embeddings must be set"); }
+        };
+
+        let seq_length = input_embeddings.size()[1];
+        let device = input_embeddings.device();
+
+        let position_ids = position_ids.unwrap_or_else(|| {
+            Tensor::arange(seq_length, (Kind::Int64, device))
+                .unsqueeze(0)
+                .expand(&input_shape, true)
+        });
+        let token_type_ids = token_type_ids
+            .unwrap_or_else(|| Tensor::zeros(&input_shape, (Kind::Int64, device)));
+
+        let position_embeddings = position_ids.apply(&self.position_embeddings);
+        let token_type_embeddings = token_type_ids.apply(&self.token_type_embeddings);
+
+        let input_embeddings: Tensor = input_embeddings + position_embeddings + token_type_embeddings;
+        Ok(input_embeddings.apply(&self.layer_norm).apply_t(&self.dropout, train))
+    }
+}
--- a/src/albert/encoder.rs
+++ b/src/albert/encoder.rs
@ -0,0 +1,198 @@
+// Copyright 2018 Google AI and Google Brain team.
+// Copyright 2020-present, the HuggingFace Inc. team.
+// Copyright 2020 Guillaume Becquin
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::albert::attention::AlbertSelfAttention;
+use tch::{nn, Tensor};
+use crate::albert::AlbertConfig;
+use crate::albert::albert::Activation;
+use crate::common::activations::{_gelu_new, _gelu, _relu, _mish};
+use std::borrow::BorrowMut;
+
+/// # ALBERT layer
+/// Self-attention block followed by a feed-forward network with a residual connection and layer normalization.
+/// - `attention`: Self-attention block
+/// - `full_layer_layer_norm`: Layer normalization applied to the sum of the feed-forward output and the attention output
+/// - `ffn`: Linear layer from `hidden_size` to `intermediate_size`
+/// - `ffn_output`: Linear layer from `intermediate_size` back to `hidden_size`
+/// - `activation`: Activation function selected by the configuration's `hidden_act`, applied after `ffn`
+pub struct AlbertLayer {
+    attention: AlbertSelfAttention,
+    full_layer_layer_norm: nn::LayerNorm,
+    ffn: nn::Linear,
+    ffn_output: nn::Linear,
+    activation: Box<dyn Fn(&Tensor) -> Tensor>,
+}
+
+impl AlbertLayer {
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertLayer {
+        let attention = AlbertSelfAttention::new(p / "attention", config);
+
+        // The layer norm epsilon falls back to 1e-12 when the configuration omits it.
+        let layer_norm_config = nn::LayerNormConfig {
+            eps: config.layer_norm_eps.unwrap_or(1e-12),
+            ..Default::default()
+        };
+        let full_layer_layer_norm = nn::layer_norm(&(p / "full_layer_layer_norm"), vec![config.hidden_size], layer_norm_config);
+
+        // Feed-forward network: hidden_size -> intermediate_size -> hidden_size.
+        let ffn = nn::linear(&(p / "ffn"), config.hidden_size, config.intermediate_size, Default::default());
+        let ffn_output = nn::linear(&(p / "ffn_output"), config.intermediate_size, config.hidden_size, Default::default());
+
+        let activation = Box::new(match &config.hidden_act {
+            Activation::gelu_new => _gelu_new,
+            Activation::gelu => _gelu,
+            Activation::relu => _relu,
+            Activation::mish => _mish
+        });
+
+        AlbertLayer {
+            attention,
+            full_layer_layer_norm,
+            ffn,
+            ffn_output,
+            activation,
+        }
+    }
+
+    pub fn forward_t(&self,
+                     hidden_states: &Tensor,
+                     mask: &Option<Tensor>,
+                     train: bool) -> (Tensor, Option<Tensor>) {
+        let (attention_output, attention_weights) = self.attention.forward_t(hidden_states, mask, train);
+
+        // Feed-forward network: expand, activate, project back.
+        let expanded = attention_output.apply(&self.ffn);
+        let activated: Tensor = (self.activation)(&expanded);
+        let projected = activated.apply(&self.ffn_output);
+
+        // Residual connection around the feed-forward network, then layer normalization.
+        let layer_output = (projected + attention_output).apply(&self.full_layer_layer_norm);
+
+        (layer_output, attention_weights)
+    }
+}
+
+/// # ALBERT layer group
+/// Sequence of `inner_group_num` ALBERT layers applied one after the other.
+/// - `output_hidden_states`: whether the input of each layer is collected and returned
+/// - `output_attentions`: whether the attention weights of each layer are collected and returned
+/// - `layers`: the layers of the group, in application order
+pub struct AlbertLayerGroup {
+    output_hidden_states: bool,
+    output_attentions: bool,
+    layers: Vec<AlbertLayer>,
+}
+
+impl AlbertLayerGroup {
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertLayerGroup {
+        let layers_path = p / "albert_layers";
+
+        // Both output flags default to `false` when absent from the configuration.
+        let output_attentions = config.output_attentions.unwrap_or(false);
+        let output_hidden_states = config.output_hidden_states.unwrap_or(false);
+
+        // One layer per inner group index, registered under `albert_layers/<index>`.
+        let layers: Vec<AlbertLayer> = (0..config.inner_group_num)
+            .map(|layer_index| AlbertLayer::new(&(&layers_path / layer_index), config))
+            .collect();
+
+        AlbertLayerGroup { output_hidden_states, output_attentions, layers }
+    }
+
+    /// Apply every layer of the group in order.
+    ///
+    /// Returns the output of the last layer, the inputs of each layer (when
+    /// `output_hidden_states` is set) and the attention weights of each layer (when
+    /// `output_attentions` is set).
+    pub fn forward_t(&self,
+                     hidden_states: &Tensor,
+                     mask: &Option<Tensor>,
+                     train: bool)
+                     -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Tensor>>) {
+        let mut all_hidden_states: Option<Vec<Tensor>> = if self.output_hidden_states { Some(vec!()) } else { None };
+        let mut all_attentions: Option<Vec<Tensor>> = if self.output_attentions { Some(vec!()) } else { None };
+
+        // Owned copy of the input so the running state can be replaced layer by layer.
+        let mut hidden_state = hidden_states.copy();
+        for layer in &self.layers {
+            // The state is recorded before the layer is applied.
+            if let Some(hidden_states) = all_hidden_states.borrow_mut() {
+                hidden_states.push(hidden_state.copy());
+            };
+
+            let (layer_output, attention_weights) = layer.forward_t(&hidden_state, mask, train);
+            hidden_state = layer_output;
+
+            // The weights are owned here, so they are moved rather than copied.
+            if let Some(attentions) = all_attentions.borrow_mut() {
+                attentions.push(attention_weights.unwrap());
+            };
+        }
+
+        (hidden_state, all_hidden_states, all_attentions)
+    }
+}
+
+/// # ALBERT transformer (encoder)
+/// Maps the embeddings to the hidden size, then runs `num_hidden_layers` steps, each step applying one of the layer groups.
+/// - `output_hidden_states`: whether the input of each step is collected and returned
+/// - `output_attentions`: whether attention weights are collected and returned
+/// - `num_hidden_layers`: number of steps run by the forward pass
+/// - `num_hidden_groups`: number of groups the steps are divided into; used to select the layer group applied at each step
+/// - `embedding_hidden_mapping_in`: Linear layer from `embedding_size` to `hidden_size`
+/// - `layers`: the layer groups, indexed by group
+pub struct AlbertTransformer {
+    output_hidden_states: bool,
+    output_attentions: bool,
+    num_hidden_layers: i64,
+    num_hidden_groups: i64,
+    embedding_hidden_mapping_in: nn::Linear,
+    layers: Vec<AlbertLayerGroup>,
+}
+
+impl AlbertTransformer {
+    /// Build a new `AlbertTransformer`
+    ///
+    /// # Arguments
+    ///
+    /// * `p` - Variable store path for the root of the encoder
+    /// * `config` - `AlbertConfig` object defining the model architecture
+    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertTransformer {
+        let p_layers = &(p / "albert_layer_groups");
+
+        // Both output flags default to `false` when absent from the configuration.
+        let output_attentions = config.output_attentions.unwrap_or(false);
+        let output_hidden_states = config.output_hidden_states.unwrap_or(false);
+
+        let embedding_hidden_mapping_in = nn::linear(&(p / "embedding_hidden_mapping_in"), config.embedding_size, config.hidden_size, Default::default());
+
+        // One layer group per hidden group: the forward pass indexes `layers` with a
+        // group index in `0..num_hidden_groups`. (`inner_group_num` is the number of
+        // layers *inside* each group and is handled by `AlbertLayerGroup::new`.)
+        let layers: Vec<AlbertLayerGroup> = (0..config.num_hidden_groups)
+            .map(|group_index| AlbertLayerGroup::new(&(p_layers / group_index), config))
+            .collect();
+
+        AlbertTransformer {
+            output_hidden_states,
+            output_attentions,
+            num_hidden_layers: config.num_hidden_layers,
+            num_hidden_groups: config.num_hidden_groups,
+            embedding_hidden_mapping_in,
+            layers,
+        }
+    }
+
+    /// Run the encoder on the embeddings.
+    ///
+    /// The input is first projected with `embedding_hidden_mapping_in`. Then, for each
+    /// of the `num_hidden_layers` steps, the layer group the step belongs to is applied.
+    ///
+    /// Returns the final hidden state, the input of every step (when
+    /// `output_hidden_states` is set) and, for every step, the attention weights of the
+    /// layers of the applied group (when `output_attentions` is set).
+    pub fn forward_t(&self,
+                     hidden_states: &Tensor,
+                     mask: Option<Tensor>,
+                     train: bool)
+                     -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
+        let mut hidden_state = hidden_states.apply(&self.embedding_hidden_mapping_in);
+
+        let mut all_hidden_states: Option<Vec<Tensor>> = if self.output_hidden_states { Some(vec!()) } else { None };
+        let mut all_attentions: Option<Vec<Vec<Tensor>>> = if self.output_attentions { Some(vec!()) } else { None };
+
+        for i in 0..self.num_hidden_layers {
+            // floor(i * groups / layers). This equals i / (layers / groups) whenever the
+            // number of layers is a multiple of the number of groups, and otherwise still
+            // stays within 0..num_hidden_groups without dividing by a truncated quotient.
+            let group_idx = i * self.num_hidden_groups / self.num_hidden_layers;
+            let layer = &self.layers[group_idx as usize];
+
+            if let Some(hidden_states) = all_hidden_states.borrow_mut() {
+                hidden_states.push(hidden_state.copy());
+            };
+
+            // A layer group returns (output, hidden states, attentions): the attention
+            // weights are the third element, not the second.
+            let (group_output, _, group_attentions) = layer.forward_t(&hidden_state, &mask, train);
+            hidden_state = group_output;
+            if let Some(attentions) = all_attentions.borrow_mut() {
+                attentions.push(group_attentions.unwrap());
+            };
+        };
+
+        (hidden_state, all_hidden_states, all_attentions)
+    }
+}
+
--- a/src/albert/mod.rs
+++ b/src/albert/mod.rs
@ -0,0 +1,56 @@
+//! # ALBERT: A Lite BERT for Self-supervised Learning of Language Representations (Lan et al.)
+//!
+//! Implementation of the ALBERT language model ([https://arxiv.org/abs/1909.11942](https://arxiv.org/abs/1909.11942) Lan, Chen, Goodman, Gimpel, Sharma, Soricut, 2019).
+//! This model offers a greatly reduced memory footprint for similar effective size (number and size of layers). The computational cost remains however similar to the original BERT model.
+//! The base model is implemented in the `albert::AlbertModel` struct. Several language model heads have also been implemented, including:
+//! - Masked language model: `albert::AlbertForMaskedLM`
+//! - Multiple choices: `albert::AlbertForMultipleChoice`
+//! - Question answering: `albert::AlbertForQuestionAnswering`
+//! - Sequence classification: `albert::AlbertForSequenceClassification`
+//! - Token classification (e.g. NER, POS tagging): `albert::AlbertForTokenClassification`
+//!
+//! # Model set-up and pre-trained weights loading
+//!
+//! A full working example is provided in `examples/albert.rs`, run with `cargo run --example albert`.
+//! The example below illustrates the set-up of a masked language model; the structure is similar for other models.
+//! All models expect the following resources:
+//! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
+//! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
+//! - `AlbertTokenizer` using a `spiece.model` SentencePiece model file
+//! Pretrained models are available and can be downloaded using RemoteResources.
+//!
+//! ```no_run
+//!# fn main() -> failure::Fallible<()> {
+//!#
+//! use rust_tokenizers::AlbertTokenizer;
+//! use tch::{nn, Device};
+//!# use std::path::PathBuf;
+//! use rust_bert::albert::{AlbertForMaskedLM, AlbertConfig};
+//! use rust_bert::Config;
+//! use rust_bert::resources::{Resource, download_resource, LocalResource};
+//!
+//! let config_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/config.json")});
+//! let vocab_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/spiece.model")});
+//! let weights_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/model.ot")});
+//! let config_path = download_resource(&config_resource)?;
+//! let vocab_path = download_resource(&vocab_resource)?;
+//! let weights_path = download_resource(&weights_resource)?;
+//! let device = Device::cuda_if_available();
+//! let mut vs = nn::VarStore::new(device);
+//! let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, true);
+//! let config = AlbertConfig::from_file(config_path);
+//! let albert_model = AlbertForMaskedLM::new(&vs.root(), &config);
+//! vs.load(weights_path)?;
+//!
+//!# Ok(())
+//!# }
+//! ```
+
+
+
+mod encoder;
+mod attention;
+mod embeddings;
+mod albert;
+
+pub use albert::{AlbertConfig, AlbertModelResources, AlbertConfigResources, AlbertVocabResources, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForTokenClassification, AlbertForQuestionAnswering, AlbertForMultipleChoice};
--- a/src/bert/embeddings.rs
+++ b/src/bert/embeddings.rs
@ -142,7 +142,10 @@ impl BertEmbedding for BertEmbeddings {
                None => (input_value.apply_t(&self.word_embeddings, train), input_value.size())
            }
            None => match input_embeds {
-                Some(embeds) => (embeds.copy(), vec!(embeds.size()[0], embeds.size()[1])),
+                Some(embeds) => {
+                    let size = vec!(embeds.size()[0], embeds.size()[1]);
+                    (embeds, size)
+                },
                None => { return Err("Only one of input ids or input embeddings may be set"); }
            }
        };
--- a/src/electra/embeddings.rs
+++ b/src/electra/embeddings.rs
@ -25,7 +25,6 @@ pub struct ElectraEmbeddings {
    token_type_embeddings: nn::Embedding,
    layer_norm: nn::LayerNorm,
    dropout: Dropout,
-    padding_index: i64,
 }

 impl ElectraEmbeddings {
@ -57,7 +56,7 @@ impl ElectraEmbeddings {
        let layer_norm_config = nn::LayerNormConfig { eps: layer_norm_eps, ..Default::default() };
        let layer_norm: nn::LayerNorm = nn::layer_norm(p / "LayerNorm", vec![config.embedding_size], layer_norm_config);
        let dropout: Dropout = Dropout::new(config.hidden_dropout_prob);
-        ElectraEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout, padding_index: 1 }
+        ElectraEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout}
    }

    pub fn forward_t(&self,
--- a/src/lib.rs
+++ b/src/lib.rs
@ -29,17 +29,17 @@
 //! ```
 //! - Transformer models base architectures with customized heads. These allow to load pre-trained models for customized inference in Rust
 //!
-//!  | |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**
-//! :-----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:
-//! Masked LM|✅ |✅ |✅ | | | |✅| |
-//! Sequence classification|✅ |✅ |✅| | | | | |
-//! Token classification|✅ |✅ | ✅| | | |✅| |
-//! Question answering|✅ |✅ |✅| | | | | |
-//! Multiple choices| |✅ |✅| | | | | |
-//! Next token prediction| | | |✅|✅| | | |
-//! Natural Language Generation| | | |✅|✅| | | |
-//! Summarization| | | | | |✅| | |
-//! Translation| | | | | | | |✅|
+//!  | |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|**ALBERT**
+//! :-----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:
+//! Masked LM|✅ |✅ |✅ | | | |✅| |✅ |
+//! Sequence classification|✅ |✅ |✅| | | | | |✅ |
+//! Token classification|✅ |✅ | ✅| | | |✅| |✅ |
+//! Question answering|✅ |✅ |✅| | | | | |✅ |
+//! Multiple choices| |✅ |✅| | | | | |✅ |
+//! Next token prediction| | | |✅|✅| | | | |
+//! Natural Language Generation| | | |✅|✅| | | | |
+//! Summarization| | | | | |✅| | | |
+//! Translation| | | | | | | |✅| |
 //!
 //! # Loading pre-trained models
 //!
@ -65,6 +65,7 @@ pub mod gpt2;
 pub mod bart;
 pub mod electra;
 pub mod marian;
+pub mod albert;
 mod common;
 pub mod pipelines;

--- a/tests/albert.rs
+++ b/tests/albert.rs
@ -0,0 +1,285 @@
+extern crate failure;
+extern crate dirs;
+
+use tch::{Device, nn, Tensor, no_grad};
+use rust_tokenizers::{TruncationStrategy, Tokenizer, Vocab, AlbertTokenizer};
+use rust_bert::Config;
+use rust_bert::resources::{Resource, RemoteResource, download_resource};
+use rust_bert::albert::{AlbertConfigResources, AlbertVocabResources, AlbertModelResources, AlbertConfig, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForMultipleChoice, AlbertForTokenClassification, AlbertForQuestionAnswering};
+use std::collections::HashMap;
+
+
+/// Loads the pretrained ALBERT base v2 masked language model and checks that the `[MASK]`
+/// token of two sentences is filled with the expected word piece.
+#[test]
+fn albert_masked_lm() -> failure::Fallible<()> {
+    // Remote resources (configuration, vocabulary, weights) and their local download paths
+    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
+    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
+    let weights_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2));
+    let config_path = download_resource(&config_resource)?;
+    let vocab_path = download_resource(&vocab_resource)?;
+    let weights_path = download_resource(&weights_resource)?;
+
+    // Build the masked LM model on CPU and load the pretrained weights into its variable store
+    let device = Device::Cpu;
+    let mut vs = nn::VarStore::new(device);
+    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
+    let config = AlbertConfig::from_file(config_path);
+    let albert_model = AlbertForMaskedLM::new(&vs.root(), &config);
+    vs.load(weights_path)?;
+
+    // Encode both sentences and pad the shorter one with trailing zeros so they can be stacked
+    let sentences = ["Looks like one [MASK] is missing", "It\'s like comparing [MASK] to apples"];
+    let encodings = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
+    let longest = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
+    let padded_rows: Vec<Tensor> = encodings
+        .iter()
+        .map(|encoding| {
+            let mut ids = encoding.token_ids.clone();
+            ids.resize(longest, 0);
+            Tensor::of_slice(&ids)
+        })
+        .collect();
+    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);
+
+    // Inference only: no gradients are tracked during the forward pass
+    let (output, _, _) = no_grad(|| {
+        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
+    });
+
+    // Highest-scoring vocabulary entry at the inspected position of each sentence
+    let first_prediction = output.get(0).get(4).argmax(0, false);
+    let second_prediction = output.get(1).get(6).argmax(0, false);
+    let first_word = tokenizer.vocab().id_to_token(&first_prediction.int64_value(&[]));
+    let second_word = tokenizer.vocab().id_to_token(&second_prediction.int64_value(&[]));
+
+    // "Looks like one [them] is missing" (identical with the original implementation)
+    assert_eq!("▁them", first_word);
+    // "It's like comparing [grapes] to apples"
+    assert_eq!("▁grapes", second_word);
+    assert!((output.double_value(&[0, 0, 0]) - 4.6143).abs() < 1e-4);
+    Ok(())
+}
+
+/// Builds an ALBERT sequence classification model with three dummy labels and checks the shape
+/// of the logits as well as the number of returned hidden states and attentions.
+///
+/// No pretrained weights are loaded, so only shapes and counts are asserted, never values.
+#[test]
+fn albert_for_sequence_classification() -> failure::Fallible<()> {
+    //    Resources paths
+    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
+    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
+    let config_path = download_resource(&config_resource)?;
+    let vocab_path = download_resource(&vocab_resource)?;
+
+    //    Set-up model
+    let device = Device::Cpu;
+    let vs = nn::VarStore::new(device);
+    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
+    let mut config = AlbertConfig::from_file(config_path);
+    // Label ids are contiguous (0, 1, 2): one id per logit column asserted below
+    let mut dummy_label_mapping = HashMap::new();
+    dummy_label_mapping.insert(0, String::from("Positive"));
+    dummy_label_mapping.insert(1, String::from("Negative"));
+    dummy_label_mapping.insert(2, String::from("Neutral"));
+    config.id2label = Some(dummy_label_mapping);
+    config.output_attentions = Some(true);
+    config.output_hidden_states = Some(true);
+    let albert_model = AlbertForSequenceClassification::new(&vs.root(), &config);
+
+    //    Define input: encode both sentences and pad the shorter one with trailing zeros
+    let input = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
+    let tokenized_input = tokenizer.encode_list(input.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
+    let max_len = tokenized_input.iter().map(|input| input.token_ids.len()).max().unwrap();
+    let tokenized_input = tokenized_input
+        .iter()
+        .map(|input| {
+            let mut token_ids = input.token_ids.clone();
+            token_ids.resize(max_len, 0);
+            Tensor::of_slice(&token_ids)
+        })
+        .collect::<Vec<_>>();
+    let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device);
+
+    //    Forward pass
+    let (output, all_hidden_states, all_attentions) = no_grad(|| {
+        albert_model
+            .forward_t(Some(input_tensor),
+                       None,
+                       None,
+                       None,
+                       None,
+                       false)
+    });
+
+    // Two sentences, three labels
+    assert_eq!(output.size(), &[2, 3]);
+    assert_eq!(config.num_hidden_layers as usize, all_hidden_states.unwrap().len());
+    assert_eq!(config.num_hidden_layers as usize, all_attentions.unwrap().len());
+
+    Ok(())
+}
+
+/// Builds an ALBERT multiple choice model and checks the shape of the scores as well as the
+/// number of returned hidden states and attentions.
+///
+/// No pretrained weights are loaded, so only shapes and counts are asserted, never values.
+#[test]
+fn albert_for_multiple_choice() -> failure::Fallible<()> {
+    // Remote resources (configuration, vocabulary) and their local download paths
+    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
+    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
+    let config_path = download_resource(&config_resource)?;
+    let vocab_path = download_resource(&vocab_resource)?;
+
+    // Model on CPU, configured to also return hidden states and attentions
+    let device = Device::Cpu;
+    let vs = nn::VarStore::new(device);
+    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
+    let mut config = AlbertConfig::from_file(config_path);
+    config.output_attentions = Some(true);
+    config.output_hidden_states = Some(true);
+    let albert_model = AlbertForMultipleChoice::new(&vs.root(), &config);
+
+    // Encode both sentences and pad the shorter one with trailing zeros so they can be stacked
+    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
+    let encodings = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
+    let longest = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
+    let padded_rows: Vec<Tensor> = encodings
+        .iter()
+        .map(|encoding| {
+            let mut ids = encoding.token_ids.clone();
+            ids.resize(longest, 0);
+            Tensor::of_slice(&ids)
+        })
+        .collect();
+    // The extra leading dimension turns the two sentences into the choices of a single example
+    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device).unsqueeze(0);
+
+    // Inference only: no gradients are tracked during the forward pass
+    let (output, all_hidden_states, all_attentions) = no_grad(|| {
+        albert_model.forward_t(Some(input_tensor), None, None, None, None, false).unwrap()
+    });
+
+    assert_eq!(output.size(), &[1, 2]);
+    assert_eq!(config.num_hidden_layers as usize, all_hidden_states.unwrap().len());
+    assert_eq!(config.num_hidden_layers as usize, all_attentions.unwrap().len());
+
+    Ok(())
+}
+
+/// Builds an ALBERT token classification model with four dummy labels and checks the shape of
+/// the logits as well as the number of returned hidden states and attentions.
+///
+/// No pretrained weights are loaded, so only shapes and counts are asserted, never values.
+#[test]
+fn albert_for_token_classification() -> failure::Fallible<()> {
+    // Remote resources (configuration, vocabulary) and their local download paths
+    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
+    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
+    let config_path = download_resource(&config_resource)?;
+    let vocab_path = download_resource(&vocab_resource)?;
+
+    // Model on CPU, configured with dummy labels and to also return hidden states and attentions
+    let device = Device::Cpu;
+    let vs = nn::VarStore::new(device);
+    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
+    let mut config = AlbertConfig::from_file(config_path);
+    let mut dummy_label_mapping = HashMap::new();
+    dummy_label_mapping.insert(0, String::from("O"));
+    dummy_label_mapping.insert(1, String::from("LOC"));
+    dummy_label_mapping.insert(2, String::from("PER"));
+    dummy_label_mapping.insert(3, String::from("ORG"));
+    config.id2label = Some(dummy_label_mapping);
+    config.output_attentions = Some(true);
+    config.output_hidden_states = Some(true);
+    let albert_model = AlbertForTokenClassification::new(&vs.root(), &config);
+
+    // Encode both sentences and pad the shorter one with trailing zeros so they can be stacked
+    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
+    let encodings = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
+    let longest = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
+    let padded_rows: Vec<Tensor> = encodings
+        .iter()
+        .map(|encoding| {
+            let mut ids = encoding.token_ids.clone();
+            ids.resize(longest, 0);
+            Tensor::of_slice(&ids)
+        })
+        .collect();
+    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);
+
+    // Inference only: no gradients are tracked during the forward pass
+    let (output, all_hidden_states, all_attentions) = no_grad(|| {
+        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
+    });
+
+    // Two sentences, twelve positions after padding, four labels
+    assert_eq!(output.size(), &[2, 12, 4]);
+    assert_eq!(config.num_hidden_layers as usize, all_hidden_states.unwrap().len());
+    assert_eq!(config.num_hidden_layers as usize, all_attentions.unwrap().len());
+
+    Ok(())
+}
+
+/// Builds an ALBERT question answering model and checks the shape of the start and end scores
+/// as well as the number of returned hidden states and attentions.
+///
+/// No pretrained weights are loaded, so only shapes and counts are asserted, never values.
+#[test]
+fn albert_for_question_answering() -> failure::Fallible<()> {
+    // Remote resources (configuration, vocabulary) and their local download paths
+    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
+    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
+    let config_path = download_resource(&config_resource)?;
+    let vocab_path = download_resource(&vocab_resource)?;
+
+    // Model on CPU, configured to also return hidden states and attentions
+    let device = Device::Cpu;
+    let vs = nn::VarStore::new(device);
+    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
+    let mut config = AlbertConfig::from_file(config_path);
+    config.output_attentions = Some(true);
+    config.output_hidden_states = Some(true);
+    let albert_model = AlbertForQuestionAnswering::new(&vs.root(), &config);
+
+    // Encode both sentences and pad the shorter one with trailing zeros so they can be stacked
+    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
+    let encodings = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
+    let longest = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
+    let padded_rows: Vec<Tensor> = encodings
+        .iter()
+        .map(|encoding| {
+            let mut ids = encoding.token_ids.clone();
+            ids.resize(longest, 0);
+            Tensor::of_slice(&ids)
+        })
+        .collect();
+    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);
+
+    // Inference only: no gradients are tracked during the forward pass
+    let (start_scores, end_scores, all_hidden_states, all_attentions) = no_grad(|| {
+        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
+    });
+
+    // Two sentences, twelve positions after padding
+    assert_eq!(start_scores.size(), &[2, 12]);
+    assert_eq!(end_scores.size(), &[2, 12]);
+    assert_eq!(config.num_hidden_layers as usize, all_hidden_states.unwrap().len());
+    assert_eq!(config.num_hidden_layers as usize, all_attentions.unwrap().len());
+
+    Ok(())
+}
--- a/utils/download-dependencies_albert.py
+++ b/utils/download-dependencies_albert.py
@ -0,0 +1,49 @@
+from transformers import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers.tokenization_albert import PRETRAINED_VOCAB_FILES_MAP
+from transformers.file_utils import get_from_cache
+from pathlib import Path
+import shutil
+import os
+import numpy as np
+import torch
+import subprocess
+
+config_path = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP["albert-base-v2"]
+vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["albert-base-v2"]
+weights_path = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP["albert-base-v2"]
+
+target_path = Path.home() / 'rustbert' / 'albert-base-v2'
+
+temp_config = get_from_cache(config_path)
+temp_vocab = get_from_cache(vocab_path)
+temp_weights = get_from_cache(weights_path)
+
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'spiece.model')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
+nps = {}
+for k, v in weights.items():
+    k = k.replace("gamma", "weight").replace("beta", "bias")
+    nps[k] = np.ascontiguousarray(v.cpu().numpy())
+
+np.savez(target_path / 'model.npz', **nps)
+
+source = str(target_path / 'model.npz')
+target = str(target_path / 'model.ot')
+
+toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()
+
+subprocess.call(
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
+
+os.remove(str(target_path / 'model.bin'))
+os.remove(str(target_path / 'model.npz'))