Merge pull request #54 from guillaume-be/albert_implementation

Albert implementation
This commit is contained in:
guillaume-be 2020-06-22 21:36:09 +02:00 committed by GitHub
commit 0624a5368c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 1747 additions and 27 deletions

View File

@ -8,7 +8,7 @@ repository = "https://github.com/guillaume-be/rust-bert"
documentation = "https://docs.rs/rust-bert"
license = "Apache-2.0"
readme = "README.md"
keywords = ["nlp", "deep-learning", "machine-learning", "bert", "transformers"]
keywords = ["nlp", "deep-learning", "machine-learning", "bert", "transformers", "summarization", "translation", "NER", "classification", "language", "sentiment-analysis", "question-answering"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@ -30,7 +30,7 @@ all-tests = []
features = [ "doc-only" ]
[dependencies]
rust_tokenizers = "~3.1.2"
rust_tokenizers = "~3.1.4"
tch = "~0.1.7"
serde_json = "1.0.51"
serde = {version = "1.0.106", features = ["derive"]}

View File

@ -10,17 +10,17 @@ This repository exposes the model base architecture, task-specific heads (see be
The following models are currently implemented:
| |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|
:-----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:----:
Masked LM|✅ |✅ |✅ | | | |✅| |
Sequence classification|✅ |✅ |✅| | | | | |
Token classification|✅ |✅ | ✅| | | |✅| |
Question answering|✅ |✅ |✅| | | | | |
Multiple choices| |✅ |✅| | | | | |
Next token prediction| | | |✅|✅|✅| | |
Natural Language Generation| | | |✅|✅|✅| | |
Summarization | | | | | |✅| | |
Translation | | | | | |✅| |✅ |
| |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|**ALBERT**|
:-----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:----:|:----:
Masked LM|✅ |✅ |✅ | | | |✅| |✅ |
Sequence classification|✅ |✅ |✅| | | | | |✅ |
Token classification|✅ |✅ | ✅| | | |✅| |✅ |
Question answering|✅ |✅ |✅| | | | | |✅ |
Multiple choices| |✅ |✅| | | | | |✅ |
Next token prediction| | | |✅|✅|✅| | | |
Natural Language Generation| | | |✅|✅|✅| | | |
Summarization | | | | | |✅| | | |
Translation | | | | | |✅| |✅ | |
## Ready-to-use pipelines

77
examples/albert.rs Normal file
View File

@ -0,0 +1,77 @@
// Copyright 2018 Google AI and Google Brain team.
// Copyright 2020-present, the HuggingFace Inc. team.
// Copyright 2020 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
extern crate failure;
use tch::{Device, nn, Tensor, no_grad};
use rust_tokenizers::{AlbertTokenizer, TruncationStrategy, Tokenizer, Vocab};
use rust_bert::Config;
use rust_bert::resources::{Resource, download_resource, RemoteResource};
use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM, AlbertConfigResources, AlbertVocabResources, AlbertModelResources};
/// Runs masked-token prediction with the pretrained `albert-base-v2` model on two example
/// sentences and prints the highest-scoring vocabulary entry for each `[MASK]` position.
fn main() -> failure::Fallible<()> {
    // Resolve the remote configuration, vocabulary and weights to local paths
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
    let weights_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;
    let weights_path = download_resource(&weights_resource)?;

    // Build the masked LM model on CPU, then load the pretrained weights into its variable store
    let device = Device::Cpu;
    let mut vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let config = AlbertConfig::from_file(config_path);
    let albert_model = AlbertForMaskedLM::new(&vs.root(), &config);
    vs.load(weights_path)?;

    // Tokenize the sentences and right-pad every sequence with id 0 up to the longest one,
    // so that they can be stacked into a single (batch, sequence) tensor
    let sentences = ["Looks like one [MASK] is missing", "It was a very nice and [MASK] day"];
    let encodings = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings
        .iter()
        .map(|encoding| encoding.token_ids.len())
        .max()
        .unwrap();
    let mut rows = Vec::with_capacity(encodings.len());
    for encoding in encodings.iter() {
        let mut ids = encoding.token_ids.clone();
        ids.resize(longest, 0);
        rows.push(Tensor::of_slice(&ids));
    }
    let input_tensor = Tensor::stack(rows.as_slice(), 0).to(device);

    // Forward pass (inference only: no gradient tracking, dropout disabled)
    let (output, _, _) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
    });
    println!("{:?}", output.double_value(&[0, 0, 0]));

    // Pick the best-scoring vocabulary id at the masked position of each sentence
    // (position 4 in the first sentence, position 7 in the second)
    let predicted_id_1 = output.get(0).get(4).argmax(0, false).int64_value(&[]);
    let predicted_id_2 = output.get(1).get(7).argmax(0, false).int64_value(&[]);
    let word_1 = tokenizer.vocab().id_to_token(&predicted_id_1);
    let word_2 = tokenizer.vocab().id_to_token(&predicted_id_2);
    println!("{} - {}", predicted_id_1, word_1); // Outputs "_them" : "Looks like one [them] is missing"
    println!("{} - {}", predicted_id_2, word_2); // Outputs "_enjoyable" : "It was a very nice and [enjoyable] day"

    Ok(())
}

View File

@ -8,6 +8,7 @@ use rust_bert::bert::{BertConfigResources, BertVocabResources, BertModelResource
use rust_bert::bart::{BartConfigResources, BartVocabResources, BartMergesResources, BartModelResources};
use rust_bert::resources::{Resource, download_resource, RemoteResource};
use rust_bert::electra::{ElectraConfigResources, ElectraVocabResources, ElectraModelResources};
use rust_bert::albert::{AlbertConfigResources, AlbertVocabResources, AlbertModelResources};
/// This example downloads and caches all dependencies used in model tests. This allows for safe
/// multi-threaded testing (two tests using the same resource would otherwise download the file to
@ -169,6 +170,17 @@ fn download_electra_discriminator() -> failure::Fallible<()> {
Ok(())
}
/// Downloads the configuration, vocabulary and weights of the pretrained `albert-base-v2` model,
/// in that order, stopping at the first download that fails.
fn download_albert_base_v2() -> failure::Fallible<()> {
    // Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
    let resources = [
        Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2)),
        Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2)),
        Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2)),
    ];
    for resource in resources.iter() {
        // Only the side effect matters here; the returned local path is not needed
        download_resource(resource)?;
    }
    Ok(())
}
fn main() -> failure::Fallible<()> {
let _ = download_distil_gpt2();
let _ = download_distilbert_sst2();
@ -183,6 +195,7 @@ fn main() -> failure::Fallible<()> {
let _ = download_bart_cnn();
let _ = download_electra_generator();
let _ = download_electra_discriminator();
let _ = download_albert_base_v2();
Ok(())
}

830
src/albert/albert.rs Normal file
View File

@ -0,0 +1,830 @@
// Copyright 2018 Google AI and Google Brain team.
// Copyright 2020-present, the HuggingFace Inc. team.
// Copyright 2020 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use crate::Config;
use serde::{Deserialize, Serialize};
use crate::albert::embeddings::AlbertEmbeddings;
use crate::albert::encoder::AlbertTransformer;
use tch::{nn, Tensor, Kind};
use crate::common::activations::{_tanh, _gelu_new, _gelu, _relu, _mish};
use tch::nn::Module;
use crate::common::dropout::Dropout;
/// # ALBERT Pretrained model weight files
///
/// Namespace for the weight files of the pretrained ALBERT models known to this crate.
pub struct AlbertModelResources;

impl AlbertModelResources {
    /// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
    ///
    /// Pair of string identifiers: a relative `albert-base-v2/...` file name followed by the remote URL of the file.
    pub const ALBERT_BASE_V2: (&'static str, &'static str) = (
        "albert-base-v2/model.ot",
        "https://cdn.huggingface.co/albert-base-v2/rust_model.ot",
    );
}

/// # ALBERT Pretrained model config files
///
/// Namespace for the configuration files of the pretrained ALBERT models known to this crate.
pub struct AlbertConfigResources;

impl AlbertConfigResources {
    /// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
    ///
    /// Pair of string identifiers: a relative `albert-base-v2/...` file name followed by the remote URL of the file.
    pub const ALBERT_BASE_V2: (&'static str, &'static str) = (
        "albert-base-v2/config.json",
        "https://cdn.huggingface.co/albert-base-v2-config.json",
    );
}

/// # ALBERT Pretrained model vocab files
///
/// Namespace for the SentencePiece vocabulary files of the pretrained ALBERT models known to this crate.
pub struct AlbertVocabResources;

impl AlbertVocabResources {
    /// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
    ///
    /// Pair of string identifiers: a relative `albert-base-v2/...` file name followed by the remote URL of the file.
    pub const ALBERT_BASE_V2: (&'static str, &'static str) = (
        "albert-base-v2/spiece.model",
        "https://cdn.huggingface.co/albert-base-v2-spiece.model",
    );
}
// Variant names are intentionally lower-case: `AlbertConfig::hidden_act` is (de)serialized with the
// derived serde implementation, so the variant identifiers are what appears in the configuration.
#[allow(non_camel_case_types)]
#[derive(Clone, Debug, Serialize, Deserialize)]
/// # Activation function used in the attention layer and masked language model head
///
/// Each variant selects one of the functions of `common::activations`
/// (`_gelu_new`, `_gelu`, `_relu`, `_mish`) when a layer is built from an `AlbertConfig`.
pub enum Activation {
/// Gaussian Error Linear Unit ([Hendrycks et al., 2016,](https://arxiv.org/abs/1606.08415)),
/// evaluated with `_gelu_new`.
/// NOTE(review): presumably the tanh-based approximation of GELU - confirm against `common::activations`
gelu_new,
/// Gaussian Error Linear Unit ([Hendrycks et al., 2016,](https://arxiv.org/abs/1606.08415)),
/// evaluated with `_gelu`
gelu,
/// Rectified Linear Unit, evaluated with `_relu`
relu,
/// Mish ([Misra, 2019](https://arxiv.org/abs/1908.08681)), evaluated with `_mish`
mish,
}
#[derive(Debug, Serialize, Deserialize)]
/// # ALBERT model configuration
/// Defines the ALBERT model architecture (e.g. number of layers, hidden layer size, label mapping...)
///
/// Several optional fields are given a fallback value by the layer consuming them
/// (e.g. `layer_norm_eps` falls back to `1e-12` in the masked language model head and
/// `classifier_dropout_prob` to `0.1` in the sequence classification head).
pub struct AlbertConfig {
/// Activation function, used among others by the masked language model head
pub hidden_act: Activation,
pub attention_probs_dropout_prob: f64,
/// Dropout probability of the sequence classification head (`0.1` is used when `None`)
pub classifier_dropout_prob: Option<f64>,
pub bos_token_id: i64,
pub eos_token_id: i64,
pub down_scale_factor: i64,
/// Embedding dimension: output size of the MLM head `dense` layer and input size of its `decoder`
pub embedding_size: i64,
pub gap_size: i64,
/// Dropout probability, used among others by the token classification head
pub hidden_dropout_prob: f64,
/// Dimension of the hidden states: size of the pooler and input size of the task-specific heads
pub hidden_size: i64,
pub initializer_range: f32,
pub inner_group_num: i64,
pub intermediate_size: i64,
/// Epsilon of the layer normalization (`1e-12` is used by the MLM head when `None`)
pub layer_norm_eps: Option<f64>,
pub max_position_embeddings: i64,
pub net_structure_type: i64,
pub num_attention_heads: i64,
pub num_hidden_groups: i64,
pub num_hidden_layers: i64,
pub num_memory_blocks: i64,
pub pad_token_id: i64,
pub type_vocab_size: i64,
/// Vocabulary size: output size of the MLM head `decoder` layer
pub vocab_size: i64,
pub output_attentions: Option<bool>,
pub output_hidden_states: Option<bool>,
pub is_decoder: Option<bool>,
/// Mapping from label index to label name. Its length defines the number of labels of the
/// sequence and token classification heads, which panic at construction when it is `None`
pub id2label: Option<HashMap<i64, String>>,
pub label2id: Option<HashMap<String, i64>>,
}
// Empty implementation: `AlbertConfig` relies on the methods provided by the `Config` trait
// (e.g. `AlbertConfig::from_file`, as used in the documentation examples of this module).
impl Config<AlbertConfig> for AlbertConfig {}
/// # ALBERT Base model
/// Base architecture for ALBERT models. Task-specific models will be built from this common base model
/// It is made of the following blocks:
/// - `embeddings`: `token`, `position` and `segment_id` embeddings
/// - `encoder`: Encoder (transformer) made of a vector of layers. Each layer is made of a self-attention layer, an intermediate (linear) and output (linear + layer norm) layers. Note that the weights are shared across layers, allowing for a reduction in the model memory footprint.
/// - `pooler`: linear layer applied to the first element of the sequence (the *[CLS]* token under the usual input convention)
/// - `pooler_activation`: Tanh activation function for the pooling layer
pub struct AlbertModel {
// Builds the input representation passed to the encoder
embeddings: AlbertEmbeddings,
// Transformer encoder producing the hidden states (and optionally all hidden states / attentions)
encoder: AlbertTransformer,
// Square (`hidden_size` x `hidden_size`) projection of the hidden state at sequence position 0
pooler: nn::Linear,
// Boxed so that the activation is stored as a plain callable; always `_tanh` for this model
pooler_activation: Box<dyn Fn(&Tensor) -> Tensor>,
}
impl AlbertModel {
    /// Build a new `AlbertModel`
    ///
    /// The embeddings, encoder and pooler variables are registered under the `embeddings`,
    /// `encoder` and `pooler` sub-paths of `p`.
    ///
    /// # Arguments
    ///
    /// * `p` - Variable store path for the root of the ALBERT model
    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
    ///
    /// # Example
    ///
    /// ```no_run
    /// use tch::{nn, Device};
    /// use rust_bert::Config;
    /// use std::path::Path;
    /// use rust_bert::albert::{AlbertConfig, AlbertModel};
    ///
    /// let config_path = Path::new("path/to/config.json");
    /// let device = Device::Cpu;
    /// let p = nn::VarStore::new(device);
    /// let config = AlbertConfig::from_file(config_path);
    /// let albert: AlbertModel = AlbertModel::new(&(&p.root() / "albert"), &config);
    /// ```
    ///
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertModel {
        AlbertModel {
            embeddings: AlbertEmbeddings::new(&(p / "embeddings"), config),
            encoder: AlbertTransformer::new(&(p / "encoder"), config),
            pooler: nn::linear(&(p / "pooler"), config.hidden_size, config.hidden_size, Default::default()),
            pooler_activation: Box::new(_tanh),
        }
    }

    /// Forward pass through the model
    ///
    /// # Arguments
    ///
    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions value 1. If None, every position is set to 1
    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
    ///
    /// # Returns
    ///
    /// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *hidden_size*)
    /// * `pooled_output` - `Tensor` of shape (*batch size*, *hidden_size*), computed from the first element of the sequence
    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* holding the attention tensors returned by the encoder
    ///
    /// # Errors
    ///
    /// Returns an error message when both `input_ids` and `input_embeds` are provided, when
    /// neither is provided, or when the embeddings layer reports an error.
    ///
    /// # Example
    ///
    /// ```no_run
    ///# use tch::{nn, Device, Tensor, no_grad};
    ///# use rust_bert::Config;
    ///# use std::path::Path;
    ///# use tch::kind::Kind::Int64;
    /// use rust_bert::albert::{AlbertConfig, AlbertModel};
    ///# let config_path = Path::new("path/to/config.json");
    ///# let device = Device::Cpu;
    ///# let vs = nn::VarStore::new(device);
    ///# let config = AlbertConfig::from_file(config_path);
    ///# let albert_model: AlbertModel = AlbertModel::new(&vs.root(), &config);
    ///  let (batch_size, sequence_length) = (64, 128);
    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
    ///
    ///  let (output, pooled_output, all_hidden_states, all_attentions) = no_grad(|| {
    ///    albert_model
    ///         .forward_t(Some(input_tensor),
    ///                    Some(mask),
    ///                    Some(token_type_ids),
    ///                    Some(position_ids),
    ///                    None,
    ///                    false).unwrap()
    ///    });
    ///
    /// ```
    ///
    pub fn forward_t(&self,
                     input_ids: Option<Tensor>,
                     mask: Option<Tensor>,
                     token_type_ids: Option<Tensor>,
                     position_ids: Option<Tensor>,
                     input_embeds: Option<Tensor>,
                     train: bool)
                     -> Result<(Tensor, Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>), &'static str> {
        // Exactly one of `input_ids` / `input_embeds` must be given; it fixes the
        // (batch size, sequence length) shape and the device used for the default mask
        let (input_shape, device) = match (&input_ids, &input_embeds) {
            (Some(_), Some(_)) => return Err("Only one of input ids or input embeddings may be set"),
            (Some(ids), None) => (ids.size(), ids.device()),
            (None, Some(embeds)) => {
                let embeds_size = embeds.size();
                (vec![embeds_size[0], embeds_size[1]], embeds.device())
            }
            (None, None) => return Err("At least one of input ids or input embeddings must be set"),
        };

        let mask = mask.unwrap_or_else(|| Tensor::ones(&input_shape, (Kind::Int64, device)));

        // Insert two singleton dimensions after the batch dimension, then turn the 1/0 mask into
        // an additive one: 0 for positions to keep, -10000 for masked positions
        let broadcast_mask = mask.unsqueeze(1).unsqueeze(2);
        let additive_mask: Tensor = (broadcast_mask.ones_like() - broadcast_mask) * -10000.0;

        let embedding_output = self.embeddings.forward_t(input_ids, token_type_ids, position_ids, input_embeds, train)?;

        let (hidden_state, all_hidden_states, all_attentions) =
            self.encoder.forward_t(&embedding_output, Some(additive_mask), train);

        // Pool the hidden state of the first sequence position: linear projection + activation
        let first_position = hidden_state.select(1, 0);
        let pooled_output = (self.pooler_activation)(&self.pooler.forward(&first_position));

        Ok((hidden_state, pooled_output, all_hidden_states, all_attentions))
    }
}
/// # ALBERT masked language model head
/// Maps hidden states to one score per vocabulary entry. It is made of the following blocks:
/// - `dense`: linear layer from `hidden_size` to `embedding_size`
/// - `activation`: activation function selected by the configuration (`hidden_act`)
/// - `layer_norm`: layer normalization over the `embedding_size` dimension
/// - `decoder`: linear layer from `embedding_size` to `vocab_size`
pub struct AlbertMLMHead {
layer_norm: nn::LayerNorm,
dense: nn::Linear,
decoder: nn::Linear,
// Boxed callable so that the function chosen at construction time can be stored in the struct
activation: Box<dyn Fn(&Tensor) -> Tensor>,
}
impl AlbertMLMHead {
    /// Build a new `AlbertMLMHead`
    ///
    /// # Arguments
    ///
    /// * `p` - Variable store path of the head. The variables are registered under its `LayerNorm`, `dense` and `decoder` sub-paths
    /// * `config` - `AlbertConfig` providing the layer sizes, the activation function and the layer normalization epsilon (`1e-12` when not set)
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertMLMHead {
        let layer_norm_config = nn::LayerNormConfig {
            eps: config.layer_norm_eps.unwrap_or(1e-12),
            ..Default::default()
        };

        AlbertMLMHead {
            layer_norm: nn::layer_norm(&(p / "LayerNorm"), vec![config.embedding_size], layer_norm_config),
            dense: nn::linear(&(p / "dense"), config.hidden_size, config.embedding_size, Default::default()),
            decoder: nn::linear(&(p / "decoder"), config.embedding_size, config.vocab_size, Default::default()),
            activation: Box::new(match &config.hidden_act {
                Activation::gelu_new => _gelu_new,
                Activation::gelu => _gelu,
                Activation::relu => _relu,
                Activation::mish => _mish
            }),
        }
    }

    /// Forward pass through the head
    ///
    /// Applies, in order: the `dense` projection, the activation function, the layer
    /// normalization and the `decoder` projection.
    ///
    /// # Arguments
    ///
    /// * `hidden_states` - hidden states produced by the base model
    ///
    /// # Returns
    ///
    /// * `Tensor` of prediction scores, whose last dimension is the output of the `decoder` layer
    pub fn forward(&self, hidden_states: &Tensor) -> Tensor {
        let projected = hidden_states.apply(&self.dense);
        let activated: Tensor = (self.activation)(&projected);
        activated.apply(&self.layer_norm).apply(&self.decoder)
    }
}
/// # ALBERT for masked language model
/// Base ALBERT model with a masked language model head to predict missing tokens, for example `"Looks like one [MASK] is missing" -> "person"`
/// It is made of the following blocks:
/// - `albert`: Base AlbertModel
/// - `predictions`: ALBERT MLM prediction head
pub struct AlbertForMaskedLM {
// Base model, registered under the `albert` sub-path of the variable store
albert: AlbertModel,
// Prediction head applied to the hidden states of the base model, registered under `predictions`
predictions: AlbertMLMHead,
}
impl AlbertForMaskedLM {
    /// Build a new `AlbertForMaskedLM`
    ///
    /// The base model is registered under the `albert` sub-path of `p` and the prediction head
    /// under its `predictions` sub-path.
    ///
    /// # Arguments
    ///
    /// * `p` - Variable store path for the root of the ALBERT model
    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
    ///
    /// # Example
    ///
    /// ```no_run
    /// use tch::{nn, Device};
    /// use rust_bert::Config;
    /// use std::path::Path;
    /// use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM};
    ///
    /// let config_path = Path::new("path/to/config.json");
    /// let device = Device::Cpu;
    /// let p = nn::VarStore::new(device);
    /// let config = AlbertConfig::from_file(config_path);
    /// let albert: AlbertForMaskedLM = AlbertForMaskedLM::new(&p.root(), &config);
    /// ```
    ///
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForMaskedLM {
        AlbertForMaskedLM {
            albert: AlbertModel::new(&(p / "albert"), config),
            predictions: AlbertMLMHead::new(&(p / "predictions"), config),
        }
    }

    /// Forward pass through the model
    ///
    /// # Arguments
    ///
    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions value 1. If None, every position is set to 1
    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
    ///
    /// # Returns
    ///
    /// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*)
    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* holding the attention tensors returned by the encoder
    ///
    /// # Panics
    ///
    /// Panics when the base model returns an error, i.e. when both or neither of `input_ids`
    /// and `input_embeds` are provided, or when the embeddings layer reports an error.
    ///
    /// # Example
    ///
    /// ```no_run
    ///# use tch::{nn, Device, Tensor, no_grad};
    ///# use rust_bert::Config;
    ///# use std::path::Path;
    ///# use tch::kind::Kind::Int64;
    /// use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM};
    ///# let config_path = Path::new("path/to/config.json");
    ///# let device = Device::Cpu;
    ///# let vs = nn::VarStore::new(device);
    ///# let config = AlbertConfig::from_file(config_path);
    ///# let albert_model: AlbertForMaskedLM = AlbertForMaskedLM::new(&vs.root(), &config);
    ///  let (batch_size, sequence_length) = (64, 128);
    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
    ///
    ///  let (output, all_hidden_states, all_attentions) = no_grad(|| {
    ///    albert_model
    ///         .forward_t(Some(input_tensor),
    ///                    Some(mask),
    ///                    Some(token_type_ids),
    ///                    Some(position_ids),
    ///                    None,
    ///                    false)
    ///    });
    ///
    /// ```
    ///
    pub fn forward_t(&self,
                     input_ids: Option<Tensor>,
                     mask: Option<Tensor>,
                     token_type_ids: Option<Tensor>,
                     position_ids: Option<Tensor>,
                     input_embeds: Option<Tensor>,
                     train: bool) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
        // The pooled output of the base model is not needed for token prediction
        let (sequence_output, _pooled_output, all_hidden_states, all_attentions) = self
            .albert
            .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
            .unwrap();
        (self.predictions.forward(&sequence_output), all_hidden_states, all_attentions)
    }
}
/// # ALBERT for sequence classification
/// Base ALBERT model with a classifier head to perform sentence or document-level classification
/// It is made of the following blocks:
/// - `albert`: Base AlbertModel
/// - `dropout`: Dropout layer
/// - `classifier`: linear layer for classification
pub struct AlbertForSequenceClassification {
// Base model, registered under the `albert` sub-path of the variable store
albert: AlbertModel,
// Applied to the pooled output before classification; probability taken from
// `classifier_dropout_prob` (0.1 when not set in the configuration)
dropout: Dropout,
// Projects the pooled output (`hidden_size`) to one logit per label of `id2label`
classifier: nn::Linear,
}
impl AlbertForSequenceClassification {
    /// Build a new `AlbertForSequenceClassification`
    ///
    /// The dropout probability is read from `classifier_dropout_prob` (0.1 when not set) and the
    /// number of output labels is the number of entries of `id2label`.
    ///
    /// # Arguments
    ///
    /// * `p` - Variable store path for the root of the ALBERT model
    /// * `config` - `AlbertConfig` object defining the model architecture and decoder status
    ///
    /// # Panics
    ///
    /// Panics when `config.id2label` is `None`.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use tch::{nn, Device};
    /// use rust_bert::Config;
    /// use std::path::Path;
    /// use rust_bert::albert::{AlbertConfig, AlbertForSequenceClassification};
    ///
    /// let config_path = Path::new("path/to/config.json");
    /// let device = Device::Cpu;
    /// let p = nn::VarStore::new(device);
    /// let config = AlbertConfig::from_file(config_path);
    /// let albert: AlbertForSequenceClassification = AlbertForSequenceClassification::new(&p.root(), &config);
    /// ```
    ///
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForSequenceClassification {
        let albert = AlbertModel::new(&(p / "albert"), config);
        let dropout = Dropout::new(config.classifier_dropout_prob.unwrap_or(0.1));
        let num_labels = config
            .id2label
            .as_ref()
            .expect("num_labels not provided in configuration")
            .len() as i64;
        let classifier = nn::linear(&(p / "classifier"), config.hidden_size, num_labels, Default::default());

        AlbertForSequenceClassification { albert, dropout, classifier }
    }

    /// Forward pass through the model
    ///
    /// # Arguments
    ///
    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked positions have value 0, non-masked positions value 1. If None, every position is set to 1
    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
    ///
    /// # Returns
    ///
    /// * `output` - `Tensor` of shape (*batch size*, *num_labels*)
    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* holding the attention tensors returned by the encoder
    ///
    /// # Panics
    ///
    /// Panics when the base model returns an error, i.e. when both or neither of `input_ids`
    /// and `input_embeds` are provided, or when the embeddings layer reports an error.
    ///
    /// # Example
    ///
    /// ```no_run
    ///# use tch::{nn, Device, Tensor, no_grad};
    ///# use rust_bert::Config;
    ///# use std::path::Path;
    ///# use tch::kind::Kind::Int64;
    /// use rust_bert::albert::{AlbertConfig, AlbertForSequenceClassification};
    ///# let config_path = Path::new("path/to/config.json");
    ///# let device = Device::Cpu;
    ///# let vs = nn::VarStore::new(device);
    ///# let config = AlbertConfig::from_file(config_path);
    ///# let albert_model: AlbertForSequenceClassification = AlbertForSequenceClassification::new(&vs.root(), &config);
    ///  let (batch_size, sequence_length) = (64, 128);
    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
    ///
    ///  let (output, all_hidden_states, all_attentions) = no_grad(|| {
    ///    albert_model
    ///         .forward_t(Some(input_tensor),
    ///                    Some(mask),
    ///                    Some(token_type_ids),
    ///                    Some(position_ids),
    ///                    None,
    ///                    false)
    ///    });
    ///
    /// ```
    ///
    pub fn forward_t(&self,
                     input_ids: Option<Tensor>,
                     mask: Option<Tensor>,
                     token_type_ids: Option<Tensor>,
                     position_ids: Option<Tensor>,
                     input_embeds: Option<Tensor>,
                     train: bool) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
        // Classification relies on the pooled output only; the per-token hidden state is unused
        let (_sequence_output, pooled_output, all_hidden_states, all_attentions) = self
            .albert
            .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
            .unwrap();
        let regularized = pooled_output.apply_t(&self.dropout, train);
        (regularized.apply(&self.classifier), all_hidden_states, all_attentions)
    }
}
/// # ALBERT for token classification (e.g. NER, POS)
/// Token-level classifier predicting a label for each token provided. Note that because of SentencePiece tokenization, the labels predicted are
/// not necessarily aligned with words in the sentence.
/// It is made of the following blocks:
/// - `albert`: Base AlbertModel
/// - `dropout`: Dropout to apply on the encoder last hidden states
/// - `classifier`: Linear layer for token classification
pub struct AlbertForTokenClassification {
// Base model, registered under the `albert` sub-path of the variable store
albert: AlbertModel,
// Probability taken from `hidden_dropout_prob` of the configuration
dropout: Dropout,
// Projects each hidden state (`hidden_size`) to one logit per label of `id2label`
classifier: nn::Linear,
}
impl AlbertForTokenClassification {
/// Build a new `AlbertForTokenClassification`
///
/// # Arguments
///
/// * `p` - Variable store path for the root of the ALBERT model
/// * `config` - `AlbertConfig` object defining the model architecture and decoder status
///
/// # Example
///
/// ```no_run
/// use tch::{nn, Device};
/// use rust_bert::Config;
/// use std::path::Path;
/// use rust_bert::albert::{AlbertConfig, AlbertForTokenClassification};
///
/// let config_path = Path::new("path/to/config.json");
/// let device = Device::Cpu;
/// let p = nn::VarStore::new(device);
/// let config = AlbertConfig::from_file(config_path);
/// let albert: AlbertForTokenClassification = AlbertForTokenClassification::new(&p.root(), &config);
/// ```
///
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForTokenClassification {
let albert = AlbertModel::new(&(p / "albert"), config);
let dropout = Dropout::new(config.hidden_dropout_prob);
let num_labels = config.id2label.as_ref().expect("num_labels not provided in configuration").len() as i64;
let classifier = nn::linear(&(p / "classifier"), config.hidden_size, num_labels, Default::default());
AlbertForTokenClassification { albert, dropout, classifier }
}
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
/// # Returns
///
/// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *num_labels*) containing the logits for each of the input tokens and classes
/// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
/// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
///
/// # Example
///
/// ```no_run
///# use tch::{nn, Device, Tensor, no_grad};
///# use rust_bert::Config;
///# use std::path::Path;
///# use tch::kind::Kind::Int64;
/// use rust_bert::albert::{AlbertConfig, AlbertForTokenClassification};
///# let config_path = Path::new("path/to/config.json");
///# let device = Device::Cpu;
///# let vs = nn::VarStore::new(device);
///# let config = AlbertConfig::from_file(config_path);
///# let albert_model: AlbertForTokenClassification = AlbertForTokenClassification::new(&vs.root(), &config);
/// let (batch_size, sequence_length) = (64, 128);
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
/// let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
/// let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
/// let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
///
/// let (output, all_hidden_states, all_attentions) = no_grad(|| {
/// albert_model
/// .forward_t(Some(input_tensor),
/// Some(mask),
/// Some(token_type_ids),
/// Some(position_ids),
/// None,
/// false)
/// });
///
/// ```
///
pub fn forward_t(
    &self,
    input_ids: Option<Tensor>,
    mask: Option<Tensor>,
    token_type_ids: Option<Tensor>,
    position_ids: Option<Tensor>,
    input_embeds: Option<Tensor>,
    train: bool,
) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
    // Run the base model. The result is unwrapped, so an error reported by the
    // base model becomes a panic here.
    let base_output = self
        .albert
        .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
        .unwrap();
    // Only the first output is classified; the second element is not used by this head.
    let (token_states, _, hidden_states, attentions) = base_output;
    let regularized = token_states.apply_t(&self.dropout, train);
    (regularized.apply(&self.classifier), hidden_states, attentions)
}
}
/// # ALBERT for question answering
/// Extractive question-answering model based on an ALBERT language model. Identifies the segment of a context that answers a provided question.
/// Please note that a significant amount of pre- and post-processing is required to perform end-to-end question answering.
/// See the question answering pipeline (also provided in this crate) for more details.
/// It is made of the following blocks:
/// - `albert`: Base AlbertModel
/// - `qa_outputs`: Linear layer for question answering (two outputs per token, split by `forward_t` into start and end logits)
pub struct AlbertForQuestionAnswering {
albert: AlbertModel,
qa_outputs: nn::Linear,
}
impl AlbertForQuestionAnswering {
    /// Build a new `AlbertForQuestionAnswering`
    ///
    /// # Arguments
    ///
    /// * `p` - Variable store path for the root of the ALBERT model
    /// * `config` - `AlbertConfig` object defining the model architecture
    ///
    /// # Example
    ///
    /// ```no_run
    /// use tch::{nn, Device};
    /// use rust_bert::Config;
    /// use std::path::Path;
    /// use rust_bert::albert::{AlbertConfig, AlbertForQuestionAnswering};
    ///
    /// let config_path = Path::new("path/to/config.json");
    /// let device = Device::Cpu;
    /// let p = nn::VarStore::new(device);
    /// let config = AlbertConfig::from_file(config_path);
    /// let albert: AlbertForQuestionAnswering = AlbertForQuestionAnswering::new(&p.root(), &config);
    /// ```
    ///
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForQuestionAnswering {
        let albert = AlbertModel::new(&(p / "albert"), config);
        // Two outputs per token: one start logit and one end logit.
        let qa_outputs = nn::linear(&(p / "qa_outputs"), config.hidden_size, 2, Default::default());
        AlbertForQuestionAnswering { albert, qa_outputs }
    }

    /// Forward pass through the model
    ///
    /// # Arguments
    ///
    /// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
    /// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
    /// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
    ///
    /// # Returns
    ///
    /// * `start_scores` - `Tensor` of shape (*batch size*, *sequence_length*) containing the logits for start of the answer
    /// * `end_scores` - `Tensor` of shape (*batch size*, *sequence_length*) containing the logits for end of the answer
    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* holding the attention weights, of shape (*batch size*, *num_attention_heads*, *sequence_length*, *sequence_length*)
    ///
    /// # Panics
    ///
    /// Panics if the base model returns an error, since its result is unwrapped.
    ///
    /// # Example
    ///
    /// ```no_run
    ///# use tch::{nn, Device, Tensor, no_grad};
    ///# use rust_bert::Config;
    ///# use std::path::Path;
    ///# use tch::kind::Kind::Int64;
    /// use rust_bert::albert::{AlbertConfig, AlbertForQuestionAnswering};
    ///# let config_path = Path::new("path/to/config.json");
    ///# let device = Device::Cpu;
    ///# let vs = nn::VarStore::new(device);
    ///# let config = AlbertConfig::from_file(config_path);
    ///# let albert_model: AlbertForQuestionAnswering = AlbertForQuestionAnswering::new(&vs.root(), &config);
    ///  let (batch_size, sequence_length) = (64, 128);
    ///  let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
    ///  let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
    ///  let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
    ///
    ///  let (start_logits, end_logits, all_hidden_states, all_attentions) = no_grad(|| {
    ///    albert_model
    ///         .forward_t(Some(input_tensor),
    ///                    Some(mask),
    ///                    Some(token_type_ids),
    ///                    Some(position_ids),
    ///                    None,
    ///                    false)
    ///    });
    ///
    /// ```
    ///
    pub fn forward_t(
        &self,
        input_ids: Option<Tensor>,
        mask: Option<Tensor>,
        token_type_ids: Option<Tensor>,
        position_ids: Option<Tensor>,
        input_embeds: Option<Tensor>,
        train: bool,
    ) -> (Tensor, Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
        let (token_states, _, hidden_states, attentions) = self
            .albert
            .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
            .unwrap();
        // Split the two-unit projection along its last axis into chunks of size 1,
        // then drop that trailing axis from each chunk.
        let split_logits = token_states.apply(&self.qa_outputs).split(1, -1);
        let start_logits = split_logits[0].squeeze1(-1);
        let end_logits = split_logits[1].squeeze1(-1);
        (start_logits, end_logits, hidden_states, attentions)
    }
}
/// # ALBERT for multiple choices
/// Multiple choices model using an ALBERT base model and a linear classifier.
/// Input should be in the form `[CLS] Context [SEP] Possible choice [SEP]`.
/// NOTE(review): `forward_t` reads the number of choices from the second dimension of its input
/// and flattens all leading dimensions before calling the base model, i.e. it appears to expect inputs
/// shaped (*batch size*, *num_choices*, *sequence_length*) rather than alternatives laid out along the batch axis - confirm.
/// It is made of the following blocks:
/// - `albert`: Base AlbertModel
/// - `dropout`: Dropout for hidden states output
/// - `classifier`: Linear layer for multiple choices (a single output per flattened input row)
pub struct AlbertForMultipleChoice {
albert: AlbertModel,
dropout: Dropout,
classifier: nn::Linear,
}
impl AlbertForMultipleChoice {
    /// Build a new `AlbertForMultipleChoice`
    ///
    /// # Arguments
    ///
    /// * `p` - Variable store path for the root of the ALBERT model
    /// * `config` - `AlbertConfig` object defining the model architecture
    ///
    /// # Example
    ///
    /// ```no_run
    /// use tch::{nn, Device};
    /// use rust_bert::Config;
    /// use std::path::Path;
    /// use rust_bert::albert::{AlbertConfig, AlbertForMultipleChoice};
    ///
    /// let config_path = Path::new("path/to/config.json");
    /// let device = Device::Cpu;
    /// let p = nn::VarStore::new(device);
    /// let config = AlbertConfig::from_file(config_path);
    /// let albert: AlbertForMultipleChoice = AlbertForMultipleChoice::new(&p.root(), &config);
    /// ```
    ///
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForMultipleChoice {
        let albert = AlbertModel::new(&(p / "albert"), config);
        let dropout = Dropout::new(config.hidden_dropout_prob);
        // One score per (example, choice) pair; scores are regrouped per example in `forward_t`.
        let num_labels = 1;
        let classifier = nn::linear(&(p / "classifier"), config.hidden_size, num_labels, Default::default());
        AlbertForMultipleChoice { albert, dropout, classifier }
    }

    /// Forward pass through the model
    ///
    /// The number of choices is read from the second dimension of the input. All leading dimensions are
    /// flattened before the base model is called, and the resulting scores are regrouped into rows of *num_choices*.
    ///
    /// # Arguments
    ///
    /// * `input_ids` - Optional input tensor of shape (*batch size*, *num_choices*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
    /// * `mask` - Optional mask of shape (*batch size*, *num_choices*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
    /// * `token_type_ids` - Optional segment id of shape (*batch size*, *num_choices*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
    /// * `position_ids` - Optional position ids of shape (*batch size*, *num_choices*, *sequence_length*). If None, will be incremented from 0.
    /// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *num_choices*, *sequence_length*, *embedding_size*). If None, input ids must be provided (see `input_ids`)
    /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
    ///
    /// # Returns
    ///
    /// * `output` - `Tensor` of shape (*batch size*, *num_choices*) containing the logits for each of the alternatives given
    /// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size* x *num_choices*, *sequence_length*, *hidden_size*)
    /// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* holding the attention weights, of shape (*batch size* x *num_choices*, *num_attention_heads*, *sequence_length*, *sequence_length*)
    ///
    /// # Errors
    ///
    /// Returns an error if both `input_ids` and `input_embeds` are provided, or if neither is.
    ///
    /// # Panics
    ///
    /// Panics if an input has too few dimensions, or if the base model returns an error (its result is unwrapped).
    ///
    /// # Example
    ///
    /// ```no_run
    ///# use tch::{nn, Device, Tensor, no_grad};
    ///# use rust_bert::Config;
    ///# use std::path::Path;
    ///# use tch::kind::Kind::Int64;
    /// use rust_bert::albert::{AlbertConfig, AlbertForMultipleChoice};
    ///# let config_path = Path::new("path/to/config.json");
    ///# let device = Device::Cpu;
    ///# let vs = nn::VarStore::new(device);
    ///# let config = AlbertConfig::from_file(config_path);
    ///# let albert_model: AlbertForMultipleChoice = AlbertForMultipleChoice::new(&vs.root(), &config);
    ///  let (batch_size, num_choices, sequence_length) = (64, 4, 128);
    ///  let input_tensor = Tensor::rand(&[batch_size, num_choices, sequence_length], (Int64, device));
    ///  let mask = Tensor::zeros(&[batch_size, num_choices, sequence_length], (Int64, device));
    ///  let token_type_ids = Tensor::zeros(&[batch_size, num_choices, sequence_length], (Int64, device));
    ///  let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, num_choices, sequence_length], true);
    ///
    ///  let (output, all_hidden_states, all_attentions) = no_grad(|| {
    ///    albert_model
    ///         .forward_t(Some(input_tensor),
    ///                    Some(mask),
    ///                    Some(token_type_ids),
    ///                    Some(position_ids),
    ///                    None,
    ///                    false).unwrap()
    ///    });
    ///
    /// ```
    ///
    pub fn forward_t(&self,
                     input_ids: Option<Tensor>,
                     mask: Option<Tensor>,
                     token_type_ids: Option<Tensor>,
                     position_ids: Option<Tensor>,
                     input_embeds: Option<Tensor>,
                     train: bool) -> Result<(Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>), &'static str> {
        // Collapses every leading dimension into one, keeping only the last dimension.
        fn flatten_leading_dims(value: Tensor) -> Tensor {
            let last_dim = *value.size().last().unwrap();
            value.view((-1, last_dim))
        }

        let (input_ids, input_embeds, num_choices) = match (&input_ids, &input_embeds) {
            (Some(_), Some(_)) => { return Err("Only one of input ids or input embeddings may be set"); }
            (None, None) => { return Err("At least one of input ids or input embeddings must be set"); }
            (Some(ids), None) => {
                let dims = ids.size();
                (Some(ids.view((-1, *dims.last().unwrap()))), None, dims[1])
            }
            (None, Some(embeds)) => {
                // Keep the last two dimensions (sequence and embedding) and flatten everything
                // before them, mirroring what is done for `input_ids`. Using dimensions 1 and 2
                // here would fold the embedding axis into the batch for 4-D inputs.
                let dims = embeds.size();
                let rank = dims.len();
                (None, Some(embeds.view((-1, dims[rank - 2], dims[rank - 1]))), dims[1])
            }
        };

        let mask = mask.map(flatten_leading_dims);
        let token_type_ids = token_type_ids.map(flatten_leading_dims);
        let position_ids = position_ids.map(flatten_leading_dims);

        // NOTE(review): the base model result is unwrapped; if its error type is `&'static str`
        // this could be propagated with `?` instead - confirm against `AlbertModel::forward_t`.
        let (_, pooled_output, all_hidden_states, all_attentions) = self.albert.forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train).unwrap();
        // Regroup the per-row scores so that each output row holds the scores of one example's choices.
        let logits = pooled_output.apply_t(&self.dropout, train).apply(&self.classifier).view((-1, num_choices));
        Ok((logits, all_hidden_states, all_attentions))
    }
}

107
src/albert/attention.rs Normal file
View File

@ -0,0 +1,107 @@
// Copyright 2018 Google AI and Google Brain team.
// Copyright 2020-present, the HuggingFace Inc. team.
// Copyright 2020 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::common::dropout::Dropout;
use tch::{nn, Tensor};
use crate::albert::AlbertConfig;
use tch::kind::Kind::Float;
/// # Self-attention block of an ALBERT layer
/// Holds the query/key/value projections, the output projection (`dense`), a dropout and the layer
/// normalization that `forward_t` applies to the sum of its input and the projected attention context.
#[derive(Debug)]
pub struct AlbertSelfAttention {
/// Number of attention heads, copied from the configuration
num_attention_heads: i64,
/// Size of a single head: `hidden_size / num_attention_heads`
attention_head_size: i64,
/// Hidden size of the model, copied from the configuration
hidden_size: i64,
/// Dropout built from `attention_probs_dropout_prob`; applied to the attention weights and to the projected context
dropout: Dropout,
/// When true, `forward_t` also returns the attention weights
output_attentions: bool,
/// Linear projections (hidden size to hidden size) producing queries, keys and values
query: nn::Linear,
key: nn::Linear,
value: nn::Linear,
/// Output projection; its weights and bias are used directly in an einsum in `forward_t`
dense: nn::Linear,
/// Layer normalization applied after the residual connection
layer_norm: nn::LayerNorm,
}
impl AlbertSelfAttention {
    /// Builds the attention block, registering its variables under `p`
    /// (`query`, `key`, `value`, `dense` and `LayerNorm`).
    ///
    /// # Panics
    ///
    /// Panics if the hidden size is not a multiple of the number of attention heads.
    pub fn new(p: nn::Path, config: &AlbertConfig) -> AlbertSelfAttention {
        assert_eq!(config.hidden_size % config.num_attention_heads, 0, "Hidden size not a multiple of the number of attention heads");
        let hidden = config.hidden_size;

        let query = nn::linear(&p / "query", hidden, hidden, Default::default());
        let key = nn::linear(&p / "key", hidden, hidden, Default::default());
        let value = nn::linear(&p / "value", hidden, hidden, Default::default());
        let dense = nn::linear(&p / "dense", hidden, hidden, Default::default());

        // Missing optional configuration entries fall back to `1e-12` / `false`.
        let layer_norm_config = nn::LayerNormConfig {
            eps: config.layer_norm_eps.unwrap_or(1e-12),
            ..Default::default()
        };
        let layer_norm = nn::layer_norm(&p / "LayerNorm", vec![hidden], layer_norm_config);

        AlbertSelfAttention {
            num_attention_heads: config.num_attention_heads,
            attention_head_size: hidden / config.num_attention_heads,
            hidden_size: hidden,
            dropout: Dropout::new(config.attention_probs_dropout_prob),
            output_attentions: config.output_attentions.unwrap_or(false),
            query,
            key,
            value,
            dense,
            layer_norm,
        }
    }

    /// Reshapes `x` to (`bs`, -1, heads, `dim_per_head`) and swaps axes 1 and 2,
    /// so that the head axis comes before the sequence axis.
    fn split_heads(&self, x: Tensor, bs: i64, dim_per_head: i64) -> Tensor {
        let per_head = x.view((bs, -1, self.num_attention_heads, dim_per_head));
        per_head.transpose(1, 2)
    }

    /// Applies self-attention followed by the output projection, a residual
    /// connection with the input and layer normalization.
    ///
    /// # Arguments
    ///
    /// * `input_ids` - Input tensor; despite its name it is projected by the linear layers and added back as the residual
    /// * `mask` - Optional tensor added to the raw attention scores before the softmax
    /// * `train` - Enables the dropout layers when true
    ///
    /// # Returns
    ///
    /// The normalized output and, when `output_attentions` is set, the attention weights.
    pub fn forward_t(&self,
                     input_ids: &Tensor,
                     mask: &Option<Tensor>,
                     train: bool) -> (Tensor, Option<Tensor>) {
        let batch_size = *input_ids.size().first().unwrap();
        let head_dim = self.attention_head_size;

        let key_layer = self.split_heads(input_ids.apply(&self.key), batch_size, head_dim);
        let value_layer = self.split_heads(input_ids.apply(&self.value), batch_size, head_dim);
        let query_layer = self.split_heads(input_ids.apply(&self.query), batch_size, head_dim);

        // Scale the queries by 1 / sqrt(head size) before computing the scores.
        let query_layer: Tensor = query_layer / (head_dim as f64).sqrt();
        let raw_scores = query_layer.matmul(&key_layer.transpose(-1, -2));
        let scores = match mask {
            Some(additive_mask) => raw_scores + additive_mask,
            None => raw_scores,
        };

        let weights = scores.softmax(-1, Float).apply_t(&self.dropout, train);
        let context = weights.matmul(&value_layer).transpose(1, 2).contiguous();

        // Apply the output projection per head: the dense weights are viewed as
        // (heads, head size, hidden size) and contracted with the context.
        let projection = self.dense.ws
            .transpose(0, 1)
            .view((self.num_attention_heads, head_dim, self.hidden_size));
        let context: Tensor = Tensor::einsum("bfnd,ndh->bfh", &[context, projection]) + &self.dense.bs;

        let residual = input_ids + context.apply_t(&self.dropout, train);
        let output = residual.apply(&self.layer_norm);
        let attention = if self.output_attentions { Some(weights) } else { None };
        (output, attention)
    }
}

102
src/albert/embeddings.rs Normal file
View File

@ -0,0 +1,102 @@
// Copyright 2018 Google AI and Google Brain team.
// Copyright 2020-present, the HuggingFace Inc. team.
// Copyright 2020 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use tch::{nn, Tensor, Kind};
use crate::common::dropout::Dropout;
use crate::albert::AlbertConfig;
use tch::nn::{EmbeddingConfig, embedding};
/// # Embeddings implementation for Albert model
/// Sums word, position and token type embeddings, then applies layer normalization and dropout.
#[derive(Debug)]
pub struct AlbertEmbeddings {
word_embeddings: nn::Embedding,
position_embeddings: nn::Embedding,
token_type_embeddings: nn::Embedding,
layer_norm: nn::LayerNorm,
dropout: Dropout,
}
impl AlbertEmbeddings {
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertEmbeddings {
let embedding_config = EmbeddingConfig {
padding_idx: config.pad_token_id,
..Default::default()
};
let word_embeddings: nn::Embedding = embedding(p / "word_embeddings",
config.vocab_size,
config.embedding_size,
embedding_config);
let position_embeddings: nn::Embedding = embedding(p / "position_embeddings",
config.max_position_embeddings,
config.embedding_size,
Default::default());
let token_type_embeddings: nn::Embedding = embedding(p / "token_type_embeddings",
config.type_vocab_size,
config.embedding_size,
Default::default());
let layer_norm_eps = match config.layer_norm_eps {
Some(value) => value,
None => 1e-12
};
let layer_norm_config = nn::LayerNormConfig { eps: layer_norm_eps, ..Default::default() };
let layer_norm: nn::LayerNorm = nn::layer_norm(p / "LayerNorm", vec![config.embedding_size], layer_norm_config);
let dropout: Dropout = Dropout::new(config.hidden_dropout_prob);
AlbertEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout}
}
pub fn forward_t(&self,
input_ids: Option<Tensor>,
token_type_ids: Option<Tensor>,
position_ids: Option<Tensor>,
input_embeds: Option<Tensor>,
train: bool) -> Result<Tensor, &'static str> {
let (input_embeddings, input_shape) = match input_ids {
Some(input_value) => match input_embeds {
Some(_) => { return Err("Only one of input ids or input embeddings may be set"); }
None => (input_value.apply_t(&self.word_embeddings, train), input_value.size())
}
None => match input_embeds {
Some(embeds) => {
let size = vec!(embeds.size()[0], embeds.size()[1]);
(embeds, size)
},
None => { return Err("Only one of input ids or input embeddings may be set"); }
}
};
let seq_length = input_embeddings.as_ref().size()[1].to_owned();
let position_ids = match position_ids {
Some(value) => value,
None => Tensor::arange(seq_length, (Kind::Int64, input_embeddings.device()))
.unsqueeze(0).
expand(&input_shape, true)
};
let token_type_ids = match token_type_ids {
Some(value) => value,
None => Tensor::zeros(&input_shape, (Kind::Int64, input_embeddings.device()))
};
let position_embeddings = position_ids.apply(&self.position_embeddings);
let token_type_embeddings = token_type_ids.apply(&self.token_type_embeddings);
let input_embeddings: Tensor = input_embeddings + position_embeddings + token_type_embeddings;
Ok(input_embeddings.apply(&self.layer_norm).apply_t(&self.dropout, train))
}
}

198
src/albert/encoder.rs Normal file
View File

@ -0,0 +1,198 @@
// Copyright 2018 Google AI and Google Brain team.
// Copyright 2020-present, the HuggingFace Inc. team.
// Copyright 2020 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::albert::attention::AlbertSelfAttention;
use tch::{nn, Tensor};
use crate::albert::AlbertConfig;
use crate::albert::albert::Activation;
use crate::common::activations::{_gelu_new, _gelu, _relu, _mish};
use std::borrow::BorrowMut;
/// # Single ALBERT layer
/// Self-attention followed by a feed-forward network (`ffn`, activation, `ffn_output`),
/// a residual connection with the attention output and a final layer normalization.
pub struct AlbertLayer {
attention: AlbertSelfAttention,
// Layer normalization applied to the sum of the feed-forward output and the attention output
full_layer_layer_norm: nn::LayerNorm,
// Projection from hidden size to intermediate size
ffn: nn::Linear,
// Projection from intermediate size back to hidden size
ffn_output: nn::Linear,
// Activation selected from `config.hidden_act`, applied between the two projections
activation: Box<dyn Fn(&Tensor) -> Tensor>,
}
impl AlbertLayer {
    /// Builds a layer, registering its variables under `p`
    /// (`attention`, `full_layer_layer_norm`, `ffn` and `ffn_output`).
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertLayer {
        let attention = AlbertSelfAttention::new(p / "attention", config);

        // Falls back to 1e-12 when the configuration does not provide an epsilon.
        let layer_norm_config = nn::LayerNormConfig {
            eps: config.layer_norm_eps.unwrap_or(1e-12),
            ..Default::default()
        };
        let full_layer_layer_norm = nn::layer_norm(&(p / "full_layer_layer_norm"), vec![config.hidden_size], layer_norm_config);

        let ffn = nn::linear(&(p / "ffn"), config.hidden_size, config.intermediate_size, Default::default());
        let ffn_output = nn::linear(&(p / "ffn_output"), config.intermediate_size, config.hidden_size, Default::default());

        let activation = Box::new(match &config.hidden_act {
            Activation::gelu_new => _gelu_new,
            Activation::gelu => _gelu,
            Activation::relu => _relu,
            Activation::mish => _mish
        });

        AlbertLayer { attention, full_layer_layer_norm, ffn, ffn_output, activation }
    }

    /// Runs the layer on `hidden_states`.
    ///
    /// Returns the layer output together with the attention weights reported by
    /// the attention block (None unless attention output is enabled there).
    pub fn forward_t(&self,
                     hidden_states: &Tensor,
                     mask: &Option<Tensor>,
                     train: bool) -> (Tensor, Option<Tensor>) {
        let (attention_output, attention_weights) = self.attention.forward_t(hidden_states, mask, train);

        // Feed-forward network: expand, activate, project back.
        let expanded = attention_output.apply(&self.ffn);
        let activated: Tensor = (self.activation)(&expanded);
        let projected = activated.apply(&self.ffn_output);

        // Residual connection with the attention output, then normalization.
        let output = (projected + attention_output).apply(&self.full_layer_layer_norm);
        (output, attention_weights)
    }
}
/// # Group of ALBERT layers
/// Holds `inner_group_num` layers that `forward_t` applies one after the other.
pub struct AlbertLayerGroup {
// When true, the state fed into each layer is collected and returned
output_hidden_states: bool,
// When true, the attention weights of each layer are collected and returned
output_attentions: bool,
layers: Vec<AlbertLayer>,
}
impl AlbertLayerGroup {
    /// Builds a group of `config.inner_group_num` layers, registered under
    /// `p / "albert_layers" / <index>`.
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertLayerGroup {
        let p = &(p / "albert_layers");

        let layers: Vec<AlbertLayer> = (0..config.inner_group_num)
            .map(|layer_index| AlbertLayer::new(&(p / layer_index), config))
            .collect();

        AlbertLayerGroup {
            // Both flags default to false when absent from the configuration.
            output_hidden_states: config.output_hidden_states.unwrap_or(false),
            output_attentions: config.output_attentions.unwrap_or(false),
            layers,
        }
    }

    /// Applies every layer of the group in order.
    ///
    /// # Returns
    ///
    /// * the output of the last layer (a copy of the input if the group is empty)
    /// * when hidden states output is enabled, a copy of the state fed into each layer
    /// * when attentions output is enabled, a copy of each layer's attention weights
    pub fn forward_t(&self,
                     hidden_states: &Tensor,
                     mask: &Option<Tensor>,
                     train: bool)
                     -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Tensor>>) {
        let mut all_hidden_states: Option<Vec<Tensor>> = if self.output_hidden_states { Some(vec!()) } else { None };
        let mut all_attentions: Option<Vec<Tensor>> = if self.output_attentions { Some(vec!()) } else { None };

        let mut current_state = hidden_states.copy();
        for layer in self.layers.iter() {
            // The state is recorded before the layer runs.
            if let Some(collected_states) = all_hidden_states.borrow_mut() {
                collected_states.push(current_state.copy());
            };

            let (next_state, layer_attention) = layer.forward_t(&current_state, mask, train);
            current_state = next_state;

            if let Some(collected_attentions) = all_attentions.borrow_mut() {
                collected_attentions.push(layer_attention.as_ref().unwrap().copy());
            };
        }

        (current_state, all_hidden_states, all_attentions)
    }
}
/// # ALBERT transformer (encoder)
/// Projects the embeddings to the hidden size (`embedding_hidden_mapping_in`), then runs
/// `num_hidden_layers` steps, each step using one of the layer groups in `layers`.
pub struct AlbertTransformer {
// When true, the state fed into each step is collected and returned
output_hidden_states: bool,
// When true, the attention weights of each step are collected and returned
output_attentions: bool,
// Number of steps run by `forward_t`
num_hidden_layers: i64,
// Number of groups the steps are distributed over when selecting a layer group
num_hidden_groups: i64,
// Linear projection from embedding size to hidden size
embedding_hidden_mapping_in: nn::Linear,
layers: Vec<AlbertLayerGroup>,
}
impl AlbertTransformer {
    /// Builds the transformer, registering its variables under `p`
    /// (`embedding_hidden_mapping_in` and `albert_layer_groups / <index>`).
    ///
    /// One layer group is created per hidden group (`config.num_hidden_groups`), which is
    /// the range of group indices `forward_t` selects from.
    pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertTransformer {
        let p_layers = &(p / "albert_layer_groups");

        // Both flags default to false when absent from the configuration.
        let output_attentions = config.output_attentions.unwrap_or(false);
        let output_hidden_states = config.output_hidden_states.unwrap_or(false);

        let embedding_hidden_mapping_in = nn::linear(&(p / "embedding_hidden_mapping_in"), config.embedding_size, config.hidden_size, Default::default());

        // The number of groups is `num_hidden_groups`; `inner_group_num` is the number of
        // layers inside each group and is handled by `AlbertLayerGroup::new`.
        let mut layers: Vec<AlbertLayerGroup> = vec!();
        for layer_index in 0..config.num_hidden_groups {
            layers.push(AlbertLayerGroup::new(&(p_layers / layer_index), config));
        };

        AlbertTransformer {
            output_hidden_states,
            output_attentions,
            num_hidden_layers: config.num_hidden_layers,
            num_hidden_groups: config.num_hidden_groups,
            embedding_hidden_mapping_in,
            layers,
        }
    }

    /// Runs the encoder on the embeddings.
    ///
    /// # Arguments
    ///
    /// * `hidden_states` - Embeddings, projected to the hidden size before the first step
    /// * `mask` - Optional mask forwarded to every layer group
    /// * `train` - Enables the dropout layers when true
    ///
    /// # Returns
    ///
    /// * the output of the last step
    /// * when hidden states output is enabled, a copy of the state fed into each step
    /// * when attentions output is enabled, the attention weights returned by the group used at each step
    pub fn forward_t(&self,
                     hidden_states: &Tensor,
                     mask: Option<Tensor>,
                     train: bool)
                     -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
        let mut hidden_state = hidden_states.apply(&self.embedding_hidden_mapping_in);
        let mut all_hidden_states: Option<Vec<Tensor>> = if self.output_hidden_states { Some(vec!()) } else { None };
        let mut all_attentions: Option<Vec<Vec<Tensor>>> = if self.output_attentions { Some(vec!()) } else { None };

        for i in 0..self.num_hidden_layers {
            // floor(i * groups / layers): identical to i / (layers / groups) when the layers divide
            // evenly into groups, but stays below `num_hidden_groups` (and avoids a division by zero)
            // when they do not.
            let group_idx = i * self.num_hidden_groups / self.num_hidden_layers;
            let layer = &self.layers[group_idx as usize];

            // The state is recorded before the step runs.
            if let Some(hidden_states) = all_hidden_states.borrow_mut() {
                hidden_states.push(hidden_state.as_ref().copy());
            };

            let temp = layer.forward_t(&hidden_state, &mask, train);
            hidden_state = temp.0;
            let attention_weights = temp.1;
            if let Some(attentions) = all_attentions.borrow_mut() {
                attentions.push(attention_weights.unwrap());
            };
        };

        (hidden_state, all_hidden_states, all_attentions)
    }
}

56
src/albert/mod.rs Normal file
View File

@ -0,0 +1,56 @@
//! # ALBERT: A Lite BERT for Self-supervised Learning of Language Representations (Lan et al.)
//!
//! Implementation of the ALBERT language model ([https://arxiv.org/abs/1909.11942](https://arxiv.org/abs/1909.11942) Lan, Chen, Goodman, Gimpel, Sharma, Soricut, 2019).
//! This model offers a greatly reduced memory footprint for similar effective size (number and size of layers). The computational cost remains however similar to the original BERT model.
//! The base model is implemented in the `albert::AlbertModel` struct. Several language model heads have also been implemented, including:
//! - Masked language model: `albert::AlbertForMaskedLM`
//! - Multiple choices: `albert::AlbertForMultipleChoice`
//! - Question answering: `albert::AlbertForQuestionAnswering`
//! - Sequence classification: `albert::AlbertForSequenceClassification`
//! - Token classification (e.g. NER, POS tagging): `albert::AlbertForTokenClassification`
//!
//! # Model set-up and pre-trained weights loading
//!
//! A full working example is provided in `examples/albert.rs`, run with `cargo run --example albert`.
//! The example below illustrates a masked language model; the structure is similar for other models.
//! All models expect the following resources:
//! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
//! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
//! - `AlbertTokenizer` using a SentencePiece model file (e.g. `spiece.model`)
//! Pretrained models are available and can be downloaded using RemoteResources.
//!
//! ```no_run
//!# fn main() -> failure::Fallible<()> {
//!#
//! use rust_tokenizers::AlbertTokenizer;
//! use tch::{nn, Device};
//!# use std::path::PathBuf;
//! use rust_bert::albert::{AlbertForMaskedLM, AlbertConfig};
//! use rust_bert::Config;
//! use rust_bert::resources::{Resource, download_resource, LocalResource};
//!
//! let config_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/config.json")});
//! let vocab_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/vocab.txt")});
//! let weights_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/model.ot")});
//! let config_path = download_resource(&config_resource)?;
//! let vocab_path = download_resource(&vocab_resource)?;
//! let weights_path = download_resource(&weights_resource)?;
//! let device = Device::cuda_if_available();
//! let mut vs = nn::VarStore::new(device);
//! let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, true);
//! let config = AlbertConfig::from_file(config_path);
//! let bert_model = AlbertForMaskedLM::new(&vs.root(), &config);
//! vs.load(weights_path)?;
//!
//!# Ok(())
//!# }
//! ```
mod encoder;
mod attention;
mod embeddings;
mod albert;
pub use albert::{AlbertConfig, AlbertModelResources, AlbertConfigResources, AlbertVocabResources, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForTokenClassification, AlbertForQuestionAnswering, AlbertForMultipleChoice};

View File

@ -142,7 +142,10 @@ impl BertEmbedding for BertEmbeddings {
None => (input_value.apply_t(&self.word_embeddings, train), input_value.size())
}
None => match input_embeds {
Some(embeds) => (embeds.copy(), vec!(embeds.size()[0], embeds.size()[1])),
Some(embeds) => {
let size = vec!(embeds.size()[0], embeds.size()[1]);
(embeds, size)
},
None => { return Err("Only one of input ids or input embeddings may be set"); }
}
};

View File

@ -25,7 +25,6 @@ pub struct ElectraEmbeddings {
token_type_embeddings: nn::Embedding,
layer_norm: nn::LayerNorm,
dropout: Dropout,
padding_index: i64,
}
impl ElectraEmbeddings {
@ -57,7 +56,7 @@ impl ElectraEmbeddings {
let layer_norm_config = nn::LayerNormConfig { eps: layer_norm_eps, ..Default::default() };
let layer_norm: nn::LayerNorm = nn::layer_norm(p / "LayerNorm", vec![config.embedding_size], layer_norm_config);
let dropout: Dropout = Dropout::new(config.hidden_dropout_prob);
ElectraEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout, padding_index: 1 }
ElectraEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout}
}
pub fn forward_t(&self,

View File

@ -29,17 +29,17 @@
//! ```
//! - Transformer models base architectures with customized heads. These allow to load pre-trained models for customized inference in Rust
//!
//! | |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**
//! :-----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:
//! Masked LM|✅ |✅ |✅ | | | |✅| |
//! Sequence classification|✅ |✅ |✅| | | | | |
//! Token classification|✅ |✅ | ✅| | | |✅| |
//! Question answering|✅ |✅ |✅| | | | | |
//! Multiple choices| |✅ |✅| | | | | |
//! Next token prediction| | | |✅|✅| | | |
//! Natural Language Generation| | | |✅|✅| | | |
//! Summarization| | | | | |✅| | |
//! Translation| | | | | | | |✅|
//! | |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|**ALBERT**
//! :-----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:
//! Masked LM|✅ |✅ |✅ | | | |✅| |✅ |
//! Sequence classification|✅ |✅ |✅| | | | | |✅ |
//! Token classification|✅ |✅ | ✅| | | |✅| |✅ |
//! Question answering|✅ |✅ |✅| | | | | |✅ |
//! Multiple choices| |✅ |✅| | | | | |✅ |
//! Next token prediction| | | |✅|✅| | | | |
//! Natural Language Generation| | | |✅|✅| | | | |
//! Summarization| | | | | |✅| | | |
//! Translation| | | | | | | |✅| |
//!
//! # Loading pre-trained models
//!
@ -65,6 +65,7 @@ pub mod gpt2;
pub mod bart;
pub mod electra;
pub mod marian;
pub mod albert;
mod common;
pub mod pipelines;

285
tests/albert.rs Normal file
View File

@ -0,0 +1,285 @@
extern crate failure;
extern crate dirs;
use tch::{Device, nn, Tensor, no_grad};
use rust_tokenizers::{TruncationStrategy, Tokenizer, Vocab, AlbertTokenizer};
use rust_bert::Config;
use rust_bert::resources::{Resource, RemoteResource, download_resource};
use rust_bert::albert::{AlbertConfigResources, AlbertVocabResources, AlbertModelResources, AlbertConfig, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForMultipleChoice, AlbertForTokenClassification, AlbertForQuestionAnswering};
use std::collections::HashMap;
#[test]
fn albert_masked_lm() -> failure::Fallible<()> {
    // Remote resources for the pretrained ALBERT base v2 model. The resources are kept
    // in named bindings for as long as the downloaded paths are in use.
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
    let weights_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;
    let weights_path = download_resource(&weights_resource)?;

    // Build the masked LM model on CPU and load the pretrained weights into it
    let device = Device::Cpu;
    let mut vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let config = AlbertConfig::from_file(config_path);
    let albert_model = AlbertForMaskedLM::new(&vs.root(), &config);
    vs.load(weights_path)?;

    // Encode the sentences and right-pad every sequence with 0 up to the longest one
    let sentences = ["Looks like one [MASK] is missing", "It\'s like comparing [MASK] to apples"];
    let encoded = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encoded.iter().map(|item| item.token_ids.len()).max().unwrap();
    let mut padded_rows = Vec::with_capacity(encoded.len());
    for item in encoded.iter() {
        let mut ids = item.token_ids.clone();
        ids.extend(vec![0; longest - ids.len()]);
        padded_rows.push(Tensor::of_slice(&ids));
    }
    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);

    // Forward pass without gradient tracking
    let (output, _, _) = no_grad(|| {
        albert_model
            .forward_t(Some(input_tensor),
                       None,
                       None,
                       None,
                       None,
                       false)
    });

    // Highest-scoring vocabulary entry at position 4 of the first sentence and position 6 of the second
    let index_1 = output.get(0).get(4).argmax(0, false);
    let index_2 = output.get(1).get(6).argmax(0, false);
    let word_1 = tokenizer.vocab().id_to_token(&index_1.int64_value(&[]));
    let word_2 = tokenizer.vocab().id_to_token(&index_2.int64_value(&[]));

    assert_eq!("▁them", word_1); // "Looks like one [them] is missing" (? this is identical with the original implementation)
    assert_eq!("▁grapes", word_2); // "It's like comparing [grapes] to apples"
    assert!((output.double_value(&[0, 0, 0]) - 4.6143).abs() < 1e-4);

    Ok(())
}
/// Shape check for the ALBERT sequence classification head.
///
/// No pretrained weights are loaded into the variable store, so only the output shape and
/// the number of returned hidden states / attention tensors are verified.
#[test]
fn albert_for_sequence_classification() -> failure::Fallible<()> {
    // Resolve the remote configuration and vocabulary to local paths
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(
        AlbertConfigResources::ALBERT_BASE_V2,
    ));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(
        AlbertVocabResources::ALBERT_BASE_V2,
    ));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;

    // Set-up model
    let device = Device::Cpu;
    let vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer =
        AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let mut config = AlbertConfig::from_file(config_path);

    // Three dummy classes with contiguous ids 0..=2, so every class index has a label
    let mut dummy_label_mapping = HashMap::new();
    dummy_label_mapping.insert(0, String::from("Positive"));
    dummy_label_mapping.insert(1, String::from("Negative"));
    dummy_label_mapping.insert(2, String::from("Neutral"));
    config.id2label = Some(dummy_label_mapping);
    // Request the intermediate outputs so that they can be counted below
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
    let albert_model = AlbertForSequenceClassification::new(&vs.root(), &config);

    // Encode both sentences and right-pad the token ids with zeros to the longest sequence
    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let encodings =
        tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings
        .iter()
        .map(|encoding| encoding.token_ids.len())
        .max()
        .unwrap();
    let mut rows = Vec::with_capacity(encodings.len());
    for encoding in encodings.iter() {
        let mut ids = encoding.token_ids.clone();
        ids.resize(longest, 0);
        rows.push(Tensor::of_slice(&ids));
    }
    let input_tensor = Tensor::stack(rows.as_slice(), 0).to(device);

    // Forward pass without gradient tracking
    let (output, all_hidden_states, all_attentions) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
    });

    // 2 input sentences, 3 entries in the label mapping
    assert_eq!(output.size(), &[2, 3]);
    let expected_layers = config.num_hidden_layers as usize;
    assert_eq!(expected_layers, all_hidden_states.unwrap().len());
    assert_eq!(expected_layers, all_attentions.unwrap().len());

    Ok(())
}
/// Shape check for the ALBERT multiple choice head.
///
/// No pretrained weights are loaded into the variable store, so only the output shape and
/// the number of returned hidden states / attention tensors are verified.
#[test]
fn albert_for_multiple_choice() -> failure::Fallible<()> {
    // Resolve the remote configuration and vocabulary to local paths
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(
        AlbertConfigResources::ALBERT_BASE_V2,
    ));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(
        AlbertVocabResources::ALBERT_BASE_V2,
    ));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;

    // Set-up model, requesting the intermediate outputs so that they can be counted below
    let device = Device::Cpu;
    let vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer =
        AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let mut config = AlbertConfig::from_file(config_path);
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
    let albert_model = AlbertForMultipleChoice::new(&vs.root(), &config);

    // Encode both sentences and right-pad the token ids with zeros to the longest sequence
    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let encodings =
        tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings
        .iter()
        .map(|encoding| encoding.token_ids.len())
        .max()
        .unwrap();
    let mut rows = Vec::with_capacity(encodings.len());
    for encoding in encodings.iter() {
        let mut ids = encoding.token_ids.clone();
        ids.resize(longest, 0);
        rows.push(Tensor::of_slice(&ids));
    }
    // A leading dimension of size 1 is added in front of the two stacked sequences
    let input_tensor = Tensor::stack(rows.as_slice(), 0)
        .to(device)
        .unsqueeze(0);

    // Forward pass without gradient tracking; this head returns a result that is unwrapped here
    let (output, all_hidden_states, all_attentions) = no_grad(|| {
        albert_model
            .forward_t(Some(input_tensor), None, None, None, None, false)
            .unwrap()
    });

    assert_eq!(output.size(), &[1, 2]);
    let expected_layers = config.num_hidden_layers as usize;
    assert_eq!(expected_layers, all_hidden_states.unwrap().len());
    assert_eq!(expected_layers, all_attentions.unwrap().len());

    Ok(())
}
/// Shape check for the ALBERT token classification head.
///
/// No pretrained weights are loaded into the variable store, so only the output shape and
/// the number of returned hidden states / attention tensors are verified.
#[test]
fn albert_for_token_classification() -> failure::Fallible<()> {
    // Resolve the remote configuration and vocabulary to local paths
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(
        AlbertConfigResources::ALBERT_BASE_V2,
    ));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(
        AlbertVocabResources::ALBERT_BASE_V2,
    ));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;

    // Set-up model
    let device = Device::Cpu;
    let vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer =
        AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let mut config = AlbertConfig::from_file(config_path);

    // Four dummy entity labels
    let mut dummy_label_mapping = HashMap::new();
    dummy_label_mapping.insert(0, String::from("O"));
    dummy_label_mapping.insert(1, String::from("LOC"));
    dummy_label_mapping.insert(2, String::from("PER"));
    dummy_label_mapping.insert(3, String::from("ORG"));
    config.id2label = Some(dummy_label_mapping);
    // Request the intermediate outputs so that they can be counted below
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
    let albert_model = AlbertForTokenClassification::new(&vs.root(), &config);

    // Encode both sentences and right-pad the token ids with zeros to the longest sequence
    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let encodings =
        tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings
        .iter()
        .map(|encoding| encoding.token_ids.len())
        .max()
        .unwrap();
    let mut rows = Vec::with_capacity(encodings.len());
    for encoding in encodings.iter() {
        let mut ids = encoding.token_ids.clone();
        ids.resize(longest, 0);
        rows.push(Tensor::of_slice(&ids));
    }
    let input_tensor = Tensor::stack(rows.as_slice(), 0).to(device);

    // Forward pass without gradient tracking
    let (output, all_hidden_states, all_attentions) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
    });

    // 2 input sentences, 12 positions after padding, 4 entries in the label mapping
    assert_eq!(output.size(), &[2, 12, 4]);
    let expected_layers = config.num_hidden_layers as usize;
    assert_eq!(expected_layers, all_hidden_states.unwrap().len());
    assert_eq!(expected_layers, all_attentions.unwrap().len());

    Ok(())
}
/// Shape check for the ALBERT question answering head.
///
/// No pretrained weights are loaded into the variable store, so only the shapes of the start
/// and end scores and the number of returned hidden states / attention tensors are verified.
#[test]
fn albert_for_question_answering() -> failure::Fallible<()> {
    // Resolve the remote configuration and vocabulary to local paths
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(
        AlbertConfigResources::ALBERT_BASE_V2,
    ));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(
        AlbertVocabResources::ALBERT_BASE_V2,
    ));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;

    // Set-up model, requesting the intermediate outputs so that they can be counted below
    let device = Device::Cpu;
    let vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer =
        AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let mut config = AlbertConfig::from_file(config_path);
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
    let albert_model = AlbertForQuestionAnswering::new(&vs.root(), &config);

    // Encode both sentences and right-pad the token ids with zeros to the longest sequence
    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let encodings =
        tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings
        .iter()
        .map(|encoding| encoding.token_ids.len())
        .max()
        .unwrap();
    let mut rows = Vec::with_capacity(encodings.len());
    for encoding in encodings.iter() {
        let mut ids = encoding.token_ids.clone();
        ids.resize(longest, 0);
        rows.push(Tensor::of_slice(&ids));
    }
    let input_tensor = Tensor::stack(rows.as_slice(), 0).to(device);

    // Forward pass without gradient tracking
    let (start_scores, end_scores, all_hidden_states, all_attentions) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
    });

    // One start and one end score per position: 2 input sentences, 12 positions after padding
    assert_eq!(start_scores.size(), &[2, 12]);
    assert_eq!(end_scores.size(), &[2, 12]);
    let expected_layers = config.num_hidden_layers as usize;
    assert_eq!(expected_layers, all_hidden_states.unwrap().len());
    assert_eq!(expected_layers, all_attentions.unwrap().len());

    Ok(())
}

View File

@ -0,0 +1,49 @@
from transformers import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers.tokenization_albert import PRETRAINED_VOCAB_FILES_MAP
from transformers.file_utils import get_from_cache
from pathlib import Path
import shutil
import os
import numpy as np
import torch
import subprocess
# Fetch the `albert-base-v2` configuration, vocabulary and PyTorch weights, copy them to
# ~/rustbert/albert-base-v2 and convert the weights to a `model.ot` file by running the
# `convert-tensor` binary of this repository through cargo.

# Remote locations of the pretrained files
config_path = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP["albert-base-v2"]
vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["albert-base-v2"]
weights_path = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP["albert-base-v2"]

target_path = Path.home() / 'rustbert' / 'albert-base-v2'

# Local copies of the files, as returned by the transformers cache helper
temp_config = get_from_cache(config_path)
temp_vocab = get_from_cache(vocab_path)
temp_weights = get_from_cache(weights_path)

os.makedirs(str(target_path), exist_ok=True)

# From here on the *_path names refer to the destination files inside `target_path`
config_path = str(target_path / 'config.json')
vocab_path = str(target_path / 'spiece.model')
model_path = str(target_path / 'model.bin')

shutil.copy(temp_config, config_path)
shutil.copy(temp_vocab, vocab_path)
shutil.copy(temp_weights, model_path)

# Export every tensor as a contiguous numpy array, renaming `gamma`/`beta` occurrences in the
# parameter names to `weight`/`bias`
weights = torch.load(temp_weights, map_location='cpu')
nps = {}
for k, v in weights.items():
    k = k.replace("gamma", "weight").replace("beta", "bias")
    nps[k] = np.ascontiguousarray(v.cpu().numpy())

source = str(target_path / 'model.npz')
target = str(target_path / 'model.ot')
np.savez(source, **nps)

# Cargo.toml is looked up one directory above the directory containing this script
toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()

# `check_call` raises CalledProcessError when the conversion exits with a non-zero status, so a
# failed conversion is reported and the intermediate files below are kept for inspection
# instead of being silently deleted.
subprocess.check_call(
    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])

# Remove the intermediate files once `model.ot` has been written
os.remove(model_path)
os.remove(source)