mirror of
https://github.com/guillaume-be/rust-bert.git
synced 2024-10-05 16:47:24 +03:00
Merge pull request #54 from guillaume-be/albert_implementation
Albert implementation
This commit is contained in:
commit
0624a5368c
@ -8,7 +8,7 @@ repository = "https://github.com/guillaume-be/rust-bert"
|
||||
documentation = "https://docs.rs/rust-bert"
|
||||
license = "Apache-2.0"
|
||||
readme = "README.md"
|
||||
keywords = ["nlp", "deep-learning", "machine-learning", "bert", "transformers"]
|
||||
keywords = ["nlp", "deep-learning", "machine-learning", "bert", "transformers", "summarization", "translation", "NER", "classification", "language", "sentiment-analysis", "question-answering"]
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
@ -30,7 +30,7 @@ all-tests = []
|
||||
features = [ "doc-only" ]
|
||||
|
||||
[dependencies]
|
||||
rust_tokenizers = "~3.1.2"
|
||||
rust_tokenizers = "~3.1.4"
|
||||
tch = "~0.1.7"
|
||||
serde_json = "1.0.51"
|
||||
serde = {version = "1.0.106", features = ["derive"]}
|
||||
|
22
README.md
22
README.md
@ -10,17 +10,17 @@ This repository exposes the model base architecture, task-specific heads (see be
|
||||
|
||||
The following models are currently implemented:
|
||||
|
||||
| |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|
|
||||
:-----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:----:
|
||||
Masked LM|✅ |✅ |✅ | | | |✅| |
|
||||
Sequence classification|✅ |✅ |✅| | | | | |
|
||||
Token classification|✅ |✅ | ✅| | | |✅| |
|
||||
Question answering|✅ |✅ |✅| | | | | |
|
||||
Multiple choices| |✅ |✅| | | | | |
|
||||
Next token prediction| | | |✅|✅|✅| | |
|
||||
Natural Language Generation| | | |✅|✅|✅| | |
|
||||
Summarization | | | | | |✅| | |
|
||||
Translation | | | | | |✅| |✅ |
|
||||
| |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|**ALBERT**|
|
||||
:-----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:----:|:----:
|
||||
Masked LM|✅ |✅ |✅ | | | |✅| |✅ |
|
||||
Sequence classification|✅ |✅ |✅| | | | | |✅ |
|
||||
Token classification|✅ |✅ | ✅| | | |✅| |✅ |
|
||||
Question answering|✅ |✅ |✅| | | | | |✅ |
|
||||
Multiple choices| |✅ |✅| | | | | |✅ |
|
||||
Next token prediction| | | |✅|✅|✅| | | |
|
||||
Natural Language Generation| | | |✅|✅|✅| | | |
|
||||
Summarization | | | | | |✅| | | |
|
||||
Translation | | | | | |✅| |✅ | |
|
||||
|
||||
## Ready-to-use pipelines
|
||||
|
||||
|
77
examples/albert.rs
Normal file
77
examples/albert.rs
Normal file
@ -0,0 +1,77 @@
|
||||
// Copyright 2018 Google AI and Google Brain team.
|
||||
// Copyright 2020-present, the HuggingFace Inc. team.
|
||||
// Copyright 2020 Guillaume Becquin
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
extern crate failure;
|
||||
|
||||
use tch::{Device, nn, Tensor, no_grad};
|
||||
use rust_tokenizers::{AlbertTokenizer, TruncationStrategy, Tokenizer, Vocab};
|
||||
use rust_bert::Config;
|
||||
use rust_bert::resources::{Resource, download_resource, RemoteResource};
|
||||
use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM, AlbertConfigResources, AlbertVocabResources, AlbertModelResources};
|
||||
|
||||
|
||||
fn main() -> failure::Fallible<()> {
|
||||
// Resources paths
|
||||
let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
|
||||
let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
|
||||
let weights_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2));
|
||||
let config_path = download_resource(&config_resource)?;
|
||||
let vocab_path = download_resource(&vocab_resource)?;
|
||||
let weights_path = download_resource(&weights_resource)?;
|
||||
|
||||
// Set-up masked LM model
|
||||
let device = Device::Cpu;
|
||||
let mut vs = nn::VarStore::new(device);
|
||||
let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
|
||||
let config = AlbertConfig::from_file(config_path);
|
||||
let albert_model = AlbertForMaskedLM::new(&vs.root(), &config);
|
||||
vs.load(weights_path)?;
|
||||
|
||||
// Define input
|
||||
let input = ["Looks like one [MASK] is missing", "It was a very nice and [MASK] day"];
|
||||
let tokenized_input = tokenizer.encode_list(input.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
|
||||
let max_len = tokenized_input.iter().map(|input| input.token_ids.len()).max().unwrap();
|
||||
let tokenized_input = tokenized_input.
|
||||
iter().
|
||||
map(|input| input.token_ids.clone()).
|
||||
map(|mut input| {
|
||||
input.extend(vec![0; max_len - input.len()]);
|
||||
input
|
||||
}).
|
||||
map(|input|
|
||||
Tensor::of_slice(&(input))).
|
||||
collect::<Vec<_>>();
|
||||
let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device);
|
||||
|
||||
// Forward pass
|
||||
let (output, _, _) = no_grad(|| {
|
||||
albert_model
|
||||
.forward_t(Some(input_tensor),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
false)
|
||||
});
|
||||
println!("{:?}", output.double_value(&[0, 0, 0]));
|
||||
// Print masked tokens
|
||||
let index_1 = output.get(0).get(4).argmax(0, false);
|
||||
let index_2 = output.get(1).get(7).argmax(0, false);
|
||||
let word_1 = tokenizer.vocab().id_to_token(&index_1.int64_value(&[]));
|
||||
let word_2 = tokenizer.vocab().id_to_token(&index_2.int64_value(&[]));
|
||||
|
||||
println!("{} - {}", &index_1.int64_value(&[]), word_1); // Outputs "_them" : "Looks like one [them] is missing"
|
||||
println!("{} - {}", &index_2.int64_value(&[]), word_2); // Outputs "_enjoyable" : "It was a very nice and [enjoyable] day"
|
||||
|
||||
Ok(())
|
||||
}
|
@ -8,6 +8,7 @@ use rust_bert::bert::{BertConfigResources, BertVocabResources, BertModelResource
|
||||
use rust_bert::bart::{BartConfigResources, BartVocabResources, BartMergesResources, BartModelResources};
|
||||
use rust_bert::resources::{Resource, download_resource, RemoteResource};
|
||||
use rust_bert::electra::{ElectraConfigResources, ElectraVocabResources, ElectraModelResources};
|
||||
use rust_bert::albert::{AlbertConfigResources, AlbertVocabResources, AlbertModelResources};
|
||||
|
||||
/// This example downloads and caches all dependencies used in model tests. This allows for safe
|
||||
/// multi threaded testing (two test using the same resource would otherwise download the file to
|
||||
@ -169,6 +170,17 @@ fn download_electra_discriminator() -> failure::Fallible<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn download_albert_base_v2() -> failure::Fallible<()> {
|
||||
// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
|
||||
let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
|
||||
let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
|
||||
let weights_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2));
|
||||
let _ = download_resource(&config_resource)?;
|
||||
let _ = download_resource(&vocab_resource)?;
|
||||
let _ = download_resource(&weights_resource)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> failure::Fallible<()> {
|
||||
let _ = download_distil_gpt2();
|
||||
let _ = download_distilbert_sst2();
|
||||
@ -183,6 +195,7 @@ fn main() -> failure::Fallible<()> {
|
||||
let _ = download_bart_cnn();
|
||||
let _ = download_electra_generator();
|
||||
let _ = download_electra_discriminator();
|
||||
let _ = download_albert_base_v2();
|
||||
|
||||
Ok(())
|
||||
}
|
830
src/albert/albert.rs
Normal file
830
src/albert/albert.rs
Normal file
@ -0,0 +1,830 @@
|
||||
// Copyright 2018 Google AI and Google Brain team.
|
||||
// Copyright 2020-present, the HuggingFace Inc. team.
|
||||
// Copyright 2020 Guillaume Becquin
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
use std::collections::HashMap;
|
||||
use crate::Config;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use crate::albert::embeddings::AlbertEmbeddings;
|
||||
use crate::albert::encoder::AlbertTransformer;
|
||||
use tch::{nn, Tensor, Kind};
|
||||
use crate::common::activations::{_tanh, _gelu_new, _gelu, _relu, _mish};
|
||||
use tch::nn::Module;
|
||||
use crate::common::dropout::Dropout;
|
||||
|
||||
/// # ALBERT pretrained model weight files
/// Namespace type: it carries no data and only hosts associated constants naming the available weight files.
pub struct AlbertModelResources;
|
||||
|
||||
/// # ALBERT pretrained model configuration files
/// Namespace type: it carries no data and only hosts associated constants naming the available configuration files.
pub struct AlbertConfigResources;
|
||||
|
||||
/// # ALBERT pretrained model vocabulary files
/// Namespace type: it carries no data and only hosts associated constants naming the available vocabulary files.
pub struct AlbertVocabResources;
|
||||
|
||||
impl AlbertModelResources {
|
||||
/// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
|
||||
pub const ALBERT_BASE_V2: (&'static str, &'static str) = ("albert-base-v2/model.ot", "https://cdn.huggingface.co/albert-base-v2/rust_model.ot");
|
||||
}
|
||||
|
||||
impl AlbertConfigResources {
|
||||
/// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
|
||||
pub const ALBERT_BASE_V2: (&'static str, &'static str) = ("albert-base-v2/config.json", "https://cdn.huggingface.co/albert-base-v2-config.json");
|
||||
}
|
||||
|
||||
impl AlbertVocabResources {
|
||||
/// Shared under Apache 2.0 license by the Google team at https://github.com/google-research/ALBERT. Modified with conversion to C-array format.
|
||||
pub const ALBERT_BASE_V2: (&'static str, &'static str) = ("albert-base-v2/spiece.model", "https://cdn.huggingface.co/albert-base-v2-spiece.model");
|
||||
}
|
||||
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
/// # Activation function used in the attention layer and masked language model head
|
||||
pub enum Activation {
|
||||
/// Gaussian Error Linear Unit ([Hendrycks et al., 2016,](https://arxiv.org/abs/1606.08415))
|
||||
gelu_new,
|
||||
/// Gaussian Error Linear Unit ([Hendrycks et al., 2016,](https://arxiv.org/abs/1606.08415))
|
||||
gelu,
|
||||
/// Rectified Linear Unit
|
||||
relu,
|
||||
/// Mish ([Misra, 2019](https://arxiv.org/abs/1908.08681))
|
||||
mish,
|
||||
}
|
||||
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
/// # ALBERT model configuration
|
||||
/// Defines the ALBERT model architecture (e.g. number of layers, hidden layer size, label mapping...)
|
||||
pub struct AlbertConfig {
|
||||
pub hidden_act: Activation,
|
||||
pub attention_probs_dropout_prob: f64,
|
||||
pub classifier_dropout_prob: Option<f64>,
|
||||
pub bos_token_id: i64,
|
||||
pub eos_token_id: i64,
|
||||
pub down_scale_factor: i64,
|
||||
pub embedding_size: i64,
|
||||
pub gap_size: i64,
|
||||
pub hidden_dropout_prob: f64,
|
||||
pub hidden_size: i64,
|
||||
pub initializer_range: f32,
|
||||
pub inner_group_num: i64,
|
||||
pub intermediate_size: i64,
|
||||
pub layer_norm_eps: Option<f64>,
|
||||
pub max_position_embeddings: i64,
|
||||
pub net_structure_type: i64,
|
||||
pub num_attention_heads: i64,
|
||||
pub num_hidden_groups: i64,
|
||||
pub num_hidden_layers: i64,
|
||||
pub num_memory_blocks: i64,
|
||||
pub pad_token_id: i64,
|
||||
pub type_vocab_size: i64,
|
||||
pub vocab_size: i64,
|
||||
pub output_attentions: Option<bool>,
|
||||
pub output_hidden_states: Option<bool>,
|
||||
pub is_decoder: Option<bool>,
|
||||
pub id2label: Option<HashMap<i64, String>>,
|
||||
pub label2id: Option<HashMap<String, i64>>,
|
||||
}
|
||||
|
||||
impl Config<AlbertConfig> for AlbertConfig {}
|
||||
|
||||
/// # ALBERT Base model
|
||||
/// Base architecture for ALBERT models. Task-specific models will be built from this common base model
|
||||
/// It is made of the following blocks:
|
||||
/// - `embeddings`: `token`, `position` and `segment_id` embeddings
|
||||
/// - `encoder`: Encoder (transformer) made of a vector of layers. Each layer is made of a self-attention layer, an intermediate (linear) and output (linear + layer norm) layers. Note that the weights are shared across layers, allowing for a reduction in the model memory footprint.
|
||||
/// - `pooler`: linear layer applied to the first element of the sequence (*[MASK]* token)
|
||||
/// - `pooler_activation`: Tanh activation function for the pooling layer
|
||||
pub struct AlbertModel {
|
||||
embeddings: AlbertEmbeddings,
|
||||
encoder: AlbertTransformer,
|
||||
pooler: nn::Linear,
|
||||
pooler_activation: Box<dyn Fn(&Tensor) -> Tensor>,
|
||||
}
|
||||
|
||||
impl AlbertModel {
|
||||
/// Build a new `AlbertModel`
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `p` - Variable store path for the root of the ALBERT model
|
||||
/// * `config` - `AlbertConfig` object defining the model architecture and decoder status
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// use tch::{nn, Device};
|
||||
/// use rust_bert::Config;
|
||||
/// use std::path::Path;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertModel};
|
||||
///
|
||||
/// let config_path = Path::new("path/to/config.json");
|
||||
/// let device = Device::Cpu;
|
||||
/// let p = nn::VarStore::new(device);
|
||||
/// let config = AlbertConfig::from_file(config_path);
|
||||
/// let albert: AlbertModel = AlbertModel::new(&(&p.root() / "albert"), &config);
|
||||
/// ```
|
||||
///
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertModel {
|
||||
let embeddings = AlbertEmbeddings::new(&(p / "embeddings"), config);
|
||||
let encoder = AlbertTransformer::new(&(p / "encoder"), config);
|
||||
let pooler = nn::linear(&(p / "pooler"), config.hidden_size, config.hidden_size, Default::default());
|
||||
let pooler_activation = Box::new(_tanh);
|
||||
|
||||
AlbertModel { embeddings, encoder, pooler, pooler_activation }
|
||||
}
|
||||
|
||||
/// Forward pass through the model
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
|
||||
/// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
|
||||
/// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
|
||||
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
|
||||
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
|
||||
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
/// * `pooled_output` - `Tensor` of shape (*batch size*, *hidden_size*)
|
||||
/// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
/// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
///# use tch::{nn, Device, Tensor, no_grad};
|
||||
///# use rust_bert::Config;
|
||||
///# use std::path::Path;
|
||||
///# use tch::kind::Kind::Int64;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertModel};
|
||||
///# let config_path = Path::new("path/to/config.json");
|
||||
///# let device = Device::Cpu;
|
||||
///# let vs = nn::VarStore::new(device);
|
||||
///# let config = AlbertConfig::from_file(config_path);
|
||||
///# let albert_model: AlbertModel = AlbertModel::new(&vs.root(), &config);
|
||||
/// let (batch_size, sequence_length) = (64, 128);
|
||||
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
|
||||
///
|
||||
/// let (output, pooled_output, all_hidden_states, all_attentions) = no_grad(|| {
|
||||
/// albert_model
|
||||
/// .forward_t(Some(input_tensor),
|
||||
/// Some(mask),
|
||||
/// Some(token_type_ids),
|
||||
/// Some(position_ids),
|
||||
/// None,
|
||||
/// false).unwrap()
|
||||
/// });
|
||||
///
|
||||
/// ```
|
||||
///
|
||||
pub fn forward_t(&self,
|
||||
input_ids: Option<Tensor>,
|
||||
mask: Option<Tensor>,
|
||||
token_type_ids: Option<Tensor>,
|
||||
position_ids: Option<Tensor>,
|
||||
input_embeds: Option<Tensor>,
|
||||
train: bool)
|
||||
-> Result<(Tensor, Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>), &'static str> {
|
||||
let (input_shape, device) = match &input_ids {
|
||||
Some(input_value) => match &input_embeds {
|
||||
Some(_) => { return Err("Only one of input ids or input embeddings may be set"); }
|
||||
None => (input_value.size(), input_value.device())
|
||||
}
|
||||
None => match &input_embeds {
|
||||
Some(embeds) => (vec!(embeds.size()[0], embeds.size()[1]), embeds.device()),
|
||||
None => { return Err("At least one of input ids or input embeddings must be set"); }
|
||||
}
|
||||
};
|
||||
|
||||
let mask = match mask {
|
||||
Some(value) => value,
|
||||
None => Tensor::ones(&input_shape, (Kind::Int64, device))
|
||||
};
|
||||
|
||||
let extended_attention_mask = mask.unsqueeze(1).unsqueeze(2);
|
||||
let extended_attention_mask: Tensor = (extended_attention_mask.ones_like() - extended_attention_mask) * -10000.0;
|
||||
|
||||
let embedding_output = match self.embeddings.forward_t(input_ids, token_type_ids, position_ids, input_embeds, train) {
|
||||
Ok(value) => value,
|
||||
Err(e) => { return Err(e); }
|
||||
};
|
||||
|
||||
let (hidden_state, all_hidden_states, all_attentions) =
|
||||
self.encoder.forward_t(&embedding_output,
|
||||
Some(extended_attention_mask),
|
||||
train);
|
||||
|
||||
let pooled_output = self.pooler.forward(&hidden_state.select(1, 0));
|
||||
let pooled_output = (self.pooler_activation)(&pooled_output);
|
||||
|
||||
Ok((hidden_state, pooled_output, all_hidden_states, all_attentions))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AlbertMLMHead {
|
||||
layer_norm: nn::LayerNorm,
|
||||
dense: nn::Linear,
|
||||
decoder: nn::Linear,
|
||||
activation: Box<dyn Fn(&Tensor) -> Tensor>,
|
||||
}
|
||||
|
||||
impl AlbertMLMHead {
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertMLMHead {
|
||||
let layer_norm_eps = match config.layer_norm_eps {
|
||||
Some(value) => value,
|
||||
None => 1e-12
|
||||
};
|
||||
let layer_norm_config = nn::LayerNormConfig { eps: layer_norm_eps, ..Default::default() };
|
||||
let layer_norm = nn::layer_norm(&(p / "LayerNorm"), vec![config.embedding_size], layer_norm_config);
|
||||
let dense = nn::linear(&(p / "dense"), config.hidden_size, config.embedding_size, Default::default());
|
||||
let decoder = nn::linear(&(p / "decoder"), config.embedding_size, config.vocab_size, Default::default());
|
||||
|
||||
let activation = Box::new(match &config.hidden_act {
|
||||
Activation::gelu_new => _gelu_new,
|
||||
Activation::gelu => _gelu,
|
||||
Activation::relu => _relu,
|
||||
Activation::mish => _mish
|
||||
});
|
||||
|
||||
AlbertMLMHead { layer_norm, dense, decoder, activation }
|
||||
}
|
||||
|
||||
pub fn forward(&self, hidden_states: &Tensor) -> Tensor {
|
||||
let output: Tensor = (self.activation)(&hidden_states.apply(&self.dense));
|
||||
output.apply(&self.layer_norm).apply(&self.decoder)
|
||||
}
|
||||
}
|
||||
|
||||
/// # ALBERT for masked language model
|
||||
/// Base ALBERT model with a masked language model head to predict missing tokens, for example `"Looks like one [MASK] is missing" -> "person"`
|
||||
/// It is made of the following blocks:
|
||||
/// - `albert`: Base AlbertModel
|
||||
/// - `predictions`: ALBERT MLM prediction head
|
||||
pub struct AlbertForMaskedLM {
|
||||
albert: AlbertModel,
|
||||
predictions: AlbertMLMHead,
|
||||
}
|
||||
|
||||
impl AlbertForMaskedLM {
|
||||
/// Build a new `AlbertForMaskedLM`
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `p` - Variable store path for the root of the ALBERT model
|
||||
/// * `config` - `AlbertConfig` object defining the model architecture and decoder status
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// use tch::{nn, Device};
|
||||
/// use rust_bert::Config;
|
||||
/// use std::path::Path;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM};
|
||||
///
|
||||
/// let config_path = Path::new("path/to/config.json");
|
||||
/// let device = Device::Cpu;
|
||||
/// let p = nn::VarStore::new(device);
|
||||
/// let config = AlbertConfig::from_file(config_path);
|
||||
/// let albert: AlbertForMaskedLM = AlbertForMaskedLM::new(&p.root(), &config);
|
||||
/// ```
|
||||
///
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForMaskedLM {
|
||||
let albert = AlbertModel::new(&(p / "albert"), config);
|
||||
let predictions = AlbertMLMHead::new(&(p / "predictions"), config);
|
||||
|
||||
AlbertForMaskedLM { albert, predictions }
|
||||
}
|
||||
|
||||
/// Forward pass through the model
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
|
||||
/// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
|
||||
/// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
|
||||
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
|
||||
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
|
||||
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*)
|
||||
/// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
/// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
///# use tch::{nn, Device, Tensor, no_grad};
|
||||
///# use rust_bert::Config;
|
||||
///# use std::path::Path;
|
||||
///# use tch::kind::Kind::Int64;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForMaskedLM};
|
||||
///# let config_path = Path::new("path/to/config.json");
|
||||
///# let device = Device::Cpu;
|
||||
///# let vs = nn::VarStore::new(device);
|
||||
///# let config = AlbertConfig::from_file(config_path);
|
||||
///# let albert_model: AlbertForMaskedLM = AlbertForMaskedLM::new(&vs.root(), &config);
|
||||
/// let (batch_size, sequence_length) = (64, 128);
|
||||
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
|
||||
///
|
||||
/// let (output, all_hidden_states, all_attentions) = no_grad(|| {
|
||||
/// albert_model
|
||||
/// .forward_t(Some(input_tensor),
|
||||
/// Some(mask),
|
||||
/// Some(token_type_ids),
|
||||
/// Some(position_ids),
|
||||
/// None,
|
||||
/// false)
|
||||
/// });
|
||||
///
|
||||
/// ```
|
||||
///
|
||||
pub fn forward_t(&self,
|
||||
input_ids: Option<Tensor>,
|
||||
mask: Option<Tensor>,
|
||||
token_type_ids: Option<Tensor>,
|
||||
position_ids: Option<Tensor>,
|
||||
input_embeds: Option<Tensor>,
|
||||
train: bool) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
|
||||
let (hidden_state, _, all_hidden_states, all_attentions) = self.albert.forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train).unwrap();
|
||||
let prediction_scores = self.predictions.forward(&hidden_state);
|
||||
(prediction_scores, all_hidden_states, all_attentions)
|
||||
}
|
||||
}
|
||||
|
||||
/// # ALBERT for sequence classification
|
||||
/// Base ALBERT model with a classifier head to perform sentence or document-level classification
|
||||
/// It is made of the following blocks:
|
||||
/// - `albert`: Base AlbertModel
|
||||
/// - `dropout`: Dropout layer
|
||||
/// - `classifier`: linear layer for classification
|
||||
pub struct AlbertForSequenceClassification {
|
||||
albert: AlbertModel,
|
||||
dropout: Dropout,
|
||||
classifier: nn::Linear,
|
||||
}
|
||||
|
||||
impl AlbertForSequenceClassification {
|
||||
/// Build a new `AlbertForSequenceClassification`
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `p` - Variable store path for the root of the ALBERT model
|
||||
/// * `config` - `AlbertConfig` object defining the model architecture and decoder status
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// use tch::{nn, Device};
|
||||
/// use rust_bert::Config;
|
||||
/// use std::path::Path;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForSequenceClassification};
|
||||
///
|
||||
/// let config_path = Path::new("path/to/config.json");
|
||||
/// let device = Device::Cpu;
|
||||
/// let p = nn::VarStore::new(device);
|
||||
/// let config = AlbertConfig::from_file(config_path);
|
||||
/// let albert: AlbertForSequenceClassification = AlbertForSequenceClassification::new(&p.root(), &config);
|
||||
/// ```
|
||||
///
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForSequenceClassification {
|
||||
let albert = AlbertModel::new(&(p / "albert"), config);
|
||||
let classifier_dropout_prob = match config.classifier_dropout_prob {
|
||||
Some(value) => value,
|
||||
None => 0.1
|
||||
};
|
||||
let dropout = Dropout::new(classifier_dropout_prob);
|
||||
let num_labels = config.id2label.as_ref().expect("num_labels not provided in configuration").len() as i64;
|
||||
let classifier = nn::linear(&(p / "classifier"), config.hidden_size, num_labels, Default::default());
|
||||
|
||||
AlbertForSequenceClassification { albert, dropout, classifier }
|
||||
}
|
||||
|
||||
/// Forward pass through the model
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
|
||||
/// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
|
||||
/// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
|
||||
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
|
||||
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
|
||||
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `output` - `Tensor` of shape (*batch size*, *num_labels*)
|
||||
/// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
/// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
///# use tch::{nn, Device, Tensor, no_grad};
|
||||
///# use rust_bert::Config;
|
||||
///# use std::path::Path;
|
||||
///# use tch::kind::Kind::Int64;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForSequenceClassification};
|
||||
///# let config_path = Path::new("path/to/config.json");
|
||||
///# let device = Device::Cpu;
|
||||
///# let vs = nn::VarStore::new(device);
|
||||
///# let config = AlbertConfig::from_file(config_path);
|
||||
///# let albert_model: AlbertForSequenceClassification = AlbertForSequenceClassification::new(&vs.root(), &config);
|
||||
/// let (batch_size, sequence_length) = (64, 128);
|
||||
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
|
||||
///
|
||||
/// let (output, all_hidden_states, all_attentions) = no_grad(|| {
|
||||
/// albert_model
|
||||
/// .forward_t(Some(input_tensor),
|
||||
/// Some(mask),
|
||||
/// Some(token_type_ids),
|
||||
/// Some(position_ids),
|
||||
/// None,
|
||||
/// false)
|
||||
/// });
|
||||
///
|
||||
/// ```
|
||||
///
|
||||
pub fn forward_t(&self,
                 input_ids: Option<Tensor>,
                 mask: Option<Tensor>,
                 token_type_ids: Option<Tensor>,
                 position_ids: Option<Tensor>,
                 input_embeds: Option<Tensor>,
                 train: bool) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
    // Run the base model; the call panics (via `unwrap`) if the base model reports an error.
    let base_output = self.albert
        .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
        .unwrap();
    // Only the pooled output feeds the classification head; the sequence output is discarded.
    let (_, pooled, hidden_states, attentions) = base_output;
    let regularized = pooled.apply_t(&self.dropout, train);
    (regularized.apply(&self.classifier), hidden_states, attentions)
}
|
||||
}
|
||||
|
||||
/// # ALBERT for token classification (e.g. NER, POS)
|
||||
/// Token-level classifier predicting a label for each token provided. Note that because of SentencePiece tokenization, the labels predicted are
|
||||
/// not necessarily aligned with words in the sentence.
|
||||
/// It is made of the following blocks:
|
||||
/// - `albert`: Base AlbertModel
|
||||
/// - `dropout`: Dropout to apply on the encoder last hidden states
|
||||
/// - `classifier`: Linear layer for token classification
|
||||
pub struct AlbertForTokenClassification {
|
||||
// Base ALBERT model, created under the "albert" sub-path of the variable store
albert: AlbertModel,
|
||||
// Dropout (probability `hidden_dropout_prob`) applied to the sequence output before classification
dropout: Dropout,
|
||||
// Linear head mapping `hidden_size` features to one logit per entry of `id2label`
classifier: nn::Linear,
|
||||
}
|
||||
|
||||
impl AlbertForTokenClassification {
|
||||
/// Build a new `AlbertForTokenClassification`
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `p` - Variable store path for the root of the ALBERT model
|
||||
/// * `config` - `AlbertConfig` object defining the model architecture and decoder status
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// use tch::{nn, Device};
|
||||
/// use rust_bert::Config;
|
||||
/// use std::path::Path;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForTokenClassification};
|
||||
///
|
||||
/// let config_path = Path::new("path/to/config.json");
|
||||
/// let device = Device::Cpu;
|
||||
/// let p = nn::VarStore::new(device);
|
||||
/// let config = AlbertConfig::from_file(config_path);
|
||||
/// let albert: AlbertForTokenClassification = AlbertForTokenClassification::new(&p.root(), &config);
|
||||
/// ```
|
||||
///
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForTokenClassification {
|
||||
let albert = AlbertModel::new(&(p / "albert"), config);
|
||||
let dropout = Dropout::new(config.hidden_dropout_prob);
|
||||
let num_labels = config.id2label.as_ref().expect("num_labels not provided in configuration").len() as i64;
|
||||
let classifier = nn::linear(&(p / "classifier"), config.hidden_size, num_labels, Default::default());
|
||||
|
||||
AlbertForTokenClassification { albert, dropout, classifier }
|
||||
}
|
||||
|
||||
/// Forward pass through the model
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
|
||||
/// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
|
||||
/// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
|
||||
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
|
||||
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
|
||||
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `output` - `Tensor` of shape (*batch size*, *sequence_length*, *num_labels*) containing the logits for each of the input tokens and classes
|
||||
/// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
/// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *num_attention_heads*, *sequence_length*, *sequence_length*)
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
///# use tch::{nn, Device, Tensor, no_grad};
|
||||
///# use rust_bert::Config;
|
||||
///# use std::path::Path;
|
||||
///# use tch::kind::Kind::Int64;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForTokenClassification};
|
||||
///# let config_path = Path::new("path/to/config.json");
|
||||
///# let device = Device::Cpu;
|
||||
///# let vs = nn::VarStore::new(device);
|
||||
///# let config = AlbertConfig::from_file(config_path);
|
||||
///# let albert_model: AlbertForTokenClassification = AlbertForTokenClassification::new(&vs.root(), &config);
|
||||
/// let (batch_size, sequence_length) = (64, 128);
|
||||
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
|
||||
///
|
||||
/// let (output, all_hidden_states, all_attentions) = no_grad(|| {
|
||||
/// albert_model
|
||||
/// .forward_t(Some(input_tensor),
|
||||
/// Some(mask),
|
||||
/// Some(token_type_ids),
|
||||
/// Some(position_ids),
|
||||
/// None,
|
||||
/// false)
|
||||
/// });
|
||||
///
|
||||
/// ```
|
||||
///
|
||||
pub fn forward_t(&self,
                 input_ids: Option<Tensor>,
                 mask: Option<Tensor>,
                 token_type_ids: Option<Tensor>,
                 position_ids: Option<Tensor>,
                 input_embeds: Option<Tensor>,
                 train: bool) -> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
    // Run the base model; the call panics (via `unwrap`) if the base model reports an error.
    let base_output = self.albert
        .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
        .unwrap();
    // Token classification works on the per-token sequence output; the pooled output is unused.
    let (per_token, _, hidden_states, attentions) = base_output;
    let regularized = per_token.apply_t(&self.dropout, train);
    (regularized.apply(&self.classifier), hidden_states, attentions)
}
|
||||
}
|
||||
|
||||
/// # ALBERT for question answering
|
||||
/// Extractive question-answering model based on an ALBERT language model. Identifies the segment of a context that answers a provided question.
|
||||
/// Please note that a significant amount of pre- and post-processing is required to perform end-to-end question answering.
|
||||
/// See the question answering pipeline (also provided in this crate) for more details.
|
||||
/// It is made of the following blocks:
|
||||
/// - `albert`: Base AlbertModel
|
||||
/// - `qa_outputs`: Linear layer for question answering
|
||||
pub struct AlbertForQuestionAnswering {
|
||||
// Base ALBERT model, created under the "albert" sub-path of the variable store
albert: AlbertModel,
|
||||
// Linear head mapping `hidden_size` features to 2 logits per token (answer start and answer end)
qa_outputs: nn::Linear,
|
||||
}
|
||||
|
||||
impl AlbertForQuestionAnswering {
|
||||
/// Build a new `AlbertForQuestionAnswering`
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `p` - Variable store path for the root of the ALBERT model
|
||||
/// * `config` - `AlbertConfig` object defining the model architecture and decoder status
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// use tch::{nn, Device};
|
||||
/// use rust_bert::Config;
|
||||
/// use std::path::Path;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForQuestionAnswering};
|
||||
///
|
||||
/// let config_path = Path::new("path/to/config.json");
|
||||
/// let device = Device::Cpu;
|
||||
/// let p = nn::VarStore::new(device);
|
||||
/// let config = AlbertConfig::from_file(config_path);
|
||||
/// let albert: AlbertForQuestionAnswering = AlbertForQuestionAnswering::new(&p.root(), &config);
|
||||
/// ```
|
||||
///
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForQuestionAnswering {
|
||||
let albert = AlbertModel::new(&(p / "albert"), config);
|
||||
let num_labels = 2;
|
||||
let qa_outputs = nn::linear(&(p / "qa_outputs"), config.hidden_size, num_labels, Default::default());
|
||||
|
||||
AlbertForQuestionAnswering { albert, qa_outputs }
|
||||
}
|
||||
|
||||
/// Forward pass through the model
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
|
||||
/// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
|
||||
/// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
|
||||
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
|
||||
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
|
||||
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `start_scores` - `Tensor` of shape (*batch size*, *sequence_length*) containing the logits for start of the answer
|
||||
/// * `end_scores` - `Tensor` of shape (*batch size*, *sequence_length*) containing the logits for end of the answer
|
||||
/// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
/// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *num_attention_heads*, *sequence_length*, *sequence_length*)
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
///# use tch::{nn, Device, Tensor, no_grad};
|
||||
///# use rust_bert::Config;
|
||||
///# use std::path::Path;
|
||||
///# use tch::kind::Kind::Int64;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForQuestionAnswering};
|
||||
///# let config_path = Path::new("path/to/config.json");
|
||||
///# let device = Device::Cpu;
|
||||
///# let vs = nn::VarStore::new(device);
|
||||
///# let config = AlbertConfig::from_file(config_path);
|
||||
///# let albert_model: AlbertForQuestionAnswering = AlbertForQuestionAnswering::new(&vs.root(), &config);
|
||||
/// let (batch_size, sequence_length) = (64, 128);
|
||||
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
|
||||
///
|
||||
/// let (start_logits, end_logits, all_hidden_states, all_attentions) = no_grad(|| {
|
||||
/// albert_model
|
||||
/// .forward_t(Some(input_tensor),
|
||||
/// Some(mask),
|
||||
/// Some(token_type_ids),
|
||||
/// Some(position_ids),
|
||||
/// None,
|
||||
/// false)
|
||||
/// });
|
||||
///
|
||||
/// ```
|
||||
///
|
||||
pub fn forward_t(&self,
                 input_ids: Option<Tensor>,
                 mask: Option<Tensor>,
                 token_type_ids: Option<Tensor>,
                 position_ids: Option<Tensor>,
                 input_embeds: Option<Tensor>,
                 train: bool) -> (Tensor, Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
    // Run the base model; the call panics (via `unwrap`) if the base model reports an error.
    let base_output = self.albert
        .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
        .unwrap();
    let (per_token, _, hidden_states, attentions) = base_output;

    // The head produces 2 values per token; split them along the last axis into two
    // single-column tensors, then drop that trailing axis from each.
    let span_logits = per_token.apply(&self.qa_outputs).split(1, -1);
    let start_logits = span_logits[0].squeeze1(-1);
    let end_logits = span_logits[1].squeeze1(-1);

    (start_logits, end_logits, hidden_states, attentions)
}
|
||||
}
|
||||
|
||||
/// # ALBERT for multiple choices
|
||||
/// Multiple choices model using an ALBERT base model and a linear classifier.
|
||||
/// Input should be in the form `[CLS] Context [SEP] Possible choice [SEP]`. The choice is made along the batch axis,
|
||||
/// assuming all elements of the batch are alternatives to be chosen from for a given context.
|
||||
/// It is made of the following blocks:
|
||||
/// - `albert`: Base AlbertModel
|
||||
/// - `dropout`: Dropout for hidden states output
|
||||
/// - `classifier`: Linear layer for multiple choices
|
||||
pub struct AlbertForMultipleChoice {
|
||||
// Base ALBERT model, created under the "albert" sub-path of the variable store
albert: AlbertModel,
|
||||
// Dropout (probability `hidden_dropout_prob`) applied to the pooled output before scoring
dropout: Dropout,
|
||||
// Linear head mapping `hidden_size` features to a single score per alternative
classifier: nn::Linear,
|
||||
}
|
||||
|
||||
impl AlbertForMultipleChoice {
|
||||
/// Build a new `AlbertForMultipleChoice`
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `p` - Variable store path for the root of the ALBERT model
|
||||
/// * `config` - `AlbertConfig` object defining the model architecture and decoder status
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
/// use tch::{nn, Device};
|
||||
/// use rust_bert::Config;
|
||||
/// use std::path::Path;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForMultipleChoice};
|
||||
///
|
||||
/// let config_path = Path::new("path/to/config.json");
|
||||
/// let device = Device::Cpu;
|
||||
/// let p = nn::VarStore::new(device);
|
||||
/// let config = AlbertConfig::from_file(config_path);
|
||||
/// let albert: AlbertForMultipleChoice = AlbertForMultipleChoice::new(&p.root(), &config);
|
||||
/// ```
|
||||
///
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertForMultipleChoice {
|
||||
let albert = AlbertModel::new(&(p / "albert"), config);
|
||||
let dropout = Dropout::new(config.hidden_dropout_prob);
|
||||
let num_labels = 1;
|
||||
let classifier = nn::linear(&(p / "classifier"), config.hidden_size, num_labels, Default::default());
|
||||
|
||||
AlbertForMultipleChoice { albert, dropout, classifier }
|
||||
}
|
||||
|
||||
/// Forward pass through the model
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
|
||||
/// * `mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
|
||||
/// * `token_type_ids` - Optional segment id of shape (*batch size*, *sequence_length*). Convention is value of 0 for the first sentence (incl. *[SEP]*) and 1 for the second sentence. If None set to 0.
|
||||
/// * `position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented from 0.
|
||||
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
|
||||
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `output` - `Tensor` of shape (*1*, *batch size*) containing the logits for each of the alternatives given
|
||||
/// * `hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
|
||||
/// * `attentions` - `Option<Vec<Vec<Tensor>>>` of length *num_hidden_layers* of nested length *inner_group_num* with shape (*batch size*, *num_attention_heads*, *sequence_length*, *sequence_length*)
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no_run
|
||||
///# use tch::{nn, Device, Tensor, no_grad};
|
||||
///# use rust_bert::Config;
|
||||
///# use std::path::Path;
|
||||
///# use tch::kind::Kind::Int64;
|
||||
/// use rust_bert::albert::{AlbertConfig, AlbertForMultipleChoice};
|
||||
///# let config_path = Path::new("path/to/config.json");
|
||||
///# let device = Device::Cpu;
|
||||
///# let vs = nn::VarStore::new(device);
|
||||
///# let config = AlbertConfig::from_file(config_path);
|
||||
///# let albert_model: AlbertForMultipleChoice = AlbertForMultipleChoice::new(&vs.root(), &config);
|
||||
/// let (batch_size, sequence_length) = (64, 128);
|
||||
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let token_type_ids = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
|
||||
/// let position_ids = Tensor::arange(sequence_length, (Int64, device)).expand(&[batch_size, sequence_length], true);
|
||||
///
|
||||
/// let (output, all_hidden_states, all_attentions) = no_grad(|| {
|
||||
/// albert_model
|
||||
/// .forward_t(Some(input_tensor),
|
||||
/// Some(mask),
|
||||
/// Some(token_type_ids),
|
||||
/// Some(position_ids),
|
||||
/// None,
|
||||
/// false).unwrap()
|
||||
/// });
|
||||
///
|
||||
/// ```
|
||||
///
|
||||
pub fn forward_t(&self,
                 input_ids: Option<Tensor>,
                 mask: Option<Tensor>,
                 token_type_ids: Option<Tensor>,
                 position_ids: Option<Tensor>,
                 input_embeds: Option<Tensor>,
                 train: bool) -> Result<(Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>), &'static str> {
    // Exactly one of `input_ids` / `input_embeds` must be provided. The number of choices is
    // read from dimension 1 of whichever was given, and every leading dimension is flattened
    // so that the base model receives one row per alternative.
    let (input_ids, input_embeds, num_choices) = match (&input_ids, &input_embeds) {
        (Some(_), Some(_)) => {
            return Err("Only one of input ids or input embeddings may be set");
        }
        (Some(ids), None) => {
            let dims = ids.size();
            if dims.len() < 2 {
                return Err("Input ids must have at least 2 dimensions");
            }
            (Some(ids.view((-1, dims[dims.len() - 1]))), None, dims[1])
        }
        (None, Some(embeds)) => {
            let dims = embeds.size();
            let rank = dims.len();
            if rank < 3 {
                return Err("Input embeddings must have at least 3 dimensions");
            }
            // Keep the two trailing dimensions (sequence length and embedding width) and fold
            // everything before them into the batch axis. For 3-dimensional input this is the
            // same view as before; for input with an explicit choice axis it no longer drops
            // the embedding dimension.
            (None, Some(embeds.view((-1, dims[rank - 2], dims[rank - 1]))), dims[1])
        }
        (None, None) => {
            return Err("At least one of input ids or input embeddings must be set");
        }
    };

    // Auxiliary inputs are flattened the same way as the ids: keep the last axis, fold the rest.
    let flatten = |tensor: Tensor| tensor.view((-1, *tensor.size().last().unwrap()));
    let mask = mask.map(flatten);
    let token_type_ids = token_type_ids.map(flatten);
    let position_ids = position_ids.map(flatten);

    let (_, pooled_output, all_hidden_states, all_attentions) = self.albert
        .forward_t(input_ids, mask, token_type_ids, position_ids, input_embeds, train)
        .unwrap();
    // One score per flattened row, regrouped so that each output row holds `num_choices` scores.
    let logits = pooled_output
        .apply_t(&self.dropout, train)
        .apply(&self.classifier)
        .view((-1, num_choices));

    Ok((logits, all_hidden_states, all_attentions))
}
|
||||
}
|
107
src/albert/attention.rs
Normal file
107
src/albert/attention.rs
Normal file
@ -0,0 +1,107 @@
|
||||
// Copyright 2018 Google AI and Google Brain team.
|
||||
// Copyright 2020-present, the HuggingFace Inc. team.
|
||||
// Copyright 2020 Guillaume Becquin
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use crate::common::dropout::Dropout;
|
||||
use tch::{nn, Tensor};
|
||||
use crate::albert::AlbertConfig;
|
||||
use tch::kind::Kind::Float;
|
||||
|
||||
/// Multi-head self-attention block for ALBERT, including the output projection,
/// residual connection and layer normalization (see `forward_t`).
#[derive(Debug)]
|
||||
pub struct AlbertSelfAttention {
|
||||
// Number of attention heads, copied from the configuration
num_attention_heads: i64,
|
||||
// Per-head feature size: `hidden_size / num_attention_heads`
attention_head_size: i64,
|
||||
// Model hidden size, copied from the configuration
hidden_size: i64,
|
||||
// Dropout (probability `attention_probs_dropout_prob`), applied to both the attention
// weights and the projected context in `forward_t`
dropout: Dropout,
|
||||
// When true, `forward_t` also returns the attention weights
output_attentions: bool,
|
||||
// Query projection (`hidden_size` -> `hidden_size`)
query: nn::Linear,
|
||||
// Key projection (`hidden_size` -> `hidden_size`)
key: nn::Linear,
|
||||
// Value projection (`hidden_size` -> `hidden_size`)
value: nn::Linear,
|
||||
// Output projection; its weights are reshaped per head and applied through an einsum in `forward_t`
dense: nn::Linear,
|
||||
// Layer normalization applied to the sum of the block input and the projected context
layer_norm: nn::LayerNorm,
|
||||
}
|
||||
|
||||
impl AlbertSelfAttention {
|
||||
pub fn new(p: nn::Path, config: &AlbertConfig) -> AlbertSelfAttention {
|
||||
assert_eq!(config.hidden_size % config.num_attention_heads, 0, "Hidden size not a multiple of the number of attention heads");
|
||||
|
||||
let query = nn::linear(&p / "query", config.hidden_size, config.hidden_size, Default::default());
|
||||
let key = nn::linear(&p / "key", config.hidden_size, config.hidden_size, Default::default());
|
||||
let value = nn::linear(&p / "value", config.hidden_size, config.hidden_size, Default::default());
|
||||
let dense = nn::linear(&p / "dense", config.hidden_size, config.hidden_size, Default::default());
|
||||
let dropout = Dropout::new(config.attention_probs_dropout_prob);
|
||||
let attention_head_size = config.hidden_size / config.num_attention_heads;
|
||||
let output_attentions = match config.output_attentions {
|
||||
Some(value) => value,
|
||||
None => false
|
||||
};
|
||||
let layer_norm_eps = match config.layer_norm_eps {
|
||||
Some(value) => value,
|
||||
None => 1e-12
|
||||
};
|
||||
let layer_norm_config = nn::LayerNormConfig { eps: layer_norm_eps, ..Default::default() };
|
||||
let layer_norm = nn::layer_norm(&p / "LayerNorm", vec![config.hidden_size], layer_norm_config);
|
||||
|
||||
AlbertSelfAttention {
|
||||
num_attention_heads: config.num_attention_heads,
|
||||
attention_head_size,
|
||||
hidden_size: config.hidden_size,
|
||||
dropout,
|
||||
output_attentions,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
dense,
|
||||
layer_norm,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reshape `x` to (`bs`, -1, `num_attention_heads`, `dim_per_head`) and swap axes 1 and 2,
/// so that the head axis comes before the inferred axis.
fn split_heads(&self, x: Tensor, bs: i64, dim_per_head: i64) -> Tensor {
    let per_head = x.view((bs, -1, self.num_attention_heads, dim_per_head));
    per_head.transpose(1, 2)
}
|
||||
|
||||
pub fn forward_t(&self,
|
||||
input_ids: &Tensor,
|
||||
mask: &Option<Tensor>,
|
||||
train: bool) -> (Tensor, Option<Tensor>) {
|
||||
|
||||
let bs = *input_ids.size().first().unwrap();
|
||||
|
||||
let key_layer = self.split_heads(input_ids.apply(&self.key), bs, self.attention_head_size);
|
||||
let value_layer = self.split_heads(input_ids.apply(&self.value), bs, self.attention_head_size);
|
||||
let query_layer = self.split_heads(input_ids.apply(&self.query), bs, self.attention_head_size);
|
||||
|
||||
let query_layer: Tensor = query_layer / (self.attention_head_size as f64).sqrt();
|
||||
|
||||
let scores = if let Some(mask) = mask {
|
||||
query_layer.matmul(&key_layer.transpose(-1, -2)) + mask
|
||||
} else {
|
||||
query_layer.matmul(&key_layer.transpose(-1, -2))
|
||||
};
|
||||
|
||||
let weights = scores.softmax(-1, Float).apply_t(&self.dropout, train);
|
||||
let context = weights.matmul(&value_layer).transpose(1, 2).contiguous();
|
||||
|
||||
let w = self.dense.ws
|
||||
.transpose(0, 1)
|
||||
.view((self.num_attention_heads, self.attention_head_size, self.hidden_size));
|
||||
|
||||
let context: Tensor = Tensor::einsum("bfnd,ndh->bfh", &[context, w]) + &self.dense.bs;
|
||||
let context = (input_ids + context.apply_t(&self.dropout, train)).apply(&self.layer_norm);
|
||||
|
||||
if !self.output_attentions {
|
||||
(context, None)
|
||||
} else {
|
||||
(context, Some(weights))
|
||||
}
|
||||
}
|
||||
}
|
102
src/albert/embeddings.rs
Normal file
102
src/albert/embeddings.rs
Normal file
@ -0,0 +1,102 @@
|
||||
// Copyright 2018 Google AI and Google Brain team.
|
||||
// Copyright 2020-present, the HuggingFace Inc. team.
|
||||
// Copyright 2020 Guillaume Becquin
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use tch::{nn, Tensor, Kind};
|
||||
use crate::common::dropout::Dropout;
|
||||
use crate::albert::AlbertConfig;
|
||||
use tch::nn::{EmbeddingConfig, embedding};
|
||||
|
||||
/// # Embeddings implementation for Albert model
|
||||
#[derive(Debug)]
|
||||
/// Sums word, position and token type embeddings, then applies layer normalization and dropout.
|
||||
pub struct AlbertEmbeddings {
|
||||
// Token embeddings (`vocab_size` x `embedding_size`), with `pad_token_id` as padding index
word_embeddings: nn::Embedding,
|
||||
// Position embeddings (`max_position_embeddings` x `embedding_size`)
position_embeddings: nn::Embedding,
|
||||
// Segment embeddings (`type_vocab_size` x `embedding_size`)
token_type_embeddings: nn::Embedding,
|
||||
// Layer normalization over the `embedding_size` axis
layer_norm: nn::LayerNorm,
|
||||
// Dropout (probability `hidden_dropout_prob`) applied after layer normalization
dropout: Dropout,
|
||||
}
|
||||
|
||||
impl AlbertEmbeddings {
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertEmbeddings {
|
||||
let embedding_config = EmbeddingConfig {
|
||||
padding_idx: config.pad_token_id,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let word_embeddings: nn::Embedding = embedding(p / "word_embeddings",
|
||||
config.vocab_size,
|
||||
config.embedding_size,
|
||||
embedding_config);
|
||||
|
||||
let position_embeddings: nn::Embedding = embedding(p / "position_embeddings",
|
||||
config.max_position_embeddings,
|
||||
config.embedding_size,
|
||||
Default::default());
|
||||
|
||||
let token_type_embeddings: nn::Embedding = embedding(p / "token_type_embeddings",
|
||||
config.type_vocab_size,
|
||||
config.embedding_size,
|
||||
Default::default());
|
||||
|
||||
let layer_norm_eps = match config.layer_norm_eps {
|
||||
Some(value) => value,
|
||||
None => 1e-12
|
||||
};
|
||||
let layer_norm_config = nn::LayerNormConfig { eps: layer_norm_eps, ..Default::default() };
|
||||
let layer_norm: nn::LayerNorm = nn::layer_norm(p / "LayerNorm", vec![config.embedding_size], layer_norm_config);
|
||||
let dropout: Dropout = Dropout::new(config.hidden_dropout_prob);
|
||||
AlbertEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout}
|
||||
}
|
||||
|
||||
pub fn forward_t(&self,
|
||||
input_ids: Option<Tensor>,
|
||||
token_type_ids: Option<Tensor>,
|
||||
position_ids: Option<Tensor>,
|
||||
input_embeds: Option<Tensor>,
|
||||
train: bool) -> Result<Tensor, &'static str> {
|
||||
let (input_embeddings, input_shape) = match input_ids {
|
||||
Some(input_value) => match input_embeds {
|
||||
Some(_) => { return Err("Only one of input ids or input embeddings may be set"); }
|
||||
None => (input_value.apply_t(&self.word_embeddings, train), input_value.size())
|
||||
}
|
||||
None => match input_embeds {
|
||||
Some(embeds) => {
|
||||
let size = vec!(embeds.size()[0], embeds.size()[1]);
|
||||
(embeds, size)
|
||||
},
|
||||
None => { return Err("Only one of input ids or input embeddings may be set"); }
|
||||
}
|
||||
};
|
||||
|
||||
let seq_length = input_embeddings.as_ref().size()[1].to_owned();
|
||||
|
||||
let position_ids = match position_ids {
|
||||
Some(value) => value,
|
||||
None => Tensor::arange(seq_length, (Kind::Int64, input_embeddings.device()))
|
||||
.unsqueeze(0).
|
||||
expand(&input_shape, true)
|
||||
};
|
||||
|
||||
let token_type_ids = match token_type_ids {
|
||||
Some(value) => value,
|
||||
None => Tensor::zeros(&input_shape, (Kind::Int64, input_embeddings.device()))
|
||||
};
|
||||
|
||||
let position_embeddings = position_ids.apply(&self.position_embeddings);
|
||||
let token_type_embeddings = token_type_ids.apply(&self.token_type_embeddings);
|
||||
|
||||
let input_embeddings: Tensor = input_embeddings + position_embeddings + token_type_embeddings;
|
||||
Ok(input_embeddings.apply(&self.layer_norm).apply_t(&self.dropout, train))
|
||||
}
|
||||
}
|
198
src/albert/encoder.rs
Normal file
198
src/albert/encoder.rs
Normal file
@ -0,0 +1,198 @@
|
||||
// Copyright 2018 Google AI and Google Brain team.
|
||||
// Copyright 2020-present, the HuggingFace Inc. team.
|
||||
// Copyright 2020 Guillaume Becquin
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use crate::albert::attention::AlbertSelfAttention;
|
||||
use tch::{nn, Tensor};
|
||||
use crate::albert::AlbertConfig;
|
||||
use crate::albert::albert::Activation;
|
||||
use crate::common::activations::{_gelu_new, _gelu, _relu, _mish};
|
||||
use std::borrow::BorrowMut;
|
||||
|
||||
/// Single ALBERT layer: self-attention followed by a two-layer feed-forward network
/// with a residual connection and layer normalization.
pub struct AlbertLayer {
|
||||
// Self-attention block (created under the "attention" sub-path)
attention: AlbertSelfAttention,
|
||||
// Layer normalization applied to the sum of the feed-forward output and the attention output
full_layer_layer_norm: nn::LayerNorm,
|
||||
// First feed-forward projection (`hidden_size` -> `intermediate_size`)
ffn: nn::Linear,
|
||||
// Second feed-forward projection (`intermediate_size` -> `hidden_size`)
ffn_output: nn::Linear,
|
||||
// Activation function selected from `config.hidden_act`, applied between the two projections
activation: Box<dyn Fn(&Tensor) -> Tensor>,
|
||||
}
|
||||
|
||||
impl AlbertLayer {
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertLayer {
|
||||
let attention = AlbertSelfAttention::new(p / "attention", &config);
|
||||
|
||||
let layer_norm_eps = match config.layer_norm_eps {
|
||||
Some(value) => value,
|
||||
None => 1e-12
|
||||
};
|
||||
let layer_norm_config = nn::LayerNormConfig { eps: layer_norm_eps, ..Default::default() };
|
||||
let full_layer_layer_norm = nn::layer_norm(&(p / "full_layer_layer_norm"), vec![config.hidden_size], layer_norm_config);
|
||||
|
||||
let ffn = nn::linear(&(p / "ffn"), config.hidden_size, config.intermediate_size, Default::default());
|
||||
let ffn_output = nn::linear(&(p / "ffn_output"), config.intermediate_size, config.hidden_size, Default::default());
|
||||
|
||||
let activation = Box::new(match &config.hidden_act {
|
||||
Activation::gelu_new => _gelu_new,
|
||||
Activation::gelu => _gelu,
|
||||
Activation::relu => _relu,
|
||||
Activation::mish => _mish
|
||||
});
|
||||
|
||||
AlbertLayer { attention, full_layer_layer_norm, ffn, ffn_output, activation }
|
||||
}
|
||||
|
||||
/// Forward pass through the layer
///
/// # Arguments
///
/// * `hidden_states` - Input tensor passed to the self-attention block
/// * `mask` - Optional mask forwarded unchanged to the self-attention block
/// * `train` - boolean flag forwarded to the self-attention block to turn on/off its dropout
///
/// # Returns
///
/// * `Tensor` - layer output after the feed-forward network, residual connection and layer normalization
/// * `Option<Tensor>` - attention weights as returned by the self-attention block
pub fn forward_t(&self,
                 hidden_states: &Tensor,
                 mask: &Option<Tensor>,
                 train: bool) -> (Tensor, Option<Tensor>) {
    let (attention_output, attention_weights) = self.attention.forward_t(hidden_states, mask, train);

    // Feed-forward network: projection, activation, projection.
    let expanded = attention_output.apply(&self.ffn);
    let activated: Tensor = (self.activation)(&expanded);
    let projected = activated.apply(&self.ffn_output);

    // Residual connection around the feed-forward network, then layer normalization.
    let layer_output = (projected + attention_output).apply(&self.full_layer_layer_norm);

    (layer_output, attention_weights)
}
|
||||
}
|
||||
|
||||
/// Group of ALBERT layers applied one after the other.
pub struct AlbertLayerGroup {
|
||||
// When true, `forward_t` collects the input of every layer in the group
output_hidden_states: bool,
|
||||
// When true, `forward_t` collects the attention weights of every layer in the group
output_attentions: bool,
|
||||
// The `inner_group_num` layers of this group, created under the "albert_layers" sub-path
layers: Vec<AlbertLayer>,
|
||||
}
|
||||
|
||||
impl AlbertLayerGroup {
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertLayerGroup {
|
||||
let p = &(p / "albert_layers");
|
||||
|
||||
let output_attentions = match config.output_attentions {
|
||||
Some(value) => value,
|
||||
None => false
|
||||
};
|
||||
|
||||
let output_hidden_states = match config.output_hidden_states {
|
||||
Some(value) => value,
|
||||
None => false
|
||||
};
|
||||
|
||||
let mut layers: Vec<AlbertLayer> = vec!();
|
||||
for layer_index in 0..config.inner_group_num {
|
||||
layers.push(AlbertLayer::new(&(p / layer_index), config));
|
||||
};
|
||||
|
||||
AlbertLayerGroup { output_hidden_states, output_attentions, layers }
|
||||
}
|
||||
|
||||
pub fn forward_t(&self,
|
||||
hidden_states: &Tensor,
|
||||
mask: &Option<Tensor>,
|
||||
train: bool)
|
||||
-> (Tensor, Option<Vec<Tensor>>, Option<Vec<Tensor>>) {
|
||||
let mut all_hidden_states: Option<Vec<Tensor>> = if self.output_hidden_states { Some(vec!()) } else { None };
|
||||
let mut all_attentions: Option<Vec<Tensor>> = if self.output_attentions { Some(vec!()) } else { None };
|
||||
|
||||
let mut hidden_state = hidden_states.copy();
|
||||
let mut attention_weights: Option<Tensor>;
|
||||
let mut layers = self.layers.iter();
|
||||
loop {
|
||||
match layers.next() {
|
||||
Some(layer) => {
|
||||
if let Some(hidden_states) = all_hidden_states.borrow_mut() {
|
||||
hidden_states.push(hidden_state.as_ref().copy());
|
||||
};
|
||||
|
||||
let temp = layer.forward_t(&hidden_state, &mask, train);
|
||||
hidden_state = temp.0;
|
||||
attention_weights = temp.1;
|
||||
if let Some(attentions) = all_attentions.borrow_mut() {
|
||||
attentions.push(attention_weights.as_ref().unwrap().copy());
|
||||
};
|
||||
}
|
||||
None => break
|
||||
};
|
||||
};
|
||||
|
||||
(hidden_state, all_hidden_states, all_attentions)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AlbertTransformer {
|
||||
output_hidden_states: bool,
|
||||
output_attentions: bool,
|
||||
num_hidden_layers: i64,
|
||||
num_hidden_groups: i64,
|
||||
embedding_hidden_mapping_in: nn::Linear,
|
||||
layers: Vec<AlbertLayerGroup>,
|
||||
}
|
||||
|
||||
impl AlbertTransformer {
|
||||
pub fn new(p: &nn::Path, config: &AlbertConfig) -> AlbertTransformer {
|
||||
let p_layers = &(p / "albert_layer_groups");
|
||||
|
||||
let output_attentions = match config.output_attentions {
|
||||
Some(value) => value,
|
||||
None => false
|
||||
};
|
||||
|
||||
let output_hidden_states = match config.output_hidden_states {
|
||||
Some(value) => value,
|
||||
None => false
|
||||
};
|
||||
|
||||
let embedding_hidden_mapping_in = nn::linear(&(p / "embedding_hidden_mapping_in"), config.embedding_size, config.hidden_size, Default::default());
|
||||
|
||||
let mut layers: Vec<AlbertLayerGroup> = vec!();
|
||||
for layer_index in 0..config.inner_group_num {
|
||||
layers.push(AlbertLayerGroup::new(&(p_layers / layer_index), config));
|
||||
};
|
||||
|
||||
AlbertTransformer {
|
||||
output_hidden_states,
|
||||
output_attentions,
|
||||
num_hidden_layers: config.num_hidden_layers,
|
||||
num_hidden_groups: config.num_hidden_groups,
|
||||
embedding_hidden_mapping_in,
|
||||
layers,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forward_t(&self,
|
||||
hidden_states: &Tensor,
|
||||
mask: Option<Tensor>,
|
||||
train: bool)
|
||||
-> (Tensor, Option<Vec<Tensor>>, Option<Vec<Vec<Tensor>>>) {
|
||||
let mut hidden_state = hidden_states.apply(&self.embedding_hidden_mapping_in);
|
||||
|
||||
let mut all_hidden_states: Option<Vec<Tensor>> = if self.output_hidden_states { Some(vec!()) } else { None };
|
||||
let mut all_attentions: Option<Vec<Vec<Tensor>>> = if self.output_attentions { Some(vec!()) } else { None };
|
||||
|
||||
|
||||
for i in 0..self.num_hidden_layers {
|
||||
let group_idx = i / (self.num_hidden_layers / self.num_hidden_groups);
|
||||
let layer = &self.layers[group_idx as usize];
|
||||
|
||||
if let Some(hidden_states) = all_hidden_states.borrow_mut() {
|
||||
hidden_states.push(hidden_state.as_ref().copy());
|
||||
};
|
||||
|
||||
let temp = layer.forward_t(&hidden_state, &mask, train);
|
||||
hidden_state = temp.0;
|
||||
let attention_weights = temp.1;
|
||||
if let Some(attentions) = all_attentions.borrow_mut() {
|
||||
attentions.push(attention_weights.unwrap());
|
||||
};
|
||||
};
|
||||
|
||||
(hidden_state, all_hidden_states, all_attentions)
|
||||
}
|
||||
}
|
||||
|
56
src/albert/mod.rs
Normal file
56
src/albert/mod.rs
Normal file
@ -0,0 +1,56 @@
|
||||
//! # ALBERT: A Lite BERT for Self-supervised Learning of Language Representations (Lan et al.)
|
||||
//!
|
||||
//! Implementation of the ALBERT language model ([https://arxiv.org/abs/1909.11942](https://arxiv.org/abs/1909.11942) Lan, Chen, Goodman, Gimpel, Sharma, Soricut, 2019).
|
||||
//! This model offers a greatly reduced memory footprint for similar effective size (number and size of layers). The computational cost remains however similar to the original BERT model.
|
||||
//! The base model is implemented in the `albert::AlbertModel` struct. Several language model heads have also been implemented, including:
|
||||
//! - Masked language model: `albert::AlbertForMaskedLM`
|
||||
//! - Multiple choices: `albert::AlbertForMultipleChoice`
|
||||
//! - Question answering: `albert::AlbertForQuestionAnswering`
|
||||
//! - Sequence classification: `albert::AlbertForSequenceClassification`
|
||||
//! - Token classification (e.g. NER, POS tagging): `albert::AlbertForTokenClassification`
|
||||
//!
|
||||
//! # Model set-up and pre-trained weights loading
|
||||
//!
|
||||
//! A full working example is provided in `examples/albert.rs`, run with `cargo run --example albert`.
|
||||
//! The example below illustrates a masked language model; the structure is similar for other models.
|
||||
//! All models expect the following resources:
|
||||
//! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
|
||||
//! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
|
||||
//! - `AlbertTokenizer` using a `spiece.model` sentence piece model
|
||||
//! Pretrained models are available and can be downloaded using RemoteResources.
|
||||
//!
|
||||
//! ```no_run
|
||||
//!# fn main() -> failure::Fallible<()> {
|
||||
//!#
|
||||
//! use rust_tokenizers::AlbertTokenizer;
|
||||
//! use tch::{nn, Device};
|
||||
//!# use std::path::PathBuf;
|
||||
//! use rust_bert::albert::{AlbertForMaskedLM, AlbertConfig};
|
||||
//! use rust_bert::Config;
|
||||
//! use rust_bert::resources::{Resource, download_resource, LocalResource};
|
||||
//!
|
||||
//! let config_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/config.json")});
|
||||
//! let vocab_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/vocab.txt")});
|
||||
//! let weights_resource = Resource::Local(LocalResource { local_path: PathBuf::from("path/to/model.ot")});
|
||||
//! let config_path = download_resource(&config_resource)?;
|
||||
//! let vocab_path = download_resource(&vocab_resource)?;
|
||||
//! let weights_path = download_resource(&weights_resource)?;
|
||||
//! let device = Device::cuda_if_available();
|
||||
//! let mut vs = nn::VarStore::new(device);
|
||||
//! let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, true);
|
||||
//! let config = AlbertConfig::from_file(config_path);
|
||||
//! let bert_model = AlbertForMaskedLM::new(&vs.root(), &config);
|
||||
//! vs.load(weights_path)?;
|
||||
//!
|
||||
//!# Ok(())
|
||||
//!# }
|
||||
//! ```
|
||||
|
||||
|
||||
|
||||
mod encoder;
|
||||
mod attention;
|
||||
mod embeddings;
|
||||
mod albert;
|
||||
|
||||
pub use albert::{AlbertConfig, AlbertModelResources, AlbertConfigResources, AlbertVocabResources, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForTokenClassification, AlbertForQuestionAnswering, AlbertForMultipleChoice};
|
@ -142,7 +142,10 @@ impl BertEmbedding for BertEmbeddings {
|
||||
None => (input_value.apply_t(&self.word_embeddings, train), input_value.size())
|
||||
}
|
||||
None => match input_embeds {
|
||||
Some(embeds) => (embeds.copy(), vec!(embeds.size()[0], embeds.size()[1])),
|
||||
Some(embeds) => {
|
||||
let size = vec!(embeds.size()[0], embeds.size()[1]);
|
||||
(embeds, size)
|
||||
},
|
||||
None => { return Err("Only one of input ids or input embeddings may be set"); }
|
||||
}
|
||||
};
|
||||
|
@ -25,7 +25,6 @@ pub struct ElectraEmbeddings {
|
||||
token_type_embeddings: nn::Embedding,
|
||||
layer_norm: nn::LayerNorm,
|
||||
dropout: Dropout,
|
||||
padding_index: i64,
|
||||
}
|
||||
|
||||
impl ElectraEmbeddings {
|
||||
@ -57,7 +56,7 @@ impl ElectraEmbeddings {
|
||||
let layer_norm_config = nn::LayerNormConfig { eps: layer_norm_eps, ..Default::default() };
|
||||
let layer_norm: nn::LayerNorm = nn::layer_norm(p / "LayerNorm", vec![config.embedding_size], layer_norm_config);
|
||||
let dropout: Dropout = Dropout::new(config.hidden_dropout_prob);
|
||||
ElectraEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout, padding_index: 1 }
|
||||
ElectraEmbeddings { word_embeddings, position_embeddings, token_type_embeddings, layer_norm, dropout}
|
||||
}
|
||||
|
||||
pub fn forward_t(&self,
|
||||
|
23
src/lib.rs
23
src/lib.rs
@ -29,17 +29,17 @@
|
||||
//! ```
|
||||
//! - Transformer models base architectures with customized heads. These allow loading pre-trained models for customized inference in Rust
|
||||
//!
|
||||
//! | |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**
|
||||
//! :-----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:
|
||||
//! Masked LM|✅ |✅ |✅ | | | |✅| |
|
||||
//! Sequence classification|✅ |✅ |✅| | | | | |
|
||||
//! Token classification|✅ |✅ | ✅| | | |✅| |
|
||||
//! Question answering|✅ |✅ |✅| | | | | |
|
||||
//! Multiple choices| |✅ |✅| | | | | |
|
||||
//! Next token prediction| | | |✅|✅| | | |
|
||||
//! Natural Language Generation| | | |✅|✅| | | |
|
||||
//! Summarization| | | | | |✅| | |
|
||||
//! Translation| | | | | | | |✅|
|
||||
//! | |**DistilBERT**|**BERT**|**RoBERTa**|**GPT**|**GPT2**|**BART**|**Electra**|**Marian**|**ALBERT**
|
||||
//! :-----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:
|
||||
//! Masked LM|✅ |✅ |✅ | | | |✅| |✅ |
|
||||
//! Sequence classification|✅ |✅ |✅| | | | | |✅ |
|
||||
//! Token classification|✅ |✅ | ✅| | | |✅| |✅ |
|
||||
//! Question answering|✅ |✅ |✅| | | | | |✅ |
|
||||
//! Multiple choices| |✅ |✅| | | | | |✅ |
|
||||
//! Next token prediction| | | |✅|✅| | | | |
|
||||
//! Natural Language Generation| | | |✅|✅| | | | |
|
||||
//! Summarization| | | | | |✅| | | |
|
||||
//! Translation| | | | | | | |✅| |
|
||||
//!
|
||||
//! # Loading pre-trained models
|
||||
//!
|
||||
@ -65,6 +65,7 @@ pub mod gpt2;
|
||||
pub mod bart;
|
||||
pub mod electra;
|
||||
pub mod marian;
|
||||
pub mod albert;
|
||||
mod common;
|
||||
pub mod pipelines;
|
||||
|
||||
|
285
tests/albert.rs
Normal file
285
tests/albert.rs
Normal file
@ -0,0 +1,285 @@
|
||||
extern crate failure;
|
||||
extern crate dirs;
|
||||
|
||||
use tch::{Device, nn, Tensor, no_grad};
|
||||
use rust_tokenizers::{TruncationStrategy, Tokenizer, Vocab, AlbertTokenizer};
|
||||
use rust_bert::Config;
|
||||
use rust_bert::resources::{Resource, RemoteResource, download_resource};
|
||||
use rust_bert::albert::{AlbertConfigResources, AlbertVocabResources, AlbertModelResources, AlbertConfig, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForMultipleChoice, AlbertForTokenClassification, AlbertForQuestionAnswering};
|
||||
use std::collections::HashMap;
|
||||
|
||||
|
||||
/// Loads the pretrained ALBERT base v2 masked language model and checks the tokens it
/// predicts for the `[MASK]` position of two sentences, plus one reference logit value.
#[test]
fn albert_masked_lm() -> failure::Fallible<()> {
    // Remote resources (configuration, vocabulary, weights)
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
    let weights_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertModelResources::ALBERT_BASE_V2));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;
    let weights_path = download_resource(&weights_resource)?;

    // Model set-up: the variable store is populated by the constructor, then filled from disk
    let device = Device::Cpu;
    let mut vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let config = AlbertConfig::from_file(config_path);
    let albert_model = AlbertForMaskedLM::new(&vs.root(), &config);
    vs.load(weights_path)?;

    // Tokenize the sentences and right-pad every row with id 0 up to the longest one
    let sentences = ["Looks like one [MASK] is missing", "It\'s like comparing [MASK] to apples"];
    let encodings = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
    let padded_rows: Vec<Tensor> = encodings
        .iter()
        .map(|encoding| {
            let mut ids = encoding.token_ids.clone();
            ids.resize(longest, 0);
            Tensor::of_slice(&ids)
        })
        .collect();
    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);

    // Forward pass (inference only, no gradient tracking)
    let (output, _, _) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
    });

    // Highest-scoring vocabulary entry at the masked position of each sentence
    let first_prediction = output.get(0).get(4).argmax(0, false);
    let second_prediction = output.get(1).get(6).argmax(0, false);
    let word_1 = tokenizer.vocab().id_to_token(&first_prediction.int64_value(&[]));
    let word_2 = tokenizer.vocab().id_to_token(&second_prediction.int64_value(&[]));

    // "Looks like one [them] is missing" (identical to the output of the original implementation)
    assert_eq!("▁them", word_1);
    // "It's like comparing [grapes] to apples"
    assert_eq!("▁grapes", word_2);
    assert!((output.double_value(&[0, 0, 0]) - 4.6143).abs() < 1e-4);
    Ok(())
}
|
||||
|
||||
/// Builds an ALBERT sequence classification model with a randomly initialized
/// three-class head (no weights are loaded) and checks the shapes of its outputs.
#[test]
fn albert_for_sequence_classification() -> failure::Fallible<()> {
    // Resources paths
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;

    // Set-up model
    let device = Device::Cpu;
    let vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let mut config = AlbertConfig::from_file(config_path);
    // Label ids are contiguous (0, 1, 2) so that every class index has a label
    let mut dummy_label_mapping = HashMap::new();
    dummy_label_mapping.insert(0, String::from("Positive"));
    dummy_label_mapping.insert(1, String::from("Negative"));
    dummy_label_mapping.insert(2, String::from("Neutral"));
    config.id2label = Some(dummy_label_mapping);
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
    let albert_model = AlbertForSequenceClassification::new(&vs.root(), &config);

    // Define input: tokenize and right-pad every row with id 0 up to the longest one
    let input = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let encodings = tokenizer.encode_list(input.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let max_len = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
    let padded_rows: Vec<Tensor> = encodings
        .iter()
        .map(|encoding| {
            let mut ids = encoding.token_ids.clone();
            ids.resize(max_len, 0);
            Tensor::of_slice(&ids)
        })
        .collect();
    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);

    // Forward pass
    let (output, all_hidden_states, all_attentions) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
    });

    // One score per (sentence, label); one recorded entry per hidden layer
    assert_eq!(output.size(), &[2, 3]);
    assert_eq!(config.num_hidden_layers as usize, all_hidden_states.unwrap().len());
    assert_eq!(config.num_hidden_layers as usize, all_attentions.unwrap().len());

    Ok(())
}
|
||||
|
||||
/// Builds an ALBERT multiple choice model with randomly initialized weights (no weights
/// are loaded) and checks the shapes of its outputs for one question with two choices.
#[test]
fn albert_for_multiple_choice() -> failure::Fallible<()> {
    // Remote resources (configuration, vocabulary)
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;

    // Model set-up
    let device = Device::Cpu;
    let vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let mut config = AlbertConfig::from_file(config_path);
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
    let albert_model = AlbertForMultipleChoice::new(&vs.root(), &config);

    // Tokenize the choices and right-pad every row with id 0 up to the longest one
    let choices = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let encodings = tokenizer.encode_list(choices.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
    let padded_rows: Vec<Tensor> = encodings
        .iter()
        .map(|encoding| {
            let mut ids = encoding.token_ids.clone();
            ids.resize(longest, 0);
            Tensor::of_slice(&ids)
        })
        .collect();
    // The extra leading dimension groups the two rows as the choices of a single example
    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device).unsqueeze(0);

    // Forward pass (inference only, no gradient tracking)
    let (output, all_hidden_states, all_attentions) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false).unwrap()
    });

    assert_eq!(output.size(), &[1, 2]);
    assert_eq!(config.num_hidden_layers as usize, all_hidden_states.unwrap().len());
    assert_eq!(config.num_hidden_layers as usize, all_attentions.unwrap().len());

    Ok(())
}
|
||||
|
||||
/// Builds an ALBERT token classification model with a randomly initialized four-label
/// head (no weights are loaded) and checks the shapes of its outputs.
#[test]
fn albert_for_token_classification() -> failure::Fallible<()> {
    // Remote resources (configuration, vocabulary)
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;

    // Model set-up
    let device = Device::Cpu;
    let vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let mut config = AlbertConfig::from_file(config_path);
    let mut dummy_label_mapping = HashMap::new();
    dummy_label_mapping.insert(0, String::from("O"));
    dummy_label_mapping.insert(1, String::from("LOC"));
    dummy_label_mapping.insert(2, String::from("PER"));
    dummy_label_mapping.insert(3, String::from("ORG"));
    config.id2label = Some(dummy_label_mapping);
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
    let albert_model = AlbertForTokenClassification::new(&vs.root(), &config);

    // Tokenize the sentences and right-pad every row with id 0 up to the longest one
    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let encodings = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
    let padded_rows: Vec<Tensor> = encodings
        .iter()
        .map(|encoding| {
            let mut ids = encoding.token_ids.clone();
            ids.resize(longest, 0);
            Tensor::of_slice(&ids)
        })
        .collect();
    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);

    // Forward pass (inference only, no gradient tracking)
    let (output, all_hidden_states, all_attentions) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
    });

    // One score per (sentence, token, label); one recorded entry per hidden layer
    assert_eq!(output.size(), &[2, 12, 4]);
    assert_eq!(config.num_hidden_layers as usize, all_hidden_states.unwrap().len());
    assert_eq!(config.num_hidden_layers as usize, all_attentions.unwrap().len());

    Ok(())
}
|
||||
|
||||
/// Builds an ALBERT question answering model with randomly initialized weights (no
/// weights are loaded) and checks the shapes of its start/end scores and extra outputs.
#[test]
fn albert_for_question_answering() -> failure::Fallible<()> {
    // Remote resources (configuration, vocabulary)
    let config_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertConfigResources::ALBERT_BASE_V2));
    let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(AlbertVocabResources::ALBERT_BASE_V2));
    let config_path = download_resource(&config_resource)?;
    let vocab_path = download_resource(&vocab_resource)?;

    // Model set-up
    let device = Device::Cpu;
    let vs = nn::VarStore::new(device);
    let tokenizer: AlbertTokenizer = AlbertTokenizer::from_file(vocab_path.to_str().unwrap(), true, false);
    let mut config = AlbertConfig::from_file(config_path);
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
    let albert_model = AlbertForQuestionAnswering::new(&vs.root(), &config);

    // Tokenize the sentences and right-pad every row with id 0 up to the longest one
    let sentences = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let encodings = tokenizer.encode_list(sentences.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let longest = encodings.iter().map(|encoding| encoding.token_ids.len()).max().unwrap();
    let padded_rows: Vec<Tensor> = encodings
        .iter()
        .map(|encoding| {
            let mut ids = encoding.token_ids.clone();
            ids.resize(longest, 0);
            Tensor::of_slice(&ids)
        })
        .collect();
    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);

    // Forward pass (inference only, no gradient tracking)
    let (start_scores, end_scores, all_hidden_states, all_attentions) = no_grad(|| {
        albert_model.forward_t(Some(input_tensor), None, None, None, None, false)
    });

    // One start and one end score per (sentence, token); one recorded entry per hidden layer
    assert_eq!(start_scores.size(), &[2, 12]);
    assert_eq!(end_scores.size(), &[2, 12]);
    assert_eq!(config.num_hidden_layers as usize, all_hidden_states.unwrap().len());
    assert_eq!(config.num_hidden_layers as usize, all_attentions.unwrap().len());

    Ok(())
}
|
49
utils/download-dependencies_albert.py
Normal file
49
utils/download-dependencies_albert.py
Normal file
@ -0,0 +1,49 @@
|
||||
from transformers import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
from transformers.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
from transformers.tokenization_albert import PRETRAINED_VOCAB_FILES_MAP
|
||||
from transformers.file_utils import get_from_cache
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import os
|
||||
import numpy as np
|
||||
import torch
|
||||
import subprocess
|
||||
|
||||
# Download the ALBERT base v2 configuration, vocabulary and weights through the
# Transformers cache, copy them to ~/rustbert/albert-base-v2 and convert the PyTorch
# weights to the `.ot` format with the crate's `convert-tensor` binary.

# Remote locations of the pretrained files
config_url = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP["albert-base-v2"]
vocab_url = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["albert-base-v2"]
weights_url = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP["albert-base-v2"]

target_path = Path.home() / 'rustbert' / 'albert-base-v2'

# Local cache entries for the three files
cached_config = get_from_cache(config_url)
cached_vocab = get_from_cache(vocab_url)
cached_weights = get_from_cache(weights_url)

target_path.mkdir(parents=True, exist_ok=True)

# Copy the cached files under stable names
bin_weights_file = str(target_path / 'model.bin')
shutil.copy(cached_config, str(target_path / 'config.json'))
shutil.copy(cached_vocab, str(target_path / 'spiece.model'))
shutil.copy(cached_weights, bin_weights_file)

# Export every tensor as a contiguous numpy array, renaming gamma/beta to weight/bias
state_dict = torch.load(cached_weights, map_location='cpu')
arrays = {
    name.replace("gamma", "weight").replace("beta", "bias"): np.ascontiguousarray(tensor.cpu().numpy())
    for name, tensor in state_dict.items()
}

npz_weights_file = str(target_path / 'model.npz')
ot_weights_file = str(target_path / 'model.ot')
np.savez(target_path / 'model.npz', **arrays)

# Cargo.toml sits one directory above the directory holding this script
toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()

subprocess.call([
    'cargo', 'run', '--bin=convert-tensor',
    '--manifest-path=%s' % toml_location,
    '--', npz_weights_file, ot_weights_file,
])

# The intermediate files are not needed once the conversion has been run
os.remove(bin_weights_file)
os.remove(npz_weights_file)
|
Loading…
Reference in New Issue
Block a user