Addition of FNet tests

2024-09-11 12:55:34 +03:00 · 2021-11-14 11:49:40 +01:00 · 2021-11-14 11:49:40 +01:00 · dcee4e3acc
commit dcee4e3acc
parent 61e5d2d563
4 changed files with 362 additions and 2 deletions
--- a/.github/workflows/continuous-integration.yml
+++ b/.github/workflows/continuous-integration.yml
@ -71,6 +71,7 @@ jobs:
            --test electra
            --test gpt2
            --test marian
+            --test fnet

  test-batch-1:
    name: Integration tests (batch 1)
--- a/src/fnet/mod.rs
+++ b/src/fnet/mod.rs
@ -1,3 +1,55 @@
+//! # FNet, Mixing Tokens with Fourier Transforms (Lee-Thorp et al.)
+//!
+//! Implementation of the FNet language model ([https://arxiv.org/abs/2105.03824](https://arxiv.org/abs/2105.03824) Lee-Thorp, Ainslie, Eckstein, Ontanon, 2021).
+//! The base model is implemented in the `fnet_model::FNetModel` struct. Several language model heads have also been implemented, including:
+//! - Masked language model: `fnet_model::FNetForMaskedLM`
+//! - Question answering: `fnet_model::FNetForQuestionAnswering`
+//! - Sequence classification: `fnet_model::FNetForSequenceClassification`
+//! - Token classification (e.g. NER, POS tagging): `fnet_model::FNetForTokenClassification`
+//!
+//! # Model set-up and pre-trained weights loading
+//!
+//! The example below illustrates how to set up an FNet masked language model; the structure is similar for the other models.
+//! All models expect the following resources:
+//! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
+//! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
+//! - `FNetTokenizer` using a `spiece.model` SentencePiece (BPE) model file
+//!
+//! Pretrained models are available and can be downloaded using `RemoteResource`.
+//!
+//! ```no_run
+//! # fn main() -> anyhow::Result<()> {
+//! #
+//! use tch::{nn, Device};
+//! # use std::path::PathBuf;
+//! use rust_bert::resources::{LocalResource, RemoteResource, Resource};
+//! use rust_bert::fnet::{FNetConfig, FNetForMaskedLM};
+//! use rust_bert::Config;
+//! use rust_tokenizers::tokenizer::FNetTokenizer;
+//!
+//! let config_resource = Resource::Local(LocalResource {
+//!     local_path: PathBuf::from("path/to/config.json"),
+//! });
+//! let vocab_resource = Resource::Local(LocalResource {
+//!     local_path: PathBuf::from("path/to/spiece.model"),
+//! });
+//! let weights_resource = Resource::Local(LocalResource {
+//!     local_path: PathBuf::from("path/to/model.ot"),
+//! });
+//! let config_path = config_resource.get_local_path()?;
+//! let vocab_path = vocab_resource.get_local_path()?;
+//! let weights_path = weights_resource.get_local_path()?;
+//! let device = Device::cuda_if_available();
+//! let mut vs = nn::VarStore::new(device);
+//! let tokenizer: FNetTokenizer =
+//!     FNetTokenizer::from_file(vocab_path.to_str().unwrap(), true, true)?;
+//! let config = FNetConfig::from_file(config_path);
+//! let fnet_model = FNetForMaskedLM::new(&vs.root(), &config);
+//! vs.load(weights_path)?;
+//!
+//! # Ok(())
+//! # }
+//! ```
+
 mod attention;
 mod embeddings;
 mod encoder;
--- a/tests/albert.rs
+++ b/tests/albert.rs
@ -242,7 +242,7 @@ fn albert_for_token_classification() -> anyhow::Result<()> {
    config.id2label = Some(dummy_label_mapping);
    config.output_attentions = Some(true);
    config.output_hidden_states = Some(true);
-    let bert_model = AlbertForTokenClassification::new(&vs.root(), &config);
+    let albert_model = AlbertForTokenClassification::new(&vs.root(), &config);

    //    Define input
    let input = [
@ -268,7 +268,7 @@ fn albert_for_token_classification() -> anyhow::Result<()> {

    //    Forward pass
    let model_output =
-        no_grad(|| bert_model.forward_t(Some(&input_tensor), None, None, None, None, false));
+        no_grad(|| albert_model.forward_t(Some(&input_tensor), None, None, None, None, false));

    assert_eq!(model_output.logits.size(), &[2, 12, 4]);
    assert_eq!(
--- a/tests/fnet.rs
+++ b/tests/fnet.rs
@ -0,0 +1,307 @@
+extern crate anyhow;
+extern crate dirs;
+
+use rust_bert::fnet::{
+    FNetConfig, FNetConfigResources, FNetForMaskedLM, FNetForMultipleChoice,
+    FNetForQuestionAnswering, FNetForTokenClassification, FNetModelResources, FNetVocabResources,
+};
+use rust_bert::pipelines::common::ModelType;
+use rust_bert::pipelines::sentiment::{SentimentConfig, SentimentModel, SentimentPolarity};
+use rust_bert::resources::{RemoteResource, Resource};
+use rust_bert::Config;
+use rust_tokenizers::tokenizer::{FNetTokenizer, MultiThreadedTokenizer, TruncationStrategy};
+use rust_tokenizers::vocab::Vocab;
+use std::collections::HashMap;
+use tch::{nn, no_grad, Device, Tensor};
+
+/// Runs the pretrained FNet base masked language model on two sentences that each
+/// contain a `[MASK]` token and checks the tokens predicted at the masked positions.
+///
+/// Needs the remote `BASE` configuration, vocabulary and weights resources.
+#[test]
+fn fnet_masked_lm() -> anyhow::Result<()> {
+    //    Remote resources and their local paths
+    let remote_config =
+        Resource::Remote(RemoteResource::from_pretrained(FNetConfigResources::BASE));
+    let remote_vocab =
+        Resource::Remote(RemoteResource::from_pretrained(FNetVocabResources::BASE));
+    let remote_weights =
+        Resource::Remote(RemoteResource::from_pretrained(FNetModelResources::BASE));
+    let config_path = remote_config.get_local_path()?;
+    let vocab_path = remote_vocab.get_local_path()?;
+    let weights_path = remote_weights.get_local_path()?;
+
+    //    Tokenizer and masked LM model with the pretrained weights loaded
+    let device = Device::Cpu;
+    let mut vs = nn::VarStore::new(device);
+    let tokenizer: FNetTokenizer =
+        FNetTokenizer::from_file(vocab_path.to_str().unwrap(), false, false)?;
+    let config = FNetConfig::from_file(config_path);
+    let fnet_model = FNetForMaskedLM::new(&vs.root(), &config);
+    vs.load(weights_path)?;
+
+    //    Tokenize the sentences and pad every row to the longest one with id 3
+    let sentences = [
+        "Looks like one [MASK] is missing",
+        "It was a very nice and [MASK] day",
+    ];
+    let encodings = tokenizer.encode_list(&sentences, 128, &TruncationStrategy::LongestFirst, 0);
+    let longest = encodings
+        .iter()
+        .map(|encoding| encoding.token_ids.len())
+        .max()
+        .unwrap();
+    let padded_rows = encodings
+        .iter()
+        .map(|encoding| {
+            let mut ids = encoding.token_ids.clone();
+            ids.resize(longest, 3);
+            Tensor::of_slice(&ids)
+        })
+        .collect::<Vec<_>>();
+    let input_tensor = Tensor::stack(padded_rows.as_slice(), 0).to(device);
+
+    //    Forward pass
+    let model_output =
+        no_grad(|| fnet_model.forward_t(Some(&input_tensor), None, None, None, false))?;
+
+    //    Scores at the masked position of each sentence, and the best-scoring token
+    let masked_scores_1 = model_output.prediction_scores.get(0).get(4);
+    let masked_scores_2 = model_output.prediction_scores.get(1).get(7);
+    let predicted_id_1 = masked_scores_1.argmax(0, false).int64_value(&[]);
+    let predicted_id_2 = masked_scores_2.argmax(0, false).int64_value(&[]);
+    let word_1 = tokenizer.vocab().id_to_token(&predicted_id_1);
+    let word_2 = tokenizer.vocab().id_to_token(&predicted_id_2);
+
+    assert_eq!("▁one", word_1);
+    assert_eq!("▁the", word_2);
+    assert!((f64::from(masked_scores_1.max()) - 13.1721).abs() < 1e-4);
+    Ok(())
+}
+
+/// Runs the sentiment pipeline with the FNet `BASE_SST2` resources on three movie
+/// reviews and checks the polarity and score predicted for each of them.
+#[test]
+fn fnet_for_sequence_classification() -> anyhow::Result<()> {
+    //    Set up the classifier from the remote SST-2 resources
+    let sentiment_classifier = SentimentModel::new(SentimentConfig {
+        model_type: ModelType::FNet,
+        config_resource: Resource::Remote(RemoteResource::from_pretrained(
+            FNetConfigResources::BASE_SST2,
+        )),
+        vocab_resource: Resource::Remote(RemoteResource::from_pretrained(
+            FNetVocabResources::BASE_SST2,
+        )),
+        model_resource: Resource::Remote(RemoteResource::from_pretrained(
+            FNetModelResources::BASE_SST2,
+        )),
+        ..Default::default()
+    })?;
+
+    //    Reviews to classify
+    let input = [
+        "Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring.",
+        "This film tried to be too many things all at once: stinging political satire, Hollywood blockbuster, sappy romantic comedy, family values promo...",
+        "If you like original gut wrenching laughter you will like this movie. If you are young or old then you will love this movie, hell even my mom liked it.",
+    ];
+
+    //    Expected polarity and score, in the same order as the reviews
+    let expected: [(SentimentPolarity, f64); 3] = [
+        (SentimentPolarity::Negative, 0.9978),
+        (SentimentPolarity::Negative, 0.9982),
+        (SentimentPolarity::Positive, 0.7570),
+    ];
+
+    let output = sentiment_classifier.predict(&input);
+
+    assert_eq!(output.len(), 3usize);
+    for (sentiment, (polarity, score)) in output.iter().zip(expected.iter()) {
+        assert_eq!(&sentiment.polarity, polarity);
+        assert!((sentiment.score - *score).abs() < 1e-4);
+    }
+
+    Ok(())
+}
+
+/// Builds an `FNetForMultipleChoice` model from the base configuration (no pretrained
+/// weights are loaded) and checks the logits shape and the number of hidden states
+/// returned for a `[1, 2, seq_len]` input.
+#[test]
+fn fnet_for_multiple_choice() -> anyhow::Result<()> {
+    //    Resources paths
+    let config_resource =
+        Resource::Remote(RemoteResource::from_pretrained(FNetConfigResources::BASE));
+    let vocab_resource =
+        Resource::Remote(RemoteResource::from_pretrained(FNetVocabResources::BASE));
+    let config_path = config_resource.get_local_path()?;
+    let vocab_path = vocab_resource.get_local_path()?;
+
+    //    Set-up model
+    let device = Device::Cpu;
+    let vs = nn::VarStore::new(device);
+    let tokenizer: FNetTokenizer =
+        FNetTokenizer::from_file(vocab_path.to_str().unwrap(), false, false)?;
+    let mut config = FNetConfig::from_file(config_path);
+    config.output_attentions = Some(true);
+    config.output_hidden_states = Some(true);
+    let fnet_model = FNetForMultipleChoice::new(&vs.root(), &config);
+
+    //    Define input
+    let input = [
+        "Looks like one thing is missing",
+        "It\'s like comparing oranges to apples",
+    ];
+    let tokenized_input = tokenizer.encode_list(&input, 128, &TruncationStrategy::LongestFirst, 0);
+    let max_len = tokenized_input
+        .iter()
+        .map(|input| input.token_ids.len())
+        .max()
+        .unwrap();
+    let tokenized_input = tokenized_input
+        .iter()
+        .map(|input| input.token_ids.clone())
+        .map(|mut input| {
+            //    Pad with id 3, the same padding id `fnet_masked_lm` uses
+            input.extend(vec![3; max_len - input.len()]);
+            input
+        })
+        .map(|input| Tensor::of_slice(&(input)))
+        .collect::<Vec<_>>();
+    //    Stack the two sentences, then add a leading dimension of size 1
+    let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0)
+        .to(device)
+        .unsqueeze(0);
+
+    //    Forward pass; a model error fails the test through the returned `Result`
+    let model_output =
+        no_grad(|| fnet_model.forward_t(Some(&input_tensor), None, None, None, false))?;
+
+    assert_eq!(model_output.logits.size(), &[1, 2]);
+    assert_eq!(
+        config.num_hidden_layers as usize,
+        model_output.all_hidden_states.unwrap().len()
+    );
+
+    Ok(())
+}
+
+/// Builds an `FNetForTokenClassification` model with a dummy four-label mapping (no
+/// pretrained weights are loaded) and checks the logits shape and the number of hidden
+/// states returned for two sentences.
+#[test]
+fn fnet_for_token_classification() -> anyhow::Result<()> {
+    //    Resources paths
+    let config_resource =
+        Resource::Remote(RemoteResource::from_pretrained(FNetConfigResources::BASE));
+    let vocab_resource =
+        Resource::Remote(RemoteResource::from_pretrained(FNetVocabResources::BASE));
+    let config_path = config_resource.get_local_path()?;
+    let vocab_path = vocab_resource.get_local_path()?;
+
+    //    Set-up model
+    let device = Device::Cpu;
+    let vs = nn::VarStore::new(device);
+    let tokenizer: FNetTokenizer =
+        FNetTokenizer::from_file(vocab_path.to_str().unwrap(), false, false)?;
+    let mut config = FNetConfig::from_file(config_path);
+    //    Four labels, matching the last dimension asserted on the logits below
+    let mut dummy_label_mapping = HashMap::new();
+    dummy_label_mapping.insert(0, String::from("O"));
+    dummy_label_mapping.insert(1, String::from("LOC"));
+    dummy_label_mapping.insert(2, String::from("PER"));
+    dummy_label_mapping.insert(3, String::from("ORG"));
+    config.id2label = Some(dummy_label_mapping);
+    config.output_hidden_states = Some(true);
+    let fnet_model = FNetForTokenClassification::new(&vs.root(), &config);
+
+    //    Define input
+    let input = [
+        "Looks like one thing is missing",
+        "It\'s like comparing oranges to apples",
+    ];
+    let tokenized_input = tokenizer.encode_list(&input, 128, &TruncationStrategy::LongestFirst, 0);
+    let max_len = tokenized_input
+        .iter()
+        .map(|input| input.token_ids.len())
+        .max()
+        .unwrap();
+    let tokenized_input = tokenized_input
+        .iter()
+        .map(|input| input.token_ids.clone())
+        .map(|mut input| {
+            //    Pad with id 3, the same padding id `fnet_masked_lm` uses
+            input.extend(vec![3; max_len - input.len()]);
+            input
+        })
+        .map(|input| Tensor::of_slice(&(input)))
+        .collect::<Vec<_>>();
+    let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device);
+
+    //    Forward pass; a model error fails the test through the returned `Result`
+    let model_output =
+        no_grad(|| fnet_model.forward_t(Some(&input_tensor), None, None, None, false))?;
+
+    assert_eq!(model_output.logits.size(), &[2, 11, 4]);
+    assert_eq!(
+        config.num_hidden_layers as usize,
+        model_output.all_hidden_states.unwrap().len()
+    );
+
+    Ok(())
+}
+
+/// Builds an `FNetForQuestionAnswering` model from the base configuration (no
+/// pretrained weights are loaded) and checks the start/end logits shapes and the
+/// number of hidden states returned for two sentences.
+#[test]
+fn fnet_for_question_answering() -> anyhow::Result<()> {
+    //    Resources paths
+    let config_resource =
+        Resource::Remote(RemoteResource::from_pretrained(FNetConfigResources::BASE));
+    let vocab_resource =
+        Resource::Remote(RemoteResource::from_pretrained(FNetVocabResources::BASE));
+    let config_path = config_resource.get_local_path()?;
+    let vocab_path = vocab_resource.get_local_path()?;
+
+    //    Set-up model
+    let device = Device::Cpu;
+    let vs = nn::VarStore::new(device);
+    let tokenizer: FNetTokenizer =
+        FNetTokenizer::from_file(vocab_path.to_str().unwrap(), false, false)?;
+    let mut config = FNetConfig::from_file(config_path);
+    config.output_hidden_states = Some(true);
+    let fnet_model = FNetForQuestionAnswering::new(&vs.root(), &config);
+
+    //    Define input
+    let input = [
+        "Looks like one thing is missing",
+        "It\'s like comparing oranges to apples",
+    ];
+    let tokenized_input = tokenizer.encode_list(&input, 128, &TruncationStrategy::LongestFirst, 0);
+    let max_len = tokenized_input
+        .iter()
+        .map(|input| input.token_ids.len())
+        .max()
+        .unwrap();
+    let tokenized_input = tokenized_input
+        .iter()
+        .map(|input| input.token_ids.clone())
+        .map(|mut input| {
+            //    Pad with id 3, the same padding id `fnet_masked_lm` uses
+            input.extend(vec![3; max_len - input.len()]);
+            input
+        })
+        .map(|input| Tensor::of_slice(&(input)))
+        .collect::<Vec<_>>();
+    let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device);
+
+    //    Forward pass; a model error fails the test through the returned `Result`
+    let model_output =
+        no_grad(|| fnet_model.forward_t(Some(&input_tensor), None, None, None, false))?;
+
+    assert_eq!(model_output.start_logits.size(), &[2, 11]);
+    assert_eq!(model_output.end_logits.size(), &[2, 11]);
+    assert_eq!(
+        config.num_hidden_layers as usize,
+        model_output.all_hidden_states.unwrap().len()
+    );
+
+    Ok(())
+}