Add GPT-J support (#285) (#288)

* Add GPT-J support (#285)

* Improve GPT-J implementation

* Improve GPT-J tests

* Adapt GPT-J to latest master branch

* Specify how to convert GPT-J weights instead of providing them
Romain Leroux 2023-02-15 20:10:47 +01:00 committed by GitHub
parent d7e9c03694
commit c448862185
12 changed files with 1704 additions and 27 deletions

GitHub Actions CI workflow

@@ -131,6 +131,7 @@ jobs:
args: --package rust-bert
--test sentence_embeddings
--test longt5
--test gpt_j
convert-model:
name: Model conversion test
@@ -179,4 +180,4 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: clippy
args: --all-targets --all-features -- -D warnings -A clippy::assign_op_pattern -A clippy::upper-case-acronyms
args: --all-targets --all-features -- -D warnings -A clippy::assign_op_pattern -A clippy::upper-case-acronyms

examples/generation_gptj.rs (new file)

@@ -0,0 +1,100 @@
use std::path::PathBuf;
use rust_bert::gpt_j::{GptJConfigResources, GptJMergesResources, GptJVocabResources};
use rust_bert::pipelines::common::ModelType;
use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};
use rust_bert::resources::{LocalResource, RemoteResource};
use tch::Device;
/// Equivalent Python code:
///
/// ```python
/// import torch
/// from transformers import AutoTokenizer, GPTJForCausalLM
///
/// device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
///
/// model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16).to(device)
///
/// tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", padding_side="left")
/// tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
///
/// prompts = ["It was a very nice and sunny", "It was a gloom winter night, and"]
/// inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)
///
/// with torch.no_grad():
/// gen_tokens = model.generate(
/// **inputs,
/// min_length=0,
/// max_length=32,
/// do_sample=False,
/// early_stopping=True,
/// num_beams=1,
/// num_return_sequences=1
/// )
///
/// gen_texts = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
/// ```
///
/// To run this test you need to download `pytorch_model.bin` from [EleutherAI GPT-J 6B
/// (float16)][gpt-j-6B-float16] and then convert its weights with:
///
/// ```
/// python utils/convert_model.py resources/gpt-j-6B-float16/pytorch_model.bin
/// ```
///
/// [gpt-j-6B-float16]: https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16
///
fn main() -> anyhow::Result<()> {
// Resources paths
let config_resource = Box::new(RemoteResource::from_pretrained(
GptJConfigResources::GPT_J_6B_FLOAT16,
));
let vocab_resource = Box::new(RemoteResource::from_pretrained(
GptJVocabResources::GPT_J_6B_FLOAT16,
));
let merges_resource = Box::new(RemoteResource::from_pretrained(
GptJMergesResources::GPT_J_6B_FLOAT16,
));
let model_resource = Box::new(LocalResource::from(PathBuf::from(
"resources/gpt-j-6B-float16/rust_model.ot",
)));
// Set-up model
let generation_config = TextGenerationConfig {
model_type: ModelType::GPTJ,
model_resource,
config_resource,
vocab_resource,
merges_resource: Some(merges_resource),
min_length: 10,
max_length: Some(32),
do_sample: false,
early_stopping: true,
num_beams: 1,
num_return_sequences: 1,
device: Device::cuda_if_available(),
..Default::default()
};
let model = TextGenerationModel::new(generation_config)?;
// Generate text
let prompts = [
"It was a very nice and sunny",
"It was a gloom winter night, and",
];
let output = model.generate(&prompts, None);
assert_eq!(output.len(), 2);
assert_eq!(output[0], "It was a very nice and sunny day, and I was sitting in the garden of my house, enjoying the sun and the fresh air. I was thinking");
assert_eq!(output[1], "It was a gloom winter night, and the wind was howling. The snow was falling, and the temperature was dropping. The snow was coming down so hard");
Ok(())
}

src/gpt_j/attention.rs (new file)

@@ -0,0 +1,323 @@
// Copyright 2021 The Eleuther AI and HuggingFace Inc. team. All rights reserved.
// Copyright 2022 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::common::dropout::Dropout;
use crate::common::kind::get_min;
use crate::gpt_j::gpt_j_model::GptJConfig;
use std::borrow::Borrow;
use tch::nn::Linear;
use tch::{nn, IndexOp, Kind, NewAxis, Tensor};
#[derive(Debug)]
/// # Cache for GPT-J attention layers
/// Stores the cached value of key and value
pub struct LayerState {
/// Cached keys
pub prev_key: Tensor,
/// Cached values
pub prev_value: Tensor,
}
impl Clone for LayerState {
fn clone(&self) -> Self {
LayerState {
prev_key: self.prev_key.copy(),
prev_value: self.prev_value.copy(),
}
}
}
impl LayerState {
pub(crate) fn reorder_cache(&mut self, new_indices: &Tensor) {
self.prev_key = self.prev_key.index_select(0, new_indices);
self.prev_value = self.prev_value.index_select(0, new_indices);
}
}
pub struct GptJAttention {
bias: Tensor,
attn_dropout: Dropout,
resid_dropout: Dropout,
scale_attn: f32,
k_proj: Linear,
v_proj: Linear,
q_proj: Linear,
out_proj: Linear,
output_attentions: bool,
dim_per_head: i64,
n_head: i64,
rotary_dim: Option<i64>,
scale: bool,
use_cache: bool,
}
impl GptJAttention {
pub fn new<'p, P>(p: P, config: &GptJConfig) -> GptJAttention
where
P: Borrow<nn::Path<'p>>,
{
let p = p.borrow();
let max_positions = config.n_positions;
let bias = Tensor::ones(&[max_positions, max_positions], (Kind::Uint8, p.device()))
.tril(0)
.view([1, 1, max_positions, max_positions])
.requires_grad_(false);
let bias = p.var_copy("bias", &bias);
let attn_pdrop = config.attn_pdrop.unwrap_or(0.1);
let resid_pdrop = config.resid_pdrop.unwrap_or(0.1);
let output_attentions = config.output_attentions.unwrap_or(false);
let attn_dropout = Dropout::new(attn_pdrop);
let resid_dropout = Dropout::new(resid_pdrop);
assert_eq!(
config.n_embd % config.n_head,
0,
"Attention hidden states not a multiple of the number of heads"
);
let dim_per_head = config.n_embd / config.n_head;
let scale_attn = (dim_per_head as f32).sqrt();
let linear_config = nn::LinearConfig {
bias: false,
..Default::default()
};
let k_proj = nn::linear(p / "k_proj", config.n_embd, config.n_embd, linear_config);
if config.use_float16 {
(p / "k_proj").half();
}
let v_proj = nn::linear(p / "v_proj", config.n_embd, config.n_embd, linear_config);
if config.use_float16 {
(p / "v_proj").half();
}
let q_proj = nn::linear(p / "q_proj", config.n_embd, config.n_embd, linear_config);
if config.use_float16 {
(p / "q_proj").half();
}
let out_proj = nn::linear(p / "out_proj", config.n_embd, config.n_embd, linear_config);
if config.use_float16 {
(p / "out_proj").half();
}
GptJAttention {
bias,
attn_dropout,
resid_dropout,
output_attentions,
scale_attn,
k_proj,
v_proj,
q_proj,
out_proj,
dim_per_head,
n_head: config.n_head,
rotary_dim: config.rotary_dim,
scale: config.scale_attn_weights.unwrap_or(true),
use_cache: config.use_cache.unwrap_or(true),
}
}
fn split_heads(
tensor: &Tensor,
num_heads: i64,
attention_head_size: i64,
rotary: bool,
) -> Tensor {
let mut new_shape = tensor.size();
let _ = new_shape.pop();
new_shape.extend_from_slice(&[num_heads, attention_head_size]);
let tensor = tensor.view(new_shape.as_slice());
if rotary {
tensor
} else if tensor.size().len() == 5 {
tensor.permute(&[0, 1, 3, 2, 4]) // (batch, blocks, head, block_length, head_features)
} else if tensor.size().len() == 4 {
tensor.permute(&[0, 2, 1, 3]) // (batch, head, seq_length, head_features)
} else {
panic!(
"Input tensor should either be a rotary head, or its rank be one of [4, 5] but is: {}",
tensor.size().len()
)
}
}
fn merge_heads(tensor: &Tensor, num_heads: i64, attention_head_size: i64) -> Tensor {
let tensor = if tensor.size().len() == 5 {
tensor.permute(&[0, 1, 3, 2, 4]).contiguous()
} else if tensor.size().len() == 4 {
tensor.permute(&[0, 2, 1, 3]).contiguous()
} else {
panic!(
"Input tensor rank should be one of [4, 5], but is: {}",
tensor.size().len()
)
};
let mut new_shape = tensor.size();
new_shape.truncate(new_shape.len() - 2);
new_shape.push(num_heads * attention_head_size);
tensor.view(new_shape.as_slice())
}
fn attention(
&self,
query: &Tensor,
key: &Tensor,
value: &Tensor,
attention_mask: Option<&Tensor>,
train: bool,
) -> (Tensor, Tensor) {
let query = query.to_kind(Kind::Float);
let key = key.to_kind(Kind::Float);
let attention_weights = query.matmul(&key.transpose(-1, -2));
let query_dims = query.size();
let key_dims = key.size();
let query_length = query_dims[query_dims.len() - 2];
let key_length = key_dims[key_dims.len() - 2];
let causal_mask = &self
.bias
.slice(2, key_length - query_length, key_length, 1)
.slice(3, 0, key_length, 1)
.to_kind(Kind::Bool)
.to_device(attention_weights.device());
let mask_value = get_min(attention_weights.kind()).unwrap();
let mask_value = Tensor::full(
&attention_weights.size(),
mask_value,
(attention_weights.kind(), attention_weights.device()),
);
let mut attention_weights = attention_weights.where_self(causal_mask, &mask_value);
if self.scale {
attention_weights /= self.scale_attn;
}
if let Some(attention_mask_value) = attention_mask {
attention_weights += attention_mask_value;
};
let attention_weights = attention_weights.softmax(-1, attention_weights.kind());
let attention_weights = attention_weights
.to_kind(value.kind())
.apply_t(&self.attn_dropout, train);
let attention_output = attention_weights.matmul(value);
(attention_output, attention_weights)
}
pub fn forward_t(
&self,
hidden_states: &Tensor,
attention_mask: Option<&Tensor>,
layer_past: Option<&LayerState>,
train: bool,
) -> (Tensor, Option<LayerState>, Option<Tensor>) {
let query = hidden_states.apply(&self.q_proj);
let key = hidden_states.apply(&self.k_proj);
let value = hidden_states.apply(&self.v_proj);
let mut query = Self::split_heads(&query, self.n_head, self.dim_per_head, true);
let mut key = Self::split_heads(&key, self.n_head, self.dim_per_head, true);
let mut value = Self::split_heads(&value, self.n_head, self.dim_per_head, false);
let mut seq_len = key.size()[1];
let mut offset = 0;
if let Some(layer_past) = layer_past {
offset = layer_past.prev_key.size()[layer_past.prev_key.size().len() - 2];
seq_len += offset
};
if let Some(rotary_dim) = self.rotary_dim {
let k_rot = key.slice(3, 0, rotary_dim, 1);
let k_pass = key.slice(3, rotary_dim, key.size()[3], 1);
let q_rot = query.slice(3, 0, rotary_dim, 1);
let q_pass = query.slice(3, rotary_dim, query.size()[3], 1);
let sincos = fixed_pos_embedding(&k_rot, seq_len);
let k_rot = apply_rotary_pos_emb(&k_rot, &sincos, offset);
let q_rot = apply_rotary_pos_emb(&q_rot, &sincos, offset);
key = Tensor::cat(&[k_rot, k_pass], -1);
query = Tensor::cat(&[q_rot, q_pass], -1);
} else {
let sincos = fixed_pos_embedding(&key, seq_len);
key = apply_rotary_pos_emb(&key, &sincos, offset);
query = apply_rotary_pos_emb(&query, &sincos, offset);
}
key = key.permute(&[0, 2, 1, 3]);
query = query.permute(&[0, 2, 1, 3]);
if let Some(layer_past) = layer_past {
key = Tensor::cat(&[&layer_past.prev_key, &key], -2);
value = Tensor::cat(&[&layer_past.prev_value, &value], -2);
}
let present = self.use_cache.then(|| LayerState {
prev_key: key.copy(),
prev_value: value.copy(),
});
let (attn_output, attn_weights) =
self.attention(&query, &key, &value, attention_mask, train);
let attn_output = Self::merge_heads(&attn_output, self.n_head, self.dim_per_head)
.apply(&self.out_proj)
.apply_t(&self.resid_dropout, train);
let attn_weights = self.output_attentions.then_some(attn_weights);
(attn_output, present, attn_weights)
}
}
fn fixed_pos_embedding(x: &Tensor, seq_len: i64) -> (Tensor, Tensor) {
let dim = x.size()[x.size().len() - 1];
let inv_freq = 1.0
/ Tensor::pow_scalar(
10_000,
&(Tensor::arange_start_step(0, dim, 2, (x.kind(), x.device())) / dim),
);
let sinusoid_inp = Tensor::einsum(
"i , j -> i j",
&[Tensor::arange(seq_len, (x.kind(), x.device())), inv_freq],
None,
);
(sinusoid_inp.sin(), sinusoid_inp.cos())
}
fn apply_rotary_pos_emb(x: &Tensor, (sin, cos): &(Tensor, Tensor), offset: i64) -> Tensor {
let sin = duplicate_interleave(sin).i((NewAxis, offset..x.size()[1] + offset, NewAxis, ..));
let cos = duplicate_interleave(cos).i((NewAxis, offset..x.size()[1] + offset, NewAxis, ..));
(x * cos) + (rotate_every_two(x) * sin)
}
/// A simple version of `torch.repeat_interleave` for duplicating a matrix while interleaving the copy.
fn duplicate_interleave(m: &Tensor) -> Tensor {
let dim0 = m.size()[0];
m.view([-1, 1]) // flatten the matrix
.repeat(&[1, 2]) // repeat all elements into the 2nd dimension
.view([dim0, -1]) // reshape into a matrix, interleaving the copy
}
fn rotate_every_two(x: &Tensor) -> Tensor {
let x1 = x.slice(3, 0, x.size()[3], 2);
let x2 = x.slice(3, 1, x.size()[3], 2);
Tensor::stack(&[-x2, x1], -1).flatten(-2, -1)
}
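The three free functions above implement GPT-J's rotary position embedding: `fixed_pos_embedding` builds the sinusoid table, `duplicate_interleave` repeats each sinusoid value so it lines up with consecutive feature pairs, and `rotate_every_two` maps each pair `(x_2i, x_2i+1)` to `(-x_2i+1, x_2i)`. As a rough intuition aid, here is a minimal standalone sketch (not part of the commit, assuming only the `tch` crate) showing the pairwise rotation on a toy tensor:

```rust
use tch::{Device, Kind, Tensor};

fn main() {
    // A toy "attention head" with 8 features: [0, 1, 2, 3, 4, 5, 6, 7]
    let x = Tensor::arange(8, (Kind::Float, Device::Cpu)).view([1, 1, 1, 8]);

    // Same slicing as rotate_every_two: even-indexed vs odd-indexed features
    let x1 = x.slice(3, 0, 8, 2); // [0, 2, 4, 6]
    let x2 = x.slice(3, 1, 8, 2); // [1, 3, 5, 7]

    // Each pair (x_2i, x_2i+1) becomes (-x_2i+1, x_2i)
    let rotated = Tensor::stack(&[-x2, x1], -1).flatten(-2, -1);
    rotated.print(); // expected: [-1, 0, -3, 2, -5, 4, -7, 6]
}
```

`apply_rotary_pos_emb` then combines this rotation with the interleaved sin/cos tables, offset by the length of any cached keys so that positions stay consistent when generating with a past cache.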

src/gpt_j/gpt_j_model.rs (new file)

@@ -0,0 +1,836 @@
// Copyright 2021 The Eleuther AI and HuggingFace Inc. team. All rights reserved.
// Copyright 2022 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::common::activations::Activation;
use crate::common::dropout::Dropout;
use crate::common::embeddings::process_ids_embeddings_pair;
use crate::common::kind::get_min;
use crate::gpt_j::attention::LayerState;
use crate::gpt_j::transformer::GptJBlock;
use crate::pipelines::common::{ModelType, TokenizerOption};
use crate::pipelines::generation_utils::private_generation_utils::{
PreparedInput, PrivateLanguageGenerator,
};
use crate::pipelines::generation_utils::{
Cache, GenerateConfig, LMHeadModel, LMModelOutput, LanguageGenerator,
};
use crate::{Config, RustBertError};
use rust_tokenizers::tokenizer::Gpt2Tokenizer;
use rust_tokenizers::vocab::Gpt2Vocab;
use serde::{Deserialize, Serialize};
use std::borrow::{Borrow, BorrowMut};
use tch::nn::{embedding, Linear};
use tch::{nn, Device, Tensor};
/// # GPT-J Pretrained model weight files
pub struct GptJModelResources;
/// # GPT-J Pretrained model config files
pub struct GptJConfigResources;
/// # GPT-J Pretrained model vocab files
pub struct GptJVocabResources;
/// # GPT-J Pretrained model merges files
pub struct GptJMergesResources;
/// Model weights for Rust are not available out of the box for GPT-J but can be created
/// simply with the following command:
///
/// ```
/// python utils/convert_model.py path/to/gpt_j/pytorch_model.bin
/// ```
///
/// Where `pytorch_model.bin` was downloaded from [EleutherAI GPT-J 6B][gpt-j-6B] or
/// [EleutherAI GPT-J 6B (float16)][gpt-j-6B-float16]. Note that to convert GPT-J 6B you
/// will need about 32 Gb of RAM, and converting GPT-J 6B float16 requires about 12 Gb
/// of RAM.
///
/// [gpt-j-6B]: https://huggingface.co/EleutherAI/gpt-j-6B/tree/main
/// [gpt-j-6B-float16]: https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16
///
impl GptJModelResources {
pub const GPT_J_TINY_RANDOM: (&'static str, &'static str) = (
"gpt-j-tiny-random/model",
"https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/rust_model.ot",
);
}
impl GptJConfigResources {
/// Shared under Apache 2.0 license by the EleutherAI contributors at <https://www.eleuther.ai>. Modified with conversion to C-array format.
pub const GPT_J_6B: (&'static str, &'static str) = (
"gpt-j-6B/config",
"https://huggingface.co/EleutherAI/gpt-j-6B/resolve/main/config.json",
);
pub const GPT_J_6B_FLOAT16: (&'static str, &'static str) = (
"gpt-j-6B/config",
"https://huggingface.co/EleutherAI/gpt-j-6B/resolve/float16/config.json",
);
pub const GPT_J_TINY_RANDOM: (&'static str, &'static str) = (
"gpt-j-tiny-random/config",
"https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/config.json",
);
}
impl GptJVocabResources {
/// Shared under Apache 2.0 license by the EleutherAI contributors at <https://www.eleuther.ai>. Modified with conversion to C-array format.
pub const GPT_J_6B: (&'static str, &'static str) = (
"gpt-j-6B/vocab",
"https://huggingface.co/EleutherAI/gpt-j-6B/resolve/main/vocab.json",
);
pub const GPT_J_6B_FLOAT16: (&'static str, &'static str) = (
"gpt-j-6B/vocab",
"https://huggingface.co/EleutherAI/gpt-j-6B/resolve/float16/vocab.json",
);
pub const GPT_J_TINY_RANDOM: (&'static str, &'static str) = (
"gpt-j-tiny-random/vocab",
"https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/vocab.json",
);
}
impl GptJMergesResources {
/// Shared under Apache 2.0 license by the EleutherAI contributors at <https://www.eleuther.ai>. Modified with conversion to C-array format.
pub const GPT_J_6B: (&'static str, &'static str) = (
"gpt-j-6B/merges",
"https://huggingface.co/EleutherAI/gpt-j-6B/resolve/main/merges.txt",
);
pub const GPT_J_6B_FLOAT16: (&'static str, &'static str) = (
"gpt-j-6B/merges",
"https://huggingface.co/EleutherAI/gpt-j-6B/resolve/float16/merges.txt",
);
pub const GPT_J_TINY_RANDOM: (&'static str, &'static str) = (
"gpt-j-tiny-random/merges",
"https://huggingface.co/anton-l/gpt-j-tiny-random/resolve/main/merges.txt",
);
}
#[derive(Debug, Serialize, Deserialize, Clone)]
/// # GPT-J model configuration
/// Defines the GPT-J model architecture (e.g. number of layers, hidden layer size, vocab size...).
pub struct GptJConfig {
pub attn_pdrop: Option<f64>,
pub embd_pdrop: Option<f64>,
pub hidden_dropout_prob: Option<f64>,
pub afn: Option<Activation>,
pub initializer_range: f64,
pub layer_norm_epsilon: f64,
pub n_embd: i64,
pub n_head: i64,
pub n_layer: i64,
pub n_positions: i64,
pub n_inner: Option<i64>,
pub num_labels: Option<i64>,
pub use_cache: Option<bool>,
pub output_attentions: Option<bool>,
pub output_hidden_states: Option<bool>,
pub resid_pdrop: Option<f64>,
pub rotary_dim: Option<i64>,
pub vocab_size: i64,
pub scale_attn_weights: Option<bool>,
#[serde(default = "default_use_float16")]
pub use_float16: bool,
#[serde(default = "default_preload_on_cpu")]
pub preload_on_cpu: bool,
}
impl Config for GptJConfig {}
impl Default for GptJConfig {
fn default() -> Self {
GptJConfig {
attn_pdrop: Some(0.1),
embd_pdrop: Some(0.1),
hidden_dropout_prob: None,
afn: Some(Activation::gelu_new),
initializer_range: 0.02,
layer_norm_epsilon: 1e-5,
n_embd: 4096,
n_head: 16,
n_layer: 28,
n_positions: 2048,
n_inner: None,
num_labels: None,
use_cache: None,
output_attentions: None,
output_hidden_states: None,
resid_pdrop: Some(0.1),
rotary_dim: Some(64),
vocab_size: 50400,
scale_attn_weights: Some(true),
use_float16: default_use_float16(),
preload_on_cpu: default_preload_on_cpu(),
}
}
}
fn default_use_float16() -> bool {
true
}
fn default_preload_on_cpu() -> bool {
true
}
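The two `#[serde(default = ...)]` fields above mean that configuration files predating this commit (including the upstream Hugging Face `config.json`) still deserialize: `use_float16` and `preload_on_cpu` both fall back to `true` when absent. Since every field is public and the struct implements `Default`, they can also be overridden in code. A small illustrative sketch (values chosen for illustration, not taken from the commit):

```rust
use rust_bert::gpt_j::GptJConfig;

fn main() {
    // Start from the GPT-J 6B defaults but keep the variable store in float32
    // instead of casting the projection layers to half precision.
    let config = GptJConfig {
        use_float16: false,
        preload_on_cpu: true,
        ..Default::default()
    };
    assert_eq!(config.n_embd, 4096);
    assert!(!config.use_float16);
}
```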
/// # GPT-J Base model
/// Base architecture for GPT-J model. Usually complemented with a task-specific head, such as a language model head.
/// It is made of the following blocks:
/// - `wte`: `token` embeddings
/// - `h`: Encoder (transformer) made of a vector of layers. Each layer is made of a multi-head attention layer, a layer-normalization layer, and a MLP made of linear layers.
/// - `output_past`: flag indicating if the model should return a past state. This can be fed back to the model to improve the quality of text generated.
/// - `output_hidden_states`: flag indicating if the model should return all hidden states (as opposed to only the last layer)
/// - `output_attentions`: flag indicating if the model should return activation weights
pub struct GptJModel {
wte: nn::Embedding,
drop: Dropout,
ln_f: nn::LayerNorm,
h: Vec<GptJBlock>,
use_cache: bool,
output_hidden_states: bool,
output_attentions: bool,
}
impl GptJModel {
/// Build a new `GptJModel`
///
/// # Arguments
///
/// * `p` - Variable store path for the root of the GPT-J model
/// * `config` - `GptJConfig` object defining the model architecture
///
/// # Example
///
/// ```no_run
/// use rust_bert::gpt_j::{GptJConfig, GptJModel};
/// use rust_bert::Config;
/// use std::path::Path;
/// use tch::{nn, Device};
///
/// let config_path = Path::new("path/to/config.json");
/// let device = Device::Cpu;
/// let p = nn::VarStore::new(device);
/// let config = GptJConfig::from_file(config_path);
/// let gpt_j: GptJModel = GptJModel::new(&p.root() / "gpt_j", &config);
/// ```
pub fn new<'p, P>(p: P, config: &GptJConfig) -> GptJModel
where
P: Borrow<nn::Path<'p>>,
{
let p = p.borrow() / "transformer";
let wte = embedding(
&p / "wte",
config.vocab_size,
config.n_embd,
Default::default(),
);
if config.use_float16 {
(&(&p / "wte") / "weight").half()
};
let embd_pdrop = config.embd_pdrop.unwrap_or(0.1);
let drop = Dropout::new(embd_pdrop);
let layer_norm_config = nn::LayerNormConfig {
eps: config.layer_norm_epsilon,
..Default::default()
};
let ln_f = nn::layer_norm(&p / "ln_f", vec![config.n_embd], layer_norm_config);
if config.use_float16 {
(&p / "ln_f").half()
};
let mut h: Vec<GptJBlock> = vec![];
let h_path = &p / "h";
for layer_index in 0..config.n_layer {
h.push(GptJBlock::new(&h_path / layer_index, config));
}
let use_cache = config.use_cache.unwrap_or(true);
let output_attentions = config.output_attentions.unwrap_or(false);
let output_hidden_states = config.output_hidden_states.unwrap_or(false);
GptJModel {
wte,
drop,
ln_f,
h,
use_cache,
output_hidden_states,
output_attentions,
}
}
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of length *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*). When provided, these are concatenated with the current input keys and values.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
/// * `token_type_ids` - Optional token type ids used to indicate the portion of the input the token belongs to. If not None, token type embeddings will be added to the token and position embeddings.
/// * `_position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented starting from the length of the past input.
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
/// # Returns
///
/// * `GptJModelOutput` containing:
/// - `output` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the activations of the last hidden state
/// - `cache` - `Option<Vec<Tensor>>` of length *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*)
/// - `all_hidden_states` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
/// - `all_attentions` - `Option<Vec<Tensor>>` of length *num_hidden_layers* with shape (*batch size*, *sequence_length*, *hidden_size*)
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::gpt_j::{GptJConfig, GptJModel, LayerState};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = GptJConfig::from_file(config_path);
/// # let gpt_j_model: GptJModel = GptJModel::new(&vs.root(), &config);
/// let (batch_size, sequence_length, past_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
/// let mut past: Vec<Option<LayerState>> = Vec::with_capacity(config.n_layer as usize);
/// for _ in 0..config.n_layer as usize {
/// past.push(Some(LayerState {
/// prev_key: Tensor::rand(
/// &[
/// batch_size,
/// config.n_head,
/// past_sequence_length,
/// config.n_embd / config.n_head,
/// ],
/// (Double, device),
/// ),
/// prev_value: Tensor::rand(
/// &[
/// batch_size,
/// config.n_head,
/// past_sequence_length,
/// config.n_embd / config.n_head,
/// ],
/// (Double, device),
/// ),
/// }))
/// }
/// let attention_mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
/// let token_type_ids = Tensor::ones(&[batch_size, sequence_length], (Int64, device));
///
/// let model_output = no_grad(|| {
/// gpt_j_model
/// .forward_t(
/// Some(&input_tensor),
/// Some(&past),
/// Some(&attention_mask),
/// Some(&token_type_ids),
/// None,
/// None,
/// false,
/// )
/// .unwrap()
/// });
/// ```
pub fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Option<Vec<Option<LayerState>>>,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
_position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
train: bool,
) -> Result<GptJModelOutput, RustBertError> {
let (calc_input_embeddings, _input_size, _device) =
process_ids_embeddings_pair(input_ids, input_embeds, &self.wte)?;
let input_embeddings =
input_embeds.unwrap_or_else(|| calc_input_embeddings.as_ref().unwrap());
let (layer_past, _layer_past_length) = match layer_past {
Some(value) => {
if value.len() != self.h.len() {
return Err(RustBertError::ValueError(format!(
"Past activations vector length ({}) must be equal to the number of layers ({})",
value.len(),
self.h.len()
)));
} else {
let length = value.len();
(value, length)
}
}
None => {
let mut out = Vec::with_capacity(self.h.len());
out.resize_with(self.h.len(), || None);
(out, 0)
}
};
let kind_min = get_min(input_embeddings.kind())?;
let attention_mask: Option<Tensor> = attention_mask.map(|value| {
let attention_mask = value
.view((input_embeddings.size()[0], -1))
.unsqueeze(1)
.unsqueeze(2)
.to_kind(input_embeddings.kind());
(attention_mask.ones_like() - attention_mask.to_kind(input_embeddings.kind()))
* kind_min
});
let mut hidden_state: Tensor = input_embeddings.copy();
if let Some(token_type_ids) = token_type_ids {
let token_type_embeds = token_type_ids.apply(&self.wte);
hidden_state = hidden_state + token_type_embeds;
}
hidden_state = hidden_state.apply_t(&self.drop, train);
let mut all_presents: Option<Vec<Option<LayerState>>> = self.use_cache.then(Vec::new);
let mut all_hidden_states: Option<Vec<Tensor>> = self.output_hidden_states.then(Vec::new);
let mut all_attentions: Option<Vec<Tensor>> = self.output_attentions.then(Vec::new);
for (layer, past) in self.h.iter().zip(layer_past) {
let temp =
layer.forward_t(&hidden_state, past.as_ref(), attention_mask.as_ref(), train);
hidden_state = temp.0;
if let Some(presents) = all_presents.borrow_mut() {
presents.push(temp.1);
};
if let Some(attentions) = all_attentions.borrow_mut() {
attentions.push(std::mem::take(&mut temp.2.unwrap()));
};
if let Some(hidden_states) = all_hidden_states.borrow_mut() {
hidden_states.push(std::mem::take(&mut hidden_state));
};
}
let output = hidden_state.apply(&self.ln_f);
Ok(GptJModelOutput {
output,
cache: all_presents,
all_hidden_states,
all_attentions,
})
}
}
/// # GPT-J Language Modeling head
/// GPT-J model with a decoding head (linear layer without bias). The weights of the linear layer are tied to the word embeddings
/// It is made of the following blocks:
/// - `transformer`: Base GptJModel
pub struct GptJLMHeadModel {
transformer: GptJModel,
lm_head: Linear,
}
impl GptJLMHeadModel {
/// Build a new `GptJLMHeadModel`
///
/// # Arguments
///
/// * `p` - Variable store path for the root of the GPT-J model
/// * `config` - `GptJConfig` object defining the model architecture
///
/// # Example
///
/// ```no_run
/// use rust_bert::gpt_j::{GptJLMHeadModel, GptJConfig};
/// use rust_bert::Config;
/// use std::path::Path;
/// use tch::{nn, Device};
///
/// let config_path = Path::new("path/to/config.json");
/// let device = Device::Cpu;
/// let p = nn::VarStore::new(device);
/// let config = GptJConfig::from_file(config_path);
/// let gpt_j: GptJLMHeadModel = GptJLMHeadModel::new(&p.root() / "gpt_j", &config);
/// ```
pub fn new<'p, P>(p: P, config: &GptJConfig) -> GptJLMHeadModel
where
P: Borrow<nn::Path<'p>>,
{
let p = p.borrow();
let transformer = GptJModel::new(p, config);
let lm_head = nn::linear(
p / "lm_head",
config.n_embd,
config.vocab_size,
Default::default(),
);
if config.use_float16 {
(p / "lm_head").half();
}
GptJLMHeadModel {
transformer,
lm_head,
}
}
}
impl LMHeadModel for GptJLMHeadModel {
/// Forward pass through the model
///
/// # Arguments
///
/// * `input_ids` - Optional input tensor of shape (*batch size*, *sequence_length*). If None, pre-computed embeddings must be provided (see `input_embeds`)
/// * `layer_past` - Optional vector of size *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*). When provided, these are concatenated with the current input keys and values.
/// * `attention_mask` - Optional mask of shape (*batch size*, *sequence_length*). Masked position have value 0, non-masked value 1. If None set to 1
/// * `input_embeds` - Optional pre-computed input embeddings of shape (*batch size*, *sequence_length*, *hidden_size*). If None, input ids must be provided (see `input_ids`)
/// * `token_type_ids` - Optional token type ids used to indicate the portion of the input the token belongs to. If not None, token type embeddings will be added to the token and position embeddings.
/// * `_position_ids` - Optional position ids of shape (*batch size*, *sequence_length*). If None, will be incremented starting from the length of the past input.
/// * `_encoder_outputs` - Optional tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*). Unused for GPT-J
/// * `_decoder_input_ids` - Optional tensor of shape (*batch size*, *target_sequence_length*). Unused for GPT_J
/// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
///
///
/// # Returns
///
/// * `LMModelOutput` containing:
/// - `lm_logits` - `Tensor` of shape (*batch size*, *sequence_length*, *vocab_size*) representing the logits for each vocab item and position
/// - `cache` - `GptJCache` made of `Option<Vec<Tensor>>` of length *n_layer* containing the past keys and values of each layer of shape (*2*, *batch size*, *number of heads*, *past_sequence_length*, *hidden size per head*)
///
/// # Example
///
/// ```no_run
/// # use tch::{nn, Device, Tensor, no_grad};
/// # use rust_bert::Config;
/// # use std::path::Path;
/// # use tch::kind::Kind::{Int64, Double};
/// use rust_bert::gpt_j::{GptJLMHeadModel, GptJConfig};
/// use rust_bert::pipelines::generation_utils::{Cache, LMHeadModel};
/// # let config_path = Path::new("path/to/config.json");
/// # let vocab_path = Path::new("path/to/vocab.txt");
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = GptJConfig::from_file(config_path);
/// # let mut gpt_j_model: GptJLMHeadModel = GptJLMHeadModel::new(&vs.root(), &config);
/// let (batch_size, sequence_length, past_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, sequence_length], (Int64, device));
/// let mut past: Vec<Tensor> = Vec::with_capacity(config.n_layer as usize);
/// for _ in 0..config.n_layer as usize {
/// past.push(Tensor::rand(
/// &[
/// 2,
/// batch_size,
/// config.n_head,
/// past_sequence_length,
/// config.n_embd / config.n_head,
/// ],
/// (Double, device),
/// ))
/// }
/// let attention_mask = Tensor::zeros(&[batch_size, sequence_length], (Int64, device));
/// let token_type_ids = Tensor::ones(&[batch_size, sequence_length], (Int64, device));
/// let position_ids = Tensor::arange(sequence_length, (Int64, device))
/// .expand(&[batch_size, sequence_length], true);
///
/// let model_output = no_grad(|| {
/// gpt_j_model
/// .forward_t(
/// Some(&input_tensor),
/// Cache::GPTJCache(Some(past)),
/// Some(&attention_mask),
/// Some(&token_type_ids),
/// None,
/// None,
/// None,
/// None,
/// false,
/// )
/// .unwrap()
/// });
/// ```
fn forward_t(
&self,
input_ids: Option<&Tensor>,
layer_past: Cache,
attention_mask: Option<&Tensor>,
token_type_ids: Option<&Tensor>,
position_ids: Option<&Tensor>,
input_embeds: Option<&Tensor>,
_encoder_outputs: Option<&Tensor>,
_decoder_input_ids: Option<&Tensor>,
train: bool,
) -> Result<LMModelOutput, RustBertError> {
let base_model_output = match layer_past {
Cache::GPTJCache(layer_past) => self.transformer.forward_t(
input_ids,
layer_past,
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
),
Cache::None => self.transformer.forward_t(
input_ids,
None,
attention_mask,
token_type_ids,
position_ids,
input_embeds,
train,
),
_ => {
return Err(RustBertError::ValueError(
"Cache not compatible with GPT-J Model".into(),
));
}
}?;
let lm_logits = base_model_output.output.apply(&self.lm_head);
Ok(LMModelOutput {
lm_logits,
cache: Cache::GPTJCache(base_model_output.cache),
})
}
}
/// Container for the GPT-J model output.
pub struct GptJModelOutput {
/// Hidden state of the last layer of the decoder, or logits for a custom head
/// module after the decoder (e.g. vocabulary logits for language modeling tasks)
pub output: Tensor,
/// Cached attention layers keys and values if the model is used for generation
pub cache: Option<Vec<Option<LayerState>>>,
/// Hidden states for all intermediate layers
pub all_hidden_states: Option<Vec<Tensor>>,
/// Attention weights for all intermediate layers
pub all_attentions: Option<Vec<Tensor>>,
}
/// # Language generation model based on the GPT-J architecture
pub struct GptJGenerator {
model: GptJLMHeadModel,
tokenizer: TokenizerOption,
var_store: nn::VarStore,
generate_config: GenerateConfig,
bos_token_id: Option<i64>,
eos_token_ids: Option<Vec<i64>>,
pad_token_id: Option<i64>,
is_encoder_decoder: bool,
vocab_size: i64,
decoder_start_id: Option<i64>,
max_position_embeddings: i64,
}
impl GptJGenerator {
/// Build a new `GptJGenerator`
///
/// # Arguments
///
/// * `generate_config` - `GenerateConfig` object containing the resource references (model, vocabulary, configuration), generation options and device placement (CPU/GPU)
///
/// # Example
///
/// ```no_run
/// # fn main() -> anyhow::Result<()> {
/// use rust_bert::gpt_j::GptJGenerator;
/// use rust_bert::pipelines::generation_utils::GenerateConfig;
///
/// let generate_config = GenerateConfig {
/// max_length: 30,
/// do_sample: true,
/// num_beams: 5,
/// temperature: 1.1,
/// num_return_sequences: 3,
/// ..Default::default()
/// };
/// let gpt_j_generator = GptJGenerator::new(generate_config)?;
/// # Ok(())
/// # }
/// ```
pub fn new(generate_config: GenerateConfig) -> Result<GptJGenerator, RustBertError> {
let vocab_path = generate_config.vocab_resource.get_local_path()?;
let merges_path = generate_config
.merges_resource
.as_ref()
.ok_or_else(|| {
RustBertError::InvalidConfigurationError(
"GPT-J expects a merges resources to be provided".to_string(),
)
})?
.get_local_path()?;
let tokenizer = TokenizerOption::from_file(
ModelType::GPTJ,
vocab_path.to_str().unwrap(),
Some(merges_path.to_str().unwrap()),
false,
None,
None,
)?;
Self::new_with_tokenizer(generate_config, tokenizer)
}
pub fn new_with_tokenizer(
generate_config: GenerateConfig,
tokenizer: TokenizerOption,
) -> Result<GptJGenerator, RustBertError> {
let config_path = generate_config.config_resource.get_local_path()?;
let weights_path = generate_config.model_resource.get_local_path()?;
let device = generate_config.device;
generate_config.validate();
let mut var_store = nn::VarStore::new(device);
let config = GptJConfig::from_file(config_path);
let model = GptJLMHeadModel::new(var_store.root(), &config);
if config.preload_on_cpu && device != Device::Cpu {
var_store.set_device(Device::Cpu);
}
var_store.load(weights_path)?;
if device != Device::Cpu {
var_store.set_device(device);
}
let bos_token_id = tokenizer.get_bos_id();
let eos_token_ids = tokenizer.get_eos_id().map(|id| vec![id]);
let pad_token_id = tokenizer.get_pad_id();
let max_position_embeddings = config.n_positions;
let is_encoder_decoder = false;
let vocab_size = config.vocab_size;
let decoder_start_id = None;
Ok(GptJGenerator {
model,
tokenizer,
var_store,
generate_config,
bos_token_id,
eos_token_ids,
pad_token_id,
is_encoder_decoder,
vocab_size,
decoder_start_id,
max_position_embeddings,
})
}
}
impl PrivateLanguageGenerator<GptJLMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for GptJGenerator {
fn get_model(&self) -> &GptJLMHeadModel {
&self.model
}
fn _get_tokenizer(&self) -> &TokenizerOption {
&self.tokenizer
}
fn get_var_store(&self) -> &nn::VarStore {
&self.var_store
}
fn get_var_store_mut(&mut self) -> &mut nn::VarStore {
&mut self.var_store
}
fn get_config(&self) -> &GenerateConfig {
&self.generate_config
}
fn get_bos_id(&self) -> Option<i64> {
self.bos_token_id
}
fn get_eos_ids(&self) -> Option<&Vec<i64>> {
self.eos_token_ids.as_ref()
}
fn get_pad_id(&self) -> Option<i64> {
self.pad_token_id
}
fn is_encoder_decoder(&self) -> bool {
self.is_encoder_decoder
}
fn get_vocab_size(&self) -> i64 {
self.vocab_size
}
fn get_decoder_start_id(&self) -> Option<i64> {
self.decoder_start_id
}
fn get_max_positions_embeddings(&self) -> i64 {
self.max_position_embeddings
}
fn prepare_inputs_for_generation<'a>(
&self,
input_ids: Tensor,
_encoder_outputs: Option<&'a Tensor>,
past: Cache,
attention_mask: Tensor,
) -> PreparedInput<'a> {
match past {
Cache::GPTJCache(past) => {
if past.is_some() {
PreparedInput {
prepared_input: Some(input_ids.select(1, -1).unsqueeze(-1)),
prepared_attention_mask: Some(attention_mask),
prepared_encoder_output: None,
prepared_decoder_input: None,
prepared_position_ids: None,
prepared_past: Cache::GPTJCache(past),
}
} else {
PreparedInput {
prepared_input: Some(input_ids),
prepared_attention_mask: Some(attention_mask),
prepared_encoder_output: None,
prepared_decoder_input: None,
prepared_position_ids: None,
prepared_past: Cache::GPTJCache(None),
}
}
}
Cache::None => PreparedInput {
prepared_input: Some(input_ids),
prepared_attention_mask: Some(attention_mask),
prepared_encoder_output: None,
prepared_decoder_input: None,
prepared_position_ids: None,
prepared_past: Cache::GPTJCache(None),
},
_ => panic!("Cache type incompatible with GPT-J"),
}
}
fn reorder_cache(
&self,
past: &mut Cache,
_encoder_outputs: Option<Tensor>,
beam_indices: &Tensor,
) -> Option<Tensor> {
match past {
Cache::GPTJCache(cached_decoder_state) => match cached_decoder_state {
Some(old_cache) => {
for layer_state in old_cache.iter_mut() {
if layer_state.is_some() {
layer_state.as_mut().unwrap().reorder_cache(beam_indices)
};
}
None
}
None => None,
},
Cache::None => None,
_ => {
panic!("Invalid cache for GPT-J model");
}
}
}
}
impl LanguageGenerator<GptJLMHeadModel, Gpt2Vocab, Gpt2Tokenizer> for GptJGenerator {}

src/gpt_j/mod.rs (new file)

@@ -0,0 +1,58 @@
//! # GPT-J
//!
//! Implementation of the GPT-J language model
//!
//! # Model set-up and pre-trained weights loading
//!
//! ```no_run
//! # fn main() -> anyhow::Result<()> {
//! #
//! use tch::{nn, Device};
//! # use std::path::PathBuf;
//! use rust_bert::gpt_j::{GptJLMHeadModel, GptJConfig};
//! use rust_bert::resources::{LocalResource, ResourceProvider};
//! use rust_bert::Config;
//! use rust_tokenizers::tokenizer::Gpt2Tokenizer;
//!
//! let config_resource = LocalResource {
//! local_path: PathBuf::from("path/to/config.json"),
//! };
//! let vocab_resource = LocalResource {
//! local_path: PathBuf::from("path/to/vocab.txt"),
//! };
//! let merges_resource = LocalResource {
//! local_path: PathBuf::from("path/to/vocab.txt"),
//! };
//! let weights_resource = LocalResource {
//! local_path: PathBuf::from("path/to/model.ot"),
//! };
//! let config_path = config_resource.get_local_path()?;
//! let vocab_path = vocab_resource.get_local_path()?;
//! let merges_path = merges_resource.get_local_path()?;
//! let weights_path = weights_resource.get_local_path()?;
//!
//! let device = Device::cuda_if_available();
//! let mut vs = nn::VarStore::new(device);
//! let tokenizer: Gpt2Tokenizer = Gpt2Tokenizer::from_file(
//! vocab_path.to_str().unwrap(),
//! merges_path.to_str().unwrap(),
//! true,
//! )?;
//! let config = GptJConfig::from_file(config_path);
//! let gpt_j_model = GptJLMHeadModel::new(&vs.root(), &config);
//! vs.load(weights_path)?;
//!
//! # Ok(())
//! # }
//! ```
mod attention;
mod gpt_j_model;
mod transformer;
pub use gpt_j_model::{
GptJConfig, GptJConfigResources, GptJGenerator, GptJLMHeadModel, GptJMergesResources,
GptJModel, GptJModelOutput, GptJModelResources, GptJVocabResources,
};
pub use attention::LayerState;

src/gpt_j/transformer.rs (new file)

@@ -0,0 +1,131 @@
// Copyright 2021 The Eleuther AI and HuggingFace Inc. team. All rights reserved.
// Copyright 2022 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::common::activations::{Activation, TensorFunction};
use crate::common::dropout::Dropout;
use crate::gpt_j::attention::{GptJAttention, LayerState};
use crate::gpt_j::gpt_j_model::GptJConfig;
use std::borrow::Borrow;
use tch::nn::Linear;
use tch::{nn, Tensor};
pub struct GptJMLP {
fc_in: Linear,
fc_out: Linear,
activation: TensorFunction,
dropout: Dropout,
}
impl GptJMLP {
pub fn new<'p, P>(p: P, config: &GptJConfig) -> GptJMLP
where
P: Borrow<nn::Path<'p>>,
{
let p = p.borrow();
let intermediate_size = if let Some(n_inner) = config.n_inner {
n_inner
} else {
4 * config.n_embd
};
let fc_in = nn::linear(
p / "fc_in",
config.n_embd,
intermediate_size,
Default::default(),
);
if config.use_float16 {
(p / "fc_in").half()
};
let fc_out = nn::linear(
p / "fc_out",
intermediate_size,
config.n_embd,
Default::default(),
);
if config.use_float16 {
(p / "fc_out").half()
};
let activation = match &config.afn {
Some(activation_enum) => match activation_enum {
Activation::gelu => &Activation::gelu_new,
default => default,
},
None => &Activation::gelu_new,
}
.get_function();
let resid_pdrop = config.resid_pdrop.unwrap_or(0.1);
let dropout = Dropout::new(resid_pdrop);
GptJMLP {
fc_in,
fc_out,
activation,
dropout,
}
}
pub fn forward_t(&self, hidden_states: &Tensor, train: bool) -> Tensor {
let h = (self.activation.get_fn())(&hidden_states.apply(&self.fc_in));
h.apply(&self.fc_out).apply_t(&self.dropout, train)
}
}
pub struct GptJBlock {
ln_1: nn::LayerNorm,
attn: GptJAttention,
mlp: GptJMLP,
}
impl GptJBlock {
pub fn new<'p, P>(p: P, config: &GptJConfig) -> GptJBlock
where
P: Borrow<nn::Path<'p>>,
{
let p = p.borrow();
let layer_norm_config = nn::LayerNormConfig {
eps: config.layer_norm_epsilon,
..Default::default()
};
let ln_1 = nn::layer_norm(p / "ln_1", vec![config.n_embd], layer_norm_config);
if config.use_float16 {
(p / "ln_1").half()
};
let attn = GptJAttention::new(p / "attn", config);
let mlp = GptJMLP::new(p / "mlp", config);
GptJBlock { ln_1, attn, mlp }
}
pub fn forward_t(
&self,
hidden_states: &Tensor,
layer_past: Option<&LayerState>,
attention_mask: Option<&Tensor>,
train: bool,
) -> (Tensor, Option<LayerState>, Option<Tensor>) {
let residual = hidden_states;
let hidden_states = hidden_states.apply(&self.ln_1);
let (attn_output, present, attn_weights) =
self.attn
.forward_t(&hidden_states, attention_mask, layer_past, train);
let feed_forward_hidden_states = self.mlp.forward_t(&hidden_states, train);
let hidden_states = attn_output + feed_forward_hidden_states + residual;
(hidden_states, present, attn_weights)
}
}

src/lib.rs

@@ -584,7 +584,7 @@
//! # ;
//! ```
//! </details>
//! &nbsp;
//! &nbsp;
//! <details>
//! <summary> <b>11. Sentence embeddings </b> </summary>
//!
@@ -600,7 +600,7 @@
//! "this is an example sentence",
//! "each sentence is converted"
//! ];
//!
//!
//! let output = model.encode(&sentences);
//! # Ok(())
//! # }
@@ -708,6 +708,7 @@ pub mod distilbert;
pub mod electra;
pub mod fnet;
pub mod gpt2;
pub mod gpt_j;
pub mod gpt_neo;
pub mod longformer;
pub mod longt5;

src/pipelines/common.rs

@@ -26,6 +26,7 @@ use crate::distilbert::DistilBertConfig;
use crate::electra::ElectraConfig;
use crate::fnet::FNetConfig;
use crate::gpt2::Gpt2Config;
use crate::gpt_j::GptJConfig;
use crate::gpt_neo::GptNeoConfig;
use crate::longformer::LongformerConfig;
use crate::longt5::LongT5Config;
@@ -78,6 +79,7 @@ pub enum ModelType {
Albert,
XLNet,
GPT2,
GPTJ,
OpenAiGpt,
Reformer,
ProphetNet,
@@ -119,6 +121,8 @@ pub enum ConfigOption {
XLNet(XLNetConfig),
/// GPT2 configuration
GPT2(Gpt2Config),
/// GPT-J configuration
GPTJ(GptJConfig),
/// Reformer configuration
Reformer(ReformerConfig),
/// RoBERTa configuration
@@ -196,6 +200,7 @@ impl ConfigOption {
ModelType::Albert => ConfigOption::Albert(AlbertConfig::from_file(path)),
ModelType::XLNet => ConfigOption::XLNet(XLNetConfig::from_file(path)),
ModelType::GPT2 => ConfigOption::GPT2(Gpt2Config::from_file(path)),
ModelType::GPTJ => ConfigOption::GPTJ(GptJConfig::from_file(path)),
ModelType::GPTNeo => ConfigOption::GPTNeo(GptNeoConfig::from_file(path)),
ModelType::OpenAiGpt => ConfigOption::OpenAiGpt(OpenAiGptConfig::from_file(path)),
ModelType::Reformer => ConfigOption::Reformer(ReformerConfig::from_file(path)),
@@ -285,6 +290,7 @@ impl ConfigOption {
Self::LongT5(_) => panic!("LongT5 does not use a label mapping"),
Self::OpenAiGpt(_) => panic!("OpenAI GPT does not use a label mapping"),
Self::GPT2(_) => panic!("GPT2 does not use a label mapping"),
Self::GPTJ(_) => panic!("GPT-J does not use a label mapping"),
Self::GPTNeo(_) => panic!("GPT-Neo does not use a label mapping"),
Self::Pegasus(_) => panic!("Pegasus does not use a label mapping"),
}
@@ -305,6 +311,7 @@ impl ConfigOption {
Self::Albert(config) => Some(config.max_position_embeddings),
Self::XLNet(_) => None,
Self::GPT2(config) => Some(config.n_positions),
Self::GPTJ(config) => Some(config.n_positions),
Self::Reformer(config) => Some(config.max_position_embeddings),
Self::ProphetNet(config) => Some(config.max_position_embeddings),
Self::Longformer(config) => Some(config.max_position_embeddings),
@@ -555,11 +562,13 @@ impl TokenizerOption {
}
TokenizerOption::Reformer(ReformerTokenizer::from_file(vocab_path, lower_case)?)
}
ModelType::GPT2 | ModelType::GPTNeo => TokenizerOption::GPT2(Gpt2Tokenizer::from_file(
vocab_path,
merges_path.expect("No merges specified!"),
lower_case,
)?),
ModelType::GPT2 | ModelType::GPTNeo | ModelType::GPTJ => {
TokenizerOption::GPT2(Gpt2Tokenizer::from_file(
vocab_path,
merges_path.expect("No merges specified!"),
lower_case,
)?)
}
ModelType::OpenAiGpt => TokenizerOption::OpenAiGpt(OpenAiGptTokenizer::from_file(
vocab_path,
merges_path.expect("No merges specified!"),

src/pipelines/generation_utils.rs

@@ -74,6 +74,7 @@ use tch::{no_grad, Device, Tensor};
use crate::bart::LayerState as BartLayerState;
use crate::common::error::RustBertError;
use crate::common::resources::ResourceProvider;
use crate::gpt_j::LayerState as GPTJLayerState;
use crate::gpt_neo::LayerState as GPTNeoLayerState;
use crate::pipelines::generation_utils::private_generation_utils::{
InternalGenerateOptions, PrivateLanguageGenerator,
@@ -224,6 +225,7 @@ pub enum Cache {
ReformerCache(Option<Vec<Option<ReformerLayerState>>>),
ProphetNetCache(Option<Vec<(Option<ProphetNetLayerState>, Option<ProphetNetLayerState>)>>),
GPTNeoCache(Option<Vec<Option<GPTNeoLayerState>>>),
GPTJCache(Option<Vec<Option<GPTJLayerState>>>),
None,
}

src/pipelines/text_generation.rs

@@ -35,6 +35,7 @@ use tch::Device;
use crate::common::error::RustBertError;
use crate::gpt2::GPT2Generator;
use crate::gpt_j::GptJGenerator;
use crate::gpt_neo::GptNeoGenerator;
use crate::openai_gpt::OpenAIGenerator;
use crate::pipelines::common::{ModelType, TokenizerOption};
@@ -192,6 +193,8 @@ pub enum TextGenerationOption {
GPT(OpenAIGenerator),
/// Text Generator based on GPT-Neo model
GPTNeo(GptNeoGenerator),
/// Text Generator based on GPT-J model
GPTJ(GptJGenerator),
/// Text Generator based on XLNet model
XLNet(XLNetGenerator),
/// Text Generator based on Reformer model
@@ -216,6 +219,9 @@ impl TextGenerationOption {
ModelType::GPTNeo => Ok(TextGenerationOption::GPTNeo(GptNeoGenerator::new(
config.into(),
)?)),
ModelType::GPTJ => Ok(TextGenerationOption::GPTJ(GptJGenerator::new(
config.into(),
)?)),
_ => Err(RustBertError::InvalidConfigurationError(format!(
"Text generation not implemented for {:?}!",
config.model_type
@@ -229,6 +235,7 @@ impl TextGenerationOption {
Self::GPT(_) => ModelType::OpenAiGpt,
Self::GPT2(_) => ModelType::GPT2,
Self::GPTNeo(_) => ModelType::GPTNeo,
Self::GPTJ(_) => ModelType::GPTJ,
Self::XLNet(_) => ModelType::XLNet,
Self::Reformer(_) => ModelType::Reformer,
}
@@ -240,6 +247,7 @@ impl TextGenerationOption {
Self::GPT(model_ref) => model_ref._get_tokenizer(),
Self::GPT2(model_ref) => model_ref._get_tokenizer(),
Self::GPTNeo(model_ref) => model_ref._get_tokenizer(),
Self::GPTJ(model_ref) => model_ref._get_tokenizer(),
Self::XLNet(model_ref) => model_ref._get_tokenizer(),
Self::Reformer(model_ref) => model_ref._get_tokenizer(),
}
@@ -276,6 +284,11 @@ impl TextGenerationOption {
.into_iter()
.map(|output| output.indices)
.collect(),
Self::GPTJ(ref model) => model
.generate_indices(prompt_texts, generate_options)
.into_iter()
.map(|output| output.indices)
.collect(),
Self::XLNet(ref model) => model
.generate_indices(prompt_texts, generate_options)
.into_iter()
@@ -294,6 +307,7 @@ impl TextGenerationOption {
Self::GPT(model_ref) => model_ref.half(),
Self::GPT2(model_ref) => model_ref.half(),
Self::GPTNeo(model_ref) => model_ref.half(),
Self::GPTJ(model_ref) => model_ref.half(),
Self::XLNet(model_ref) => model_ref.half(),
Self::Reformer(model_ref) => model_ref.half(),
}
@@ -304,6 +318,7 @@ impl TextGenerationOption {
Self::GPT(model_ref) => model_ref.float(),
Self::GPT2(model_ref) => model_ref.float(),
Self::GPTNeo(model_ref) => model_ref.float(),
Self::GPTJ(model_ref) => model_ref.float(),
Self::XLNet(model_ref) => model_ref.float(),
Self::Reformer(model_ref) => model_ref.float(),
}
@@ -314,6 +329,7 @@ impl TextGenerationOption {
Self::GPT(model_ref) => model_ref.set_device(device),
Self::GPT2(model_ref) => model_ref.set_device(device),
Self::GPTNeo(model_ref) => model_ref.set_device(device),
Self::GPTJ(model_ref) => model_ref.set_device(device),
Self::XLNet(model_ref) => model_ref.set_device(device),
Self::Reformer(model_ref) => model_ref.set_device(device),
}

tests/gpt_j.rs (new file)

@@ -0,0 +1,172 @@
use rust_bert::gpt_j::{
GptJConfig, GptJConfigResources, GptJLMHeadModel, GptJMergesResources, GptJModelResources,
GptJVocabResources,
};
use rust_bert::pipelines::generation_utils::{Cache, LMHeadModel};
use rust_bert::resources::{RemoteResource, ResourceProvider};
use rust_bert::Config;
use rust_tokenizers::tokenizer::{Gpt2Tokenizer, Tokenizer};
use rust_tokenizers::vocab::Vocab;
use tch::{nn, Device, Tensor};
/// Equivalent Python code:
///
/// ```python
/// import torch
/// from transformers import AutoTokenizer, GPTJForCausalLM
///
/// device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
///
/// model = GPTJForCausalLM.from_pretrained("anton-l/gpt-j-tiny-random").to(device)
/// if torch.cuda.is_available(): model = model.half()
///
/// tokenizer = AutoTokenizer.from_pretrained("anton-l/gpt-j-tiny-random", padding_side="left")
/// tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
///
/// prompts = ["It was a very nice and sunny", "It was a gloom winter night, and"]
/// inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)
///
/// with torch.no_grad():
/// model.forward(**inputs).logits
/// ```
#[test]
fn gpt_j_correctness() -> anyhow::Result<()> {
// Resources paths
let config_resource = Box::new(RemoteResource::from_pretrained(
GptJConfigResources::GPT_J_TINY_RANDOM,
));
let vocab_resource = Box::new(RemoteResource::from_pretrained(
GptJVocabResources::GPT_J_TINY_RANDOM,
));
let merges_resource = Box::new(RemoteResource::from_pretrained(
GptJMergesResources::GPT_J_TINY_RANDOM,
));
let model_resource = Box::new(RemoteResource::from_pretrained(
GptJModelResources::GPT_J_TINY_RANDOM,
));
let device = Device::cuda_if_available();
// Set-up tokenizer
let vocab_path = vocab_resource.get_local_path()?;
let merges_path = merges_resource.get_local_path()?;
let lower_case = false;
let tokenizer = Gpt2Tokenizer::from_file(
vocab_path.to_str().unwrap(),
merges_path.to_str().unwrap(),
lower_case,
)?;
// Set-up model
let mut vs = nn::VarStore::new(device);
let config_path = config_resource.get_local_path()?;
let weights_path = model_resource.get_local_path()?;
let mut config = GptJConfig::from_file(config_path);
config.use_float16 = matches!(device, Device::Cuda(_));
let model = GptJLMHeadModel::new(vs.root(), &config);
vs.load(weights_path)?;
// Tokenize prompts
let prompts = [
"It was a very nice and sunny",
"It was a gloom winter night, and",
];
let pad_token = tokenizer.vocab().get_eos_value();
let &pad_token = tokenizer
.vocab()
.special_values()
.get(pad_token)
.unwrap_or(&2);
let tokens = Tokenizer::tokenize_list(&tokenizer, &prompts);
let max_len = tokens.iter().map(|input| input.len()).max().unwrap_or(0);
let token_ids = tokens
.into_iter()
.map(|prompt_tokens| {
let token_ids = tokenizer.convert_tokens_to_ids(&prompt_tokens);
let mut padded = vec![pad_token; max_len - token_ids.len()];
padded.extend(token_ids);
padded
})
.collect::<Vec<Vec<i64>>>();
let token_masks = token_ids
.iter()
.map(|input| {
Tensor::of_slice(
&input
.iter()
.map(|&e| i64::from(e != pad_token))
.collect::<Vec<_>>(),
)
.to(device)
})
.collect::<Vec<_>>();
let token_ids = token_ids
.into_iter()
.map(|tokens| Tensor::of_slice(&tokens).to(device))
.collect::<Vec<Tensor>>();
let input_tensor = Tensor::stack(&token_ids, 0);
let attention_tensor = Tensor::stack(&token_masks, 0);
// Run model inference
let logits = tch::no_grad(|| {
model.forward_t(
Some(&input_tensor),
Cache::None,
// None,
Some(&attention_tensor),
None,
None,
None,
None,
None,
false,
)
})?
.lm_logits;
if matches!(device, Device::Cpu) {
assert!((logits.double_value(&[0, 0, 0]) - -0.8343).abs() < 1e-4);
assert!((logits.double_value(&[0, 0, 1]) - 0.0203).abs() < 1e-4);
assert!((logits.double_value(&[0, 0, 2]) - 0.4745).abs() < 1e-4);
assert!((logits.double_value(&[0, 0, 50397]) - 0.2641).abs() < 1e-4);
assert!((logits.double_value(&[0, 0, 50398]) - 0.1926).abs() < 1e-4);
assert!((logits.double_value(&[0, 0, 50399]) - 0.0204).abs() < 1e-4);
assert!((logits.double_value(&[1, 0, 0]) - -0.0647).abs() < 1e-4);
assert!((logits.double_value(&[1, 0, 1]) - 0.0105).abs() < 1e-4);
assert!((logits.double_value(&[1, 0, 2]) - -0.3448).abs() < 1e-4);
assert!((logits.double_value(&[1, 0, 50397]) - -0.0445).abs() < 1e-4);
assert!((logits.double_value(&[1, 0, 50398]) - 0.0639).abs() < 1e-4);
assert!((logits.double_value(&[1, 0, 50399]) - -0.1167).abs() < 1e-4);
} else {
assert!((logits.double_value(&[0, 0, 0]) - -0.1110).abs() < 1e-2);
assert!((logits.double_value(&[0, 0, 1]) - 0.0565).abs() < 1e-2);
assert!((logits.double_value(&[0, 0, 2]) - 0.1273).abs() < 1e-2);
assert!((logits.double_value(&[0, 0, 50397]) - -0.1879).abs() < 1e-2);
assert!((logits.double_value(&[0, 0, 50398]) - -0.1114).abs() < 1e-2);
assert!((logits.double_value(&[0, 0, 50399]) - -0.3042).abs() < 1e-2);
assert!((logits.double_value(&[1, 0, 0]) - -0.0651).abs() < 1e-2);
assert!((logits.double_value(&[1, 0, 1]) - 0.0107).abs() < 1e-2);
assert!((logits.double_value(&[1, 0, 2]) - -0.3452).abs() < 1e-2);
assert!((logits.double_value(&[1, 0, 50397]) - -0.0436).abs() < 1e-2);
assert!((logits.double_value(&[1, 0, 50398]) - 0.0645).abs() < 1e-2);
assert!((logits.double_value(&[1, 0, 50399]) - -0.1166).abs() < 1e-2);
}
Ok(())
}
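This is the test wired into the CI job at the top of the commit via `--test gpt_j`; locally it can presumably be run the same way, e.g.:

```
cargo test --package rust-bert --test gpt_j
```

The tiny random checkpoint keeps the download small, while the hard-coded logits check the Rust forward pass against the Python reference shown in the doc comment.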

utils/convert_model.py

@@ -1,25 +1,41 @@
from pathlib import Path
import numpy as np
import torch
import subprocess
import argparse
import numpy as np
import subprocess
import sys
import torch
from pathlib import Path
from torch import Tensor
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("source_file", help="Absolute path to the Pytorch weights file to convert")
parser.add_argument("--skip_embeddings", action="store_true", help="Skip shared embeddings")
parser.add_argument("--skip_lm_head", action="store_true", help="Skip language model head")
parser.add_argument(
"source_file", help="Absolute path to the Pytorch weights file to convert"
)
parser.add_argument(
"--skip_embeddings",
action="store_true",
help="Skip shared embeddings",
)
parser.add_argument(
"--skip_lm_head", action="store_true", help="Skip language model head"
)
parser.add_argument("--prefix", help="Add a prefix on weight names")
parser.add_argument("--suffix", action="store_true", help="Split weight names on '.' and keep only last part")
parser.add_argument(
"--suffix",
action="store_true",
help="Split weight names on '.' and keep only last part",
)
parser.add_argument(
"--dtype",
help="Convert weights to a specific numpy DataType (float32, float16, ...)",
)
args = parser.parse_args()
source_file = Path(args.source_file)
target_folder = source_file.parent
weights = torch.load(str(source_file), map_location='cpu')
weights = torch.load(str(source_file), map_location="cpu")
nps = {}
for k, v in weights.items():
@@ -29,7 +45,7 @@ if __name__ == "__main__":
"model.encoder.embed_tokens.weight",
"encoder.embed_tokens.weight",
"model.decoder.embed_tokens.weight",
"decoder.embed_tokens.weight"
"decoder.embed_tokens.weight",
}:
continue
if args.skip_lm_head:
@@ -40,18 +56,30 @@ if __name__ == "__main__":
if args.prefix:
k = args.prefix + k
if args.suffix:
k = k.split('.')[-1]
k = k.split(".")[-1]
if isinstance(v, Tensor):
nps[k] = np.ascontiguousarray(v.cpu().numpy().astype(np.float32))
print(f'converted {k} - {str(sys.getsizeof(nps[k]))} bytes')
tensor = v.cpu().numpy()
if args.dtype is not None:
nps[k] = np.ascontiguousarray(tensor.astype(np.dtype(args.dtype)))
else:
nps[k] = np.ascontiguousarray(tensor)
print(f"converted {k} - {str(sys.getsizeof(nps[k]))} bytes")
else:
print(f'skipped non-tensor object: {k}')
np.savez(target_folder / 'model.npz', **nps)
print(f"skipped non-tensor object: {k}")
np.savez(target_folder / "model.npz", **nps)
source = str(target_folder / 'model.npz')
target = str(target_folder / 'rust_model.ot')
source = str(target_folder / "model.npz")
target = str(target_folder / "rust_model.ot")
toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()
toml_location = (Path(__file__).resolve() / ".." / ".." / "Cargo.toml").resolve()
subprocess.run(
['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target],
[
"cargo",
"run",
"--bin=convert-tensor",
"--manifest-path=%s" % toml_location,
"--",
source,
target,
],
)
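The new `--dtype` flag is what makes GPT-J weights manageable to convert: the old script unconditionally cast every tensor to `np.float32`, whereas the updated one preserves the original dtype by default and converts only when asked. A hedged usage example (paths illustrative), forcing half-precision output for the float16 checkpoint referenced in `examples/generation_gptj.rs`:

```
python utils/convert_model.py --dtype float16 resources/gpt-j-6B-float16/pytorch_model.bin
```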