Merge branch 'facebookresearch:main' into main

commit c2e44c09da
Author: Abdallah Nasir, 2023-04-17 15:50:50 +03:00 (committed by GitHub)
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
737 changed files with 65011 additions and 4181 deletions

.circleci/config.yml (new file, 128 lines)

@@ -0,0 +1,128 @@
# Use 2.1 for orbs
version: 2.1
# -------------------------------------------------------------------------------------
# Environments to run the jobs in
# -------------------------------------------------------------------------------------
gpu: &gpu
environment:
CUDA_VERSION: "11.2"
machine:
image: ubuntu-2004-cuda-11.2:202103-01
resource_class: gpu.nvidia.medium.multi
# -------------------------------------------------------------------------------------
# Re-usable commands
# -------------------------------------------------------------------------------------
cache_key: &cache_key cache-key-{{ .Environment.CIRCLE_JOB }}-{{ checksum ".circleci/config.yml" }}-{{ checksum "setup.py"}}
install_dep_pt1_10: &install_dep_pt1_10
- run:
name: Install Pytorch Dependencies
command: |
source activate fairseq
pip install --upgrade setuptools
pip install torch==1.10.1+cu111 torchaudio==0.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
python -c 'import torch; print("Torch version:", torch.__version__)'
install_dep_pt1_12: &install_dep_pt1_12
- run:
name: Install Pytorch Dependencies
command: |
source activate fairseq
pip install --upgrade setuptools
pip install torch==1.12.1+cu116 torchaudio==0.12.1+cu116 -f https://download.pytorch.org/whl/torch_stable.html
python -c 'import torch; print("Torch version:", torch.__version__)'
install_repo: &install_repo
- run:
name: Install Repository
command: |
source activate fairseq
python -m pip install fairscale
python -m pip install -e '.[dev,docs]'
python -c 'import torch; print("Torch version:", torch.__version__)'
run_unittests: &run_unittests
- run:
name: Run Unit Tests
command: |
source activate fairseq
pytest tests/gpu/test_binaries_gpu.py
check_nvidia_driver: &check_nvidia_driver
- run:
name: Check NVIDIA Driver
working_directory: ~/
command: |
pyenv versions
nvidia-smi
create_conda_env: &create_conda_env
- run:
name: Install and Create Conda Environment
command: |
curl -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x ~/miniconda.sh
bash ~/miniconda.sh -b -p $HOME/miniconda
rm ~/miniconda.sh
echo 'export PATH=$HOME/miniconda/bin:$PATH' >> $BASH_ENV
source $BASH_ENV
if [ ! -d ~/miniconda/envs/fairseq ]
then
conda create -y -n fairseq python=3.8
fi
source activate fairseq
python --version
pip install --upgrade pip
# -------------------------------------------------------------------------------------
# Jobs to run
# -------------------------------------------------------------------------------------
jobs:
gpu_tests_pt1_10:
<<: *gpu
working_directory: ~/fairseq-py
steps:
- checkout
- <<: *check_nvidia_driver
- <<: *create_conda_env
- restore_cache:
key: *cache_key
- <<: *install_dep_pt1_10
- save_cache:
paths:
- ~/miniconda/
key: *cache_key
- <<: *install_repo
- <<: *run_unittests
gpu_tests_pt1_12:
<<: *gpu
working_directory: ~/fairseq-py
steps:
- checkout
- <<: *check_nvidia_driver
- <<: *create_conda_env
- restore_cache:
key: *cache_key
- <<: *install_dep_pt1_12
- save_cache:
paths:
- ~/miniconda/
key: *cache_key
- <<: *install_repo
- <<: *run_unittests
workflows:
version: 2
build:
jobs:
- gpu_tests_pt1_12
- gpu_tests_pt1_10

.github/CODEOWNERS (new file, 21 lines)

@@ -0,0 +1,21 @@
# Setting up CODEOWNERS for UST related codebase
# Documentation for open sourced models relevant to UST
examples/speech_to_text @kahne @sravyapopuri388 @jmp84
examples/speech_to_speech @an918tw @sravyapopuri388 @jmp84
examples/speech_synthesis @kahne @jmp84
examples/simultaneous_translation @kahne @jmp84
examples/speech_text_joint_to_text @yuntang @jmp84
# Speech related models relevant to UST
fairseq/models/speech_to_speech @sravyapopuri388 @jmp84
fairseq/models/speech_to_text @kahne @sravyapopuri388 @jmp84
fairseq/models/text_to_speech @kahne @jmp84
# CONFORMER IMPLEMENTATION
fairseq/modules/conformer_layer.py @sravyapopuri388 @jmp84
fairseq/modules/espnet_multihead_attention.py @sravyapopuri388 @jmp84
fairseq/modules/rotary_positional_embedding.py @sravyapopuri388 @jmp84
fairseq/modules/positional_encoding.py @sravyapopuri388 @jmp84
# Machine Translation/NLLB
fairseq/tasks/translation.py @gwenzek


@@ -14,7 +14,7 @@ jobs:
max-parallel: 4
matrix:
platform: [ubuntu-latest, macos-latest]
python-version: [3.6, 3.7]
python-version: [3.8, 3.9]
runs-on: ${{ matrix.platform }}
@@ -34,22 +34,48 @@ jobs:
run: |
python -m pip install --upgrade pip
git submodule update --init --recursive
python setup.py build_ext --inplace
python -m pip install --editable .
python -m pip install .
- name: Check installation
working-directory: /tmp
run: python $GITHUB_WORKSPACE/scripts/check_installation.py
- name: Install optional test requirements
run: |
python -m pip install '.[dev,docs]'
python -m pip install iopath transformers pyarrow
python -m pip install git+https://github.com/facebookresearch/fairscale.git@main
python -m pip install pygit2 pgzip
- name: Install xformers for Macos
if: matrix.platform == 'macos-latest'
run: |
brew install llvm libomp
CC=/usr/local/opt/llvm/bin/clang CXX=clang++ pip install git+https://github.com/facebookresearch/xformers.git@main
- name: Install xformers for non-MacOS
if: matrix.platform != 'macos-latest'
run: |
python -m pip install --progress-bar off git+https://github.com/facebookresearch/xformers.git@main
- name: Lint with black
run: black --check --diff .
- name: Lint with flake8
run: |
pip install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --extend-exclude fairseq/model_parallel/megatron
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --extend-exclude fairseq/model_parallel/megatron
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Build doc
run: make singlehtml
working-directory: docs/
- name: Run tests
run: |
python setup.py test
# When installing in non-editable mode, the .so files will be generated in 'site-packages/fairseq'.
# But by default, pytest's import machinery will load the local fairseq and won't see the .so.
# Use --import-mode=append to favor the 'site-packages/fairseq'.
# https://docs.pytest.org/en/7.1.x/explanation/pythonpath.html
run: pytest --import-mode=append -vvv tests/


@@ -1,41 +0,0 @@
name: build_wheels
on:
push:
branches:
- v[0-9]+.[0-9]+.[x0-9]+
tags:
- v*
jobs:
build_wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
steps:
- uses: actions/checkout@v2
- name: Install Python
uses: actions/setup-python@v2
with:
python-version: '3.7'
- name: Install cibuildwheel
run: |
python -m pip install cibuildwheel
- name: Build wheels for CPython
run: |
python -m cibuildwheel --output-dir dist
env:
CIBW_BUILD: "cp36-*64 cp37-*64 cp38-*64"
CIBW_MANYLINUX_X86_64_IMAGE: manylinux1
CIBW_BEFORE_BUILD: git submodule update --init --recursive && pip install .
- uses: actions/upload-artifact@v2
with:
name: wheels
path: ./dist/*.whl

.github/workflows/release.yml (new file, 161 lines)

@@ -0,0 +1,161 @@
name: Fairseq Release
on:
workflow_dispatch:
inputs:
name:
description: 'Release Type'
default: 'patch'
required: true
jobs:
get_next_version:
runs-on: ubuntu-latest
steps:
- name: checkout-repo-content
uses: actions/checkout@v2
- name: setup-python
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: get next version and tag
id: get-next-version-and-tag
run: |
output=$(python3 release_utils.py --release-type ${{ github.event.inputs.name }})
echo $output
new_version=$(echo $output | awk '{print $1}')
new_tag=$(echo $output | awk '{print $2}')
echo "new version is $new_version"
echo "new tag is $new_tag"
echo ::set-output name=version::$new_version
echo ::set-output name=tag::$new_tag
echo ::set-output name=branch_name::$new_version-release
echo "NEW_TAG=$new_tag" >> $GITHUB_ENV
echo "NEW_BRANCH=$new_version-release" >> $GITHUB_ENV
# update the version number in version.txt
- name: update version
id: update-version
run : |
echo "current folder = $PWD"
echo "current branch = $(git branch --show-current)"
output=$(python3 release_utils.py --release-type ${{ github.event.inputs.name }} --update-version)
- name: add and commit
uses: EndBug/add-and-commit@v9
with:
author_name: ${{ secrets.AUTHOR_NAME }}
author_email: ${{ secrets.AUTHOR_EMAIL }}
# TODO: change this to main once shipit is disabled.
new_branch: '${{ env.NEW_BRANCH }}'
default_author: github_actor
message: '${{ env.NEW_TAG }} release'
pathspec_error_handling: exitAtEnd
# Arguments for the git pull command. Use NO-PULL to avoid the action pulling at all.
# pull: 'NO-PULL'
tag: '${{ env.NEW_TAG }}'
outputs:
new_version: ${{ steps.get-next-version-and-tag.outputs.version }}
new_tag: ${{ steps.get-next-version-and-tag.outputs.tag }}
branch_name: ${{ steps.get-next-version-and-tag.outputs.branch_name }}
create_sdist:
runs-on: ubuntu-latest
name: Create Source Distribution
needs: get_next_version
steps:
- uses: actions/checkout@v3
with:
ref: ${{ needs.get_next_version.outputs.branch_name }}
- name: Install Python
uses: actions/setup-python@v2
with:
python-version: '3.8'
- name: Upgrade pip
run: |
python3 -m pip install --upgrade pip
- name: Create Source Distribution
run: |
python3 -m pip install setuptools wheel twine torch
python3 setup.py sdist
- uses: actions/upload-artifact@v2
with:
path: dist/*.tar.gz
build_wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
needs: get_next_version
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
steps:
- uses: actions/checkout@v3
with:
ref: ${{ needs.get_next_version.outputs.branch_name }}
- name: Install Python
uses: actions/setup-python@v2
with:
python-version: '3.8'
- name: Upgrade pip
run: |
python3 -m pip install --upgrade pip
- name: Install cibuildwheel
run: |
python3 -m pip install cibuildwheel
- name: Build wheels for CPython
run: |
python3 -m cibuildwheel --output-dir dist
env:
CIBW_BUILD: "cp38-*64"
CIBW_MANYLINUX_X86_64_IMAGE: manylinux1
CIBW_BEFORE_BUILD: git submodule update --init --recursive && pip install .
# Install system library
CIBW_BEFORE_BUILD_LINUX: (yum install -y libffi-devel || apt-get install -y libffi-devel || apk add --update --no-cache libffi-devel || true) && (yum install -y libc6 || apt-get install -y libc6 || apk add --update --no-cache libc6 || true)
CIBW_ENVIRONMENT: "PIP_ONLY_BINARY=numpy"
CIBW_SKIP: "*musllinux*"
- uses: actions/upload-artifact@v2
with:
path: dist
upload:
name: Upload to PyPi and create release
runs-on: ubuntu-latest
needs: [build_wheels, create_sdist, get_next_version]
steps:
- uses: actions/download-artifact@v2
with:
name: artifact
path: dist
# build the PyPI package and upload it
- name: upload
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
pip install setuptools wheel twine
python3 -m twine upload --repository pypi dist/*
# create the release on github
- name: create release on github
uses: ncipollo/release-action@v1
with:
tag: '${{ needs.get_next_version.outputs.new_tag }}'

.gitignore (5 lines changed)

@@ -134,3 +134,8 @@ experimental/*
# Weights and Biases logs
wandb/
# Hydra artifacts
nohup.out
multirun
outputs

.pre-commit-config.yaml (new file, 40 lines)

@@ -0,0 +1,40 @@
exclude: 'build|stubs'
default_language_version:
python: python3
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
hooks:
- id: trailing-whitespace
- id: check-ast
- id: check-merge-conflict
- id: no-commit-to-branch
args: ['--branch=master']
- id: check-added-large-files
args: ['--maxkb=500']
- id: end-of-file-fixer
- repo: https://github.com/ambv/black
rev: 22.3.0
hooks:
- id: black
language_version: python3.8
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
hooks:
- id: flake8
args: [
# only error for syntax errors and undefined names
"--select=E9,F63,F7,F82",
]
- repo: https://github.com/pycqa/isort
rev: 5.10.1
hooks:
- id: isort
exclude: README.md
additional_dependencies: [toml]
args: ["--profile", "black"]


@@ -26,3 +26,57 @@ clear and has sufficient instructions to be able to reproduce the issue.
By contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq),
you agree that your contributions will be licensed under the LICENSE file in
the root directory of this source tree.
## Pre-commit hooks
To ensure your code lints cleanly, the repository has pre-commit hooks configured, which you can install.
After installation, they will automatically run each time you commit.
An abbreviated guide is given below; for more information, refer to [the official pre-commit documentation](https://pre-commit.com/).
### Installation
```
pip install pre-commit
pre-commit install
```
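You can also run all hooks against every file in the repository at once, which is handy before opening a pull request:
```
pre-commit run --all-files
```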
### Usage
Just commit your changes:
```
git commit -m "My informative commit message"
```
If a hook fails, you will get feedback like the following:
```
[INFO] Initializing environment for https://github.com/PyCQA/flake8.
[INFO] Installing environment for https://github.com/pre-commit/pre-commit-hooks.
[INFO] Once installed this environment will be reused.
[INFO] This may take a few minutes...
[INFO] Installing environment for https://github.com/PyCQA/flake8.
[INFO] Once installed this environment will be reused.
[INFO] This may take a few minutes...
Trim Trailing Whitespace.................................................Failed
- hook id: trailing-whitespace
- exit code: 1
- files were modified by this hook
Fixing examples/nllb/modeling/wmt15_benchmark/eval_langs2.sh
Fix End of Files.........................................................Failed
- hook id: end-of-file-fixer
- exit code: 1
- files were modified by this hook
Fixing examples/few_shot/scripts/schedule_jobs_few_shot.py
flake8...................................................................Passed
```
Certain hooks modify your files to comply.
To include these modifications, you will need to stage them (e.g. `git add ...`) and commit again.
If all is well, you should see something like:
```
Trim Trailing Whitespace.................................................Passed
Fix End of Files.........................................................Passed
flake8...................................................................Passed
[gshard-fix-ci 8698644e1] Fix lint, add pre-commit hooks
10 files changed, 148 insertions(+), 110 deletions(-)
create mode 100644 .flake8
create mode 100644 .pre-commit-config.yaml
rename examples/nllb/modeling/wmt15_benchmark/{eval_langs2.py => eval_langs2.sh} (99%)
```

MANIFEST.in (new file, 1 line)

@@ -0,0 +1 @@
include fairseq/version.txt


@@ -2,10 +2,12 @@
<img src="docs/fairseq_logo.png" width="150">
<br />
<br />
<a href="https://opensource.fb.com/support-ukraine"><img alt="Support Ukraine" src="https://img.shields.io/badge/Support-Ukraine-FFD500?style=flat&labelColor=005BBB" /></a>
<a href="https://github.com/pytorch/fairseq/blob/main/LICENSE"><img alt="MIT License" src="https://img.shields.io/badge/license-MIT-blue.svg" /></a>
<a href="https://github.com/pytorch/fairseq/releases"><img alt="Latest Release" src="https://img.shields.io/github/release/pytorch/fairseq.svg" /></a>
<a href="https://github.com/pytorch/fairseq/actions?query=workflow:build"><img alt="Build Status" src="https://github.com/pytorch/fairseq/workflows/build/badge.svg" /></a>
<a href="https://fairseq.readthedocs.io/en/latest/?badge=latest"><img alt="Documentation Status" src="https://readthedocs.org/projects/fairseq/badge/?version=latest" /></a>
<a href="https://app.circleci.com/pipelines/github/facebookresearch/fairseq/"><img alt="CicleCI Status" src="https://circleci.com/gh/facebookresearch/fairseq.svg?style=shield" /></a>
</p>
--------------------------------------------------------------------------------
@@ -68,6 +70,9 @@ We provide reference implementations of various sequence modeling papers:
</p></details>
### What's New:
* June 2022 [Released code for wav2vec-U 2.0 from Towards End-to-end Unsupervised Speech Recognition (Liu, et al., 2022)](examples/wav2vec/unsupervised/README.md)
* May 2022 [Integration with xFormers](https://github.com/facebookresearch/xformers)
* December 2021 [Released Direct speech-to-speech translation code](examples/speech_to_speech/README.md)
* October 2021 [Released VideoCLIP and VLM models](examples/MMPT/README.md)
* October 2021 [Released multilingual finetuned XLSR-53 model](examples/wav2vec/README.md)
* September 2021 [`master` branch renamed to `main`](https://github.com/github/renaming).
@@ -142,8 +147,8 @@ and [RoBERTa](https://pytorch.org/hub/pytorch_fairseq_roberta/) for more example
# Requirements and Installation
* [PyTorch](http://pytorch.org/) version >= 1.5.0
* Python version >= 3.6
* [PyTorch](http://pytorch.org/) version >= 1.10.0
* Python version >= 3.8
* For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl)
* **To install fairseq** and develop locally:
@@ -189,6 +194,7 @@ as well as example training and evaluation commands.
We also have more detailed READMEs to reproduce results from specific papers:
* [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale (Babu et al., 2021)](examples/wav2vec/xlsr/README.md)
* [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md)
* [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md)
* [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md)

RELEASE.md (new file, 13 lines)

@@ -0,0 +1,13 @@
# Creating a New Release
In order to create a new release:
1. Navigate to the [Fairseq Workflows](https://github.com/facebookresearch/fairseq/actions) and find the one named _Fairseq Release_.
2. Under _Run Workflow_ choose the branch `main` and for _Release Type_ enter either `major`, `minor`, or `patch`.
3. A branch named `$new_version-release` will be created where the `version.txt` file is updated. Merge those changes into `main`.
4. Make sure that a [new PYPI package](https://pypi.org/project/fairseq/) has been uploaded.
5. Make sure that a [new github release](https://github.com/facebookresearch/fairseq/releases) has been created.
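For reference, the version bump in step 3 is computed by `release_utils.py`, the same script the _Fairseq Release_ workflow calls. A sketch of invoking it manually (assuming it is run from the repository root, as the workflow does):
```
# Print the next version and its tag; the workflow parses this output
python3 release_utils.py --release-type patch
# Additionally rewrite version.txt with the new version number
python3 release_utils.py --release-type patch --update-version
```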


@@ -1,9 +0,0 @@
.wy-table-responsive table td kbd {
white-space: nowrap;
}
.wy-table-responsive table td {
white-space: normal !important;
}
.wy-table-responsive {
overflow: visible !important;
}


@@ -88,43 +88,7 @@ todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
html_context = {
"css_files": [
"_static/theme_overrides.css", # override wide tables in RTD theme
],
}
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
# html_sidebars = {
# '**': [
# 'about.html',
# 'navigation.html',
# 'relations.html', # needs 'show_related': True theme option to display
# 'searchbox.html',
# 'donate.html',
# ]
# }
html_theme = "classic"
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {


@@ -1,2 +0,0 @@
sphinx<2.0
sphinx-argparse


@@ -208,7 +208,7 @@ following contents::
import torch
from fairseq.data import Dictionary, LanguagePairDataset
from fairseq.tasks import FairseqTask, register_task
from fairseq.tasks import LegacyFairseqTask, register_task
@register_task('simple_classification')


@@ -8,7 +8,7 @@ TODO (huxu): a general fairseq criterion for all your pre-defined losses.
"""
from fairseq.criterions import FairseqCriterion, register_criterion
from fairseq import metrics
from fairseq.logging import metrics
@register_criterion("mmloss")


@@ -25,9 +25,7 @@ class FairseqMMTask(LegacyFairseqTask):
parser.add_argument(
"taskconfig",
metavar="FILE",
help=(
"taskconfig to load all configurations"
"outside fairseq parser."),
help=("taskconfig to load all configurations " "outside fairseq parser."),
)
@classmethod
@@ -68,20 +66,34 @@ class FairseqMMTask(LegacyFairseqTask):
epoch=1,
data_buffer_size=0,
disable_iterator_cache=False,
skip_remainder_batch=False,
grouped_shuffling=False,
update_epoch_batch_itr=False,
):
random.seed(epoch)
if dataset.mmdataset.split == "train" \
and isinstance(self.mmtask, RetriTask):
if dataset.mmdataset.split == "train" and isinstance(self.mmtask, RetriTask):
if epoch >= self.mmtask.config.retri_epoch:
if not hasattr(self.mmtask, "retri_dataloader"):
self.mmtask.build_dataloader()
self.mmtask.retrive_candidates(epoch)
return super().get_batch_iterator(
dataset, max_tokens, max_sentences, max_positions,
ignore_invalid_inputs, required_batch_size_multiple,
seed, num_shards, shard_id, num_workers, epoch,
data_buffer_size, disable_iterator_cache)
dataset,
max_tokens,
max_sentences,
max_positions,
ignore_invalid_inputs,
required_batch_size_multiple,
seed,
num_shards,
shard_id,
num_workers,
epoch,
data_buffer_size,
disable_iterator_cache,
grouped_shuffling,
update_epoch_batch_itr,
)
@property
def source_dictionary(self):


@@ -7,7 +7,8 @@ import math
from dataclasses import dataclass
import torch.nn.functional as F
from fairseq import metrics, utils
from fairseq import utils
from fairseq.logging import metrics
from fairseq.criterions import register_criterion
from fairseq.criterions.cross_entropy import CrossEntropyCriterion
from fairseq.dataclass import FairseqDataclass


@@ -0,0 +1,161 @@
# Pay Better Attention to Attention: Head Selection in Multilingual and Multi-Domain Sequence Modeling (Gong et al., 2021)
[https://arxiv.org/pdf/2106.10840.pdf](https://arxiv.org/pdf/2106.10840.pdf)
## Introduction
We present attention head selection strategies in multilingual and multi-domain sequence modeling, including text translation, speech recognition, and speech translation tasks.
Below is an example of training multilingual/multi-domain speech recognition models.
## Data Preparation
Prepare mTEDx data as in [mTEDx example](https://github.com/fairinternal/fairseq-py/blob/0d9c5851e6fac40f9e366b3633ccd615c2901788/examples/speech_to_text/docs/mtedx_example.md) and CoVoST data as in [CoVoST example](https://github.com/fairinternal/fairseq-py/blob/0d9c5851e6fac40f9e366b3633ccd615c2901788/examples/speech_to_text/docs/covost_example.md). Similarly prepare EuroParl data.
## Training a multilingual ASR model with attention head selection
```bash
data_dir=<path to mtedx data>
train_subset="train_ar_ar_tedx,train_de_de_tedx,train_el_el_tedx,train_es_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_pt_tedx,train_ru_ru_tedx"
valid_subset="valid_ar_ar_tedx,valid_de_de_tedx,valid_el_el_tedx,valid_es_es_tedx,valid_fr_fr_tedx,valid_it_it_tedx,valid_pt_pt_tedx,valid_ru_ru_tedx"
strategy=<subset or group>
fairseq-train ${data_dir} \
--user-dir examples/attention_head_selection/src \
--train-subset "${train_subset}" \
--valid-subset "${valid_subset}" \
--config-yaml 'config_asr.yaml' \
--arch 'head_selection_s2t_transformer_s' \
--task 'speech_to_text_head_selection' \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler 'inverse_sqrt' --stop-min-lr -1.0 --warmup-updates 10000 \
--lr 5e-4 \
--clip-norm 10.0 \
--seed 1 \
--max-epoch 400 \
--max-tokens 32000 \
--ignore-prefix-size 1 \
--dropout 0.3 \
--optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
--skip-invalid-size-inputs-valid-test \
--encoder-attn-head-select \
--total-encoder-attention-heads 8 \
--decoder-self-attn-head-select \
--total-decoder-attention-heads 8 \
--attn-head-select-strategy ${strategy} \
--task-type lang \
```
## Training a multi-domain ASR model with attention head selection
```bash
data_dir=<path to multi-domain data>
train_subset="train_es_es_tedx,train_fr_fr_tedx,train_pt_pt_tedx,train_it_it_tedx,train_ru_ru_tedx,train_el_el_tedx,train_ar_ar_tedx,train_de_de_tedx,train_ar_ar_cv,train_de_de_cv,train_es_es_cv,train_fr_fr_cv,train_it_it_cv,train_pt_pt_cv,train_ru_ru_cv,train_de_de_ep,train_es_es_ep,train_fr_fr_ep,train_it_it_ep,train_pt_pt_ep"
valid_subset="dev_es_es_tedx,dev_fr_fr_tedx,dev_pt_pt_tedx,dev_it_it_tedx,dev_ru_ru_tedx,dev_el_el_tedx,dev_ar_ar_tedx,dev_de_de_tedx,dev_ar_ar_cv,dev_de_de_cv,dev_es_es_cv,dev_fr_fr_cv,dev_it_it_cv,dev_pt_pt_cv,dev_ru_ru_cv,dev_de_de_ep,dev_es_es_ep,dev_fr_fr_ep,dev_it_it_ep,dev_pt_pt_ep"
strategy=<subset or group>
fairseq-train ${data_dir} \
--user-dir examples/attention_head_selection/src \
--train-subset "${train_subset}" \
--valid-subset "${valid_subset}" \
--config-yaml 'config_asr.yaml' \
--arch head_selection_s2t_transformer_s \
--task speech_to_text_head_selection \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler 'inverse_sqrt' --stop-min-lr -1.0 --warmup-updates 10000 \
--lr 5e-4 \
--clip-norm 10.0 \
--seed 1 \
--max-epoch 400 \
--max-tokens 32000 \
--ignore-prefix-size 1 \
--dropout 0.3 \
--optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
--skip-invalid-size-inputs-valid-test \
--encoder-attn-head-select \
--total-encoder-attention-heads 8 \
--decoder-self-attn-head-select \
--total-decoder-attention-heads 8 \
--attn-head-select-strategy ${strategy} \
--task-type domain
```
## Inference in multilingual setting
```bash
MODEL_DIR=<checkpoint directory>
data_dir=<path to mtedx data>
gen_subset=<data to test, e.g., test_ar_ar_tedx>
train_subset="train_ar_ar_tedx,train_de_de_tedx,train_el_el_tedx,train_es_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_pt_tedx,train_ru_ru_tedx"
last_n=10
CHECKPOINT_FILENAME="avg_last_${last_n}_checkpoint.pt"
CHECKPOINT="_avg"
RESULTS="${MODEL_DIR}/ckpt${CHECKPOINT}"
if [ ! -d $RESULTS ]; then
mkdir -p $RESULTS
fi;
python scripts/average_checkpoints.py \
--inputs ${MODEL_DIR} --num-epoch-checkpoints ${last_n} \
--output "${MODEL_DIR}/${CHECKPOINT_FILENAME}"
fairseq-generate ${data_dir} \
--user-dir examples/attention_head_selection/src \
--arch 'head_selection_s2t_transformer_s' \
--task 'speech_to_text_head_selection' \
--train-subset ${train_subset} \
--gen-subset ${gen_subset} \
--path "${MODEL_DIR}/${CHECKPOINT_FILENAME}" \
--config-yaml 'config_asr.yaml' \
--prefix-size 1 \
--max-tokens 40000 --beam 5 \
--skip-invalid-size-inputs-valid-test \
--results-path ${RESULTS} \
--scoring wer --wer-tokenizer 13a \
--wer-lowercase --wer-remove-punct --remove-bpe
```
## Inference in multi-domain setting
```bash
MODEL_DIR=<checkpoint directory>
data_dir=<path to multi-domain data>
gen_subset=<data to test, e.g., test_pt_pt_cv>
train_subset="train_es_es_tedx,train_fr_fr_tedx,train_pt_pt_tedx,train_it_it_tedx,train_ru_ru_tedx,train_el_el_tedx,train_ar_ar_tedx,train_de_de_tedx,train_ar_ar_cv,train_de_de_cv,train_es_es_cv,train_fr_fr_cv,train_it_it_cv,train_pt_pt_cv,train_ru_ru_cv,train_de_de_ep,train_es_es_ep,train_fr_fr_ep,train_it_it_ep,train_pt_pt_ep"
last_n=10
CHECKPOINT_FILENAME="avg_last_${last_n}_checkpoint.pt"
CHECKPOINT="_avg"
RESULTS="${MODEL_DIR}/ckpt${CHECKPOINT}"
if [ ! -d $RESULTS ]; then
mkdir -p $RESULTS
fi;
python scripts/average_checkpoints.py \
--inputs ${MODEL_DIR} --num-epoch-checkpoints ${last_n} \
--output "${MODEL_DIR}/${CHECKPOINT_FILENAME}"
fairseq-generate ${data_dir} \
--user-dir examples/attention_head_selection/src \
--arch 'head_selection_s2t_transformer_s' \
--task 'speech_to_text_head_selection' \
--train-subset ${train_subset} \
--gen-subset ${gen_subset} \
--path "${MODEL_DIR}/${CHECKPOINT_FILENAME}" \
--config-yaml 'config_asr.yaml' \
--prefix-size 1 \
--max-tokens 40000 --beam 5 \
--skip-invalid-size-inputs-valid-test \
--results-path ${RESULTS} \
--scoring wer --wer-tokenizer 13a \
--wer-lowercase --wer-remove-punct --remove-bpe
```
## Citation
```bibtex
@article{gong2021pay,
title={Pay Better Attention to Attention: Head Selection in Multilingual and Multi-Domain Sequence Modeling},
author={Gong, Hongyu and Tang, Yun and Pino, Juan and Li, Xian},
journal={arXiv preprint arXiv:2106.10840},
year={2021}
}
```


@@ -0,0 +1,242 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from pathlib import Path
from typing import Dict, List, Optional
from dataclasses import dataclass
import torch
from fairseq.data import (
ConcatDataset,
Dictionary,
FairseqDataset,
ResamplingDataset
)
from fairseq.data.audio.data_cfg import S2TDataConfig
from fairseq.data.audio.speech_to_text_dataset import (
SpeechToTextDatasetItem,
SpeechToTextDataset,
SpeechToTextDatasetCreator
)
logger = logging.getLogger(__name__)
@dataclass
class SpeechToTextDatasetItemWithDomain(SpeechToTextDatasetItem):
src_lang_id: Optional[torch.Tensor] = None
tgt_lang_id: Optional[torch.Tensor] = None
domain_id: Optional[torch.Tensor] = None
class SpeechToTextDatasetWithDomain(SpeechToTextDataset):
def __init__(
self,
split: str,
is_train_split: bool,
cfg: S2TDataConfig,
audio_paths: List[str],
n_frames: List[int],
src_texts: Optional[List[str]] = None,
tgt_texts: Optional[List[str]] = None,
speakers: Optional[List[str]] = None,
src_langs: Optional[List[str]] = None,
tgt_langs: Optional[List[str]] = None,
ids: Optional[List[str]] = None,
tgt_dict: Optional[Dictionary] = None,
pre_tokenizer=None,
bpe_tokenizer=None,
n_frames_per_step=1,
speaker_to_id=None,
src_lang_ids: Optional[List[int]] = None,
tgt_lang_ids: Optional[List[int]] = None,
domain_ids: Optional[List[int]] = None
):
super().__init__(
split, is_train_split, cfg, audio_paths, n_frames,
src_texts, tgt_texts, speakers, src_langs, tgt_langs,
ids, tgt_dict, pre_tokenizer, bpe_tokenizer,
n_frames_per_step, speaker_to_id
)
assert src_lang_ids is None or len(src_lang_ids) == self.n_samples
assert tgt_lang_ids is None or len(tgt_lang_ids) == self.n_samples
assert domain_ids is None or len(domain_ids) == self.n_samples
self.src_lang_ids = src_lang_ids
self.tgt_lang_ids = tgt_lang_ids
self.domain_ids = domain_ids
def __getitem__(self, index: int) -> SpeechToTextDatasetItemWithDomain:
item = super().__getitem__(index)
src_lang_id = self.src_lang_ids[index]
tgt_lang_id = self.tgt_lang_ids[index]
domain_id = self.domain_ids[index]
return SpeechToTextDatasetItemWithDomain(
index=item.index, source=item.source,
target=item.target, speaker_id=item.speaker_id,
src_lang_id=src_lang_id,
tgt_lang_id=tgt_lang_id,
domain_id=domain_id
)
def collater(
self, samples: List[SpeechToTextDatasetItem], return_order: bool = False
) -> Dict:
if len(samples) == 0:
return {}
out = super().collater(samples, return_order=True)
order = out["order"]
src_lang_ids = torch.tensor([x.src_lang_id for x in samples], dtype=torch.long).index_select(0, order)
tgt_lang_ids = torch.tensor([x.tgt_lang_id for x in samples], dtype=torch.long).index_select(0, order)
domain_ids = torch.tensor([x.domain_id for x in samples], dtype=torch.long).index_select(0, order)
out["src_lang_ids"] = src_lang_ids
out["tgt_lang_ids"] = tgt_lang_ids
out["domain_ids"] = domain_ids
if not return_order:
del out["order"]
return out
class SpeechToTextDatasetCreatorWithDomain(SpeechToTextDatasetCreator):
KEY_SRC_LANG_ID, KEY_TGT_LANG_ID = "src_lang_id", "tgt_lang_id"
KEY_DOMAIN_ID = "domain_id"
# default values
DEFAULT_SRC_LANG_ID, DEFAULT_TGT_LANG_ID, DEFAULT_DOMAIN_ID = 0, 0, 0
@classmethod
def _from_list(
cls,
split_name: str,
is_train_split,
samples: List[Dict],
cfg: S2TDataConfig,
tgt_dict,
pre_tokenizer,
bpe_tokenizer,
n_frames_per_step,
speaker_to_id
) -> SpeechToTextDatasetWithDomain:
audio_root = Path(cfg.audio_root)
ids = [s[cls.KEY_ID] for s in samples]
audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]
n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]
speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
src_lang_ids = [s.get(cls.KEY_SRC_LANG_ID, cls.DEFAULT_SRC_LANG_ID) for s in samples]
tgt_lang_ids = [s.get(cls.KEY_TGT_LANG_ID, cls.DEFAULT_TGT_LANG_ID) for s in samples]
domain_ids = [s.get(cls.KEY_DOMAIN_ID, cls.DEFAULT_DOMAIN_ID) for s in samples]
return SpeechToTextDatasetWithDomain(
split_name,
is_train_split,
cfg,
audio_paths,
n_frames,
src_texts=src_texts,
tgt_texts=tgt_texts,
speakers=speakers,
src_langs=src_langs,
tgt_langs=tgt_langs,
ids=ids,
tgt_dict=tgt_dict,
pre_tokenizer=pre_tokenizer,
bpe_tokenizer=bpe_tokenizer,
n_frames_per_step=n_frames_per_step,
speaker_to_id=speaker_to_id,
src_lang_ids=src_lang_ids,
tgt_lang_ids=tgt_lang_ids,
domain_ids=domain_ids
)
@classmethod
def _load_samples_from_tsv(
cls,
root: str,
split: str,
src_lang_map,
tgt_lang_map,
domain_map
):
# metadata from split
_, src_lang, tgt_lang, domain = split.split("_")
src_lang_id = src_lang_map[src_lang]
tgt_lang_id = tgt_lang_map[tgt_lang]
domain_id = domain_map[domain]
samples = SpeechToTextDatasetCreator._load_samples_from_tsv(root, split)
for s in samples:
s.update({
cls.KEY_SRC_LANG_ID: src_lang_id,
cls.KEY_TGT_LANG_ID: tgt_lang_id,
cls.KEY_DOMAIN_ID: domain_id
})
return samples
@classmethod
def _from_tsv(
cls,
root: str,
cfg: S2TDataConfig,
split: str,
tgt_dict,
is_train_split: bool,
pre_tokenizer,
bpe_tokenizer,
n_frames_per_step,
speaker_to_id,
src_lang_map: Dict[str, int],
tgt_lang_map: Dict[str, int],
domain_map: Dict[str, int]
) -> SpeechToTextDatasetItemWithDomain:
samples = cls._load_samples_from_tsv(
root, split, src_lang_map,
tgt_lang_map, domain_map
)
return cls._from_list(
split, is_train_split, samples, cfg, tgt_dict, pre_tokenizer,
bpe_tokenizer, n_frames_per_step, speaker_to_id
)
@classmethod
def from_tsv(
cls,
root: str,
cfg: S2TDataConfig,
splits: str,
tgt_dict,
pre_tokenizer,
bpe_tokenizer,
is_train_split: bool,
epoch: int,
seed: int,
src_lang_map: Dict[str, int],
tgt_lang_map: Dict[str, int],
domain_map: Dict[str, int],
n_frames_per_step: int = 1,
speaker_to_id=None
) -> SpeechToTextDatasetWithDomain:
datasets = [
cls._from_tsv(
root, cfg, split, tgt_dict, is_train_split, pre_tokenizer, bpe_tokenizer, n_frames_per_step, speaker_to_id, src_lang_map, tgt_lang_map, domain_map
)
for split in splits.split(",")
]
if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0:
# temperature-based sampling
size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha)
datasets = [
ResamplingDataset(
d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0)
)
for r, d in zip(size_ratios, datasets)
]
return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]


@@ -0,0 +1,27 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import math
import torch
from torch.nn.modules.loss import _Loss
class HeadSelectionLoss(_Loss):
def __init__(self, args):
super().__init__()
self.args = args
self.kl_weight = getattr(args, "kl_weight", 0.0)
def forward(self, head_samples, sample_sizes, prior=0.5, eps=1e-7):
"""
head_scores: (num_tasks, num_layers, num_heads)
sample_sizes: (num_tasks, )
"""
kl_loss = (head_samples * (torch.log(head_samples + eps) - math.log(prior))).sum(-1).sum(-1)
kl_loss /= (torch.numel(head_samples) / head_samples.size(0))
kl_loss = self.kl_weight * torch.matmul(kl_loss, sample_sizes)
return kl_loss
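To make the expected shapes concrete, here is a small usage sketch of `HeadSelectionLoss` (it assumes the class above is importable; the task count, layer/head sizes, and `kl_weight` are illustrative only):
```python
import torch
from argparse import Namespace

# Illustrative sizes: 3 tasks, 6 layers, 8 attention heads per layer
head_samples = torch.rand(3, 6, 8)            # per-task head selection probabilities
sample_sizes = torch.tensor([0.5, 0.3, 0.2])  # relative data size per task

loss_fn = HeadSelectionLoss(Namespace(kl_weight=0.1))
kl_reg = loss_fn(head_samples, sample_sizes)  # scalar KL regularization term
```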


@@ -0,0 +1,170 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from typing import Dict, List, Optional
from pathlib import Path
import torch.nn as nn
from torch import Tensor
from fairseq import checkpoint_utils
from fairseq.models import register_model, register_model_architecture
from fairseq.utils import safe_hasattr
from fairseq.models.speech_to_text.s2t_transformer import (
S2TTransformerModel,
S2TTransformerEncoder,
TransformerDecoderScriptable
)
from fairseq.models.speech_to_text.s2t_transformer import base_architecture as s2t_base_architecture
from ..modules.attn_head_selector import AttnHeadSelector
from ..modules.head_selection_transformer_layer import HeadSelectionTransformerEncoderLayer
from .head_selection_transformer import HeadSelectionTransformerDecoder
logger = logging.getLogger(__name__)
@register_model("head_selection_s2t_transformer")
class HeadSelectionS2TTransformerModel(S2TTransformerModel):
"""
Head selection implemented in S2TTransformer
"""
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@staticmethod
def add_args(parser):
S2TTransformerModel.add_args(parser)
# encoder head selection
parser.add_argument(
"--encoder-attn-head-select",
action="store_true",
default=False,
help="encoder head selection"
)
parser.add_argument(
"--total-encoder-attention-heads",
type=int,
help="total number of encoder attention heads"
)
# decoder self attention selection
parser.add_argument(
"--decoder-self-attn-head-select",
action="store_true",
default=False,
help="decoder self-attention head selection"
)
# decoder-encoder attention selection
parser.add_argument(
"--dec-enc-attn-head-select",
action="store_true",
default=False,
help="decoder-encoder attention head selection"
)
parser.add_argument(
"--total-decoder-attention-heads",
type=int,
help="total number of decoder attention heads"
)
# selection strategy
parser.add_argument(
"--attn-head-select-strategy",
type=str,
help="attention head selection strategy, subset or group"
)
@classmethod
def build_encoder(cls, args):
if safe_hasattr(args, "encoder_attn_head_select") and args.encoder_attn_head_select:
encoder = HeadSelectionS2TTransformerEncoder(args)
else:
encoder = S2TTransformerEncoder(args)
pretraining_path = getattr(args, "load_pretrained_encoder_from", None)
if pretraining_path is not None:
if not Path(pretraining_path).exists():
logger.warning(
f"skipped pretraining because {pretraining_path} does not exist"
)
else:
encoder = checkpoint_utils.load_pretrained_component_from_model(
component=encoder, checkpoint=pretraining_path
)
logger.info(f"loaded pretrained encoder from: {pretraining_path}")
return encoder
@classmethod
def build_decoder(cls, args, task, embed_tokens):
if (safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select) or (safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select):
return HeadSelectionTransformerDecoderScriptable(args, task.target_dictionary, embed_tokens)
else:
return TransformerDecoderScriptable(args, task.target_dictionary, embed_tokens)
class HeadSelectionS2TTransformerEncoder(S2TTransformerEncoder):
def __init__(self, args):
super().__init__(args)
self.attn_head_selector = AttnHeadSelector(
args.encoder_tasks,
args.encoder_layers,
args.total_encoder_attention_heads,
args.encoder_attention_heads,
args.attn_head_select_strategy,
)
self.task_ids = None
self.transformer_layers = nn.ModuleList([
HeadSelectionTransformerEncoderLayer(args, layer_idx, attn_head_selector=self.attn_head_selector) for layer_idx in range(args.encoder_layers)
])
def set_task_ids(self, task_ids):
self.task_ids = task_ids
def _forward(self, src_tokens, src_lengths, return_all_hiddens=False):
self.attn_head_selector.head_select(self.task_ids)
return super()._forward(src_tokens, src_lengths, return_all_hiddens)
class HeadSelectionTransformerDecoderScriptable(HeadSelectionTransformerDecoder):
def extract_features(
self,
prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
full_context_alignment: bool = False,
alignment_layer: Optional[int] = None,
alignment_heads: Optional[int] = None,
):
# call scriptable method from parent class
x, _ = self.extract_features_scriptable(
prev_output_tokens,
encoder_out,
incremental_state,
full_context_alignment,
alignment_layer,
alignment_heads,
)
return x, None
@register_model_architecture(model_name="head_selection_s2t_transformer", arch_name="head_selection_s2t_transformer")
def base_architecture(args):
s2t_base_architecture(args)
args.encoder_attn_head_select = getattr(args, "encoder_attn_head_select", False)
args.decoder_self_attn_head_select = getattr(args, "decoder_self_attn_head_select", False)
args.dec_enc_attn_head_select = getattr(args, "dec_enc_attn_head_select", False)
args.total_encoder_attention_heads = getattr(args, "total_encoder_attention_heads", 8)
args.total_decoder_attention_heads = getattr(args, "total_decoder_attention_heads", 8)
args.attn_head_select_strategy = getattr(args, "attn_head_select_strategy", "group")
@register_model_architecture("head_selection_s2t_transformer", "head_selection_s2t_transformer_s")
def head_selection_s2t_transformer_s(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
args.dropout = getattr(args, "dropout", 0.1)
base_architecture(args)


@@ -0,0 +1,215 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from typing import Any, List, Dict, Optional
import torch
import torch.nn as nn
from torch import Tensor
from fairseq.utils import safe_hasattr
from fairseq.models.transformer import (
TransformerModel,
TransformerEncoder,
TransformerDecoder
)
from ..modules.attn_head_selector import AttnHeadSelector
from ..modules.head_selection_transformer_layer import (
HeadSelectionTransformerEncoderLayer,
HeadSelectionTransformerDecoderLayer
)
class HeadSelectionTransformerModel(TransformerModel):
def __init__(self, args, encoder, decoder):
super().__init__(args, encoder, decoder)
@staticmethod
def add_args(parser):
TransformerModel.add_args(parser)
# encoder head selection
parser.add_argument(
"--encoder-attn-head-select",
action="store_true",
default=False,
help="encoder head selection"
)
parser.add_argument(
"--total-encoder-attention-heads",
type=int,
help="total number of encoder attention heads"
)
# decoder self attention
parser.add_argument(
"--decoder-self-attn-head-select",
action="store_true",
default=False,
help="decoder self-attention head selection"
)
# decoder-encoder attention
parser.add_argument(
"--dec-enc-attn-head-select",
action="store_true",
default=False,
help="decoder-encoder attention head selection"
)
parser.add_argument(
"--total-decoder-attention-heads",
type=int,
help="total number of decoder attention heads"
)
# selection strategy
parser.add_argument(
"--attn-head-select-strategy",
type=str,
help="attention head selection strategy, subset or group"
)
@classmethod
def build_encoder(cls, args, src_dict, embed_tokens):
if safe_hasattr(args, "encoder_attn_head_select") and args.encoder_attn_head_select:
return HeadSelectionTransformerEncoder(
args, src_dict, embed_tokens
)
else:
return TransformerEncoder(args, src_dict, embed_tokens)
@classmethod
def build_decoder(cls, args, tgt_dict, embed_tokens):
if (safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select) or (safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select):
return HeadSelectionTransformerDecoder(
args, tgt_dict, embed_tokens
)
else:
return TransformerDecoder(args, tgt_dict, embed_tokens)
class HeadSelectionTransformerEncoder(TransformerEncoder):
def __init__(self, args, dictionary, embed_tokens):
self.num_tasks = args.encoder_tasks
self.num_layers = args.encoder_layers
self.total_num_heads = args.total_encoder_attention_heads
self.num_heads = args.encoder_attention_heads
self.select_strategy = args.attn_head_select_strategy
super().__init__(args, dictionary, embed_tokens)
self.attn_head_selector = AttnHeadSelector(
self.num_tasks,
self.num_layers,
self.total_num_heads,
self.num_heads,
self.select_strategy
)
self.task_ids = None
self.layers = nn.ModuleList(
[self.build_encoder_layer(args, i) for i in range(args.encoder_layers)]
)
def set_task_ids(self, task_ids):
self.task_ids = task_ids
def build_encoder_layer(self, args, layer_idx=None):
return HeadSelectionTransformerEncoderLayer(
args,
layer_idx,
attn_head_selector=self.attn_head_selector
)
def forward(
self,
src_tokens,
src_lengths: Optional[torch.Tensor] = None,
return_all_hiddens: bool = False,
token_embeddings: Optional[torch.Tensor] = None,
):
self.attn_head_selector.head_select(self.task_ids)
return super().forward(src_tokens, src_lengths, return_all_hiddens, token_embeddings)
class HeadSelectionTransformerDecoder(TransformerDecoder):
def __init__(
self,
args,
dictionary,
embed_tokens,
no_encoder_attn=False,
output_projection=None,
):
self.num_tasks = args.decoder_tasks
self.num_layers = args.decoder_layers
self.total_num_heads = args.total_decoder_attention_heads
self.num_heads = args.decoder_attention_heads
self.select_strategy = args.attn_head_select_strategy
super().__init__(
args, dictionary, embed_tokens,
no_encoder_attn=no_encoder_attn,
output_projection=output_projection
)
self.self_attn_head_selector = None
self.enc_attn_head_selector = None
if safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select:
self.self_attn_head_selector = AttnHeadSelector(
self.num_tasks,
self.num_layers,
self.total_num_heads,
self.num_heads,
self.select_strategy
)
if safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select:
self.enc_attn_head_selector = AttnHeadSelector(
self.num_tasks,
self.num_layers,
self.total_num_heads,
self.num_heads,
self.select_strategy
)
self.task_ids = None
self.layers = nn.ModuleList(
[
self.build_head_selection_decoder_layer(args, no_encoder_attn, idx) for idx in range(args.decoder_layers)
]
)
def set_task_ids(self, task_ids):
self.task_ids = task_ids
def build_head_selection_decoder_layer(self, args, no_encoder_attn=False, layer_idx=None):
return HeadSelectionTransformerDecoderLayer(
args,
layer_idx,
self.self_attn_head_selector,
self.enc_attn_head_selector,
no_encoder_attn=no_encoder_attn
)
def forward(
self,
prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
features_only: bool = False,
full_context_alignment: bool = False,
alignment_layer: Optional[int] = None,
alignment_heads: Optional[int] = None,
src_lengths: Optional[Any] = None,
return_all_hiddens: bool = False,
):
if self.self_attn_head_selector is not None:
self.self_attn_head_selector.head_select(self.task_ids)
if self.enc_attn_head_selector is not None:
self.enc_attn_head_selector.head_select(self.task_ids)
return super().forward(
prev_output_tokens=prev_output_tokens,
encoder_out=encoder_out,
incremental_state=incremental_state,
features_only=features_only,
full_context_alignment=full_context_alignment,
alignment_layer=alignment_layer,
alignment_heads=alignment_heads,
src_lengths=src_lengths,
return_all_hiddens=return_all_hiddens
)


@@ -0,0 +1,81 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torch.nn as nn
import math
class AttnHeadSelector(nn.Module):
"""
Latent variable modeling of attention head selection
"""
def __init__(
self, num_tasks, num_layers,
total_num_heads, num_heads,
select_strategy="group",
head_select_temp=5.0
):
super(AttnHeadSelector, self).__init__()
self.num_tasks = num_tasks
self.num_layers = num_layers
self.total_num_heads = total_num_heads
self.num_heads = num_heads
self.select_strategy = select_strategy
self.temp = head_select_temp
self.head_logits = torch.nn.Parameter(
torch.Tensor(self.num_tasks, self.num_layers, total_num_heads),
requires_grad=True
)
nn.init.uniform_(
self.head_logits, a=math.log(0.01),
b=math.log(1.0)
)
def gumbel_sample(self, logits, tau=1.0):
gumbels1 = -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
gumbels2 = -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
gumbels1 = (logits + gumbels1 - gumbels2) / tau
y_soft = gumbels1.sigmoid()
return y_soft
def subset_select(self, y_soft, topk, dim=-1):
top_values, top_inds = torch.topk(y_soft, k=topk, dim=dim)
top_ret = 1.0 - top_values.detach() + top_values
return top_inds.detach(), top_ret
def group_select(self, y_soft, topk, dim=-1):
# top_values: (num_tasks, num_layers, topk)
top_values, top_inds = torch.max(
y_soft.view(self.num_tasks, self.num_layers, -1, topk), dim=2
)
top_inds = top_inds * topk + torch.arange(topk, device=top_inds.device).unsqueeze(0).unsqueeze(1)
top_ret = 1.0 - top_values.detach() + top_values
return top_inds.detach(), top_ret
def head_select(self, task_ids=None):
# gumbel_sample
self.head_samples = self.gumbel_sample(self.head_logits, tau=self.temp)
# head select
if self.select_strategy == "subset":
self.subset_heads, self.subset_weights = self.subset_select(
self.head_samples,
topk=self.num_heads,
)
elif self.select_strategy == "group":
self.subset_heads, self.subset_weights = self.group_select(
self.head_samples,
topk=self.num_heads,
)
else:
raise ValueError("{} is not supported".format(self.select_strategy))
self.batch_subset = self.subset_heads[task_ids, :, :]
self.batch_weights = self.subset_weights[task_ids, :, :]
def forward(self, layer_idx):
assert layer_idx is not None
batch_subset = self.batch_subset[:, layer_idx, :]
batch_weights = self.batch_weights[:, layer_idx, :]
return batch_subset, batch_weights
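As a rough usage sketch (assuming `AttnHeadSelector` above is importable; sizes and task ids are made up for illustration), the selector first samples a head subset per task, and each layer then queries its own slice:
```python
import torch

# Illustrative setup: 2 tasks, 6 layers, 8 total heads, keep 4 heads per layer
selector = AttnHeadSelector(
    num_tasks=2, num_layers=6, total_num_heads=8, num_heads=4,
    select_strategy="subset",
)

task_ids = torch.tensor([0, 1, 0])  # one task id per sentence in the batch
selector.head_select(task_ids)      # sample and cache the per-task head subsets

heads, weights = selector(layer_idx=0)  # selected head indices and their weights for layer 0
print(heads.shape, weights.shape)       # torch.Size([3, 4]) torch.Size([3, 4])
```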


@@ -0,0 +1,92 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from fairseq.utils import safe_getattr
from fairseq.modules import TransformerEncoderLayer, TransformerDecoderLayer
from ..modules.multihead_attention_selection import MultiheadAttentionSelection
class HeadSelectionTransformerEncoderLayer(TransformerEncoderLayer):
def __init__(self, args, layer_idx, attn_head_selector=None):
super().__init__(args)
self.layer_idx = layer_idx
self.self_attn = self.build_self_attention_selection(
self.embed_dim, args, attn_head_selector
)
def build_self_attention_selection(self, embed_dim, args, attn_head_selector=None):
return MultiheadAttentionSelection(
embed_dim,
args.total_encoder_attention_heads,
args.encoder_attention_heads,
dropout=args.attention_dropout,
self_attention=True,
q_noise=self.quant_noise,
qn_block_size=self.quant_noise_block_size,
layer_idx=self.layer_idx,
attn_head_selector=attn_head_selector
)
class HeadSelectionTransformerDecoderLayer(TransformerDecoderLayer):
def __init__(
self,
args,
layer_idx,
self_attn_head_selector=None,
enc_attn_head_selector=None,
no_encoder_attn=False,
add_bias_kv=False,
add_zero_attn=False,
):
self.layer_idx = layer_idx
super().__init__(args, no_encoder_attn, add_bias_kv, add_zero_attn)
if self_attn_head_selector is not None:
self.self_attn = self.build_self_attention_selection(
self.embed_dim, args,
self_attn_head_selector=self_attn_head_selector,
add_bias_kv=add_bias_kv,
add_zero_attn=add_zero_attn
)
if enc_attn_head_selector is not None:
self.encoder_attn = self.build_encoder_attention_selection(
self.embed_dim, args,
enc_attn_head_selector=enc_attn_head_selector
)
def build_self_attention_selection(
self, embed_dim, args, self_attn_head_selector=None,
add_bias_kv=False, add_zero_attn=False
):
return MultiheadAttentionSelection(
embed_dim,
args.total_decoder_attention_heads,
args.decoder_attention_heads,
dropout=args.attention_dropout,
add_bias_kv=add_bias_kv,
add_zero_attn=add_zero_attn,
self_attention=not safe_getattr(args, "cross_self_attention"),
q_noise=self.quant_noise,
qn_block_size=self.quant_noise_block_size,
layer_idx=self.layer_idx,
attn_head_selector=self_attn_head_selector,
)
def build_encoder_attention_selection(self, embed_dim, args, enc_attn_head_selector=None):
return MultiheadAttentionSelection(
embed_dim,
args.total_decoder_attention_heads,
args.decoder_attention_heads,
kdim=args.encoder_embed_dim,
vdim=args.encoder_embed_dim,
dropout=args.attention_dropout,
encoder_decoder_attention=True,
q_noise=self.quant_noise,
qn_block_size=self.quant_noise_block_size,
layer_idx=self.layer_idx,
attn_head_selector=enc_attn_head_selector,
)


@@ -0,0 +1,355 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from typing import Dict, Optional, Tuple
import torch
from fairseq import utils
from fairseq.modules.quant_noise import quant_noise
from torch import Tensor, nn
from torch.nn import Parameter
from fairseq.modules.multihead_attention import MultiheadAttention
from ..modules.multihead_functional import multi_head_attention_forward
class MultiheadAttentionSelection(MultiheadAttention):
def __init__(
self,
embed_dim,
total_num_heads,
num_heads,
kdim=None,
vdim=None,
dropout=0.0,
bias=True,
add_bias_kv=False,
add_zero_attn=False,
self_attention=False,
encoder_decoder_attention=False,
q_noise=0.0,
qn_block_size=8,
layer_idx=0,
attn_head_selector=None
):
super().__init__(
embed_dim,
num_heads,
kdim=kdim,
vdim=vdim,
dropout=dropout,
bias=bias,
add_bias_kv=add_bias_kv,
add_zero_attn=add_zero_attn,
self_attention=self_attention,
encoder_decoder_attention=encoder_decoder_attention,
q_noise=q_noise,
qn_block_size=qn_block_size,
)
self.layer_idx = layer_idx
self.attn_head_selector = attn_head_selector
self.total_num_heads = total_num_heads
self.total_embed_dim = self.head_dim * total_num_heads
self.k_proj = quant_noise(
nn.Linear(self.kdim, self.total_embed_dim, bias=bias), q_noise, qn_block_size
)
self.v_proj = quant_noise(
nn.Linear(self.vdim, self.total_embed_dim, bias=bias), q_noise, qn_block_size
)
self.q_proj = quant_noise(
nn.Linear(embed_dim, self.total_embed_dim, bias=bias), q_noise, qn_block_size
)
if add_bias_kv:
self.bias_k = Parameter(torch.Tensor(1, 1, self.total_embed_dim))
self.bias_v = Parameter(torch.Tensor(1, 1, self.total_embed_dim))
else:
self.bias_k = self.bias_v = None
self.reset_parameters()
def forward(
self,
query,
key: Optional[Tensor],
value: Optional[Tensor],
key_padding_mask: Optional[Tensor] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
need_weights: bool = True,
static_kv: bool = False,
attn_mask: Optional[Tensor] = None,
before_softmax: bool = False,
need_head_weights: bool = False,
# subset_heads: Optional[Tensor] = None,
# subset_weights: Optional[Tensor] = None
) -> Tuple[Tensor, Optional[Tensor]]:
if need_head_weights:
need_weights = True
is_tpu = query.device.type == "xla"
subset_heads, subset_weights = self.attn_head_selector(self.layer_idx)
tgt_len, bsz, embed_dim = query.size()
src_len = tgt_len
assert list(query.size()) == [tgt_len, bsz, self.embed_dim]
if key is not None:
src_len, key_bsz, _ = key.size()
if not torch.jit.is_scripting():
assert key_bsz == bsz
assert value is not None
assert (src_len, bsz) == value.shape[:2]
if (
not self.onnx_trace
and not is_tpu # don't use PyTorch version on TPUs
and incremental_state is None
and not static_kv
# A workaround for quantization to work. Otherwise JIT compilation
# treats bias in linear module as method.
and not torch.jit.is_scripting()
):
assert key is not None and value is not None
return multi_head_attention_forward(
query,
key,
value,
self.embed_dim,
self.total_num_heads,
self.num_heads,
torch.empty([0]),
torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
self.bias_k,
self.bias_v,
self.add_zero_attn,
self.dropout_module.p,
self.out_proj.weight,
self.out_proj.bias,
self.training or self.dropout_module.apply_during_inference,
key_padding_mask,
need_weights,
attn_mask,
use_separate_proj_weight=True,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
subset_heads=subset_heads,
subset_weights=subset_weights
)
if incremental_state is not None:
saved_state = self._get_input_buffer(incremental_state)
if saved_state is not None and "prev_key" in saved_state:
# previous time steps are cached - no need to recompute
# key and value if they are static
if static_kv:
assert self.encoder_decoder_attention and not self.self_attention
key = value = None
else:
saved_state = None
if self.self_attention:
q = self.q_proj(query)
k = self.k_proj(query)
v = self.v_proj(query)
elif self.encoder_decoder_attention:
# encoder-decoder attention
q = self.q_proj(query)
if key is None:
assert value is None
k = v = None
else:
k = self.k_proj(key)
v = self.v_proj(key)
else:
assert key is not None and value is not None
q = self.q_proj(query)
k = self.k_proj(key)
v = self.v_proj(value)
q *= self.scaling
if self.bias_k is not None:
assert self.bias_v is not None
k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
if attn_mask is not None:
attn_mask = torch.cat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[
key_padding_mask,
key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
],
dim=1,
)
q = (
q.contiguous()
.view(tgt_len, bsz * self.total_num_heads, self.head_dim)
.transpose(0, 1)
)
if k is not None:
k = (
k.contiguous()
.view(-1, bsz * self.total_num_heads, self.head_dim)
.transpose(0, 1)
)
if v is not None:
v = (
v.contiguous()
.view(-1, bsz * self.total_num_heads, self.head_dim)
.transpose(0, 1)
)
if saved_state is not None:
# saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
if "prev_key" in saved_state:
_prev_key = saved_state["prev_key"]
assert _prev_key is not None
prev_key = _prev_key.view(bsz * self.total_num_heads, -1, self.head_dim)
if static_kv:
k = prev_key
else:
assert k is not None
k = torch.cat([prev_key, k], dim=1)
src_len = k.size(1)
if "prev_value" in saved_state:
_prev_value = saved_state["prev_value"]
assert _prev_value is not None
prev_value = _prev_value.view(bsz * self.total_num_heads, -1, self.head_dim)
if static_kv:
v = prev_value
else:
assert v is not None
v = torch.cat([prev_value, v], dim=1)
prev_key_padding_mask: Optional[Tensor] = None
if "prev_key_padding_mask" in saved_state:
prev_key_padding_mask = saved_state["prev_key_padding_mask"]
assert k is not None and v is not None
key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
key_padding_mask=key_padding_mask,
prev_key_padding_mask=prev_key_padding_mask,
batch_size=bsz,
src_len=k.size(1),
static_kv=static_kv,
)
saved_state["prev_key"] = k.view(bsz, self.total_num_heads, -1, self.head_dim)
saved_state["prev_value"] = v.view(bsz, self.total_num_heads, -1, self.head_dim)
saved_state["prev_key_padding_mask"] = key_padding_mask
# In this branch incremental_state is never None
assert incremental_state is not None
incremental_state = self._set_input_buffer(incremental_state, saved_state)
assert k is not None
assert k.size(1) == src_len
# This is part of a workaround to get around fork/join parallelism
# not supporting Optional types.
if key_padding_mask is not None and key_padding_mask.dim() == 0:
key_padding_mask = None
if key_padding_mask is not None:
assert key_padding_mask.size(0) == bsz
assert key_padding_mask.size(1) == src_len
if self.add_zero_attn:
assert v is not None
src_len += 1
k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
if attn_mask is not None:
attn_mask = torch.cat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[
key_padding_mask,
torch.zeros(key_padding_mask.size(0), 1).type_as(
key_padding_mask
),
],
dim=1,
)
attn_weights = torch.bmm(q, k.transpose(1, 2))
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
assert list(attn_weights.size()) == [bsz * self.total_num_heads, tgt_len, src_len]
if attn_mask is not None:
attn_mask = attn_mask.unsqueeze(0)
if self.onnx_trace:
attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
attn_weights += attn_mask
if key_padding_mask is not None:
# don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.total_num_heads, tgt_len, src_len)
if not is_tpu:
attn_weights = attn_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
float("-inf"),
)
else:
attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.view(bsz * self.total_num_heads, tgt_len, src_len)
if before_softmax:
return attn_weights, v
attn_weights_float = utils.softmax(
attn_weights, dim=-1, onnx_trace=self.onnx_trace
)
attn_weights = attn_weights_float.type_as(attn_weights)
attn_probs = self.dropout_module(attn_weights)
assert v is not None
# evaluation: the selector may return a single head index; broadcast it across the batch
if subset_heads is not None and subset_heads.numel() == 1:
subset_heads = subset_heads.repeat(bsz)
subset_weights = subset_weights.repeat(bsz)
if subset_heads is None:
attn = torch.bmm(attn_probs, v)
else:
# training with head selection
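# mixed_attn: (bsz, total_num_heads, tgt_len, head_dim); for every sentence, gather the
# num_heads heads chosen by the selector and scale them by their selection weights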
mixed_attn = torch.bmm(attn_probs, v).contiguous().view(bsz, self.total_num_heads, tgt_len, self.head_dim)
attn = torch.stack(
[mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1
)
attn = attn * subset_weights.unsqueeze(2).unsqueeze(3)
attn = attn.contiguous().view(bsz * self.num_heads, tgt_len, self.head_dim)
assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
if self.onnx_trace and attn.size(1) == 1:
# when ONNX tracing a single decoder step (sequence length == 1)
# the transpose is a no-op copy before view, thus unnecessary
attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
else:
attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
attn = self.out_proj(attn)
attn_weights: Optional[Tensor] = None
if need_weights:
if subset_heads is None:
attn_weights = attn_weights_float.view(
bsz, self.num_heads, tgt_len, src_len
).transpose(1, 0)
else:
mixed_attn_weights = attn_weights_float.view(
bsz, self.total_num_heads, tgt_len, src_len
)
attn_weights = torch.stack(
[mixed_attn_weights[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1
).transpose(1, 0)
if not need_head_weights:
# average attention weights over heads
attn_weights = attn_weights.mean(dim=0)
return attn, attn_weights

View File

@ -0,0 +1,278 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from typing import Optional, Tuple
import torch
from torch import Tensor
from torch.nn.functional import (
linear, softmax, dropout, pad,
has_torch_function,
handle_torch_function,
_in_projection_packed,
)
import math
import warnings
def _scaled_dot_product_attention(
q: Tensor,
k: Tensor,
v: Tensor,
attn_mask: Optional[Tensor] = None,
dropout_p: float = 0.0,
bsz: int = 1,
subset_heads: Optional[Tensor] = None,
subset_weights: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor]:
B, Nt, E = q.shape
q = q / math.sqrt(E)
# B: bsz * total_num_heads
# (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
attn = torch.bmm(q, k.transpose(-2, -1))
if attn_mask is not None:
attn += attn_mask
attn = softmax(attn, dim=-1)
if dropout_p > 0.0:
attn = dropout(attn, p=dropout_p)
if subset_heads is None:
# (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
output = torch.bmm(attn, v)
else:
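# same gather as in MultiheadAttentionSelection.forward: expose the head axis,
# pick the selected heads per sentence, then re-weight them before flattening back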
mixed_output = torch.bmm(attn, v).contiguous().view(bsz, -1, Nt, E)
output = torch.stack(
[mixed_output[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))],
dim=1
)
output = output * subset_weights.unsqueeze(2).unsqueeze(3)
output = output.contiguous().view(-1, Nt, E)
if subset_heads is not None:
_, Nt, Ns = attn.size()
mixed_attn = attn.view(bsz, -1, Nt, Ns)
attn = torch.stack(
[mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1
)
return output, attn
def _in_projection(
q: Tensor,
k: Tensor,
v: Tensor,
w_q: Tensor,
w_k: Tensor,
w_v: Tensor,
b_q: Optional[Tensor] = None,
b_k: Optional[Tensor] = None,
b_v: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor, Tensor]:
return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
def multi_head_attention_forward(
query: Tensor,
key: Tensor,
value: Tensor,
embed_dim_to_check: int,
total_num_heads: int,
num_heads: int,
in_proj_weight: Tensor,
in_proj_bias: Optional[Tensor],
bias_k: Optional[Tensor],
bias_v: Optional[Tensor],
add_zero_attn: bool,
dropout_p: float,
out_proj_weight: Tensor,
out_proj_bias: Optional[Tensor],
training: bool = True,
key_padding_mask: Optional[Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[Tensor] = None,
use_separate_proj_weight: bool = False,
q_proj_weight: Optional[Tensor] = None,
k_proj_weight: Optional[Tensor] = None,
v_proj_weight: Optional[Tensor] = None,
static_k: Optional[Tensor] = None,
static_v: Optional[Tensor] = None,
subset_heads: Optional[Tensor] = None,
subset_weights: Optional[Tensor] = None,
):
tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
if has_torch_function(tens_ops):
return handle_torch_function(
multi_head_attention_forward,
tens_ops,
query,
key,
value,
embed_dim_to_check,
total_num_heads,
num_heads,
in_proj_weight,
in_proj_bias,
bias_k,
bias_v,
add_zero_attn,
dropout_p,
out_proj_weight,
out_proj_bias,
training=training,
key_padding_mask=key_padding_mask,
need_weights=need_weights,
attn_mask=attn_mask,
use_separate_proj_weight=use_separate_proj_weight,
q_proj_weight=q_proj_weight,
k_proj_weight=k_proj_weight,
v_proj_weight=v_proj_weight,
static_k=static_k,
static_v=static_v,
subset_heads=subset_heads,
subset_weights=subset_weights
)
# set up shape vars
tgt_len, bsz, embed_dim = query.shape
src_len, _, _ = key.shape
assert embed_dim == embed_dim_to_check, \
f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
if isinstance(embed_dim, torch.Tensor):
# embed_dim can be a tensor when JIT tracing
head_dim = embed_dim.div(num_heads, rounding_mode='trunc')
else:
head_dim = embed_dim // num_heads
assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
if use_separate_proj_weight:
# allow MHA to have different embedding dimensions when separate projection weights are used
assert key.shape[:2] == value.shape[:2], \
f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
else:
assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
#
# compute in-projection
#
if not use_separate_proj_weight:
q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
else:
assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
if in_proj_bias is None:
b_q = b_k = b_v = None
else:
b_q, b_k, b_v = in_proj_bias.chunk(3)
q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
# prep attention mask
if attn_mask is not None:
if attn_mask.dtype == torch.uint8:
warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
attn_mask = attn_mask.to(torch.bool)
else:
assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, \
f"Only float, byte, and bool types are supported for attn_mask, not {attn_mask.dtype}"
# ensure attn_mask's dim is 3
if attn_mask.dim() == 2:
correct_2d_size = (tgt_len, src_len)
if attn_mask.shape != correct_2d_size:
raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
attn_mask = attn_mask.unsqueeze(0)
elif attn_mask.dim() == 3:
correct_3d_size = (bsz * total_num_heads, tgt_len, src_len)
if attn_mask.shape != correct_3d_size:
raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
else:
raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
# prep key padding mask
if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
key_padding_mask = key_padding_mask.to(torch.bool)
# add bias along batch dimension (currently second)
if bias_k is not None and bias_v is not None:
assert static_k is None, "bias cannot be added to static key."
assert static_v is None, "bias cannot be added to static value."
k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
if attn_mask is not None:
attn_mask = pad(attn_mask, (0, 1))
if key_padding_mask is not None:
key_padding_mask = pad(key_padding_mask, (0, 1))
else:
assert bias_k is None
assert bias_v is None
#
# reshape q, k, v for multihead attention and make em batch first
#
q = q.contiguous().view(tgt_len, bsz * total_num_heads, head_dim).transpose(0, 1)
if static_k is None:
k = k.contiguous().view(k.shape[0], bsz * total_num_heads, head_dim).transpose(0, 1)
else:
# TODO finish disentangling control flow so we don't do in-projections when statics are passed
assert static_k.size(0) == bsz * total_num_heads, \
f"expecting static_k.size(0) of {bsz * total_num_heads}, but got {static_k.size(0)}"
assert static_k.size(2) == head_dim, \
f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
k = static_k
if static_v is None:
v = v.contiguous().view(v.shape[0], bsz * total_num_heads, head_dim).transpose(0, 1)
else:
# TODO finish disentangling control flow so we don't do in-projections when statics are passed
assert static_v.size(0) == bsz * total_num_heads, \
f"expecting static_v.size(0) of {bsz * total_num_heads}, but got {static_v.size(0)}"
assert static_v.size(2) == head_dim, \
f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
v = static_v
# add zero attention along batch dimension (now first)
if add_zero_attn:
zero_attn_shape = (bsz * total_num_heads, 1, head_dim)
k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
if attn_mask is not None:
attn_mask = pad(attn_mask, (0, 1))
if key_padding_mask is not None:
key_padding_mask = pad(key_padding_mask, (0, 1))
# update source sequence length after adjustments
src_len = k.size(1)
# merge key padding and attention masks
if key_padding_mask is not None:
assert key_padding_mask.shape == (bsz, src_len), \
f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \
expand(-1, total_num_heads, -1, -1).reshape(bsz * total_num_heads, 1, src_len)
if attn_mask is None:
attn_mask = key_padding_mask
elif attn_mask.dtype == torch.bool:
attn_mask = attn_mask.logical_or(key_padding_mask)
else:
attn_mask = attn_mask.masked_fill(key_padding_mask, float("-inf"))
# convert mask to float
if attn_mask is not None and attn_mask.dtype == torch.bool:
new_attn_mask = torch.zeros_like(attn_mask, dtype=torch.float)
new_attn_mask.masked_fill_(attn_mask, float("-inf"))
attn_mask = new_attn_mask
# adjust dropout probability
if not training:
dropout_p = 0.0
#
# (deep breath) calculate attention and out projection
#
attn_output, attn_output_weights = _scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, bsz, subset_heads, subset_weights)
attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
if need_weights:
# average attention weights over heads
attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
return attn_output, attn_output_weights.sum(dim=1) / num_heads
else:
return attn_output, None

View File

@ -0,0 +1,180 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from fairseq.optim.amp_optimizer import AMPOptimizer
from fairseq.tasks import register_task
from fairseq.tasks.speech_to_text import SpeechToTextTask
from .data.speech_to_text_dataset_with_domain import SpeechToTextDatasetCreatorWithDomain
from .loss.attention_head_selection import HeadSelectionLoss
@register_task("speech_to_text_head_selection")
class SpeechToTextHeadSelectionTask(SpeechToTextTask):
@classmethod
def add_args(cls, parser):
SpeechToTextTask.add_args(parser)
parser.add_argument(
"--task-type",
type=str,
default="lang",
help="task type for head selection, lang or domain"
)
parser.add_argument(
"--kl-weight",
type=float,
default=0.0,
help="the weight of KL loss"
)
def __init__(self, args, tgt_dict):
super().__init__(args, tgt_dict)
self.task_type = args.task_type
assert self.task_type in ["lang", "domain"], "invalid task_type: {}, should be either lang or domain".format(self.task_type)
self.map_task_to_id(args.train_subset)
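# fraction of attention heads each task is expected to keep; used as the prior for
# the KL regularization term added in train_step below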
self.encoder_head_prior = float(args.encoder_attention_heads) / args.total_encoder_attention_heads
self.decoder_head_prior = float(args.decoder_attention_heads) / args.total_decoder_attention_heads
self.kl_loss = HeadSelectionLoss(args)
def map_task_to_id(self, train_subset):
src_lang_set, tgt_lang_set, domain_set = set(), set(), set()
for split in train_subset.split(","):
seq = split.split("_")
assert len(seq) == 4, "subset {} should be in the format of train_src_tgt_domain".format(split)
_, src_lang, tgt_lang, domain = seq
src_lang_set.add(src_lang)
tgt_lang_set.add(tgt_lang)
domain_set.add(domain)
src_langs = sorted(src_lang_set)
tgt_langs = sorted(tgt_lang_set)
domains = sorted(domain_set)
self.src_lang_map = {src_lang: i for (i, src_lang) in enumerate(src_langs)}
self.tgt_lang_map = {tgt_lang: i for (i, tgt_lang) in enumerate(tgt_langs)}
self.domain_map = {domain: i for (i, domain) in enumerate(domains)}
if self.task_type == "lang":
self.encoder_tasks = len(self.src_lang_map)
self.decoder_tasks = len(self.tgt_lang_map)
elif self.task_type == "domain":
self.encoder_tasks = len(self.domain_map)
self.decoder_tasks = len(self.domain_map)
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
is_train_split = split.startswith("train")
pre_tokenizer = self.build_tokenizer(self.args)
bpe_tokenizer = self.build_bpe(self.args)
self.datasets[split] = SpeechToTextDatasetCreatorWithDomain.from_tsv(
self.args.data,
self.data_cfg,
split,
self.tgt_dict,
pre_tokenizer,
bpe_tokenizer,
is_train_split=is_train_split,
epoch=epoch,
seed=self.args.seed,
src_lang_map=self.src_lang_map,
tgt_lang_map=self.tgt_lang_map,
domain_map=self.domain_map,
speaker_to_id=self.speaker_to_id
)
def build_model(self, args):
args.encoder_tasks = self.encoder_tasks
args.decoder_tasks = self.decoder_tasks
return super(SpeechToTextHeadSelectionTask, self).build_model(args)
def get_sample_sizes(self, sample, task_ids, num_tasks):
"""
task_ids: (bsz,)
get sample sizes for each task
"""
bsz = task_ids.size(0)
mat = torch.zeros((num_tasks, bsz), device=task_ids.device)
mat[task_ids, torch.arange(bsz)] = 1.0
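# count non-padding target tokens per sentence (1 is the padding index in fairseq dictionaries)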
ntokens = torch.sum(sample['target'] != 1, dim=-1)
sample_sizes = torch.matmul(mat, ntokens.float())
return sample_sizes
def train_step(
self, sample, model, criterion, optimizer, update_num, ignore_grad=False
):
model.train()
model.set_num_updates(update_num)
# task ids
if self.task_type == "lang":
encoder_task_ids = sample["src_lang_ids"]
decoder_task_ids = sample["tgt_lang_ids"]
elif self.task_type == "domain":
encoder_task_ids = sample["domain_ids"]
decoder_task_ids = sample["domain_ids"]
model.encoder.set_task_ids(encoder_task_ids)
model.decoder.set_task_ids(decoder_task_ids)
with torch.autograd.profiler.record_function("forward"):
with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))):
loss, sample_size, logging_output = criterion(model, sample)
# KL loss
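# encourage each task's head-selection distribution to stay close to the uniform
# head prior, weighting tasks by the number of target tokens they contribute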
if self.args.encoder_attn_head_select:
sample_sizes = self.get_sample_sizes(sample, encoder_task_ids, self.encoder_tasks)
loss += self.kl_loss(
model.encoder.attn_head_selector.head_samples,
sample_sizes,
self.encoder_head_prior
)
if self.args.decoder_self_attn_head_select:
sample_sizes = self.get_sample_sizes(sample, decoder_task_ids, self.decoder_tasks)
loss += self.kl_loss(
model.decoder.self_attn_head_selector.head_samples,
sample_sizes,
self.decoder_head_prior
)
if self.args.dec_enc_attn_head_select:
sample_sizes = self.get_sample_sizes(sample, decoder_task_ids, self.decoder_tasks)
loss += self.kl_loss(
model.decoder.enc_attn_head_selector.head_samples,
sample_sizes,
self.decoder_head_prior
)
if ignore_grad:
loss *= 0
with torch.autograd.profiler.record_function("backward"):
optimizer.backward(loss)
return loss, sample_size, logging_output
def valid_step(self, sample, model, criterion):
model.eval()
# task ids
if self.task_type == "lang":
encoder_task_ids = sample["src_lang_ids"]
decoder_task_ids = sample["tgt_lang_ids"]
elif self.task_type == "domain":
encoder_task_ids = sample["domain_ids"]
decoder_task_ids = sample["domain_ids"]
model.encoder.set_task_ids(encoder_task_ids)
model.decoder.set_task_ids(decoder_task_ids)
with torch.no_grad():
loss, sample_size, logging_output = criterion(model, sample)
return loss, sample_size, logging_output
def inference_step(
self, generator, models, sample, prefix_tokens=None, constraints=None
):
with torch.no_grad():
# task ids
if self.task_type == "lang":
encoder_task_ids = sample["src_lang_ids"][:1]
decoder_task_ids = sample["tgt_lang_ids"][:1]
elif self.task_type == "domain":
encoder_task_ids = sample["domain_ids"][:1]
decoder_task_ids = sample["domain_ids"][:1]
for model in models:
model.encoder.set_task_ids(encoder_task_ids)
model.decoder.set_task_ids(decoder_task_ids)
return generator.generate(
models, sample, prefix_tokens=prefix_tokens, constraints=constraints
)

View File

@ -0,0 +1,53 @@
# End-to-end NLU
End-to-end spoken language understanding (SLU) predicts intent directly from audio using a single model. It promises to improve the performance of assistant systems by leveraging acoustic information lost in the intermediate textual representation and preventing cascading errors from Automatic Speech Recognition (ASR). Further, having one unified model has efficiency advantages when deploying assistant systems on-device.
This page releases the code for reproducing the results in [STOP: A dataset for Spoken Task Oriented Semantic Parsing](https://arxiv.org/abs/2207.10643)
The dataset can be downloaded here: [download link](https://dl.fbaipublicfiles.com/stop/stop.tar.gz)
The low-resource splits can be downloaded here: [download link](http://dl.fbaipublicfiles.com/stop/low_resource_splits.tar.gz)
## Pretrained End-to-end NLU Models
| Speech Pretraining | ASR Pretraining | Test EM Accuracy | Test EM-Tree Accuracy | Link |
| ----------- | ----------- |----------|----------|----------|
| None | None | 36.54 | 57.01 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-none-none.pt) |
| Wav2Vec | None | 68.05 | 82.53 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-none.pt) |
| HuBERT | None | 68.40 | 82.85 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-none.pt) |
| Wav2Vec | STOP | 68.70 | 82.78 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-stop.pt) |
| HuBERT | STOP | 69.23 | 82.87 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-stop.pt) |
| Wav2Vec | Librispeech | 68.47 | 82.49 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-ls.pt) |
| HuBERT | Librispeech | 68.70 | 82.78 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-ls.pt) |
## Pretrained ASR Models
| Speech Pre-training | ASR Dataset | STOP Eval WER | STOP Test WER | dev\_other WER | dev\_clean WER | test\_clean WER | test\_other WER | Link |
| ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |
| HuBERT | Librispeech | 8.47 | 2.99 | 3.25 | 8.06 | 25.68 | 26.19 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-ls.pt) |
| Wav2Vec | Librispeech | 9.215 | 3.204 | 3.334 | 9.006 | 27.257 | 27.588 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-ls.pt) |
| HuBERT | STOP | 46.31 | 31.30 | 31.52 | 47.16 | 4.29 | 4.26 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-stop.pt) |
| Wav2Vec | STOP | 43.103 | 27.833 | 28.479 | 28.479 | 4.679 | 4.667 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-stop.pt) |
| HuBERT | Librispeech + STOP | 9.015 | 3.211 | 3.372 | 8.635 | 5.133 | 5.056 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-ls-stop.pt) |
| Wav2Vec | Librispeech + STOP | 9.549 | 3.537 | 3.625 | 9.514 | 5.59 | 5.562 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-ls-stop.pt) |
## Creating the fairseq datasets from STOP
First, create the audio file manifests and label files:
```
python examples/audio_nlp/nlu/generate_manifests.py --stop_root $STOP_DOWNLOAD_DIR/stop --output $FAIRSEQ_DATASET_OUTPUT/
```
Run `./examples/audio_nlp/nlu/create_dict_stop.sh $FAIRSEQ_DATASET_OUTPUT` to generate the fairseq dictionaries.
## Training an End-to-end NLU Model
Download a wav2vec or hubert model from [link](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert) or [link](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec)
```
python fairseq_cli/hydra_train.py --config-dir examples/audio_nlp/nlu/configs/ --config-name nlu_finetuning task.data=$FAIRSEQ_DATA_OUTPUT model.w2v_path=$PRETRAINED_MODEL_PATH
```

View File

@ -0,0 +1,59 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 10
tensorboard_logdir: tb
checkpoint:
no_epoch_checkpoints: true
best_checkpoint_metric: em_error
save_interval: 10
task:
_name: nlu_finetuning
data: ???
labels: parse
eval_wer_parse: true
autoregressive: true
dataset:
num_workers: 6
max_tokens: 1600000
skip_invalid_size_inputs_valid_test: true
valid_subset: eval,test
train_subset: train
validate_interval: 10
criterion:
_name: label_smoothed_cross_entropy
optimization:
max_update: 320000
lr: [0.0001]
sentence_avg: true
update_freq: [1]
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-08
lr_scheduler:
_name: tri_stage
phase_ratio: [0.1, 0.4, 0.5]
final_lr_scale: 0.05
model:
_name: wav2vec_seq2seq
w2v_path: ???
autoregressive: true
apply_mask: true
mask_prob: 0.5
mask_channel_prob: 0.5
mask_channel_length: 64
layerdrop: 0.1
activation_dropout: 0.1
feature_grad_mult: 0.0
freeze_finetune_updates: 0

View File

@ -0,0 +1,38 @@
#!/bin/bash
### Script handling creation of data binaries
### for model training within fairseq
fairseq_root="."
data_root=$1
train_prefix="${data_root}/train"
valid_prefix="${data_root}/eval"
test_prefix="${data_root}/test"
dest_dir="$data_root/"
#echo "src dict: $src_dict" > "$dest_dir/src_dict.txt"
#echo "trg dict: $tgt_dict" > "$dest_dir/tgt_dict.txt"
#--tgtdict $tgt_dict \
PYTHONPATH=$fairseq_root \
python $fairseq_root/fairseq_cli/preprocess.py \
--source-lang "parse" \
--trainpref "$train_prefix" \
--validpref "$valid_prefix" \
--destdir "$dest_dir" \
--only-source \
--dict-only \
--workers 60;
PYTHONPATH=$fairseq_root \
python $fairseq_root/fairseq_cli/preprocess.py \
--source-lang "ltr" \
--trainpref "$train_prefix" \
--validpref "$valid_prefix" \
--destdir "$dest_dir" \
--only-source \
--dict-only \
--workers 60;

View File

@ -0,0 +1,83 @@
import argparse
from pathlib import Path
import soundfile
def get_insl_frame(parse):
out = []
def is_ont_token(tok):
return tok[0] in ["[", "]"]
res = []
x = []
for tok in parse.split():
if is_ont_token(tok):
res.extend('_'.join(x))
x = []
res.append(tok.upper())
else:
x.append(tok.upper())
return " ".join(res) + ' | '
def sequencify_utterance(utterance):
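# upper-case letter targets with '|' as the word boundary,
# e.g. "play music" -> "P L A Y | M U S I C |"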
utterance = utterance.upper()
utterance = utterance.replace(' ', '|') + '|'
utterance = list(utterance)
utterance = ' '.join(utterance)
return utterance
def generate_fairseq_manifests(manifest, output_path, audio_root=None):
with open(manifest, 'r') as i:
parses = []
utterances = []
filepaths = []
keys = None
for (idx, line) in enumerate(i):
if idx == 0: keys = line.strip().split('\t')
else:
data = { k: v for (k, v) in zip(keys, line.split('\t'))}
parses.append(get_insl_frame(data['decoupled_normalized_seqlogical']))
utterances.append(sequencify_utterance(data['normalized_utterance']))
filepaths.append(data['file_id'])
parses_fp = output_path.with_suffix('.parse')
with open(str(parses_fp), 'w') as o:
for p in parses:
o.write(p + '\n')
utterances_fp = output_path.with_suffix('.ltr')
with open(str(utterances_fp), 'w') as o:
for u in utterances:
o.write(u + '\n')
filepaths_fp = output_path.with_suffix('.tsv')
with open(str(filepaths_fp), 'w') as o:
o.write(str(audio_root) + '\n')
for f in filepaths:
fullpath = audio_root / f
assert fullpath.exists(), f'{fullpath}'
frames = soundfile.info(fullpath).frames
o.write(f'{f}\t{frames}\n')
def main(args):
splits = ['train', 'eval', 'test']
root = Path(args.stop_root)
output_root = Path(args.output)
for split in splits:
stop_manifest_path = root / 'manifests' / (split + '.tsv')
output_path = output_root / (split)
generate_fairseq_manifests(stop_manifest_path, output_path, root)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Generate fairseq manifests for the STOP dataset.')
parser.add_argument('--stop_root', type=str,
help='path to stop root directory')
parser.add_argument('--output', type=str,
help='output directory')
args = parser.parse_args()
main(args)

261
examples/data2vec/README.md Normal file
View File

@ -0,0 +1,261 @@
# data2vec 2.0
data2vec 2.0 improves the training efficiency of the original data2vec algorithm. We make the following improvements for efficiency: we forward only the unmasked timesteps through the encoder, we use a convolutional decoder, and we use multi-masking to amortize the compute overhead of the teacher model. You can find details in the paper [Efficient Self-supervised Learning with Contextualized Target Representations for Vision, Speech and Language](https://arxiv.org/abs/2212.07525) and our [blog post](https://ai.facebook.com/blog/ai-self-supervised-learning-data2vec/).
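To make these ideas concrete, below is a minimal, self-contained PyTorch sketch of the training loop they describe. It is a toy stand-in with made-up modules and dimensions, not the fairseq implementation; in particular, the real teacher is an exponential moving average of the student rather than a separate frozen network.
```python
import torch
import torch.nn as nn

B, T, C = 2, 16, 8            # toy batch size, timesteps, feature dimension
NUM_MASKS, NUM_MASKED = 4, 6  # masks drawn per sample, masked timesteps per mask

# stand-ins for the student encoder, the teacher and the convolutional decoder
student = nn.TransformerEncoder(nn.TransformerEncoderLayer(C, nhead=2, batch_first=True), num_layers=1)
teacher = nn.TransformerEncoder(nn.TransformerEncoderLayer(C, nhead=2, batch_first=True), num_layers=1)
decoder = nn.Conv1d(C, C, kernel_size=3, padding=1)

x = torch.randn(B, T, C)
with torch.no_grad():
    targets = teacher(x)      # contextualized targets, computed once per sample

loss = 0.0
for _ in range(NUM_MASKS):    # multi-masking amortizes the cost of the teacher pass
    masked_idx = torch.stack([torch.randperm(T)[:NUM_MASKED] for _ in range(B)])
    keep = torch.ones(B, T, dtype=torch.bool)
    keep[torch.arange(B).unsqueeze(1), masked_idx] = False
    visible = x[keep].view(B, T - NUM_MASKED, C)   # forward only the unmasked timesteps
    z = student(visible)
    canvas = torch.zeros(B, T, C)                  # put encoded timesteps back in place
    canvas[keep] = z.reshape(-1, C)
    pred = decoder(canvas.transpose(1, 2)).transpose(1, 2)
    loss = loss + ((pred[~keep] - targets[~keep]) ** 2).mean()

(loss / NUM_MASKS).backward()
```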
## Pretrained and finetuned models
### Vision
| Model | Finetuning split | Link
|---|---|---
data2vec ViT-B | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_imagenet.pt)
data2vec ViT-B | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_imagenet_ft.pt)
data2vec ViT-L | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_imagenet.pt)
data2vec ViT-L | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_imagenet_ft.pt)
data2vec ViT-H | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/huge_imagenet.pt)
data2vec ViT-H | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/huge_imagenet_ft.pt)
Only the vision models are licensed under CC-BY-NC.
### Speech
| Model | Finetuning split | Dataset | Link
|---|---|---|---
data2vec Base | No fine-tuning | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_libri.pt)
data2vec Base | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_libri_960h.pt)
data2vec Large | No fine-tuning | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_vox.pt)
data2vec Large | 960 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_vox_960h.pt)
### NLP
| Model | Fine-tuning data | Dataset | Link | Dict | BPE
|---|---|---|---|---|---
data2vec Base | No fine-tuning | Books + Wiki | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/nlp_base.pt) | [dict](https://dl.fbaipublicfiles.com/fairseq/data2vec2/dict.txt) | [encoder](https://dl.fbaipublicfiles.com/fairseq/data2vec2/encoder.json) / [vocab](https://dl.fbaipublicfiles.com/fairseq/data2vec2/vocab.bpe)
[//]: # (## Data Preparation)
[//]: # ()
[//]: # (### Vision)
[//]: # (add details)
[//]: # (### Speech)
[//]: # (add details)
[//]: # ()
[//]: # (### NLP)
[//]: # (add details)
## Commands to train different models using data2vec 2.0
### Vision
Commands to pretrain different model configurations
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
--config-name base_images_only_task task.data=/path/to/dir
```
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
--config-name large_images_only_task task.data=/path/to/dir
```
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
--config-name huge_images14_only_task task.data=/path/to/dir
```
Commands to finetune different model configurations
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \
--config-name mae_imagenet_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model
```
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \
--config-name mae_imagenet_large_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model
```
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \
--config-name mae_imagenet_huge_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model
```
### Speech
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
--config-name base_audio_only_task task.data=/path/to/manifests
```
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
--config-name large_audio_only_task task.data=/path/to/manifests
```
Finetuning:
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/wav2vec/config/finetuning --config-name vox_10h \
task.data=/path/to/manifests model.w2v_path=/path/to/pretrained/model common.user_dir=examples/data2vec
```
Replace vox_10h with the right config depending on your model and fine-tuning split.
See examples/wav2vec/config/finetuning for all available configs.
### NLP
Commands to pretrain
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
--config-name base_text_only_task task.data=/path/to/file
```
Commands to fine-tune all GLUE tasks
```shell script
$ task=cola # choose from [cola|qnli|mrpc|rte|sst_2|mnli|qqp|sts_b]
$ lr=1e-5 # sweep [1e-5|2e-5|4e-5|6e-5] for each task
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2/text_finetuning \
--config-name $task task.data=/path/to/file model.model_path=/path/to/pretrained/model "optimization.lr=[${lr}]"
```
# data2vec
data2vec is a framework for self-supervised representation learning for images, speech, and text as described in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language (Baevski et al., 2022)](https://ai.facebook.com/research/data2vec-a-general-framework-for-self-supervised-learning-in-speech-vision-and-language). The algorithm uses the same learning mechanism for different modalities.
## Pre-trained models
### Vision
Code and pre-trained models for data2vec vision can be found [here](https://github.com/facebookresearch/data2vec_vision/tree/main/beit).
### Speech
| Model | Finetuning split | Dataset | Link
|---|---|---|---
data2vec Base | No fine-tuning | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls.pt)
data2vec Base | 10 minutes | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_10m.pt)
data2vec Base | 100 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_100h.pt)
data2vec Base | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_960h.pt)
data2vec Large | No fine-tuning | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_pretrained.pt)
data2vec Large | 10 minutes | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_10m.pt)
data2vec Large | 100 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_100h.pt)
data2vec Large | 960 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_960h.pt)
---
### NLP
Model | Fine-tuning data | Dataset | Link
|---|---|---|---|
data2vec Base | No fine-tuning | Books + Wiki | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/nlp_base.pt)
## Training a new speech model with the CLI tools
Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate files 10 to 30 seconds in length)
### Prepare training data manifest:
First, install the `soundfile` library:
```shell script
pip install soundfile
```
Next, run:
```shell script
$ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext $ext --valid-percent $valid
```
$ext should be set to flac, wav, or whatever format your dataset happens to use that soundfile can read.
$valid should be set to some reasonable percentage (like 0.01) of training data to use for validation.
To use a pre-defined validation set (like dev-other from librispeech), set it to 0 and then overwrite valid.tsv with a
separately pre-processed manifest file.
### Train a data2vec Base model:
This configuration was used for the base model trained on the Librispeech dataset in the data2vec paper.
Note that the input is expected to be single channel, sampled at 16 kHz.
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/audio/pretraining \
--config-name base_librispeech task.data=/path/to/manifests common.user_dir=examples/data2vec
```
Note: you can simulate 16 GPUs by using k GPUs and adding the command line parameters
`distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 16/k (for example, with 4 GPUs set `distributed_training.distributed_world_size=4` and `+optimization.update_freq='[4]'`).
### Fine-tune a pre-trained model with CTC:
Fine-tuning a model requires parallel audio and labels file, as well as a vocabulary file in fairseq format.
A letter vocabulary can be downloaded [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt).
An example [script](../wav2vec/libri_labels.py) that generates labels for the Librispeech dataset from the tsv file produced by wav2vec_manifest.py can be used as follows:
```shell script
split=train
$ python libri_labels.py /path/to/tsv --output-dir /output/dir --output-name $split
```
Fine-tuning on 100h of Librispeech with letter targets:
```shell script
$ fairseq-hydra-train \
distributed_training.distributed_port=$PORT \
task.data=/path/to/data \
model.w2v_path=/path/to/model.pt \
--config-dir /path/to/fairseq-py/examples/wav2vec/config/finetuning \
--config-name base_100h common.user_dir=examples/data2vec
```
There are other config files in the config/finetuning directory that can be used to fine-tune on other splits.
You can specify the right config via the `--config-name` parameter.
Decoding with a language model during training requires flashlight [python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)).
If you want to use a language model, add `+criterion.wer_args='[/path/to/kenlm, /path/to/lexicon, 2, -1]'` to the command line.
### Evaluating a CTC model:
Evaluating a CTC model with a language model requires [flashlight python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)) to be installed.
The fairseq transformer language model used in the wav2vec 2.0 paper can be obtained from the [wav2letter model repository](https://github.com/facebookresearch/wav2letter/tree/master/recipes/sota/2019).
Be sure to upper-case the language model vocab after downloading it.
Letter dictionary for pre-trained models can be found [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt).
Next, run the evaluation command:
```shell script
python examples/speech_recognition/new/infer.py --config-dir examples/speech_recognition/new/conf \
--config-name infer task=audio_finetuning task.data=/path/to/manifests common.user_dir=examples/data2vec \
task.labels=ltr decoding.type=kenlm \
decoding.lmweight=${lmweight} decoding.wordscore=${wordscore} decoding.silweight=${silscore} \
decoding.lexicon=/path/to/lexicon \
decoding.lmpath=/path/to/lm decoding.unique_wer_file=True \
dataset.gen_subset=dev_clean,dev_other,test_clean,test_other \
common_eval.path=/path/to/checkpoint.pt decoding.beam=1500 distributed_training.distributed_world_size=${num_gpus}
```
To get raw numbers, use decoding.type=viterbi and omit the lexicon. To use the transformer language model, use decoding.type=fairseqlm.
## Training a new NLP model with the CLI tools
Please follow the [RoBERTa](../roberta/README.md) instructions to preprocess your data. To train a data2vec model on your data, run:
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/text/pretraining \
--config-name base task.data=/path/to/data common.user_dir=examples/data2vec
```
As with the speech models, you can simulate 16 GPUs by using the update_freq parameter.
### Finetuning data2vec-text on GLUE
Please use a command similar to this:
```shell
$ python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \
--config-name $task task.data=$data_path checkpoint.restore_file="${/path/to/pretrained/model.pt}"
```

View File

@ -0,0 +1,70 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
all_gather_list_size: 70000
tensorboard_logdir: tb
min_loss_scale: 1e-6
checkpoint:
save_interval: 1
no_epoch_checkpoints: true
best_checkpoint_metric: mAP
maximize_best_checkpoint_metric: true
task:
_name: audio_classification
data: ???
normalize: true
labels: lbl
dataset:
num_workers: 6
max_tokens: 2560000
skip_invalid_size_inputs_valid_test: true
valid_subset: eval
validate_interval: 5
distributed_training:
ddp_backend: legacy_ddp
distributed_world_size: 8
criterion:
_name: model
can_sum: false
log_keys:
- _predictions
- _targets
optimization:
max_update: 30000
lr: [0.00006] # scratch 53-5
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-08
lr_scheduler:
_name: cosine
warmup_updates: 5000
model:
_name: audio_classification
model_path: ???
apply_mask: true
mask_prob: 0.6
mask_length: 5 # scratch 1
mask_channel_prob: 0
mask_channel_length: 64
layerdrop: 0.1
dropout: 0.1
activation_dropout: 0.1
attention_dropout: 0.2
feature_grad_mult: 0 # scratch 1
label_mixup: true
source_mixup: 0.5
prediction_mode: lin_softmax # scratch average_sigmoid

View File

@ -0,0 +1,35 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,35 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 1
tasks_per_node: 1
mem_gb: 100
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb
max_num_timeout: 30

View File

@ -0,0 +1,35 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 2
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,91 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
min_loss_scale: 1e-6
user_dir: /private/home/abaevski/fairseq-py/examples/data2vec
checkpoint:
save_interval: 1
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: audio_pretraining
data: /private/home/abaevski/data/audioset
max_sample_size: 320000
min_sample_size: 32000
normalize: true
dataset:
num_workers: 6
max_tokens: 3400000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
required_batch_size_multiple: 1
disable_validation: true
distributed_training:
distributed_world_size: 24
ddp_backend: legacy_ddp
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
# - avg_self_attn
# - weights
optimization:
max_update: 200000
lr: [0.0005]
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: cosine
warmup_updates: 10000
model:
_name: data2vec_audio
extractor_mode: layer_norm
encoder_layerdrop: 0.05
dropout_input: 0.0
dropout_features: 0.0
feature_grad_mult: 1.0
encoder_embed_dim: 768
mask_prob: 0.65
mask_length: 10
loss_beta: 0
loss_scale: null
instance_norm_target_layer: true
layer_norm_targets: true
average_top_k_layers: 12
self_attn_norm_type: deepnorm
final_norm_type: deepnorm
pos_conv_depth: 5
conv_pos: 95
ema_decay: 0.999
ema_end_decay: 0.9999
ema_anneal_end_step: 30000
ema_transformer_only: true
ema_layers_only: false
require_same_masks: true
mask_dropout: 0

View File

@ -0,0 +1,83 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
checkpoint:
save_interval: 5
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: audio_pretraining
data: ???
max_sample_size: 320000
min_sample_size: 32000
normalize: true
dataset:
num_workers: 6
max_tokens: 3800000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
required_batch_size_multiple: 1
disable_validation: true
distributed_training:
distributed_world_size: 16
ddp_backend: legacy_ddp
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
optimization:
max_update: 400000
lr: [0.0005]
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: tri_stage
phase_ratio: [0.03,0.9,0.07]
model:
_name: data2vec_audio
extractor_mode: layer_norm
encoder_layerdrop: 0.05
dropout_input: 0.0
dropout_features: 0.0
feature_grad_mult: 1.0
encoder_embed_dim: 768
mask_prob: 0.65
mask_length: 10
loss_beta: 0
loss_scale: null
instance_norm_target_layer: true
average_top_k_layers: 8
pos_conv_depth: 5
conv_pos: 95
ema_decay: 0.999
ema_end_decay: 0.9999
ema_anneal_end_step: 30000
ema_transformer_only: true
ema_layers_only: true
require_same_masks: true
mask_dropout: 0

View File

@ -0,0 +1,15 @@
# @package _global_
hydra:
sweep:
dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
distributed_training:
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
common:
log_interval: 1
dataset:
num_workers: 0

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 450
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 0
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 2
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- task.post_save_script
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 2
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 450
nodes: 3
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 4
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- task.post_save_script
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 4
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 6
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 8
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

View File

@ -0,0 +1,77 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
checkpoint:
no_epoch_checkpoints: true
save_interval_updates: 50000
keep_interval_updates: 1
distributed_training:
distributed_world_size: 16
ddp_backend: legacy_ddp
task:
_name: masked_lm
data: ???
sample_break_mode: complete_doc
tokens_per_sample: 512
include_target_tokens: true
random_token_prob: 0
leave_unmasked_prob: 0
mask_prob: 0.35
mask_multiple_length: 4
criterion: model
dataset:
max_tokens: 8192
ignore_unused_valid_subsets: true
skip_invalid_size_inputs_valid_test: true
optimizer:
_name: adam
weight_decay: 0.01
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: cosine
warmup_updates: 10000
optimization:
clip_norm: 5
lr: [0.0002]
max_update: 1000000
update_freq: [1]
model:
_name: data2vec_text
head_layers: 2
average_top_k_layers: 10
layer_norm_target_layer: true
loss_scale: 1
ema_decay: 0.999
ema_end_decay: 0.9999
ema_anneal_end_step: 300000
loss_beta: 4
ema_transformer_layers_only: true
transformer:
dropout: 0.1
attention_dropout: 0.1
layernorm_embedding: true
activation_fn: gelu
no_scale_embedding: true
max_source_positions: 512
encoder:
embed_dim: 768
ffn_embed_dim: 3072
layers: 12
attention_heads: 12
normalize_before: false
learned_pos: true
layerdrop: 0
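
In the data2vec text config above, the teacher's EMA decay ramps from `ema_decay: 0.999` to `ema_end_decay: 0.9999` over `ema_anneal_end_step: 300000` updates and is then held constant. A minimal sketch of that schedule, assuming a linear ramp (the common choice; the exact curve in fairseq's EMA module may differ):

```python
def ema_decay_at(step, start=0.999, end=0.9999, anneal_end_step=300_000):
    """Teacher EMA decay at a given update: linear ramp, then flat."""
    if step >= anneal_end_step:
        return end
    return start + (end - start) * (step / anneal_end_step)

print(ema_decay_at(0))         # 0.999
print(ema_decay_at(150_000))   # ~0.99945 (halfway through the ramp)
print(ema_decay_at(400_000))   # 0.9999 (held after the anneal ends)
```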

View File

@ -0,0 +1,15 @@
# @package _global_
hydra:
sweep:
dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
distributed_training:
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
common:
log_interval: 1
dataset:
num_workers: 0

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: '_'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}/submitit
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 0
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec
max_num_timeout: 30
exclude: a100-st-p4d24xlarge-471

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 450
nodes: 2
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: '_'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}/submitit
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 2
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec
max_num_timeout: 30
exclude: a100-st-p4d24xlarge-471

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 3
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 4
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,41 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: '_'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}/submitit
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 4
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec
max_num_timeout: 30
exclude: a100-st-p4d24xlarge-471
distributed_training:
distributed_world_size: 32
ddp_backend: legacy_ddp
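
This AWS preset pins the Slurm geometry and the fairseq world size together: `nodes: 4` × `gpus_per_node: 8` = 32 processes, one task per GPU, matching `distributed_training.distributed_world_size: 32`. A small, hypothetical consistency check over a loaded config dict (not a fairseq utility):

```python
def check_world_size(cfg):
    """Hypothetical sanity check: Slurm geometry must match fairseq's world size."""
    launcher = cfg["hydra"]["launcher"]
    assert launcher["tasks_per_node"] == launcher["gpus_per_node"], "expect one task per GPU"
    expected = launcher["nodes"] * launcher["gpus_per_node"]
    actual = cfg["distributed_training"]["distributed_world_size"]
    assert actual == expected, f"world size {actual} != nodes*gpus_per_node {expected}"

check_world_size({
    "hydra": {"launcher": {"nodes": 4, "gpus_per_node": 8, "tasks_per_node": 8}},
    "distributed_training": {"distributed_world_size": 32},
})  # passes silently for the values above
```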

View File

@ -0,0 +1,41 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: '_'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}/submitit
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 8
name: pt
partition: wav2vec
max_num_timeout: 30
exclude: a100-st-p4d24xlarge-471
distributed_training:
distributed_world_size: 64
ddp_backend: legacy_ddp

View File

@ -0,0 +1,113 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
min_loss_scale: 1e-6
fp16_no_flatten_grads: false
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
save_interval: 1
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: audio_pretraining
data: /private/home/abaevski/data/librispeech/full
max_sample_size: 320000
min_sample_size: 32000
normalize: true
precompute_mask_config: {}
dataset:
num_workers: 6
max_tokens: 1000000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
required_batch_size_multiple: 1
disable_validation: true
distributed_training:
distributed_world_size: 8
ddp_backend: legacy_ddp
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
optimization:
max_update: 400000
lr: [0.00075]
debug_param_names: true
optimizer:
_name: adam
adam_betas: [ 0.9,0.98 ]
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: cosine
warmup_updates: 8000
model:
_name: data2vec_multi
loss_beta: 0
loss_scale: null
depth: 12
embed_dim: 768
clone_batch: 8
ema_decay: 0.999
ema_end_decay: 0.99999
ema_anneal_end_step: 75000
ema_encoder_only: false
average_top_k_layers: 8
instance_norm_target_layer: true
layer_norm_target_layer: false
layer_norm_targets: false
layerdrop: 0.05
norm_eps: 1e-5
supported_modality: AUDIO
modalities:
audio:
feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
conv_pos_depth: 5
conv_pos_width: 95
conv_pos_groups: 16
prenet_depth: 0
mask_prob: 0.5
mask_prob_adjust: 0.05
inverse_mask: false
mask_length: 5
mask_noise_std: 0.01
mask_dropout: 0
add_masks: false
ema_local_encoder: false
use_alibi_encoder: true
prenet_layerdrop: 0.05
prenet_dropout: 0.1
learned_alibi_scale: true
learned_alibi_scale_per_head: true
decoder:
input_dropout: 0.1
decoder_dim: 384
decoder_groups: 16
decoder_kernel: 7
decoder_layers: 4
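
`feature_encoder_spec` is a Python expression describing the wav2vec 2.0-style convolutional front end as `(dim, kernel, stride)` tuples; the product of the strides tells you how many waveform samples collapse into one encoder frame. A quick check of the spec used above (evaluated with `eval`, since it is an expression rather than a plain literal):

```python
from math import prod

spec = "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]"
layers = eval(spec)                          # [(dim, kernel, stride), ...]

total_stride = prod(stride for _, _, stride in layers)
print(total_stride)                          # 320 samples per frame
print(total_stride / 16_000 * 1000, "ms")    # 20.0 ms per frame at 16 kHz
```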

View File

@ -0,0 +1,116 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
min_loss_scale: 1e-6
fp16_no_flatten_grads: true
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
save_interval: 5
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: mae_image_pretraining
data: /datasets01/imagenet_full_size/061417/
rebuild_batches: true
local_cache_path: /scratch/cache_abaevski/imagenet
key: source
precompute_mask_config: {}
dataset:
num_workers: 10
batch_size: 16
skip_invalid_size_inputs_valid_test: true
required_batch_size_multiple: 1
disable_validation: true
distributed_training:
distributed_world_size: 16
ddp_backend: c10d
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
optimization:
max_update: 375300
lr: [ 0.001 ]
debug_param_names: true
clip_norm: 4
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 1e-3
optimizer:
_name: adam
adam_betas: [0.9,0.95]
weight_decay: 0.05
lr_scheduler:
_name: cosine
warmup_updates: 50040
lr_scheduler: pass_through
model:
_name: data2vec_multi
ema_decay: 0.9998
ema_end_decay: 0.99999
ema_anneal_end_step: 100000
instance_norm_target_layer: true
layer_norm_target_layer: false
layer_norm_targets: true
end_of_block_targets: false
depth: 10
average_top_k_layers: 10
clone_batch: 16
norm_eps: 1e-6
min_target_var: 0
min_pred_var: 0
encoder_dropout: 0
post_mlp_drop: 0
attention_dropout: 0
activation_dropout: 0
supported_modality: IMAGE
cls_loss: 0.01
ema_encoder_only: false
modalities:
image:
inverse_mask: true
mask_prob: 0.8
mask_prob_adjust: 0.07
mask_length: 3
mask_noise_std: 0.01
prenet_depth: 2
ema_local_encoder: true
num_extra_tokens: 1
init_extra_token_zero: false
use_alibi_encoder: false
decoder:
decoder_dim: 768
decoder_groups: 16
decoder_kernel: 3
decoder_layers: 6
input_dropout: 0
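
A note on the batch arithmetic in this image config: the global batch is `batch_size × distributed_world_size` = 16 × 16 = 256 images per update, and `clone_batch: 16` repeats each image with independently sampled masks (data2vec 2.0's multi-mask training), so the student processes roughly 4096 masked views per step. Back-of-the-envelope:

```python
batch_size, world_size, clone_batch = 16, 16, 16
images_per_update = batch_size * world_size
masked_views = images_per_update * clone_batch
print(images_per_update, masked_views)   # 256 4096
```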

View File

@ -0,0 +1,112 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
fp16_no_flatten_grads: true
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
no_epoch_checkpoints: true
save_interval_updates: 50000
keep_interval_updates: 1
distributed_training:
distributed_world_size: 16
ddp_backend: legacy_ddp
task:
_name: masked_lm
data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
sample_break_mode: none
tokens_per_sample: 512
include_target_tokens: true
random_token_prob: 0
leave_unmasked_prob: 0
include_index: True
skip_masking: True
d2v2_multi: True
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
dataset:
batch_size: 4
ignore_unused_valid_subsets: true
skip_invalid_size_inputs_valid_test: true
disable_validation: true
optimization:
clip_norm: 1
lr: [0.0002]
max_update: 1000000
update_freq: [1]
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 0.0002
optimizer:
_name: adam
adam_betas: [0.9,0.98]
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: cosine
warmup_updates: 4000
lr_scheduler: pass_through
model:
_name: data2vec_multi
loss_beta: 0
loss_scale: 1
depth: 12
embed_dim: 768
clone_batch: 8
ema_decay: 0.9999
ema_end_decay: 0.99999
ema_anneal_end_step: 100000
ema_encoder_only: true
average_top_k_layers: 12
layer_norm_target_layer: false
instance_norm_target_layer: true
batch_norm_target_layer: false
instance_norm_targets: false
layer_norm_targets: false
layerdrop: 0
norm_eps: 1e-5
supported_modality: TEXT
modalities:
text:
mask_prob: 0.48
mask_length: 1
mask_noise_std: 0.01
prenet_depth: 0
decoder:
input_dropout: 0.1
decoder_dim: 768
decoder_groups: 1
decoder_kernel: 9
decoder_layers: 5
decoder_residual: false
projection_layers: 2
projection_ratio: 2.0
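
The composite optimizer above routes all parameters into one `default` group with a warmup-then-cosine schedule (`warmup_updates: 4000`, peak LR 2e-4, decaying toward the end of `max_update`). A sketch of that common schedule; fairseq's cosine scheduler exposes extra knobs (cycles, shrink factors) that are ignored here:

```python
import math

def warmup_cosine_lr(step, peak_lr=2e-4, warmup=4_000, max_update=1_000_000, min_lr=0.0):
    if step < warmup:                       # linear warmup to the peak LR
        return peak_lr * step / warmup
    progress = (step - warmup) / (max_update - warmup)
    return min_lr + 0.5 * (peak_lr - min_lr) * (1 + math.cos(math.pi * progress))

print(warmup_cosine_lr(4_000))       # 2e-4 at the end of warmup
print(warmup_cosine_lr(502_000))     # ~1e-4 halfway through the decay
print(warmup_cosine_lr(1_000_000))   # ~0 at max_update
```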

View File

@ -0,0 +1,122 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
min_loss_scale: 1e-6
fp16_no_flatten_grads: true
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
save_interval: 5
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: mae_image_pretraining
data: /datasets01/imagenet_full_size/061417/
rebuild_batches: true
local_cache_path: /scratch/cache_abaevski/imagenet
key: source
precompute_mask_config: {}
dataset:
num_workers: 10
batch_size: 8
skip_invalid_size_inputs_valid_test: true
required_batch_size_multiple: 1
disable_validation: true
distributed_training:
distributed_world_size: 32
ddp_backend: c10d
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
optimization:
max_update: 500000
lr: [ 0.0004 ]
debug_param_names: true
clip_norm: 4
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 4e-4
optimizer:
_name: adam
adam_betas: [0.9,0.95]
weight_decay: 0.05
lr_scheduler:
_name: cosine
warmup_updates: 50040
lr_scheduler: pass_through
model:
_name: data2vec_multi
ema_decay: 0.9998
ema_end_decay: 1
ema_anneal_end_step: 300000
instance_norm_target_layer: true
layer_norm_target_layer: false
layer_norm_targets: true
end_of_block_targets: false
depth: 32
embed_dim: 1280
num_heads: 16
average_top_k_layers: 24
clone_batch: 16
norm_eps: 1e-6
min_target_var: 0
min_pred_var: 0
encoder_dropout: 0
post_mlp_drop: 0
attention_dropout: 0
activation_dropout: 0
supported_modality: IMAGE
cls_loss: 0.01
ema_encoder_only: false
modalities:
image:
patch_size: 14
inverse_mask: true
mask_prob: 0.75
mask_prob_adjust: 0.1
mask_length: 3
mask_noise_std: 0.01
prenet_depth: 0
ema_local_encoder: true
num_extra_tokens: 1
init_extra_token_zero: false
use_alibi_encoder: false
embed_dim: 1280
decoder:
decoder_dim: 1024
decoder_groups: 16
decoder_kernel: 5
decoder_layers: 3
final_layer_norm: false
input_dropout: 0

View File

@ -0,0 +1,120 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
min_loss_scale: 1e-6
fp16_no_flatten_grads: true
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
save_interval: 5
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: mae_image_pretraining
data: /datasets01/imagenet_full_size/061417/
rebuild_batches: true
local_cache_path: /scratch/cache_abaevski/imagenet
key: source
precompute_mask_config: {}
dataset:
num_workers: 10
batch_size: 8
skip_invalid_size_inputs_valid_test: true
required_batch_size_multiple: 1
disable_validation: true
distributed_training:
distributed_world_size: 16
ddp_backend: c10d
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
optimization:
max_update: 375300
lr: [ 0.0004 ]
debug_param_names: true
clip_norm: 4
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 4e-4
optimizer:
_name: adam
adam_betas: [0.9,0.95]
weight_decay: 0.05
lr_scheduler:
_name: cosine
warmup_updates: 50040
lr_scheduler: pass_through
model:
_name: data2vec_multi
ema_decay: 0.9998
ema_end_decay: 0.99995
ema_anneal_end_step: 150000
instance_norm_target_layer: true
layer_norm_target_layer: false
layer_norm_targets: true
end_of_block_targets: false
depth: 32
embed_dim: 1280
num_heads: 16
average_top_k_layers: 24
clone_batch: 16
norm_eps: 1e-6
min_target_var: 0
min_pred_var: 0
encoder_dropout: 0
post_mlp_drop: 0
attention_dropout: 0
activation_dropout: 0
supported_modality: IMAGE
cls_loss: 0.01
ema_encoder_only: false
modalities:
image:
inverse_mask: true
mask_prob: 0.75
mask_prob_adjust: 0.1
mask_length: 3
mask_noise_std: 0.01
prenet_depth: 0
ema_local_encoder: true
num_extra_tokens: 1
init_extra_token_zero: false
use_alibi_encoder: false
embed_dim: 1280
decoder:
decoder_dim: 1024
decoder_groups: 16
decoder_kernel: 5
decoder_layers: 3
input_dropout: 0

View File

@ -0,0 +1,122 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
min_loss_scale: 1e-6
fp16_no_flatten_grads: true
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
save_interval: 1
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: audio_pretraining
data: /fsx-wav2vec/abaevski/data/librivox/no_silence
max_sample_size: 320000
min_sample_size: 32000
normalize: true
precompute_mask_config: {}
dataset:
num_workers: 8
max_tokens: 320000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
required_batch_size_multiple: 1
disable_validation: true
distributed_training:
distributed_world_size: 48
ddp_backend: c10d
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
optimization:
max_update: 600000
debug_param_names: true
clip_norm: 1
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 0.0004
optimizer:
_name: adam
adam_betas: [0.9,0.98]
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: cosine
warmup_updates: 10000
lr_scheduler: pass_through
model:
_name: data2vec_multi
loss_beta: 0
loss_scale: null
depth: 16
embed_dim: 1024
num_heads: 16
clone_batch: 12
ema_decay: 0.9997
ema_end_decay: 1
ema_anneal_end_step: 300000
ema_encoder_only: false
average_top_k_layers: 16
instance_norm_target_layer: true
layer_norm_target_layer: false
layer_norm_targets: false
layerdrop: 0
norm_eps: 1e-5
supported_modality: AUDIO
modalities:
audio:
feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
conv_pos_depth: 5
conv_pos_width: 95
conv_pos_groups: 16
prenet_depth: 8
mask_prob: 0.55
mask_prob_adjust: 0.1
inverse_mask: false
mask_length: 5
mask_noise_std: 0.01
mask_dropout: 0
add_masks: false
ema_local_encoder: false
use_alibi_encoder: true
prenet_layerdrop: 0
prenet_dropout: 0.1
learned_alibi_scale: true
learned_alibi_scale_per_head: true
decoder:
input_dropout: 0.1
decoder_dim: 768
decoder_groups: 16
decoder_kernel: 7
decoder_layers: 4
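
For the large audio run above, `max_tokens` is counted in raw waveform samples, so 320000 tokens is about 20 s of 16 kHz audio per GPU; with 48 GPUs each update covers roughly 16 minutes of speech (before any padding or cropping effects). Quick check:

```python
max_tokens, sample_rate, world_size = 320_000, 16_000, 48
seconds_per_gpu = max_tokens / sample_rate
print(seconds_per_gpu, "s per GPU")                         # 20.0
print(seconds_per_gpu * world_size / 60, "min per update")  # 16.0
```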

View File

@ -0,0 +1,120 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
min_loss_scale: 1e-6
fp16_no_flatten_grads: true
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
save_interval: 5
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: mae_image_pretraining
data: /datasets01/imagenet_full_size/061417/
rebuild_batches: true
local_cache_path: /scratch/cache_abaevski/imagenet
key: source
precompute_mask_config: {}
dataset:
num_workers: 10
batch_size: 8
skip_invalid_size_inputs_valid_test: true
required_batch_size_multiple: 1
disable_validation: true
distributed_training:
distributed_world_size: 16
ddp_backend: c10d
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
optimization:
max_update: 375300
lr: [ 0.0004 ]
debug_param_names: true
clip_norm: 4
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 4e-4
optimizer:
_name: adam
adam_betas: [0.9,0.95]
weight_decay: 0.05
lr_scheduler:
_name: cosine
warmup_updates: 50040
lr_scheduler: pass_through
model:
_name: data2vec_multi
ema_decay: 0.9998
ema_end_decay: 0.99999
ema_anneal_end_step: 150000
instance_norm_target_layer: true
layer_norm_target_layer: false
layer_norm_targets: true
end_of_block_targets: false
depth: 24
embed_dim: 1024
num_heads: 16
average_top_k_layers: 18
clone_batch: 16
norm_eps: 1e-6
min_target_var: 0
min_pred_var: 0
encoder_dropout: 0
post_mlp_drop: 0
attention_dropout: 0
activation_dropout: 0
supported_modality: IMAGE
cls_loss: 0.01
ema_encoder_only: false
modalities:
image:
inverse_mask: true
mask_prob: 0.75
mask_prob_adjust: 0.1
mask_length: 3
mask_noise_std: 0.01
prenet_depth: 0
ema_local_encoder: true
num_extra_tokens: 1
init_extra_token_zero: false
use_alibi_encoder: false
embed_dim: 1024
decoder:
decoder_dim: 1024
decoder_groups: 16
decoder_kernel: 5
decoder_layers: 3
input_dropout: 0

View File

@ -0,0 +1,112 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
min_loss_scale: 1e-6
fp16_no_flatten_grads: true
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
save_interval_updates: 50000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: masked_lm
data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
sample_break_mode: none
tokens_per_sample: 512
include_target_tokens: true
random_token_prob: 0
leave_unmasked_prob: 0
include_index: True
skip_masking: True
d2v2_multi: True
dataset:
batch_size: 2
ignore_unused_valid_subsets: true
skip_invalid_size_inputs_valid_test: true
disable_validation: true
distributed_training:
distributed_world_size: 32
ddp_backend: c10d
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
optimization:
max_update: 600000
clip_norm: 1
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 0.0001
optimizer:
_name: adam
adam_betas: [0.9,0.98]
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: cosine
warmup_updates: 4000
lr_scheduler: pass_through
model:
_name: data2vec_multi
loss_beta: 0
loss_scale: 1
depth: 24
num_heads: 16
embed_dim: 1024
clone_batch: 8
ema_decay: 0.9999
ema_end_decay: 0.99999
ema_anneal_end_step: 100000
ema_encoder_only: true
average_top_k_layers: 24
layer_norm_target_layer: true
instance_norm_target_layer: false
batch_norm_target_layer: false
instance_norm_targets: true
layer_norm_targets: false
layerdrop: 0
norm_eps: 1e-5
supported_modality: TEXT
modalities:
text:
mask_prob: 0.5
mask_length: 1
mask_noise_std: 0.01
prenet_depth: 0
decoder:
input_dropout: 0.1
decoder_dim: 768
decoder_groups: 1
decoder_kernel: 9
decoder_layers: 5
decoder_residual: false
projection_layers: 2
projection_ratio: 2.0

View File

@ -0,0 +1,123 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
fp16_no_flatten_grads: true
user_dir: ${env:PWD}/examples/data2vec
checkpoint:
no_epoch_checkpoints: true
save_interval_updates: 50000
keep_interval_updates: 1
distributed_training:
distributed_world_size: 32
ddp_backend: legacy_ddp
task:
_name: masked_lm
data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin
sample_break_mode: none
tokens_per_sample: 512
include_target_tokens: true
random_token_prob: 0
leave_unmasked_prob: 0
include_index: True
skip_masking: True
d2v2_multi: True
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
- model_norm
- ema_norm
- masked_pct
dataset:
batch_size: 2
ignore_unused_valid_subsets: true
skip_invalid_size_inputs_valid_test: true
disable_validation: true
optimization:
clip_norm: 1
lr: [3e-4]
max_update: 1000000
update_freq: [1]
optimizer:
_name: composite
groups:
default:
lr_float: 1e-4
optimizer:
_name: adam
adam_betas: [0.9,0.98]
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: cosine
warmup_updates: 4000
decoder:
lr_float: 1e-4
optimizer:
_name: adam
adam_betas: [0.9,0.98]
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: cosine
warmup_updates: 4000
lr_scheduler: pass_through
model:
_name: data2vec_multi
loss_beta: 4
loss_scale: 1
depth: 24
num_heads: 16
embed_dim: 1024
clone_batch: 8
ema_decay: 0.9999
ema_end_decay: 0.99999
ema_anneal_end_step: 100000
ema_encoder_only: true
average_top_k_layers: 24
layer_norm_target_layer: true
instance_norm_target_layer: false
batch_norm_target_layer: false
instance_norm_targets: true
layer_norm_targets: false
layerdrop: 0
norm_eps: 1e-5
supported_modality: TEXT
decoder_group: true
modalities:
text:
mask_prob: 0.5
mask_length: 1
mask_noise_std: 0.01
prenet_depth: 0
decoder:
input_dropout: 0.1
decoder_dim: 768
decoder_groups: 1
decoder_kernel: 9
decoder_layers: 5
decoder_residual: false
projection_layers: 2
projection_ratio: 2.0

View File

@ -0,0 +1,15 @@
# @package _global_
hydra:
sweep:
dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
distributed_training:
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
common:
log_interval: 1
dataset:
num_workers: 0

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 450
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.local_cache_path
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 0
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 2
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,39 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.local_cache_path
- task.data
- task.post_save_script
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
- model.model_path
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 12
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 2
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 450
nodes: 3
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 4
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- task.post_save_script
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 12
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 4
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 12
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 6
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 10
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 450
nodes: 8
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 12
gpus_per_node: 8
tasks_per_node: 8
mem_gb: 0
nodes: 8
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

View File

@ -0,0 +1,60 @@
# @package _group_
common:
fp16: true
fp16_init_scale: 4
threshold_loss_scale: 1
fp16_scale_window: 128
log_format: json
log_interval: 200
user_dir: ${env:PWD}/examples/data2vec
task:
_name: sentence_prediction
data: ???
init_token: 0
separator_token: 2
num_classes: 2
max_positions: 512
d2v2_multi: True
checkpoint:
best_checkpoint_metric: mcc
maximize_best_checkpoint_metric: true
no_epoch_checkpoints: true
distributed_training:
find_unused_parameters: true
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
criterion:
_name: sentence_prediction
report_mcc: True
dataset:
batch_size: 16
required_batch_size_multiple: 1
max_tokens: 4400
num_workers: 1
optimizer:
_name: adam
weight_decay: 0.1
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: polynomial_decay
warmup_updates: 320
optimization:
clip_norm: 0.0
lr: [2e-05]
max_update: 5336
max_epoch: 10
model:
_name: data2vec_text_classification
model_path: ???
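
These GLUE fine-tuning configs follow the usual RoBERTa-style recipe: 10 epochs, with `warmup_updates` set to roughly 6% of `max_update` (the MCC metric above suggests CoLA; 320 / 5336 ≈ 0.06). A one-line check of that ratio:

```python
max_update, warmup_updates = 5336, 320
print(round(warmup_updates / max_update, 3))   # 0.06 -> ~6% linear warmup
```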

View File

@ -0,0 +1,60 @@
# @package _group_
common:
fp16: true
fp16_init_scale: 4
threshold_loss_scale: 1
fp16_scale_window: 128
log_format: json
log_interval: 200
user_dir: ${env:PWD}/examples/data2vec
task:
_name: sentence_prediction
data: ???
init_token: 0
separator_token: 2
num_classes: 3
max_positions: 512
d2v2_multi: True
checkpoint:
best_checkpoint_metric: accuracy
maximize_best_checkpoint_metric: true
no_epoch_checkpoints: true
distributed_training:
find_unused_parameters: true
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
criterion:
_name: sentence_prediction
dataset:
batch_size: 32
required_batch_size_multiple: 1
max_tokens: 4400
valid_subset: valid,valid1
num_workers: 1
optimizer:
_name: adam
weight_decay: 0.1
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: polynomial_decay
warmup_updates: 7432
optimization:
clip_norm: 0.0
lr: [2e-05]
max_update: 123873
max_epoch: 10
model:
_name: data2vec_text_classification
model_path: ???

View File

@ -0,0 +1,60 @@
# @package _group_
common:
fp16: true
fp16_init_scale: 4
threshold_loss_scale: 1
fp16_scale_window: 128
log_format: json
log_interval: 200
user_dir: ${env:PWD}/examples/data2vec
task:
_name: sentence_prediction
data: ???
init_token: 0
separator_token: 2
num_classes: 2
max_positions: 512
d2v2_multi: True
checkpoint:
best_checkpoint_metric: acc_and_f1
maximize_best_checkpoint_metric: true
no_epoch_checkpoints: true
distributed_training:
find_unused_parameters: true
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
criterion:
_name: sentence_prediction
report_acc_and_f1: True
dataset:
batch_size: 16
required_batch_size_multiple: 1
max_tokens: 4400
num_workers: 1
optimizer:
_name: adam
weight_decay: 0.1
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: polynomial_decay
warmup_updates: 137
optimization:
clip_norm: 0.0
lr: [2e-05]
max_update: 2296
max_epoch: 10
model:
_name: data2vec_text_classification
model_path: ???

View File

@ -0,0 +1,59 @@
# @package _group_
common:
fp16: true
fp16_init_scale: 4
threshold_loss_scale: 1
fp16_scale_window: 128
log_format: json
log_interval: 200
user_dir: ${env:PWD}/examples/data2vec
task:
_name: sentence_prediction
data: ???
init_token: 0
separator_token: 2
num_classes: 2
max_positions: 512
d2v2_multi: True
checkpoint:
best_checkpoint_metric: accuracy
maximize_best_checkpoint_metric: true
no_epoch_checkpoints: true
distributed_training:
find_unused_parameters: true
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
criterion:
_name: sentence_prediction
dataset:
batch_size: 32
required_batch_size_multiple: 1
max_tokens: 4400
num_workers: 1
optimizer:
_name: adam
weight_decay: 0.1
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: polynomial_decay
warmup_updates: 1986
optimization:
clip_norm: 0.0
lr: [2e-05]
max_update: 33112
max_epoch: 10
model:
_name: data2vec_text_classification
model_path: ???

View File

@ -0,0 +1,60 @@
# @package _group_
common:
fp16: true
fp16_init_scale: 4
threshold_loss_scale: 1
fp16_scale_window: 128
log_format: json
log_interval: 200
user_dir: ${env:PWD}/examples/data2vec
task:
_name: sentence_prediction
data: ???
init_token: 0
separator_token: 2
num_classes: 2
max_positions: 512
d2v2_multi: True
checkpoint:
best_checkpoint_metric: acc_and_f1
maximize_best_checkpoint_metric: true
no_epoch_checkpoints: true
distributed_training:
find_unused_parameters: true
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
criterion:
_name: sentence_prediction
report_acc_and_f1: True
dataset:
batch_size: 32
required_batch_size_multiple: 1
max_tokens: 4400
num_workers: 1
optimizer:
_name: adam
weight_decay: 0.1
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: polynomial_decay
warmup_updates: 28318
optimization:
clip_norm: 0.0
lr: [2e-05]
max_update: 113272
max_epoch: 10
model:
_name: data2vec_text_classification
model_path: ???

View File

@ -0,0 +1,59 @@
# @package _group_
common:
fp16: true
fp16_init_scale: 4
threshold_loss_scale: 1
fp16_scale_window: 128
log_format: json
log_interval: 200
user_dir: ${env:PWD}/examples/data2vec
task:
_name: sentence_prediction
data: ???
init_token: 0
separator_token: 2
num_classes: 2
max_positions: 512
d2v2_multi: True
checkpoint:
best_checkpoint_metric: accuracy
maximize_best_checkpoint_metric: true
no_epoch_checkpoints: true
distributed_training:
find_unused_parameters: true
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
criterion:
_name: sentence_prediction
dataset:
batch_size: 16
required_batch_size_multiple: 1
max_tokens: 4400
num_workers: 1
optimizer:
_name: adam
weight_decay: 0.1
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: polynomial_decay
warmup_updates: 122
optimization:
clip_norm: 0.0
lr: [2e-05]
max_update: 2036
max_epoch: 10
model:
_name: data2vec_text_classification
model_path: ???

View File

@ -0,0 +1,15 @@
# @package _global_
hydra:
sweep:
dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
distributed_training:
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
common:
log_interval: 1
dataset:
num_workers: 0

View File

@ -0,0 +1,59 @@
# @package _group_
common:
fp16: true
fp16_init_scale: 4
threshold_loss_scale: 1
fp16_scale_window: 128
log_format: json
log_interval: 200
user_dir: ${env:PWD}/examples/data2vec
task:
_name: sentence_prediction
data: ???
init_token: 0
separator_token: 2
num_classes: 2
max_positions: 512
d2v2_multi: True
checkpoint:
best_checkpoint_metric: accuracy
maximize_best_checkpoint_metric: true
no_epoch_checkpoints: true
distributed_training:
find_unused_parameters: true
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
criterion:
_name: sentence_prediction
dataset:
batch_size: 32
required_batch_size_multiple: 1
max_tokens: 4400
num_workers: 1
optimizer:
_name: adam
weight_decay: 0.1
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: polynomial_decay
warmup_updates: 1256
optimization:
clip_norm: 0.0
lr: [2e-05]
max_update: 20935
max_epoch: 10
model:
_name: data2vec_text_classification
model_path: ???

View File

@ -0,0 +1,61 @@
# @package _group_
common:
fp16: true
fp16_init_scale: 4
threshold_loss_scale: 1
fp16_scale_window: 128
log_format: json
log_interval: 200
user_dir: ${env:PWD}/examples/data2vec
task:
_name: sentence_prediction
data: ???
init_token: 0
separator_token: 2
num_classes: 1
max_positions: 512
d2v2_multi: True
checkpoint:
best_checkpoint_metric: pearson_and_spearman
maximize_best_checkpoint_metric: true
no_epoch_checkpoints: true
distributed_training:
find_unused_parameters: true
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
criterion:
_name: sentence_prediction
regression_target: true
report_pearson_and_spearman: True
dataset:
batch_size: 16
required_batch_size_multiple: 1
max_tokens: 4400
num_workers: 1
optimizer:
_name: adam
weight_decay: 0.1
adam_betas: (0.9,0.98)
adam_eps: 1e-06
lr_scheduler:
_name: polynomial_decay
warmup_updates: 214
optimization:
clip_norm: 0.0
lr: [4e-05]
max_update: 3598
max_epoch: 10
model:
_name: data2vec_text_classification
model_path: ???

View File

@ -0,0 +1,52 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
checkpoint:
save_interval: 1
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
best_checkpoint_metric: accuracy
task:
_name: image_classification
data: /datasets01/imagenet_full_size/061417
dataset:
num_workers: 6
batch_size: 64
skip_invalid_size_inputs_valid_test: true
required_batch_size_multiple: 1
valid_subset: val
distributed_training:
distributed_world_size: 8
ddp_backend: c10d
criterion:
_name: model
log_keys:
- correct
optimization:
max_update: 100000
lr: [0.0005]
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: cosine
warmup_updates: 10000
model:
_name: data2vec_image_classification
model_path: ???

View File

@ -0,0 +1,65 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
fp16_no_flatten_grads: true
checkpoint:
save_interval: 1
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
best_checkpoint_metric: accuracy
maximize_best_checkpoint_metric: true
task:
_name: mae_image_classification
data: /datasets01/imagenet_full_size/061417
dataset:
num_workers: 6
batch_size: 32
skip_invalid_size_inputs_valid_test: true
required_batch_size_multiple: 2
valid_subset: val
distributed_training:
distributed_world_size: 16
ddp_backend: c10d
criterion:
_name: model
log_keys:
- correct
optimization:
max_update: 250200
lr: [0.001]
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 0.001
optimizer:
_name: adam
adam_betas: [0.9,0.95]
weight_decay: 0.05
lr_scheduler:
_name: cosine
warmup_updates: 16000
min_lr: 1e-6
lr_scheduler: pass_through
model:
_name: mae_image_classification
mixup: 0.7
mixup_prob: 0.9
model_path: ???

View File

@ -0,0 +1,68 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
fp16_no_flatten_grads: true
checkpoint:
save_interval: 1
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
best_checkpoint_metric: accuracy
maximize_best_checkpoint_metric: true
task:
_name: mae_image_classification
data: /datasets01/imagenet_full_size/061417
dataset:
num_workers: 6
batch_size: 32
skip_invalid_size_inputs_valid_test: true
required_batch_size_multiple: 2
valid_subset: val
distributed_training:
distributed_world_size: 16
ddp_backend: c10d
criterion:
_name: model
log_keys:
- correct
optimization:
max_update: 125200
lr: [0.0005]
clip_norm: 4
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 0.0005
optimizer:
_name: adam
adam_betas: [0.9,0.95]
weight_decay: 0.05
lr_scheduler:
_name: cosine
warmup_updates: 16000
min_lr: 1e-20
lr_scheduler: pass_through
model:
_name: mae_image_classification
mixup: 0.7
mixup_prob: 0.9
layer_decay: 0.75
drop_path_rate: 0.2
model_path: ???

View File

@ -0,0 +1,68 @@
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
fp16_no_flatten_grads: true
checkpoint:
save_interval: 1
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
best_checkpoint_metric: accuracy
maximize_best_checkpoint_metric: true
task:
_name: mae_image_classification
data: /datasets01/imagenet_full_size/061417
dataset:
num_workers: 6
batch_size: 32
skip_invalid_size_inputs_valid_test: true
required_batch_size_multiple: 2
valid_subset: val
distributed_training:
distributed_world_size: 16
ddp_backend: c10d
criterion:
_name: model
log_keys:
- correct
optimization:
max_update: 125200
lr: [0.0005]
clip_norm: 4
optimizer:
_name: composite
dynamic_groups: true
groups:
default:
lr_float: 0.0005
optimizer:
_name: adam
adam_betas: [0.9,0.95]
weight_decay: 0.05
lr_scheduler:
_name: cosine
warmup_updates: 16000
min_lr: 1e-7
lr_scheduler: pass_through
model:
_name: mae_image_classification
mixup: 0.7
mixup_prob: 0.9
layer_decay: 0.75
drop_path_rate: 0.2
model_path: ???
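
`layer_decay: 0.75` enables layer-wise learning-rate decay during fine-tuning (as in BEiT/MAE recipes): blocks closer to the input get geometrically smaller learning rates than the head. A sketch of the per-group multipliers under that convention, assuming a 24-block encoder and the usual indexing where the head gets scale 1.0 (fairseq's exact parameter grouping may differ):

```python
def layer_lr_scales(num_layers=24, layer_decay=0.75):
    # index 0 = patch embedding, 1..num_layers = transformer blocks, last = head
    return [layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2)]

scales = layer_lr_scales()
print(round(scales[0], 6))     # deepest scale, applied to the patch embedding
print(scales[-2], scales[-1])  # 0.75 for the last block, 1.0 for the head
```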

View File

@ -0,0 +1,15 @@
# @package _global_
hydra:
sweep:
dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S}
distributed_training:
distributed_world_size: 1
nprocs_per_node: 1
distributed_port: -1
common:
log_interval: 1
dataset:
num_workers: 0

View File

@ -0,0 +1,37 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 450
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: devlab,learnlab,learnfair,scavenge
constraint: volta32gb,ib4
max_num_timeout: 30

View File

@ -0,0 +1,36 @@
# @package _global_
hydra:
job:
config:
override_dirname:
kv_sep: ':'
item_sep: '/'
exclude_keys:
- run_config
- distributed_training.distributed_port
- distributed_training.distributed_world_size
- model.pretrained_model_path
- model.target_network_path
- next_script
- task.cache_in_scratch
- task.data
- checkpoint.save_interval_updates
- checkpoint.keep_interval_updates
- checkpoint.save_on_overflow
- common.log_interval
- common.user_dir
sweep:
dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
subdir: ''
launcher:
submitit_folder: ${hydra.sweep.dir}
timeout_min: 4320
cpus_per_task: 80
gpus_per_node: 8
tasks_per_node: 1
mem_gb: 0
nodes: 1
name: ${env:PREFIX}_${hydra.job.config_name}
partition: wav2vec,learnlab,learnfair
max_num_timeout: 30

Some files were not shown because too many files have changed in this diff.