fairseq/setup.py


#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
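
# fairseq's build script. Note that PyTorch must already be installed, since
# this file imports torch.utils.cpp_extension at module scope; the usual
# developer install is `pip install --editable .` from the repository root.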
import os
import subprocess
import sys
from setuptools import Extension, find_packages, setup
from torch.utils import cpp_extension

if sys.version_info < (3, 6):
    sys.exit("Sorry, Python >= 3.6 is required for fairseq.")

def write_version_py():
    with open(os.path.join("fairseq", "version.txt")) as f:
        version = f.read().strip()

    # write version info to fairseq/version.py
    with open(os.path.join("fairseq", "version.py"), "w") as f:
        f.write('__version__ = "{}"\n'.format(version))
    return version

version = write_version_py()
with open("README.md") as f:
    readme = f.read()

if sys.platform == "darwin":
    extra_compile_args = ["-stdlib=libc++", "-O3"]
else:
    extra_compile_args = ["-std=c++11", "-O3"]

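# Defer the numpy import to build time, so that setup.py itself can be run
# (e.g. for metadata queries) before numpy has been installed.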
class NumpyExtension(Extension):
    """Source: https://stackoverflow.com/a/54128391"""

    def __init__(self, *args, **kwargs):
        self.__include_dirs = []
        super().__init__(*args, **kwargs)

    @property
    def include_dirs(self):
        import numpy

        return self.__include_dirs + [numpy.get_include()]

    @include_dirs.setter
    def include_dirs(self, dirs):
        self.__include_dirs = dirs

extensions = [
    Extension(
        "fairseq.libbleu",
        sources=[
            "fairseq/clib/libbleu/libbleu.cpp",
            "fairseq/clib/libbleu/module.cpp",
        ],
        extra_compile_args=extra_compile_args,
    ),
    NumpyExtension(
        "fairseq.data.data_utils_fast",
        sources=["fairseq/data/data_utils_fast.pyx"],
        language="c++",
        extra_compile_args=extra_compile_args,
    ),
    NumpyExtension(
        "fairseq.data.token_block_utils_fast",
        sources=["fairseq/data/token_block_utils_fast.pyx"],
        language="c++",
        extra_compile_args=extra_compile_args,
    ),
]

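# the remaining extensions are built with PyTorch's C++ extension machinery,
# so torch must be importable at build time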
extensions.extend(
    [
        cpp_extension.CppExtension(
            "fairseq.libbase",
            sources=[
                "fairseq/clib/libbase/balanced_assignment.cpp",
            ],
        ),
        cpp_extension.CppExtension(
            "fairseq.libnat",
            sources=[
                "fairseq/clib/libnat/edit_dist.cpp",
            ],
        ),
        cpp_extension.CppExtension(
            "alignment_train_cpu_binding",
            sources=[
                "examples/operators/alignment_train_cpu.cpp",
            ],
        ),
    ]
)

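# the GPU kernels are optional: they are only built when CUDA_HOME is set,
# i.e. a CUDA toolkit with nvcc is available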
if "CUDA_HOME" in os.environ:
extensions.extend(
[
cpp_extension.CppExtension(
"fairseq.libnat_cuda",
sources=[
"fairseq/clib/libnat_cuda/edit_dist.cu",
"fairseq/clib/libnat_cuda/binding.cpp",
],
),
cpp_extension.CppExtension(
"fairseq.ngram_repeat_block_cuda",
sources=[
"fairseq/clib/cuda/ngram_repeat_block_cuda.cpp",
"fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu",
],
),
cpp_extension.CppExtension(
"alignment_train_cuda_binding",
sources=[
"examples/operators/alignment_train_kernel.cu",
"examples/operators/alignment_train_cuda.cpp",
],
),
]
)
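# BuildExtension is PyTorch's drop-in replacement for build_ext: it compiles
# .cu sources with nvcc and C++ sources with the host compiler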
cmdclass = {"build_ext": cpp_extension.BuildExtension}
if "READTHEDOCS" in os.environ:
# don't build extensions when generating docs
extensions = []
if "build_ext" in cmdclass:
del cmdclass["build_ext"]
# use CPU build of PyTorch
dependency_links = [
"https://download.pytorch.org/whl/cpu/torch-1.7.0%2Bcpu-cp36-cp36m-linux_x86_64.whl"
]
else:
dependency_links = []
if "clean" in sys.argv[1:]:
# Source: https://bit.ly/2NLVsgE
print("deleting Cython files...")
subprocess.run(
["rm -f fairseq/*.so fairseq/**/*.so fairseq/*.pyd fairseq/**/*.pyd"],
shell=True,
)
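# Megatron is an optional git submodule; only package its mpu module when the
# submodule has actually been checked out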
extra_packages = []
if os.path.exists(os.path.join("fairseq", "model_parallel", "megatron", "mpu")):
    extra_packages.append("fairseq.model_parallel.megatron.mpu")

def do_setup(package_data):
    setup(
        name="fairseq",
        version=version,
        description="Facebook AI Research Sequence-to-Sequence Toolkit",
        url="https://github.com/pytorch/fairseq",
        classifiers=[
            "Intended Audience :: Science/Research",
            "License :: OSI Approved :: MIT License",
            "Programming Language :: Python :: 3.6",
            "Programming Language :: Python :: 3.7",
            "Programming Language :: Python :: 3.8",
            "Topic :: Scientific/Engineering :: Artificial Intelligence",
        ],
        long_description=readme,
        long_description_content_type="text/markdown",
        install_requires=[
            "cffi",
            "cython",
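            # Hydra 1.1 changed the default config composition order, which
            # fairseq's config relies on, so stay on 1.0.x; >= 1.0.7 is needed
            # because older 1.0.x releases error out on the `_self_` keyword
            # added for forward compatibility (see PR #3722)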
"hydra-core>=1.0.7,<1.1",
"omegaconf<2.1",
"numpy>=1.21.3",
"regex",
"sacrebleu>=1.4.12",
"torch>=1.13",
"tqdm",
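            # used by the Huffman-coded indexed dataset (see PR #2029)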
"bitarray",
"torchaudio>=0.8.0",
"scikit-learn",
"packaging",
],
extras_require={
"dev": ["flake8", "pytest", "black==22.3.0"],
"docs": ["sphinx", "sphinx-argparse"],
},
dependency_links=dependency_links,
packages=find_packages(
exclude=[
"examples",
"examples.*",
"scripts",
"scripts.*",
"tests",
"tests.*",
]
)
+ extra_packages,
package_data=package_data,
ext_modules=extensions,
test_suite="tests",
entry_points={
"console_scripts": [
"fairseq-eval-lm = fairseq_cli.eval_lm:cli_main",
"fairseq-generate = fairseq_cli.generate:cli_main",
"fairseq-hydra-train = fairseq_cli.hydra_train:cli_main",
"fairseq-interactive = fairseq_cli.interactive:cli_main",
"fairseq-preprocess = fairseq_cli.preprocess:cli_main",
"fairseq-score = fairseq_cli.score:cli_main",
"fairseq-train = fairseq_cli.train:cli_main",
"fairseq-validate = fairseq_cli.validate:cli_main",
],
},
cmdclass=cmdclass,
zip_safe=False,
)
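# collect every non-.pyc file under `path`, with paths made relative to the
# fairseq package root, for use as package_data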
def get_files(path, relative_to="fairseq"):
    all_files = []
    for root, _dirs, files in os.walk(path, followlinks=True):
        root = os.path.relpath(root, relative_to)
        for file in files:
            if file.endswith(".pyc"):
                continue
            all_files.append(os.path.join(root, file))
    return all_files

if __name__ == "__main__":
    try:
        # symlink examples into fairseq package so package_data accepts them
        fairseq_examples = os.path.join("fairseq", "examples")
        if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples):
            os.symlink(os.path.join("..", "examples"), fairseq_examples)

        package_data = {
            "fairseq": (
                get_files(fairseq_examples)
                + get_files(os.path.join("fairseq", "config"))
            )
        }
        do_setup(package_data)
    finally:
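        # only clean up fairseq/examples if it really is a symlink: in source
        # distributions it is a real directory, and os.unlink() on a directory
        # raises IsADirectoryError on Python 3.9 (see PR #3235)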
if "build_ext" not in sys.argv[1:] and os.path.islink(fairseq_examples):
os.unlink(fairseq_examples)