2017-09-25 20:17:43 +03:00
|
|
|
#!/usr/bin/env python3
|
2019-07-30 17:45:13 +03:00
|
|
|
# Copyright (c) Facebook, Inc. and its affiliates.
|
2017-09-15 03:22:43 +03:00
|
|
|
#
|
2019-07-30 17:45:13 +03:00
|
|
|
# This source code is licensed under the MIT license found in the
|
|
|
|
# LICENSE file in the root directory of this source tree.
|
2017-09-15 03:22:43 +03:00
|
|
|
|
2019-11-03 02:51:32 +03:00
|
|
|
import os
|
2020-10-22 22:45:19 +03:00
|
|
|
import subprocess
|
2017-09-15 03:22:43 +03:00
|
|
|
import sys
|
|
|
|
|
2020-10-19 04:13:29 +03:00
|
|
|
from setuptools import Extension, find_packages, setup
|
|
|
|
|
2019-12-17 06:46:03 +03:00
|
|
|
# Fail fast on interpreters that cannot run fairseq at all.
if sys.version_info < (3, 6):
    sys.exit("Sorry, Python >= 3.6 is required for fairseq.")
|
2017-09-15 03:22:43 +03:00
|
|
|
|
2019-08-27 20:06:26 +03:00
|
|
|
|
2020-10-22 22:45:19 +03:00
|
|
|
def write_version_py():
    """Read the release version and materialize ``fairseq/version.py``.

    Reads the canonical version string from ``fairseq/version.txt`` and
    writes it into ``fairseq/version.py`` as ``__version__`` so the
    installed package can report its own version at runtime.

    Returns:
        str: the version string, stripped of surrounding whitespace.
    """
    # Read/write explicitly as UTF-8 rather than the platform-default locale
    # encoding, so the build behaves identically regardless of the host locale.
    with open(os.path.join("fairseq", "version.txt"), encoding="utf-8") as f:
        version = f.read().strip()

    # write version info to fairseq/version.py
    with open(os.path.join("fairseq", "version.py"), "w", encoding="utf-8") as f:
        f.write('__version__ = "{}"\n'.format(version))
    return version
|
|
|
|
|
|
|
|
|
|
|
|
# Materialize fairseq/version.py as an import-time side effect and keep the
# version string for the setup() call below.
version = write_version_py()


# The README doubles as the PyPI long description.
with open("README.md") as f:
    readme = f.read()


# C++ flags shared by all extensions below; macOS needs libc++ selected
# explicitly, everywhere else we request C++11 directly.
if sys.platform == "darwin":
    extra_compile_args = ["-stdlib=libc++", "-O3"]
else:
    extra_compile_args = ["-std=c++11", "-O3"]
|
2017-09-15 03:22:43 +03:00
|
|
|
|
2019-08-27 17:10:35 +03:00
|
|
|
|
2019-09-01 02:52:03 +03:00
|
|
|
class NumpyExtension(Extension):
    """Extension whose NumPy include path is resolved lazily.

    ``include_dirs`` is exposed as a property so that ``import numpy`` is
    deferred until the attribute is first read (i.e. at build time), which
    lets ``setup.py`` be parsed before numpy has been installed via
    ``setup_requires``.

    Source: https://stackoverflow.com/a/54128391
    """

    def __init__(self, *args, **kwargs):
        self.__user_dirs = []
        super().__init__(*args, **kwargs)

    @property
    def include_dirs(self):
        # Deferred import: numpy may not exist yet when setup.py is parsed.
        import numpy

        return self.__user_dirs + [numpy.get_include()]

    @include_dirs.setter
    def include_dirs(self, dirs):
        self.__user_dirs = dirs
|
|
|
|
|
|
|
|
|
2019-08-27 20:06:26 +03:00
|
|
|
# Extensions that are always built: a C++ BLEU scorer and two Cython helpers
# (batching and token-block utilities) that need NumPy headers at build time.
extensions = [
    Extension(
        "fairseq.libbleu",
        sources=[
            "fairseq/clib/libbleu/libbleu.cpp",
            "fairseq/clib/libbleu/module.cpp",
        ],
        extra_compile_args=extra_compile_args,
    ),
    NumpyExtension(
        "fairseq.data.data_utils_fast",
        sources=["fairseq/data/data_utils_fast.pyx"],
        language="c++",
        extra_compile_args=extra_compile_args,
    ),
    NumpyExtension(
        "fairseq.data.token_block_utils_fast",
        sources=["fairseq/data/token_block_utils_fast.pyx"],
        language="c++",
        extra_compile_args=extra_compile_args,
    ),
]
|
2019-08-27 17:10:35 +03:00
|
|
|
|
|
|
|
|
2019-11-03 02:51:32 +03:00
|
|
|
# Custom setuptools commands; populated below only when torch is importable.
cmdclass = {}


try:
    # torch is not available when generating docs
    from torch.utils import cpp_extension

    # CPU-only torch extensions.
    extensions.extend(
        [
            cpp_extension.CppExtension(
                "fairseq.libbase",
                sources=[
                    "fairseq/clib/libbase/balanced_assignment.cpp",
                ],
            )
        ]
    )

    extensions.extend(
        [
            cpp_extension.CppExtension(
                "fairseq.libnat",
                sources=[
                    "fairseq/clib/libnat/edit_dist.cpp",
                ],
            ),
            cpp_extension.CppExtension(
                "alignment_train_cpu_binding",
                sources=[
                    "examples/operators/alignment_train_cpu.cpp",
                ],
            ),
        ]
    )
    # CUDA kernels are only compiled when a CUDA toolkit is detectable.
    if "CUDA_HOME" in os.environ:
        extensions.extend(
            [
                cpp_extension.CppExtension(
                    "fairseq.libnat_cuda",
                    sources=[
                        "fairseq/clib/libnat_cuda/edit_dist.cu",
                        "fairseq/clib/libnat_cuda/binding.cpp",
                    ],
                ),
                cpp_extension.CppExtension(
                    "fairseq.ngram_repeat_block_cuda",
                    sources=[
                        "fairseq/clib/cuda/ngram_repeat_block_cuda.cpp",
                        "fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu",
                    ],
                ),
                cpp_extension.CppExtension(
                    "alignment_train_cuda_binding",
                    sources=[
                        "examples/operators/alignment_train_kernel.cu",
                        "examples/operators/alignment_train_cuda.cpp",
                    ],
                ),
            ]
        )
    # torch's BuildExtension knows how to compile .cu / torch-linked sources.
    cmdclass["build_ext"] = cpp_extension.BuildExtension

except ImportError:
    # No torch: fall back to plain setuptools and skip the torch extensions.
    pass
|
|
|
|
|
|
|
|
|
2020-10-19 04:13:29 +03:00
|
|
|
if "READTHEDOCS" in os.environ:
    # don't build extensions when generating docs
    extensions = []
    if "build_ext" in cmdclass:
        del cmdclass["build_ext"]

    # use CPU build of PyTorch
    dependency_links = [
        "https://download.pytorch.org/whl/cpu/torch-1.7.0%2Bcpu-cp36-cp36m-linux_x86_64.whl"
    ]
else:
    dependency_links = []
|
|
|
|
|
|
|
|
|
2020-10-19 04:13:29 +03:00
|
|
|
if "clean" in sys.argv[1:]:
    # Source: https://bit.ly/2NLVsgE
    print("deleting Cython files...")
    # Remove compiled extension artifacts with the glob module instead of
    # shelling out to ``rm``: it is portable (``.pyd`` artifacts only exist
    # on Windows, where ``rm`` is unavailable), and ``recursive=True`` makes
    # ``**`` match any depth, whereas plain ``sh`` expands ``fairseq/**/*.so``
    # to just one directory level.
    for pattern in (
        "fairseq/*.so",
        "fairseq/**/*.so",
        "fairseq/*.pyd",
        "fairseq/**/*.pyd",
    ):
        for artifact in glob.glob(pattern, recursive=True):
            os.remove(artifact)
|
2019-11-26 00:38:42 +03:00
|
|
|
|
|
|
|
|
2020-11-19 01:30:02 +03:00
|
|
|
# Ship the Megatron model-parallel sources only when the optional
# submodule has actually been checked out.
extra_packages = []
_megatron_mpu = os.path.join("fairseq", "model_parallel", "megatron", "mpu")
if os.path.exists(_megatron_mpu):
    extra_packages.append("fairseq.model_parallel.megatron.mpu")
|
|
|
|
|
|
|
|
|
2020-10-19 19:22:28 +03:00
|
|
|
def do_setup(package_data):
    """Invoke ``setuptools.setup`` with fairseq's packaging metadata.

    Args:
        package_data: mapping of package name to data-file paths, built in
            ``__main__`` from the symlinked examples and config directories.

    Relies on module-level state computed above: ``version``, ``readme``,
    ``dependency_links``, ``extensions``, ``extra_packages`` and ``cmdclass``.
    """
    setup(
        name="fairseq",
        version=version,
        description="Facebook AI Research Sequence-to-Sequence Toolkit",
        url="https://github.com/pytorch/fairseq",
        classifiers=[
            "Intended Audience :: Science/Research",
            "License :: OSI Approved :: MIT License",
            "Programming Language :: Python :: 3.6",
            "Programming Language :: Python :: 3.7",
            "Programming Language :: Python :: 3.8",
            "Topic :: Scientific/Engineering :: Artificial Intelligence",
        ],
        long_description=readme,
        long_description_content_type="text/markdown",
        # Build-time requirements: Cython compiles the .pyx extensions and
        # NumPy headers are needed by NumpyExtension.
        setup_requires=[
            "cython",
            'numpy<1.20.0; python_version<"3.7"',
            'numpy; python_version>="3.7"',
            "setuptools>=18.0",
        ],
        install_requires=[
            "cffi",
            "cython",
            'dataclasses; python_version<"3.7"',
            "hydra-core>=1.0.7,<1.1",
            "omegaconf<2.1",
            'numpy<1.20.0; python_version<"3.7"',
            'numpy; python_version>="3.7"',
            "regex",
            "sacrebleu>=1.4.12",
            "torch",
            "tqdm",
            "bitarray",
            "torchaudio>=0.8.0",
        ],
        dependency_links=dependency_links,
        packages=find_packages(
            exclude=[
                "examples",
                "examples.*",
                "scripts",
                "scripts.*",
                "tests",
                "tests.*",
            ]
        )
        + extra_packages,
        package_data=package_data,
        ext_modules=extensions,
        test_suite="tests",
        entry_points={
            "console_scripts": [
                "fairseq-eval-lm = fairseq_cli.eval_lm:cli_main",
                "fairseq-generate = fairseq_cli.generate:cli_main",
                "fairseq-hydra-train = fairseq_cli.hydra_train:cli_main",
                "fairseq-interactive = fairseq_cli.interactive:cli_main",
                "fairseq-preprocess = fairseq_cli.preprocess:cli_main",
                "fairseq-score = fairseq_cli.score:cli_main",
                "fairseq-train = fairseq_cli.train:cli_main",
                "fairseq-validate = fairseq_cli.validate:cli_main",
            ],
        },
        cmdclass=cmdclass,
        zip_safe=False,
    )
|
|
|
|
|
|
|
|
|
|
|
|
def get_files(path, relative_to="fairseq"):
    """Collect every non-``.pyc`` file under *path*.

    Walks *path* recursively (following symlinks, so the ``fairseq/examples``
    symlink created in ``__main__`` is traversed) and returns the file paths
    expressed relative to *relative_to*.
    """
    collected = []
    for directory, _subdirs, filenames in os.walk(path, followlinks=True):
        rel_dir = os.path.relpath(directory, relative_to)
        collected.extend(
            os.path.join(rel_dir, name)
            for name in filenames
            if not name.endswith(".pyc")
        )
    return collected
|
|
|
|
|
|
|
|
|
2021-01-21 18:32:08 +03:00
|
|
|
if __name__ == "__main__":
    try:
        # symlink examples into fairseq package so package_data accepts them
        fairseq_examples = os.path.join("fairseq", "examples")
        if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples):
            os.symlink(os.path.join("..", "examples"), fairseq_examples)

        package_data = {
            "fairseq": (
                get_files(fairseq_examples)
                + get_files(os.path.join("fairseq", "config"))
            )
        }
        do_setup(package_data)
    finally:
        # Only remove the path when it is still a symlink: in an sdist the
        # examples directory is a real directory (the symlink was resolved at
        # package-creation time), and os.unlink on a directory raises
        # IsADirectoryError.
        if "build_ext" not in sys.argv[1:] and os.path.islink(fairseq_examples):
            os.unlink(fairseq_examples)
|