progress bar and version bump

This commit is contained in:
Rico Sennrich 2021-12-08 11:01:01 +01:00
parent 823c880e4b
commit 7bae758b2e
3 changed files with 16 additions and 2 deletions

View File

@@ -1,6 +1,12 @@
CHANGELOG
---------
v0.3.8:
- multiprocessing support (get_vocab and apply_bpe)
- progress bar for learn_bpe
- seed parameter for deterministic BPE dropout
- ignore some unicode line separators which would crash subword-nmt
v0.3.7:
- BPE dropout (Provilkov et al., 2019)
- more efficient glossaries (https://github.com/rsennrich/subword-nmt/pull/69)

View File

@@ -11,7 +11,7 @@ def test_suite():
setup(
name='subword_nmt',
-version='0.3.7',
+version='0.3.8',
description='Unsupervised Word Segmentation for Neural Machine Translation and Text Generation',
long_description=(codecs.open("README.md", encoding='utf-8').read() +
"\n\n" + codecs.open("CHANGELOG.md", encoding='utf-8').read()),
@@ -28,6 +28,8 @@ setup(
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 3',
],
install_requires=['mock',
'tqdm'],
packages=find_packages(),
entry_points={
'console_scripts': ['subword-nmt=subword_nmt.subword_nmt:main'],

View File

@@ -25,6 +25,12 @@ import tempfile
from multiprocessing import Pool, cpu_count
from collections import defaultdict, Counter
# Progress-bar support: use the real tqdm when it can be imported;
# otherwise fall back to a no-op stand-in with the same call shape, so
# code that wraps an iterable in tqdm(...) keeps working without it.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterator, *args, **kwargs):
        """Return *iterator* unchanged; any extra tqdm arguments are ignored."""
        return iterator
# hack for python2/3 compatibility
from io import open
argparse.open = open
@@ -294,7 +300,7 @@ def learn_bpe(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_d
# threshold is inspired by Zipfian assumption, but should only affect speed
threshold = max(stats.values()) / 10
-for i in range(num_symbols):
+for i in tqdm(range(num_symbols)):
if stats:
most_frequent = max(stats, key=lambda x: (stats[x], x))