progress bar and version bump

2024-11-22 23:30:16 +03:00 · 2021-12-08 11:01:01 +01:00 · 2021-12-08 11:01:01 +01:00 · 7bae758b2e
commit 7bae758b2e
parent 823c880e4b
3 changed files with 16 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,12 @@
 CHANGELOG
 ---------

+v0.3.8:
+  - multiprocessing support (get_vocab and apply_bpe)
+  - progress bar for learn_bpe
+  - seed parameter for deterministic BPE dropout
+  - ignore some unicode line separators which would crash subword-nmt
+
 v0.3.7:
  - BPE dropout (Provilkov et al., 2019)
  - more efficient glossaries (https://github.com/rsennrich/subword-nmt/pull/69)
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@ def test_suite():

 setup(
    name='subword_nmt',
-    version='0.3.7',
+    version='0.3.8',
    description='Unsupervised Word Segmentation for Neural Machine Translation and Text Generation',
    long_description=(codecs.open("README.md", encoding='utf-8').read() +
                      "\n\n" + codecs.open("CHANGELOG.md", encoding='utf-8').read()),
@@ -28,6 +28,8 @@ setup(
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 3',
    ],
+    install_requires=['mock',
+                      'tqdm'],
    packages=find_packages(),
    entry_points={
        'console_scripts': ['subword-nmt=subword_nmt.subword_nmt:main'],
--- a/subword_nmt/learn_bpe.py
+++ b/subword_nmt/learn_bpe.py
@@ -25,6 +25,12 @@ import tempfile
 from multiprocessing import Pool, cpu_count
 from collections import defaultdict, Counter

+try:
+    from tqdm import tqdm
+except ImportError:
+    def tqdm(iterator, *args, **kwargs):
+        return iterator
+
 # hack for python2/3 compatibility
 from io import open
 argparse.open = open
@@ -294,7 +300,7 @@ def learn_bpe(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_d

    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
-    for i in range(num_symbols):
+    for i in tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))