#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich

"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
of a text to a configurable number of symbols, with only a small increase in the number of tokens.

Reference:
Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
"""
|
|
|
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import codecs
|
|
|
|
import re
|
|
|
|
import copy
|
|
|
|
import argparse
|
|
|
|
from collections import defaultdict, Counter
|
|
|
|
|
|
|
|
# hack for python2/3 compatibility
from io import open
# Make ``io.open`` reachable as ``argparse.open``.
# NOTE(review): presumably this is so that argparse.FileType opens files
# through io.open on Python 2 -- confirm against the argparse implementation.
argparse.open = open

# python 2/3 compatibility
# Rebind the three standard streams to UTF-8 codec wrappers so that text read
# from stdin and written to stdout/stderr is always encoded/decoded as UTF-8.
# This runs at import time, i.e. before create_parser() captures sys.stdin and
# sys.stdout as the defaults for --input/--output.
if sys.version_info < (3, 0):
    # Python 2: wrap the stream objects directly.
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
else:
    # Python 3: wrap the ``.buffer`` attribute of each stream instead of the
    # stream object itself.
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
|
2015-09-01 12:45:31 +03:00
|
|
|
|
|
|
|
def create_parser():
    """Build the command-line argument parser for the BPE learner.

    Returns:
        argparse.ArgumentParser: parser providing ``--input``/``-i``,
        ``--output``/``-o``, ``--symbols``/``-s``, ``--min-frequency``,
        ``--dict-input`` and ``--verbose``/``-v``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="learn BPE-based word segmentation")

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input text (default: standard input).")

    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file for BPE codes (default: standard output)")
    # The two help texts below used to end in "))", which rendered an
    # unbalanced parenthesis in the --help output.
    parser.add_argument(
        '--symbols', '-s', type=int, default=10000,
        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)")
    parser.add_argument(
        '--min-frequency', type=int, default=2, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
    parser.add_argument('--dict-input', action="store_true",
        help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")
    parser.add_argument(
        '--verbose', '-v', action="store_true",
        help="verbose mode.")

    return parser
|
|
|
|
|
2017-02-25 15:01:52 +03:00
|
|
|
def get_vocabulary(fobj, is_dict=False):
    """Read text and return a Counter that maps each word to its frequency.

    Args:
        fobj: iterable of text lines (for example an open text file).
        is_dict: if true, every non-blank line must consist of exactly two
            whitespace-separated fields, ``word count``; otherwise each line
            is treated as running text and its whitespace-separated tokens
            are counted.

    Returns:
        collections.Counter: word -> frequency.

    Raises:
        ValueError: in dictionary mode, if a non-blank line does not contain
            exactly a word followed by an integer count.
    """
    vocab = Counter()
    for line_number, line in enumerate(fobj, 1):
        if is_dict:
            fields = line.split()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # entry; skip them instead of failing on tuple unpacking.
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError(
                    'malformed dictionary entry at line {0}: {1!r}'.format(line_number, line))
            word, count = fields
            # Accumulate so that a word listed more than once keeps its
            # total count rather than only the last one seen.
            vocab[word] += int(count)
        else:
            vocab.update(line.split())
    return vocab
|
|
|
|
|
|
|
|
def update_pair_statistics(pair, changed, stats, indices):
    """Incrementally update pair frequencies and the pair->word index after a merge.

    Merging a pair only affects the pairs that overlap an occurrence of it,
    so only those are touched instead of recounting everything.

    Args:
        pair: the ``(first, second)`` symbol pair that was just merged.
        changed: iterable of ``(word_id, new_word, old_word, freq)`` tuples,
            one per word in which the merge was applied; ``new_word`` and
            ``old_word`` are symbol tuples.
        stats: mapping pair -> frequency; updated in place.
        indices: mapping pair -> {word_id: occurrence count}; updated in place.
    """
    # The merged pair no longer exists as a pair anywhere.
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    left, right = pair
    merged = left + right

    for word_id, new_symbols, old_symbols, count in changed:

        # Pass 1: walk the old segmentation and remove the pairs that
        # bordered each occurrence of (left, right).
        old_len = len(old_symbols)
        pos = 0
        while True:
            try:
                pos = old_symbols.index(left, pos)
            except ValueError:
                break
            if pos >= old_len - 1 or old_symbols[pos + 1] != right:
                # `left` here is not followed by `right`; keep scanning.
                pos += 1
                continue
            if pos > 0:
                before = old_symbols[pos - 1:pos + 1]
                stats[before] -= count
                indices[before][word_id] -= 1
            if pos < old_len - 2:
                # If another occurrence of the pair follows immediately, the
                # pair between the two occurrences is removed when that next
                # occurrence is processed (as its left neighbour), so skip
                # it here to avoid double-counting.
                next_is_same_pair = (
                    pos < old_len - 3
                    and old_symbols[pos + 2] == left
                    and old_symbols[pos + 3] == right)
                if not next_is_same_pair:
                    after = old_symbols[pos + 1:pos + 3]
                    stats[after] -= count
                    indices[after][word_id] -= 1
            pos += 2

        # Pass 2: walk the new segmentation and add the pairs that now
        # border each occurrence of the merged symbol.
        new_len = len(new_symbols)
        pos = 0
        while True:
            try:
                pos = new_symbols.index(merged, pos)
            except ValueError:
                break
            if pos > 0:
                before = new_symbols[pos - 1:pos + 1]
                stats[before] += count
                indices[before][word_id] += 1
            # A directly following merged symbol is counted when that symbol
            # is visited (as its left neighbour); don't double-count.
            if pos < new_len - 1 and new_symbols[pos + 1] != merged:
                after = new_symbols[pos:pos + 2]
                stats[after] += count
                indices[after][word_id] += 1
            pos += 1
|
|
|
|
|
|
|
|
|
|
|
|
def get_pair_statistics(vocab):
    """Count the frequency of all adjacent symbol pairs and build an index.

    Args:
        vocab: sequence of ``(word, freq)`` items, where ``word`` is a
            sequence of symbols and ``freq`` its frequency. The position of
            an item in ``vocab`` is used as the word's id in the index.

    Returns:
        tuple: ``(stats, indices)`` where ``stats`` maps each pair
        ``(a, b)`` to its total frequency and ``indices`` maps each pair to
        ``{word_id: number of occurrences of the pair in that word}``.
        Both are defaultdicts, so unseen pairs read as zero / empty.
    """
    # data structure of pair frequencies
    stats = defaultdict(int)

    # index from pairs to words
    indices = defaultdict(lambda: defaultdict(int))

    for i, (word, freq) in enumerate(vocab):
        # zip over the word and its one-step shift yields every adjacent
        # pair; unlike indexing word[0], it also copes with empty and
        # single-symbol words (they simply contribute no pairs).
        for pair in zip(word, word[1:]):
            stats[pair] += freq
            indices[pair][i] += 1

    return stats, indices
|
|
|
|
|
|
|
|
|
|
|
|
def replace_pair(pair, vocab, indices):
    """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'.

    Args:
        pair: the ``(first, second)`` symbol pair to merge.
        vocab: list of ``(word, freq)`` items, ``word`` being a tuple of
            symbols; entries containing the pair are rewritten in place.
        indices: mapping pair -> {word_id: occurrence count}; only words
            whose count for ``pair`` is at least 1 are processed.

    Returns:
        list: one ``(word_id, new_word, old_word, freq)`` tuple per
        rewritten vocabulary entry.
    """
    left, right = pair
    merged = left + right

    # Match "left right" only where both are complete, whitespace-delimited
    # symbols of the space-joined word.
    matcher = re.compile(r'(?<!\S)' + re.escape(left + ' ' + right) + r'(?!\S)')

    def _merged_symbol(_match):
        # A callable replacement is inserted literally, so backslashes in
        # the merged symbol need no escaping.
        return merged

    changes = []
    for word_id, occurrences in indices[pair].items():
        if occurrences < 1:
            continue
        old_word, freq = vocab[word_id]
        spaced = ' '.join(old_word)
        new_word = tuple(matcher.sub(_merged_symbol, spaced).split())

        vocab[word_id] = (new_word, freq)
        changes.append((word_id, new_word, old_word, freq))

    return changes
|
|
|
|
|
|
|
|
def prune_stats(stats, big_stats, threshold):
    """Prune the statistics dict so that max() over it stays cheap.

    The frequency of a symbol pair never increases, so pruning is generally
    safe (until the most frequent pair is less frequent than a pair we
    previously pruned). big_stats keeps full statistics for when we need to
    access pruned items.

    Args:
        stats: working mapping pair -> frequency; every entry whose
            frequency is below ``threshold`` is removed from it.
        big_stats: full mapping pair -> frequency; receives the value of
            each removed entry.
        threshold: entries with frequency strictly below this are pruned.
    """
    below = [(item, freq) for item, freq in stats.items() if freq < threshold]
    for item, freq in below:
        del stats[item]
        if freq >= 0:
            # Non-negative value: taken as the pair's current frequency.
            big_stats[item] = freq
        else:
            # Negative value: applied as an adjustment on top of what
            # big_stats already holds for this pair.
            big_stats[item] += freq
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: learn up to --symbols BPE merge operations from the
    # input vocabulary and write one merge ("first second") per line to the
    # output, most frequent pair first.

    parser = create_parser()
    args = parser.parse_args()

    # read/write files as UTF-8
    # argparse has already opened the named files; close those handles
    # before reopening through codecs so they are not leaked.
    opened_input = opened_output = False
    if args.input.name != '<stdin>':
        input_path = args.input.name
        args.input.close()
        args.input = codecs.open(input_path, encoding='utf-8')
        opened_input = True
    if args.output.name != '<stdout>':
        output_path = args.output.name
        args.output.close()
        args.output = codecs.open(output_path, 'w', encoding='utf-8')
        opened_output = True

    try:
        vocab = get_vocabulary(args.input, is_dict=args.dict_input)
        # Represent each word as a tuple of characters plus an end-of-word marker.
        vocab = {tuple(word) + ('</w>',): count for (word, count) in vocab.items()}
        sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

        stats, indices = get_pair_statistics(sorted_vocab)
        big_stats = copy.deepcopy(stats)
        # threshold is inspired by Zipfian assumption, but should only affect speed
        # (empty input yields no pairs at all; max() of nothing would raise)
        threshold = max(stats.values()) / 10 if stats else 0
        for i in range(args.symbols):
            if stats:
                most_frequent = max(stats, key=lambda x: (stats[x], x))

            # we probably missed the best pair because of pruning; go back to full statistics
            if not stats or (i and stats[most_frequent] < threshold):
                prune_stats(stats, big_stats, threshold)
                stats = copy.deepcopy(big_stats)
                if not stats:
                    # No symbol pair exists at all (e.g. empty input): there
                    # is nothing to merge, so stop cleanly instead of
                    # crashing in max().
                    sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(args.min_frequency))
                    break
                most_frequent = max(stats, key=lambda x: (stats[x], x))
                # threshold is inspired by Zipfian assumption, but should only affect speed
                threshold = stats[most_frequent] * i/(i+10000.0)
                prune_stats(stats, big_stats, threshold)

            if stats[most_frequent] < args.min_frequency:
                sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(args.min_frequency))
                break

            if args.verbose:
                sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent]))
            args.output.write('{0} {1}\n'.format(*most_frequent))
            changes = replace_pair(most_frequent, sorted_vocab, indices)
            update_pair_statistics(most_frequent, changes, stats, indices)
            stats[most_frequent] = 0
            # Periodically shrink the working statistics to keep max() fast.
            if not i % 100:
                prune_stats(stats, big_stats, threshold)
    finally:
        # Only close the files this script opened itself; the standard
        # streams are left alone.
        if opened_input:
            args.input.close()
        if opened_output:
            args.output.close()
|