support argument --total-symbols in learn_joint_bpe_and_vocab

This commit is contained in:
Rico Sennrich 2018-08-20 12:07:21 +01:00
parent 5700db410d
commit 73a6e55d5b

View File

@@ -67,6 +67,9 @@ def create_parser(subparsers=None):
parser.add_argument( parser.add_argument(
'--min-frequency', type=int, default=2, metavar='FREQ', '--min-frequency', type=int, default=2, metavar='FREQ',
help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))') help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))')
parser.add_argument(
'--total-symbols', '-t', action="store_true",
help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).")
parser.add_argument( parser.add_argument(
'--verbose', '-v', action="store_true", '--verbose', '-v', action="store_true",
help="verbose mode.") help="verbose mode.")
@@ -93,7 +96,7 @@ def learn_joint_bpe_and_vocab(args):
# learn BPE on combined vocabulary # learn BPE on combined vocabulary
with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols)
with codecs.open(args.output.name, encoding='UTF-8') as codes: with codecs.open(args.output.name, encoding='UTF-8') as codes:
bpe = apply_bpe.BPE(codes, separator=args.separator) bpe = apply_bpe.BPE(codes, separator=args.separator)