mirror of
https://github.com/rsennrich/subword-nmt.git
synced 2024-11-30 05:46:47 +03:00
support argument --total-symbols in learn_joint_bpe_and_vocab
This commit is contained in:
parent
5700db410d
commit
73a6e55d5b
@ -67,6 +67,9 @@ def create_parser(subparsers=None):
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--min-frequency', type=int, default=2, metavar='FREQ',
|
'--min-frequency', type=int, default=2, metavar='FREQ',
|
||||||
help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))')
|
help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))')
|
||||||
|
parser.add_argument(
|
||||||
|
'--total-symbols', '-t', action="store_true",
|
||||||
|
help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--verbose', '-v', action="store_true",
|
'--verbose', '-v', action="store_true",
|
||||||
help="verbose mode.")
|
help="verbose mode.")
|
||||||
@ -93,7 +96,7 @@ def learn_joint_bpe_and_vocab(args):
|
|||||||
|
|
||||||
# learn BPE on combined vocabulary
|
# learn BPE on combined vocabulary
|
||||||
with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
|
with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
|
||||||
learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True)
|
learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols)
|
||||||
|
|
||||||
with codecs.open(args.output.name, encoding='UTF-8') as codes:
|
with codecs.open(args.output.name, encoding='UTF-8') as codes:
|
||||||
bpe = apply_bpe.BPE(codes, separator=args.separator)
|
bpe = apply_bpe.BPE(codes, separator=args.separator)
|
||||||
|
Loading…
Reference in New Issue
Block a user