2015-09-01 12:45:31 +03:00
#! /usr/bin/env python
2016-11-08 12:00:31 +03:00
from __future__ import print_function
2018-05-16 16:35:47 +03:00
import os
2015-09-01 12:45:31 +03:00
import sys
2018-05-16 16:35:47 +03:00
import inspect
import warnings
2018-05-16 16:47:59 +03:00
import argparse
import codecs
2018-05-16 16:35:47 +03:00
2015-09-01 12:45:31 +03:00
from collections import Counter
2018-05-16 14:22:01 +03:00
# hack for python2/3 compatibility: make argparse use io.open when it opens files
from io import open
argparse . open = open
2015-09-01 12:45:31 +03:00
2018-05-16 14:22:01 +03:00
def create_parser(subparsers=None):
    """Build the command-line parser for the vocabulary generation tool.

    Args:
        subparsers: optional object returned by
            ``ArgumentParser.add_subparsers()``. When given, the parser is
            registered on it as the ``get-vocab`` sub-command; otherwise a
            stand-alone ``ArgumentParser`` is created.

    Returns:
        The parser, with ``--input/-i`` and ``--output/-o`` options. Both use
        ``argparse.FileType``, so parsed values are open file objects. The
        defaults are whatever ``sys.stdin`` / ``sys.stdout`` are bound to at
        the moment this function is called.
    """
    # Settings shared by the sub-command and the stand-alone parser.
    parser_options = dict(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Generates vocabulary")

    # Test against None explicitly so the decision does not depend on the
    # truthiness of the object that was passed in.
    if subparsers is not None:
        parser = subparsers.add_parser('get-vocab', **parser_options)
    else:
        parser = argparse.ArgumentParser(**parser_options)

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input file (default: standard input).")

    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file (default: standard output)")

    return parser
def get_vocab(train_file, vocab_file):
    """Count space-separated tokens and write them with their frequencies.

    Args:
        train_file: iterable of text lines (e.g. an open text file).
        vocab_file: object with a ``write`` method; receives one
            ``"<token> <count>"`` line per distinct token, highest count
            first.

    Only the space character separates tokens. Leading and trailing
    carriage returns, newlines and spaces are removed from each line first,
    and empty tokens (produced by consecutive spaces) are ignored.
    """
    counts = Counter()
    for raw_line in train_file:
        tokens = raw_line.strip('\r\n ').split(' ')
        counts.update(token for token in tokens if token)

    # most_common() with no argument yields every (token, count) pair,
    # ordered by descending count using a stable sort.
    for token, freq in counts.most_common():
        vocab_file.write(" ".join((token, str(freq))) + "\n")
if __name__ == "__main__":

    # Emit a deprecation warning when a 'subword_nmt' directory exists next
    # to this script; per the message below, the script has moved there.
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility: wrap the standard streams so they
    # encode/decode UTF-8 (Python 3 wraps the underlying byte buffers).
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    parser = create_parser()
    args = parser.parse_args()

    # Decide once whether real files are involved; the standard streams are
    # left open for the interpreter to manage.
    input_is_file = args.input.name != '<stdin>'
    output_is_file = args.output.name != '<stdout>'

    try:
        # read/write files as UTF-8
        # argparse has already opened the named files; close those handles
        # before reopening with an explicit encoding so they are not leaked.
        if input_is_file:
            input_path = args.input.name
            args.input.close()
            args.input = codecs.open(input_path, encoding='utf-8')
        if output_is_file:
            output_path = args.output.name
            args.output.close()
            args.output = codecs.open(output_path, 'w', encoding='utf-8')

        get_vocab(args.input, args.output)
    finally:
        # close files, even when reading or writing raised
        if input_is_file:
            args.input.close()
        if output_is_file:
            args.output.close()