#include <cassert>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <boost/program_options.hpp>

#include "util/usage.hh"
#include "moses/TranslationModel/ProbingPT/storing.hh"
#include "moses/InputFileStream.h"
#include "moses/OutputFileStream.h"
#include "moses/Util.h"

using namespace std;

// Forward declaration; the definition appears after main().
// Rewrites the rule table at `path` into a sorted copy and returns the path
// of that sorted copy.
std::string ReformatSCFGFile(const std::string &path);

int main ( int argc , char * argv [ ] )
{
2016-10-06 17:14:56 +03:00
string inPath , outPath ;
int num_scores = 4 ;
int num_lex_scores = 0 ;
bool log_prob = false ;
bool scfg = false ;
int max_cache_size = 50000 ;
2014-06-11 14:47:00 +04:00
2016-10-03 21:02:06 +03:00
namespace po = boost : : program_options ;
po : : options_description desc ( " Options " ) ;
desc . add_options ( )
( " help " , " Print help messages " )
( " input-pt " , po : : value < string > ( ) - > required ( ) , " Text pt " )
( " output-dir " , po : : value < string > ( ) - > required ( ) , " Directory when binary files will be written " )
( " num-scores " , po : : value < int > ( ) - > default_value ( num_scores ) , " Number of pt scores " )
( " num-lex-scores " , po : : value < int > ( ) - > default_value ( num_lex_scores ) , " Number of lexicalized reordering scores " )
( " log-prob " , " log (and floor) probabilities before storing " )
( " max-cache-size " , po : : value < int > ( ) - > default_value ( max_cache_size ) , " Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit " )
( " scfg " , " Rules are SCFG in Moses format (ie. with non-terms and LHS " )
2016-01-07 14:11:31 +03:00
2016-10-06 17:14:56 +03:00
;
2016-10-03 21:02:06 +03:00
po : : variables_map vm ;
try {
po : : store ( po : : parse_command_line ( argc , argv , desc ) ,
vm ) ; // can throw
/** --help option
*/
if ( vm . count ( " help " ) ) {
std : : cout < < desc < < std : : endl ;
return EXIT_SUCCESS ;
}
po : : notify ( vm ) ; // throws on error, so do after help in case
// there are any problems
} catch ( po : : error & e ) {
std : : cerr < < " ERROR: " < < e . what ( ) < < std : : endl < < std : : endl ;
std : : cerr < < desc < < std : : endl ;
return EXIT_FAILURE ;
2016-02-16 19:03:37 +03:00
}
2016-10-03 21:02:06 +03:00
if ( vm . count ( " input-pt " ) ) inPath = vm [ " input-pt " ] . as < string > ( ) ;
if ( vm . count ( " output-dir " ) ) outPath = vm [ " output-dir " ] . as < string > ( ) ;
if ( vm . count ( " num-scores " ) ) num_scores = vm [ " num-scores " ] . as < int > ( ) ;
if ( vm . count ( " num-lex-scores " ) ) num_lex_scores = vm [ " num-lex-scores " ] . as < int > ( ) ;
if ( vm . count ( " max-cache-size " ) ) max_cache_size = vm [ " max-cache-size " ] . as < int > ( ) ;
if ( vm . count ( " log-prob " ) ) log_prob = true ;
if ( vm . count ( " scfg " ) ) scfg = true ;
if ( scfg ) {
inPath = ReformatSCFGFile ( inPath ) ;
2016-02-16 19:03:37 +03:00
}
2014-12-23 18:21:06 +03:00
2016-10-03 21:02:06 +03:00
Moses : : createProbingPT ( inPath , outPath , num_scores , num_lex_scores , log_prob , max_cache_size , scfg ) ;
2014-06-11 14:47:00 +04:00
2016-10-03 21:02:06 +03:00
//util::PrintUsage(std::cout);
2015-01-14 14:07:42 +03:00
return 0 ;
2014-06-11 14:47:00 +04:00
}
2016-10-03 21:02:06 +03:00
std : : string ReformatSCFGFile ( const std : : string & path )
{
Moses : : InputFileStream inFile ( path ) ;
string reformattedPath = path + " .reformat.gz " ;
Moses : : OutputFileStream outFile ( reformattedPath ) ;
string line ;
while ( getline ( inFile , line ) ) {
vector < string > toks = Moses : : TokenizeMultiCharSeparator ( line , " ||| " ) ;
assert ( toks . size ( ) > = 3 ) ;
// source
vector < string > sourceToks = Moses : : Tokenize ( toks [ 0 ] , " " ) ;
for ( size_t i = 0 ; i < sourceToks . size ( ) - 1 ; + + i ) {
outFile < < sourceToks [ i ] < < " " ;
}
// other columns
for ( size_t i = 1 ; i < toks . size ( ) ; + + i ) {
outFile < < " ||| " < < toks [ i ] ;
}
outFile < < endl ;
}
inFile . Close ( ) ;
outFile . Close ( ) ;
string sortedPath = path + " .reformat.sorted.gz " ;
string tmpPath = path + " .tmp " ;
string cmd = " mkdir " + tmpPath
2016-10-06 17:14:56 +03:00
+ " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath ;
2016-10-03 21:02:06 +03:00
system ( cmd . c_str ( ) ) ;
cmd = " rm -rf " + tmpPath + " " + reformattedPath ;
system ( cmd . c_str ( ) ) ;
return sortedPath ;
}