Merge branch 'master' of http://github.com/moses-smt/mosesdecoder into ranked-sampling

Conflicts:
	moses/TargetPhrase.cpp
	moses/TargetPhrase.h
Author: Ulrich Germann
Date: 2015-07-28 14:29:49 +01:00
Commit: d67723fd29
98 changed files with 1742 additions and 1039 deletions


@ -179,7 +179,7 @@ if [ option.get "with-icu" : : "yes" ]
requirements += <library>icui18n/<link>shared ;
requirements += <cxxflags>-fPIC ;
requirements += <address-model>64 ;
requirements += <runtime-link>shared ;
# requirements += <runtime-link>shared ;
}
if [ option.get "with-probing-pt" : : "yes" ]


@ -21,6 +21,11 @@ SuffixArray::SuffixArray()
m_wordInSentence(NULL),
m_sentence(NULL),
m_sentenceLength(NULL),
m_document(NULL),
m_documentName(NULL),
m_documentNameLength(0),
m_documentCount(0),
m_useDocument(false),
m_vcb(),
m_size(0),
m_sentenceCount(0) { }
@ -32,6 +37,8 @@ SuffixArray::~SuffixArray()
free(m_wordInSentence);
free(m_sentence);
free(m_sentenceLength);
free(m_document);
free(m_documentName);
}
void SuffixArray::Create(const string& fileName )
@ -46,22 +53,32 @@ void SuffixArray::Create(const string& fileName )
textFile.open(fileName.c_str());
if (!textFile) {
cerr << "no such file or directory " << fileName << endl;
cerr << "Error: no such file or directory " << fileName << endl;
exit(1);
}
// first pass through data: get size
istream *fileP = &textFile;
m_size = 0;
m_sentenceCount = 0;
m_documentCount = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
if (m_useDocument && ProcessDocumentLine(line,0)) continue;
vector< WORD_ID > words = m_vcb.Tokenize( line );
m_size += words.size() + 1;
m_sentenceCount++;
}
textFile.close();
cerr << m_size << " words (incl. sentence boundaries)" << endl;
if (m_useDocument) {
cerr << m_documentCount << " documents" << endl;
if (m_documentCount == 0) {
cerr << "Error: no documents found, aborting." << endl;
exit(1);
}
}
// allocate memory
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
@ -69,21 +86,31 @@ void SuffixArray::Create(const string& fileName )
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
// fill the array
int wordIndex = 0;
int sentenceId = 0;
textFile.open(fileName.c_str());
if (!textFile) {
cerr << "no such file or directory " << fileName << endl;
exit(1);
CheckAllocation(m_array != NULL, "m_array");
CheckAllocation(m_index != NULL, "m_index");
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
CheckAllocation(m_sentence != NULL, "m_sentence");
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
if (m_useDocument) {
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount );
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
}
// second pass through data: fill the arrays
int wordIndex = 0;
int sentenceId = 0;
m_documentNameLength = 0; // re-use as counter
m_documentCount = 0; // re-use as counter
textFile.open(fileName.c_str());
fileP = &textFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
if (m_useDocument && ProcessDocumentLine(line,sentenceId)) continue;
vector< WORD_ID > words = m_vcb.Tokenize( line );
vector< WORD_ID >::const_iterator i;
@ -105,7 +132,7 @@ void SuffixArray::Create(const string& fileName )
m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
if (m_buffer == NULL) {
cerr << "cannot allocate memory to m_buffer" << endl;
cerr << "Error: cannot allocate memory to m_buffer" << endl;
exit(1);
}
@ -114,6 +141,45 @@ void SuffixArray::Create(const string& fileName )
cerr << "done sorting" << endl;
}
// very specific code to deal with common crawl document ids
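// A document header line is assumed to look like (hypothetical example):
//   "d41d8cd98f00b204e9800998ecf8427e 0.85 http://example.com/page"
// i.e. a 32-character hex hash, a float score, and the document URL (= name),
// which is exactly what the three parsing stages below check for.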
bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId )
{
size_t i;
// first 32 characters are hex-hash
for(i=0; i<32; i++) {
if ((line[i] < '0' || line[i] > '9') && (line[i] < 'a' || line[i] > 'f')) {
return false;
}
}
if (line[i++] != ' ') return false;
// second token is float
for (; line[i] != ' ' && line[i] != 0; i++) {
if (line[i] != '.' && (line[i] < '0' || line[i] > '9')) {
return false;
}
}
i++;
// last token is url (=name)
size_t startName = i;
for (; line[i] != ' ' && line[i] != 0; i++) {}
if (line[i] == ' ') return false;
size_t endName = i+1; // include '\0'
// second pass: record name and sentence number
if (m_document != NULL) {
m_documentName[m_documentCount] = m_documentNameLength;
for(size_t i=startName; i<endName; i++) {
m_documentNameBuffer[m_documentNameLength + i-startName] = line[i];
}
m_document[m_documentCount] = sentenceId;
}
m_documentNameLength += endName-startName;
m_documentCount++;
return true;
}
// good ol' quick sort
void SuffixArray::Sort(INDEX start, INDEX end)
{
@ -162,7 +228,6 @@ int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
// cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
@ -272,13 +337,73 @@ void SuffixArray::List(INDEX start, INDEX end)
}
}
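// Prints one header line "QUERY\t<phrase>\t<n> matches", then one line per
// match: the matching sentence, preceded by its document name and a tab
// when a document index is in use.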
void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase )
{
cout << "QUERY\t";
for(size_t i=0; i<phrase.size(); i++) {
if (i>0) cout << " ";
cout << phrase[i];
}
cout << '\t';
INDEX start = 0;
INDEX end = m_size-1;
INDEX mid = FindFirst( phrase, start, end );
if (mid == m_size) { // no matches
cout << "0 matches" << endl;
return;
}
INDEX firstMatch = FindLast( phrase, mid, start, -1 );
INDEX lastMatch = FindLast( phrase, mid, end, 1 );
// loop through all matches
cout << (lastMatch-firstMatch+1) << " matches" << endl;
for(INDEX i=firstMatch; i<=lastMatch; i++) {
// get sentence information
INDEX pos = GetPosition( i );
INDEX start = pos - GetWordInSentence( pos );
char length = GetSentenceLength( GetSentence( pos ) );
// print document name
if (m_useDocument) {
INDEX sentence = GetSentence( pos );
INDEX document = GetDocument( sentence );
PrintDocumentName( document );
cout << '\t';
}
// print sentence
for(char i=0; i<length; i++) {
if (i>0) cout << " ";
cout << GetWord( start + i );
}
cout << endl;
}
}
SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const
{
// binary search
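// m_document[d] holds the id of the first sentence of document d (recorded
// in ProcessDocumentLine), so the enclosing document is the last entry
// whose start is <= the given sentence id.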
INDEX min = 0;
INDEX max = m_documentCount-1;
if (sentence >= m_document[max]) {
return max;
}
while(true) {
INDEX mid = (min + max) / 2;
if (sentence >= m_document[mid] && sentence < m_document[mid+1]) {
return mid;
}
if (sentence < m_document[mid]) {
max = mid-1;
} else {
min = mid+1;
}
}
}
void SuffixArray::Save(const string& fileName ) const
{
FILE *pFile = fopen ( fileName.c_str() , "w" );
if (pFile == NULL) {
cerr << "Cannot open " << fileName << endl;
exit(1);
}
if (pFile == NULL) Error("cannot open",fileName);
fwrite( &m_size, sizeof(INDEX), 1, pFile );
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
@ -288,6 +413,16 @@ void SuffixArray::Save(const string& fileName ) const
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
char useDocument = m_useDocument; // not sure if that is needed
fwrite( &useDocument, sizeof(char), 1, pFile );
if (m_useDocument) {
fwrite( &m_documentCount, sizeof(INDEX), 1, pFile );
fwrite( m_document, sizeof(INDEX), m_documentCount, pFile );
fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile );
fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile );
fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile );
}
fclose( pFile );
m_vcb.Save( fileName + ".src-vcb" );
@ -296,56 +431,81 @@ void SuffixArray::Save(const string& fileName ) const
void SuffixArray::Load(const string& fileName )
{
FILE *pFile = fopen ( fileName.c_str() , "r" );
if (pFile == NULL) {
cerr << "no such file or directory " << fileName << endl;
exit(1);
}
if (pFile == NULL) Error("no such file or directory", fileName);
cerr << "loading from " << fileName << endl;
fread( &m_size, sizeof(INDEX), 1, pFile );
fread( &m_size, sizeof(INDEX), 1, pFile )
|| Error("could not read m_size from", fileName);
cerr << "words in corpus: " << m_size << endl;
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
CheckAllocation(m_array != NULL, "m_array");
CheckAllocation(m_index != NULL, "m_index");
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
CheckAllocation(m_sentence != NULL, "m_sentence");
fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus
|| Error("could not read m_array from", fileName);
fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array
|| Error("could not read m_index from", fileName);
fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index
|| Error("could not read m_wordInSentence from", fileName);
fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index
|| Error("could not read m_sentence from", fileName);
if (m_array == NULL) {
cerr << "Error: cannot allocate memory to m_array" << endl;
exit(1);
}
if (m_index == NULL) {
cerr << "Error: cannot allocate memory to m_index" << endl;
exit(1);
}
if (m_wordInSentence == NULL) {
cerr << "Error: cannot allocate memory to m_wordInSentence" << endl;
exit(1);
}
if (m_sentence == NULL) {
cerr << "Error: cannot allocate memory to m_sentence" << endl;
exit(1);
}
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile )
|| Error("could not read m_sentenceCount from", fileName);
cerr << "sentences in corpus: " << m_sentenceCount << endl;
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
if (m_sentenceLength == NULL) {
cerr << "Error: cannot allocate memory to m_sentenceLength" << endl;
exit(1);
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile) // sentence length
|| Error("could not read m_sentenceLength from", fileName);
if (m_useDocument) { // do not read it when you do not need it
char useDocument;
fread( &useDocument, sizeof(char), 1, pFile )
|| Error("could not read m_useDocument from", fileName);
if (!useDocument) {
cerr << "Error: stored suffix array does not have a document index\n";
exit(1);
}
fread( &m_documentCount, sizeof(INDEX), 1, pFile )
|| Error("could not read m_documentCount from", fileName);
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");
fread( m_document, sizeof(INDEX), m_documentCount, pFile )
|| Error("could not read m_document from", fileName);
fread( m_documentName, sizeof(INDEX), m_documentCount, pFile )
|| Error("could not read m_documentName from", fileName);
fread( &m_documentNameLength, sizeof(INDEX), 1, pFile )
|| Error("could not read m_documentNameLength from", fileName);
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile )
|| Error("could not read m_document from", fileName);
}
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
fclose( pFile );
m_vcb.Load( fileName + ".src-vcb" );
}
void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const
{
if (check) return;
cerr << "Error: could not allocate memory for " << dataStructure << endl;
exit(1);
}
bool SuffixArray::Error( const char *message, const string &fileName) const
{
cerr << "Error: " << message << " " << fileName << endl;
exit(1);
return true; // yeah, i know.
}


@ -15,6 +15,12 @@ private:
INDEX *m_sentence;
char *m_sentenceLength;
WORD_ID m_endOfSentence;
INDEX *m_document;
INDEX *m_documentName;
char *m_documentNameBuffer;
size_t m_documentNameLength;
size_t m_documentCount;
bool m_useDocument;
Vocabulary m_vcb;
INDEX m_size;
INDEX m_sentenceCount;
@ -28,6 +34,7 @@ public:
~SuffixArray();
void Create(const std::string& fileName );
bool ProcessDocumentLine( const char* const, const size_t );
void Sort(INDEX start, INDEX end);
int CompareIndex( INDEX a, INDEX b ) const;
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
@ -40,6 +47,7 @@ public:
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
int Match( const std::vector< WORD > &phrase, INDEX index );
void List( INDEX start, INDEX end );
void PrintSentenceMatches( const std::vector< WORD > &phrase );
inline INDEX GetPosition( INDEX index ) const {
return m_index[ index ];
}
@ -58,6 +66,17 @@ public:
inline WORD GetWord( INDEX position ) const {
return m_vcb.GetWord( m_array[position] );
}
void UseDocument() {
m_useDocument = true;
}
INDEX GetDocument( INDEX sentence ) const;
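// m_documentName[document] is an offset into m_documentNameBuffer; the name
// is stored there with its terminating '\0' (see ProcessDocumentLine).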
void PrintDocumentName( INDEX document ) {
for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) {
std::cout << m_documentNameBuffer[ i ];
}
}
void Save(const std::string& fileName ) const;
void Load(const std::string& fileName );
void CheckAllocation(bool, const char *dataStructure) const;
bool Error( const char* message, const std::string& fileName) const;
};


@ -1,4 +1,5 @@
#include "SuffixArray.h"
#include "../util/tokenize.hh"
#include <getopt.h>
using namespace std;
@ -13,10 +14,12 @@ int main(int argc, char* argv[])
string query;
string fileNameSuffix;
string fileNameSource;
int loadFlag = false;
int saveFlag = false;
int createFlag = false;
int queryFlag = false;
bool loadFlag = false;
bool saveFlag = false;
bool createFlag = false;
bool queryFlag = false;
bool querySentenceFlag = false;
int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
while(1) {
@ -25,11 +28,14 @@ int main(int argc, char* argv[])
{"save", required_argument, 0, 's'},
{"create", required_argument, 0, 'c'},
{"query", required_argument, 0, 'q'},
{"query-sentence", required_argument, 0, 'Q'},
{"document", required_argument, 0, 'd'},
{"stdio", no_argument, 0, 'i'},
{"stdio-sentence", no_argument, 0, 'I'},
{0, 0, 0, 0}
};
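// Newly added modes: --query-sentence/-Q prints every matching sentence via
// PrintSentenceMatches, --stdio-sentence/-I does the same for queries read
// from STDIN, and --document/-d enables the document index so that matches
// can be printed with their document names.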
int option_index = 0;
int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
int c = getopt_long (argc, argv, "l:s:c:q:Q:iId", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 'l':
@ -48,17 +54,25 @@ int main(int argc, char* argv[])
query = string(optarg);
queryFlag = true;
break;
case 'Q':
query = string(optarg);
querySentenceFlag = true;
break;
case 'i':
stdioFlag = true;
break;
case 'I':
stdioFlag = true;
querySentenceFlag = true;
break;
case 'd':
suffixArray.UseDocument();
break;
default:
cerr << info;
exit(1);
}
}
if (stdioFlag) {
queryFlag = true;
}
// check if parameter settings are legal
if (saveFlag && !createFlag) {
@ -74,7 +88,7 @@ int main(int argc, char* argv[])
exit(1);
}
// do your thing
// get suffix array
if (createFlag) {
cerr << "will create\n";
cerr << "corpus is in " << fileNameSource << endl;
@ -88,16 +102,26 @@ int main(int argc, char* argv[])
cerr << "will load from " << fileNameSuffix << endl;
suffixArray.Load( fileNameSuffix );
}
// do something with it
if (stdioFlag) {
while(true) {
string query;
if (getline(cin, query, '\n').eof()) {
return 0;
}
cout << lookup( query ) << endl;
if (querySentenceFlag) {
vector< string > queryString = util::tokenize( query.c_str() );
suffixArray.PrintSentenceMatches( queryString );
} else {
cout << lookup( query ) << endl;
}
}
} else if (queryFlag) {
cout << lookup( query ) << endl;
} else if (querySentenceFlag) {
vector< string > queryString = util::tokenize( query.c_str() );
suffixArray.PrintSentenceMatches( queryString );
}
return 0;
}
@ -105,32 +129,6 @@ int main(int argc, char* argv[])
size_t lookup( string query )
{
cerr << "query is " << query << endl;
vector< string > queryString = tokenize( query.c_str() );
vector< string > queryString = util::tokenize( query.c_str() );
return suffixArray.Count( queryString );
}
// Duplicate of definition in util/tokenize.hh.
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
// use util at all.
vector<string> tokenize(const char input[])
{
vector< string > token;
bool betweenWords = true;
int start=0;
int i;
for(i = 0; input[i] != '\0'; i++) {
const bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;
betweenWords = false;
} else if (isSpace && !betweenWords) {
token.push_back( string( input+start, i-start ) );
betweenWords = true;
}
}
if (!betweenWords)
token.push_back( string( input+start, i-start ) );
return token;
}


@ -28,14 +28,16 @@ TEST_DIR: /home/moses-speedtest/phrase_tables/tests
TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
BASEBRANCH: RELEASE-2.1.1
MOSES_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-prof
MOSES_GOOGLE_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-gperftools
</pre>
The _MOSES\_REPO\_PATH_ is the place where you have set up and built moses.
The _DROP\_CACHES\_COMM_ is the command that would b eused to drop caches. It should run without needing root access.
The _DROP\_CACHES\_COMM_ is the command that would be used to drop caches. It should run without needing root access.
_TEST\_DIR_ is the directory where all the tests will reside.
_TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time.
_BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to the latest Moses stable release.
_MOSES\_PROFILER\_REPO_ is a path to a moses repository set up and built with profiling enabled. Optional if you want to produce profiling results.
_MOSES\_GOOGLE\_PROFILER\_REPO_ is the path to a moses repository set up and built with full tcmalloc and the profiler, and with shared linking, for use with gperftools.
### Creating tests
In order to create a test one should go into the TEST_DIR and create a new folder. That folder will be used for the name of the test.
@ -45,7 +47,7 @@ An example such configuration file is **test\_config**
<pre>
Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
Variants: vanilla, cached, ldpre, profile #Can't have cached without ldpre or vanilla
Variants: vanilla, cached, ldpre, profile, google-profiler #Can't have cached without ldpre or vanilla
</pre>
The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths.
@ -61,11 +63,21 @@ The _Variants:_ line specifies what type of tests should we run. This particular
If you want to produce profiler results for some tests, you need to specify the _MOSES\_PROFILER\_REPO_ in the config
```bash
git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-profile
cd mosesdecoder
cd mosesdecoder-profile
./bjam -j10 --with-cmph=/usr/include/ variant=profile
```
Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run.
Afterwards, for testcases which contain the **profile** keyword in **Variants**, you will see a directory inside _TEST\_LOG\_DIR_ which contains the **gprof** output from every run (files ending in **\_profile**).
#### Produce google profiler results.
If you want to produce google profiler results for some tests, you need to specify the _MOSES\_GOOGLE\_PROFILER\_REPO_ in the config
```bash
git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-google-profile
cd mosesdecoder
./bjam link=shared -j10 --full-tcmalloc --with-cmph=/usr/include/
```
Afterwards, for testcases which contain the **google-profiler** keyword in **Variants**, you will see a directory inside _TEST\_LOG\_DIR_ which contains the **google-profiler** output from every run (files prefixed with **pprof**). To analyze the output you need to use [pprof](http://google-perftools.googlecode.com/svn/trunk/doc/cpuprofile.html).
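A minimal sketch of analyzing one of those files (the binary path, test name and timestamp here are illustrative examples):

```bash
# Flat text report: top functions by CPU samples for one recorded run
pprof --text mosesdecoder-google-profile/bin/moses \
  /home/moses-speedtest/phrase_tables/testlogs/mytest/pprof_28.07.2015_14:29:49_mytest_vanilla
```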
### Running tests.
Running the tests is done through the **runtests.py** script.


@ -2,6 +2,7 @@
import os
import subprocess
import time
import shutil
from argparse import ArgumentParser
from testsuite_common import processLogLine
@ -26,16 +27,21 @@ def parse_cmd():
arguments = parser.parse_args()
return arguments
def repoinit(testconfig, profiler=True):
def repoinit(testconfig, profiler=None):
"""Determines revision and sets up the repo. If given the profiler optional
argument, wil init the profiler repo instead of the default one."""
revision = ''
#Update the repo
if profiler:
if profiler == "gnu-profiler":
if testconfig.repo_prof is not None:
os.chdir(testconfig.repo_prof)
else:
raise ValueError('Profiling repo is not defined')
elif profiler == "google-profiler":
if testconfig.repo_gprof is not None:
os.chdir(testconfig.repo_gprof)
else:
raise ValueError('Profiling repo is not defined')
else:
os.chdir(testconfig.repo)
#Checkout specific branch, else maintain main branch
@ -61,9 +67,10 @@ def repoinit(testconfig, profiler=True):
class Configuration:
"""A simple class to hold all of the configuration constatns"""
def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None):
def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None, repo_gprof=None):
self.repo = repo
self.repo_prof = repo_prof
self.repo_gprof = repo_gprof
self.drop_caches = drop_caches
self.tests = tests
self.testlogs = testlogs
@ -88,16 +95,17 @@ class Configuration:
class Test:
"""A simple class to contain all information about tests"""
def __init__(self, name, command, ldopts, permutations, prof_command=None):
def __init__(self, name, command, ldopts, permutations, prof_command=None, gprof_command=None):
self.name = name
self.command = command
self.prof_command = prof_command
self.gprof_command = gprof_command
self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet
self.permutations = permutations
def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None, moses_gprof_repo=None):
"""Parses the config file"""
command, ldopts, prof_command = '', '', None
command, ldopts, prof_command, gprof_command = '', '', None, None
permutations = []
fileopen = open(conffile, 'r')
for line in fileopen:
@ -108,8 +116,10 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
if opt == 'Command:':
command = args.replace('\n', '')
if moses_prof is not None: # Get optional command for profiling
if moses_prof_repo is not None: # Get optional command for profiling
prof_command = moses_prof_repo + '/bin/' + command
if moses_gprof_repo is not None: # Get optional command for google-perftools
gprof_command = moses_gprof_repo + '/bin/' + command
command = moses_repo + '/bin/' + command
elif opt == 'LDPRE:':
ldopts = args.replace('\n', '')
@ -118,14 +128,14 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
else:
raise ValueError('Unrecognized option ' + opt)
#We use the testdir as the name.
testcase = Test(testdir, command, ldopts, permutations, prof_command)
testcase = Test(testdir, command, ldopts, permutations, prof_command, gprof_command)
fileopen.close()
return testcase
def parse_testconfig(conffile):
"""Parses the config file for the whole testsuite."""
repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', ''
basebranch, baserev, repo_prof_path = '', '', None
basebranch, baserev, repo_prof_path, repo_gprof_path = '', '', None, None
fileopen = open(conffile, 'r')
for line in fileopen:
line = line.split('#')[0] # Discard comments
@ -146,10 +156,12 @@ def parse_testconfig(conffile):
baserev = args.replace('\n', '')
elif opt == 'MOSES_PROFILER_REPO:': # Optional
repo_prof_path = args.replace('\n', '')
elif opt == 'MOSES_GOOGLE_PROFILER_REPO:': # Optional
repo_gprof_path = args.replace('\n', '')
else:
raise ValueError('Unrecognized option ' + opt)
config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\
basebranch, baserev, repo_prof_path)
basebranch, baserev, repo_prof_path, repo_gprof_path)
fileopen.close()
return config
@ -160,7 +172,9 @@ def get_config():
config.additional_args(args.singletestdir, args.revision, args.branch)
revision = repoinit(config)
if config.repo_prof is not None:
repoinit(config, True)
repoinit(config, "gnu-profiler")
if config.repo_gprof is not None:
repoinit(config, "google-profiler")
config.set_revision(revision)
return config
@ -212,16 +226,27 @@ def write_gprof(command, name, variant, config):
executable_path = command.split(' ')[0] # Path to the moses binary
gprof_command = 'gprof ' + executable_path + ' ' + gmon_path + ' > ' + outputfile
subprocess.call([gprof_command], shell=True)
os.remove('gmon_path') # After we are done discard the gmon file
os.remove(gmon_path) # After we are done discard the gmon file
def execute_test(command, path, name, variant, config, profile=False):
def write_pprof(name, variant, config):
"""Copies the google-perftools profiler output to the corresponding test directory"""
output_dir = config.testlogs + '/' + name
if not os.path.exists(output_dir):
os.makedirs(output_dir)
outputfile = output_dir + '/pprof_' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant
shutil.move("/tmp/moses.prof", outputfile)
def execute_test(command, path, name, variant, config, profile=None):
"""Executes a testcase given a whole command, path to the test file output,
name of the test and variant tested. Config is the global configuration"""
subprocess.Popen([command], stdout=None, stderr=subprocess.PIPE, shell=True).communicate()
if not profile:
if profile is None:
write_log(path, name + '_' + variant, config)
else: # Basically produce a gmon output
elif profile == "gnu-profiler": # Basically produce a gmon output
write_gprof(command, name, variant, config)
elif profile == "google-profiler":
write_pprof(name, variant, config)
def execute_tests(testcase, cur_directory, config):
@ -271,9 +296,9 @@ def execute_tests(testcase, cur_directory, config):
if 'vanilla' in testcase.permutations:
whole_command = testcase.prof_command
execute_test(whole_command, time_path, testcase.name, 'profile', config, True)
execute_test(whole_command, time_path, testcase.name, 'profile', config, "gnu-profiler")
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, True)
execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, "gnu-profiler")
if 'ldpre' in testcase.permutations:
for opt in testcase.ldopts:
@ -282,13 +307,42 @@ def execute_tests(testcase, cur_directory, config):
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses:
whole_command = 'LD_PRELOAD=' + opt + testcase.prof_command
whole_command = 'LD_PRELOAD=' + opt + " " + testcase.prof_command
variant = 'profile_ldpre_' + opt
#test normal and cached
execute_test(whole_command, time_path, testcase.name, variant, config, True)
execute_test(whole_command, time_path, testcase.name, variant, config, "gnu-profiler")
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, True)
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, "gnu-profiler")
#Google-perftools profiler
if 'google-profiler' in testcase.permutations:
subprocess.call(['sync'], shell=True) # Drop caches first
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses
whole_command = "CPUPROFILE=/tmp/moses.prof " + testcase.gprof_command
#test normal and cached
execute_test(whole_command, time_path, testcase.name, 'vanilla', config, 'google-profiler')
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, 'vanilla_cached', config, 'google-profiler')
#Now perform LD_PRELOAD tests
if 'ldpre' in testcase.permutations:
for opt in testcase.ldopts:
#Clear caches
subprocess.call(['sync'], shell=True)
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses:
whole_command = 'LD_PRELOAD=' + opt + ' CPUPROFILE=/tmp/moses.prof ' + testcase.gprof_command # rebuild per option so LD_PRELOADs don't accumulate
variant = 'ldpre_' + opt
#test normal and cached
execute_test(whole_command, time_path, testcase.name, variant, config, 'google-profiler')
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, 'google-profiler')
# Go through all the test directories and executes tests
@ -319,7 +373,7 @@ if __name__ == '__main__':
for logfile in os.listdir(CONFIG.testlogs):
logfile_name = CONFIG.testlogs + '/' + logfile
if not check_for_basever(logfile_name, CONFIG.basebranch):
if os.path.isfile(logfile_name) and not check_for_basever(logfile_name, CONFIG.basebranch):
logfile = logfile.replace('_vanilla', '')
logfile = logfile.replace('_cached', '')
logfile = logfile.replace('_ldpre', '')
@ -330,7 +384,7 @@ if __name__ == '__main__':
#Create a new configuration for base version tests:
BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\
CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\
CONFIG.baserev, CONFIG.repo_prof)
CONFIG.baserev, CONFIG.repo_prof, CONFIG.repo_gprof)
BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch)
#Set up the repository and get its revision:
REVISION = repoinit(BASECONFIG)
@ -340,20 +394,28 @@ if __name__ == '__main__':
subprocess.call(['./previous.sh'], shell=True)
#If profiler configuration exists also init it
if BASECONFIG.repo_prof is not None:
repoinit(BASECONFIG, True)
repoinit(BASECONFIG, "gnu-profiler")
os.chdir(BASECONFIG.repo_prof)
subprocess.call(['./previous.sh'], shell=True)
if BASECONFIG.repo_gprof is not None:
repoinit(BASECONFIG, "google-profiler")
os.chdir(BASECONFIG.repo_gprof)
subprocess.call(['./previous.sh'], shell=True)
#Perform tests
for directory in FIRSTTIME:
cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\
'/config', directory, BASECONFIG.repo)
'/config', directory, BASECONFIG.repo, BASECONFIG.repo_prof, BASECONFIG.repo_gprof)
execute_tests(cur_testcase, directory, BASECONFIG)
#Reset back the repository to the normal configuration
repoinit(CONFIG)
if BASECONFIG.repo_prof is not None:
repoinit(CONFIG, True)
repoinit(CONFIG, "gnu-profiler")
if BASECONFIG.repo_gprof is not None:
repoinit(CONFIG, "google-profiler")
#Builds moses
os.chdir(CONFIG.repo)
@ -362,12 +424,16 @@ if __name__ == '__main__':
os.chdir(CONFIG.repo_prof)
subprocess.call(['./previous.sh'], shell=True)
if CONFIG.repo_gprof is not None:
os.chdir(CONFIG.repo_gprof)
subprocess.call(['./previous.sh'], shell=True)
if CONFIG.singletest:
TESTCASE = parse_configfile(CONFIG.tests + '/' +\
CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo)
CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
execute_tests(TESTCASE, CONFIG.singletest, CONFIG)
else:
for directory in ALL_DIR:
cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\
'/config', directory, CONFIG.repo)
'/config', directory, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
execute_tests(cur_testcase, directory, CONFIG)


@ -1,6 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="pruneGeneration" InternalType="Console">
<Plugins>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
@ -13,9 +16,6 @@
"parentProject": ""
}]]]>
</Plugin>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
@ -44,8 +44,10 @@
<LibraryPath Value="../../../contrib/other-builds/moses/Debug"/>
<Library Value="boost_filesystem"/>
<Library Value="boost_system"/>
<Library Value="boost_iostreams"/>
<Library Value="moses"/>
<Library Value="z"/>
<Library Value="bz2"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>


@ -13,7 +13,7 @@ with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ;
if $(with-xmlrpc-c) {
echo While building mosesserver ... ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "!!! You are linking the XMLRPC-C library; Do NOT use v.1.25.29 !!!" ;
echo "!!! You are linking the XMLRPC-C library; Must be v.1.32 (September 2012) or higher !!!" ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
build-moses-server = true ;


@ -740,20 +740,23 @@ int main(int argc, char** argv)
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
/* CODE FOR old xmlrpc-c v. 1.32 or lower
xmlrpc_c::serverAbyss myAbyssServer(
myRegistry,
port, // TCP port on which to listen
logfile
);
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
*/
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04 */
xmlrpc_c::serverAbyss myAbyssServer(
xmlrpc_c::serverAbyss::constrOpt()
.registryPtr(&myRegistry)
.registryP(&myRegistry)
.portNumber(port) // TCP port on which to listen
.logFileName(logfile)
.allowOrigin("*")
.maxConn((unsigned int)numThreads)
);
*/
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {


@ -5,10 +5,7 @@
#include <vector>
#include "StatisticsBasedScorer.h"
#include "moses/FF/InternalTree.h"
using Moses::TreePointer;
using Moses::InternalTree;
#include "InternalTree.h"
namespace MosesTuning
{

mert/InternalTree.cpp (new file, 110 lines)

@ -0,0 +1,110 @@
#include "InternalTree.h"
namespace MosesTuning
{
InternalTree::InternalTree(const std::string & line, const bool terminal):
m_isTerminal(terminal)
{
size_t found = line.find_first_of("[] ");
if (found == line.npos) {
m_value = line;
}
else {
AddSubTree(line, 0);
}
}
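// Parses the bracketed tree notation, e.g. (hypothetical input):
//   "[S [NP the house] [VP is small]]"
// '[' opens a nonterminal, ']' closes it, and bare tokens are terminals.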
size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
{
std::string value;
char token = 0;
while (token != ']' && pos != std::string::npos) {
size_t oldpos = pos;
pos = line.find_first_of("[] ", pos);
if (pos == std::string::npos) break;
token = line[pos];
value = line.substr(oldpos,pos-oldpos);
if (token == '[') {
if (m_value.size() > 0) {
m_children.push_back(boost::make_shared<InternalTree>(value,false));
pos = m_children.back()->AddSubTree(line, pos+1);
} else {
if (value.size() > 0) {
m_value = value;
}
pos = AddSubTree(line, pos+1);
}
} else if (token == ' ' || token == ']') {
if (value.size() > 0 && !(m_value.size() > 0)) {
m_value = value;
} else if (value.size() > 0) {
m_isTerminal = false;
m_children.push_back(boost::make_shared<InternalTree>(value,true));
}
if (token == ' ') {
pos++;
}
}
if (m_children.size() > 0) {
m_isTerminal = false;
}
}
if (pos == std::string::npos) {
return line.size();
}
return std::min(line.size(),pos+1);
}
std::string InternalTree::GetString(bool start) const
{
std::string ret = "";
if (!start) {
ret += " ";
}
if (!m_isTerminal) {
ret += "[";
}
ret += m_value;
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
ret += (*it)->GetString(false);
}
if (!m_isTerminal) {
ret += "]";
}
return ret;
}
void InternalTree::Combine(const std::vector<TreePointer> &previous)
{
std::vector<TreePointer>::iterator it;
bool found = false;
leafNT next_leafNT(this);
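// next_leafNT successively yields (into 'it') an iterator to each
// nonterminal leaf of this tree; each such leaf is then replaced by the
// corresponding previously constructed subtree.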
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
found = next_leafNT(it);
if (found) {
*it = *it_prev;
} else {
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
}
}
}
}

mert/InternalTree.h (new file, 77 lines)

@ -0,0 +1,77 @@
#pragma once
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include "util/generator.hh"
#include "util/exception.hh"
namespace MosesTuning
{
class InternalTree;
typedef boost::shared_ptr<InternalTree> TreePointer;
typedef int NTLabel;
class InternalTree
{
std::string m_value;
std::vector<TreePointer> m_children;
bool m_isTerminal;
public:
InternalTree(const std::string & line, const bool terminal = false);
InternalTree(const InternalTree & tree):
m_value(tree.m_value),
m_isTerminal(tree.m_isTerminal) {
const std::vector<TreePointer> & children = tree.m_children;
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
m_children.push_back(boost::make_shared<InternalTree>(**it));
}
}
size_t AddSubTree(const std::string & line, size_t start);
std::string GetString(bool start = true) const;
void Combine(const std::vector<TreePointer> &previous);
const std::string & GetLabel() const {
return m_value;
}
size_t GetLength() const {
return m_children.size();
}
std::vector<TreePointer> & GetChildren() {
return m_children;
}
bool IsTerminal() const {
return m_isTerminal;
}
bool IsLeafNT() const {
return (!m_isTerminal && m_children.size() == 0);
}
};
// Python-like generator that yields next nonterminal leaf on every call
$generator(leafNT)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
leafNT(InternalTree* root = 0): tree(root) {}
$emit(std::vector<TreePointer>::iterator)
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
$yield(it);
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) { // normal pointer to same object that TreePointer points to
$restart(tree = (*it).get());
}
}
}
$stop;
};
}


@ -30,7 +30,7 @@ InterpolatedScorer.cpp
Point.cpp
PerScorer.cpp
HwcmScorer.cpp
../moses/FF/InternalTree.cpp
InternalTree.cpp
Scorer.cpp
ScorerFactory.cpp
Optimizer.cpp


@ -28,7 +28,8 @@ BaseManager::GetSource() const
}
const ttasksptr
BaseManager::GetTtask() const {
BaseManager::GetTtask() const
{
return m_ttask.lock();
}


@ -167,7 +167,14 @@ run_as_server()
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
xmlrpc_c::serverAbyss myAbyssServer(myRegistry, port, logfile);
xmlrpc_c::serverAbyss myAbyssServer(
xmlrpc_c::serverAbyss::constrOpt()
.registryP(&myRegistry)
.portNumber(port) // TCP port on which to listen
.logFileName(logfile)
.allowOrigin("*")
.maxConn((unsigned int)num_threads)
);
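// constrOpt() is xmlrpc-c's builder-style server configuration; compared to
// the old (registry, port, logfile) constructor it also lets us set
// allowOrigin for cross-origin requests and cap connections at the thread count.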
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {


@ -1,27 +1,24 @@
#include "InternalTree.h"
#include "moses/StaticData.h"
namespace Moses
{
InternalTree::InternalTree(const std::string & line, size_t start, size_t len, const bool terminal):
m_value_nt(0),
m_isTerminal(terminal)
InternalTree::InternalTree(const std::string & line, size_t start, size_t len, const bool nonterminal)
{
if (len > 0) {
m_value.assign(line, start, len);
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(start, len), nonterminal);
}
}
InternalTree::InternalTree(const std::string & line, const bool terminal):
m_value_nt(0),
m_isTerminal(terminal)
InternalTree::InternalTree(const std::string & line, const bool nonterminal)
{
size_t found = line.find_first_of("[] ");
if (found == line.npos) {
m_value = line;
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), line, nonterminal);
} else {
AddSubTree(line, 0);
}
@ -32,6 +29,7 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
char token = 0;
size_t len = 0;
bool has_value = false;
while (token != ']' && pos != std::string::npos) {
size_t oldpos = pos;
@ -41,30 +39,27 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
len = pos-oldpos;
if (token == '[') {
if (!m_value.empty()) {
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, false));
if (has_value) {
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, true));
pos = m_children.back()->AddSubTree(line, pos+1);
} else {
if (len > 0) {
m_value.assign(line, oldpos, len);
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(oldpos, len), false);
has_value = true;
}
pos = AddSubTree(line, pos+1);
}
} else if (token == ' ' || token == ']') {
if (len > 0 && m_value.empty()) {
m_value.assign(line, oldpos, len);
if (len > 0 && !has_value) {
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(oldpos, len), true);
has_value = true;
} else if (len > 0) {
m_isTerminal = false;
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, true));
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, false));
}
if (token == ' ') {
pos++;
}
}
if (!m_children.empty()) {
m_isTerminal = false;
}
}
if (pos == std::string::npos) {
@ -82,16 +77,16 @@ std::string InternalTree::GetString(bool start) const
ret += " ";
}
if (!m_isTerminal) {
if (!IsTerminal()) {
ret += "[";
}
ret += m_value;
ret += m_value.GetString(StaticData::Instance().GetOutputFactorOrder(), false);
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
ret += (*it)->GetString(false);
}
if (!m_isTerminal) {
if (!IsTerminal()) {
ret += "]";
}
return ret;
@ -120,13 +115,13 @@ void InternalTree::Unbinarize()
{
// nodes with virtual label cannot be unbinarized
if (m_value.empty() || m_value[0] == '^') {
if (m_value.GetString(0).empty() || m_value.GetString(0).as_string()[0] == '^') {
return;
}
//if node has child that is virtual node, get unbinarized list of children
for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLabel()[0] == '^') {
if (!(*it)->IsTerminal() && (*it)->GetLabel().GetString(0).as_string()[0] == '^') {
std::vector<TreePointer> new_children;
GetUnbinarizedChildren(new_children);
m_children = new_children;
@ -144,8 +139,8 @@ void InternalTree::Unbinarize()
void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
{
for (std::vector<TreePointer>::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) {
const std::string &label = (*itx)->GetLabel();
if (!label.empty() && label[0] == '^') {
const StringPiece label = (*itx)->GetLabel().GetString(0);
if (!label.empty() && label.as_string()[0] == '^') {
(*itx)->GetUnbinarizedChildren(ret);
} else {
ret.push_back(*itx);
@ -153,7 +148,7 @@ void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
}
}
bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
bool InternalTree::FlatSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -163,7 +158,7 @@ bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer
return false;
}
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
bool InternalTree::RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -178,7 +173,7 @@ bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePo
return false;
}
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
bool InternalTree::RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -194,88 +189,4 @@ bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePo
return false;
}
bool InternalTree::FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(label, it2)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
parent = this;
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(label, it2, parent)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(labels, it2)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
parent = this;
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(labels, it2, parent)) {
it = it2;
return true;
}
}
return false;
}
}


@ -5,30 +5,28 @@
#include <map>
#include <vector>
#include "FFState.h"
#include "moses/Word.h"
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include "util/generator.hh"
#include "util/exception.hh"
#include "util/string_piece.hh"
namespace Moses
{
class InternalTree;
typedef boost::shared_ptr<InternalTree> TreePointer;
typedef int NTLabel;
class InternalTree
{
std::string m_value;
NTLabel m_value_nt;
Word m_value;
std::vector<TreePointer> m_children;
bool m_isTerminal;
public:
InternalTree(const std::string & line, size_t start, size_t len, const bool terminal);
InternalTree(const std::string & line, const bool terminal = false);
InternalTree(const std::string & line, const bool nonterminal = true);
InternalTree(const InternalTree & tree):
m_value(tree.m_value),
m_isTerminal(tree.m_isTerminal) {
m_value(tree.m_value) {
const std::vector<TreePointer> & children = tree.m_children;
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
m_children.push_back(boost::make_shared<InternalTree>(**it));
@ -40,20 +38,10 @@ public:
void Combine(const std::vector<TreePointer> &previous);
void Unbinarize();
void GetUnbinarizedChildren(std::vector<TreePointer> &children) const;
const std::string & GetLabel() const {
const Word & GetLabel() const {
return m_value;
}
// optionally identify label by int instead of string;
// allows abstraction if multiple nonterminal strings should map to same label.
const NTLabel & GetNTLabel() const {
return m_value_nt;
}
void SetNTLabel(NTLabel value) {
m_value_nt = value;
}
size_t GetLength() const {
return m_children.size();
}
@ -62,38 +50,22 @@ public:
}
bool IsTerminal() const {
return m_isTerminal;
return !m_value.IsNonTerminal();
}
bool IsLeafNT() const {
return (!m_isTerminal && m_children.size() == 0);
return (m_value.IsNonTerminal() && m_children.size() == 0);
}
// different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents.
// can be used for formulating syntax constraints.
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
bool FlatSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// use NTLabel for search to reduce number of string comparisons / deal with synonymous labels
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// pass vector of possible labels to search
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
bool RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// Python-like generator that yields next nonterminal leaf on every call
$generator(leafNT) {


@ -75,7 +75,7 @@ void Model1Vocabulary::Load(const std::string& fileName)
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned id = Scan<unsigned>(tokens[0]);
unsigned id = std::atoll( tokens[0].c_str() );
if (! ( (id == 1) && (tokens[1] == "UNK") )) {
const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is known and filter the model on loading?
bool stored = Store(factor, id);
@ -86,7 +86,7 @@ void Model1Vocabulary::Load(const std::string& fileName)
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned id = Scan<unsigned>(tokens[0]);
unsigned id = std::atoll( tokens[0].c_str() );
const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is known and filter the model on loading?
bool stored = Store(factor, id);
UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
@ -105,11 +105,11 @@ void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabular
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned idS = Scan<unsigned>(tokens[0]);
unsigned idT = Scan<unsigned>(tokens[1]);
unsigned idS = std::atoll( tokens[0].c_str() );
unsigned idT = std::atoll( tokens[1].c_str() );
const Factor* wordS = vcbS.GetWord(idS);
const Factor* wordT = vcbT.GetWord(idT);
float prob = Scan<float>(tokens[2]);
float prob = std::atof( tokens[2].c_str() );
if ( (wordS != NULL) && (wordT != NULL) ) {
m_ltable[ wordS ][ wordT ] = prob;
}


@ -134,7 +134,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (targetPhrase.GetAlignNonTerm().GetSize() != 0) {
// Initialize phrase orientation scoring object
MosesTraining::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(),
MosesTraining::Syntax::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(),
targetPhrase.GetAlignTerm(), targetPhrase.GetAlignNonTerm());
PhraseOrientationFeature::ReoClassData* reoClassData = new PhraseOrientationFeature::ReoClassData();
@ -150,7 +150,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
// LEFT-TO-RIGHT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_L2R);
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::Syntax::GHKM::PhraseOrientation::REO_DIR_L2R);
if ( ((targetIndex == 0) || !phraseOrientation.TargetSpanIsAligned(0,targetIndex)) // boundary non-terminal in rule-initial position (left boundary)
&& (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule
@ -170,7 +170,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (reoClassData->firstNonTerminalPreviousSourceSpanIsAligned &&
reoClassData->firstNonTerminalFollowingSourceSpanIsAligned) {
// discontinuous
l2rOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
l2rOrientation = MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
} else {
reoClassData->firstNonTerminalIsBoundary = true;
}
@ -180,7 +180,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
// RIGHT-TO-LEFT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_R2L);
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::Syntax::GHKM::PhraseOrientation::REO_DIR_R2L);
if ( ((targetIndex == targetPhrase.GetSize()-1) || !phraseOrientation.TargetSpanIsAligned(targetIndex,targetPhrase.GetSize()-1)) // boundary non-terminal in rule-final position (right boundary)
&& (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule
@ -200,7 +200,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (reoClassData->lastNonTerminalPreviousSourceSpanIsAligned &&
reoClassData->lastNonTerminalFollowingSourceSpanIsAligned) {
// discontinuous
r2lOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
r2lOrientation = MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
} else {
reoClassData->lastNonTerminalIsBoundary = true;
}
@ -335,25 +335,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
// LEFT-TO-RIGHT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT];
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT];
IFFEATUREVERBOSE(2) {
FEATUREVERBOSE(2, "l2rOrientation ");
switch (l2rOrientation) {
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT:
FEATUREVERBOSE2(2, "mono" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
FEATUREVERBOSE2(2, "swap" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
FEATUREVERBOSE2(2, "dleft" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
FEATUREVERBOSE2(2, "dright" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_MSLR
FEATUREVERBOSE2(2, "unknown->dleft" << std::endl);
break;
default:
@ -396,23 +396,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
} else {
if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
newScores[0] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityMono());
// if sub-derivation has left-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x1, newScores, accumulator);
} else if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
newScores[1] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilitySwap());
// if sub-derivation has left-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x2, newScores, accumulator);
} else if ( ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
newScores[2] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous());
// if sub-derivation has left-boundary non-terminal:
@ -437,25 +437,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
// RIGHT-TO-LEFT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT];
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT];
IFFEATUREVERBOSE(2) {
FEATUREVERBOSE(2, "r2lOrientation ");
switch (r2lOrientation) {
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT:
FEATUREVERBOSE2(2, "mono" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
FEATUREVERBOSE2(2, "swap" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
FEATUREVERBOSE2(2, "dleft" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
FEATUREVERBOSE2(2, "dright" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_MSLR
FEATUREVERBOSE2(2, "unknown->dleft" << std::endl);
break;
default:
@ -498,23 +498,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
} else {
if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
newScores[m_offsetR2LScores+0] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityMono());
// if sub-derivation has right-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
RightBoundaryR2LScoreRecursive(featureID, prevState, 0x1, newScores, accumulator);
} else if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
newScores[m_offsetR2LScores+1] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilitySwap());
// if sub-derivation has right-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
RightBoundaryR2LScoreRecursive(featureID, prevState, 0x2, newScores, accumulator);
} else if ( ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
newScores[m_offsetR2LScores+2] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous());
// if sub-derivation has right-boundary non-terminal:
@ -862,17 +862,17 @@ void PhraseOrientationFeature::SparseNonTerminalR2LScore(const Factor* nonTermin
}
const std::string* PhraseOrientationFeature::ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const
const std::string* PhraseOrientationFeature::ToString(const MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS o) const
{
if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
return &MORIENT;
} else if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
return &SORIENT;
} else if ( ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
return &DORIENT;
} else {

View File

@ -302,8 +302,8 @@ public:
struct ReoClassData {
public:
std::vector<MosesTraining::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassL2R;
std::vector<MosesTraining::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassR2L;
std::vector<MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassL2R;
std::vector<MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassR2L;
bool firstNonTerminalIsBoundary;
bool firstNonTerminalPreviousSourceSpanIsAligned;
bool firstNonTerminalFollowingSourceSpanIsAligned;
@ -401,7 +401,7 @@ protected:
ScoreComponentCollection* scoreBreakdown,
const std::string* o) const;
const std::string* ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const;
const std::string* ToString(const MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS o) const;
static const std::string MORIENT;
static const std::string SORIENT;

View File

@ -16,21 +16,29 @@ namespace Moses
PhrasePairFeature::PhrasePairFeature(const std::string &line)
:StatelessFeatureFunction(0, line)
,m_unrestricted(false)
,m_simple(true)
,m_sourceContext(false)
,m_domainTrigger(false)
,m_ignorePunctuation(false)
{
std::cerr << "Initializing PhrasePairFeature.." << std::endl;
VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
ReadParameters();
if (m_simple == 1) std::cerr << "using simple phrase pairs.. ";
if (m_sourceContext == 1) std::cerr << "using source context.. ";
if (m_domainTrigger == 1) std::cerr << "using domain triggers.. ";
if (m_simple == 1) VERBOSE(1, " Using simple phrase pairs.");
if (m_sourceContext == 1) VERBOSE(1, " Using source context.");
if (m_domainTrigger == 1) VERBOSE(1, " Using domain triggers.");
// compile a list of punctuation characters
if (m_ignorePunctuation) {
std::cerr << "ignoring punctuation for triggers.. ";
VERBOSE(1, " Ignoring punctuation for triggers.");
char punctuation[] = "\"'!?¿·()#_,.:;•&@/\\0123456789~=";
for (size_t i=0; i < sizeof(punctuation)-1; ++i)
for (size_t i=0; i < sizeof(punctuation)-1; ++i) {
m_punctuationHash[punctuation[i]] = 1;
}
}
VERBOSE(1, " Done." << std::endl);
}
void PhrasePairFeature::SetParameter(const std::string& key, const std::string& value)
@ -76,7 +84,7 @@ void PhrasePairFeature::Load()
}
inFileSource.close();
} else {
} else if (!m_unrestricted) {
// restricted source word vocabulary
ifstream inFileSource(m_filePathSource.c_str());
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);
@ -101,8 +109,6 @@ void PhrasePairFeature::Load()
}
inFileTarget.close();*/
m_unrestricted = false;
}
}
@ -114,25 +120,6 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
, ScoreComponentCollection *estimatedFutureScore) const
{
const Phrase& source = inputPath.GetPhrase();
if (m_simple) {
ostringstream namestr;
namestr << "pp_";
namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << ",";
namestr << sourceFactor->GetString();
}
namestr << "~";
namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
const Sentence& isnt = static_cast<const Sentence&>(input);
const bool use_topicid = isnt.GetUseTopicId();
@ -140,18 +127,18 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
// compute pair
ostringstream pair;
pair << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
pair << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
pair << ",";
pair << sourceFactor->GetString();
pair << "~";
pair << ReplaceTilde( sourceFactor->GetString() );
}
pair << "~";
pair << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
pair << "~~";
pair << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
pair << ",";
pair << targetFactor->GetString();
pair << "~";
pair << ReplaceTilde( targetFactor->GetString() );
}
if (use_topicid || use_topicid_prob) {
@ -159,7 +146,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
// use topicid as trigger
const long topicid = isnt.GetTopicId();
stringstream feature;
feature << "pp_";
feature << m_description << "_";
if (topicid == -1)
feature << "unk";
else
@ -173,13 +160,13 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
const vector<string> &topicid_prob = *(isnt.GetTopicIdAndProb());
if (atol(topicid_prob[0].c_str()) == -1) {
stringstream feature;
feature << "pp_unk_";
feature << m_description << "_unk_";
feature << pair.str();
scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
feature << "pp_";
feature << m_description << "_";
feature << topicid_prob[i];
feature << "_";
feature << pair.str();
@ -193,7 +180,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
string sourceTrigger = *p;
ostringstream namestr;
namestr << "pp_";
namestr << m_description << "_";
namestr << sourceTrigger;
namestr << "_";
namestr << pair.str();
@ -221,21 +208,21 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
if (m_unrestricted || sourceTriggerExists) {
ostringstream namestr;
namestr << "pp_";
namestr << m_description << "_";
namestr << sourceTrigger;
namestr << "~";
namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
namestr << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << ",";
namestr << sourceFactor->GetString();
namestr << "~";
namestr << ReplaceTilde( sourceFactor->GetString() );
}
namestr << "~";
namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
namestr << "~~";
namestr << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
namestr << "~";
namestr << ReplaceTilde( targetFactor->GetString() );
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
@ -244,6 +231,31 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
}
}
void PhrasePairFeature::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{
if (m_simple) {
ostringstream namestr;
namestr << m_description << "_";
namestr << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << "~";
namestr << ReplaceTilde( sourceFactor->GetString() );
}
namestr << "~~";
namestr << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << "~";
namestr << ReplaceTilde( targetFactor->GetString() );
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
}
bool PhrasePairFeature::IsUseable(const FactorMask &mask) const
{
bool ret = mask[m_targetFactorId];
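
Note: the simple phrase-pair feature now fires from EvaluateInIsolation, and its name joins words with '~' and separates source from target with "~~", escaping literal tildes via ReplaceTilde (the old format used commas and a single '~'). A standalone sketch of the resulting sparse feature name, not Moses code; the prefix "pp" stands in for m_description (whose actual default is the instance name), and the words are toy data:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mirrors PhrasePairFeature::ReplaceTilde from the patch.
std::string ReplaceTilde(const std::string &in) {
  std::string out = in;
  size_t pos = out.find('~');
  while (pos != std::string::npos) {
    out.replace(pos, 1, "<TILDE>");
    pos = out.find('~', pos);
  }
  return out;
}

int main() {
  std::vector<std::string> src, tgt;
  src.push_back("la"); src.push_back("maison~bleue");
  tgt.push_back("the"); tgt.push_back("blue"); tgt.push_back("house");
  std::ostringstream name;
  name << "pp" << "_" << ReplaceTilde(src[0]);
  for (size_t i = 1; i < src.size(); ++i) name << "~" << ReplaceTilde(src[i]);
  name << "~~" << ReplaceTilde(tgt[0]);
  for (size_t i = 1; i < tgt.size(); ++i) name << "~" << ReplaceTilde(tgt[i]);
  std::cout << name.str() << std::endl;
  // prints: pp_la~maison<TILDE>bleue~~the~blue~house
  return 0;
}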

View File

@ -1,5 +1,4 @@
#ifndef moses_PhrasePairFeature_h
#define moses_PhrasePairFeature_h
#pragma once
#include <stdexcept>
#include <boost/unordered_set.hpp>
@ -32,6 +31,16 @@ class PhrasePairFeature: public StatelessFeatureFunction
CharHash m_punctuationHash;
std::string m_filePathSource;
inline std::string ReplaceTilde(const StringPiece &str) const {
std::string out = str.as_string();
size_t pos = out.find('~');
while ( pos != std::string::npos ) {
out.replace(pos,1,"<TILDE>");
pos = out.find('~',pos);
}
return out;
};
public:
PhrasePairFeature(const std::string &line);
@ -43,8 +52,7 @@ public:
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const {
}
, ScoreComponentCollection &estimatedFutureScore) const;
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {
@ -69,5 +77,3 @@ public:
}
#endif

View File

@ -12,7 +12,7 @@ namespace Moses
{
RulePairUnlexicalizedSource::RulePairUnlexicalizedSource(const std::string &line)
: StatelessFeatureFunction(0, line)
: StatelessFeatureFunction(1, line)
, m_glueRules(false)
, m_nonGlueRules(true)
, m_glueTargetLHSStr("Q")
@ -81,6 +81,9 @@ void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source
}
scoreBreakdown.PlusEquals(this, namestr.str(), 1);
if ( targetPhraseLHS != m_glueTargetLHS ) {
scoreBreakdown.PlusEquals(this, 1);
}
}
}

View File

@ -13,6 +13,7 @@ namespace Moses
SoftMatchingFeature::SoftMatchingFeature(const std::string &line)
: StatelessFeatureFunction(0, line)
, m_softMatches(moses_MaxNumNonterminals)
, m_scoreIdentical(true)
{
ReadParameters();
}
@ -26,6 +27,8 @@ void SoftMatchingFeature::SetParameter(const std::string& key, const std::string
} else if (key == "path") {
const std::string filePath = value;
Load(filePath);
} else if (key == "score-identical") {
m_scoreIdentical = Scan<bool>(value);
} else {
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
}
@ -80,8 +83,10 @@ void SoftMatchingFeature::EvaluateWhenApplied(const ChartHypothesis& hypo,
const ChartHypothesis* prevHypo = hypo.GetPrevHypo(nonTermInd);
const Word& prevLHS = prevHypo->GetTargetLHS();
const std::string &name = GetOrSetFeatureName(word, prevLHS);
accumulator->PlusEquals(this,name,1);
if ( (word != prevLHS) || m_scoreIdentical ) {
const std::string &name = GetOrSetFeatureName(word, prevLHS);
accumulator->PlusEquals(this,name,1);
}
}
}
}
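
Note: the new score-identical switch defaults to true, which preserves the old behaviour of firing a soft-match feature even when the substituted label equals the original one; setting it to false fires only on genuine substitutions. A hypothetical moses.ini feature line (the feature name and path are illustrative):

SoftMatching name=SM0 path=soft-matches.txt score-identical=false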

View File

@ -55,6 +55,7 @@ public:
private:
mutable std::vector<std::vector<Word> > m_softMatches; // map RHS of new rule to list of possible LHS of old rule (subtree)
mutable std::vector<std::vector<std::string> > m_nameCache;
bool m_scoreIdentical;
#ifdef WITH_THREADS
//reader-writer lock

View File

@ -38,9 +38,8 @@ void SourceWordDeletionFeature::SetParameter(const std::string& key, const std::
void SourceWordDeletionFeature::Load()
{
if (m_filename == "") {
if (m_filename.empty())
return;
}
FEATUREVERBOSE(1, "Loading source word deletion word list from " << m_filename << std::endl);
ifstream inFile(m_filename.c_str());

View File

@ -13,33 +13,12 @@ void TreeStructureFeature::Load()
// syntactic constraints can be hooked in here.
m_constraints = NULL;
m_labelset = NULL;
StaticData &staticData = StaticData::InstanceNonConst();
staticData.SetTreeStructure(this);
}
// define NT labels (ints) that are mapped from strings for quicker comparison.
void TreeStructureFeature::AddNTLabels(TreePointer root) const
{
std::string label = root->GetLabel();
if (root->IsTerminal()) {
return;
}
std::map<std::string, NTLabel>::const_iterator it = m_labelset->string_to_label.find(label);
if (it != m_labelset->string_to_label.end()) {
root->SetNTLabel(it->second);
}
std::vector<TreePointer> children = root->GetChildren();
for (std::vector<TreePointer>::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) {
AddNTLabels(*it2);
}
}
FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
, int featureID /* used to index the state in the previous hypotheses */
, ScoreComponentCollection* accumulator) const
@ -48,10 +27,6 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
const std::string *tree = property->GetValueString();
TreePointer mytree (boost::make_shared<InternalTree>(*tree));
if (m_labelset) {
AddNTLabels(mytree);
}
//get subtrees (in target order)
std::vector<TreePointer> previous_trees;
for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
@ -70,7 +45,7 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy
}
mytree->Combine(previous_trees);
bool full_sentence = (mytree->GetChildren().back()->GetLabel() == "</s>" || (mytree->GetChildren().back()->GetLabel() == "SEND" && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == "</s>"));
bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_send || (mytree->GetChildren().back()->GetLabel() == m_send_nt && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_send));
if (m_binarized && full_sentence) {
mytree->Unbinarize();
}

View File

@ -4,6 +4,7 @@
#include <map>
#include "StatefulFeatureFunction.h"
#include "FFState.h"
#include "moses/Word.h"
#include "InternalTree.h"
namespace Moses
@ -35,11 +36,18 @@ class TreeStructureFeature : public StatefulFeatureFunction
SyntaxConstraints* m_constraints;
LabelSet* m_labelset;
bool m_binarized;
Word m_send;
Word m_send_nt;
public:
TreeStructureFeature(const std::string &line)
:StatefulFeatureFunction(0, line)
, m_binarized(false) {
ReadParameters();
std::vector<FactorType> factors;
factors.push_back(0);
m_send.CreateFromString(Output, factors, "</s>", false);
m_send_nt.CreateFromString(Output, factors, "SEND", true);
}
~TreeStructureFeature() {
delete m_constraints;
@ -49,8 +57,6 @@ public:
return new TreeState(TreePointer());
}
void AddNTLabels(TreePointer root) const;
bool IsUseable(const FactorMask &mask) const {
return true;
}

View File

@ -110,7 +110,8 @@ void WordTranslationFeature::Load()
}
inFileSource.close();
} else {
} else if (!m_filePathSource.empty() || !m_filePathTarget.empty()) {
return;
// restricted source word vocabulary
ifstream inFileSource(m_filePathSource.c_str());
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);

View File

@ -213,7 +213,8 @@ RecombineCompare(const Hypothesis &compare) const
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL) {
comp = m_ffStates[i] - compare.m_ffStates[i];
// TODO: Can this situation actually occur?
comp = int(m_ffStates[i] != NULL) - int(compare.m_ffStates[i] != NULL);
} else {
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
}
@ -234,8 +235,8 @@ EvaluateWhenApplied(StatefulFeatureFunction const& sfff,
ttasksptr const& ttask = manager.GetTtask();
m_ffStates[state_idx] = sfff.EvaluateWhenAppliedWithContext
(ttask, *this, m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
&m_currScoreBreakdown);
(ttask, *this, m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
&m_currScoreBreakdown);
}
}
@ -585,7 +586,9 @@ OutputSurface(std::ostream &out, const Hypothesis &edge,
//preface surface form with UNK if marking unknowns
const Word &word = phrase.GetWord(pos);
if(markUnknown && word.IsOOV()) {
out << "UNK" << *factor;
out << StaticData::Instance().GetUnknownWordPrefix()
<< *factor
<< StaticData::Instance().GetUnknownWordSuffix();
} else {
out << *factor;
}
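
Note: the old recombination tie-break subtracted two unrelated FFState pointers, which is undefined behaviour in C++ and yields an arbitrary sign; the replacement compares only whether each state is set. A minimal sketch of the corrected comparison, not Moses code:

#include <cstddef>

// Returns -1, 0, or +1 depending on which of two possibly-NULL states is set.
int CompareNullness(const void *a, const void *b) {
  return int(a != NULL) - int(b != NULL);
}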

View File

@ -78,9 +78,9 @@ void LanguageModel::EvaluateInIsolation(const Phrase &source
float fullScore, nGramScore;
size_t oovCount;
if (targetPhrase.HasTtaskSPtr()){
if (targetPhrase.HasTtaskSPtr()) {
CalcScoreWithContext(targetPhrase.GetTtask(), targetPhrase, fullScore, nGramScore, oovCount);
}else{
} else {
CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
}
//CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

View File

@ -70,7 +70,7 @@ void RDLM::Load()
static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr);
}
static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head);
static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head.GetString(0).as_string());
static_start_head = lm_head_base_instance_->lookup_input_word("<start_head>");
static_start_label = lm_head_base_instance_->lookup_input_word("<start_label>");
@ -211,7 +211,7 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost
}
// ignore virtual nodes (in binarization; except if it's the root)
if (m_binarized && root->GetLabel()[0] == '^' && !ancestor_heads.empty()) {
if (m_binarized && root->GetLabel().GetString(0).as_string()[0] == '^' && !ancestor_heads.empty()) {
// recursion
if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) {
root = back_pointers.find(root)->second.get();
@ -241,9 +241,9 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost
// root of tree: score without context
if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
std::vector<int> ngram_head_null (static_head_null);
ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel());
ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel().GetString(m_factorType).as_string());
if (m_isPretermBackoff && ngram_head_null.back() == 0) {
ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel());
ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel().GetString(m_factorType).as_string());
}
if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) {
std::vector<int>::iterator it = ngram_head_null.begin();
@ -290,13 +290,13 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost
}
std::pair<int,int> head_ids;
InternalTree* found = GetHead(root, back_pointers, head_ids);
if (found == NULL) {
bool found = GetHead(root, back_pointers, head_ids);
if (!found) {
head_ids = std::make_pair(static_dummy_head, static_dummy_head);
}
size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
const std::string & head_label = root->GetLabel();
const std::string & head_label = root->GetLabel().GetString(0).as_string();
bool virtual_head = false;
int reached_end = 0;
int label_idx, label_idx_out;
@ -516,7 +516,7 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost
ancestor_labels.pop_back();
}
InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree* head_ptr) const
bool RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs) const
{
InternalTree *tree;
@ -527,52 +527,28 @@ InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_poin
tree = it->get();
}
if (m_binarized && tree->GetLabel()[0] == '^') {
head_ptr = GetHead(tree, back_pointers, IDs, head_ptr);
if (head_ptr != NULL && !m_isPTKVZ) {
return head_ptr;
if (m_binarized && tree->GetLabel().GetString(0).as_string()[0] == '^') {
bool found = GetHead(tree, back_pointers, IDs);
if (found) {
return true;
}
}
// assumption (only true for dependency parse): each constituent has a preterminal label, and corresponding terminal is head
// if constituent has multiple preterminals, first one is picked; if it has no preterminals, dummy_head is returned
else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal() && head_ptr == NULL) {
head_ptr = tree;
if (!m_isPTKVZ) {
GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs);
return head_ptr;
}
}
// add PTKVZ to lemma of verb
else if (m_isPTKVZ && head_ptr && tree->GetLabel() == "avz") {
InternalTree *tree2;
for (std::vector<TreePointer>::const_iterator it2 = tree->GetChildren().begin(); it2 != tree->GetChildren().end(); ++it2) {
if ((*it2)->IsLeafNT()) {
tree2 = back_pointers.find(it2->get())->second.get();
} else {
tree2 = it2->get();
}
if (tree2->GetLabel() == "PTKVZ" && tree2->GetLength() == 1 && tree2->GetChildren()[0]->IsTerminal()) {
std::string verb = tree2->GetChildren()[0]->GetLabel() + head_ptr->GetChildren()[0]->GetLabel();
GetIDs(verb, head_ptr->GetLabel(), IDs);
return head_ptr;
}
}
else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal()) {
GetIDs(tree->GetChildren()[0]->GetLabel(), tree->GetLabel(), IDs);
return true;
}
}
if (head_ptr != NULL) {
GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs);
}
return head_ptr;
return false;
}
void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_label, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const
{
std::pair<int,int> child_ids;
InternalTree* found;
size_t j = 0;
// score start label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes)
@ -616,13 +592,13 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac
continue;
}
found = GetHead(child, back_pointers, child_ids);
if (found == NULL) {
bool found = GetHead(child, back_pointers, child_ids);
if (!found) {
child_ids = std::make_pair(static_dummy_head, static_dummy_head);
}
labels[j] = lm_head->lookup_input_word(child->GetLabel());
labels_output[j] = lm_label->lookup_output_word(child->GetLabel());
labels[j] = lm_head->lookup_input_word(child->GetLabel().GetString(0).as_string());
labels_output[j] = lm_label->lookup_output_word(child->GetLabel().GetString(0).as_string());
heads[j] = child_ids.first;
heads_output[j] = child_ids.second;
j++;
@ -637,18 +613,18 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac
}
void RDLM::GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const
void RDLM::GetIDs(const Word & head, const Word & preterminal, std::pair<int,int> & IDs) const
{
IDs.first = lm_head_base_instance_->lookup_input_word(head);
IDs.first = lm_head_base_instance_->lookup_input_word(head.GetString(m_factorType).as_string());
if (m_isPretermBackoff && IDs.first == 0) {
IDs.first = lm_head_base_instance_->lookup_input_word(preterminal);
IDs.first = lm_head_base_instance_->lookup_input_word(preterminal.GetString(0).as_string());
}
if (m_sharedVocab) {
IDs.second = IDs.first;
} else {
IDs.second = lm_head_base_instance_->lookup_output_word(head);
IDs.second = lm_head_base_instance_->lookup_output_word(head.GetString(m_factorType).as_string());
if (m_isPretermBackoff && IDs.second == 0) {
IDs.second = lm_head_base_instance_->lookup_output_word(preterminal);
IDs.second = lm_head_base_instance_->lookup_output_word(preterminal.GetString(0).as_string());
}
}
}
@ -714,8 +690,6 @@ void RDLM::SetParameter(const std::string& key, const std::string& value)
m_path_head_lm = value;
} else if (key == "path_label_lm") {
m_path_label_lm = value;
} else if (key == "ptkvz") {
m_isPTKVZ = Scan<bool>(value);
} else if (key == "backoff") {
m_isPretermBackoff = Scan<bool>(value);
} else if (key == "context_up") {
@ -744,7 +718,9 @@ void RDLM::SetParameter(const std::string& key, const std::string& value)
else
UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value);
} else if (key == "glue_symbol") {
m_glueSymbol = value;
m_glueSymbolString = value;
} else if (key == "factor") {
m_factorType = Scan<FactorType>(value);
} else if (key == "cache_size") {
m_cacheSize = Scan<int>(value);
} else {

View File

@ -3,6 +3,7 @@
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/FFState.h"
#include "moses/FF/InternalTree.h"
#include "moses/Word.h"
#include <boost/thread/tss.hpp>
#include <boost/array.hpp>
@ -61,14 +62,14 @@ class RDLM : public StatefulFeatureFunction
nplm::neuralTM* lm_label_base_instance_;
mutable boost::thread_specific_ptr<nplm::neuralTM> lm_label_backend_;
std::string dummy_head;
std::string m_glueSymbol;
std::string m_startSymbol;
std::string m_endSymbol;
std::string m_endTag;
std::string m_glueSymbolString;
Word dummy_head;
Word m_glueSymbol;
Word m_startSymbol;
Word m_endSymbol;
Word m_endTag;
std::string m_path_head_lm;
std::string m_path_label_lm;
bool m_isPTKVZ;
bool m_isPretermBackoff;
size_t m_context_left;
size_t m_context_right;
@ -103,15 +104,12 @@ class RDLM : public StatefulFeatureFunction
int static_stop_label_output;
int static_start_label_output;
FactorType m_factorType;
public:
RDLM(const std::string &line)
: StatefulFeatureFunction(2, line)
, dummy_head("<dummy_head>")
, m_glueSymbol("Q")
, m_startSymbol("SSTART")
, m_endSymbol("SEND")
, m_endTag("</s>")
, m_isPTKVZ(false)
, m_glueSymbolString("Q")
, m_isPretermBackoff(true)
, m_context_left(3)
, m_context_right(0)
@ -122,8 +120,16 @@ public:
, m_normalizeLabelLM(false)
, m_sharedVocab(false)
, m_binarized(0)
, m_cacheSize(1000000) {
, m_cacheSize(1000000)
, m_factorType(0) {
ReadParameters();
std::vector<FactorType> factors;
factors.push_back(0);
dummy_head.CreateFromString(Output, factors, "<dummy_head>", false);
m_glueSymbol.CreateFromString(Output, factors, m_glueSymbolString, true);
m_startSymbol.CreateFromString(Output, factors, "SSTART", true);
m_endSymbol.CreateFromString(Output, factors, "SEND", true);
m_endTag.CreateFromString(Output, factors, "</s>", false);
}
~RDLM();
@ -133,9 +139,9 @@ public:
}
void Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float,4> &score, std::vector<int> &ancestor_heads, std::vector<int> &ancestor_labels, size_t &boundary_hash, int num_virtual = 0, int rescoring_levels = 0) const;
InternalTree* GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs, InternalTree * head_ptr=NULL) const;
bool GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs) const;
void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector<int> & heads, std::vector<int> & labels, std::vector<int> & heads_output, std::vector<int> & labels_output) const;
void GetIDs(const std::string & head, const std::string & preterminal, std::pair<int,int> & IDs) const;
void GetIDs(const Word & head, const Word & preterminal, std::pair<int,int> & IDs) const;
void ScoreFile(std::string &path); //for debugging
void PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const; //for debugging
@ -192,7 +198,7 @@ public:
_end = current->GetChildren().end();
iter = current->GetChildren().begin();
// expand virtual node
while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
while (binarized && !(*iter)->GetLabel().GetString(0).empty() && (*iter)->GetLabel().GetString(0).data()[0] == '^') {
stack.push_back(std::make_pair(current, iter));
// also go through trees or previous hypotheses to rescore nodes for which more context has become available
if ((*iter)->IsLeafNT()) {
@ -229,7 +235,7 @@ public:
}
}
// expand virtual node
while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') {
while (binarized && !(*iter)->GetLabel().GetString(0).empty() && (*iter)->GetLabel().GetString(0).data()[0] == '^') {
stack.push_back(std::make_pair(current, iter));
// also go through trees or previous hypotheses to rescore nodes for which more context has become available
if ((*iter)->IsLeafNT()) {
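
Note: with the switch from plain strings to Word, the head language model now looks up the factor selected by the new factor parameter (default 0, via GetString(m_factorType)), while labels appear to be read from factor 0 throughout (GetString(0)). A hypothetical RDLM line such as "RDLM path_head_lm=... path_label_lm=... factor=1" would thus score heads on the second output factor, e.g. lemmas, without changing the label model.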

View File

@ -1737,7 +1737,9 @@ void Manager::OutputSurface(std::ostream &out, const Hypothesis &edge, const std
//preface surface form with UNK if marking unknowns
const Word &word = phrase.GetWord(pos);
if(markUnknown && word.IsOOV()) {
out << "UNK" << *factor;
out << StaticData::Instance().GetUnknownWordPrefix()
<< *factor
<< StaticData::Instance().GetUnknownWordSuffix();
} else {
out << *factor;
}

View File

@ -141,6 +141,8 @@ Parameter::Parameter()
po::options_description oov_opts("OOV Handling Options");
AddParam(oov_opts,"drop-unknown", "du", "drop unknown words instead of copying them");
AddParam(oov_opts,"mark-unknown", "mu", "mark unknown words in output");
AddParam(oov_opts,"unknown-word-prefix", "prefix to unknwon word when marked (default: 'UNK')");
AddParam(oov_opts,"unknown-word-suffix", "suffix to unknwon word when marked (default: '')");
AddParam(oov_opts,"lmodel-oov-feature", "add language model oov feature, one per model");
AddParam(oov_opts,"output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");
AddParam(oov_opts,"always-create-direct-transopt", "Always create a translation that translates the source word ad-verbatim");

View File

@ -119,10 +119,13 @@ std::string Phrase::GetStringRep(const vector<FactorType> factorsToPrint) const
stringstream strme;
for (size_t pos = 0 ; pos < GetSize() ; pos++) {
if(markUnknown && GetWord(pos).IsOOV()) {
strme << "UNK";
if (markUnknown && GetWord(pos).IsOOV()) {
strme << StaticData::Instance().GetUnknownWordPrefix();
}
strme << GetWord(pos).GetString(factorsToPrint, (pos != GetSize()-1));
if (markUnknown && GetWord(pos).IsOOV()) {
strme << StaticData::Instance().GetUnknownWordSuffix();
}
}
return strme.str();

View File

@ -438,6 +438,8 @@ StaticData
// unknown word processing
m_parameter->SetParameter(m_dropUnknown, "drop-unknown", false );
m_parameter->SetParameter(m_markUnknown, "mark-unknown", false );
m_parameter->SetParameter<string>(m_unknownWordPrefix, "unknown-word-prefix", "UNK" );
m_parameter->SetParameter<string>(m_unknownWordSuffix, "unknown-word-suffix", "" );
m_parameter->SetParameter(m_lmEnableOOVFeature, "lmodel-oov-feature", false);

View File

@ -114,6 +114,8 @@ protected:
// bool m_labeledNBestList,m_nBestIncludesSegmentation;
bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them
bool m_markUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = mark and (ignore) them
std::string m_unknownWordPrefix;
std::string m_unknownWordSuffix;
bool m_wordDeletionEnabled;
bool m_disableDiscarding;
@ -326,6 +328,12 @@ public:
inline bool GetMarkUnknown() const {
return m_markUnknown;
}
inline std::string GetUnknownWordPrefix() const {
return m_unknownWordPrefix;
}
inline std::string GetUnknownWordSuffix() const {
return m_unknownWordSuffix;
}
inline bool GetDisableDiscarding() const {
return m_disableDiscarding;
}

View File

@ -177,7 +177,8 @@ void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const
}
#endif
bool TargetPhrase::HasTtaskSPtr() const {
bool TargetPhrase::HasTtaskSPtr() const
{
return m_ttask_flag;
}

View File

@ -103,7 +103,7 @@ namespace ugdiss
operator[](ID key) const
{
if (start==stop) return INIT(0);
Cell const* c = lower_bound(start,stop,key);
Cell const* c = std::lower_bound(start,stop,key);
return (c != stop && c->id == key ? c->val : INIT(0));
}

View File

@ -21,6 +21,7 @@
#include "ug_ttrack_base.h"
#include "num_read_write.h"
#include "ug_load_primer.h"
#include "ug_tsa_base.h"
namespace ugdiss
{
@ -193,7 +194,7 @@ namespace ugdiss
findSid(TKN const* t) const
{
id_type tokenPos = t-data;
id_type const* p = upper_bound(index,index+this->numSent,tokenPos);
id_type const* p = std::upper_bound(index,index+this->numSent,tokenPos);
assert(p>index);
return p-index-1;
}
@ -203,7 +204,7 @@ namespace ugdiss
mmTtrack<TKN>::
findSid(id_type tokenPos) const
{
id_type const* p = upper_bound(index,index+this->numSent,tokenPos);
id_type const* p = std::upper_bound(index,index+this->numSent,tokenPos);
assert(p>index);
return p-index-1;
}
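
Note: findSid locates the sentence containing a token offset by running std::upper_bound over the array of sentence start positions and stepping back one slot; the explicit std:: qualification presumably avoids relying on a using-directive or an ambiguous overload. A standalone sketch of the lookup with toy data:

#include <algorithm>
#include <cassert>

int main() {
  // index[i] holds the corpus position of the first token of sentence i
  // (a toy stand-in for mmTtrack's sentence index).
  unsigned index[] = {0, 5, 9, 14};
  unsigned tokenPos = 11;  // falls inside sentence 2 (tokens 9..13)
  const unsigned *p = std::upper_bound(index, index + 4, tokenPos);
  assert(p - index - 1 == 2);
  return 0;
}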

View File

@ -114,14 +114,14 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// find match ranges in suffix array
vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
for(size_t start=0; start<input[sentenceInd].size(); start++) {
for(int start=0; start<input[sentenceInd].size(); start++) {
SuffixArray::INDEX prior_first_match = 0;
SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
vector< string > substring;
bool stillMatched = true;
vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
//cerr << "start: " << start;
for(int word=start; stillMatched && word<input[sentenceInd].size(); word++) {
for(size_t word=start; stillMatched && word<input[sentenceInd].size(); word++) {
substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );
// only look up, if needed (i.e. no unnecessary short gram lookups)
@ -163,7 +163,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
count += range.second - range.first + 1;
for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
int position = suffixArray->GetPosition( i );
size_t position = suffixArray->GetPosition( i );
// sentence length mismatch
size_t sentence_id = suffixArray->GetSentence( position );
@ -261,7 +261,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// quick look: how many words are matched
int words_matched = 0;
for(int m=0; m<match.size(); m++) {
for(size_t m=0; m<match.size(); m++) {
if (match[m].min_cost <= best_cost) // makes no difference
words_matched += match[m].input_end - match[m].input_start + 1;
@ -274,7 +274,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// prune, check again how many words are matched
vector< Match > pruned = prune_matches( match, best_cost );
words_matched = 0;
for(int p=0; p<pruned.size(); p++) {
for(size_t p=0; p<pruned.size(); p++) {
words_matched += pruned[p].input_end - pruned[p].input_start + 1;
}
if (max(input_length,tm_length) - words_matched > best_cost) {
@ -323,7 +323,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// do not try to find the best ... report multiple matches
if (multiple_flag) {
for(int si=0; si<best_tm.size(); si++) {
for(size_t si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
sed( input[sentenceInd], source[s], path, true );
@ -776,7 +776,7 @@ void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translatio
wordIndex.clear();
// store input words and their positions in hash map
for(int i=0; i<input.size(); i++) {
for(size_t i=0; i<input.size(); i++) {
if (wordIndex.find( input[i] ) == wordIndex.end()) {
vector< int > position_vector;
wordIndex[ input[i] ] = position_vector;
@ -799,7 +799,7 @@ void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translation
input_word_hit = wordIndex.find( tm[t_pos] );
if (input_word_hit != wordIndex.end()) {
vector< int > &position_vector = input_word_hit->second;
for(int j=0; j<position_vector.size(); j++) {
for(size_t j=0; j<position_vector.size(); j++) {
int &i_pos = position_vector[j];
// before match
@ -870,7 +870,7 @@ int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length,
return input_length+tm_length;
int this_best_cost = input_length + tm_length;
for(int i=0; i<match.size(); i++) {
for(size_t i=0; i<match.size(); i++) {
this_best_cost = min( this_best_cost, match[i].max_cost );
}
// cerr << "\tthis best cost: " << this_best_cost << endl;
@ -892,8 +892,8 @@ int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length,
vector< Match > &first_match = multi_match[ first_level ];
vector< Match > &second_match = multi_match[ second_level ];
for(int i1 = 0; i1 < first_match.size(); i1++) {
for(int i2 = 0; i2 < second_match.size(); i2++) {
for(size_t i1 = 0; i1 < first_match.size(); i1++) {
for(size_t i2 = 0; i2 < second_match.size(); i2++) {
// do not combine the same pair twice
if (first_level == second_level && i2 <= i1) {

View File

@ -28,20 +28,10 @@ TO_STRING_BODY(WordsBitmap);
bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const
{
if (GetNumWordsCovered() == 0) {
return true;
}
size_t first = GetFirstGapPos();
size_t last = GetLastGapPos();
if (startPos == last || endPos == first) {
return true;
}
return false;
return
GetNumWordsCovered() == 0 ||
startPos == GetFirstGapPos() ||
endPos == GetLastGapPos();
}
}

View File

@ -22,6 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef moses_WordsBitmap_h
#define moses_WordsBitmap_h
#include <algorithm>
#include <limits>
#include <vector>
#include <iostream>
@ -35,50 +36,34 @@ namespace Moses
{
typedef unsigned long WordsBitmapID;
/** vector of boolean used to represent whether a word has been translated or not
*/
/** Vector of boolean to represent whether a word has been translated or not.
*
* Implemented using a vector of char, which is usually the same representation
* for the elements that a C array of bool would use. A vector of bool, or a
* Boost dynamic_bitset, could be much more efficient in theory. Unfortunately
* algorithms like std::find() are not optimized for vector<bool> on gcc or
* clang, and dynamic_bitset lacks all the optimized search operations we want.
* Only benchmarking will tell what works best. Perhaps dynamic_bitset could
* still be a dramatic improvement, if we flip the meaning of the bits around
* so we can use its find_first() and find_next() for the most common searches.
*/
class WordsBitmap
{
friend std::ostream& operator<<(std::ostream& out, const WordsBitmap& wordsBitmap);
protected:
const size_t m_size; /**< number of words in sentence */
bool *m_bitmap; /**< ticks of words that have been done */
size_t m_firstGap; /** Position of first gap, pre-calculated as it is consulted often */
private:
std::vector<char> m_bitmap; //! Ticks of words in sentence that have been done.
size_t m_firstGap; //! Cached position of first gap, or NOT_FOUND.
WordsBitmap(); // not implemented
WordsBitmap& operator= (const WordsBitmap& other);
//! set all elements to false
void Initialize() {
for (size_t pos = 0 ; pos < m_size ; pos++) {
m_bitmap[pos] = false;
}
}
//sets elements by vector
void Initialize(const std::vector<bool>& vector) {
size_t vector_size = vector.size();
bool gapFound = false;
for (size_t pos = 0 ; pos < m_size ; pos++) {
if (pos < vector_size && vector[pos] == true) m_bitmap[pos] = true;
else {
m_bitmap[pos] = false;
if (!gapFound) {
m_firstGap = pos;
gapFound = true;
}
}
}
if (!gapFound) m_firstGap = NOT_FOUND;
}
/** Update the first gap, when bits are flipped */
void UpdateFirstGap(size_t startPos, size_t endPos, bool value) {
if (value) {
//may remove gap
if (startPos <= m_firstGap && m_firstGap <= endPos) {
m_firstGap = NOT_FOUND;
for (size_t i = endPos + 1 ; i < m_size; ++i) {
for (size_t i = endPos + 1 ; i < m_bitmap.size(); ++i) {
if (!m_bitmap[i]) {
m_firstGap = i;
break;
@ -96,38 +81,35 @@ protected:
public:
//! create WordsBitmap of length size and initialise with vector
WordsBitmap(size_t size, const std::vector<bool>& initialize_vector)
:m_size (size), m_firstGap(0) {
m_bitmap = (bool*) malloc(sizeof(bool) * size);
Initialize(initialize_vector);
//! Create WordsBitmap of length size, and initialise with vector.
WordsBitmap(size_t size, const std::vector<bool>& initializer)
:m_bitmap(initializer.begin(), initializer.end()), m_firstGap(0) {
// The initializer may not be of the same length. Change to the desired
// length. If we need to add any elements, initialize them to false.
m_bitmap.resize(size, false);
// Find the first gap, and cache it.
std::vector<char>::const_iterator first_gap = std::find(
m_bitmap.begin(), m_bitmap.end(), false);
m_firstGap = (
(first_gap == m_bitmap.end()) ?
NOT_FOUND : first_gap - m_bitmap.begin());
}
//! create WordsBitmap of length size and initialise
//! Create WordsBitmap of length size and initialise.
WordsBitmap(size_t size)
:m_size (size), m_firstGap(0) {
m_bitmap = (bool*) malloc(sizeof(bool) * size);
Initialize();
:m_bitmap(size, false), m_firstGap(0) {
}
//! deep copy
//! Deep copy.
WordsBitmap(const WordsBitmap &copy)
:m_size (copy.m_size), m_firstGap(copy.m_firstGap) {
m_bitmap = (bool*) malloc(sizeof(bool) * m_size);
for (size_t pos = 0 ; pos < copy.m_size ; pos++) {
m_bitmap[pos] = copy.GetValue(pos);
}
m_firstGap = copy.m_firstGap;
:m_bitmap(copy.m_bitmap), m_firstGap(copy.m_firstGap) {
}
~WordsBitmap() {
free(m_bitmap);
}
//! count of words translated
//! Count of words translated.
size_t GetNumWordsCovered() const {
size_t count = 0;
for (size_t pos = 0 ; pos < m_size ; pos++) {
if (m_bitmap[pos])
count++;
}
return count;
return std::count(m_bitmap.begin(), m_bitmap.end(), true);
}
//! position of 1st word not yet translated, or NOT_FOUND if everything already translated
@ -138,7 +120,7 @@ public:
//! position of last word not yet translated, or NOT_FOUND if everything already translated
size_t GetLastGapPos() const {
for (int pos = (int) m_size - 1 ; pos >= 0 ; pos--) {
for (int pos = int(m_bitmap.size()) - 1 ; pos >= 0 ; pos--) {
if (!m_bitmap[pos]) {
return pos;
}
@ -150,7 +132,7 @@ public:
//! position of last translated word
size_t GetLastPos() const {
for (int pos = (int) m_size - 1 ; pos >= 0 ; pos--) {
for (int pos = int(m_bitmap.size()) - 1 ; pos >= 0 ; pos--) {
if (m_bitmap[pos]) {
return pos;
}
@ -163,7 +145,7 @@ public:
//! whether a word has been translated at a particular position
bool GetValue(size_t pos) const {
return m_bitmap[pos];
return bool(m_bitmap[pos]);
}
//! set value at a particular position
void SetValue( size_t pos, bool value ) {
@ -198,7 +180,7 @@ public:
}
//! number of elements
size_t GetSize() const {
return m_size;
return m_bitmap.size();
}
//! transitive comparison of WordsBitmap
@ -213,7 +195,8 @@ public:
if (thisSize != compareSize) {
return (thisSize < compareSize) ? -1 : 1;
}
return std::memcmp(m_bitmap, compare.m_bitmap, thisSize * sizeof(bool));
return std::memcmp(
&m_bitmap[0], &compare.m_bitmap[0], thisSize * sizeof(bool));
}
bool operator< (const WordsBitmap &compare) const {
@ -229,20 +212,20 @@ public:
}
inline size_t GetEdgeToTheRightOf(size_t r) const {
if (r+1 == m_size) return r;
while (r+1 < m_size && !m_bitmap[r+1]) {
++r;
}
return r;
if (r+1 == m_bitmap.size()) return r;
return (
std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) -
m_bitmap.begin()
) - 1;
}
//! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
WordsBitmapID GetID() const {
assert(m_size < (1<<16));
assert(m_bitmap.size() < (1<<16));
size_t start = GetFirstGapPos();
if (start == NOT_FOUND) start = m_size; // nothing left
if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left
size_t end = GetLastPos();
if (end == NOT_FOUND) end = 0; // nothing translated yet
@ -257,10 +240,10 @@ public:
//! converts bitmap into an integer ID, with an additional span covered
WordsBitmapID GetIDPlus( size_t startPos, size_t endPos ) const {
assert(m_size < (1<<16));
assert(m_bitmap.size() < (1<<16));
size_t start = GetFirstGapPos();
if (start == NOT_FOUND) start = m_size; // nothing left
if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left
size_t end = GetLastPos();
if (end == NOT_FOUND) end = 0; // nothing translated yet
@ -284,8 +267,8 @@ public:
// friend
inline std::ostream& operator<<(std::ostream& out, const WordsBitmap& wordsBitmap)
{
for (size_t i = 0 ; i < wordsBitmap.m_size ; i++) {
out << (wordsBitmap.GetValue(i) ? 1 : 0);
for (size_t i = 0 ; i < wordsBitmap.m_bitmap.size() ; i++) {
out << int(wordsBitmap.GetValue(i));
}
return out;
}
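
Note: the std::find rewrite of GetEdgeToTheRightOf leans on the bitmap now being a std::vector<char>, for which std::find can use fast byte search (unlike vector<bool>, as the class comment above explains). A standalone check, not Moses code, that the old loop and the find-based version agree:

#include <algorithm>
#include <cassert>
#include <vector>

// The original scanning loop, lifted out for comparison.
size_t EdgeLoop(const std::vector<char> &bm, size_t r) {
  if (r + 1 == bm.size()) return r;
  while (r + 1 < bm.size() && !bm[r + 1]) ++r;
  return r;
}

// The std::find-based version from the patch.
size_t EdgeFind(const std::vector<char> &bm, size_t r) {
  if (r + 1 == bm.size()) return r;
  return (std::find(bm.begin() + r + 1, bm.end(), true) - bm.begin()) - 1;
}

int main() {
  std::vector<char> bm;
  bm.push_back(1); bm.push_back(0); bm.push_back(0);
  bm.push_back(1); bm.push_back(0);
  for (size_t r = 0; r < bm.size(); ++r)
    assert(EdgeLoop(bm, r) == EdgeFind(bm, r));
  return 0;
}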

View File

@ -17,6 +17,7 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cstdlib>
#include <vector>
#include <string>
@ -123,7 +124,7 @@ int main(int argc, char* argv[])
std::cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
int prev = 0;
while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
int binCount = Moses::Scan<int>(argv[++i]);
int binCount = std::atoi( argv[++i] );
countBin.push_back( binCount );
if (prev+1 == binCount) {
std::cerr << " " << binCount;
@ -164,8 +165,8 @@ int main(int argc, char* argv[])
}
pos = single_setting.find(":");
UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'");
unsigned int field = Moses::Scan<unsigned int>( single_setting.substr(0,pos) );
float threshold = Moses::Scan<float>( single_setting.substr(pos+1) );
unsigned int field = std::atoll( single_setting.substr(0,pos).c_str() );
float threshold = std::atof( single_setting.substr(pos+1).c_str() );
if (field == 0) {
minScore0 = threshold;
std::cerr << "setting minScore0 to " << threshold << std::endl;
@ -195,9 +196,9 @@ void loadCountOfCounts( const std::string& fileNameCountOfCounts )
std::string line;
while (getline(fileCountOfCounts, line)) {
if (totalCount < 0)
totalCount = Moses::Scan<float>(line); // total number of distinct phrase pairs
totalCount = std::atof( line.c_str() ); // total number of distinct phrase pairs
else
countOfCounts.push_back( Moses::Scan<float>(line) );
countOfCounts.push_back( std::atof( line.c_str() ) );
}
fileCountOfCounts.Close();
@ -259,6 +260,7 @@ void processFiles( const std::string& fileNameDirect,
// loop through all extracted phrase translations
int i=0;
while(true) {
// Print progress dots to stderr.
i++;
if (i%100000 == 0) std::cerr << "." << std::flush;
@ -285,13 +287,13 @@ void processFiles( const std::string& fileNameDirect,
Moses::Tokenize( directCounts, itemDirect[4] );
std::vector<std::string> indirectCounts;
Moses::Tokenize( indirectCounts, itemIndirect[4] );
float countF = Moses::Scan<float>(directCounts[0]);
float countE = Moses::Scan<float>(indirectCounts[0]);
float countEF = Moses::Scan<float>(indirectCounts[1]);
float countF = std::atof( directCounts[0].c_str() );
float countE = std::atof( indirectCounts[0].c_str() );
float countEF = std::atof( indirectCounts[1].c_str() );
float n1_F, n1_E;
if (kneserNeyFlag) {
n1_F = Moses::Scan<float>(directCounts[2]);
n1_E = Moses::Scan<float>(indirectCounts[2]);
n1_F = std::atof( directCounts[2].c_str() );
n1_E = std::atof( indirectCounts[2].c_str() );
}
// Good Turing discounting
@ -436,6 +438,9 @@ void processFiles( const std::string& fileNameDirect,
fileDirect.Close();
fileIndirect.Close();
fileConsolidated.Close();
// We've been printing progress dots to stderr. End the line.
std::cerr << std::endl;
}
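
Note: replacing Moses::Scan<float> with std::atof drops error reporting; atof silently returns 0 on malformed input. If stricter parsing were ever wanted here, std::strtod exposes an end pointer that can be checked; a small hypothetical helper, not part of the patch:

#include <cstdlib>
#include <string>

// Parse a float, falling back to a default when no conversion happens.
float ParseFloatOr(const std::string &s, float fallback) {
  char *end = NULL;
  double v = std::strtod(s.c_str(), &end);
  return end == s.c_str() ? fallback : float(v);
}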

View File

@ -19,7 +19,7 @@
#include "Alignment.h"
#include "Exception.h"
#include "syntax-common/exception.h"
#include <algorithm>
#include <cassert>
@ -27,6 +27,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -70,4 +72,5 @@ void FlipAlignment(Alignment &a)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -25,6 +25,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -35,5 +37,5 @@ void ReadAlignment(const std::string &, Alignment &);
void FlipAlignment(Alignment &);
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -34,6 +34,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -242,36 +244,24 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
return p;
}
// Finds the set of frontier nodes. The definition of a frontier node differs
// from Galley et al's (2004) in the following ways:
//
// 1. A node with an empty span is not a frontier node (this excludes
// unaligned target subtrees).
// 2. Target word nodes are not frontier nodes.
// 3. Source word nodes are not frontier nodes.
// 4. Unless the --AllowUnary option is used, a node is not a frontier node if
// it has the same span as its parent.
// Recursively constructs the set of frontier nodes for the tree (or subtree)
// rooted at the given node.
void AlignmentGraph::ComputeFrontierSet(Node *root,
const Options &options,
std::set<Node *> &frontierSet) const
{
// Don't include word nodes or unaligned target subtrees.
// Non-tree nodes and unaligned target subtrees are not frontier nodes (and
// neither are their descendants). See the comment for the function
// AlignmentGraph::IsFrontierNode().
if (root->GetType() != TREE || root->GetSpan().empty()) {
return;
}
if (!SpansIntersect(root->GetComplementSpan(), Closure(root->GetSpan()))) {
// Unless unary rules are explicitly allowed, we use Chung et al's (2011)
// modified definition of a frontier node to eliminate the production of
// non-lexical unary rules.
assert(root->GetParents().size() <= 1);
if (options.allowUnary
|| root->GetParents().empty()
|| root->GetParents()[0]->GetSpan() != root->GetSpan()) {
frontierSet.insert(root);
}
if (IsFrontierNode(*root, options)) {
frontierSet.insert(root);
}
// Recursively check descendants.
const std::vector<Node *> &children = root->GetChildren();
for (std::vector<Node *>::const_iterator p(children.begin());
p != children.end(); ++p) {
@ -279,6 +269,37 @@ void AlignmentGraph::ComputeFrontierSet(Node *root,
}
}
// Determines whether the given node is a frontier node or not. The definition
// of a frontier node differs from Galley et al's (2004) in the following ways:
//
// 1. A node with an empty span is not a frontier node (this is to exclude
// unaligned target subtrees).
// 2. Target word nodes are not frontier nodes.
// 3. Source word nodes are not frontier nodes.
// 4. Unless the --AllowUnary option is used, a node is not a frontier node if
// it has the same span as its parent.
bool AlignmentGraph::IsFrontierNode(const Node &n, const Options &options) const
{
// Don't include word nodes or unaligned target subtrees.
if (n.GetType() != TREE || n.GetSpan().empty()) {
return false;
}
// This is the original GHKM definition of a frontier node.
if (SpansIntersect(n.GetComplementSpan(), Closure(n.GetSpan()))) {
return false;
}
// Unless unary rules are explicitly allowed, we use Chung et al's (2011)
// modified definition of a frontier node to eliminate the production of
// non-lexical unary rules.
assert(n.GetParents().size() <= 1);
if (!options.allowUnary &&
!n.GetParents().empty() &&
n.GetParents()[0]->GetSpan() == n.GetSpan()) {
return false;
}
return true;
}
void AlignmentGraph::CalcComplementSpans(Node *root)
{
Span compSpan;
@ -393,4 +414,5 @@ Node *AlignmentGraph::DetermineAttachmentPoint(int index)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
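To make the unary condition concrete, here is a toy, self-contained sketch of clause 4 above (simplified stand-in types, not the real AlignmentGraph or Node classes): without --AllowUnary, a node covering exactly its parent's span is rejected.

#include <cstddef>
#include <iostream>
#include <utility>

typedef std::pair<int, int> ToySpan;  // toy stand-in for the real Span type

struct ToyNode {
  ToySpan span;
  const ToyNode *parent;  // NULL at the root
};

// Mirrors only clause 4 of IsFrontierNode: unless unary rules are allowed,
// a node with the same span as its parent cannot be a frontier node.
bool PassesUnaryCheck(const ToyNode &n, bool allowUnary)
{
  return allowUnary
         || n.parent == NULL
         || n.parent->span != n.span;
}

int main()
{
  ToyNode root = { ToySpan(0, 3), NULL };
  ToyNode sameSpanChild = { ToySpan(0, 3), &root };  // would yield a unary rule
  ToyNode narrowerChild = { ToySpan(0, 1), &root };

  std::cout << PassesUnaryCheck(sameSpanChild, false)   // 0: excluded
            << PassesUnaryCheck(sameSpanChild, true)    // 1: kept with --AllowUnary
            << PassesUnaryCheck(narrowerChild, false)   // 1: kept
            << std::endl;
  return 0;
}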

View File

@ -32,6 +32,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -64,6 +66,7 @@ private:
Node *CopyParseTree(const SyntaxTree *);
void ComputeFrontierSet(Node *, const Options &, std::set<Node *> &) const;
bool IsFrontierNode(const Node &, const Options &) const;
void CalcComplementSpans(Node *);
void GetTargetTreeLeaves(Node *, std::vector<Node *> &);
void AttachUnalignedSourceWords();
@ -78,6 +81,7 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -29,6 +29,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -128,4 +130,5 @@ Subgraph ComposedRule::CreateSubgraph()
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -28,6 +28,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -67,6 +69,7 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -1,46 +0,0 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#ifndef EXTRACT_GHKM_EXCEPTION_H_
#define EXTRACT_GHKM_EXCEPTION_H_
#include <string>
namespace MosesTraining
{
namespace GHKM
{
class Exception
{
public:
Exception(const char *msg) : m_msg(msg) {}
Exception(const std::string &msg) : m_msg(msg) {}
const std::string &GetMsg() const {
return m_msg;
}
private:
std::string m_msg;
};
} // namespace GHKM
} // namespace MosesTraining
#endif

View File

@ -30,6 +30,7 @@
#include <boost/program_options.hpp>
#include "syntax-common/exception.h"
#include "syntax-common/xml_tree_parser.h"
#include "InputFileStream.h"
@ -43,7 +44,6 @@
#include "Alignment.h"
#include "AlignmentGraph.h"
#include "Exception.h"
#include "Node.h"
#include "Options.h"
#include "PhraseOrientation.h"
@ -55,6 +55,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -131,8 +133,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::string sourceLine;
std::string alignmentLine;
Alignment alignment;
Syntax::XmlTreeParser targetXmlTreeParser;
Syntax::XmlTreeParser sourceXmlTreeParser;
XmlTreeParser targetXmlTreeParser;
XmlTreeParser sourceXmlTreeParser;
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
size_t lineNum = options.sentenceOffset;
@ -163,8 +165,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
} catch (const Exception &e) {
std::ostringstream oss;
oss << "Failed to parse target XML tree at line " << lineNum;
if (!e.GetMsg().empty()) {
oss << ": " << e.GetMsg();
if (!e.msg().empty()) {
oss << ": " << e.msg();
}
Error(oss.str());
}
@ -181,8 +183,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
} catch (const Exception &e) {
std::ostringstream oss;
oss << "Failed to parse source XML tree at line " << lineNum;
if (!e.GetMsg().empty()) {
oss << ": " << e.GetMsg();
if (!e.msg().empty()) {
oss << ": " << e.msg();
}
Error(oss.str());
}
@ -195,7 +197,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
} catch (const Exception &e) {
std::ostringstream oss;
oss << "Failed to read alignment at line " << lineNum << ": ";
oss << e.GetMsg();
oss << e.msg();
Error(oss.str());
}
if (alignment.size() == 0) {
@ -896,4 +898,5 @@ void ExtractGHKM::StripBitParLabels(
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -32,12 +32,14 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
struct Options;
class ExtractGHKM : public Syntax::Tool
class ExtractGHKM : public Tool
{
public:
ExtractGHKM() : Tool("extract-ghkm") {}
@ -76,4 +78,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -21,6 +21,6 @@
int main(int argc, char *argv[])
{
MosesTraining::GHKM::ExtractGHKM tool;
MosesTraining::Syntax::GHKM::ExtractGHKM tool;
return tool.Main(argc, argv);
}

View File

@ -23,6 +23,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -70,4 +72,5 @@ void Node::GetTargetWords(std::vector<std::string> &targetWords) const
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -30,6 +30,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -215,6 +217,7 @@ Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -23,6 +23,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -89,5 +91,5 @@ public:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -28,6 +28,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -469,5 +471,5 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -32,6 +32,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -120,4 +122,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -5,6 +5,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -38,4 +40,5 @@ bool Rule::PartitionOrderComp(const Node *a, const Node *b)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -9,6 +9,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -54,6 +56,7 @@ protected:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -28,6 +28,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -197,4 +199,5 @@ void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -31,6 +31,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -93,4 +95,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -32,6 +32,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -229,4 +231,5 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -25,6 +25,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -57,5 +59,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -21,6 +21,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -45,4 +47,5 @@ ContiguousSpan Closure(const Span &s)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -26,6 +26,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -37,6 +39,7 @@ bool SpansIntersect(const Span &, const ContiguousSpan &);
ContiguousSpan Closure(const Span &);
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -7,6 +7,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -91,4 +93,5 @@ StsgRule::StsgRule(const Subgraph &fragment)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -9,6 +9,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -39,6 +41,7 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -13,6 +13,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -92,4 +94,5 @@ void StsgRuleWriter::Write(const StsgRule &rule)
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -8,6 +8,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -36,6 +38,7 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining
#endif

View File

@ -24,6 +24,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -195,4 +197,5 @@ void Subgraph::RecursivelyGetPartsOfSpeech(const Node *n, std::vector<std::strin
}
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -26,6 +26,8 @@
namespace MosesTraining
{
namespace Syntax
{
namespace GHKM
{
@ -137,5 +139,5 @@ private:
};
} // namespace GHKM
} // namespace Syntax
} // namespace MosesTraining

View File

@ -4,6 +4,7 @@
#include <vector>
#include "extract-lex.h"
#include "InputFileStream.h"
#include "moses/Util.h"
using namespace std;
using namespace MosesTraining;
@ -53,9 +54,9 @@ int main(int argc, char* argv[])
assert(isAlign);
vector<string> toksTarget, toksSource, toksAlign;
Tokenize(toksTarget, lineTarget);
Tokenize(toksSource, lineSource);
Tokenize(toksAlign, lineAlign);
Moses::Tokenize(toksTarget, lineTarget);
Moses::Tokenize(toksSource, lineSource);
Moses::Tokenize(toksAlign, lineAlign);
/*
cerr << endl
@ -99,7 +100,7 @@ void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource,
const string &alignTok = *iterAlign;
vector<size_t> alignPos;
Tokenize(alignPos, alignTok, "-");
Moses::Tokenize(alignPos, alignTok, "-");
assert(alignPos.size() == 2);
if (alignPos[0] >= toksSource.size()) {

View File

@ -9,59 +9,6 @@
namespace MosesTraining
{
//! convert string to variable of type T. Used to read floats, ints, etc. from files
template<typename T>
inline T Scan(const std::string &input)
{
std::stringstream stream(input);
T ret;
stream >> ret;
return ret;
}
//! speeded up version of above
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
output.resize(input.size());
for (size_t i = 0 ; i < input.size() ; i++) {
output[i] = Scan<T>( input[i] );
}
}
inline void Tokenize(std::vector<std::string> &output
, const std::string& str
, const std::string& delimiters = " \t")
{
// Skip delimiters at beginning.
std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
// Find first "non-delimiter".
std::string::size_type pos = str.find_first_of(delimiters, lastPos);
while (std::string::npos != pos || std::string::npos != lastPos) {
// Found a token, add it to the vector.
output.push_back(str.substr(lastPos, pos - lastPos));
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(delimiters, pos);
// Find next "non-delimiter"
pos = str.find_first_of(delimiters, lastPos);
}
}
// speeded up version of above
template<typename T>
inline void Tokenize( std::vector<T> &output
, const std::string &input
, const std::string& delimiters = " \t")
{
std::vector<std::string> stringVector;
Tokenize(stringVector, input, delimiters);
return Scan<T>(output, stringVector );
}
class WordCount
{
friend std::ostream& operator<<(std::ostream&, const WordCount&);
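The helpers removed above duplicated utilities in moses/Util.h, so the callers now use the Moses:: versions directly. A minimal usage sketch (assumes compilation inside the Moses source tree; the input literals are illustrative only):

#include <iostream>
#include <string>
#include <vector>
#include "moses/Util.h"

int main()
{
  // String overload: split on the default " \t" delimiters.
  std::vector<std::string> toks;
  Moses::Tokenize(toks, "the house ||| das Haus");

  // Typed overload: each token is converted via Scan<T>, here size_t.
  std::vector<size_t> alignPos;
  Moses::Tokenize(alignPos, "3-2", "-");

  std::cout << toks.size() << " tokens; alignment "
            << alignPos[0] << "-" << alignPos[1] << std::endl;
  return 0;
}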

View File

@ -283,6 +283,7 @@ int main(int argc, char* argv[])
string englishString, foreignString, alignmentString, weightString;
while(getline(*eFileP, englishString)) {
// Print progress dots to stderr.
i++;
if (i%10000 == 0) cerr << "." << flush;
@ -337,6 +338,9 @@ int main(int argc, char* argv[])
extractFileContextInv.Close();
}
}
// We've been printing progress dots to stderr. End the line.
cerr << endl;
}
namespace MosesTraining

View File

@ -1,2 +1,2 @@
exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../../util//kenutil ../..//z ;
exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../OutputFileStream.cpp ../..//boost_iostreams ../..//boost_filesystem ../../util//kenutil ../..//z ;

View File

@ -277,7 +277,7 @@ void Model::score_fe(const string& f, const string& e)
{
if (!fe) //Make sure we do not do anything if it is not an fe model
return;
fprintf(file,"%s ||| %s ||| ",f.c_str(),e.c_str());
outputFile << f << " ||| " << e << " |||";
//condition on the previous phrase
if (previous) {
vector<double> scores;
@ -288,9 +288,8 @@ void Model::score_fe(const string& f, const string& e)
sum += scores[i];
}
for(size_t i=0; i<scores.size(); ++i) {
fprintf(file,"%f ",scores[i]/sum);
outputFile << " " << (scores[i]/sum);
}
//fprintf(file, "||| ");
}
//condition on the next phrase
if (next) {
@ -302,17 +301,17 @@ void Model::score_fe(const string& f, const string& e)
sum += scores[i];
}
for(size_t i=0; i<scores.size(); ++i) {
fprintf(file, "%f ", scores[i]/sum);
outputFile << " " << (scores[i]/sum);
}
}
fprintf(file,"\n");
outputFile << endl;
}
void Model::score_f(const string& f)
{
if (fe) //Make sure we do not do anything if it is not an f model
return;
fprintf(file, "%s ||| ", f.c_str());
outputFile << f << " |||";
//condition on the previous phrase
if (previous) {
vector<double> scores;
@ -323,9 +322,8 @@ void Model::score_f(const string& f)
sum += scores[i];
}
for(size_t i=0; i<scores.size(); ++i) {
fprintf(file, "%f ", scores[i]/sum);
outputFile << " " << (scores[i]/sum);
}
//fprintf(file, "||| ");
}
//condition on the next phrase
if (next) {
@ -337,22 +335,16 @@ void Model::score_f(const string& f)
sum += scores[i];
}
for(size_t i=0; i<scores.size(); ++i) {
fprintf(file, "%f ", scores[i]/sum);
outputFile << " " << (scores[i]/sum);
}
}
fprintf(file, "\n");
outputFile << endl;
}
Model::Model(ModelScore* ms, Scorer* sc, const string& dir, const string& lang, const string& fn)
: modelscore(ms), scorer(sc), filename(fn)
{
file = fopen(filename.c_str(),"w");
if (!file) {
cerr << "Could not open the model output file: " << filename << endl;
exit(1);
}
outputFile.Open( (filename+".gz").c_str() );
fe = false;
if (lang.compare("fe") == 0) {
fe = true;
@ -373,28 +365,11 @@ Model::Model(ModelScore* ms, Scorer* sc, const string& dir, const string& lang,
Model::~Model()
{
fclose(file);
outputFile.Close();
delete modelscore;
delete scorer;
}
void Model::zipFile()
{
fclose(file);
file = fopen(filename.c_str(), "rb");
gzFile gzfile = gzopen((filename+".gz").c_str(),"wb");
char inbuffer[128];
int num_read;
while ((num_read = fread(inbuffer, 1, sizeof(inbuffer), file)) > 0) {
gzwrite(gzfile, inbuffer, num_read);
}
fclose(file);
gzclose(gzfile);
//Remove the unzipped file
remove(filename.c_str());
}
void Model::split_config(const string& config, string& dir, string& lang, string& orient)
{
istringstream is(config);

View File

@ -13,7 +13,7 @@
#include <fstream>
#include "util/string_piece.hh"
#include "../OutputFileStream.h"
enum ORIENTATION {MONO, SWAP, DRIGHT, DLEFT, OTHER, NOMONO};
@ -122,8 +122,8 @@ private:
ModelScore* modelscore;
Scorer* scorer;
std::FILE* file;
std::string filename;
Moses::OutputFileStream outputFile;
bool fe;
bool previous;

View File

@ -205,11 +205,10 @@ int main(int argc, char* argv[])
models[i]->score_f(f_current);
}
//Zip all files
// delete model objects (and close files)
for (size_t i=0; i<models.size(); ++i) {
models[i]->zipFile();
delete models[i];
}
return 0;
}
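The pattern adopted above, sketched standalone (assumes the phrase-extract tree, with OutputFileStream and the boost_iostreams/zlib linkage added to the Jamfile earlier in this diff): opening a path ending in .gz compresses on the fly, which is what makes the old write-then-zipFile() pass unnecessary.

#include "../OutputFileStream.h"

int main()
{
  Moses::OutputFileStream out;
  out.Open("scores.gz");        // ".gz" suffix => gzip-compressed on the fly
  out << "0.4 0.3 0.3" << "\n";
  out.Close();                  // flushes and finalizes the gzip stream
  return 0;
}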

View File

@ -120,8 +120,13 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words )
for( size_t i=0; i<nodes.size(); i++ ) {
cout << " <tree span=\"" << nodes[i]->start
<< "-" << nodes[i]->end
<< "\" label=\"" << nodes[i]->label
<< "\"/>";
<< "\" label=\"" << nodes[i]->label << "\"";
for (SyntaxNode::AttributeMap::const_iterator
p = nodes[i]->attributes.begin();
p != nodes[i]->attributes.end(); ++p) {
cout << " " << p->first << "=\"" << p->second << "\"";
}
cout << "/>";
}
cout << endl;
}
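With the attribute loop above, any extra key/value pairs stored on a SyntaxNode are echoed into the emitted span tags. For illustration, a node carrying a hypothetical head attribute (not taken from the source) would render as:

<tree span="0-1" label="NP" head="house"/>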

View File

@ -19,7 +19,9 @@
#include <sstream>
#include <assert.h>
#include <cstdlib>
#include <cstring>
#include <list>
#include <map>
#include <set>
#include <vector>
@ -70,6 +72,7 @@ bool nonTermContextTarget = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCount = 0;
float minCountHierarchical = 0;
bool phraseOrientationPriorsFlag = false;
@ -107,7 +110,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float>
const std::string &fileNameLeftHandSideSourceLabelCounts,
const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
void processPhrasePairs( std::list< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
@ -131,14 +134,28 @@ int main(int argc, char* argv[])
ScoreFeatureManager featureManager;
if (argc < 4) {
std::cerr <<
"syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] "
"[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] "
"[--NoWordAlignment] [--UnalignedPenalty] "
"syntax: score extract lex phrase-table "
"[--Inverse] "
"[--Hierarchical] "
"[--LogProb] "
"[--NegLogProb] "
"[--NoLex] "
"[--GoodTuring] "
"[--KneserNey] "
"[--NoWordAlignment] "
"[--UnalignedPenalty] "
"[--UnalignedFunctionWordPenalty function-word-file] "
"[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] "
"[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] "
"[--TargetPreferenceLabels] [--UnpairedExtractFormat] "
"[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
"[--MinCountHierarchical count] "
"[--PartsOfSpeech] "
"[--PCFG] "
"[--TreeFragments] "
"[--SourceLabels] "
"[--SourceLabelCountsLHS] "
"[--TargetPreferenceLabels] "
"[--UnpairedExtractFormat] "
"[--ConditionOnTargetLHS] "
"[--CrossedNonTerm]"
<< std::endl;
std::cerr << featureManager.usage() << std::endl;
exit(1);
}
@ -235,9 +252,13 @@ int main(int argc, char* argv[])
logProbFlag = true;
negLogProb = -1;
std::cerr << "using negative log-probabilities" << std::endl;
} else if (strcmp(argv[i],"--MinCount") == 0) {
minCount = std::atof( argv[++i] );
std::cerr << "dropping all phrase pairs occurring less than " << minCount << " times" << std::endl;
minCount -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
minCountHierarchical = Moses::Scan<float>( argv[++i] );
std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
minCountHierarchical = std::atof( argv[++i] );
std::cerr << "dropping all hierarchical phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
minCountHierarchical -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
crossedNonTerm = true;
@ -325,8 +346,8 @@ int main(int argc, char* argv[])
// loop through all extracted phrase translations
std::string line, lastLine;
ExtractionPhrasePair *phrasePair = NULL;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
std::list< ExtractionPhrasePair* > phrasePairsWithSameSource;
std::list< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
int tmpSentenceId;
PHRASE *tmpPhraseSource, *tmpPhraseTarget;
@ -359,6 +380,7 @@ int main(int argc, char* argv[])
while ( getline(extractFile, line) ) {
// Print progress dots to stderr.
if ( ++i % 100000 == 0 ) {
std::cerr << "." << std::flush;
}
@ -389,7 +411,7 @@ int main(int argc, char* argv[])
// once the first of them has been found to have to be set to false
if ( hierarchicalFlag ) {
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
sourceMatch, targetMatch, alignmentMatch ) ) {
@ -419,7 +441,7 @@ int main(int argc, char* argv[])
if ( !phrasePairsWithSameSource.empty() &&
!sourceMatch ) {
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
delete *iter;
}
@ -450,8 +472,11 @@ int main(int argc, char* argv[])
}
// We've been printing progress dots to stderr. End the line.
std::cerr << std::endl;
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
delete *iter;
}
@ -546,7 +571,7 @@ void processLine( std::string line,
} else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
sscanf(token[j].c_str(), "%f", &count);
} else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
float pcfgScore = Moses::Scan<float>( token[j] );
float pcfgScore = std::atof( token[j].c_str() );
pcfgSum = pcfgScore * count;
}
}
@ -652,7 +677,7 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
}
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
void processPhrasePairs( std::list< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
{
if (phrasePairsWithSameSource.size() == 0) {
@ -664,14 +689,14 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa
//std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
// loop through phrase pairs
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
// add to total count
totalSource += (*iter)->GetCount();
}
// output the distinct phrase pairs, one at a time
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
iter!=phrasePairsWithSameSource.end(); ++iter) {
// output this phrase pair
outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
@ -700,16 +725,15 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
countOfCounts[ countInt ]++;
}
// compute PCFG score
float pcfgScore = 0;
if (pcfgFlag && !inverseFlag) {
pcfgScore = phrasePair.GetPcfgScore() / count;
}
// output phrases
const PHRASE *phraseSource = phrasePair.GetSource();
const PHRASE *phraseTarget = phrasePair.GetTarget();
// do not output if count below threshold
if (count < minCount) {
return;
}
// do not output if hierarchical and count below threshold
if (hierarchicalFlag && count < minCountHierarchical) {
for(size_t j=0; j<phraseSource->size()-1; ++j) {
@ -718,6 +742,12 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
}
// compute PCFG score
float pcfgScore = 0;
if (pcfgFlag && !inverseFlag) {
pcfgScore = phrasePair.GetPcfgScore() / count;
}
// source phrase (unless inverse)
if (!inverseFlag) {
printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
@ -1167,7 +1197,7 @@ void LexicalTable::load( const std::string &fileName )
continue;
}
double prob = Moses::Scan<double>( token[2] );
double prob = std::atof( token[2].c_str() );
WORD_ID wordT = vcbT.storeIfNew( token[0] );
WORD_ID wordS = vcbS.storeIfNew( token[1] );
ltable[ wordS ][ wordT ] = prob;
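A small, deterministic illustration of the rounding issue that the 0.00001 adjustment to minCount and minCountHierarchical guards against (toy values, not from the source; uses C++11's std::nextafter):

#include <cmath>
#include <iostream>

int main()
{
  float minCount = 3.0f;
  // Largest representable float strictly below 3.0: a count that "should"
  // be exactly 3 can end up here after float accumulation.
  float count = std::nextafter(3.0f, 0.0f);

  std::cout << (count < minCount) << "\n";             // 1: would be dropped
  std::cout << (count < minCount - 0.00001f) << "\n";  // 0: kept after the nudge
  return 0;
}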

View File

@ -137,38 +137,38 @@ sub run_transliteration
print "Filter Table\n";
`$MOSES_SRC/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' \
-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \
-config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \
`$MOSES_SRC/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' \\
-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \\
-config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\
-lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`;
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \
$TRANSLIT_MODEL/evaluation/$eval_file.filtered \
$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \
$TRANSLIT_MODEL/evaluation/$eval_file \
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \\
$TRANSLIT_MODEL/evaluation/$eval_file.filtered \\
$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\
$TRANSLIT_MODEL/evaluation/$eval_file \\
-Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`;
print "Apply Filter\n";
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \
$TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \
$TRANSLIT_MODEL/model/moses.ini \
$TRANSLIT_MODEL/tuning/moses.tuned.ini \
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\
$TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \\
$TRANSLIT_MODEL/model/moses.ini \\
$TRANSLIT_MODEL/tuning/moses.tuned.ini \\
$TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`;
my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null";
`$DECODER \
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
-threads 16 -drop-unknown -distortion-limit 0 \
-n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \
distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \
< $TRANSLIT_MODEL/evaluation/$eval_file \
`$DECODER \\
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
-threads 16 -drop-unknown -distortion-limit 0 \\
-n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \\
distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \\
< $TRANSLIT_MODEL/evaluation/$eval_file \\
> $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`;
}
@ -315,52 +315,52 @@ sub run_decoder
`mkdir $corpus_dir/evaluation`;
`$MOSES_SRC/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-lmodel-oov-feature "yes" -post-decoding-translit "yes" \
-phrase-translation-table $corpus_dir/model/phrase-table \
`$MOSES_SRC/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-lmodel-oov-feature "yes" -post-decoding-translit "yes" \\
-phrase-translation-table $corpus_dir/model/phrase-table \\
-config $corpus_dir/model/moses.ini -lm 0:5:$LM_FILE:8`;
`touch $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`;
`$MOSES_SRC/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-lmodel-oov-feature "yes" -post-decoding-translit "yes" \
-phrase-translation-table $corpus_dir/model/phrase-table \
-config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \
`$MOSES_SRC/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-lmodel-oov-feature "yes" -post-decoding-translit "yes" \\
-phrase-translation-table $corpus_dir/model/phrase-table \\
-config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \\
-lm 0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`;
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \
$corpus_dir/evaluation/filtered \
$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \
$INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \\
$corpus_dir/evaluation/filtered \\
$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \\
$INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \\
1 1 4 100 2"`;
`rm $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`;
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \
$corpus_dir/evaluation/filtered/moses.ini \
< $corpus_dir/model/moses.ini \
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \\
$corpus_dir/evaluation/filtered/moses.ini \\
< $corpus_dir/model/moses.ini \\
> $corpus_dir/evaluation/moses.filtered.ini`;
my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null";
`$DECODER \
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \
-max-trans-opt-per-coverage 100 \
-f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \
< $INPUT_FILE \
`$DECODER \\
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \\
-max-trans-opt-per-coverage 100 \\
-f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \\
< $INPUT_FILE \\
> $OUTPUT_FILE $drop_stderr`;
print "$DECODER \
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \
-max-trans-opt-per-coverage 100 \
-f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \
< $INPUT_FILE \
print "$DECODER \\
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \\
-max-trans-opt-per-coverage 100 \\
-f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \\
< $INPUT_FILE \\
> $OUTPUT_FILE $drop_stderr\n";
}

View File

@ -103,34 +103,34 @@ sub run_transliteration
print STDERR "Filter Table\n";
`$MOSES_SRC/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-reordering msd-bidirectional-fe -score-options '--KneserNey' \
-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \
-reordering-table $TRANSLIT_MODEL/model/reordering-table \
-config $eval_file.moses.table.ini \
`$MOSES_SRC/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-reordering msd-bidirectional-fe -score-options '--KneserNey' \\
-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \\
-reordering-table $TRANSLIT_MODEL/model/reordering-table \\
-config $eval_file.moses.table.ini \\
-lm 0:3:$eval_file.moses.table.ini:8`;
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \
$eval_file.filtered $eval_file.moses.table.ini $eval_file \
`$MOSES_SRC/scripts/training/filter-model-given-input.pl \\
$eval_file.filtered $eval_file.moses.table.ini $eval_file \\
-Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $eval_file.moses.table.ini`;
print STDERR "Apply Filter\n";
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \
$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini \
`$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\
$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini \\
$TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini`;
`$MOSES_SRC/bin/moses \
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
-threads 16 -drop-unknown -distortion-limit 0 \
-n-best-list $eval_file.op.nBest 50 \
-f $eval_file.filtered.ini \
< $eval_file \
`$MOSES_SRC/bin/moses \\
-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\
-threads 16 -drop-unknown -distortion-limit 0 \\
-n-best-list $eval_file.op.nBest 50 \\
-f $eval_file.filtered.ini \\
< $eval_file \\
> $eval_file.op`;
}

View File

@ -118,80 +118,80 @@ sub learn_transliteration_model{
print "Align Corpus\n";
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -last-step 1 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -last-step 1 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t \\
-corpus-dir $OUT_DIR/training/prepared`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 \
-dont-zip -first-step 2 -last-step 2 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \
`$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 \\
-dont-zip -first-step 2 -last-step 2 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \\
-giza-e2f $OUT_DIR/training/giza -direction 2`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \\
-giza-f2e $OUT_DIR/training/giza-inverse -direction 1`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza \
-giza-f2e $OUT_DIR/training/giza-inverse \
-alignment-file $OUT_DIR/model/aligned \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza \\
-giza-f2e $OUT_DIR/training/giza-inverse \\
-alignment-file $OUT_DIR/model/aligned \\
-alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and`;
print "Train Translation Models\n";
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex \
-alignment-file $OUT_DIR/model/aligned \
-alignment-stem $OUT_DIR/model/aligned \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex \\
-alignment-file $OUT_DIR/model/aligned \\
-alignment-stem $OUT_DIR/model/aligned \\
-corpus $OUT_DIR/training/corpus$t`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned \
-alignment-stem $OUT_DIR/model/aligned -extract-file \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned \\
-alignment-stem $OUT_DIR/model/aligned -extract-file \\
$OUT_DIR/model/extract -corpus $OUT_DIR/training/corpus$t`;
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' -extract-file $OUT_DIR/model/extract \
-lexical-file $OUT_DIR/model/lex -phrase-translation-table \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' -extract-file $OUT_DIR/model/extract \\
-lexical-file $OUT_DIR/model/lex -phrase-translation-table \\
$OUT_DIR/model/phrase-table`;
print "Train Language Models\n";
`$SRILM_DIR/ngram-count \
-order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk \
`$SRILM_DIR/ngram-count \\
-order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk \\
-text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`;
`$MOSES_SRC_DIR/bin/build_binary \
`$MOSES_SRC_DIR/bin/build_binary \\
$OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`;
print "Create Config File\n";
`$MOSES_SRC_DIR/scripts/training/train-model.perl \
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \
-score-options '--KneserNey' \
-phrase-translation-table $OUT_DIR/model/phrase-table \
`$MOSES_SRC_DIR/scripts/training/train-model.perl \\
-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\
-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\
-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\
-score-options '--KneserNey' \\
-phrase-translation-table $OUT_DIR/model/phrase-table \\
-config $OUT_DIR/model/moses.ini -lm 0:5:$OUT_DIR/lm/targetLM.bin:8`;
}

View File

@ -194,21 +194,19 @@ raw-corpus = $toy-data/nc-5k.$output-extension
[LM:bilingual-lm]
#bilingual-lm
exclude-from-interpolation = true
#required settings
bilingual-lm = "yes"
bilingual-lm-workdir = "bilingual"
bilingual-lm-settings = ""
order = "5"
source-window = "4"
nplm-dir = "/mnt/gna0/rsennrich/tools/nplm-0.3-gpu-experimental/"
#actual training
train_order = "14" #this is equal to order + 2*source-window + 1
nplm-output-dir = "nplm_out"
nplm-settings = "-l /mnt/gna0/rsennrich/tools/nplm-0.3-gpu-experimental/"
# Add extra settings for ngram extraction or nplm training
#bilingual-lm-settings = ""
#nplm-settings = ""
#Config file generation:
config-feature-line = "BilingualNPLM order=$order source_window=$source-window path=$working-dir/$nplm-output-dir/train.10k.model.nplm.10 source_vocab=$working-dir/$bilingual-lm-workdir/vocab.source target_vocab=$working-dir/$bilingual-lm-workdir/vocab.target"
config-weight-line = "BilingualNPLM0= 0.1"
# Defaults to 10
#epochs = 2
#################################################################
# INTERPOLATING LANGUAGE MODELS

View File

@ -61,6 +61,7 @@ factorize
rerun-on-change: TRAINING:input-factors TRAINING:output-factors
default-name: corpus/factored
pass-unless: TRAINING:input-factors
pass-if: factorize-after-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
@ -112,6 +113,15 @@ post-split-clean-syntax
pass-unless: input-splitter output-splitter
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 10000 OUT.lines-retained --ignore-xml
error: there is a blank factor
post-split-factorize
in: clean-split-stem
out: post-split-factorized-stem
rerun-on-change: TRAINING:input-factors TRAINING:output-factors
default-name: corpus/split-factored
pass-unless: AND TRAINING:input-factors factorize-after-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
[RECASING] single
tokenize
@ -160,20 +170,24 @@ train
ignore-if: no-splitter-training
[LM] multiple
prepare-bilingual-nplm
prepare-bilingual-lm
in: TRAINING:corpus TRAINING:word-alignment
out: numberized_ngrams
ignore-unless: bilingual-lm
rerun-on-change: TRAINING:corpus TRAINING:word-alignment
template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings
default-name: lm/bilingualLM_prep
default-name: lm/blm
train-bilingual-lm
in: numberized_ngrams TRAINING:corpus
out: binlm
ignore-unless: bilingual-lm
rerun-on-change: numberized_ngrams
template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings
default-name: lm/bilingualLM
default-name: lm/blm
train-nplm
in: stripped-corpus
out: binlm
ignore-unless: nplm
rerun-on-change: stripped-corpus
default-name: lm/nplm
get-corpus
in: get-corpus-script
out: raw-corpus
@ -207,6 +221,7 @@ factorize
out: factorized-corpus
default-name: lm/factored
pass-unless: factors
pass-if: factorize-after-split
ignore-if: concatenate-files concatenate-files-split
parallelizable: yes
error: can't open
@ -238,8 +253,17 @@ split
pass-unless: output-splitter
ignore-if: concatenate-files concatenate-files-split
template: $output-splitter -model IN1.$output-extension < IN > OUT
strip
post-split-factorize
in: split-corpus
out: split-factorized-corpus
default-name: lm/split-factored
pass-unless: AND factors factorize-after-split
ignore-if: concatenate-files concatenate-files-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
strip
in: split-factorized-corpus
out: stripped-corpus
default-name: lm/stripped
pass-unless: mock-output-parser-lm
@ -261,7 +285,7 @@ train
in: stripped-corpus
out: lm
default-name: lm/lm
ignore-if: rlm-training custom-training bilingual-lm
ignore-if: rlm-training custom-training bilingual-lm nplm
rerun-on-change: lm-training order settings
template: $lm-training -order $order $settings -text IN -lm OUT
error: cannot execute binary file
@ -278,7 +302,7 @@ train-custom
template: $custom-training -text IN -lm OUT
final-model: yes
train-custom-syntax
in: split-corpus
in: split-factorized-corpus
out: binlm
default-name: lm/custom-lm
rerun-on-change: custom-training
@ -337,6 +361,7 @@ factorize-tuning
out: factorized-tuning
default-name: lm/interpolate-tuning.factored
pass-unless: TRAINING:output-factors
pass-if: factorize-after-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
@ -361,8 +386,16 @@ split-tuning
default-name: lm/interpolate-tuning.split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
post-split-factorize-tuning
in: split-tuning
out: post-split-factorized-tuning
default-name: lm/interpolate-tuning.split-factored
pass-unless: AND TRAINING:output-factors factorize-after-split
parallelizable: yes
error: can't open
error: incompatible number of words in factor
strip-tuning
in: split-tuning
in: post-split-factorized-tuning
out: stripped-tuning
default-name: lm/interpolate-tuning.stripped
pass-unless: mock-output-parser-lm
@ -490,12 +523,12 @@ train-in-mono
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
[TRAINING] single
consolidate
in: CORPUS:clean-split-stem
in: CORPUS:post-split-factorized-stem
out: corpus
default-name: corpus
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
build-domains
in: CORPUS:clean-split-stem
in: CORPUS:post-split-factorized-stem
out: domains
default-name: model/domains
ignore-unless: domain-features mml-filter-corpora
@ -523,14 +556,14 @@ fast-align
in: prepared-data-fast-align
out: fast-alignment
rerun-on-change: fast-align-settings
ignore-if: fast-align-max-lines
ignore-if: fast-align-max-lines fast-align-save-model
template: $external-bin-dir/fast_align -i IN $fast-align-settings > OUT
default-name: fast-align
fast-align-inverse
in: prepared-data-fast-align
out: fast-alignment-inverse
rerun-on-change: fast-align-settings
ignore-if: fast-align-max-lines
ignore-if: fast-align-max-lines fast-align-save-model
template: $external-bin-dir/fast_align -i IN -r $fast-align-settings > OUT
default-name: fast-align-inverse
fast-align-in-parts
@ -539,7 +572,7 @@ fast-align-in-parts
rerun-on-change: fast-align-settings fast-align-max-lines
ignore-unless: fast-align-max-lines
tmp-name: training/tmp.fast-align
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' -save-model '$?fast-align-save-model' -o OUT
default-name: fast-align
fast-align-in-parts-inverse
in: prepared-data-fast-align
@ -547,8 +580,24 @@ fast-align-in-parts-inverse
rerun-on-change: fast-align-settings fast-align-max-lines
ignore-unless: fast-align-max-lines
tmp-name: training/tmp.fast-align-inverse
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' -save-model '$?fast-align-save-model' -o OUT
default-name: fast-align-inverse
fast-align-save-model
in: prepared-data-fast-align
out: fast-alignment
ignore-unless: fast-align-save-model
ignore-if: fast-align-max-lines
default-name: fast-align
tmp-name: training/tmp.fast-align
template: $external-bin-dir/fast_align -i IN $fast-align-settings -p OUT.parameters > OUT 2> OUT.log
fast-align-save-model-inverse
in: prepared-data-fast-align
out: fast-alignment-inverse
ignore-unless: fast-align-save-model
ignore-if: fast-align-max-lines
default-name: fast-align-inverse
tmp-name: training/tmp.fast-align-inverse
template: $external-bin-dir/fast_align -r -i IN $fast-align-settings -p OUT.parameters > OUT 2> OUT.log
symmetrize-fast-align
in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus
out: word-alignment
@ -616,7 +665,7 @@ build-biconcor
final-model: yes
build-suffix-array
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: phrase-translation-table
out: sigtest-filter-phrase-translation-table
default-name: model/suffix-array
ignore-unless: suffix-array
error: usage
@ -688,11 +737,18 @@ build-ttable
final-model: yes
build-mmsapt
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: phrase-translation-table
out: sigtest-filter-phrase-translation-table
ignore-unless: mmsapt
default-name: model/phrase-table-mmsapt
template: $moses-script-dir/training/build-mmsapt.perl --alignment IN.$alignment-symmetrization-method --corpus IN1 --f $input-extension --e $output-extension --dir OUT --settings '$mmsapt'
final-model: yes
custom-phrase-table-pruning
in: phrase-translation-table
out: sigtest-filter-phrase-translation-table
ignore-unless: custom-phrase-table-pruning
ignore-if: mmsapt
template: $custom-phrase-table-pruning IN OUT
default-name: model/phrase-table-pruned
sigtest-filter-suffix-array
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: sigtest-filter-suffix-array
@ -714,7 +770,7 @@ sigtest-filter-ttable
out: sigtest-filter-phrase-translation-table
default-name: model/phrase-table-sigtest-filter
pass-unless: sigtest-filter
ignore-if: TRAINING:config
ignore-if: TRAINING:config custom-phrase-table-pruning
final-model: yes
sigtest-filter-reordering
in: reordering-table sigtest-filter-suffix-array
@ -761,6 +817,7 @@ create-config
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature
default-name: model/moses.ini
error: Unknown option
error: requires an argument
final-model: yes
binarize-config
in: config

View File

@ -853,7 +853,7 @@ sub delete_output {
`rm -r $file` if $EXECUTE;
}
# delete regular file that matches exactly
if (-e $file) {
elsif (-e $file) {
print "\tdelete file $file\n";
`rm $file` if $EXECUTE;
}
@ -864,14 +864,14 @@ sub delete_output {
foreach (`ls $dir`) {
chop;
next unless substr($_,0,length($f)) eq $f;
if (-e "$dir/$_") {
if (-d "$dir/$_") {
print "\tdelete directory $file\n";
`rm -r $dir/$_` if $EXECUTE;
}
elsif (-e "$dir/$_") {
print "\tdelete file $dir/$_\n";
`rm $dir/$_` if $EXECUTE;
}
else {
print "\tdelete directory $dir/$_\n";
`rm -r $dir/$_` if $EXECUTE;
}
}
}
@ -1119,13 +1119,13 @@ sub define_step {
next if $RE_USE[$i];
next if defined($PASS{$i});
next if &define_template($i);
if ($DO_STEP[$i] =~ /^CORPUS:(.+):factorize$/) {
if ($DO_STEP[$i] =~ /^CORPUS:(.+):(post-split-)?factorize$/) {
&define_corpus_factorize($i);
}
elsif ($DO_STEP[$i] eq 'SPLITTER:train') {
&define_splitter_train($i);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):factorize$/) {
elsif ($DO_STEP[$i] =~ /^LM:(.+):(post-split-)?factorize$/) {
&define_lm_factorize($i,$1);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):randomize$/ ||
@ -1135,6 +1135,15 @@ sub define_step {
elsif ($DO_STEP[$i] =~ /^LM:(.+):train-randomized$/) {
&define_lm_train_randomized($i,$1);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):train-bilingual-lm$/) {
&define_lm_train_bilingual_lm($i,$1);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):prepare-bilingual-lm$/) {
&define_lm_prepare_bilingual_lm($i,$1);
}
elsif ($DO_STEP[$i] =~ /^LM:(.+):train-nplm$/) {
&define_lm_train_nplm($i,$1);
}
elsif ($DO_STEP[$i] eq 'TRAINING:prepare-data') {
&define_training_prepare_data($i);
}
@ -1182,7 +1191,7 @@ sub define_step {
elsif ($DO_STEP[$i] eq 'TRAINING:create-config' || $DO_STEP[$i] eq 'TRAINING:create-config-interpolated-lm') {
&define_training_create_config($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:factorize-tuning') {
elsif ($DO_STEP[$i] =~ /^INTERPOLATED-LM:(post-split-)?factorize-tuning$/) {
&define_interpolated_lm_factorize_tuning($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:interpolate') {
@ -1777,6 +1786,95 @@ sub define_lm_train_randomized {
&create_step($step_id,$cmd);
}
sub define_lm_train_bilingual_lm {
my ($step_id,$set) = @_;
my ($working_dir, $ngrams, $corpus) = &get_output_and_input($step_id);
my $scripts = &check_backoff_and_get("LM:moses-script-dir");
my $cmd = "$scripts/training/bilingual-lm/train_nplm.py -w $working_dir -c $corpus -r $working_dir";
my $nplm_dir = &check_backoff_and_get("LM:$set:nplm-dir");
$cmd .= " -l $nplm_dir";
my ($n, $m, $total_order) = &get_bilingual_lm_order($set);
$cmd .= " -n $total_order";
my $epochs = &get_bilingual_lm_epochs($set);
$cmd .= " -e $epochs" if defined($epochs);
my $nplm_settings = &backoff_and_get("LM:$set:nplm-settings");
$cmd .= " $nplm_settings" if defined($nplm_settings);
# Create the ini file
$cmd .= "\n";
$cmd .= "$scripts/training/bilingual-lm/create_blm_ini.py -w $working_dir -n $n -m $m -x $set -e $epochs";
&create_step($step_id,$cmd);
}
sub define_lm_prepare_bilingual_lm {
my ($step_id,$set) = @_;
my ($working_dir, $corpus, $align) = &get_output_and_input($step_id);
my $scripts = &check_backoff_and_get("LM:moses-script-dir");
my $cmd = "$scripts/training/bilingual-lm/extract_training.py -w $working_dir -c $corpus";
my $input_extension = &check_backoff_and_get("GENERAL:input-extension");
my $output_extension = &check_backoff_and_get("GENERAL:output-extension");
$cmd .= " -e $output_extension -f $input_extension";
my $align_method = &check_backoff_and_get("TRAINING:alignment-symmetrization-method");
$cmd .= " -a $align.$align_method";
my ($n, $m, $total_order) = &get_bilingual_lm_order($set);
$cmd .= " -n $n -m $m";
my $bilingual_settings = &backoff_and_get("LM:$set:bilingual-lm-settings");
$cmd .= " $bilingual_settings" if defined($bilingual_settings);
&create_step($step_id,$cmd);
}
sub define_lm_train_nplm {
my ($step_id,$set) = @_;
my ($working_dir, $corpus) = &get_output_and_input($step_id);
my $scripts = &check_backoff_and_get("LM:moses-script-dir");
my $cmd = "$scripts/training/train-neurallm.py --mmap --working-dir $working_dir --corpus $corpus";
my $nplm_dir = &check_backoff_and_get("LM:$set:nplm-dir");
$cmd .= " --nplm-home $nplm_dir";
my $epochs = &backoff_and_get("LM:$set:epochs");
$epochs = 2 unless defined($epochs);
$cmd .= " --epochs $epochs";
my $nplm_settings = &backoff_and_get("LM:$set:nplm-settings");
$cmd .= " $nplm_settings" if defined($nplm_settings);
my $order = &backoff_and_get("LM:$set:order");
$order = 5 unless defined($order);
$cmd .= " --order $order";
# Create the ini file
$cmd .= "\n";
$cmd .= "$scripts/training/create_nplm_ini.py -w $working_dir -e $epochs -x $set -n $order";
&create_step($step_id,$cmd);
}
sub get_bilingual_lm_order {
my ($set) = @_;
my $order = &backoff_and_get("LM:$set:order");
$order = 5 unless defined ($order);
my $source_window = &backoff_and_get("LM:$set:source-window");
$source_window = 4 unless defined($source_window);
return ($order, $source_window, $order + 2*$source_window+1);
}
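# Background (cf. the removed "train_order" comment in the bilingual-lm
# config section): the joint n-gram spans the target history of length
# $order plus $source_window source words on either side of the aligned
# source word, hence order + 2*source-window + 1 in total.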
sub get_bilingual_lm_epochs {
my ($set) = @_;
my $epochs = &backoff_and_get("LM:$set:epochs");
$epochs = 10 unless defined($epochs);
return $epochs;
}
sub define_lm_randomize {
my ($step_id,$set_dummy) = @_;
@ -2548,7 +2646,8 @@ sub define_training_create_config {
}
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
my @additional_ini_files;
push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features;
my @LM_SETS = &get_sets("LM");
my %INTERPOLATED_AWAY;
@ -2599,8 +2698,11 @@ sub define_training_create_config {
if (&get("LM:$set:config-feature-line") && &get("LM:$set:config-weight-line")) {
$feature_lines .= &get("LM:$set:config-feature-line") . ";";
$weight_lines .= &get("LM:$set:config-weight-line") . ";";
}
else {
} elsif (&get("LM:$set:nplm")) {
push(@additional_ini_files, "$lm/nplm.ini");
} elsif (&get("LM:$set:bilingual-lm")) {
push(@additional_ini_files, "$lm/blm.ini");
} else {
my $order = &check_backoff_and_get("LM:$set:order");
my $lm_file = "$lm";
@ -2629,13 +2731,17 @@ sub define_training_create_config {
}
}
if (defined($feature_lines)) {
if ($feature_lines) {
$cmd .= "-config-add-feature-lines \"$feature_lines\" ";
}
if (defined($weight_lines)) {
if ($weight_lines) {
$cmd .= "-config-add-weight-lines \"$weight_lines\" ";
}
if (@additional_ini_files) {
$cmd .= "-additional-ini-file " . join(":", @additional_ini_files);
}
&create_step($step_id,$cmd);
}
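The colon-joined list built here is unpacked again by train-model.perl's create_ini, which this commit extends with a matching split on ':' (see below). A small round-trip sketch with hypothetical paths:
my @additional_ini_files = ("lm3/nplm.ini", "lm4/blm.ini");
my $arg = join(":", @additional_ini_files);   # "lm3/nplm.ini:lm4/blm.ini"
for my $aif (split /:/, $arg) {
    print "appending contents of $aif to moses.ini\n";
}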
@ -2795,7 +2901,8 @@ sub get_interpolated_lm_sets {
my $count=0;
my $icount=0;
foreach my $set (@LM_SETS) {
next if (&get("LM:$set:exclude-from-interpolation"));
next if (&get("LM:$set:exclude-from-interpolation")) or (&get("LM:$set:bilingual-lm"))
or (&get("LM:$set:nplm"));
my $order = &check_backoff_and_get("LM:$set:order");
my $factor = 0;
@ -2831,6 +2938,7 @@ sub get_training_setting {
my $pcfg = &get("TRAINING:use-pcfg-feature");
my $baseline_alignment = &get("TRAINING:baseline-alignment-model");
my $no_glue_grammar = &get("TRAINING:no-glue-grammar");
my $mmsapt = &get("TRAINING:mmsapt");
my $xml = $source_syntax || $target_syntax;
@ -2855,6 +2963,7 @@ sub get_training_setting {
$cmd .= "-parallel " if $parallel;
$cmd .= "-pcfg " if $pcfg;
$cmd .= "-baseline-alignment-model $baseline_alignment " if defined($baseline_alignment) && ($step == 1 || $step == 2);
$cmd .= "-mmsapt " if defined($mmsapt);
# factored training
if (&backoff_and_get("TRAINING:input-factors")) {
@ -3454,12 +3563,20 @@ sub define_template {
}
$cmd =~ s/VERSION/$VERSION/g;
print "\tcmd is $cmd\n" if $VERBOSE;
while ($cmd =~ /^([\S\s]*)\$\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
$cmd =~ /^([\S\s]*)\$([^\s\/\"\']+)([\S\s]*)$/) {
my ($pre,$variable,$post) = ($1,$2,$3);
$cmd = $pre
. &check_backoff_and_get(&extend_local_name($module,$set,$variable))
. $post;
# replace variables
while ($cmd =~ /^([\S\s]*)\$(\??)\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
$cmd =~ /^([\S\s]*)\$(\??)([^\s\/\"\']+)([\S\s]*)$/) {
my ($pre,$optional,$variable,$post) = ($1,$2,$3,$4);
my $value;
if ($optional eq '?') {
$value = &backoff_and_get(&extend_local_name($module,$set,$variable));
$value = "" unless $value;
}
else {
$value = &check_backoff_and_get(&extend_local_name($module,$set,$variable));
}
$cmd = $pre.$value.$post;
}
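A self-contained sketch of the substitution loop above, with a hypothetical template and a settings hash standing in for check_backoff_and_get: plain ${VAR} must resolve, while the new optional form $?{VAR} falls back to the empty string:
#!/usr/bin/env perl
use strict; use warnings;
my %CONFIG = ("corpus-stem" => "/data/europarl");   # "extra-flags" deliberately unset
my $cmd = 'train.perl -corpus ${corpus-stem} $?{extra-flags}';
while ($cmd =~ /^([\S\s]*)\$(\??)\{([^\s\/\"\']+)\}([\S\s]*)$/) {
    my ($pre,$optional,$variable,$post) = ($1,$2,$3,$4);
    my $value = $CONFIG{$variable};
    die "parameter $variable not defined" if !defined($value) && $optional ne '?';
    $cmd = $pre . (defined($value) ? $value : "") . $post;
}
print "$cmd\n";   # "train.perl -corpus /data/europarl "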
# deal with pipelined commands

View File

@ -12,28 +12,33 @@ use warnings;
use strict;
use Getopt::Long qw(:config pass_through no_ignore_case permute);
my ($BIN,$IN,$MAX_LINES,$SETTINGS,$REVERSE,$TMP);
my ($BIN,$IN,$OUT,$MAX_LINES,$SETTINGS,$REVERSE,$SAVE_MODEL,$TMP);
GetOptions('bin=s' => \$BIN,
'i=s' => \$IN,
'o=s' => \$OUT,
'max-lines=i' => \$MAX_LINES,
'settings=s' => \$SETTINGS,
'save-model=s' => \$SAVE_MODEL,
'r' => \$REVERSE,
'tmp=s' => \$TMP,
) or exit(1);
die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR")
unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP) && defined($MAX_LINES)
die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR [-save-model MODEL] -o ALIGNMENTS")
unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP)
&& defined($MAX_LINES) && defined($OUT)
&& $MAX_LINES > 0;
die("ERROR - input file does not exist: $IN") unless -e $IN;
die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN;
$SAVE_MODEL = defined($SAVE_MODEL) && $SAVE_MODEL && $SAVE_MODEL ne 'no';
chomp(my $line_count = `cat $IN | wc -l`);
# not more than the maximal number of lines -> just run it regularly
if ($MAX_LINES > $line_count) {
my $cmd = "$BIN -i $IN $SETTINGS";
my $cmd = "$BIN -i $IN $SETTINGS >$OUT";
$cmd .= " -r" if defined($REVERSE);
$cmd .= " -p $OUT.parameters 2> $OUT.log" if $SAVE_MODEL;
safesystem($cmd) or die;
exit(0);
}
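A hypothetical invocation of the extended wrapper, now that the alignments go to -o and the model can be kept:
fast-align-in-parts.perl -bin ~/bin/fast_align -i corpus.fr-en \
  -max-lines 1000000 -settings '-d -o -v' -tmp /tmp/fa.$$ \
  -save-model yes -o corpus.align
(All paths and the fast_align settings string are placeholders.)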
@ -56,6 +61,7 @@ foreach my $input_file (@INPUT_FILES) {
# process part
my $cmd = "$BIN -i $input_file $SETTINGS";
$cmd .= " -r" if defined($REVERSE);
$cmd .= " -p $output_file.parameters 2> $output_file.log" if $SAVE_MODEL;
$cmd .= " >$output_file";
safesystem($cmd) or die;
die("ERROR: no output produced from command $cmd") unless -e $output_file;
@ -67,12 +73,63 @@ foreach my $input_file (@INPUT_FILES) {
}
# join output
$cmd = "cat $TMP/aligned-*";
$cmd = "cat $TMP/aligned-?? > $OUT";
safesystem($cmd) or die;
$cmd = "rm -r $TMP/* ; rmdir $TMP";
# join model
&join_model(scalar @INPUT_FILES) if $SAVE_MODEL;
&join_log(scalar @INPUT_FILES) if $SAVE_MODEL;
$cmd = "rm $TMP/* ; rmdir $TMP";
safesystem($cmd);
sub join_model {
my ($count) = @_;
open(CONCAT,"cat $TMP/aligned-*.parameters | LC_ALL=C sort -T $TMP -S 10%|");
open(JOINED,">$OUT.parameters");
my ($last_f,$last_e,$f,$e,$score,$merged_score);
while(<CONCAT>) {
($f,$e,$score) = split;
if (!defined($last_f) || $f ne $last_f || $e ne $last_e) {
printf JOINED "%s %s %f\n",$last_f,$last_e,log($merged_score) if defined($last_f);
$last_f = $f;
$last_e = $e;
$merged_score = 0;
}
$merged_score += exp($score)/$count;
}
printf JOINED "%s %s %f\n",$f,$e,log($merged_score) if defined($f);
close(CONCAT);
close(JOINED);
}
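join_model averages the per-part parameters in probability space rather than log space: each part contributes exp(score)/count, and only the sum is logged again. A minimal sketch with made-up numbers:
use List::Util qw(sum);
my $count  = 2;              # number of corpus parts (hypothetical)
my @scores = (-1.2, -0.8);   # log probs of one (f,e) pair, one per part
my $merged = log( sum(map { exp($_) } @scores) / $count );
printf "%f\n", $merged;      # log((e^-1.2 + e^-0.8)/2), about -0.98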
sub merge_entry {
my ($count,$f,$e,@SCORE) = @_;
my $score = 0;
foreach (@SCORE) {
$score += exp($_)/$count;
}
$score = log($score);
print JOINED "$f $e $score\n";
}
sub join_log {
my ($count) = @_;
open(CONCAT,"cat $TMP/aligned-*.log |");
my ($length,$tension,$tension_count) = (0,0,0);
while(<CONCAT>) {
$length += $1 if /expected target length = source length \* ([\d\.]+)/;
$tension += $1 if /final tension: ([\d\.]+)/ and (++$tension_count % 3 == 0);
}
close(CONCAT);
$length /= $count;
$tension /= $count;
open(JOINED,">$OUT.log");
print JOINED "expected target length = source length * $length\n";
print JOINED " final tension: $tension\n";
close(JOINED);
}
sub safesystem {
print STDERR "Executing: @_\n";
system(@_);

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
import argparse
import os
import os.path
import sys
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-n", "--target-context", dest="n")
parser.add_argument("-m", "--source-context", dest="m")
parser.add_argument("-i", "--ini_filename", dest="ini_filename")
parser.add_argument("-x", "--name", dest="name")
parser.add_argument("-e", "--epochs", dest="epochs")
parser.set_defaults(
working_dir="working",
n = "5",
m = "4",
ini_filename = "blm.ini",
name = "comb",
epochs = "10"
)
options = parser.parse_args()
if not os.path.exists(options.working_dir):
os.makedirs(options.working_dir)
# Bit of a hack, parse the working directory to get the name
name = os.path.basename(options.working_dir).split(".")[0].split("-")[-1]
ini_filename = os.path.join(options.working_dir,options.ini_filename)
with open(ini_filename,"w") as ifh:
print>>ifh, "[feature]"
print>>ifh,"BilingualNPLM name=BLM%s order=%s source_window=%s path=%s/train.10k.model.nplm.%s source_vocab=%s/vocab.source target_vocab=%s/vocab.target" \
% (options.name,options.n, options.m, options.working_dir, options.epochs, options.working_dir, options.working_dir)
print>>ifh
print>>ifh,"[weight]"
print>>ifh,"BLM%s= 0.1" % options.name
print>>ifh
if __name__ == "__main__":
main()
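With the defaults above (name comb, n 5, m 4, epochs 10) and a hypothetical working directory lm5, the emitted blm.ini would read:
[feature]
BilingualNPLM name=BLMcomb order=5 source_window=4 path=lm5/train.10k.model.nplm.10 source_vocab=lm5/vocab.source target_vocab=lm5/vocab.target
[weight]
BLMcomb= 0.1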

View File

@ -37,9 +37,7 @@ while (defined $_) {
$nr++;
print STDERR "." if $nr % 10000 == 0;
print STDERR "($nr)" if $nr % 100000 == 0;
chomp;
s/\s+/ /g; s/^ //; s/ $//;
my @intokens = split / /;
my ($intokens,$MARKUP) = split_xml($_);
# load lines of corresponding streams and ensure equal number of words
my @lines_of_extratoks;
foreach my $factor (0..$#streams) {
@ -49,14 +47,17 @@ while (defined $_) {
chomp($line);
$line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
my @toks = split / /, $line;
die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#intokens)"
if $#toks != $#intokens;
die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#$intokens)"
if $#toks != $#$intokens;
$lines_of_extratoks[$factor] = \@toks;
}
# for every token, print the factors in the order the user wished
for(my $i=0; $i<=$#intokens; $i++) {
my $token = $intokens[$i];
for(my $i=0; $i<=$#$intokens; $i++) {
print " " if $i && $$MARKUP[$i] eq '';
print $$MARKUP[$i];
my $token = $$intokens[$i];
my @outtoken = ();
push @outtoken, $token; # add the first one
# print STDERR "Token: $token\n";
@ -69,11 +70,56 @@ while (defined $_) {
print " " if $i != 0;
print join("|", @outtoken);
}
print $$MARKUP[$#$MARKUP];
print "\n";
$_ = readline($firststream);
}
close $firststream;
print STDERR "Done.\n";
# store away xml markup
sub split_xml {
my ($line) = @_;
my (@WORD,@MARKUP);
my $i = 0;
$MARKUP[0] = "";
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
my $potential_xml = $1;
my $line_next = $2;
# exception for factor that is an XML tag
if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
$WORD[$i-1] .= $potential_xml;
if ($line_next =~ /^(\|+)(.*)$/) {
$WORD[$i-1] .= $1;
$line_next = $2;
}
}
else {
$MARKUP[$i] .= $potential_xml." ";
}
$line = $line_next;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
# '<' or '>' occurs in word, but it's not an XML tag
elsif ($line =~ /^\s*(\S+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
else {
die("ERROR: huh? $line\n");
}
}
chop($MARKUP[$#MARKUP]);
return (\@WORD,\@MARKUP);
}
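A usage sketch of split_xml on a hypothetical input line; @MARKUP carries the markup preceding each word, plus one trailing slot for markup after the last word:
my ($words, $markup) = split_xml('<seg id="1"> hello world </seg>');
# $words  : ( 'hello', 'world' )
# $markup : ( '<seg id="1"> ', '', '</seg>' )
# the main loop prints $$markup[$i] before word $i and $$markup[$#$markup] at the end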

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
import argparse
import os
import os.path
import sys
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-n", "--order", dest="n")
parser.add_argument("-i", "--ini_filename", dest="ini_filename")
parser.add_argument("-x", "--name", dest="name")
parser.add_argument("-e", "--epochs", dest="epochs")
parser.add_argument("-f", "--factor", dest="factor")
parser.set_defaults(
working_dir="working",
n = "5",
ini_filename = "nplm.ini",
name = "neural",
epochs = "10",
factor = "0"
)
options = parser.parse_args()
if not os.path.exists(options.working_dir):
os.makedirs(options.working_dir)
ini_filename = os.path.join(options.working_dir,options.ini_filename)
with open(ini_filename,"w") as ifh:
print>>ifh, "[feature]"
print>>ifh,"NeuralLM factor=%s name=NPLM%s order=%s path=%s/train.model.nplm.%s" \
% (options.factor,options.name, options.n, options.working_dir, options.epochs)
print>>ifh
print>>ifh,"[weight]"
print>>ifh,"NPLM%s= 0.1" % options.name
print>>ifh
if __name__ == "__main__":
main()
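Analogously to blm.ini above, the defaults here (name neural, order 5, epochs 10, factor 0) with a hypothetical working directory lm3 would yield:
[feature]
NeuralLM factor=0 name=NPLMneural order=5 path=lm3/train.model.nplm.10
[weight]
NPLMneural= 0.1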

View File

@ -37,6 +37,12 @@ my $MAX_LENGTH = 10;
# utilities
my $ZCAT = "gzip -cd";
# sometimes you just have to do the right thing without asking
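# (i.e. probe whether this sort supports --compress-program, and gzip its temp files if so)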
my $sort_option = "";
if (`echo 'youcandoit' | sort --compress-program gzip 2>/dev/null` =~ /youcandoit/) {
$sort_option = "--compress-program gzip ";
}
# get optional parameters
my $opt_hierarchical = 0;
my $binarizer = undef;
@ -410,13 +416,13 @@ for(my $i=0;$i<=$#TABLE;$i++) {
# ... phrase translation model
elsif ($binarizer =~ /processPhraseTableMin/) {
#compact phrase table
my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted";
my $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $binarizer -in $mid_file.sorted.gz -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted.gz";
safesystem($cmd) or die "Can't binarize";
} elsif ($binarizer =~ /CreateOnDiskPt/) {
my $cmd = "$binarizer $mid_file $new_file.bin";
safesystem($cmd) or die "Can't binarize";
} else {
my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
my $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
safesystem($cmd) or die "Can't binarize";
}
}
@ -431,7 +437,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
$lexbin =~ s/PhraseTable/LexicalTable/;
my $cmd;
if ($lexbin =~ /processLexicalTableMin/) {
$cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted";
$cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $lexbin -in $mid_file.sorted.gz -out $new_file -threads $threads && rm $mid_file.sorted.gz";
} else {
$lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
$cmd = "$lexbin -in $mid_file -out $new_file";

View File

@ -89,11 +89,6 @@ def create_parser():
help=(
"Sentence end symbol. Will be skipped during extraction "
"(default: %(default)s)"))
parser.add_argument(
'--ptkvz', action='store_true',
help=(
"Special rule for German dependency trees: "
"concatenate separable verb prefix and verb."))
return parser
@ -107,22 +102,15 @@ def escape_text(s):
return s
def get_head(xml, add_ptkvz):
def get_head(xml):
"""Deterministic heuristic to get head of subtree."""
head = None
preterminal = None
for child in xml:
if not len(child):
if head is not None:
continue
preterminal = child.get('label')
head = escape_text(child.text.strip())
elif add_ptkvz and head and child.get('label') == 'avz':
for grandchild in child:
if grandchild.get('label') == 'PTKVZ':
head = escape_text(grandchild.text.strip()) + head
break
return head, preterminal
return head, preterminal
@ -159,7 +147,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab,
parent_labels = (
[vocab.get('<root_label>', 0)] * options.up_context)
head, preterminal = get_head(xml, options.ptkvz)
head, preterminal = get_head(xml)
if not head:
head = '<dummy_head>'
preterminal = head
@ -222,7 +210,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab,
preterminal_child = head_child
child_label = '<head_label>'
else:
head_child, preterminal_child = get_head(child, options.ptkvz)
head_child, preterminal_child = get_head(child)
child_label = child.get('label')
if head_child is None:

View File

@ -46,11 +46,6 @@ def create_parser():
parser.add_argument(
'--output', '-o', type=str, default='vocab', metavar='PREFIX',
help="Output prefix (default: 'vocab')")
parser.add_argument(
'--ptkvz', action="store_true",
help=(
"Special rule for German dependency trees: attach separable "
"verb prefixes to verb."))
return parser
@ -70,16 +65,9 @@ def get_head(xml, args):
preterminal = None
for child in xml:
if not len(child):
if head is not None:
continue
preterminal = child.get('label')
head = escape_text(child.text.strip())
elif args.ptkvz and head and child.get('label') == 'avz':
for grandchild in child:
if grandchild.get('label') == 'PTKVZ':
head = escape_text(grandchild.text.strip()) + head
break
return head, preterminal
return head, preterminal

View File

@ -1604,6 +1604,7 @@ sub extract_phrase {
$cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
$cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
$cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE;
$cmd .= " --NoTTable" if $_MMSAPT;
map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
print STDERR "$cmd\n";
@ -1611,12 +1612,16 @@ sub extract_phrase {
if (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) {
print STDERR "merging with baseline extract from $_BASELINE_EXTRACT\n";
safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | $GZIP_EXEC > $extract_file.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | $GZIP_EXEC > $extract_file.inv.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | $GZIP_EXEC > $extract_file.gz")
if -e "$extract_file$suffix.gz";
safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | $GZIP_EXEC > $extract_file.inv.gz")
if -e "$extract_file$suffix.inv.gz";
safesystem("$ZCAT $_BASELINE_EXTRACT.o.gz $extract_file$suffix.o.gz | $GZIP_EXEC > $extract_file.o.gz")
if -e "$extract_file$suffix.o.gz";
safesystem("rm $extract_file$suffix.gz");
safesystem("rm $extract_file$suffix.inv.gz");
safesystem("rm $extract_file$suffix.gz")
if -e "$extract_file$suffix.gz";
safesystem("rm $extract_file$suffix.inv.gz")
if -e "$extract_file$suffix.inv.gz";
safesystem("rm $extract_file$suffix.o.gz")
if -e "$extract_file$suffix.o.gz";
}
@ -2343,7 +2348,9 @@ sub create_ini {
}
if ($_ADDITIONAL_INI_FILE) {
print INI "\n# additional settings\n\n";
print INI `cat $_ADDITIONAL_INI_FILE`;
for my $AIF (split (/:/, $_ADDITIONAL_INI_FILE)) {
print INI `cat $AIF`;
}
}
# feature functions and weights

View File

@ -122,10 +122,11 @@ def main(options):
if options.output_dir is None:
options.output_dir = options.working_dir
else:
# Create output dir if necessary
if not os.path.exists(options.output_dir):
os.makedirs(options.output_dir)
# Create dirs if necessary
if not os.path.exists(options.working_dir):
os.makedirs(options.working_dir)
if not os.path.exists(options.output_dir):
os.makedirs(options.output_dir)
numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
train_file = numberized_file