This commit is contained in:
Ulrich Germann 2015-09-01 23:42:29 +01:00
commit 764780ea26
459 changed files with 12395 additions and 12732 deletions

View File

@ -21,6 +21,9 @@ mingw/MosesGUI/icons_rc.py
mingw/MosesGUI/Ui_credits.py
mingw/MosesGUI/Ui_mainWindow.py
moses/TranslationModel/UG
moses/server
moses/parameters
moses/thread_safe_container.h
phrase-extract/pcfg-common
phrase-extract/syntax-common
randlm
@ -32,3 +35,4 @@ srilm
util
xmlrpc-c
.git
util/ug_cache_with_timeout.h

10
Jamroot
View File

@ -108,7 +108,7 @@ external-lib z ;
#lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
#requirements += <library>dl ;
#requirements += <cxxflags>-std=c++0x ;
if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] {
if [ option.get "full-tcmalloc" : : "yes" ] {
@ -133,7 +133,9 @@ if [ option.get "filter-warnings" : : "yes" ] {
requirements += <cxxflags>-Wno-unused-but-set-variable ;
requirements += <cxxflags>-Wno-unused-result ;
requirements += <cxxflags>-Wno-unused-variable ;
requirements += <cxxflags>-Wcomment ;
requirements += <cxxflags>-Wno-comment ;
requirements += <cxxflags>-Wno-strict-aliasing ;
requirements += <cxxflags>-Wno-overloaded-virtual ;
}
if [ option.get "debug-build" : : "yes" ] {
@ -179,7 +181,7 @@ if [ option.get "with-icu" : : "yes" ]
requirements += <library>icui18n/<link>shared ;
requirements += <cxxflags>-fPIC ;
requirements += <address-model>64 ;
requirements += <runtime-link>shared ;
# requirements += <runtime-link>shared ;
}
if [ option.get "with-probing-pt" : : "yes" ]
@ -301,5 +303,5 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
#local temp = [ _shell "bash source ./s.sh" ] ;
local temp = [ _shell "mkdir -p $(TOP)/bin" ] ;
local temp = [ _shell "rm $(TOP)/bin/moses_chart" ] ;
local temp = [ _shell "rm -f $(TOP)/bin/moses_chart" ] ;
local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ;

View File

@ -21,6 +21,11 @@ SuffixArray::SuffixArray()
m_wordInSentence(NULL),
m_sentence(NULL),
m_sentenceLength(NULL),
m_document(NULL),
m_documentName(NULL),
m_documentNameLength(0),
m_documentCount(0),
m_useDocument(false),
m_vcb(),
m_size(0),
m_sentenceCount(0) { }
@ -32,6 +37,8 @@ SuffixArray::~SuffixArray()
free(m_wordInSentence);
free(m_sentence);
free(m_sentenceLength);
free(m_document);
free(m_documentName);
}
void SuffixArray::Create(const string& fileName )
@ -46,22 +53,32 @@ void SuffixArray::Create(const string& fileName )
textFile.open(fileName.c_str());
if (!textFile) {
cerr << "no such file or directory " << fileName << endl;
cerr << "Error: no such file or directory " << fileName << endl;
exit(1);
}
// first pass through data: get size
istream *fileP = &textFile;
m_size = 0;
m_sentenceCount = 0;
m_documentCount = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
if (m_useDocument && ProcessDocumentLine(line,0)) continue;
vector< WORD_ID > words = m_vcb.Tokenize( line );
m_size += words.size() + 1;
m_sentenceCount++;
}
textFile.close();
cerr << m_size << " words (incl. sentence boundaries)" << endl;
if (m_useDocument) {
cerr << m_documentCount << " documents" << endl;
if (m_documentCount == 0) {
cerr << "Error: no documents found, aborting." << endl;
exit(1);
}
}
// allocate memory
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
@ -69,21 +86,31 @@ void SuffixArray::Create(const string& fileName )
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
// fill the array
int wordIndex = 0;
int sentenceId = 0;
textFile.open(fileName.c_str());
if (!textFile) {
cerr << "no such file or directory " << fileName << endl;
exit(1);
CheckAllocation(m_array != NULL, "m_array");
CheckAllocation(m_index != NULL, "m_index");
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
CheckAllocation(m_sentence != NULL, "m_sentence");
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
if (m_useDocument) {
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount );
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
}
// second pass through data: fill the arrays
int wordIndex = 0;
int sentenceId = 0;
m_documentNameLength = 0; // re-use as counter
m_documentCount = 0; // re-use as counter
textFile.open(fileName.c_str());
fileP = &textFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
if (m_useDocument && ProcessDocumentLine(line,sentenceId)) continue;
vector< WORD_ID > words = m_vcb.Tokenize( line );
vector< WORD_ID >::const_iterator i;
@ -105,7 +132,7 @@ void SuffixArray::Create(const string& fileName )
m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
if (m_buffer == NULL) {
cerr << "cannot allocate memory to m_buffer" << endl;
cerr << "Error: cannot allocate memory to m_buffer" << endl;
exit(1);
}
@ -114,6 +141,45 @@ void SuffixArray::Create(const string& fileName )
cerr << "done sorting" << endl;
}
// very specific code to deal with common crawl document ids
// Checks whether a corpus line is a document-header line of the form
//   <32-hex-char hash> <float score> <url>
// If it is, records the document boundary (document name and the id of its
// first sentence) and returns true so the caller skips the line as corpus
// text; otherwise returns false.
// NOTE(review): called once per pass over the corpus — during the first pass
// (m_document == NULL) only the counters m_documentNameLength/m_documentCount
// grow; during the second pass the arrays allocated from those counts are
// filled. The counters are reset between passes by the caller.
bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId )
{
  size_t i;
  // first 32 characters are hex-hash
  for(i=0; i<32; i++) {
    if ((line[i] < '0' || line[i] > '9') && (line[i] < 'a' || line[i] > 'f')) {
      return false;
    }
  }
  if (line[i++] != ' ') return false;
  // second token is float
  for (; line[i] != ' ' && line[i] != 0; i++) {
    if (line[i] != '.' && (line[i] < '0' || line[i] > '9')) {
      return false;
    }
  }
  i++;
  // last token is url (=name)
  size_t startName = i;
  for (; line[i] != ' ' && line[i] != 0; i++) {}
  // the url must be the final token on the line; a trailing space means
  // there is more content, so this is not a document-header line
  if (line[i] == ' ') return false;
  size_t endName = i+1; // include '\0'
  // second pass: record name and sentence number
  if (m_document != NULL) {
    // offset of this document's name within the shared name buffer
    m_documentName[m_documentCount] = m_documentNameLength;
    for(size_t i=startName; i<endName; i++) {
      m_documentNameBuffer[m_documentNameLength + i-startName] = line[i];
    }
    // first sentence id belonging to this document
    m_document[m_documentCount] = sentenceId;
  }
  m_documentNameLength += endName-startName;
  m_documentCount++;
  return true;
}
// good ol' quick sort
void SuffixArray::Sort(INDEX start, INDEX end)
{
@ -162,7 +228,6 @@ int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
// cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
@ -272,13 +337,73 @@ void SuffixArray::List(INDEX start, INDEX end)
}
}
// Prints every corpus sentence that contains the given phrase.
// Output format (to stdout):
//   QUERY\t<phrase>\t<count> matches
// followed by one line per matching sentence; when document indexing is
// enabled each sentence line is prefixed with "<document-name>\t".
void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase )
{
  cout << "QUERY\t";
  for(size_t i=0; i<phrase.size(); i++) {
    if (i>0) cout << " ";
    cout << phrase[i];
  }
  cout << '\t';
  INDEX start = 0;
  INDEX end = m_size-1;
  // find any one occurrence of the phrase in the suffix array
  INDEX mid = FindFirst( phrase, start, end );
  if (mid == m_size) { // no matches
    cout << "0 matches" << endl;
    return;
  }
  // expand from that hit to the full contiguous range of matches
  INDEX firstMatch = FindLast( phrase, mid, start, -1 );
  INDEX lastMatch = FindLast( phrase, mid, end, 1 );
  // loop through all matches
  cout << (lastMatch-firstMatch+1) << " matches" << endl;
  for(INDEX i=firstMatch; i<=lastMatch; i++) {
    // get sentence information
    INDEX pos = GetPosition( i );
    // position of the sentence's first word in the corpus array
    INDEX start = pos - GetWordInSentence( pos );
    char length = GetSentenceLength( GetSentence( pos ) );
    // print document name
    if (m_useDocument) {
      INDEX sentence = GetSentence( pos );
      INDEX document = GetDocument( sentence );
      PrintDocumentName( document );
      cout << '\t';
    }
    // print sentence
    // NOTE(review): 'char i' shadows the outer INDEX i; also limits
    // printable sentence length to the range of char
    for(char i=0; i<length; i++) {
      if (i>0) cout << " ";
      cout << GetWord( start + i );
    }
    cout << endl;
  }
}
// Maps a sentence id to the index of the document that contains it.
// m_document is a sorted array holding each document's first sentence id,
// so the containing document d satisfies
//   m_document[d] <= sentence < m_document[d+1]
// Precondition: m_documentCount > 0 and sentence is a valid sentence id;
// otherwise the search does not terminate safely.
SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const
{
  // binary search
  INDEX min = 0;
  INDEX max = m_documentCount-1;
  if (sentence >= m_document[max]) {
    // at or past the last document's first sentence -> last document
    return max;
  }
  while(true) {
    INDEX mid = (min + max) / 2;
    if (sentence >= m_document[mid] && sentence < m_document[mid+1]) {
      return mid;
    }
    if (sentence < m_document[mid]) {
      max = mid-1;
    } else {
      min = mid+1;
    }
  }
}
void SuffixArray::Save(const string& fileName ) const
{
FILE *pFile = fopen ( fileName.c_str() , "w" );
if (pFile == NULL) {
cerr << "Cannot open " << fileName << endl;
exit(1);
}
if (pFile == NULL) Error("cannot open",fileName);
fwrite( &m_size, sizeof(INDEX), 1, pFile );
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
@ -288,6 +413,16 @@ void SuffixArray::Save(const string& fileName ) const
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
char useDocument = m_useDocument; // not sure if that is needed
fwrite( &useDocument, sizeof(char), 1, pFile );
if (m_useDocument) {
fwrite( &m_documentCount, sizeof(INDEX), 1, pFile );
fwrite( m_document, sizeof(INDEX), m_documentCount, pFile );
fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile );
fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile );
fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile );
}
fclose( pFile );
m_vcb.Save( fileName + ".src-vcb" );
@ -296,56 +431,81 @@ void SuffixArray::Save(const string& fileName ) const
void SuffixArray::Load(const string& fileName )
{
FILE *pFile = fopen ( fileName.c_str() , "r" );
if (pFile == NULL) {
cerr << "no such file or directory " << fileName << endl;
exit(1);
}
if (pFile == NULL) Error("no such file or directory", fileName);
cerr << "loading from " << fileName << endl;
fread( &m_size, sizeof(INDEX), 1, pFile );
fread( &m_size, sizeof(INDEX), 1, pFile )
|| Error("could not read m_size from", fileName);
cerr << "words in corpus: " << m_size << endl;
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
CheckAllocation(m_array != NULL, "m_array");
CheckAllocation(m_index != NULL, "m_index");
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
CheckAllocation(m_sentence != NULL, "m_sentence");
fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus
|| Error("could not read m_array from", fileName);
fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array
|| Error("could not read m_index from", fileName);
fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index
|| Error("could not read m_wordInSentence from", fileName);
fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index
|| Error("could not read m_sentence from", fileName);
if (m_array == NULL) {
cerr << "Error: cannot allocate memory to m_array" << endl;
exit(1);
}
if (m_index == NULL) {
cerr << "Error: cannot allocate memory to m_index" << endl;
exit(1);
}
if (m_wordInSentence == NULL) {
cerr << "Error: cannot allocate memory to m_wordInSentence" << endl;
exit(1);
}
if (m_sentence == NULL) {
cerr << "Error: cannot allocate memory to m_sentence" << endl;
exit(1);
}
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile )
|| Error("could not read m_sentenceCount from", fileName);
cerr << "sentences in corpus: " << m_sentenceCount << endl;
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
if (m_sentenceLength == NULL) {
cerr << "Error: cannot allocate memory to m_sentenceLength" << endl;
exit(1);
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile) // sentence length
|| Error("could not read m_sentenceLength from", fileName);
if (m_useDocument) { // do not read it when you do not need it
char useDocument;
fread( &useDocument, sizeof(char), 1, pFile )
|| Error("could not read m_useDocument from", fileName);
if (!useDocument) {
cerr << "Error: stored suffix array does not have a document index\n";
exit(1);
}
fread( &m_documentCount, sizeof(INDEX), 1, pFile )
|| Error("could not read m_documentCount from", fileName);
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");
fread( m_document, sizeof(INDEX), m_documentCount, pFile )
|| Error("could not read m_document from", fileName);
fread( m_documentName, sizeof(INDEX), m_documentCount, pFile )
|| Error("could not read m_documentName from", fileName);
fread( &m_documentNameLength, sizeof(INDEX), 1, pFile )
|| Error("could not read m_documentNameLength from", fileName);
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile )
|| Error("could not read m_document from", fileName);
}
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
fclose( pFile );
m_vcb.Load( fileName + ".src-vcb" );
}
// Verifies that an allocation succeeded (check == true); if it failed,
// prints a diagnostic naming the data structure and aborts the process.
void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const
{
  if (!check) {
    cerr << "Error: could not allocate memory for " << dataStructure << endl;
    exit(1);
  }
}
// Reports a fatal error involving fileName and terminates the program.
// Declared bool so that calls can be chained into boolean expressions,
// e.g.:  fread(...) || Error("could not read m_size from", fileName);
bool SuffixArray::Error( const char *message, const string &fileName) const
{
  cerr << "Error: " << message << " " << fileName << endl;
  exit(1);
  return true; // yeah, i know. -- never reached; exit() does not return
}

View File

@ -15,6 +15,12 @@ private:
INDEX *m_sentence;
char *m_sentenceLength;
WORD_ID m_endOfSentence;
INDEX *m_document;
INDEX *m_documentName;
char *m_documentNameBuffer;
size_t m_documentNameLength;
size_t m_documentCount;
bool m_useDocument;
Vocabulary m_vcb;
INDEX m_size;
INDEX m_sentenceCount;
@ -28,6 +34,7 @@ public:
~SuffixArray();
void Create(const std::string& fileName );
bool ProcessDocumentLine( const char* const, const size_t );
void Sort(INDEX start, INDEX end);
int CompareIndex( INDEX a, INDEX b ) const;
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
@ -40,6 +47,7 @@ public:
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
int Match( const std::vector< WORD > &phrase, INDEX index );
void List( INDEX start, INDEX end );
void PrintSentenceMatches( const std::vector< WORD > &phrase );
inline INDEX GetPosition( INDEX index ) const {
return m_index[ index ];
}
@ -58,6 +66,17 @@ public:
inline WORD GetWord( INDEX position ) const {
return m_vcb.GetWord( m_array[position] );
}
void UseDocument() {
m_useDocument = true;
}
INDEX GetDocument( INDEX sentence ) const;
void PrintDocumentName( INDEX document ) {
for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) {
std::cout << m_documentNameBuffer[ i ];
}
}
void Save(const std::string& fileName ) const;
void Load(const std::string& fileName );
void CheckAllocation(bool, const char *dataStructure) const;
bool Error( const char* message, const std::string& fileName) const;
};

View File

@ -62,7 +62,7 @@ void Vocabulary::Save(const string& fileName ) const
vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);
if (!vcbFile) {
cerr << "Failed to open " << vcbFile << endl;
cerr << "Failed to open " << fileName << endl;
exit(1);
}
@ -81,7 +81,7 @@ void Vocabulary::Load(const string& fileName )
vcbFile.open(fileName.c_str());
if (!vcbFile) {
cerr << "no such file or directory: " << vcbFile << endl;
cerr << "no such file or directory: " << fileName << endl;
exit(1);
}

View File

@ -1,4 +1,5 @@
#include "SuffixArray.h"
#include "../util/tokenize.hh"
#include <getopt.h>
using namespace std;
@ -13,10 +14,12 @@ int main(int argc, char* argv[])
string query;
string fileNameSuffix;
string fileNameSource;
int loadFlag = false;
int saveFlag = false;
int createFlag = false;
int queryFlag = false;
bool loadFlag = false;
bool saveFlag = false;
bool createFlag = false;
bool queryFlag = false;
bool querySentenceFlag = false;
int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
while(1) {
@ -25,11 +28,14 @@ int main(int argc, char* argv[])
{"save", required_argument, 0, 's'},
{"create", required_argument, 0, 'c'},
{"query", required_argument, 0, 'q'},
{"query-sentence", required_argument, 0, 'Q'},
{"document", required_argument, 0, 'd'},
{"stdio", no_argument, 0, 'i'},
{"stdio-sentence", no_argument, 0, 'I'},
{0, 0, 0, 0}
};
int option_index = 0;
int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
int c = getopt_long (argc, argv, "l:s:c:q:Q:iId", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 'l':
@ -48,17 +54,25 @@ int main(int argc, char* argv[])
query = string(optarg);
queryFlag = true;
break;
case 'Q':
query = string(optarg);
querySentenceFlag = true;
break;
case 'i':
stdioFlag = true;
break;
case 'I':
stdioFlag = true;
querySentenceFlag = true;
break;
case 'd':
suffixArray.UseDocument();
break;
default:
cerr << info;
exit(1);
}
}
if (stdioFlag) {
queryFlag = true;
}
// check if parameter settings are legal
if (saveFlag && !createFlag) {
@ -74,7 +88,7 @@ int main(int argc, char* argv[])
exit(1);
}
// do your thing
// get suffix array
if (createFlag) {
cerr << "will create\n";
cerr << "corpus is in " << fileNameSource << endl;
@ -88,16 +102,26 @@ int main(int argc, char* argv[])
cerr << "will load from " << fileNameSuffix << endl;
suffixArray.Load( fileNameSuffix );
}
// do something with it
if (stdioFlag) {
while(true) {
string query;
if (getline(cin, query, '\n').eof()) {
return 0;
}
cout << lookup( query ) << endl;
if (querySentenceFlag) {
vector< string > queryString = util::tokenize( query.c_str() );
suffixArray.PrintSentenceMatches( queryString );
} else {
cout << lookup( query ) << endl;
}
}
} else if (queryFlag) {
cout << lookup( query ) << endl;
} else if (querySentenceFlag) {
vector< string > queryString = util::tokenize( query.c_str() );
suffixArray.PrintSentenceMatches( queryString );
}
return 0;
}
@ -105,32 +129,6 @@ int main(int argc, char* argv[])
size_t lookup( string query )
{
cerr << "query is " << query << endl;
vector< string > queryString = tokenize( query.c_str() );
vector< string > queryString = util::tokenize( query.c_str() );
return suffixArray.Count( queryString );
}
// Duplicate of definition in util/tokenize.hh.
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
// use util at all.
// Splits a C-string into tokens separated by runs of spaces and/or tabs.
// Leading/trailing whitespace produces no empty tokens.
std::vector<std::string> tokenize(const char input[])
{
  std::vector<std::string> tokens;
  int tokenStart = 0;
  bool inToken = false;
  int pos = 0;
  for (; input[pos] != '\0'; ++pos) {
    const bool isSpace = (input[pos] == ' ' || input[pos] == '\t');
    if (!inToken && !isSpace) {
      // a new token begins at this character
      tokenStart = pos;
      inToken = true;
    } else if (inToken && isSpace) {
      // token just ended; copy it out
      tokens.push_back(std::string(input + tokenStart, pos - tokenStart));
      inToken = false;
    }
  }
  if (inToken) {
    // flush a token that runs to the end of the string
    tokens.push_back(std::string(input + tokenStart, pos - tokenStart));
  }
  return tokens;
}

View File

@ -28,14 +28,16 @@ TEST_DIR: /home/moses-speedtest/phrase_tables/tests
TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
BASEBRANCH: RELEASE-2.1.1
MOSES_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-prof
MOSES_GOOGLE_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-gperftools
</pre>
The _MOSES\_REPO\_PATH_ is the place where you have set up and built moses.
The _DROP\_CACHES\_COMM_ is the command that would b eused to drop caches. It should run without needing root access.
The _DROP\_CACHES\_COMM_ is the command that would be used to drop caches. It should run without needing root access.
_TEST\_DIR_ is the directory where all the tests will reside.
_TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time.
_BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to be the latest Moses stable release.
_MOSES\_PROFILER\_REPO_ is a path to a moses repository set up and built with profiling enabled. Optional if you want to produce profiling results.
_MOSES\_GOOGLE\_PROFILER\_REPO_ is a path to a moses repository set up with full tcmalloc and profiler, as well as shared linking, for use with gperftools.
### Creating tests
In order to create a test one should go into the TEST_DIR and create a new folder. That folder will be used for the name of the test.
@ -45,7 +47,7 @@ An example such configuration file is **test\_config**
<pre>
Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
Variants: vanilla, cached, ldpre, profile #Can't have cached without ldpre or vanilla
Variants: vanilla, cached, ldpre, profile, google-profiler #Can't have cached without ldpre or vanilla
</pre>
The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo.) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths.
@ -61,11 +63,21 @@ The _Variants:_ line specifies what type of tests should we run. This particular
If you want to produce profiler results together in some tests you need to specify the _MOSES\_PROFILER\_REPO_ in the config
```bash
git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-profile
cd mosesdecoder
cd mosesdecoder-profile
./bjam -j10 --with-cmph=/usr/include/ variant=profile
```
Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run.
Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run (files ending in **\_profile**).
#### Produce google profiler results.
If you want to produce profiler results for some tests you need to specify the _MOSES\_GOOGLE\_PROFILER\_REPO_ in the config
```bash
git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-google-profile
cd mosesdecoder
./bjam link=shared -j10 --full-tcmalloc --with-cmph=/usr/include/
```
Afterwards for testcases which contain the **google-profiler** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **google-profiler** output from every run (files prefixed with **pprof**). To analyze the output you need to use [pprof](http://google-perftools.googlecode.com/svn/trunk/doc/cpuprofile.html).
### Running tests.
Running the tests is done through the **runtests.py** script.

View File

@ -2,6 +2,7 @@
import os
import subprocess
import time
import shutil
from argparse import ArgumentParser
from testsuite_common import processLogLine
@ -26,16 +27,21 @@ def parse_cmd():
arguments = parser.parse_args()
return arguments
def repoinit(testconfig, profiler=True):
def repoinit(testconfig, profiler=None):
"""Determines revision and sets up the repo. If given the profiler optional
argument, wil init the profiler repo instead of the default one."""
revision = ''
#Update the repo
if profiler:
if profiler == "gnu-profiler":
if testconfig.repo_prof is not None:
os.chdir(testconfig.repo_prof)
else:
raise ValueError('Profiling repo is not defined')
elif profiler == "google-profiler":
if testconfig.repo_gprof is not None:
os.chdir(testconfig.repo_gprof)
else:
raise ValueError('Profiling repo is not defined')
else:
os.chdir(testconfig.repo)
#Checkout specific branch, else maintain main branch
@ -61,9 +67,10 @@ def repoinit(testconfig, profiler=True):
class Configuration:
"""A simple class to hold all of the configuration constatns"""
def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None):
def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None, repo_gprof=None):
self.repo = repo
self.repo_prof = repo_prof
self.repo_gprof = repo_gprof
self.drop_caches = drop_caches
self.tests = tests
self.testlogs = testlogs
@ -88,16 +95,17 @@ class Configuration:
class Test:
"""A simple class to contain all information about tests"""
def __init__(self, name, command, ldopts, permutations, prof_command=None):
def __init__(self, name, command, ldopts, permutations, prof_command=None, gprof_command=None):
self.name = name
self.command = command
self.prof_command = prof_command
self.gprof_command = gprof_command
self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet
self.permutations = permutations
def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None, moses_gprof_repo=None):
"""Parses the config file"""
command, ldopts, prof_command = '', '', None
command, ldopts, prof_command, gprof_command = '', '', None, None
permutations = []
fileopen = open(conffile, 'r')
for line in fileopen:
@ -108,8 +116,10 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
if opt == 'Command:':
command = args.replace('\n', '')
if moses_prof is not None: # Get optional command for profiling
if moses_prof_repo is not None: # Get optional command for profiling
prof_command = moses_prof_repo + '/bin/' + command
if moses_gprof_repo is not None: # Get optional command for google-perftools
gprof_command = moses_gprof_repo + '/bin/' + command
command = moses_repo + '/bin/' + command
elif opt == 'LDPRE:':
ldopts = args.replace('\n', '')
@ -118,14 +128,14 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None):
else:
raise ValueError('Unrecognized option ' + opt)
#We use the testdir as the name.
testcase = Test(testdir, command, ldopts, permutations, prof_command)
testcase = Test(testdir, command, ldopts, permutations, prof_command, gprof_command)
fileopen.close()
return testcase
def parse_testconfig(conffile):
"""Parses the config file for the whole testsuite."""
repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', ''
basebranch, baserev, repo_prof_path = '', '', None
basebranch, baserev, repo_prof_path, repo_gprof_path = '', '', None, None
fileopen = open(conffile, 'r')
for line in fileopen:
line = line.split('#')[0] # Discard comments
@ -146,10 +156,12 @@ def parse_testconfig(conffile):
baserev = args.replace('\n', '')
elif opt == 'MOSES_PROFILER_REPO:': # Optional
repo_prof_path = args.replace('\n', '')
elif opt == 'MOSES_GOOGLE_PROFILER_REPO:': # Optional
repo_gprof_path = args.replace('\n', '')
else:
raise ValueError('Unrecognized option ' + opt)
config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\
basebranch, baserev, repo_prof_path)
basebranch, baserev, repo_prof_path, repo_gprof_path)
fileopen.close()
return config
@ -160,7 +172,9 @@ def get_config():
config.additional_args(args.singletestdir, args.revision, args.branch)
revision = repoinit(config)
if config.repo_prof is not None:
repoinit(config, True)
repoinit(config, "gnu-profiler")
if config.repo_gprof is not None:
repoinit(config, "google-profiler")
config.set_revision(revision)
return config
@ -212,16 +226,27 @@ def write_gprof(command, name, variant, config):
executable_path = command.split(' ')[0] # Path to the moses binary
gprof_command = 'gprof ' + executable_path + ' ' + gmon_path + ' > ' + outputfile
subprocess.call([gprof_command], shell=True)
os.remove('gmon_path') # After we are done discard the gmon file
os.remove(gmon_path) # After we are done discard the gmon file
def execute_test(command, path, name, variant, config, profile=False):
def write_pprof(name, variant, config):
    """Copies the google-perftools profiler output to the corresponding test directory.

    The profiled moses run is expected to have written its CPU profile to
    /tmp/moses.prof (set via the CPUPROFILE environment variable); the file
    is moved into <config.testlogs>/<name>/ under a timestamped pprof_*
    filename so repeated runs of the same test/variant do not collide.
    """
    output_dir = config.testlogs + '/' + name
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    outputfile = output_dir + '/pprof_' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant
    shutil.move("/tmp/moses.prof", outputfile)
def execute_test(command, path, name, variant, config, profile=None):
"""Executes a testcase given a whole command, path to the test file output,
name of the test and variant tested. Config is the global configuration"""
subprocess.Popen([command], stdout=None, stderr=subprocess.PIPE, shell=True).communicate()
if not profile:
if profile is None:
write_log(path, name + '_' + variant, config)
else: # Basically produce a gmon output
elif profile == "gnu-profiler": # Basically produce a gmon output
write_gprof(command, name, variant, config)
elif profile == "google-profiler":
write_pprof(name, variant, config)
def execute_tests(testcase, cur_directory, config):
@ -255,7 +280,7 @@ def execute_tests(testcase, cur_directory, config):
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses:
whole_command = 'LD_PRELOAD ' + opt + time_command + testcase.command
whole_command = 'LD_PRELOAD=' + opt + time_command + testcase.command
variant = 'ldpre_' + opt
#test normal and cached
@ -271,9 +296,9 @@ def execute_tests(testcase, cur_directory, config):
if 'vanilla' in testcase.permutations:
whole_command = testcase.prof_command
execute_test(whole_command, time_path, testcase.name, 'profile', config, True)
execute_test(whole_command, time_path, testcase.name, 'profile', config, "gnu-profiler")
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, True)
execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, "gnu-profiler")
if 'ldpre' in testcase.permutations:
for opt in testcase.ldopts:
@ -282,13 +307,42 @@ def execute_tests(testcase, cur_directory, config):
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses:
whole_command = 'LD_PRELOAD ' + opt + testcase.prof_command
whole_command = 'LD_PRELOAD=' + opt + " " + testcase.prof_command
variant = 'profile_ldpre_' + opt
#test normal and cached
execute_test(whole_command, time_path, testcase.name, variant, config, True)
execute_test(whole_command, time_path, testcase.name, variant, config, "gnu-profiler")
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, True)
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, "gnu-profiler")
#Google-perftools profiler
if 'google-profiler' in testcase.permutations:
subprocess.call(['sync'], shell=True) # Drop caches first
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses
whole_command = "CPUPROFILE=/tmp/moses.prof " + testcase.gprof_command
#test normal and cached
execute_test(whole_command, time_path, testcase.name, 'vanilla', config, 'google-profiler')
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, 'vanilla_cached', config, 'google-profiler')
#Now perform LD_PRELOAD tests
if 'ldpre' in testcase.permutations:
for opt in testcase.ldopts:
#Clear caches
subprocess.call(['sync'], shell=True)
subprocess.call([config.drop_caches], shell=True)
#Create the command for executing moses:
whole_command = 'LD_PRELOAD=' + opt + " " + whole_command
variant = 'ldpre_' + opt
#test normal and cached
execute_test(whole_command, time_path, testcase.name, variant, config, 'google-profiler')
if 'cached' in testcase.permutations:
execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, 'google-profiler')
# Go through all the test directories and executes tests
@ -319,7 +373,7 @@ if __name__ == '__main__':
for logfile in os.listdir(CONFIG.testlogs):
logfile_name = CONFIG.testlogs + '/' + logfile
if not check_for_basever(logfile_name, CONFIG.basebranch):
if os.path.isfile(logfile_name) and not check_for_basever(logfile_name, CONFIG.basebranch):
logfile = logfile.replace('_vanilla', '')
logfile = logfile.replace('_cached', '')
logfile = logfile.replace('_ldpre', '')
@ -330,7 +384,7 @@ if __name__ == '__main__':
#Create a new configuration for base version tests:
BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\
CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\
CONFIG.baserev, CONFIG.repo_prof)
CONFIG.baserev, CONFIG.repo_prof, CONFIG.repo_gprof)
BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch)
#Set up the repository and get its revision:
REVISION = repoinit(BASECONFIG)
@ -340,20 +394,28 @@ if __name__ == '__main__':
subprocess.call(['./previous.sh'], shell=True)
#If profiler configuration exists also init it
if BASECONFIG.repo_prof is not None:
repoinit(BASECONFIG, True)
repoinit(BASECONFIG, "gnu-profiler")
os.chdir(BASECONFIG.repo_prof)
subprocess.call(['./previous.sh'], shell=True)
if BASECONFIG.repo_gprof is not None:
repoinit(BASECONFIG, "google-profiler")
os.chdir(BASECONFIG.repo_gprof)
subprocess.call(['./previous.sh'], shell=True)
#Perform tests
for directory in FIRSTTIME:
cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\
'/config', directory, BASECONFIG.repo)
'/config', directory, BASECONFIG.repo, BASECONFIG.repo_prof, BASECONFIG.repo_gprof)
execute_tests(cur_testcase, directory, BASECONFIG)
#Reset back the repository to the normal configuration
repoinit(CONFIG)
if BASECONFIG.repo_prof is not None:
repoinit(CONFIG, True)
repoinit(CONFIG, "gnu-profiler")
if BASECONFIG.repo_gprof is not None:
repoinit(CONFIG, "google-profiler")
#Builds moses
os.chdir(CONFIG.repo)
@ -362,12 +424,16 @@ if __name__ == '__main__':
os.chdir(CONFIG.repo_prof)
subprocess.call(['./previous.sh'], shell=True)
if CONFIG.repo_gprof is not None:
os.chdir(CONFIG.repo_gprof)
subprocess.call(['./previous.sh'], shell=True)
if CONFIG.singletest:
TESTCASE = parse_configfile(CONFIG.tests + '/' +\
CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo)
CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
execute_tests(TESTCASE, CONFIG.singletest, CONFIG)
else:
for directory in ALL_DIR:
cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\
'/config', directory, CONFIG.repo)
'/config', directory, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
execute_tests(cur_testcase, directory, CONFIG)

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -72,13 +72,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="OnDiskPt" InternalType="Library">
<Plugins>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src"/>
@ -27,6 +44,8 @@
<File Name="../../../OnDiskPt/Word.cpp"/>
<File Name="../../../OnDiskPt/Word.h"/>
</VirtualDirectory>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
<Settings Type="Static Library">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -40,9 +59,9 @@
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Static Library" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g" C_Options="-g" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/phrase-extract"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../phrase-extract"/>
<IncludePath Value="../../../boost/include"/>
<Preprocessor Value="MAX_NUM_FACTORS=4"/>
</Compiler>
<Linker Options="" Required="yes"/>
@ -72,7 +91,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -110,7 +129,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -118,6 +137,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -6,10 +6,11 @@
<Project Name="lm" Path="lm/lm.project" Active="No"/>
<Project Name="OnDiskPt" Path="OnDiskPt/OnDiskPt.project" Active="No"/>
<Project Name="search" Path="search/search.project" Active="No"/>
<Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="Yes"/>
<Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="No"/>
<Project Name="score" Path="score/score.project" Active="No"/>
<Project Name="consolidate" Path="consolidate/consolidate.project" Active="No"/>
<Project Name="moses" Path="moses/moses.project" Active="No"/>
<Project Name="pruneGeneration" Path="pruneGeneration/pruneGeneration.project" Active="Yes"/>
<BuildMatrix>
<WorkspaceConfiguration Name="Debug" Selected="yes">
<Project Name="manual-label" ConfigName="Debug"/>
@ -23,6 +24,7 @@
<Project Name="score" ConfigName="Debug"/>
<Project Name="consolidate" ConfigName="Debug"/>
<Project Name="moses" ConfigName="Debug"/>
<Project Name="pruneGeneration" ConfigName="Debug"/>
</WorkspaceConfiguration>
<WorkspaceConfiguration Name="Release" Selected="yes">
<Project Name="manual-label" ConfigName="Release"/>
@ -36,6 +38,7 @@
<Project Name="score" ConfigName="Release"/>
<Project Name="consolidate" ConfigName="Release"/>
<Project Name="moses" ConfigName="Release"/>
<Project Name="pruneGeneration" ConfigName="Release"/>
</WorkspaceConfiguration>
</BuildMatrix>
</CodeLite_Workspace>

View File

@ -102,9 +102,14 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.h</locationURI>
</link>
<link>
<name>SyntaxTree.cpp</name>
<name>SyntaxNodeCollection.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxTree.cpp</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.cpp</locationURI>
</link>
<link>
<name>SyntaxNodeCollection.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.h</locationURI>
</link>
<link>
<name>SyntaxTree.h</name>

View File

@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="extract-mixed-syntax" InternalType="Console">
<Plugins>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src"/>
@ -43,6 +60,10 @@
<File Name="../../../phrase-extract/OutputFileStream.cpp"/>
<File Name="../../../phrase-extract/OutputFileStream.h"/>
</VirtualDirectory>
<Dependencies Name="Debug">
<Project Name="util"/>
</Dependencies>
<Dependencies Name="Release"/>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -56,13 +77,14 @@
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Executable" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g;-O0;-Wall" C_Options="-g;-O0;-Wall" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/phrase-extract"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<IncludePath Value="../../../"/>
<IncludePath Value="../../../phrase-extract"/>
<IncludePath Value="../../../boost/include"/>
</Compiler>
<Linker Options="" Required="yes">
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/util/Debug"/>
<LibraryPath Value="../../../boost/lib64"/>
<LibraryPath Value="../../../contrib/other-builds/util/Debug"/>
<LibraryPath Value="Debug"/>
<Library Value="util"/>
<Library Value="boost_iostreams"/>
<Library Value="boost_program_options"/>
@ -94,7 +116,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -133,7 +155,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -141,8 +163,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug">
<Project Name="util"/>
</Dependencies>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -26,6 +26,7 @@
<option id="gnu.cpp.compiler.option.include.paths.231971122" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/include&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.61884195" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>

View File

@ -81,9 +81,14 @@
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.h</locationURI>
</link>
<link>
<name>SyntaxTree.cpp</name>
<name>SyntaxNodeCollection.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxTree.cpp</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.cpp</locationURI>
</link>
<link>
<name>SyntaxNodeCollection.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.h</locationURI>
</link>
<link>
<name>SyntaxTree.h</name>

View File

@ -5,16 +5,16 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.2119725657" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.2119725657" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.2119725657" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.2119725657." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1708444053" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.645190133" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
@ -25,6 +25,7 @@
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.535775760" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.874182289" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1355287045" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
@ -61,16 +62,16 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1230189043" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1230189043" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1230189043" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.exe.release.1230189043." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.280378247" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1881910636" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>

View File

@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="extract" InternalType="Console">
<Plugins>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src">
@ -13,6 +30,8 @@
<File Name="../../../phrase-extract/tables-core.cpp"/>
<File Name="../../../phrase-extract/tables-core.h"/>
</VirtualDirectory>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -26,11 +45,11 @@
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Executable" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g;-O0;-Wall" C_Options="-g;-O0;-Wall" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<IncludePath Value="../../../"/>
<IncludePath Value="../../../boost/include"/>
</Compiler>
<Linker Options="" Required="yes">
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
<LibraryPath Value="../../../boost/lib64"/>
<Library Value="boost_iostreams"/>
<Library Value="z"/>
</Linker>
@ -60,7 +79,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -99,7 +118,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -107,6 +126,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -83,6 +83,16 @@
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
<linkedResources>
<link>
<name>InternalTree.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/InternalTree.cpp</locationURI>
</link>
<link>
<name>InternalTree.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/InternalTree.h</locationURI>
</link>
<link>
<name>bin</name>
<type>2</type>

View File

@ -546,26 +546,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/interpolate.hh</locationURI>
</link>
<link>
<name>builder/joint_order.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/joint_order.hh</locationURI>
</link>
<link>
<name>builder/lmplz_main.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/lmplz_main.cc</locationURI>
</link>
<link>
<name>builder/ngram.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/ngram.hh</locationURI>
</link>
<link>
<name>builder/ngram_stream.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/ngram_stream.hh</locationURI>
</link>
<link>
<name>builder/pipeline.cc</name>
<type>1</type>
@ -576,21 +561,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/pipeline.hh</locationURI>
</link>
<link>
<name>builder/print.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/print.cc</locationURI>
</link>
<link>
<name>builder/print.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/print.hh</locationURI>
</link>
<link>
<name>builder/sort.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/builder/sort.hh</locationURI>
</link>
<link>
<name>filter/Jamfile</name>
<type>1</type>

View File

@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="lm" InternalType="Library">
<Plugins>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src"/>
@ -27,6 +44,8 @@
<File Name="../../../lm/virtual_interface.cc"/>
<File Name="../../../lm/vocab.cc"/>
</VirtualDirectory>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
<Settings Type="Static Library">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -40,9 +59,9 @@
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Static Library" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g" C_Options="-g" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/phrase-extract"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../phrase-extract"/>
<IncludePath Value="../../../boost/include"/>
<Preprocessor Value="KENLM_MAX_ORDER=7"/>
</Compiler>
<Linker Options="" Required="yes"/>
@ -72,7 +91,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -110,7 +129,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -118,6 +137,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -11,15 +11,15 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactExtension="a" artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.staticLib" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.staticLib" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.lib.debug.1721952013" name="Debug" parent="cdt.managedbuild.config.gnu.lib.debug">
<configuration artifactExtension="a" artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.staticLib" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.staticLib,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.lib.debug.1721952013" name="Debug" parent="cdt.managedbuild.config.gnu.lib.debug">
<folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/>
@ -32,6 +32,9 @@
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.2072043013" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="MAX_NUM_FACTORS=4"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.lib.debug.1365367786" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.lib.debug">
@ -46,9 +49,6 @@
</tool>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.195400614" name="MeteorScorer.cpp" rcbsApplicability="disable" resourcePath="MeteorScorer.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.307282660">
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.307282660" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
</fileInfo>
<sourceEntries>
<entry excluding="mert/PreProcessFilter.h|mert/PreProcessFilter.cpp|mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
@ -66,15 +66,15 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactExtension="a" artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.staticLib" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.staticLib" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.lib.release.3250316" name="Release" parent="cdt.managedbuild.config.gnu.lib.release">
<configuration artifactExtension="a" artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.staticLib" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.staticLib,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.lib.release.3250316" name="Release" parent="cdt.managedbuild.config.gnu.lib.release">
<folderInfo id="cdt.managedbuild.config.gnu.lib.release.3250316." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.lib.release.1996805666" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.release">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.release.106685808" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.release"/>

View File

@ -46,20 +46,20 @@
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Executable" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g;-O0;-Wall" C_Options="-g;-O0;-Wall" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/phrase-extract"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../phrase-extract"/>
<IncludePath Value="../../../boost/include"/>
<Preprocessor Value="MAX_NUM_FACTORS=4"/>
<Preprocessor Value="KENLM_MAX_ORDER=7"/>
<Preprocessor Value="TRACE_ENABLE=1"/>
</Compiler>
<Linker Options="" Required="yes">
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/lm/Debug"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/moses/Debug"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/OnDiskPt/Debug"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/search/Debug"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/util/Debug"/>
<LibraryPath Value="../../../boost/lib64"/>
<LibraryPath Value="../../../contrib/other-builds/lm/Debug"/>
<LibraryPath Value="../../../contrib/other-builds/moses/Debug"/>
<LibraryPath Value="../../../contrib/other-builds/OnDiskPt/Debug"/>
<LibraryPath Value="../../../contrib/other-builds/search/Debug"/>
<LibraryPath Value="../../../contrib/other-builds/util/Debug"/>
<Library Value="util"/>
<Library Value="moses"/>
<Library Value="search"/>

View File

@ -11,11 +11,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -79,12 +79,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1911984684" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -60,6 +60,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/AlignmentInfoTest.cpp</locationURI>
</link>
<link>
<name>AllOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/AllOptions.cpp</locationURI>
</link>
<link>
<name>AllOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/AllOptions.h</locationURI>
</link>
<link>
<name>BaseManager.cpp</name>
<type>1</type>
@ -70,6 +80,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/BaseManager.h</locationURI>
</link>
<link>
<name>BeamSearchOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/BeamSearchOptions.h</locationURI>
</link>
<link>
<name>BitmapContainer.cpp</name>
<type>1</type>
@ -80,6 +95,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/BitmapContainer.h</locationURI>
</link>
<link>
<name>BookkeepingOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/BookkeepingOptions.cpp</locationURI>
</link>
<link>
<name>BookkeepingOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/BookkeepingOptions.h</locationURI>
</link>
<link>
<name>CMakeLists.txt</name>
<type>1</type>
@ -230,6 +255,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.h</locationURI>
</link>
<link>
<name>CubePruningOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/CubePruningOptions.cpp</locationURI>
</link>
<link>
<name>CubePruningOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/CubePruningOptions.h</locationURI>
</link>
<link>
<name>DecodeGraph.cpp</name>
<type>1</type>
@ -460,6 +495,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/InputFileStream.h</locationURI>
</link>
<link>
<name>InputOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/InputOptions.cpp</locationURI>
</link>
<link>
<name>InputOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/InputOptions.h</locationURI>
</link>
<link>
<name>InputPath.cpp</name>
<type>1</type>
@ -490,6 +535,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>LMBR_Options.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/LMBR_Options.cpp</locationURI>
</link>
<link>
<name>LMBR_Options.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/LMBR_Options.h</locationURI>
</link>
<link>
<name>LVoc.cpp</name>
<type>1</type>
@ -510,6 +565,21 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/LatticeMBR.h</locationURI>
</link>
<link>
<name>LookupOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/LookupOptions.h</locationURI>
</link>
<link>
<name>MBR_Options.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/MBR_Options.cpp</locationURI>
</link>
<link>
<name>MBR_Options.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/MBR_Options.h</locationURI>
</link>
<link>
<name>Manager.cpp</name>
<type>1</type>
@ -535,6 +605,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/MosesTest.cpp</locationURI>
</link>
<link>
<name>NBestOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/NBestOptions.cpp</locationURI>
</link>
<link>
<name>NBestOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/NBestOptions.h</locationURI>
</link>
<link>
<name>NonTerminal.cpp</name>
<type>1</type>
@ -550,6 +630,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ObjectPool.h</locationURI>
</link>
<link>
<name>OptionsBaseClass.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/OptionsBaseClass.cpp</locationURI>
</link>
<link>
<name>OptionsBaseClass.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/OptionsBaseClass.h</locationURI>
</link>
<link>
<name>OutputCollector.h</name>
<type>1</type>
@ -635,6 +725,26 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ReorderingConstraint.h</locationURI>
</link>
<link>
<name>ReorderingOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ReorderingOptions.cpp</locationURI>
</link>
<link>
<name>ReorderingOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ReorderingOptions.h</locationURI>
</link>
<link>
<name>ReportingOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ReportingOptions.cpp</locationURI>
</link>
<link>
<name>ReportingOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ReportingOptions.h</locationURI>
</link>
<link>
<name>RuleCube.cpp</name>
<type>1</type>
@ -711,14 +821,14 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/SearchNormal.h</locationURI>
</link>
<link>
<name>SearchNormalBatch.cpp</name>
<name>SearchOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/SearchNormalBatch.cpp</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/SearchOptions.cpp</locationURI>
</link>
<link>
<name>SearchNormalBatch.h</name>
<name>SearchOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/SearchNormalBatch.h</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/SearchOptions.h</locationURI>
</link>
<link>
<name>Sentence.cpp</name>
@ -740,6 +850,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/SentenceStats.h</locationURI>
</link>
<link>
<name>ServerOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ServerOptions.cpp</locationURI>
</link>
<link>
<name>ServerOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ServerOptions.h</locationURI>
</link>
<link>
<name>SquareMatrix.cpp</name>
<type>1</type>
@ -1065,6 +1185,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/mbr.h</locationURI>
</link>
<link>
<name>parameters</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>rule.proto</name>
<type>1</type>
@ -1360,16 +1485,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SetSourcePhrase.h</locationURI>
</link>
<link>
<name>FF/SkeletonChangeInput.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SkeletonChangeInput.cpp</locationURI>
</link>
<link>
<name>FF/SkeletonChangeInput.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SkeletonChangeInput.h</locationURI>
</link>
<link>
<name>FF/SkeletonStatefulFF.cpp</name>
<type>1</type>
@ -2240,6 +2355,146 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.h</locationURI>
</link>
<link>
<name>parameters/AllOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/AllOptions.cpp</locationURI>
</link>
<link>
<name>parameters/AllOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/AllOptions.h</locationURI>
</link>
<link>
<name>parameters/BeamSearchOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/BeamSearchOptions.h</locationURI>
</link>
<link>
<name>parameters/BookkeepingOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/BookkeepingOptions.cpp</locationURI>
</link>
<link>
<name>parameters/BookkeepingOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/BookkeepingOptions.h</locationURI>
</link>
<link>
<name>parameters/ContextParameters.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.cpp</locationURI>
</link>
<link>
<name>parameters/ContextParameters.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.h</locationURI>
</link>
<link>
<name>parameters/CubePruningOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/CubePruningOptions.cpp</locationURI>
</link>
<link>
<name>parameters/CubePruningOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/CubePruningOptions.h</locationURI>
</link>
<link>
<name>parameters/InputOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/InputOptions.cpp</locationURI>
</link>
<link>
<name>parameters/InputOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/InputOptions.h</locationURI>
</link>
<link>
<name>parameters/LMBR_Options.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/LMBR_Options.cpp</locationURI>
</link>
<link>
<name>parameters/LMBR_Options.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/LMBR_Options.h</locationURI>
</link>
<link>
<name>parameters/LookupOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/LookupOptions.h</locationURI>
</link>
<link>
<name>parameters/MBR_Options.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/MBR_Options.cpp</locationURI>
</link>
<link>
<name>parameters/MBR_Options.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/MBR_Options.h</locationURI>
</link>
<link>
<name>parameters/NBestOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/NBestOptions.cpp</locationURI>
</link>
<link>
<name>parameters/NBestOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/NBestOptions.h</locationURI>
</link>
<link>
<name>parameters/OptionsBaseClass.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/OptionsBaseClass.cpp</locationURI>
</link>
<link>
<name>parameters/OptionsBaseClass.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/OptionsBaseClass.h</locationURI>
</link>
<link>
<name>parameters/ReorderingOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ReorderingOptions.cpp</locationURI>
</link>
<link>
<name>parameters/ReorderingOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ReorderingOptions.h</locationURI>
</link>
<link>
<name>parameters/ReportingOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ReportingOptions.cpp</locationURI>
</link>
<link>
<name>parameters/ReportingOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ReportingOptions.h</locationURI>
</link>
<link>
<name>parameters/SearchOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/SearchOptions.cpp</locationURI>
</link>
<link>
<name>parameters/SearchOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/SearchOptions.h</locationURI>
</link>
<link>
<name>parameters/ServerOptions.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ServerOptions.cpp</locationURI>
</link>
<link>
<name>parameters/ServerOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ServerOptions.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LexicalReordering.cpp</name>
<type>1</type>

View File

@ -775,6 +775,8 @@
<File Name="../../../moses/WordsRange.h"/>
<File Name="../../../moses/XmlOption.cpp"/>
<File Name="../../../moses/XmlOption.h"/>
<File Name="../../../moses/OutputFileStream.cpp"/>
<File Name="../../../moses/OutputFileStream.h"/>
</VirtualDirectory>
<VirtualDirectory Name="PP">
<File Name="../../../moses/PP/CountsPhraseProperty.cpp"/>
@ -793,8 +795,6 @@
<File Name="../../../moses/PP/SpanLengthPhraseProperty.h"/>
<File Name="../../../moses/PP/TreeStructurePhraseProperty.h"/>
</VirtualDirectory>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
<VirtualDirectory Name="parameters">
<File Name="../../../moses/parameters/ContextParameters.cpp"/>
<File Name="../../../moses/parameters/ContextParameters.h"/>
@ -814,7 +814,7 @@
<ResourceCompiler Options=""/>
</GlobalSettings>
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Static Library" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g" C_Options="-g" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<Compiler Options="-g -std=c++0x" C_Options="-g" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="../../../"/>
<IncludePath Value="../../../phrase-extract"/>
@ -897,4 +897,6 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -0,0 +1,125 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="pruneGeneration" InternalType="Console">
<Plugins>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src">
<File Name="../../../misc/pruneGeneration.cpp"/>
<File Name="../../../misc/pruneGeneration.h"/>
</VirtualDirectory>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
<IncludePath Value="."/>
</Compiler>
<Linker Options="">
<LibraryPath Value="."/>
</Linker>
<ResourceCompiler Options=""/>
</GlobalSettings>
<Configuration Name="Debug" CompilerType="GCC ( XCode )" DebuggerType="LLDB Debugger" Type="Executable" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g;-O0;-Wall" C_Options="-g;-O0;-Wall" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../boost/include"/>
</Compiler>
<Linker Options="" Required="yes">
<LibraryPath Value="../../../boost/lib64"/>
<LibraryPath Value="../../../contrib/other-builds/moses/Debug"/>
<Library Value="boost_filesystem"/>
<Library Value="boost_system"/>
<Library Value="boost_iostreams"/>
<Library Value="moses"/>
<Library Value="z"/>
<Library Value="bz2"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
<![CDATA[]]>
</Environment>
<Debugger IsRemote="no" RemoteHostName="" RemoteHostPort="" DebuggerPath="" IsExtended="yes">
<DebuggerSearchPaths/>
<PostConnectCommands/>
<StartupCommands/>
</Debugger>
<PreBuild/>
<PostBuild/>
<CustomBuild Enabled="no">
<RebuildCommand/>
<CleanCommand/>
<BuildCommand/>
<PreprocessFileCommand/>
<SingleFileCommand/>
<MakefileGenerationCommand/>
<ThirdPartyToolName>None</ThirdPartyToolName>
<WorkingDirectory/>
</CustomBuild>
<AdditionalRules>
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
<SearchPaths/>
</Completion>
</Configuration>
<Configuration Name="Release" CompilerType="GCC ( XCode )" DebuggerType="LLDB Debugger" Type="Executable" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-O2;-Wall" C_Options="-O2;-Wall" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<Preprocessor Value="NDEBUG"/>
</Compiler>
<Linker Options="" Required="yes"/>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Release" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
<![CDATA[]]>
</Environment>
<Debugger IsRemote="no" RemoteHostName="" RemoteHostPort="" DebuggerPath="" IsExtended="no">
<DebuggerSearchPaths/>
<PostConnectCommands/>
<StartupCommands/>
</Debugger>
<PreBuild/>
<PostBuild/>
<CustomBuild Enabled="no">
<RebuildCommand/>
<CleanCommand/>
<BuildCommand/>
<PreprocessFileCommand/>
<SingleFileCommand/>
<MakefileGenerationCommand/>
<ThirdPartyToolName>None</ThirdPartyToolName>
<WorkingDirectory/>
</CustomBuild>
<AdditionalRules>
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
<SearchPaths/>
</Completion>
</Configuration>
</Settings>
</CodeLite_Project>

View File

@ -59,7 +59,6 @@
<listOptionValue builtIn="false" value="boost_program_options"/>
<listOptionValue builtIn="false" value="pthread"/>
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="dl"/>
<listOptionValue builtIn="false" value="rt"/>
</option>

View File

@ -19,6 +19,10 @@
<File Name="../../../phrase-extract/tables-core.cpp"/>
<File Name="../../../phrase-extract/tables-core.h"/>
</VirtualDirectory>
<Dependencies Name="Debug">
<Project Name="util"/>
</Dependencies>
<Dependencies Name="Release"/>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -32,17 +36,17 @@
<Configuration Name="Debug" CompilerType="clang( based on LLVM 3.5svn )" DebuggerType="LLDB Debugger" Type="Executable" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g;-O0;-Wall" C_Options="-g;-O0;-Wall" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/phrase-extract"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../phrase-extract"/>
<IncludePath Value="../../../boost/include"/>
</Compiler>
<Linker Options="" Required="yes">
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/lm/Debug"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/moses/Debug"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/OnDiskPt/Debug"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/search/Debug"/>
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/contrib/other-builds/util/Debug"/>
<LibraryPath Value="../../../boost/lib64"/>
<LibraryPath Value="../../../contrib/other-builds/lm/Debug"/>
<LibraryPath Value="../../../contrib/other-builds/moses/Debug"/>
<LibraryPath Value="../../../contrib/other-builds/OnDiskPt/Debug"/>
<LibraryPath Value="../../../contrib/other-builds/search/Debug"/>
<LibraryPath Value="../../../contrib/other-builds/util/Debug"/>
<Library Value="moses"/>
<Library Value="search"/>
<Library Value="OnDiskPt"/>
@ -86,7 +90,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -125,7 +129,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -133,8 +137,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug">
<Project Name="util"/>
</Dependencies>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -10,6 +10,8 @@
<File Name="../../../search/rule.cc"/>
<File Name="../../../search/vertex.cc"/>
</VirtualDirectory>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
<Settings Type="Static Library">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -23,9 +25,9 @@
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Static Library" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g" C_Options="-g" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/phrase-extract"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../phrase-extract"/>
<IncludePath Value="../../../boost/include"/>
<Preprocessor Value="KENLM_MAX_ORDER=7"/>
</Compiler>
<Linker Options="" Required="yes"/>
@ -55,7 +57,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -93,7 +95,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -101,6 +103,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -159,10 +159,10 @@
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/server"/>
</configuration>
<configuration configurationName="Debug">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/server"/>
</configuration>
</storageModule>

View File

@ -62,6 +62,8 @@
<File Name="../../../util/stream/sort_test.cc" ExcludeProjConfig="Debug"/>
<File Name="../../../util/stream/stream_test.cc" ExcludeProjConfig="Debug"/>
</VirtualDirectory>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
<Settings Type="Static Library">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -75,8 +77,8 @@
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Static Library" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<Compiler Options="-g" C_Options="-g" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder"/>
<IncludePath Value="/Users/hieu/workspace/github/mosesdecoder/boost/include"/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../boost/include"/>
</Compiler>
<Linker Options="" Required="yes"/>
<ResourceCompiler Options="" Required="no"/>
@ -105,7 +107,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -143,7 +145,7 @@
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<Completion EnableCpp11="no" EnableCpp14="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
@ -151,6 +153,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -13,7 +13,7 @@ with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ;
if $(with-xmlrpc-c) {
echo While building mosesserver ... ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "!!! You are linking the XMLRPC-C library; Do NOT use v.1.25.29 !!!" ;
echo "!!! You are linking the XMLRPC-C library; Must be v.1.32 (September 2012) or higher !!!" ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
build-moses-server = true ;

View File

@ -38,13 +38,12 @@ int main(int argc, char** argv)
#include "moses/StaticData.h"
#include "moses/ThreadPool.h"
#include "moses/TranslationTask.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#include "moses/FF/StatefulFeatureFunction.h"
#if PT_UG
#include "moses/TranslationModel/UG/mmsapt.h"
#endif
#include "moses/TreeInput.h"
#include "moses/LM/ORLM.h"
#include "moses/IOWrapper.h"
#include <boost/foreach.hpp>
@ -58,8 +57,8 @@ int main(int argc, char** argv)
#include <xmlrpc-c/server_abyss.hpp>
// using namespace Moses;
using Moses::TreeInput;
using namespace std;
using namespace Moses;
typedef std::map<std::string, xmlrpc_c::value> params_t;
@ -82,70 +81,16 @@ public:
Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
pdsa->add(source_,target_,alignment_);
#else
const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0];
PhraseDictionaryDynSuffixArray*
pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
cerr << "Inserting into address " << pdsa << endl;
pdsa->insertSnt(source_, target_, alignment_);
std::string msg;
msg = "Server was compiled without a phrase table implementation that ";
msg += "supports updates.";
throw xmlrpc_c::fault(msg.c_str(), xmlrpc_c::fault::CODE_PARSE);
#endif
if(add2ORLM_) {
//updateORLM();
}
XVERBOSE(1,"Done inserting\n");
//PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
map<string, xmlrpc_c::value> retData;
//*retvalP = xmlrpc_c::value_struct(retData);
#ifndef PT_UG
pdf = 0;
#endif
pdsa = 0;
*retvalP = xmlrpc_c::value_string("Phrase table updated");
}
string source_, target_, alignment_;
bool bounded_, add2ORLM_;
/*
void updateORLM() {
// TODO(level101): this belongs in the language model, not in moseserver.cpp
vector<string> vl;
map<vector<string>, int> ngSet;
LMList lms = StaticData::Instance().GetLMList(); // get LM
LMList::const_iterator lmIter = lms.begin();
LanguageModel *lm = *lmIter;
LanguageModelORLM* orlm = static_cast<LanguageModelORLM*>(lm);
if(orlm == 0) {
cerr << "WARNING: Unable to add target sentence to ORLM\n";
return;
}
// break out new ngrams from sentence
const int ngOrder(orlm->GetNGramOrder());
const std::string sBOS = orlm->GetSentenceStart()->GetString().as_string();
const std::string sEOS = orlm->GetSentenceEnd()->GetString().as_string();
Utils::splitToStr(target_, vl, " ");
// insert BOS and EOS
vl.insert(vl.begin(), sBOS);
vl.insert(vl.end(), sEOS);
for(int j=0; j < vl.size(); ++j) {
int i = (j<ngOrder) ? 0 : j-ngOrder+1;
for(int t=j; t >= i; --t) {
vector<string> ngVec;
for(int s=t; s<=j; ++s) {
ngVec.push_back(vl[s]);
//cerr << vl[s] << " ";
}
ngSet[ngVec]++;
//cerr << endl;
}
}
// insert into LM in order from 1grams up (for LM well-formedness)
cerr << "Inserting " << ngSet.size() << " ngrams into ORLM...\n";
for(int i=1; i <= ngOrder; ++i) {
iterate(ngSet, it) {
if(it->first.size() == i)
orlm->UpdateORLM(it->first, it->second);
}
}
}
*/
bool bounded_;
void breakOutParams(const params_t& params) {
params_t::const_iterator si = params.find("source");
@ -165,8 +110,6 @@ public:
XVERBOSE(1,"alignment = " << alignment_ << endl);
si = params.find("bounded");
bounded_ = (si != params.end());
si = params.find("updateORLM");
add2ORLM_ = (si != params.end());
}
};
@ -678,6 +621,14 @@ int main(int argc, char** argv)
bool isSerial = false;
size_t numThreads = 10; //for translation tasks
//Abyss server configuration: initial values reflect hard-coded default
//-> http://xmlrpc-c.sourceforge.net/doc/libxmlrpc_server_abyss.html#max_conn
size_t maxConn = 15;
size_t maxConnBacklog = 15;
size_t keepaliveTimeout = 15;
size_t keepaliveMaxConn = 30;
size_t timeout = 15;
for (int i = 0; i < argc; ++i) {
if (!strcmp(argv[i],"--server-port")) {
++i;
@ -695,6 +646,46 @@ int main(int argc, char** argv)
} else {
logfile = argv[i];
}
} else if (!strcmp(argv[i],"--server-maxconn")) {
++i;
if (i >= argc) {
cerr << "Error: Missing argument to --server-maxconn" << endl;
exit(1);
} else {
maxConn = atoi(argv[i]);
}
} else if (!strcmp(argv[i],"--server-maxconn-backlog")) {
++i;
if (i >= argc) {
cerr << "Error: Missing argument to --server-maxconn-backlog" << endl;
exit(1);
} else {
maxConnBacklog = atoi(argv[i]);
}
} else if (!strcmp(argv[i],"--server-keepalive-timeout")) {
++i;
if (i >= argc) {
cerr << "Error: Missing argument to --server-keepalive-timeout" << endl;
exit(1);
} else {
keepaliveTimeout = atoi(argv[i]);
}
} else if (!strcmp(argv[i],"--server-keepalive-maxconn")) {
++i;
if (i >= argc) {
cerr << "Error: Missing argument to --server-keepalive-maxconn" << endl;
exit(1);
} else {
keepaliveMaxConn = atoi(argv[i]);
}
} else if (!strcmp(argv[i],"--server-timeout")) {
++i;
if (i >= argc) {
cerr << "Error: Missing argument to --server-timeout" << endl;
exit(1);
} else {
timeout = atoi(argv[i]);
}
} else if (!strcmp(argv[i], "--threads")) {
++i;
if (i>=argc) {
@ -740,20 +731,27 @@ int main(int argc, char** argv)
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
/* CODE FOR old xmlrpc-c v. 1.32 or lower
xmlrpc_c::serverAbyss myAbyssServer(
myRegistry,
port, // TCP port on which to listen
logfile
);
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
*/
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04 */
xmlrpc_c::serverAbyss myAbyssServer(
xmlrpc_c::serverAbyss::constrOpt()
.registryPtr(&myRegistry)
.registryP(&myRegistry)
.portNumber(port) // TCP port on which to listen
.logFileName(logfile)
.allowOrigin("*")
.maxConn((unsigned int)maxConn)
.maxConnBacklog((unsigned int)maxConnBacklog)
.keepaliveTimeout((unsigned int)keepaliveTimeout)
.keepaliveMaxConn((unsigned int)keepaliveMaxConn)
.timeout((unsigned int)timeout)
);
*/
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {

View File

@ -2,7 +2,7 @@
# xmlrpc-c library (including the abyss server) that is needed for
# moses server functionality
if [ option.get "no-xmlrpc-c" ]
if [ option.get "no-xmlrpc-c" : : "yes" ]
{
rule xmlrpc ( what ? ) { } # never return anything
}

View File

@ -1,46 +1,139 @@
cmake_minimum_required(VERSION 2.8.8)
#
# The KenLM cmake files make use of add_library(... OBJECTS ...)
#
# This syntax allows grouping of source files when compiling
# (effectively creating "fake" libraries based on source subdirs).
#
# This syntax was only added in cmake version 2.8.8
#
# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
set(KENLM_MAX_ORDER 6)
add_definitions(-DKENLM_MAX_ORDER=${KENLM_MAX_ORDER})
# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
set(KENLM_SOURCE
bhiksha.cc
binary_format.cc
config.cc
lm_exception.cc
model.cc
quantize.cc
read_arpa.cc
search_hashed.cc
search_trie.cc
sizes.cc
trie.cc
trie_sort.cc
value_build.cc
virtual_interface.cc
vocab.cc
)
# Group these objects together for later use.
#
# Given add_library(foo OBJECT ${my_foo_sources}),
# refer to these objects as $<TARGET_OBJECTS:foo>
#
add_library(kenlm OBJECT ${KENLM_SOURCE})
# This directory has children that need to be processed
add_subdirectory(builder)
add_subdirectory(common)
add_subdirectory(filter)
# Explicitly list the executable files to be compiled
set(EXE_LIST
query
fragment
build_binary
)
# Iterate through the executable list
foreach(exe ${EXE_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${exe} ${exe}_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>)
# Link the executable against boost
target_link_libraries(${exe} ${Boost_LIBRARIES})
# Group executables together
set_target_properties(${exe} PROPERTIES FOLDER executables)
# End for loop
endforeach(exe)
# Install the executable files
install(TARGETS ${EXE_LIST} DESTINATION bin)
if(BUILD_TESTING)
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
left_test
model_test
partial_test
)
# Iterate through the Boost tests list
foreach(test ${KENLM_BOOST_TESTS_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${test} ${test}.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>)
# Require the following compile flag
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES})
# model_test requires an extra command line parameter
if ("${test}" STREQUAL "model_test")
set(test_params
${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa
)
else()
set(test_params
${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
)
endif()
# Specify command arguments for how to run each unit test
#
# Assuming that foo was defined via add_executable(foo ...),
# the syntax $<TARGET_FILE:foo> gives the full path to the executable.
#
add_test(NAME ${test}_test
COMMAND $<TARGET_FILE:${test}> ${test_params})
# Group unit tests together
set_target_properties(${test} PROPERTIES FOLDER "unit_tests")
# End for loop
endforeach(test)
endif()
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/bhiksha.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/bhiksha.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/binary_format.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/binary_format.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/blank.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/config.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/config.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/enumerate_vocab.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/facade.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/left.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/lm_exception.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/lm_exception.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/max_order.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/model.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/model.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/model_type.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/ngram_query.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/partial.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/quantize.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/quantize.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/read_arpa.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/read_arpa.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/return.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/search_hashed.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/search_hashed.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/search_trie.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/search_trie.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/sizes.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/sizes.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/state.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/trie.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/trie.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/trie_sort.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/trie_sort.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/value.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/value_build.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/value_build.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/virtual_interface.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/virtual_interface.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/vocab.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/weights.hh")
list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/word_index.hh")
add_library(kenlm OBJECT ${SOURCE_KENLM})

87
lm/builder/CMakeLists.txt Normal file
View File

@ -0,0 +1,87 @@
# Build rules for the KenLM builder library (kenlm_builder), the lmplz
# executable, and the builder unit tests.
cmake_minimum_required(VERSION 2.8.8)
#
# The KenLM cmake files make use of add_library(... OBJECTS ...)
#
# This syntax allows grouping of source files when compiling
# (effectively creating "fake" libraries based on source subdirs).
#
# This syntax was only added in cmake version 2.8.8
#
# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to set correct paths to these files
# in case this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(KENLM_BUILDER_SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc
${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc
${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc
${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc
${CMAKE_CURRENT_SOURCE_DIR}/output.cc
${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc
)
# Group these objects together for later use.
#
# Given add_library(foo OBJECT ${my_foo_sources}),
# refer to these objects as $<TARGET_OBJECTS:foo>
#
add_library(kenlm_builder OBJECT ${KENLM_BUILDER_SOURCE})
# Compile the executable, linking against the requisite dependent object files
# (lmplz pulls in the core lm, common, builder, and util object groups).
add_executable(lmplz lmplz_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_common> $<TARGET_OBJECTS:kenlm_builder> $<TARGET_OBJECTS:kenlm_util>)
# Link the executable against boost
target_link_libraries(lmplz ${Boost_LIBRARIES})
# Group executables together
set_target_properties(lmplz PROPERTIES FOLDER executables)
# Unit tests are compiled only when testing has been enabled in the
# top-level build (BUILD_TESTING).
if(BUILD_TESTING)
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
adjust_counts_test
corpus_count_test
)
# Iterate through the Boost tests list
foreach(test ${KENLM_BOOST_TESTS_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${test} ${test}.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_common> $<TARGET_OBJECTS:kenlm_builder> $<TARGET_OBJECTS:kenlm_util>)
# Require the following compile flag
# (dynamic linking for both Boost.Test and Boost.ProgramOptions)
set_target_properties(${test} PROPERTIES COMPILE_FLAGS "-DBOOST_TEST_DYN_LINK -DBOOST_PROGRAM_OPTIONS_DYN_LINK")
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES})
# Specify command arguments for how to run each unit test
#
# Assuming that foo was defined via add_executable(foo ...),
# the syntax $<TARGET_FILE:foo> gives the full path to the executable.
#
add_test(NAME ${test}_test
COMMAND $<TARGET_FILE:${test}>)
# Group unit tests together
set_target_properties(${test} PROPERTIES FOLDER "unit_tests")
# End for loop
endforeach(test)
endif()

View File

@ -15,9 +15,6 @@
#include "util/stream/timer.hh"
#include "util/tokenize_piece.hh"
#include <boost/unordered_set.hpp>
#include <boost/unordered_map.hpp>
#include <functional>
#include <stdint.h>

View File

@ -43,12 +43,13 @@ BOOST_AUTO_TEST_CASE(Short) {
util::scoped_fd vocab(util::MakeTemp("corpus_count_test_vocab"));
util::stream::Chain chain(config);
NGramStream<BuildingPayload> stream;
uint64_t token_count;
WordIndex type_count = 10;
std::vector<bool> prune_words;
CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT);
chain >> boost::ref(counter) >> stream >> util::stream::kRecycle;
chain >> boost::ref(counter);
NGramStream<BuildingPayload> stream(chain.Add());
chain >> util::stream::kRecycle;
const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"};

View File

@ -1,54 +1,18 @@
#ifndef LM_BUILDER_PRINT_H
#define LM_BUILDER_PRINT_H
#ifndef LM_BUILDER_DEBUG_PRINT_H
#define LM_BUILDER_DEBUG_PRINT_H
#include "lm/common/ngram_stream.hh"
#include "lm/builder/output.hh"
#include "lm/builder/payload.hh"
#include "lm/common/ngram.hh"
#include "lm/common/print.hh"
#include "lm/common/ngram_stream.hh"
#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/string_piece.hh"
#include <boost/lexical_cast.hpp>
#include <ostream>
#include <cassert>
// Warning: print routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to
// buffer.
namespace lm { namespace builder {
class VocabReconstitute {
public:
// fd must be alive for life of this object; does not take ownership.
explicit VocabReconstitute(int fd);
const char *Lookup(WordIndex index) const {
assert(index < map_.size() - 1);
return map_[index];
}
StringPiece LookupPiece(WordIndex index) const {
return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]);
}
std::size_t Size() const {
// There's an extra entry to support StringPiece lengths.
return map_.size() - 1;
}
private:
util::scoped_memory memory_;
std::vector<const char*> map_;
};
// Not defined, only specialized.
template <class T> void PrintPayload(util::FakeOFStream &to, const BuildingPayload &payload);
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const BuildingPayload &payload) {
// TODO slow
to << payload.count;
}
template <> inline void PrintPayload<Uninterpolated>(util::FakeOFStream &to, const BuildingPayload &payload) {
@ -101,19 +65,6 @@ template <class V> class Print {
int to_;
};
class PrintARPA : public OutputHook {
public:
explicit PrintARPA(int fd, bool verbose_header)
: OutputHook(PROB_SEQUENTIAL_HOOK), out_fd_(fd), verbose_header_(verbose_header) {}
void Sink(util::stream::Chains &chains);
void Run(const util::stream::ChainPositions &positions);
private:
util::scoped_fd out_fd_;
bool verbose_header_;
};
}} // namespaces
#endif // LM_BUILDER_PRINT_H
#endif // LM_BUILDER_DEBUG_PRINT_H

View File

@ -1,4 +1,4 @@
#include "lm/builder/print.hh"
#include "lm/common/print.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/read_compressed.hh"
@ -20,7 +20,7 @@ int main(int argc, char *argv[]) {
}
util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
lm::builder::VocabReconstitute vocab(vocab_file.get());
lm::VocabReconstitute vocab(vocab_file.get());
unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {

View File

@ -5,6 +5,8 @@
#include <vector>
#include <stdint.h>
namespace lm { namespace builder {
// Some configuration info that is used to add
// comments to the beginning of an ARPA file
struct HeaderInfo {
@ -21,4 +23,6 @@ struct HeaderInfo {
// TODO: More info if multiple models were interpolated
};
}} // namespaces
#endif

View File

@ -1,9 +1,9 @@
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/discount.hh"
#include "lm/builder/special.hh"
#include "lm/builder/hash_gamma.hh"
#include "lm/builder/payload.hh"
#include "lm/common/special.hh"
#include "lm/common/ngram_stream.hh"
#include "util/murmur_hash.hh"
#include "util/file.hh"

View File

@ -10,9 +10,8 @@
namespace util { namespace stream { class Chains; } }
namespace lm {
namespace builder {
class SpecialVocab;
namespace builder {
struct InitialProbabilitiesConfig {
// These should be small buffers to keep the adder from getting too far ahead

View File

@ -1,16 +1,16 @@
#include "lm/builder/interpolate.hh"
#include "lm/builder/hash_gamma.hh"
#include "lm/builder/joint_order.hh"
#include "lm/common/ngram_stream.hh"
#include "lm/builder/payload.hh"
#include "lm/common/compare.hh"
#include "lm/common/joint_order.hh"
#include "lm/common/ngram_stream.hh"
#include "lm/lm_exception.hh"
#include "util/fixed_array.hh"
#include "util/murmur_hash.hh"
#include <cassert>
#include <cmath>
#include <iostream>
namespace lm { namespace builder {
namespace {
@ -91,7 +91,8 @@ template <class Output> class Callback {
}
}
void Enter(unsigned order_minus_1, NGram<BuildingPayload> &gram) {
void Enter(unsigned order_minus_1, void *data) {
NGram<BuildingPayload> gram(data, order_minus_1 + 1);
BuildingPayload &pay = gram.Value();
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
probs_[order_minus_1 + 1] = pay.complete.prob;
@ -125,7 +126,7 @@ template <class Output> class Callback {
output_.Gram(order_minus_1, out_backoff, pay.complete);
}
void Exit(unsigned, const NGram<BuildingPayload> &) const {}
void Exit(unsigned, void *) const {}
private:
util::FixedArray<util::stream::Stream> backoffs_;

View File

@ -1,7 +1,7 @@
#ifndef LM_BUILDER_INTERPOLATE_H
#define LM_BUILDER_INTERPOLATE_H
#include "lm/builder/special.hh"
#include "lm/common/special.hh"
#include "lm/word_index.hh"
#include "util/stream/multi_stream.hh"

View File

@ -1,6 +1,6 @@
#include "lm/builder/output.hh"
#include "lm/builder/pipeline.hh"
#include "lm/builder/print.hh"
#include "lm/common/size_option.hh"
#include "lm/lm_exception.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
@ -13,21 +13,6 @@
#include <vector>
namespace {
class SizeNotify {
public:
SizeNotify(std::size_t &out) : behind_(out) {}
void operator()(const std::string &from) {
behind_ = util::ParseSize(from);
}
private:
std::size_t &behind_;
};
boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
}
// Parse and validate pruning thresholds then return vector of threshold counts
// for each n-grams order.
@ -106,17 +91,16 @@ int main(int argc, char *argv[]) {
("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
("memory,S", lm:: SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
("minimum_block", lm::SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
("sort_block", lm::SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes")
("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
("intermediate", po::value<std::string>(&intermediate), "Write ngrams to an intermediate file. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on. Implicitly makes --vocab_file be the provided name + .vocab.")
("intermediate", po::value<std::string>(&intermediate), "Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.")
("renumber", po::bool_switch(&pipeline.renumber_vocabulary), "Rrenumber the vocabulary identifiers so that they are monotone with the hash of each string. This is consistent with the ordering used by the trie data structure.")
("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
@ -217,15 +201,10 @@ int main(int argc, char *argv[]) {
bool writing_intermediate = vm.count("intermediate");
if (writing_intermediate) {
pipeline.renumber_vocabulary = true;
if (!pipeline.vocab_file.empty()) {
std::cerr << "--intermediate and --vocab_file are incompatible because --intermediate already makes a vocab file." << std::endl;
return 1;
}
pipeline.vocab_file = intermediate + ".vocab";
}
lm::builder::Output output(writing_intermediate ? intermediate : pipeline.sort.temp_prefix, writing_intermediate);
lm::builder::Output output(writing_intermediate ? intermediate : pipeline.sort.temp_prefix, writing_intermediate, pipeline.output_q);
if (!writing_intermediate || vm.count("arpa")) {
output.Add(new lm::builder::PrintARPA(out.release(), verbose_header));
output.Add(new lm::builder::PrintHook(out.release(), verbose_header));
}
lm::builder::Pipeline(pipeline, in.release(), output);
} catch (const util::MallocException &e) {

View File

@ -1,6 +1,8 @@
#include "lm/builder/output.hh"
#include "lm/common/model_buffer.hh"
#include "lm/common/print.hh"
#include "util/fake_ofstream.hh"
#include "util/stream/multi_stream.hh"
#include <iostream>
@ -9,23 +11,22 @@ namespace lm { namespace builder {
OutputHook::~OutputHook() {}
Output::Output(StringPiece file_base, bool keep_buffer)
: file_base_(file_base.data(), file_base.size()), keep_buffer_(keep_buffer) {}
Output::Output(StringPiece file_base, bool keep_buffer, bool output_q)
: buffer_(file_base, keep_buffer, output_q) {}
void Output::SinkProbs(util::stream::Chains &chains, bool output_q) {
void Output::SinkProbs(util::stream::Chains &chains) {
Apply(PROB_PARALLEL_HOOK, chains);
if (!keep_buffer_ && !Have(PROB_SEQUENTIAL_HOOK)) {
if (!buffer_.Keep() && !Have(PROB_SEQUENTIAL_HOOK)) {
chains >> util::stream::kRecycle;
chains.Wait(true);
return;
}
lm::common::ModelBuffer buf(file_base_, keep_buffer_, output_q);
buf.Sink(chains);
buffer_.Sink(chains, header_.counts_pruned);
chains >> util::stream::kRecycle;
chains.Wait(false);
if (Have(PROB_SEQUENTIAL_HOOK)) {
std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl;
buf.Source(chains);
buffer_.Source(chains);
Apply(PROB_SEQUENTIAL_HOOK, chains);
chains >> util::stream::kRecycle;
chains.Wait(true);
@ -34,8 +35,18 @@ void Output::SinkProbs(util::stream::Chains &chains, bool output_q) {
void Output::Apply(HookType hook_type, util::stream::Chains &chains) {
for (boost::ptr_vector<OutputHook>::iterator entry = outputs_[hook_type].begin(); entry != outputs_[hook_type].end(); ++entry) {
entry->Sink(chains);
entry->Sink(header_, VocabFile(), chains);
}
}
void PrintHook::Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) {
if (verbose_header_) {
util::FakeOFStream out(file_.get(), 50);
out << "# Input file: " << info.input_file << '\n';
out << "# Token count: " << info.token_count << '\n';
out << "# Smoothing: Modified Kneser-Ney" << '\n';
}
chains >> PrintARPA(vocab_file, file_.get(), info.counts_pruned);
}
}} // namespaces

View File

@ -2,6 +2,7 @@
#define LM_BUILDER_OUTPUT_H
#include "lm/builder/header_info.hh"
#include "lm/common/model_buffer.hh"
#include "util/file.hh"
#include <boost/ptr_container/ptr_vector.hpp>
@ -20,69 +21,64 @@ enum HookType {
NUMBER_OF_HOOKS // Keep this last so we know how many values there are.
};
class Output;
class OutputHook {
public:
explicit OutputHook(HookType hook_type) : type_(hook_type), master_(NULL) {}
explicit OutputHook(HookType hook_type) : type_(hook_type) {}
virtual ~OutputHook();
virtual void Sink(util::stream::Chains &chains) = 0;
virtual void Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) = 0;
protected:
const HeaderInfo &GetHeader() const;
int GetVocabFD() const;
HookType Type() const { return type_; }
private:
friend class Output;
const HookType type_;
const Output *master_;
HookType type_;
};
class Output : boost::noncopyable {
public:
Output(StringPiece file_base, bool keep_buffer);
Output(StringPiece file_base, bool keep_buffer, bool output_q);
// Takes ownership.
void Add(OutputHook *hook) {
hook->master_ = this;
outputs_[hook->type_].push_back(hook);
outputs_[hook->Type()].push_back(hook);
}
bool Have(HookType hook_type) const {
return !outputs_[hook_type].empty();
}
void SetVocabFD(int to) { vocab_fd_ = to; }
int GetVocabFD() const { return vocab_fd_; }
int VocabFile() const { return buffer_.VocabFile(); }
void SetHeader(const HeaderInfo &header) { header_ = header; }
const HeaderInfo &GetHeader() const { return header_; }
// This is called by the pipeline.
void SinkProbs(util::stream::Chains &chains, bool output_q);
void SinkProbs(util::stream::Chains &chains);
unsigned int Steps() const { return Have(PROB_SEQUENTIAL_HOOK); }
private:
void Apply(HookType hook_type, util::stream::Chains &chains);
boost::ptr_vector<OutputHook> outputs_[NUMBER_OF_HOOKS];
int vocab_fd_;
HeaderInfo header_;
ModelBuffer buffer_;
std::string file_base_;
bool keep_buffer_;
boost::ptr_vector<OutputHook> outputs_[NUMBER_OF_HOOKS];
HeaderInfo header_;
};
inline const HeaderInfo &OutputHook::GetHeader() const {
return master_->GetHeader();
}
class PrintHook : public OutputHook {
public:
// Takes ownership
PrintHook(int write_fd, bool verbose_header)
: OutputHook(PROB_SEQUENTIAL_HOOK), file_(write_fd), verbose_header_(verbose_header) {}
inline int OutputHook::GetVocabFD() const {
return master_->GetVocabFD();
}
void Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains);
private:
util::scoped_fd file_;
bool verbose_header_;
};
}} // namespaces

View File

@ -277,27 +277,27 @@ void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &maste
}
master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.prune_vocab, config.output_q, specials);
gamma_chains >> util::stream::kRecycle;
output.SinkProbs(master.MutableChains(), config.output_q);
output.SinkProbs(master.MutableChains());
}
class VocabNumbering {
public:
VocabNumbering(StringPiece vocab_file, StringPiece temp_prefix, bool renumber)
: vocab_file_(vocab_file.data(), vocab_file.size()),
temp_prefix_(temp_prefix.data(), temp_prefix.size()),
VocabNumbering(int final_vocab, StringPiece temp_prefix, bool renumber)
: final_vocab_(final_vocab),
renumber_(renumber),
specials_(kBOS, kEOS) {
InitFile(renumber || vocab_file.empty());
if (renumber) {
temporary_.reset(util::MakeTemp(temp_prefix));
}
}
int File() const { return null_delimited_.get(); }
int WriteOnTheFly() const { return renumber_ ? temporary_.get() : final_vocab_; }
// Compute the vocabulary mapping and return the memory used.
std::size_t ComputeMapping(WordIndex type_count) {
if (!renumber_) return 0;
util::scoped_fd previous(null_delimited_.release());
InitFile(vocab_file_.empty());
ngram::SortedVocabulary::ComputeRenumbering(type_count, previous.get(), null_delimited_.get(), vocab_mapping_);
ngram::SortedVocabulary::ComputeRenumbering(type_count, temporary_.get(), final_vocab_, vocab_mapping_);
temporary_.reset();
return sizeof(WordIndex) * vocab_mapping_.size();
}
@ -312,15 +312,9 @@ class VocabNumbering {
const SpecialVocab &Specials() const { return specials_; }
private:
void InitFile(bool temp) {
null_delimited_.reset(temp ?
util::MakeTemp(temp_prefix_) :
util::CreateOrThrow(vocab_file_.c_str()));
}
std::string vocab_file_, temp_prefix_;
util::scoped_fd null_delimited_;
int final_vocab_;
// Out of order vocab file created on the fly.
util::scoped_fd temporary_;
bool renumber_;
@ -349,18 +343,17 @@ void Pipeline(PipelineConfig &config, int text_file, Output &output) {
// master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory.
try {
VocabNumbering numbering(config.vocab_file, config.TempPrefix(), config.renumber_vocabulary);
VocabNumbering numbering(output.VocabFile(), config.TempPrefix(), config.renumber_vocabulary);
uint64_t token_count;
WordIndex type_count;
std::string text_file_name;
std::vector<bool> prune_words;
util::scoped_ptr<util::stream::Sort<SuffixOrder, CombineCounts> > sorted_counts(
CountText(text_file, numbering.File(), master, token_count, type_count, text_file_name, prune_words));
CountText(text_file, numbering.WriteOnTheFly(), master, token_count, type_count, text_file_name, prune_words));
std::cerr << "Unigram tokens " << token_count << " types " << type_count << std::endl;
// Create vocab mapping, which uses temporary memory, while nothing else is happening.
std::size_t subtract_for_numbering = numbering.ComputeMapping(type_count);
output.SetVocabFD(numbering.File());
std::cerr << "=== 2/" << master.Steps() << " Calculating and sorting adjusted counts ===" << std::endl;
master.InitForAdjust(*sorted_counts, type_count, subtract_for_numbering);

View File

@ -18,7 +18,6 @@ class Output;
struct PipelineConfig {
std::size_t order;
std::string vocab_file;
util::stream::SortConfig sort;
InitialProbabilitiesConfig initial_probs;
util::stream::ChainConfig read_backoffs;

View File

@ -1,64 +0,0 @@
#include "lm/builder/print.hh"
#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"
#include "util/stream/timer.hh"
#include <sstream>
#include <cstring>
namespace lm { namespace builder {
// Memory-map the null-delimited vocabulary file behind fd and index the start
// of every word so Lookup()/LookupPiece() can map a WordIndex back to its
// string.  fd must stay alive for the life of this object; ownership of the
// descriptor is not taken (see the declaration in print.hh).
VocabReconstitute::VocabReconstitute(int fd) {
uint64_t size = util::SizeOrThrow(fd);
util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
const char *const start = static_cast<const char*>(memory_.get());
const char *i;
// Words are NUL-terminated strings laid end to end; record each start pointer.
for (i = start; i != start + size; i += strlen(i) + 1) {
map_.push_back(i);
}
// Last one for LookupPiece.
map_.push_back(i);
}
// Attach this hook as the consumer of all chains; boost::ref avoids copying
// *this into the chain (the stream framework then drives Run()).
void PrintARPA::Sink(util::stream::Chains &chains) {
chains >> boost::ref(*this);
}
// Write the model in ARPA format to out_fd_: optional "#"-comment header,
// the \data\ section with pruned counts per order, one \N-grams: section per
// order, and the closing \end\.  Per the warning in print.hh, this consumes
// all unigrams before all bigrams before all trigrams, so upstream chains
// must be able to buffer if they move jointly.
void PrintARPA::Run(const util::stream::ChainPositions &positions) {
VocabReconstitute vocab(GetVocabFD());
util::FakeOFStream out(out_fd_.get());
// Write header.
if (verbose_header_) {
out << "# Input file: " << GetHeader().input_file << '\n';
out << "# Token count: " << GetHeader().token_count << '\n';
out << "# Smoothing: Modified Kneser-Ney" << '\n';
}
out << "\\data\\\n";
// One "ngram N=count" line per order, using counts after pruning.
for (size_t i = 0; i < positions.size(); ++i) {
out << "ngram " << (i+1) << '=' << GetHeader().counts_pruned[i] << '\n';
}
out << '\n';
for (unsigned order = 1; order <= positions.size(); ++order) {
out << "\\" << order << "-grams:" << '\n';
for (NGramStream<BuildingPayload> stream(positions[order - 1]); stream; ++stream) {
// Correcting for numerical precision issues. Take that IRST.
out << stream->Value().complete.prob << '\t' << vocab.Lookup(*stream->begin());
for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
out << ' ' << vocab.Lookup(*i);
}
// The highest order carries no backoff weight in ARPA format.
if (order != positions.size())
out << '\t' << stream->Value().complete.backoff;
out << '\n';
}
out << '\n';
}
out << "\\end\\\n";
}
}} // namespaces

40
lm/common/CMakeLists.txt Normal file
View File

@ -0,0 +1,40 @@
# Build rules for the KenLM common object library (kenlm_common), shared by
# the builder, filter, and query code.
cmake_minimum_required(VERSION 2.8.8)
#
# The KenLM cmake files make use of add_library(... OBJECTS ...)
#
# This syntax allows grouping of source files when compiling
# (effectively creating "fake" libraries based on source subdirs).
#
# This syntax was only added in cmake version 2.8.8
#
# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to set correct paths to these files
# in case this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(KENLM_COMMON_SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc
${CMAKE_CURRENT_SOURCE_DIR}/print.cc
${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc
${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc
)
# Group these objects together for later use.
#
# Given add_library(foo OBJECT ${my_foo_sources}),
# refer to these objects as $<TARGET_OBJECTS:foo>
#
add_library(kenlm_common OBJECT ${KENLM_COMMON_SOURCE})

View File

@ -1,2 +1,2 @@
fakelib common : [ glob *.cc : *test.cc *main.cc ]
../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm ;
../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm /top//boost_program_options ;

View File

@ -1,8 +1,7 @@
#ifndef LM_BUILDER_JOINT_ORDER_H
#define LM_BUILDER_JOINT_ORDER_H
#ifndef LM_COMMON_JOINT_ORDER_H
#define LM_COMMON_JOINT_ORDER_H
#include "lm/common/ngram_stream.hh"
#include "lm/builder/payload.hh"
#include "lm/lm_exception.hh"
#ifdef DEBUG
@ -12,15 +11,19 @@
#include <cstring>
namespace lm { namespace builder {
namespace lm {
template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) {
// Allow matching to reference streams[-1].
NGramStreams<BuildingPayload> streams_with_dummy;
streams_with_dummy.InitWithDummy(positions);
NGramStream<BuildingPayload> *streams = streams_with_dummy.begin() + 1;
util::FixedArray<ProxyStream<NGramHeader> > streams_with_dummy(positions.size() + 1);
// A bogus stream for [-1].
streams_with_dummy.push_back();
for (std::size_t i = 0; i < positions.size(); ++i) {
streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1));
}
ProxyStream<NGramHeader> *streams = streams_with_dummy.begin() + 1;
unsigned int order;
std::size_t order;
for (order = 0; order < positions.size() && streams[order]; ++order) {}
assert(order); // should always have <unk>.
@ -31,11 +34,11 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
less_compare.push_back(i + 1);
#endif // DEBUG
unsigned int current = 0;
std::size_t current = 0;
while (true) {
// Does the context match the lower one?
if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
callback.Enter(current, *streams[current]);
callback.Enter(current, streams[current].Get());
// Transition to looking for extensions.
if (++current < order) continue;
}
@ -51,7 +54,7 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
while(true) {
assert(current > 0);
--current;
callback.Exit(current, *streams[current]);
callback.Exit(current, streams[current].Get());
if (++streams[current]) break;
@ -63,6 +66,6 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
}
}
}} // namespaces
} // namespaces
#endif // LM_BUILDER_JOINT_ORDER_H
#endif // LM_COMMON_JOINT_ORDER_H

View File

@ -8,25 +8,30 @@
#include <boost/lexical_cast.hpp>
namespace lm { namespace common {
namespace lm {
namespace {
const char kMetadataHeader[] = "KenLM intermediate binary file";
} // namespace
ModelBuffer::ModelBuffer(const std::string &file_base, bool keep_buffer, bool output_q)
: file_base_(file_base), keep_buffer_(keep_buffer), output_q_(output_q) {}
ModelBuffer::ModelBuffer(const std::string &file_base)
: file_base_(file_base), keep_buffer_(false) {
ModelBuffer::ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q)
: file_base_(file_base.data(), file_base.size()), keep_buffer_(keep_buffer), output_q_(output_q),
vocab_file_(keep_buffer ? util::CreateOrThrow((file_base_ + ".vocab").c_str()) : util::MakeTemp(file_base_)) {}
ModelBuffer::ModelBuffer(StringPiece file_base)
: file_base_(file_base.data(), file_base.size()), keep_buffer_(false) {
const std::string full_name = file_base_ + ".kenlm_intermediate";
util::FilePiece in(full_name.c_str());
StringPiece token = in.ReadLine();
UTIL_THROW_IF2(token != kMetadataHeader, "File " << full_name << " begins with \"" << token << "\" not " << kMetadataHeader);
token = in.ReadDelimited();
UTIL_THROW_IF2(token != "Order", "Expected Order, got \"" << token << "\" in " << full_name);
unsigned long order = in.ReadULong();
UTIL_THROW_IF2(token != "Counts", "Expected Counts, got \"" << token << "\" in " << full_name);
char got;
while ((got = in.get()) == ' ') {
counts_.push_back(in.ReadULong());
}
UTIL_THROW_IF2(got != '\n', "Expected newline at end of counts.");
token = in.ReadDelimited();
UTIL_THROW_IF2(token != "Payload", "Expected Payload, got \"" << token << "\" in " << full_name);
@ -39,16 +44,16 @@ ModelBuffer::ModelBuffer(const std::string &file_base)
UTIL_THROW(util::Exception, "Unknown payload " << token);
}
files_.Init(order);
for (unsigned long i = 0; i < order; ++i) {
vocab_file_.reset(util::OpenReadOrThrow((file_base_ + ".vocab").c_str()));
files_.Init(counts_.size());
for (unsigned long i = 0; i < counts_.size(); ++i) {
files_.push_back(util::OpenReadOrThrow((file_base_ + '.' + boost::lexical_cast<std::string>(i + 1)).c_str()));
}
}
// virtual destructor
ModelBuffer::~ModelBuffer() {}
void ModelBuffer::Sink(util::stream::Chains &chains) {
void ModelBuffer::Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts) {
counts_ = counts;
// Open files.
files_.Init(chains.size());
for (std::size_t i = 0; i < chains.size(); ++i) {
@ -64,19 +69,23 @@ void ModelBuffer::Sink(util::stream::Chains &chains) {
if (keep_buffer_) {
util::scoped_fd metadata(util::CreateOrThrow((file_base_ + ".kenlm_intermediate").c_str()));
util::FakeOFStream meta(metadata.get(), 200);
meta << kMetadataHeader << "\nOrder " << chains.size() << "\nPayload " << (output_q_ ? "q" : "pb") << '\n';
meta << kMetadataHeader << "\nCounts";
for (std::vector<uint64_t>::const_iterator i = counts_.begin(); i != counts_.end(); ++i) {
meta << ' ' << *i;
}
meta << "\nPayload " << (output_q_ ? "q" : "pb") << '\n';
}
}
void ModelBuffer::Source(util::stream::Chains &chains) {
assert(chains.size() == files_.size());
for (unsigned int i = 0; i < files_.size(); ++i) {
assert(chains.size() <= files_.size());
for (unsigned int i = 0; i < chains.size(); ++i) {
chains[i] >> util::stream::PRead(files_[i].get());
}
}
std::size_t ModelBuffer::Order() const {
return files_.size();
void ModelBuffer::Source(std::size_t order_minus_1, util::stream::Chain &chain) {
chain >> util::stream::PRead(files_[order_minus_1].get());
}
}} // namespaces
} // namespace

View File

@ -1,5 +1,5 @@
#ifndef LM_BUILDER_MODEL_BUFFER_H
#define LM_BUILDER_MODEL_BUFFER_H
#ifndef LM_COMMON_MODEL_BUFFER_H
#define LM_COMMON_MODEL_BUFFER_H
/* Format with separate files in suffix order. Each file contains
* n-grams of the same order.
@ -9,37 +9,55 @@
#include "util/fixed_array.hh"
#include <string>
#include <vector>
namespace util { namespace stream { class Chains; } }
namespace util { namespace stream {
class Chains;
class Chain;
}} // namespaces
namespace lm { namespace common {
namespace lm {
class ModelBuffer {
public:
// Construct for writing.
ModelBuffer(const std::string &file_base, bool keep_buffer, bool output_q);
// Construct for writing. Must call VocabFile() and fill it with null-delimited vocab words.
ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q);
// Load from file.
explicit ModelBuffer(const std::string &file_base);
explicit ModelBuffer(StringPiece file_base);
// explicit for virtual destructor.
~ModelBuffer();
void Sink(util::stream::Chains &chains);
// Must call VocabFile and populate before calling this function.
void Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts);
// Read files and write to the given chains. If fewer chains are provided,
// only do the lower orders.
void Source(util::stream::Chains &chains);
void Source(std::size_t order_minus_1, util::stream::Chain &chain);
// The order of the n-gram model that is associated with the model buffer.
std::size_t Order() const;
std::size_t Order() const { return counts_.size(); }
// Requires Sink or load from file.
const std::vector<uint64_t> &Counts() const {
assert(!counts_.empty());
return counts_;
}
int VocabFile() const { return vocab_file_.get(); }
int StealVocabFile() { return vocab_file_.release(); }
bool Keep() const { return keep_buffer_; }
private:
const std::string file_base_;
const bool keep_buffer_;
bool output_q_;
std::vector<uint64_t> counts_;
util::scoped_fd vocab_file_;
util::FixedArray<util::scoped_fd> files_;
};
}} // namespaces
} // namespace lm
#endif // LM_BUILDER_MODEL_BUFFER_H
#endif // LM_COMMON_MODEL_BUFFER_H

View File

@ -16,6 +16,8 @@ class NGramHeader {
NGramHeader(void *begin, std::size_t order)
: begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {}
NGramHeader() : begin_(NULL), end_(NULL) {}
const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); }
uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); }
@ -32,6 +34,7 @@ class NGramHeader {
const WordIndex *end() const { return end_; }
WordIndex *end() { return end_; }
std::size_t size() const { return end_ - begin_; }
std::size_t Order() const { return end_ - begin_; }
private:
@ -42,6 +45,8 @@ template <class PayloadT> class NGram : public NGramHeader {
public:
typedef PayloadT Payload;
NGram() : NGramHeader(NULL, 0) {}
NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {}
// Would do operator++ but that can get confusing for a stream.

View File

@ -10,24 +10,21 @@
namespace lm {
template <class Payload> class NGramStream {
template <class Proxy> class ProxyStream {
public:
NGramStream() : gram_(NULL, 0) {}
// Make an invalid stream.
ProxyStream() {}
NGramStream(const util::stream::ChainPosition &position) : gram_(NULL, 0) {
Init(position);
explicit ProxyStream(const util::stream::ChainPosition &position, const Proxy &proxy = Proxy())
: proxy_(proxy), stream_(position) {
proxy_.ReBase(stream_.Get());
}
void Init(const util::stream::ChainPosition &position) {
stream_.Init(position);
gram_ = NGram<Payload>(stream_.Get(), NGram<Payload>::OrderFromSize(position.GetChain().EntrySize()));
}
Proxy &operator*() { return proxy_; }
const Proxy &operator*() const { return proxy_; }
NGram<Payload> &operator*() { return gram_; }
const NGram<Payload> &operator*() const { return gram_; }
NGram<Payload> *operator->() { return &gram_; }
const NGram<Payload> *operator->() const { return &gram_; }
Proxy *operator->() { return &proxy_; }
const Proxy *operator->() const { return &proxy_; }
void *Get() { return stream_.Get(); }
const void *Get() const { return stream_.Get(); }
@ -36,21 +33,25 @@ template <class Payload> class NGramStream {
bool operator!() const { return !stream_; }
void Poison() { stream_.Poison(); }
NGramStream &operator++() {
ProxyStream<Proxy> &operator++() {
++stream_;
gram_.ReBase(stream_.Get());
proxy_.ReBase(stream_.Get());
return *this;
}
private:
NGram<Payload> gram_;
Proxy proxy_;
util::stream::Stream stream_;
};
template <class Payload> inline util::stream::Chain &operator>>(util::stream::Chain &chain, NGramStream<Payload> &str) {
str.Init(chain.Add());
return chain;
}
template <class Payload> class NGramStream : public ProxyStream<NGram<Payload> > {
public:
// Make an invalid stream.
NGramStream() {}
explicit NGramStream(const util::stream::ChainPosition &position) :
ProxyStream<NGram<Payload> >(position, NGram<Payload>(NULL, NGram<Payload>::OrderFromSize(position.GetChain().EntrySize()))) {}
};
template <class Payload> class NGramStreams : public util::stream::GenericStreams<NGramStream<Payload> > {
private:

62
lm/common/print.cc Normal file
View File

@ -0,0 +1,62 @@
#include "lm/common/print.hh"
#include "lm/common/ngram_stream.hh"
#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"
#include <sstream>
#include <cstring>
namespace lm {
VocabReconstitute::VocabReconstitute(int fd) {
uint64_t size = util::SizeOrThrow(fd);
util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
const char *const start = static_cast<const char*>(memory_.get());
const char *i;
for (i = start; i != start + size; i += strlen(i) + 1) {
map_.push_back(i);
}
// Last one for LookupPiece.
map_.push_back(i);
}
namespace {
template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FakeOFStream &out) {
out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
out << ' ' << vocab.Lookup(*i);
}
}
} // namespace
void PrintARPA::Run(const util::stream::ChainPositions &positions) {
VocabReconstitute vocab(vocab_fd_);
util::FakeOFStream out(out_fd_);
out << "\\data\\\n";
for (size_t i = 0; i < positions.size(); ++i) {
out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
}
out << '\n';
for (unsigned order = 1; order < positions.size(); ++order) {
out << "\\" << order << "-grams:" << '\n';
for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) {
PrintLead(vocab, stream, out);
out << '\t' << stream->Value().backoff << '\n';
}
out << '\n';
}
out << "\\" << positions.size() << "-grams:" << '\n';
for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) {
PrintLead(vocab, stream, out);
out << '\n';
}
out << '\n';
out << "\\end\\\n";
}
} // namespace lm

58
lm/common/print.hh Normal file
View File

@ -0,0 +1,58 @@
#ifndef LM_COMMON_PRINT_H
#define LM_COMMON_PRINT_H
#include "lm/word_index.hh"
#include "util/mmap.hh"
#include "util/string_piece.hh"
#include <cassert>
#include <vector>
namespace util { namespace stream { class ChainPositions; }}
// Warning: PrintARPA routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to
// buffer.
namespace lm {
class VocabReconstitute {
public:
// fd must be alive for life of this object; does not take ownership.
explicit VocabReconstitute(int fd);
const char *Lookup(WordIndex index) const {
assert(index < map_.size() - 1);
return map_[index];
}
StringPiece LookupPiece(WordIndex index) const {
return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]);
}
std::size_t Size() const {
// There's an extra entry to support StringPiece lengths.
return map_.size() - 1;
}
private:
util::scoped_memory memory_;
std::vector<const char*> map_;
};
class PrintARPA {
public:
// Does not take ownership of vocab_fd or out_fd.
explicit PrintARPA(int vocab_fd, int out_fd, const std::vector<uint64_t> &counts)
: vocab_fd_(vocab_fd), out_fd_(out_fd), counts_(counts) {}
void Run(const util::stream::ChainPositions &positions);
private:
int vocab_fd_;
int out_fd_;
std::vector<uint64_t> counts_;
};
} // namespace lm
#endif // LM_COMMON_PRINT_H

24
lm/common/size_option.cc Normal file
View File

@ -0,0 +1,24 @@
#include <boost/program_options.hpp>
#include "util/usage.hh"
namespace lm {
namespace {
class SizeNotify {
public:
explicit SizeNotify(std::size_t &out) : behind_(out) {}
void operator()(const std::string &from) {
behind_ = util::ParseSize(from);
}
private:
std::size_t &behind_;
};
}
boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
}
} // namespace lm

11
lm/common/size_option.hh Normal file
View File

@ -0,0 +1,11 @@
#include <boost/program_options.hpp>
#include <cstddef>
#include <string>
namespace lm {
// Create a boost program option for data sizes. This parses sizes like 1T and 10k.
boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value);
} // namespace lm

View File

@ -1,9 +1,9 @@
#ifndef LM_BUILDER_SPECIAL_H
#define LM_BUILDER_SPECIAL_H
#ifndef LM_COMMON_SPECIAL_H
#define LM_COMMON_SPECIAL_H
#include "lm/word_index.hh"
namespace lm { namespace builder {
namespace lm {
class SpecialVocab {
public:
@ -22,6 +22,6 @@ class SpecialVocab {
WordIndex eos_;
};
}} // namespaces
} // namespace lm
#endif // LM_BUILDER_SPECIAL_H
#endif // LM_COMMON_SPECIAL_H

62
lm/filter/CMakeLists.txt Normal file
View File

@ -0,0 +1,62 @@
cmake_minimum_required(VERSION 2.8.8)
#
# The KenLM cmake files make use of add_library(... OBJECTS ...)
#
# This syntax allows grouping of source files when compiling
# (effectively creating "fake" libraries based on source subdirs).
#
# This syntax was only added in cmake version 2.8.8
#
# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to set correct paths to these files
# in case this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(KENLM_FILTER_SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/arpa_io.cc
${CMAKE_CURRENT_SOURCE_DIR}/phrase.cc
${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc
)
# Group these objects together for later use.
#
# Given add_library(foo OBJECT ${my_foo_sources}),
# refer to these objects as $<TARGET_OBJECTS:foo>
#
add_library(kenlm_filter OBJECT ${KENLM_FILTER_SOURCE})
# Explicitly list the executable files to be compiled
set(EXE_LIST
filter
phrase_table_vocab
)
# Iterate through the executable list
foreach(exe ${EXE_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${exe} ${exe}_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_filter> $<TARGET_OBJECTS:kenlm_util>)
# Link the executable against boost
target_link_libraries(${exe} ${Boost_LIBRARIES})
# Group executables together
set_target_properties(${exe} PROPERTIES FOLDER executables)
# End for loop
endforeach(exe)

View File

@ -5,10 +5,7 @@
#include <vector>
#include "StatisticsBasedScorer.h"
#include "moses/FF/InternalTree.h"
using Moses::TreePointer;
using Moses::InternalTree;
#include "InternalTree.h"
namespace MosesTuning
{

110
mert/InternalTree.cpp Normal file
View File

@ -0,0 +1,110 @@
#include "InternalTree.h"
namespace MosesTuning
{
InternalTree::InternalTree(const std::string & line, const bool terminal):
m_isTerminal(terminal)
{
size_t found = line.find_first_of("[] ");
if (found == line.npos) {
m_value = line;
}
else {
AddSubTree(line, 0);
}
}
size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
{
std::string value;
char token = 0;
while (token != ']' && pos != std::string::npos) {
size_t oldpos = pos;
pos = line.find_first_of("[] ", pos);
if (pos == std::string::npos) break;
token = line[pos];
value = line.substr(oldpos,pos-oldpos);
if (token == '[') {
if (m_value.size() > 0) {
m_children.push_back(boost::make_shared<InternalTree>(value,false));
pos = m_children.back()->AddSubTree(line, pos+1);
} else {
if (value.size() > 0) {
m_value = value;
}
pos = AddSubTree(line, pos+1);
}
} else if (token == ' ' || token == ']') {
if (value.size() > 0 && !(m_value.size() > 0)) {
m_value = value;
} else if (value.size() > 0) {
m_isTerminal = false;
m_children.push_back(boost::make_shared<InternalTree>(value,true));
}
if (token == ' ') {
pos++;
}
}
if (m_children.size() > 0) {
m_isTerminal = false;
}
}
if (pos == std::string::npos) {
return line.size();
}
return std::min(line.size(),pos+1);
}
std::string InternalTree::GetString(bool start) const
{
std::string ret = "";
if (!start) {
ret += " ";
}
if (!m_isTerminal) {
ret += "[";
}
ret += m_value;
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
ret += (*it)->GetString(false);
}
if (!m_isTerminal) {
ret += "]";
}
return ret;
}
void InternalTree::Combine(const std::vector<TreePointer> &previous)
{
std::vector<TreePointer>::iterator it;
bool found = false;
leafNT next_leafNT(this);
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
found = next_leafNT(it);
if (found) {
*it = *it_prev;
} else {
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
}
}
}
}

77
mert/InternalTree.h Normal file
View File

@ -0,0 +1,77 @@
#pragma once
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include "util/generator.hh"
#include "util/exception.hh"
namespace MosesTuning
{
class InternalTree;
typedef boost::shared_ptr<InternalTree> TreePointer;
typedef int NTLabel;
class InternalTree
{
std::string m_value;
std::vector<TreePointer> m_children;
bool m_isTerminal;
public:
InternalTree(const std::string & line, const bool terminal = false);
InternalTree(const InternalTree & tree):
m_value(tree.m_value),
m_isTerminal(tree.m_isTerminal) {
const std::vector<TreePointer> & children = tree.m_children;
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
m_children.push_back(boost::make_shared<InternalTree>(**it));
}
}
size_t AddSubTree(const std::string & line, size_t start);
std::string GetString(bool start = true) const;
void Combine(const std::vector<TreePointer> &previous);
const std::string & GetLabel() const {
return m_value;
}
size_t GetLength() const {
return m_children.size();
}
std::vector<TreePointer> & GetChildren() {
return m_children;
}
bool IsTerminal() const {
return m_isTerminal;
}
bool IsLeafNT() const {
return (!m_isTerminal && m_children.size() == 0);
}
};
// Python-like generator that yields next nonterminal leaf on every call
$generator(leafNT)
{
std::vector<TreePointer>::iterator it;
InternalTree* tree;
leafNT(InternalTree* root = 0): tree(root) {}
$emit(std::vector<TreePointer>::iterator)
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
$yield(it);
} else if ((*it)->GetLength() > 0) {
if ((*it).get()) { // normal pointer to same object that TreePointer points to
$restart(tree = (*it).get());
}
}
}
$stop;
};
}

View File

@ -30,7 +30,7 @@ InterpolatedScorer.cpp
Point.cpp
PerScorer.cpp
HwcmScorer.cpp
../moses/FF/InternalTree.cpp
InternalTree.cpp
Scorer.cpp
ScorerFactory.cpp
Optimizer.cpp

View File

@ -14,6 +14,8 @@ exe 1-1-Extraction : 1-1-Extraction.cpp ..//boost_filesystem ../moses//moses ;
exe prunePhraseTable : prunePhraseTable.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ;
exe pruneGeneration : pruneGeneration.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ;
local with-cmph = [ option.get "with-cmph" ] ;
if $(with-cmph) {
exe processPhraseTableMin : processPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ;
@ -46,6 +48,6 @@ $(TOP)//boost_iostreams
$(TOP)//boost_program_options
;
alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ;
alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable pruneGeneration ;
#processPhraseTable queryPhraseTable

98
misc/pruneGeneration.cpp Normal file
View File

@ -0,0 +1,98 @@
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <algorithm>
#include <functional>
#include <boost/filesystem.hpp>
#include "pruneGeneration.h"
#include "moses/InputFileStream.h"
#include "moses/OutputFileStream.h"
using namespace std;
int main(int argc, char **argv)
{
cerr << "Starting" << endl;
int limit = atoi(argv[1]);
string inPathStem = argv[2];
string outPathStem = argv[3];
namespace fs = boost::filesystem;
//cerr << "inPathStem=" << inPathStem << endl;
fs::path p(inPathStem);
fs::path dir = p.parent_path();
//cerr << "dir=" << dir << endl;
fs::path fileStem = p.filename();
string fileStemStr = fileStem.native();
size_t fileStemStrSize = fileStemStr.size();
//cerr << "fileStem=" << fileStemStr << endl;
// loop thru each file in directory
fs::directory_iterator end_iter;
for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) {
if (fs::is_regular_file(dir_iter->status())) {
fs::path currPath = *dir_iter;
string currPathStr = currPath.native();
//cerr << "currPathStr=" << currPathStr << endl;
fs::path currFile = currPath.filename();
string currFileStr = currFile.native();
if (currFileStr.find(fileStemStr) == 0) {
// found gen table we need
//cerr << "found=" << currPathStr << endl;
string suffix = currFileStr.substr(fileStemStrSize, currFileStr.size() - fileStemStrSize);
string outPath = outPathStem + suffix;
cerr << "PRUNING " << currPathStr << " TO " << outPath << endl;
Moses::InputFileStream inStrme(currPathStr);
Moses::OutputFileStream outStrme(outPath);
Process(limit, inStrme, outStrme);
}
}
}
cerr << "Finished" << endl;
}
void Process(int limit, istream &inStrme, ostream &outStrme)
{
vector<Rec> records;
string prevInWord;
string line;
while (getline(inStrme, line)) {
vector<string> toks;
Tokenize(toks, line);
assert(toks.size() == 4);
if (prevInWord != toks[0]) {
Output(outStrme, records, limit);
records.clear();
}
// add new record
float prob = atof(toks[2].c_str());
records.push_back(Rec(prob, line));
prevInWord = toks[0];
}
// last
Output(outStrme, records, limit);
records.clear();
}
void Output(ostream &outStrme, vector<Rec> &records, int limit)
{
std::sort(records.rbegin(), records.rend());
for (size_t i = 0; i < limit && i < records.size(); ++i) {
const Rec &rec = records[i];
outStrme << rec.line << endl;
}
}

46
misc/pruneGeneration.h Normal file
View File

@ -0,0 +1,46 @@
#pragma once
#include <vector>
#include <string>
#include <iostream>
class Rec
{
public:
float prob;
std::string line;
Rec(float aprob, const std::string &aline)
:prob(aprob)
,line(aline)
{}
inline bool operator< (const Rec &compare) const {
return prob < compare.prob;
}
};
////////////////////////////////////////////////////////////
void Process(int limit, std::istream &inStrme, std::ostream &outStrme);
void Output(std::ostream &outStrme, std::vector<Rec> &records, int limit);
////////////////////////////////////////////////////////////
inline void Tokenize(std::vector<std::string> &output
, const std::string& str
, const std::string& delimiters = " \t")
{
// Skip delimiters at beginning.
std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
// Find first "non-delimiter".
std::string::size_type pos = str.find_first_of(delimiters, lastPos);
while (std::string::npos != pos || std::string::npos != lastPos) {
// Found a token, add it to the vector.
output.push_back(str.substr(lastPos, pos - lastPos));
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(delimiters, pos);
// Find next "non-delimiter"
pos = str.find_first_of(delimiters, lastPos);
}
}

View File

@ -159,13 +159,15 @@ int main(int argc, char* argv[])
}
StaticData& SD = const_cast<StaticData&>(StaticData::Instance());
SD.SetUseLatticeMBR(true);
LMBR_Options& lmbr = SD.options().lmbr;
MBR_Options& mbr = SD.options().mbr;
lmbr.enabled = true;
boost::shared_ptr<IOWrapper> ioWrapper(new IOWrapper);
if (!ioWrapper) {
throw runtime_error("Failed to initialise IOWrapper");
}
size_t nBestSize = SD.GetMBRSize();
size_t nBestSize = mbr.size;
if (nBestSize <= 0) {
throw new runtime_error("Non-positive size specified for n-best list");
@ -187,13 +189,13 @@ int main(int argc, char* argv[])
manager.CalcNBest(nBestSize, nBestList,true);
//grid search
BOOST_FOREACH(float const& p, pgrid) {
SD.SetLatticeMBRPrecision(p);
lmbr.precision = p;
BOOST_FOREACH(float const& r, rgrid) {
SD.SetLatticeMBRPRatio(r);
lmbr.ratio = r;
BOOST_FOREACH(size_t const prune_i, prune_grid) {
SD.SetLatticeMBRPruningFactor(size_t(prune_i));
lmbr.pruning_factor = prune_i;
BOOST_FOREACH(float const& scale_i, scale_grid) {
SD.SetMBRScale(scale_i);
mbr.scale = scale_i;
size_t lineCount = source->GetTranslationId();
cout << lineCount << " ||| " << p << " "
<< r << " " << size_t(prune_i) << " " << scale_i

View File

@ -27,6 +27,12 @@ BaseManager::GetSource() const
return m_source;
}
const ttasksptr
BaseManager::GetTtask() const
{
return m_ttask.lock();
}
void
BaseManager::
OutputSearchGraphAsHypergraph(std::ostream& out) const
@ -134,6 +140,14 @@ void BaseManager::WriteApplicationContext(std::ostream &out,
}
}
AllOptions const&
BaseManager::
options() const
{
return GetTtask()->options();
}
} // namespace

View File

@ -5,7 +5,7 @@
#include <string>
#include "ScoreComponentCollection.h"
#include "InputType.h"
#include "moses/parameters/AllOptions.h"
namespace Moses
{
class ScoreComponentCollection;
@ -50,6 +50,8 @@ public:
//! the input sentence being decoded
const InputType& GetSource() const;
const ttasksptr GetTtask() const;
AllOptions const& options() const;
virtual void Decode() = 0;
// outputs

View File

@ -53,7 +53,7 @@ ChartCell::ChartCell(size_t startPos, size_t endPos, ChartManager &manager) :
ChartCellBase(startPos, endPos), m_manager(manager)
{
const StaticData &staticData = StaticData::Instance();
m_nBestIsEnabled = staticData.IsNBestEnabled();
m_nBestIsEnabled = staticData.options().nbest.enabled;
}
ChartCell::~ChartCell() {}
@ -100,7 +100,7 @@ void ChartCell::Decode(const ChartTranslationOptionList &transOptList
}
// pluck things out of queue and add to hypo collection
const size_t popLimit = staticData.GetCubePruningPopLimit();
const size_t popLimit = staticData.options().cube.pop_limit;
for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) {
ChartHypothesis *hypo = queue.Pop();
AddHypothesis(hypo);

View File

@ -287,8 +287,11 @@ void ChartHypothesis::CleanupArcList()
* so we'll keep all of arc list if nedd distinct n-best list
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphHypergraph();
size_t nBestSize = staticData.options().nbest.nbest_size;
bool distinctNBest = (staticData.options().nbest.only_distinct
|| staticData.options().mbr.enabled
|| staticData.GetOutputSearchGraph()
|| staticData.GetOutputSearchGraphHypergraph());
if (!distinctNBest && m_arcList->size() > nBestSize) {
// prune arc list only if there too many arcs

View File

@ -38,8 +38,8 @@ ChartHypothesisCollection::ChartHypothesisCollection()
const StaticData &staticData = StaticData::Instance();
m_beamWidth = staticData.GetBeamWidth();
m_maxHypoStackSize = staticData.GetMaxHypoStackSize();
m_nBestIsEnabled = staticData.IsNBestEnabled();
m_maxHypoStackSize = staticData.options().search.stack_size;
m_nBestIsEnabled = staticData.options().nbest.enabled;
m_bestScore = -std::numeric_limits<float>::infinity();
}

View File

@ -52,11 +52,7 @@ public:
// shouldn't be mixing hypos with different lhs
assert(hypoA->GetTargetLHS() == hypoB->GetTargetLHS());
int ret = hypoA->RecombineCompare(*hypoB);
if (ret != 0)
return (ret < 0);
return false;
return (hypoA->RecombineCompare(*hypoB) < 0);
}
};

View File

@ -207,7 +207,7 @@ void ChartManager::CalcNBest(
// with 0 being 'unlimited.' This actually sets a large-ish limit in case
// too many translations are identical.
const StaticData &staticData = StaticData::Instance();
const std::size_t nBestFactor = staticData.GetNBestFactor();
const std::size_t nBestFactor = staticData.options().nbest.factor;
std::size_t numDerivations = (nBestFactor == 0) ? n*1000 : n*nBestFactor;
// Extract the derivations.
@ -318,13 +318,14 @@ void ChartManager::OutputBest(OutputCollector *collector) const
void ChartManager::OutputNBest(OutputCollector *collector) const
{
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
size_t nBestSize = staticData.options().nbest.nbest_size;
if (nBestSize > 0) {
const size_t translationId = m_source.GetTranslationId();
VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl);
VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO "
<< staticData.options().nbest.output_file_path << endl);
std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest());
CalcNBest(nBestSize, nBestList,staticData.options().nbest.only_distinct);
OutputNBestList(collector, nBestList, translationId);
IFVERBOSE(2) {
PrintUserTime("N-Best Hypotheses Generation Time:");
@ -348,10 +349,9 @@ void ChartManager::OutputNBestList(OutputCollector *collector,
FixPrecision(out);
}
bool includeWordAlignment =
StaticData::Instance().PrintAlignmentInfoInNbest();
bool PrintNBestTrees = StaticData::Instance().PrintNBestTrees();
NBestOptions const& nbo = StaticData::Instance().options().nbest;
bool includeWordAlignment = nbo.include_alignment_info;
bool PrintNBestTrees = nbo.print_trees;
for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
p != nBestList.end(); ++p) {
@ -620,9 +620,9 @@ void ChartManager::OutputDetailedTranslationReport(
if (staticData.IsDetailedAllTranslationReportingEnabled()) {
const Sentence &sentence = dynamic_cast<const Sentence &>(m_source);
size_t nBestSize = staticData.GetNBestSize();
size_t nBestSize = staticData.options().nbest.nbest_size;
std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
CalcNBest(nBestSize, nBestList, staticData.GetDistinctNBest());
CalcNBest(nBestSize, nBestList, staticData.options().nbest.nbest_size);
OutputDetailedAllTranslationReport(collector, nBestList, sentence, translationId);
}

View File

@ -106,7 +106,8 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
targetPhrase->SetTargetLHS(targetLHS);
targetPhrase->SetAlignmentInfo("0-0");
targetPhrase->EvaluateInIsolation(*unksrc);
if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.PrintNBestTrees() || staticData.GetTreeStructure() != NULL) {
if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.options().nbest.print_trees || staticData.GetTreeStructure() != NULL) {
targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
}

View File

@ -1,3 +1,4 @@
// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
// $Id$
#include "ConfusionNet.h"
@ -65,9 +66,9 @@ ConfusionNet() : InputType()
{
stats.createOne();
const StaticData& staticData = StaticData::Instance();
if (staticData.IsSyntax()) {
m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
const StaticData& SD = StaticData::Instance();
if (SD.IsSyntax()) {
m_defaultLabelSet.insert(SD.GetInputDefaultNonTerminal());
}
UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
}

View File

@ -1,3 +1,4 @@
// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
// $Id: ExportInterface.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $
/***********************************************************************
@ -63,9 +64,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <xmlrpc-c/base.hpp>
#include <xmlrpc-c/registry.hpp>
#include <xmlrpc-c/server_abyss.hpp>
#include "server/Translator.h"
#include "server/Optimizer.h"
#include "server/Updater.h"
#include "server/Server.h"
#endif
using namespace std;
@ -147,41 +146,9 @@ int
run_as_server()
{
#ifdef HAVE_XMLRPC_C
int port;
params.SetParameter(port, "server-port", 8080);
bool isSerial;
params.SetParameter(isSerial, "serial", false);
string logfile;
params.SetParameter(logfile, "server-log", string(""));
size_t num_threads;
params.SetParameter(num_threads, "threads", size_t(10));
if (isSerial) VERBOSE(1,"Running server in serial mode." << endl);
xmlrpc_c::registry myRegistry;
xmlrpc_c::methodPtr const translator(new MosesServer::Translator(num_threads));
xmlrpc_c::methodPtr const updater(new MosesServer::Updater);
xmlrpc_c::methodPtr const optimizer(new MosesServer::Optimizer);
myRegistry.addMethod("translate", translator);
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
xmlrpc_c::serverAbyss myAbyssServer(myRegistry, port, logfile);
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {
while(1) myAbyssServer.runOnce();
} else myAbyssServer.run();
std::cerr << "xmlrpc_c::serverAbyss.run() returned but should not." << std::endl;
// #pragma message("BUILDING MOSES WITH SERVER SUPPORT")
#else
// #pragma message("BUILDING MOSES WITHOUT SERVER SUPPORT")
std::cerr << "Moses was compiled without server support." << endl;
MosesServer::Server server(params);
return server.run(); // actually: don't return. see Server::run()
#endif
return 1;
}
int
@ -212,21 +179,57 @@ batch_run()
ThreadPool pool(staticData.ThreadCount());
#endif
// using context for adaptation:
// e.g., context words / strings from config file / cmd line
std::string context_string;
params.SetParameter(context_string,"context-string",string(""));
// main loop over set of input sentences
// ... or weights for documents/domains from config file / cmd. line
std::string context_weights;
params.SetParameter(context_weights,"context-weights",string(""));
// ... or the surrounding context (--context-window ...)
size_t size_t_max = std::numeric_limits<size_t>::max();
bool use_context_window = ioWrapper->GetLookAhead() || ioWrapper->GetLookBack();
bool use_context = use_context_window || context_string.size();
bool use_sliding_context_window = (use_context_window
&& ioWrapper->GetLookAhead() != size_t_max);
boost::shared_ptr<std::vector<std::string> > context_window;
boost::shared_ptr<std::vector<std::string> >* cw;
cw = use_context_window ? &context_window : NULL;
if (!cw && context_string.size())
context_window.reset(new std::vector<std::string>(1,context_string));
// global scope of caches, biases, etc., if any
boost::shared_ptr<ContextScope> gscope;
if (!use_sliding_context_window)
gscope.reset(new ContextScope);
// main loop over set of input sentences
boost::shared_ptr<InputType> source;
while ((source = ioWrapper->ReadInput()) != NULL) {
while ((source = ioWrapper->ReadInput(cw)) != NULL) {
IFVERBOSE(1) ResetUserTime();
// set up task of translating one sentence
boost::shared_ptr<TranslationTask>
task = TranslationTask::create(source, ioWrapper);
if (source->GetContext())
task->SetContextString(*source->GetContext());
else task->SetContextString(context_string);
boost::shared_ptr<ContextScope> lscope;
if (gscope) lscope = gscope;
else lscope.reset(new ContextScope);
boost::shared_ptr<TranslationTask> task;
task = TranslationTask::create(source, ioWrapper, lscope);
if (cw) {
if (context_string.size())
context_window->push_back(context_string);
if(!use_sliding_context_window)
cw = NULL;
}
if (context_window)
task->SetContextWindow(context_window);
if (context_weights != "")
task->SetContextWeights(context_weights);
// Allow for (sentence-)context-specific processing prior to
// decoding. This can be used, for example, for context-sensitive

View File

@ -1,3 +1,4 @@
// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
#pragma once
// $Id$

View File

@ -43,7 +43,9 @@ ConstrainedDecoding::ConstrainedDecoding(const std::string &line)
void ConstrainedDecoding::Load()
{
const StaticData &staticData = StaticData::Instance();
bool addBeginEndWord = (staticData.GetSearchAlgorithm() == CYKPlus) || (staticData.GetSearchAlgorithm() == ChartIncremental);
bool addBeginEndWord
= ((staticData.options().search.algo == CYKPlus)
|| (staticData.options().search.algo == ChartIncremental));
for(size_t i = 0; i < m_paths.size(); ++i) {
InputFileStream constraintFile(m_paths[i]);

View File

@ -6,7 +6,6 @@
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModel.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryScope3.h"
#include "moses/TranslationModel/PhraseDictionaryTransliteration.h"
#include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h"
@ -152,7 +151,7 @@ FeatureFactory
::DefaultSetup(F *feature)
{
StaticData &static_data = StaticData::InstanceNonConst();
const string &featureName = feature->GetScoreProducerDescription();
const std::string &featureName = feature->GetScoreProducerDescription();
std::vector<float> weights = static_data.GetParameter()->GetWeights(featureName);
@ -165,8 +164,8 @@ FeatureFactory
<< "WARNING: Auto-initializing all weights for this FF to 1.0");
weights.assign(feature->GetNumScoreComponents(),1.0);
} else {
TRACE_ERR("WARNING: No weights specified in config file for FF "
<< featureName << ". Using default values supplied by FF.");
VERBOSE(2,"WARNING: No weights specified in config file for FF "
<< featureName << ". Using default values supplied by FF.");
}
}
UTIL_THROW_IF2(weights.size() != feature->GetNumScoreComponents(),
@ -215,7 +214,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(PhraseDictionaryMultiModel);
MOSES_FNAME(PhraseDictionaryMultiModelCounts);
MOSES_FNAME(PhraseDictionaryALSuffixArray);
MOSES_FNAME(PhraseDictionaryDynSuffixArray);
// MOSES_FNAME(PhraseDictionaryDynSuffixArray);
MOSES_FNAME(PhraseDictionaryTransliteration);
MOSES_FNAME(PhraseDictionaryDynamicCacheBased);
MOSES_FNAME(PhraseDictionaryFuzzyMatch);
@ -353,18 +352,18 @@ void FeatureRegistry::Construct(const std::string &name, const std::string &line
void FeatureRegistry::PrintFF() const
{
vector<string> ffs;
std::vector<std::string> ffs;
std::cerr << "Available feature functions:" << std::endl;
Map::const_iterator iter;
for (iter = registry_.begin(); iter != registry_.end(); ++iter) {
const string &ffName = iter->first;
const std::string &ffName = iter->first;
ffs.push_back(ffName);
}
vector<string>::const_iterator iterVec;
std::vector<std::string>::const_iterator iterVec;
std::sort(ffs.begin(), ffs.end());
for (iterVec = ffs.begin(); iterVec != ffs.end(); ++iterVec) {
const string &ffName = *iterVec;
const std::string &ffName = *iterVec;
std::cerr << ffName << " ";
}

View File

@ -19,8 +19,8 @@ HyperParameterAsWeight::HyperParameterAsWeight(const std::string &line)
vector<float> weights = staticData.GetWeights(this);
staticData.m_maxHypoStackSize = weights[0] * 1000;
staticData.m_beamWidth = weights[1] * 10;
staticData.m_options.search.stack_size = weights[0] * 1000;
staticData.m_options.search.beam_width = weights[1] * 10;
}

View File

@ -1,27 +1,24 @@
#include "InternalTree.h"
#include "moses/StaticData.h"
namespace Moses
{
InternalTree::InternalTree(const std::string & line, size_t start, size_t len, const bool terminal):
m_value_nt(0),
m_isTerminal(terminal)
InternalTree::InternalTree(const std::string & line, size_t start, size_t len, const bool nonterminal)
{
if (len > 0) {
m_value.assign(line, start, len);
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(start, len), nonterminal);
}
}
InternalTree::InternalTree(const std::string & line, const bool terminal):
m_value_nt(0),
m_isTerminal(terminal)
InternalTree::InternalTree(const std::string & line, const bool nonterminal)
{
size_t found = line.find_first_of("[] ");
if (found == line.npos) {
m_value = line;
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), line, nonterminal);
} else {
AddSubTree(line, 0);
}
@ -32,6 +29,7 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
char token = 0;
size_t len = 0;
bool has_value = false;
while (token != ']' && pos != std::string::npos) {
size_t oldpos = pos;
@ -41,30 +39,27 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
len = pos-oldpos;
if (token == '[') {
if (!m_value.empty()) {
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, false));
if (has_value) {
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, true));
pos = m_children.back()->AddSubTree(line, pos+1);
} else {
if (len > 0) {
m_value.assign(line, oldpos, len);
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(oldpos, len), false);
has_value = true;
}
pos = AddSubTree(line, pos+1);
}
} else if (token == ' ' || token == ']') {
if (len > 0 && m_value.empty()) {
m_value.assign(line, oldpos, len);
if (len > 0 && !has_value) {
m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(oldpos, len), true);
has_value = true;
} else if (len > 0) {
m_isTerminal = false;
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, true));
m_children.push_back(boost::make_shared<InternalTree>(line, oldpos, len, false));
}
if (token == ' ') {
pos++;
}
}
if (!m_children.empty()) {
m_isTerminal = false;
}
}
if (pos == std::string::npos) {
@ -82,16 +77,16 @@ std::string InternalTree::GetString(bool start) const
ret += " ";
}
if (!m_isTerminal) {
if (!IsTerminal()) {
ret += "[";
}
ret += m_value;
ret += m_value.GetString(StaticData::Instance().GetOutputFactorOrder(), false);
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
ret += (*it)->GetString(false);
}
if (!m_isTerminal) {
if (!IsTerminal()) {
ret += "]";
}
return ret;
@ -120,13 +115,13 @@ void InternalTree::Unbinarize()
{
// nodes with virtual label cannot be unbinarized
if (m_value.empty() || m_value[0] == '^') {
if (m_value.GetString(0).empty() || m_value.GetString(0).as_string()[0] == '^') {
return;
}
//if node has child that is virtual node, get unbinarized list of children
for (std::vector<TreePointer>::iterator it = m_children.begin(); it != m_children.end(); ++it) {
if (!(*it)->IsTerminal() && (*it)->GetLabel()[0] == '^') {
if (!(*it)->IsTerminal() && (*it)->GetLabel().GetString(0).as_string()[0] == '^') {
std::vector<TreePointer> new_children;
GetUnbinarizedChildren(new_children);
m_children = new_children;
@ -144,8 +139,8 @@ void InternalTree::Unbinarize()
void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
{
for (std::vector<TreePointer>::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) {
const std::string &label = (*itx)->GetLabel();
if (!label.empty() && label[0] == '^') {
const StringPiece label = (*itx)->GetLabel().GetString(0);
if (!label.empty() && label.as_string()[0] == '^') {
(*itx)->GetUnbinarizedChildren(ret);
} else {
ret.push_back(*itx);
@ -153,7 +148,7 @@ void InternalTree::GetUnbinarizedChildren(std::vector<TreePointer> &ret) const
}
}
bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
bool InternalTree::FlatSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -163,7 +158,7 @@ bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer
return false;
}
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
bool InternalTree::RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -178,7 +173,7 @@ bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePo
return false;
}
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
bool InternalTree::RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetLabel() == label) {
@ -194,88 +189,4 @@ bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePo
return false;
}
bool InternalTree::FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(label, it2)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if ((*it)->GetNTLabel() == label) {
parent = this;
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(label, it2, parent)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(labels, it2)) {
it = it2;
return true;
}
}
return false;
}
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
{
for (it = m_children.begin(); it != m_children.end(); ++it) {
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
parent = this;
return true;
}
std::vector<TreePointer>::const_iterator it2;
if ((*it)->RecursiveSearch(labels, it2, parent)) {
it = it2;
return true;
}
}
return false;
}
}

View File

@ -5,30 +5,28 @@
#include <map>
#include <vector>
#include "FFState.h"
#include "moses/Word.h"
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include "util/generator.hh"
#include "util/exception.hh"
#include "util/string_piece.hh"
namespace Moses
{
class InternalTree;
typedef boost::shared_ptr<InternalTree> TreePointer;
typedef int NTLabel;
class InternalTree
{
std::string m_value;
NTLabel m_value_nt;
Word m_value;
std::vector<TreePointer> m_children;
bool m_isTerminal;
public:
InternalTree(const std::string & line, size_t start, size_t len, const bool terminal);
InternalTree(const std::string & line, const bool terminal = false);
InternalTree(const std::string & line, const bool nonterminal = true);
InternalTree(const InternalTree & tree):
m_value(tree.m_value),
m_isTerminal(tree.m_isTerminal) {
m_value(tree.m_value) {
const std::vector<TreePointer> & children = tree.m_children;
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
m_children.push_back(boost::make_shared<InternalTree>(**it));
@ -40,20 +38,10 @@ public:
void Combine(const std::vector<TreePointer> &previous);
void Unbinarize();
void GetUnbinarizedChildren(std::vector<TreePointer> &children) const;
const std::string & GetLabel() const {
const Word & GetLabel() const {
return m_value;
}
// optionally identify label by int instead of string;
// allows abstraction if multiple nonterminal strings should map to same label.
const NTLabel & GetNTLabel() const {
return m_value_nt;
}
void SetNTLabel(NTLabel value) {
m_value_nt = value;
}
size_t GetLength() const {
return m_children.size();
}
@ -62,38 +50,22 @@ public:
}
bool IsTerminal() const {
return m_isTerminal;
return !m_value.IsNonTerminal();
}
bool IsLeafNT() const {
return (!m_isTerminal && m_children.size() == 0);
return (m_value.IsNonTerminal() && m_children.size() == 0);
}
// different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents.
// can be used for formulating syntax constraints.
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
bool FlatSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// use NTLabel for search to reduce number of string comparisons / deal with synonymous labels
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// pass vector of possible labels to search
// if found, 'it' is iterator to first tree node that matches search string
bool FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
bool RecursiveSearch(const Word & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
// Python-like generator that yields next nonterminal leaf on every call
$generator(leafNT) {

View File

@ -1,4 +1,4 @@
// -*- c++ -*-
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
#pragma once
#include <string>

View File

@ -1,6 +1,5 @@
// -*- c++ -*-
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
#pragma once
#include <vector>
#include <string>
@ -12,7 +11,6 @@
#include "moses/WordsBitmap.h"
#include "moses/TranslationOption.h"
#include "moses/FF/FFState.h"
#include "ReorderingStack.h"
namespace Moses

View File

@ -75,7 +75,7 @@ void Model1Vocabulary::Load(const std::string& fileName)
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned id = Scan<unsigned>(tokens[0]);
unsigned id = atoll( tokens[0].c_str() );
if (! ( (id == 1) && (tokens[1] == "UNK") )) {
const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading?
bool stored = Store(factor, id);
@ -86,7 +86,7 @@ void Model1Vocabulary::Load(const std::string& fileName)
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned id = Scan<unsigned>(tokens[0]);
unsigned id = atoll( tokens[0].c_str() );
const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading?
bool stored = Store(factor, id);
UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
@ -105,11 +105,11 @@ void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabular
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned idS = Scan<unsigned>(tokens[0]);
unsigned idT = Scan<unsigned>(tokens[1]);
unsigned idS = atoll( tokens[0].c_str() );
unsigned idT = atoll( tokens[1].c_str() );
const Factor* wordS = vcbS.GetWord(idS);
const Factor* wordT = vcbT.GetWord(idT);
float prob = Scan<float>(tokens[2]);
float prob = std::atof( tokens[2].c_str() );
if ( (wordS != NULL) && (wordT != NULL) ) {
m_ltable[ wordS ][ wordT ] = prob;
}

View File

@ -134,7 +134,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (targetPhrase.GetAlignNonTerm().GetSize() != 0) {
// Initialize phrase orientation scoring object
MosesTraining::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(),
MosesTraining::Syntax::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(),
targetPhrase.GetAlignTerm(), targetPhrase.GetAlignNonTerm());
PhraseOrientationFeature::ReoClassData* reoClassData = new PhraseOrientationFeature::ReoClassData();
@ -150,7 +150,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
// LEFT-TO-RIGHT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_L2R);
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::Syntax::GHKM::PhraseOrientation::REO_DIR_L2R);
if ( ((targetIndex == 0) || !phraseOrientation.TargetSpanIsAligned(0,targetIndex)) // boundary non-terminal in rule-initial position (left boundary)
&& (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule
@ -170,7 +170,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (reoClassData->firstNonTerminalPreviousSourceSpanIsAligned &&
reoClassData->firstNonTerminalFollowingSourceSpanIsAligned) {
// discontinuous
l2rOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
l2rOrientation = MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
} else {
reoClassData->firstNonTerminalIsBoundary = true;
}
@ -180,7 +180,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
// RIGHT-TO-LEFT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_R2L);
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::Syntax::GHKM::PhraseOrientation::REO_DIR_R2L);
if ( ((targetIndex == targetPhrase.GetSize()-1) || !phraseOrientation.TargetSpanIsAligned(targetIndex,targetPhrase.GetSize()-1)) // boundary non-terminal in rule-final position (right boundary)
&& (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule
@ -200,7 +200,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source,
if (reoClassData->lastNonTerminalPreviousSourceSpanIsAligned &&
reoClassData->lastNonTerminalFollowingSourceSpanIsAligned) {
// discontinuous
r2lOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
r2lOrientation = MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT;
} else {
reoClassData->lastNonTerminalIsBoundary = true;
}
@ -335,25 +335,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
// LEFT-TO-RIGHT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT];
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT];
IFFEATUREVERBOSE(2) {
FEATUREVERBOSE(2, "l2rOrientation ");
switch (l2rOrientation) {
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT:
FEATUREVERBOSE2(2, "mono" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
FEATUREVERBOSE2(2, "swap" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
FEATUREVERBOSE2(2, "dleft" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
FEATUREVERBOSE2(2, "dright" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_MSLR
FEATUREVERBOSE2(2, "unknown->dleft" << std::endl);
break;
default:
@ -396,23 +396,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
} else {
if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
newScores[0] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityMono());
// if sub-derivation has left-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x1, newScores, accumulator);
} else if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
newScores[1] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilitySwap());
// if sub-derivation has left-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x2, newScores, accumulator);
} else if ( ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
newScores[2] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous());
// if sub-derivation has left-boundary non-terminal:
@ -437,25 +437,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
// RIGHT-TO-LEFT DIRECTION
MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT];
MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT];
IFFEATUREVERBOSE(2) {
FEATUREVERBOSE(2, "r2lOrientation ");
switch (r2lOrientation) {
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT:
FEATUREVERBOSE2(2, "mono" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT:
FEATUREVERBOSE2(2, "swap" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT:
FEATUREVERBOSE2(2, "dleft" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT:
FEATUREVERBOSE2(2, "dright" << std::endl);
break;
case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR
case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN:
// modelType == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_MSLR
FEATUREVERBOSE2(2, "unknown->dleft" << std::endl);
break;
default:
@ -498,23 +498,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied(
} else {
if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
newScores[m_offsetR2LScores+0] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityMono());
// if sub-derivation has right-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
RightBoundaryR2LScoreRecursive(featureID, prevState, 0x1, newScores, accumulator);
} else if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
newScores[m_offsetR2LScores+1] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilitySwap());
// if sub-derivation has right-boundary non-terminal:
// add recursive actual score of boundary non-terminal from subderivation
RightBoundaryR2LScoreRecursive(featureID, prevState, 0x2, newScores, accumulator);
} else if ( ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
newScores[m_offsetR2LScores+2] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous());
// if sub-derivation has right-boundary non-terminal:
@ -862,17 +862,17 @@ void PhraseOrientationFeature::SparseNonTerminalR2LScore(const Factor* nonTermin
}
const std::string* PhraseOrientationFeature::ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const
const std::string* PhraseOrientationFeature::ToString(const MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS o) const
{
if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
if ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) {
return &MORIENT;
} else if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
} else if ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) {
return &SORIENT;
} else if ( ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
} else if ( ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) ||
( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) ||
( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) {
return &DORIENT;
} else {

View File

@ -302,8 +302,8 @@ public:
struct ReoClassData {
public:
std::vector<MosesTraining::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassL2R;
std::vector<MosesTraining::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassR2L;
std::vector<MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassL2R;
std::vector<MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS> nonTerminalReoClassR2L;
bool firstNonTerminalIsBoundary;
bool firstNonTerminalPreviousSourceSpanIsAligned;
bool firstNonTerminalFollowingSourceSpanIsAligned;
@ -401,7 +401,7 @@ protected:
ScoreComponentCollection* scoreBreakdown,
const std::string* o) const;
const std::string* ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const;
const std::string* ToString(const MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS o) const;
static const std::string MORIENT;
static const std::string SORIENT;

View File

@ -16,21 +16,29 @@ namespace Moses
PhrasePairFeature::PhrasePairFeature(const std::string &line)
:StatelessFeatureFunction(0, line)
,m_unrestricted(false)
,m_simple(true)
,m_sourceContext(false)
,m_domainTrigger(false)
,m_ignorePunctuation(false)
{
std::cerr << "Initializing PhrasePairFeature.." << std::endl;
VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
ReadParameters();
if (m_simple == 1) std::cerr << "using simple phrase pairs.. ";
if (m_sourceContext == 1) std::cerr << "using source context.. ";
if (m_domainTrigger == 1) std::cerr << "using domain triggers.. ";
if (m_simple == 1) VERBOSE(1, " Using simple phrase pairs.");
if (m_sourceContext == 1) VERBOSE(1, " Using source context.");
if (m_domainTrigger == 1) VERBOSE(1, " Using domain triggers.");
// compile a list of punctuation characters
if (m_ignorePunctuation) {
std::cerr << "ignoring punctuation for triggers.. ";
VERBOSE(1, " Ignoring punctuation for triggers.");
char punctuation[] = "\"'!?¿·()#_,.:;•&@/\\0123456789~=";
for (size_t i=0; i < sizeof(punctuation)-1; ++i)
for (size_t i=0; i < sizeof(punctuation)-1; ++i) {
m_punctuationHash[punctuation[i]] = 1;
}
}
VERBOSE(1, " Done." << std::endl);
}
void PhrasePairFeature::SetParameter(const std::string& key, const std::string& value)
@ -76,7 +84,7 @@ void PhrasePairFeature::Load()
}
inFileSource.close();
} else {
} else if (!m_unrestricted) {
// restricted source word vocabulary
ifstream inFileSource(m_filePathSource.c_str());
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);
@ -101,8 +109,6 @@ void PhrasePairFeature::Load()
}
inFileTarget.close();*/
m_unrestricted = false;
}
}
@ -114,25 +120,6 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
, ScoreComponentCollection *estimatedFutureScore) const
{
const Phrase& source = inputPath.GetPhrase();
if (m_simple) {
ostringstream namestr;
namestr << "pp_";
namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << ",";
namestr << sourceFactor->GetString();
}
namestr << "~";
namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
const Sentence& isnt = static_cast<const Sentence&>(input);
const bool use_topicid = isnt.GetUseTopicId();
@ -140,18 +127,18 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
// compute pair
ostringstream pair;
pair << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
pair << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
pair << ",";
pair << sourceFactor->GetString();
pair << "~";
pair << ReplaceTilde( sourceFactor->GetString() );
}
pair << "~";
pair << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
pair << "~~";
pair << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
pair << ",";
pair << targetFactor->GetString();
pair << "~";
pair << ReplaceTilde( targetFactor->GetString() );
}
if (use_topicid || use_topicid_prob) {
@ -159,7 +146,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
// use topicid as trigger
const long topicid = isnt.GetTopicId();
stringstream feature;
feature << "pp_";
feature << m_description << "_";
if (topicid == -1)
feature << "unk";
else
@ -173,13 +160,13 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
const vector<string> &topicid_prob = *(isnt.GetTopicIdAndProb());
if (atol(topicid_prob[0].c_str()) == -1) {
stringstream feature;
feature << "pp_unk_";
feature << m_description << "_unk_";
feature << pair.str();
scoreBreakdown.SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
feature << "pp_";
feature << m_description << "_";
feature << topicid_prob[i];
feature << "_";
feature << pair.str();
@ -193,7 +180,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
string sourceTrigger = *p;
ostringstream namestr;
namestr << "pp_";
namestr << m_description << "_";
namestr << sourceTrigger;
namestr << "_";
namestr << pair.str();
@ -221,21 +208,21 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
if (m_unrestricted || sourceTriggerExists) {
ostringstream namestr;
namestr << "pp_";
namestr << m_description << "_";
namestr << sourceTrigger;
namestr << "~";
namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
namestr << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << ",";
namestr << sourceFactor->GetString();
namestr << "~";
namestr << ReplaceTilde( sourceFactor->GetString() );
}
namestr << "~";
namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString();
namestr << "~~";
namestr << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
namestr << "~";
namestr << ReplaceTilde( targetFactor->GetString() );
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
@ -244,6 +231,31 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input
}
}
void PhrasePairFeature::EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{
if (m_simple) {
ostringstream namestr;
namestr << m_description << "_";
namestr << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() );
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << "~";
namestr << ReplaceTilde( sourceFactor->GetString() );
}
namestr << "~~";
namestr << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() );
for (size_t i = 1; i < targetPhrase.GetSize(); ++i) {
const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId);
namestr << "~";
namestr << ReplaceTilde( targetFactor->GetString() );
}
scoreBreakdown.SparsePlusEquals(namestr.str(),1);
}
}
bool PhrasePairFeature::IsUseable(const FactorMask &mask) const
{
bool ret = mask[m_targetFactorId];

View File

@ -1,5 +1,4 @@
#ifndef moses_PhrasePairFeature_h
#define moses_PhrasePairFeature_h
#pragma once
#include <stdexcept>
#include <boost/unordered_set.hpp>
@ -32,6 +31,16 @@ class PhrasePairFeature: public StatelessFeatureFunction
CharHash m_punctuationHash;
std::string m_filePathSource;
inline std::string ReplaceTilde(const StringPiece &str) const {
std::string out = str.as_string();
size_t pos = out.find('~');
while ( pos != std::string::npos ) {
out.replace(pos,1,"<TILDE>");
pos = out.find('~',pos);
}
return out;
};
public:
PhrasePairFeature(const std::string &line);
@ -43,8 +52,7 @@ public:
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const {
}
, ScoreComponentCollection &estimatedFutureScore) const;
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {
@ -69,5 +77,3 @@ public:
}
#endif

View File

@ -12,7 +12,7 @@ namespace Moses
{
RulePairUnlexicalizedSource::RulePairUnlexicalizedSource(const std::string &line)
: StatelessFeatureFunction(0, line)
: StatelessFeatureFunction(1, line)
, m_glueRules(false)
, m_nonGlueRules(true)
, m_glueTargetLHSStr("Q")
@ -81,6 +81,9 @@ void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source
}
scoreBreakdown.PlusEquals(this, namestr.str(), 1);
if ( targetPhraseLHS != m_glueTargetLHS ) {
scoreBreakdown.PlusEquals(this, 1);
}
}
}

View File

@ -34,7 +34,7 @@ public:
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {
vector<float> newScores(m_numScoreComponents);
std::vector<float> newScores(m_numScoreComponents);
newScores[0] = translationOptionList.size();
TranslationOptionList::const_iterator iterTransOpt;

View File

@ -13,6 +13,7 @@ namespace Moses
SoftMatchingFeature::SoftMatchingFeature(const std::string &line)
: StatelessFeatureFunction(0, line)
, m_softMatches(moses_MaxNumNonterminals)
, m_scoreIdentical(true)
{
ReadParameters();
}
@ -26,6 +27,8 @@ void SoftMatchingFeature::SetParameter(const std::string& key, const std::string
} else if (key == "path") {
const std::string filePath = value;
Load(filePath);
} else if (key == "score-identical") {
m_scoreIdentical = Scan<bool>(value);
} else {
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
}
@ -80,8 +83,10 @@ void SoftMatchingFeature::EvaluateWhenApplied(const ChartHypothesis& hypo,
const ChartHypothesis* prevHypo = hypo.GetPrevHypo(nonTermInd);
const Word& prevLHS = prevHypo->GetTargetLHS();
const std::string &name = GetOrSetFeatureName(word, prevLHS);
accumulator->PlusEquals(this,name,1);
if ( (word != prevLHS) || m_scoreIdentical ) {
const std::string &name = GetOrSetFeatureName(word, prevLHS);
accumulator->PlusEquals(this,name,1);
}
}
}
}

View File

@ -55,6 +55,7 @@ public:
private:
mutable std::vector<std::vector<Word> > m_softMatches; // map RHS of new rule to list of possible LHS of old rule (subtree)
mutable std::vector<std::vector<std::string> > m_nameCache;
bool m_scoreIdentical;
#ifdef WITH_THREADS
//reader-writer lock

View File

@ -38,9 +38,8 @@ void SourceWordDeletionFeature::SetParameter(const std::string& key, const std::
void SourceWordDeletionFeature::Load()
{
if (m_filename == "") {
if (m_filename.empty())
return;
}
FEATUREVERBOSE(1, "Loading source word deletion word list from " << m_filename << std::endl);
ifstream inFile(m_filename.c_str());

Some files were not shown because too many files have changed in this diff Show More