Merge branch 'master' of git://github.com/moses-smt/mosesdecoder

phikoehn 2012-12-01 13:45:00 +00:00
commit 269883fedd
9 changed files with 168 additions and 183 deletions

View File

@ -5,7 +5,7 @@
#include <boost/thread/thread.hpp>
#endif
#include "moses/CompactPT/LexicalReorderingTableCreator.h"
#include "moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h"
using namespace Moses;

View File

@ -5,7 +5,7 @@
#endif
#include "moses/TypeDef.h"
#include "moses/CompactPT/PhraseTableCreator.h"
#include "moses/TranslationModel/CompactPT/PhraseTableCreator.h"
using namespace Moses;

View File

@ -6,7 +6,7 @@
#include <string>
#include <vector>
#include "moses/CompactPT/PhraseDictionaryCompact.h"
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#include "moses/Util.h"
#include "moses/Phrase.h"

View File

@ -52,14 +52,10 @@ void auxAppend(IPhrase& head, const IPhrase& tail)
LexicalReorderingTable* LexicalReorderingTable::LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors)
{
// decide whether to use the Compact, Tree, or Memory table
#ifdef HAVE_CMPH
if(FileExists(filePath + ".minlexr")) {
LexicalReorderingTable *compactLexr =
LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors);
if(compactLexr)
return compactLexr;
}
#endif
LexicalReorderingTable *compactLexr =
LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors);
if(compactLexr)
return compactLexr;
if(FileExists(filePath+".binlexr.idx")) {
//there exists a binary version use that
return new LexicalReorderingTableTree(filePath, f_factors, e_factors, c_factors);
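
The change above drops the HAVE_CMPH guard and the explicit FileExists check from LoadAvailable; deciding whether a usable compact table exists is now left to LexicalReorderingTableCompact::CheckAndLoad (see its own hunk further down). The probe order itself stays the same: compact first, then the tree (binary) table, and, judging from the surrounding code not shown in this hunk, a plain in-memory table as the last resort. A small standalone sketch of that probe order, with the loaders stubbed out and FileExists reimplemented locally (only the file suffixes are taken from the diff):

#include <fstream>
#include <iostream>
#include <string>

// Sketch of the probe order in LoadAvailable: compact (".minlexr"), then
// tree (".binlexr.idx"), then a plain table. The real loaders are stubbed
// out; in Moses the compact probe is CheckAndLoad itself.
static bool FileExists(const std::string &path) {
  std::ifstream in(path.c_str());
  return in.good();
}

int main(int argc, char **argv) {
  std::string filePath = (argc > 1) ? argv[1] : "reordering-table";
  if (FileExists(filePath + ".minlexr"))
    std::cout << "compact table" << std::endl;     // LexicalReorderingTableCompact
  else if (FileExists(filePath + ".binlexr.idx"))
    std::cout << "tree table" << std::endl;        // LexicalReorderingTableTree
  else
    std::cout << "in-memory table" << std::endl;   // plain-text fallback (assumed)
  return 0;
}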

View File

@ -1663,7 +1663,7 @@ bool StaticData::LoadPhraseBoundaryFeature()
{
const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-pb"));
if (weight.size() > 1) {
std::cerr << "only one sparse producer weight allowed for the phrase boundary feature" << std::endl;
std::cerr << "Only one sparse producer weight allowed for the phrase boundary feature" << std::endl;
return false;
}

View File

@ -141,7 +141,7 @@ LexicalReorderingTable* LexicalReorderingTableCompact::CheckAndLoad(
}
// file name is specified with suffix
if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr
|| FileExists(filePath))
&& FileExists(filePath))
{
//there exists a compact binary version use that
VERBOSE(2,"Using compact lexical reordering table" << std::endl);
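
The one-character change above turns the guard from an OR into an AND: previously any existing file passed the check even if it did not carry the ".minlexr" suffix, so a non-compact table could be routed to the compact loader. A tiny standalone restatement of the corrected check (function and variable names here are mine; only the suffix handling mirrors the diff):

#include <fstream>
#include <iostream>
#include <string>

static bool FileExists(const std::string &path) {
  std::ifstream in(path.c_str());
  return in.good();
}

// The compact loader should be used only when the name ends in ".minlexr"
// AND that file is actually present (the old code used ||, which also
// matched any existing file without the suffix).
static bool UseCompact(const std::string &filePath) {
  const std::string minlexr = ".minlexr";
  bool hasSuffix = filePath.length() >= minlexr.length()
      && filePath.compare(filePath.length() - minlexr.length(), minlexr.length(), minlexr) == 0;
  return hasSuffix && FileExists(filePath);
}

int main() {
  std::cout << UseCompact("model/reordering-table.minlexr") << std::endl;
  return 0;
}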

View File

@ -49,35 +49,13 @@ namespace tmmt
cerr << "loading completed" << endl;
}
FuzzyMatchWrapper::WordIndex &FuzzyMatchWrapper::GetWordIndex(long translationId)
{
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
std::map<long, WordIndex>::iterator iter = m_wordIndex.find(translationId);
assert(iter != m_wordIndex.end());
return iter->second;
}
void FuzzyMatchWrapper::AddWordIndex(long translationId)
{
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
WordIndex &ret = m_wordIndex[translationId];
}
void FuzzyMatchWrapper::DeleteWordIndex(long translationId)
{
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
size_t ret = m_wordIndex.erase(translationId);
CHECK(ret == 1);
}
string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
{
const Moses::StaticData &staticData = Moses::StaticData::Instance();
AddWordIndex(translationId);
string fuzzyMatchFile = ExtractTM(translationId, dirNameStr);
WordIndex wordIndex;
string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
// create extract files
create_xml(fuzzyMatchFile);
@ -104,12 +82,11 @@ namespace tmmt
+ " -phrase-translation-table " + fuzzyMatchFile + ".pt";
system(cmd.c_str());
DeleteWordIndex(translationId);
return fuzzyMatchFile + ".pt.gz";
}
string FuzzyMatchWrapper::ExtractTM(long translationId, const string &dirNameStr)
string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
{
const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();
@ -277,7 +254,7 @@ namespace tmmt
int pruned_match_count = 0;
if (short_match_max_length( input_length ))
{
init_short_matches(translationId, input[sentenceInd] );
init_short_matches(wordIndex, translationId, input[sentenceInd] );
}
vector< int > best_tm;
typedef map< int, vector< Match > >::iterator I;
@ -289,7 +266,7 @@ namespace tmmt
int tmID = tm->first;
int tm_length = suffixArray->GetSentenceLength(tmID);
vector< Match > &match = tm->second;
add_short_matches( translationId, match, source[tmID], input_length, best_cost );
add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );
//cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
@ -573,16 +550,34 @@ namespace tmmt
}
}
bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
{
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
if (lookup != m_lsed.end()) {
value = lookup->second;
return true;
}
return false;
}
void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
{
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
m_lsed[ key ] = value;
}
/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
// check if already computed -> lookup in cache
pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx );
if (lookup != lsed.end())
{
return (lookup->second);
unsigned int value;
bool ret = GetLSEDCache(pIdx, value);
if (ret) {
return value;
}
// get surface strings for word indices
@ -623,129 +618,129 @@ unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
free( cost );
// cache and return result
lsed[ pIdx ] = final;
SetLSEDCache(pIdx, final);
return final;
}
/* string edit distance implementation */
unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {
// initialize cost and path matrices
unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
for( unsigned int i=0; i<=a.size(); i++ ) {
cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
path[i] = (char*) calloc( sizeof(char), b.size()+1 );
if (i>0)
{
cost[i][0] = cost[i-1][0];
if (use_letter_sed)
{
cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
}
else
{
cost[i][0]++;
}
}
else
{
cost[i][0] = 0;
}
path[i][0] = 'I';
}
for( unsigned int j=0; j<=b.size(); j++ ) {
if (j>0)
{
cost[0][j] = cost[0][j-1];
if (use_letter_sed)
{
cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
}
else
{
cost[0][j]++;
}
}
else
{
cost[0][j] = 0;
}
path[0][j] = 'D';
}
// core string edit distance algorithm
for( unsigned int i=1; i<=a.size(); i++ ) {
for( unsigned int j=1; j<=b.size(); j++ ) {
unsigned int ins = cost[i-1][j];
unsigned int del = cost[i][j-1];
unsigned int match;
if (use_letter_sed)
{
ins += GetVocabulary().GetWord( a[i-1] ).size();
del += GetVocabulary().GetWord( b[j-1] ).size();
match = letter_sed( a[i-1], b[j-1] );
}
else
{
ins++;
del++;
match = ( a[i-1] == b[j-1] ) ? 0 : 1;
}
unsigned int diag = cost[i-1][j-1] + match;
char action = (ins < del) ? 'I' : 'D';
unsigned int min = (ins < del) ? ins : del;
if (diag < min)
{
action = (match>0) ? 'S' : 'M';
min = diag;
}
cost[i][j] = min;
path[i][j] = action;
}
}
// construct string for best path
unsigned int i = a.size();
unsigned int j = b.size();
best_path = "";
while( i>0 || j>0 )
{
best_path = path[i][j] + best_path;
if (path[i][j] == 'I')
{
i--;
}
else if (path[i][j] == 'D')
{
j--;
}
else
{
i--;
j--;
}
}
// clear out memory
unsigned int final = cost[a.size()][b.size()];
for( unsigned int i=0; i<=a.size(); i++ ) {
free( cost[i] );
free( path[i] );
}
free( cost );
free( path );
// return result
return final;
}
/* utility function: compute length of sentence in characters
(spaces do not count) */
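
sed() above records its backtrace as a string over the per-cell actions 'I' and 'D' (consume a word from only one of the two sentences), 'S' (substitution) and 'M' (match). A short standalone word-level reimplementation of the same scheme (simplified: std::vector storage instead of calloc, no letter-level costs, and all names below are illustrative rather than taken from the repository) shows the encoding for a one-word substitution:

#include <iostream>
#include <string>
#include <vector>

// Standalone word-level edit distance with the same I/D/S/M path encoding.
unsigned int word_sed(const std::vector<std::string> &a, const std::vector<std::string> &b, std::string &best_path) {
  std::vector< std::vector<unsigned int> > cost(a.size()+1, std::vector<unsigned int>(b.size()+1, 0));
  std::vector< std::vector<char> > path(a.size()+1, std::vector<char>(b.size()+1, 'M'));
  for (unsigned int i = 0; i <= a.size(); i++) { cost[i][0] = i; path[i][0] = 'I'; }
  for (unsigned int j = 0; j <= b.size(); j++) { cost[0][j] = j; path[0][j] = 'D'; }
  for (unsigned int i = 1; i <= a.size(); i++) {
    for (unsigned int j = 1; j <= b.size(); j++) {
      unsigned int ins = cost[i-1][j] + 1;
      unsigned int del = cost[i][j-1] + 1;
      unsigned int match = (a[i-1] == b[j-1]) ? 0 : 1;
      unsigned int diag = cost[i-1][j-1] + match;
      char action = (ins < del) ? 'I' : 'D';
      unsigned int best = (ins < del) ? ins : del;
      if (diag < best) { action = (match > 0) ? 'S' : 'M'; best = diag; }
      cost[i][j] = best;
      path[i][j] = action;
    }
  }
  // walk back from the bottom-right corner to build the action string
  best_path = "";
  unsigned int i = a.size(), j = b.size();
  while (i > 0 || j > 0) {
    best_path = path[i][j] + best_path;
    if (path[i][j] == 'I') i--;
    else if (path[i][j] == 'D') j--;
    else { i--; j--; }
  }
  return cost[a.size()][b.size()];
}

int main() {
  std::vector<std::string> a, b;
  a.push_back("the"); a.push_back("cat"); a.push_back("sat");
  b.push_back("the"); b.push_back("dog"); b.push_back("sat");
  std::string best_path;
  unsigned int d = word_sed(a, b, best_path);
  std::cout << d << " " << best_path << std::endl;   // prints: 1 MSM
  return 0;
}
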
@ -838,13 +833,12 @@ int FuzzyMatchWrapper::short_match_max_length( int input_length )
(to be used by the next function)
(done here, because this has to be done only once for an input sentence) */
void FuzzyMatchWrapper::init_short_matches(long translationId, const vector< WORD_ID > &input )
void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
{
int max_length = short_match_max_length( input.size() );
if (max_length == 0)
return;
WordIndex &wordIndex = GetWordIndex(translationId);
wordIndex.clear();
// store input words and their positions in hash map
@ -861,14 +855,12 @@ void FuzzyMatchWrapper::init_short_matches(long translationId, const vector< WORD_ID > &input )
/* add all short matches to list of matches for a sentence */
void FuzzyMatchWrapper::add_short_matches(long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
int max_length = short_match_max_length( input_length );
if (max_length == 0)
return;
WordIndex &wordIndex = GetWordIndex(translationId);
int tm_length = tm.size();
map< WORD_ID,vector< int > >::iterator input_word_hit;
for(int t_pos=0; t_pos<tm.size(); t_pos++)
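
Taken together, the changes in this file remove the per-translation WordIndex bookkeeping (the m_wordIndex map plus Add/Get/DeleteWordIndex and their locking): each call to Extract now builds a local WordIndex and passes it by reference through ExtractTM, init_short_matches and add_short_matches. The only shared state left, the letter-edit-distance cache, is reached through GetLSEDCache/SetLSEDCache under the boost::shared_mutex m_accessLock, with a shared lock for readers and a unique lock for the single writer. A minimal self-contained sketch of that reader-writer cache pattern (the class and member names below are illustrative, not the Moses ones):

#include <map>
#include <utility>
#include <boost/thread/shared_mutex.hpp>
#include <boost/thread/locks.hpp>

// Minimal reader-writer cache in the style of GetLSEDCache/SetLSEDCache.
class PairCache {
  std::map< std::pair<unsigned int, unsigned int>, unsigned int > m_map;
  mutable boost::shared_mutex m_lock;
public:
  bool Get(const std::pair<unsigned int, unsigned int> &key, unsigned int &value) const {
    boost::shared_lock<boost::shared_mutex> read_lock(m_lock);   // many readers may hold this at once
    std::map< std::pair<unsigned int, unsigned int>, unsigned int >::const_iterator it = m_map.find(key);
    if (it == m_map.end()) return false;
    value = it->second;
    return true;
  }
  void Set(const std::pair<unsigned int, unsigned int> &key, unsigned int value) {
    boost::unique_lock<boost::shared_mutex> write_lock(m_lock);  // exclusive while inserting
    m_map[key] = value;
  }
};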

View File

@ -12,6 +12,7 @@
#ifdef WITH_THREADS
#include <boost/thread/shared_mutex.hpp>
#endif
#include <fstream>
#include <string>
#include "SuffixArray.h"
@ -46,16 +47,14 @@ protected:
int multiple_max;
typedef std::map< WORD_ID,std::vector< int > > WordIndex;
std::map<long, WordIndex> m_wordIndex;
//WordIndex m_wordIndex;
// global cache for word pairs
std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed;
#ifdef WITH_THREADS
//reader-writer lock
mutable boost::shared_mutex m_accessLock;
#endif
// global cache for word pairs
std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > lsed;
void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
@ -69,21 +68,21 @@ protected:
unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
void init_short_matches(long translationId, const std::vector< WORD_ID > &input );
void init_short_matches(WordIndex &wordIndex, long translationId, const std::vector< WORD_ID > &input );
int short_match_max_length( int input_length );
void add_short_matches(long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
void add_short_matches(WordIndex &wordIndex, long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
std::vector< Match > prune_matches( const std::vector< Match > &match, int best_cost );
int parse_matches( std::vector< Match > &match, int input_length, int tm_length, int &best_cost );
void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile);
std::string ExtractTM(long translationId, const std::string &inputPath);
std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
Vocabulary &GetVocabulary()
{ return suffixArray->GetVocabulary(); }
WordIndex &GetWordIndex(long translationId);
void AddWordIndex(long translationId);
void DeleteWordIndex(long translationId);
bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
};
}

View File

@ -11,8 +11,6 @@ import optparse
import random
import sys
import numpy
from defaultconfig import Config
logging.basicConfig(format = "%(asctime)-15s %(message)s")