mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-27 11:59:52 +03:00
Binary hiero reordering feature. Implementation of one described in NIST 2012. The value is 1 if a non-terminal is reordered with respect to other words or non-terminals, 0 otherwise.
This commit is contained in:
parent
1931bfe959
commit
33c03edfbb
@ -58,6 +58,7 @@ bool unalignedFlag = false;
|
||||
bool unalignedFWFlag = false;
|
||||
bool outputNTLengths = false;
|
||||
bool singletonFeature = false;
|
||||
bool crossedNonTerm = false;
|
||||
int countOfCounts[COC_MAX+1];
|
||||
int totalDistinct = 0;
|
||||
float minCountHierarchical = 0;
|
||||
@ -71,13 +72,13 @@ vector<string> tokenize( const char [] );
|
||||
|
||||
void writeCountOfCounts( const string &fileNameCountOfCounts );
|
||||
void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton);
|
||||
PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
|
||||
const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
|
||||
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton );
|
||||
double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
|
||||
double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
|
||||
double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
|
||||
double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
|
||||
set<string> functionWordList;
|
||||
void loadFunctionWords( const string &fileNameFunctionWords );
|
||||
double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
|
||||
double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
|
||||
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
|
||||
, map<size_t, map<size_t, float> > &sourceProb
|
||||
, map<size_t, map<size_t, float> > &targetProb);
|
||||
@ -90,7 +91,7 @@ int main(int argc, char* argv[])
|
||||
<< "scoring methods for extracted rules\n";
|
||||
|
||||
if (argc < 4) {
|
||||
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] \n";
|
||||
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
|
||||
exit(1);
|
||||
}
|
||||
string fileNameExtract = argv[1];
|
||||
@ -156,6 +157,9 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i],"--Singleton") == 0) {
|
||||
singletonFeature = true;
|
||||
cerr << "binary singleton feature\n";
|
||||
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
|
||||
crossedNonTerm = true;
|
||||
cerr << "crossed non-term reordering feature\n";
|
||||
} else {
|
||||
cerr << "ERROR: unknown option " << argv[i] << endl;
|
||||
exit(1);
|
||||
@ -243,12 +247,12 @@ int main(int argc, char* argv[])
|
||||
processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
|
||||
|
||||
phrasePairsWithSameF.clear();
|
||||
isSingleton = true;
|
||||
isSingleton = false;
|
||||
lastPhrasePair = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
isSingleton = false;
|
||||
isSingleton = true;
|
||||
}
|
||||
|
||||
// add phrase pairs to list, it's now the last one
|
||||
@ -336,7 +340,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
|
||||
|
||||
}
|
||||
|
||||
PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
|
||||
const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
|
||||
{
|
||||
float bestAlignmentCount = -1;
|
||||
PhraseAlignment* bestAlignment;
|
||||
@ -357,7 +361,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
|
||||
}
|
||||
}
|
||||
|
||||
return bestAlignment;
|
||||
return *bestAlignment;
|
||||
}
|
||||
|
||||
|
||||
@ -448,11 +452,73 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,
|
||||
|
||||
}
|
||||
|
||||
// Tell whether the alignment point (sourcePos, targetPos) is crossed by an
// alignment point that belongs to a different source position.
//
// alignedToS[s] is the set of target positions aligned to source position s.
// A point (s, t) crosses (sourcePos, targetPos) when the two points are in
// opposite order on the two sides:
//   - s is before sourcePos and t is after targetPos, or
//   - s is after sourcePos and t is before targetPos.
// Points at sourcePos itself, and points whose target equals targetPos, never
// count as crossing.
//
// Returns true if at least one crossing point exists, false otherwise
// (including when alignedToS is empty or all other positions are unaligned).
bool calcCrossedNonTerm(int sourcePos, int targetPos, const std::vector< std::set<size_t> > &alignedToS)
{
  // Do every comparison in int, the type the caller uses for positions.
  // Comparing int against size_t directly would convert a negative position
  // into a huge unsigned value and flip the result.
  const int numSource = static_cast<int>(alignedToS.size());

  for (int currSource = 0; currSource < numSource; ++currSource) {
    const std::set<size_t> &targetSet = alignedToS[currSource];

    // the position under test cannot cross itself; unaligned positions
    // contribute no alignment points
    if (currSource == sourcePos || targetSet.empty()) {
      continue;
    }

    // std::set iterates in ascending order, so only its extreme element can
    // decide the question; no need to walk the whole set.
    if (currSource < sourcePos) {
      const int largestTarget = static_cast<int>(*targetSet.rbegin());
      if (largestTarget > targetPos) {
        return true;
      }
    } else {
      const int smallestTarget = static_cast<int>(*targetSet.begin());
      if (smallestTarget < targetPos) {
        return true;
      }
    }
  }

  return false;
}
|
||||
|
||||
int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
|
||||
{
|
||||
const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
|
||||
|
||||
for (int sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos)
|
||||
{
|
||||
const std::set<size_t> &targetSet = alignedToS[sourcePos];
|
||||
cerr << "size=" << targetSet.size() << " ";
|
||||
std::set<size_t>::const_iterator iter;
|
||||
for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
|
||||
{
|
||||
size_t targetPos = *iter;
|
||||
cerr << sourcePos << "-" << targetPos << " ";
|
||||
}
|
||||
cerr << endl;
|
||||
|
||||
WORD_ID wordId = phraseS[sourcePos];
|
||||
const WORD &word = vcbS.getWord(wordId);
|
||||
bool isNonTerm = isNonTerminal(word);
|
||||
|
||||
if (isNonTerm)
|
||||
{
|
||||
assert(targetSet.size() == 1);
|
||||
int targetPos = *targetSet.begin();
|
||||
bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
|
||||
if (ret)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
|
||||
{
|
||||
if (phrasePair.size() == 0) return;
|
||||
|
||||
PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
|
||||
const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
|
||||
|
||||
// compute count
|
||||
float count = 0;
|
||||
@ -492,17 +558,17 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
|
||||
|
||||
// source phrase (unless inverse)
|
||||
if (! inverseFlag) {
|
||||
printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
|
||||
printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
|
||||
phraseTableFile << " ||| ";
|
||||
}
|
||||
|
||||
// target phrase
|
||||
printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
|
||||
printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
|
||||
phraseTableFile << " ||| ";
|
||||
|
||||
// source phrase (if inverse)
|
||||
if (inverseFlag) {
|
||||
printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
|
||||
printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
|
||||
phraseTableFile << " ||| ";
|
||||
}
|
||||
|
||||
@ -525,7 +591,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
|
||||
}
|
||||
|
||||
if (singletonFeature) {
|
||||
phraseTableFile << " " << (isSingleton?1:0);
|
||||
phraseTableFile << " " << (isSingleton ? 1 : 0);
|
||||
}
|
||||
|
||||
if (crossedNonTerm && !inverseFlag) {
|
||||
phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
|
||||
}
|
||||
|
||||
// target-side PCFG score
|
||||
@ -539,26 +609,31 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
|
||||
if (! inverseFlag) {
|
||||
if (hierarchicalFlag) {
|
||||
// always output alignment if hiero style, but only for non-terms
|
||||
assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
|
||||
assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
|
||||
for(size_t j = 0; j < phraseT.size() - 1; j++) {
|
||||
if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
|
||||
if (bestAlignment->alignedToT[ j ].size() != 1) {
|
||||
if (bestAlignment.alignedToT[ j ].size() != 1) {
|
||||
cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
|
||||
phraseTableFile.flush();
|
||||
assert(bestAlignment->alignedToT[ j ].size() == 1);
|
||||
assert(bestAlignment.alignedToT[ j ].size() == 1);
|
||||
}
|
||||
int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
|
||||
int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
|
||||
phraseTableFile << sourcePos << "-" << j << " ";
|
||||
}
|
||||
else if (wordAlignmentFlag) {
|
||||
int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
|
||||
phraseTableFile << sourcePos << "-" << j << " ";
|
||||
const std::set<size_t> &sourceSet = bestAlignment.alignedToT[ j ];
|
||||
std::set<size_t>::const_iterator iter;
|
||||
for (iter = sourceSet.begin(); iter != sourceSet.end(); ++iter)
|
||||
{
|
||||
int sourcePos = *iter;
|
||||
phraseTableFile << sourcePos << "-" << j << " ";
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (wordAlignmentFlag) {
|
||||
// alignment info in pb model
|
||||
for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
|
||||
const set< size_t > &aligned = bestAlignment->alignedToT[j];
|
||||
for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
|
||||
const set< size_t > &aligned = bestAlignment.alignedToT[j];
|
||||
for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
|
||||
phraseTableFile << *p << "-" << j << " ";
|
||||
}
|
||||
@ -592,13 +667,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
|
||||
phraseTableFile << endl;
|
||||
}
|
||||
|
||||
double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
|
||||
double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
|
||||
{
|
||||
// unaligned word counter
|
||||
double unaligned = 1.0;
|
||||
// only checking target words - source words are caught when computing inverse
|
||||
for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
|
||||
const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
|
||||
for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
|
||||
const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
|
||||
if (srcIndices.empty()) {
|
||||
unaligned *= 2.718;
|
||||
}
|
||||
@ -606,13 +681,13 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, Ph
|
||||
return unaligned;
|
||||
}
|
||||
|
||||
double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
|
||||
double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
|
||||
{
|
||||
// unaligned word counter
|
||||
double unaligned = 1.0;
|
||||
// only checking target words - source words are caught when computing inverse
|
||||
for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
|
||||
const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
|
||||
for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
|
||||
const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
|
||||
if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
|
||||
unaligned *= 2.718;
|
||||
}
|
||||
@ -645,14 +720,14 @@ void loadFunctionWords( const string &fileName )
|
||||
inFile.close();
|
||||
}
|
||||
|
||||
double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
|
||||
double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
|
||||
{
|
||||
// lexical translation probability
|
||||
double lexScore = 1.0;
|
||||
int null = vcbS.getWordID("NULL");
|
||||
// all target words have to be explained
|
||||
for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
|
||||
const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
|
||||
for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
|
||||
const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
|
||||
if (srcIndices.empty()) {
|
||||
// explain unaligned word by NULL
|
||||
lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
|
||||
|
Loading…
Reference in New Issue
Block a user