mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
Merge branch 'GHKMStruct' of github.com:moses-smt/mosesdecoder into GHKMStruct
This commit is contained in:
commit
df86f0e78b
@ -1,44 +0,0 @@
|
||||
//
|
||||
// ExtractedRule.cpp
|
||||
// extract
|
||||
//
|
||||
// Created by Hieu Hoang on 13/09/2011.
|
||||
// Copyright 2011 __MyCompanyName__. All rights reserved.
|
||||
//
|
||||
|
||||
#include "ExtractedRule.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
void ExtractedRule::OutputNTLengths(std::ostream &out) const
|
||||
{
|
||||
ostringstream outString;
|
||||
OutputNTLengths(outString);
|
||||
out << outString;
|
||||
}
|
||||
|
||||
void ExtractedRule::OutputNTLengths(std::ostringstream &outString) const
|
||||
{
|
||||
std::map<size_t, std::pair<size_t, size_t> >::const_iterator iter;
|
||||
for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter) {
|
||||
size_t sourcePos = iter->first;
|
||||
const std::pair<size_t, size_t> &spanLengths = iter->second;
|
||||
outString << sourcePos << "=" << spanLengths.first << "," <<spanLengths.second << " ";
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const ExtractedRule &obj)
|
||||
{
|
||||
out << obj.source << " ||| " << obj.target << " ||| "
|
||||
<< obj.alignment << " ||| "
|
||||
<< obj.alignmentInv << " ||| ";
|
||||
|
||||
obj.OutputNTLengths(out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
} // namespace
|
@ -32,8 +32,6 @@ namespace MosesTraining
|
||||
// sentence-level collection of rules
|
||||
class ExtractedRule
|
||||
{
|
||||
friend std::ostream& operator<<(std::ostream &, const ExtractedRule &);
|
||||
|
||||
public:
|
||||
std::string source;
|
||||
std::string target;
|
||||
@ -54,8 +52,6 @@ public:
|
||||
float count;
|
||||
double pcfgScore;
|
||||
|
||||
std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
|
||||
|
||||
ExtractedRule(int sT, int eT, int sS, int eS)
|
||||
: source()
|
||||
, target()
|
||||
@ -76,13 +72,6 @@ public:
|
||||
, count(0)
|
||||
, pcfgScore(0.0) {
|
||||
}
|
||||
|
||||
// Stores (sourceLength, targetLength) in m_ntLengths under the key sourcePos,
// overwriting any pair previously stored for that position.
void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) {
  std::pair<size_t, size_t> &spanLengths = m_ntLengths[sourcePos];
  spanLengths.first = sourceLength;
  spanLengths.second = targetLength;
}
|
||||
|
||||
void OutputNTLengths(std::ostream &out) const;
|
||||
void OutputNTLengths(std::ostringstream &out) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -49,7 +49,6 @@ public:
|
||||
bool duplicateRules;
|
||||
bool fractionalCounting;
|
||||
bool pcfgScore;
|
||||
bool outputNTLengths;
|
||||
bool gzOutput;
|
||||
bool unpairedExtractFormat;
|
||||
bool conditionOnTargetLhs;
|
||||
@ -83,7 +82,6 @@ public:
|
||||
, duplicateRules(true)
|
||||
, fractionalCounting(true)
|
||||
, pcfgScore(false)
|
||||
, outputNTLengths(false)
|
||||
, gzOutput(false)
|
||||
, unpairedExtractFormat(false)
|
||||
, conditionOnTargetLhs(false)
|
||||
|
@ -41,7 +41,6 @@ bool lowCountFlag = false;
|
||||
bool goodTuringFlag = false;
|
||||
bool kneserNeyFlag = false;
|
||||
bool logProbFlag = false;
|
||||
bool outputNTLengths = false;
|
||||
inline float maybeLogProb( float a )
|
||||
{
|
||||
return logProbFlag ? log(a) : a;
|
||||
@ -62,7 +61,7 @@ int main(int argc, char* argv[])
|
||||
<< "consolidating direct and indirect rule tables\n";
|
||||
|
||||
if (argc < 4) {
|
||||
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--OutputNTLengths] \n";
|
||||
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] \n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameDirect = argv[1];
|
||||
@ -119,8 +118,6 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i],"--LogProb") == 0) {
|
||||
logProbFlag = true;
|
||||
cerr << "using log-probabilities\n";
|
||||
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
|
||||
outputNTLengths = true;
|
||||
} else {
|
||||
cerr << "ERROR: unknown option " << argv[i] << endl;
|
||||
exit(1);
|
||||
@ -315,10 +312,6 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
// counts, for debugging
|
||||
fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
|
||||
|
||||
if (outputNTLengths) {
|
||||
fileConsolidated << " ||| " << itemDirect[5];
|
||||
}
|
||||
|
||||
// count bin feature (as a sparse feature)
|
||||
if (sparseCountBinFeatureFlag ||
|
||||
directSparseScores.compare("") != 0 ||
|
||||
|
@ -129,7 +129,6 @@ int main(int argc, char* argv[])
|
||||
<< " --GlueGrammar FILE"
|
||||
<< " | --UnknownWordLabel FILE"
|
||||
<< " | --OnlyDirect"
|
||||
<< " | --OutputNTLengths"
|
||||
<< " | --MaxSpan[" << options.maxSpan << "]"
|
||||
<< " | --MinHoleTarget[" << options.minHoleTarget << "]"
|
||||
<< " | --MinHoleSource[" << options.minHoleSource << "]"
|
||||
@ -262,8 +261,6 @@ int main(int argc, char* argv[])
|
||||
options.fractionalCounting = false;
|
||||
} else if (strcmp(argv[i],"--PCFG") == 0) {
|
||||
options.pcfgScore = true;
|
||||
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
|
||||
options.outputNTLengths = true;
|
||||
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
|
||||
options.unpairedExtractFormat = true;
|
||||
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
|
||||
@ -663,9 +660,6 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
|
||||
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
|
||||
if (!m_options.onlyDirectFlag)
|
||||
rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
|
||||
|
||||
rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ;
|
||||
|
||||
}
|
||||
|
||||
rule.alignment.erase(rule.alignment.size()-1);
|
||||
@ -1077,9 +1071,6 @@ void ExtractTask::writeRulesToFile()
|
||||
<< rule->target << " ||| "
|
||||
<< rule->alignment << " ||| "
|
||||
<< rule->count << " ||| ";
|
||||
if (m_options.outputNTLengths) {
|
||||
rule->OutputNTLengths(out);
|
||||
}
|
||||
if (m_options.pcfgScore) {
|
||||
out << " ||| " << rule->pcfgScore;
|
||||
}
|
||||
|
@ -61,7 +61,6 @@ int negLogProb = 1;
|
||||
bool lexFlag = true;
|
||||
bool unalignedFlag = false;
|
||||
bool unalignedFWFlag = false;
|
||||
bool outputNTLengths = false;
|
||||
bool singletonFeature = false;
|
||||
bool crossedNonTerm = false;
|
||||
int countOfCounts[COC_MAX+1];
|
||||
@ -85,9 +84,6 @@ double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlig
|
||||
set<string> functionWordList;
|
||||
void loadFunctionWords( const string &fileNameFunctionWords );
|
||||
double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
|
||||
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
|
||||
, map<size_t, map<size_t, float> > &sourceProb
|
||||
, map<size_t, map<size_t, float> > &targetProb);
|
||||
void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
|
||||
void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
|
||||
|
||||
@ -98,7 +94,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
ScoreFeatureManager featureManager;
|
||||
if (argc < 4) {
|
||||
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
|
||||
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
|
||||
cerr << featureManager.usage() << endl;
|
||||
exit(1);
|
||||
}
|
||||
@ -164,8 +160,6 @@ int main(int argc, char* argv[])
|
||||
minCountHierarchical = atof(argv[++i]);
|
||||
cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
|
||||
minCountHierarchical -= 0.00001; // account for rounding
|
||||
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
|
||||
outputNTLengths = true;
|
||||
} else if (strcmp(argv[i],"--Singleton") == 0) {
|
||||
singletonFeature = true;
|
||||
cerr << "binary singleton feature\n";
|
||||
@ -405,86 +399,6 @@ const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phraseP
|
||||
}
|
||||
|
||||
|
||||
// Converts per-position length counts into probabilities.
//
// lengths: position in source phrase -> (length -> count)
// total:   denominator applied to every count
// probs:   output; probs[position][length] is set to count / total, computed
//          in float. Entries under keys not present in `lengths` are left
//          untouched, and a position with no counts creates no entry.
void calcNTLengthProb(const std::map<size_t, std::map<size_t, size_t> > &lengths
                      , size_t total
                      , std::map<size_t, std::map<size_t, float> > &probs)
{
  typedef std::map<size_t, size_t> CountsByLength;
  typedef std::map<size_t, CountsByLength> CountsByPosition;

  const float denominator = static_cast<float>(total);

  for (CountsByPosition::const_iterator posIt = lengths.begin();
       posIt != lengths.end(); ++posIt) {
    const CountsByLength &counts = posIt->second;

    for (CountsByLength::const_iterator lenIt = counts.begin();
         lenIt != counts.end(); ++lenIt) {
      probs[posIt->first][lenIt->first] =
        static_cast<float>(lenIt->second) / denominator;
    }
  }
}
|
||||
|
||||
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
|
||||
, map<size_t, map<size_t, float> > &sourceProb
|
||||
, map<size_t, map<size_t, float> > &targetProb)
|
||||
{
|
||||
map<size_t, map<size_t, size_t> > sourceLengths, targetLengths;
|
||||
// 1st = position in source phrase, 2nd = length, 3rd = count
|
||||
map<size_t, size_t> totals;
|
||||
// 1st = position in source phrase, 2nd = total counts
|
||||
// each source pos should have same count?
|
||||
|
||||
vector< PhraseAlignment* >::const_iterator iterOuter;
|
||||
for (iterOuter = phrasePairs.begin(); iterOuter != phrasePairs.end(); ++iterOuter) {
|
||||
const PhraseAlignment &phrasePair = **iterOuter;
|
||||
const std::map<size_t, std::pair<size_t, size_t> > &ntLengths = phrasePair.GetNTLengths();
|
||||
|
||||
std::map<size_t, std::pair<size_t, size_t> >::const_iterator iterInner;
|
||||
for (iterInner = ntLengths.begin(); iterInner != ntLengths.end(); ++iterInner) {
|
||||
size_t sourcePos = iterInner->first;
|
||||
size_t sourceLength = iterInner->second.first;
|
||||
size_t targetLength = iterInner->second.second;
|
||||
|
||||
sourceLengths[sourcePos][sourceLength]++;
|
||||
targetLengths[sourcePos][targetLength]++;
|
||||
|
||||
totals[sourcePos]++;
|
||||
}
|
||||
}
|
||||
|
||||
if (totals.size() == 0) {
|
||||
// no non-term. Don't bother
|
||||
return;
|
||||
}
|
||||
|
||||
size_t total = totals.begin()->second;
|
||||
if (totals.size() > 1) {
|
||||
assert(total == (++totals.begin())->second );
|
||||
}
|
||||
|
||||
calcNTLengthProb(sourceLengths, total, sourceProb);
|
||||
calcNTLengthProb(targetLengths, total, targetProb);
|
||||
|
||||
}
|
||||
|
||||
// Writes every probability in `probs` to phraseTableFile as
// "<sourcePos>|<prefix>|<length>=<prob> " (each entry ends with a space),
// visiting source positions and then lengths in the maps' iteration order.
void outputNTLengthProbs(std::ostream &phraseTableFile, const std::map<size_t, std::map<size_t, float> > &probs, const std::string &prefix)
{
  typedef std::map<size_t, float> ProbsByLength;
  typedef std::map<size_t, ProbsByLength> ProbsByPosition;

  for (ProbsByPosition::const_iterator posIt = probs.begin();
       posIt != probs.end(); ++posIt) {
    const ProbsByLength &byLength = posIt->second;

    ProbsByLength::const_iterator lenIt = byLength.begin();
    while (lenIt != byLength.end()) {
      phraseTableFile << posIt->first << "|" << prefix << "|"
                      << lenIt->first << "=" << lenIt->second << " ";
      ++lenIt;
    }
  }
}
|
||||
|
||||
bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set<size_t> > &alignedToS)
|
||||
{
|
||||
for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) {
|
||||
@ -694,21 +608,6 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
|
||||
if (kneserNeyFlag)
|
||||
phraseTableFile << " " << distinctCount;
|
||||
|
||||
// nt lengths
|
||||
if (outputNTLengths) {
|
||||
phraseTableFile << " ||| ";
|
||||
|
||||
if (!inverseFlag) {
|
||||
map<size_t, map<size_t, float> > sourceProb, targetProb;
|
||||
// 1st sourcePos, 2nd = length, 3rd = prob
|
||||
|
||||
calcNTLengthProb(phrasePair, sourceProb, targetProb);
|
||||
|
||||
outputNTLengthProbs(phraseTableFile, sourceProb, "S");
|
||||
outputNTLengthProbs(phraseTableFile, targetProb, "T");
|
||||
}
|
||||
}
|
||||
|
||||
//MARIA
|
||||
//sparse features
|
||||
phraseTableFile << " ||| ";
|
||||
@ -727,7 +626,6 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
|
||||
}
|
||||
|
||||
|
||||
|
||||
phraseTableFile << endl;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user