Mirror of https://github.com/moses-smt/mosesdecoder.git
Changes to main-branch files made while I was working on dynamic phrase tables.

Commit: fdc504d47a
Parent: 6e4035fb12
@@ -65,6 +65,11 @@
 		<type>1</type>
 		<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
 	</link>
+	<link>
+		<name>RuleExtractionOptions.h</name>
+		<type>1</type>
+		<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/RuleExtractionOptions.h</locationURI>
+	</link>
 	<link>
 		<name>SentenceAlignment.cpp</name>
 		<type>1</type>

@@ -5,13 +5,13 @@
 <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.162355801" moduleId="org.eclipse.cdt.core.settings" name="Debug">
 	<externalSettings/>
 	<extensions>
+		<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+		<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
 		<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-		<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
-		<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
 	</extensions>
 </storageModule>
 <storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -70,7 +70,6 @@
 	<listOptionValue builtIn="false" value="irstlm"/>
 	<listOptionValue builtIn="false" value="dstruct"/>
 	<listOptionValue builtIn="false" value="dalm"/>
-	<listOptionValue builtIn="false" value="MurmurHash3"/>
 	<listOptionValue builtIn="false" value="flm"/>
 	<listOptionValue builtIn="false" value="oolm"/>
 	<listOptionValue builtIn="false" value="lattice"/>
@@ -108,13 +107,13 @@
 <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.516628324" moduleId="org.eclipse.cdt.core.settings" name="Release">
 	<externalSettings/>
 	<extensions>
+		<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+		<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
 		<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-		<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
-		<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
 	</extensions>
 </storageModule>
 <storageModule moduleId="cdtBuildSystem" version="4.0.0">

@@ -5,13 +5,13 @@
 <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.461114338" moduleId="org.eclipse.cdt.core.settings" name="Debug">
 	<externalSettings/>
 	<extensions>
+		<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+		<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
 		<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-		<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
-		<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
 	</extensions>
 </storageModule>
 <storageModule moduleId="cdtBuildSystem" version="4.0.0">
@@ -71,7 +71,6 @@
 	<listOptionValue builtIn="false" value="lattice"/>
 	<listOptionValue builtIn="false" value="misc"/>
 	<listOptionValue builtIn="false" value="dalm"/>
-	<listOptionValue builtIn="false" value="MurmurHash3"/>
 	<listOptionValue builtIn="false" value="search"/>
 	<listOptionValue builtIn="false" value="RandLM"/>
 	<listOptionValue builtIn="false" value="OnDiskPt"/>
@@ -109,13 +108,13 @@
 <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.2121690436" moduleId="org.eclipse.cdt.core.settings" name="Release">
 	<externalSettings/>
 	<extensions>
+		<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+		<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
 		<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
 		<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-		<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
-		<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
 	</extensions>
 </storageModule>
 <storageModule moduleId="cdtBuildSystem" version="4.0.0">

@@ -181,10 +181,8 @@ FeatureStats::FeatureStats(const size_t size)
 
 FeatureStats::~FeatureStats()
 {
-  if (m_array) {
-    delete [] m_array;
-    m_array = NULL;
-  }
+  delete [] m_array;
+  m_array = NULL;
 }
 
 void FeatureStats::Copy(const FeatureStats &stats)

@@ -35,10 +35,8 @@ ScoreStats::ScoreStats(const size_t size)
 
 ScoreStats::~ScoreStats()
 {
-  if (m_array) {
-    delete [] m_array;
-    m_array = NULL;
-  }
+  delete [] m_array;
+  m_array = NULL;
 }
 
 void ScoreStats::Copy(const ScoreStats &stats)
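Both destructor hunks above, like the Delete() and destructor cleanups further down, apply the same simplification: C++ defines delete and delete[] on a null pointer as a no-op, so the null guard adds nothing. A minimal standalone sketch of the idiom (hypothetical Holder type, not part of the commit):

#include <cstddef>

struct Holder {
  int *data;
  Holder() : data(NULL) {}
  ~Holder() {
    delete [] data;  // well-defined even when data is NULL, so no guard is needed
    data = NULL;     // defensive reset against a later double-delete through a stale pointer
  }
};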
@@ -157,4 +155,4 @@ bool operator==(const ScoreStats& s1, const ScoreStats& s2)
   return true;
 }
 
-}
+}

@@ -21,10 +21,8 @@ public:
   }
 
   static void Delete() {
-    if (m_instance) {
-      delete m_instance;
-      m_instance = NULL;
-    }
+    delete m_instance;
+    m_instance = NULL;
   }
 
 private:

@@ -50,7 +50,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "moses/FeatureVector.h"
 #include "moses/FF/StatefulFeatureFunction.h"
 #include "moses/FF/StatelessFeatureFunction.h"
-#include "moses/FF/SyntaxConstraintFeature.h"
+#include "moses/FF/TreeStructureFeature.h"
 #include "util/exception.hh"
 
 using namespace std;
@@ -395,14 +395,16 @@ void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
   UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
                  "No output file for tree fragments specified");
 
-  //Tree of full sentence (to stderr)
-  const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
-  for( size_t i=0; i<sff.size(); i++ ) {
-    const StatefulFeatureFunction *ff = sff[i];
-    if (ff->GetScoreProducerDescription() == "SyntaxConstraintFeature0") {
-      const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
-      out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
-      break;
+  //Tree of full sentence
+  const StatefulFeatureFunction* treeStructure = StaticData::Instance().GetTreeStructure();
+  if (treeStructure != NULL) {
+    const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+    for( size_t i=0; i<sff.size(); i++ ) {
+      if (sff[i] == treeStructure) {
+        const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
+        out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
+        break;
+      }
     }
   }
 
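Note on the hunk above: instead of matching each stateful feature function's description against the hard-coded name "SyntaxConstraintFeature0", the rewritten report asks StaticData for the registered tree-structure feature and compares pointers, so the output no longer depends on how the feature is named in the configuration (the GetTreeStructure()/SetTreeStructure() accessors appear in the StaticData.h hunk below).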
@@ -97,7 +97,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
 
     targetPhrase->SetTargetLHS(targetLHS);
     targetPhrase->SetAlignmentInfo("0-0");
-    if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
+    if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.GetTreeStructure() != NULL) {
       targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
     }
 
@@ -52,8 +52,7 @@ LexicalReordering::LexicalReordering(const std::string &line)
 
 LexicalReordering::~LexicalReordering()
 {
-  if(m_table)
-    delete m_table;
+  delete m_table;
   delete m_configuration;
 }
 
@@ -63,6 +63,11 @@ public:
     state = new DALM::State(*from.state);
   }
 
+  void reset(DALM::State *s){
+    delete state;
+    state = s;
+  }
+
   virtual int Compare(const FFState& other) const{
     const DALMState &o = static_cast<const DALMState &>(other);
     if(state->get_count() < o.state->get_count()) return -1;
@@ -82,11 +87,9 @@
 class DALMChartState : public FFState
 {
 private:
-  size_t sourceStartPos;
-  size_t sourceEndPos;
-  size_t inputSize;
-  DALM::VocabId *prefixIDs;
-  size_t prefixLength;
+  const ChartHypothesis &hypo;
+  DALM::Fragment *prefixFragments;
+  unsigned short prefixLength;
   float prefixScore;
   DALMState *rightContext;
   bool isLarge;
@@ -94,15 +97,13 @@ private:
 public:
   DALMChartState(
     const ChartHypothesis &hypo,
-    DALM::VocabId *prefixIDs,
-    size_t prefixLength,
+    DALM::Fragment *prefixFragments,
+    unsigned short prefixLength,
     float prefixScore,
     DALMState *rightContext,
     bool isLarge)
-    : sourceStartPos(hypo.GetCurrSourceRange().GetStartPos()),
-      sourceEndPos(hypo.GetCurrSourceRange().GetEndPos()),
-      inputSize(hypo.GetManager().GetSource().GetSize()),
-      prefixIDs(prefixIDs),
+    : hypo(hypo),
+      prefixFragments(prefixFragments),
       prefixLength(prefixLength),
       prefixScore(prefixScore),
      rightContext(rightContext),
@@ -110,16 +111,16 @@ public:
   {}
 
   virtual ~DALMChartState(){
-    if(prefixIDs != NULL) delete [] prefixIDs;
-    if(rightContext != NULL) delete rightContext;
+    delete [] prefixFragments;
+    delete rightContext;
   }
 
-  size_t GetPrefixLength() const{
+  unsigned short GetPrefixLength() const{
     return prefixLength;
   }
 
-  const DALM::VocabId *GetPrefixIDs() const{
-    return prefixIDs;
+  const DALM::Fragment *GetPrefixFragments() const{
+    return prefixFragments;
   }
 
   float GetPrefixScore() const{
@@ -137,17 +138,22 @@ public:
   virtual int Compare(const FFState& other) const{
     const DALMChartState &o = static_cast<const DALMChartState &>(other);
     // prefix
-    if (sourceStartPos > 0) { // not for "<s> ..."
+    if (hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
       if (prefixLength != o.prefixLength){
         return (prefixLength < o.prefixLength)?-1:1;
       } else {
-        int ret = memcmp(prefixIDs, o.prefixIDs, prefixLength);
-        if (ret != 0) return ret;
+        if(prefixLength > 0){
+          DALM::Fragment &f = prefixFragments[prefixLength-1];
+          DALM::Fragment &of = o.prefixFragments[prefixLength-1];
+          int ret = DALM::compare_fragments(f, of);
+          if(ret != 0) return ret;
+        }
       }
     }
 
     // suffix
-    if (sourceEndPos < inputSize - 1) { // not for "... </s>"
+    size_t inputSize = hypo.GetManager().GetSource().GetSize();
+    if (hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
       int ret = o.rightContext->Compare(*rightContext);
       if (ret != 0) return ret;
     }
@@ -323,8 +329,8 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
   DALM::State *state = dalm_state->get_state();
 
   size_t contextSize = m_nGramOrder-1;
-  DALM::VocabId *prefixIDs = new DALM::VocabId[contextSize];
-  size_t prefixLength = 0;
+  DALM::Fragment *prefixFragments = new DALM::Fragment[contextSize];
+  unsigned short prefixLength = 0;
   bool isLarge = false;
 
   // initial language model scores
@@ -350,11 +356,14 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
       // state is finalized.
       isLarge = true;
     }else{
-      float score = m_lm->query(wid, *state);
-      hypoScore += score;
-      if (!isLarge){
+      if(isLarge){
+        float score = m_lm->query(wid, *state);
+        hypoScore += score;
+      }else{
+        float score = m_lm->query(wid, *state, prefixFragments[prefixLength]);
+
         prefixScore += score;
-        prefixIDs[prefixLength] = wid;
+        hypoScore += score;
         prefixLength++;
         if(prefixLength >= contextSize) isLarge = true;
       }
@@ -374,8 +383,10 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
       // get language model state
       dalm_state->reset(*prevState->GetRightContext());
      state = dalm_state->get_state();
+
       prefixLength = prevState->GetPrefixLength();
-      std::memcpy(prefixIDs, prevState->GetPrefixIDs(), sizeof(DALM::VocabId)*prefixLength);
+      const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
+      std::memcpy(prefixFragments, prevPrefixFragments, sizeof(DALM::Fragment)*prefixLength);
       isLarge = prevState->LargeEnough();
     }
     phrasePos++;
@@ -389,11 +400,12 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
       // regular word
       if (!word.IsNonTerminal()) {
         DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
-        float score = m_lm->query(wid, *state);
-        hypoScore += score;
-        if (!isLarge){
+        if (isLarge) {
+          hypoScore += m_lm->query(wid, *state);
+        }else{
+          float score = m_lm->query(wid, *state, prefixFragments[prefixLength]);
          prefixScore += score;
-          prefixIDs[prefixLength] = wid;
+          hypoScore += score;
           prefixLength++;
           if(prefixLength >= contextSize) isLarge = true;
         }
@@ -410,19 +422,22 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
           static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
 
         size_t prevPrefixLength = prevState->GetPrefixLength();
-        const DALM::VocabId *prevPrefixIDs = prevState->GetPrefixIDs();
-
+        const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
+        DALM::Gap gap(*state);
         // score its prefix
         for(size_t prefixPos = 0; prefixPos < prevPrefixLength; prefixPos++) {
-          DALM::VocabId wid = prevPrefixIDs[prefixPos];
-          float score = m_lm->query(wid, *state);
-          hypoScore += score;
-          if (!isLarge){
+          const DALM::Fragment &f = prevPrefixFragments[prefixPos];
+
+          if (isLarge) {
+            hypoScore += m_lm->query(f, *state, gap);
+          } else {
+            float score = m_lm->query(f, *state, gap, prefixFragments[prefixLength]);
             prefixScore += score;
-            prefixIDs[prefixLength] = wid;
+            hypoScore += score;
            prefixLength++;
            if(prefixLength >= contextSize) isLarge = true;
           }
+          gap.succ();
         }
 
         // check if we are dealing with a large sub-phrase
@@ -430,18 +445,22 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
         // add its language model score
         hypoScore += UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
         hypoScore -= prevState->GetPrefixScore(); // remove overwrapped score.
-
-        // copy language model state
+        // copy language model state
         dalm_state->reset(*prevState->GetRightContext());
         state = dalm_state->get_state();
-      }
+      } else {
+        DALM::State *state_new = new DALM::State(*prevState->GetRightContext()->get_state());
+        m_lm->set_state(*state_new, *state, gap);
+        dalm_state->reset(state_new);
+        state = dalm_state->get_state();
+      }
     }
   }
 
   // assign combined score to score breakdown
   out->Assign(this, TransformLMScore(hypoScore));
 
-  return new DALMChartState(hypo, prefixIDs, prefixLength, prefixScore, dalm_state, isLarge);
+  return new DALMChartState(hypo, prefixFragments, prefixLength, prefixScore, dalm_state, isLarge);
 }
 
 bool LanguageModelDALM::IsUseable(const FactorMask &mask) const
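Taken together, the language-model changes above swap the chart state's raw prefix word IDs (a DALM::VocabId array compared with memcmp) for DALM::Fragment handles: the query(wid, state, fragment) overload records a fragment per scored prefix word, a combined hypothesis re-scores those fragments against its new context via query(fragment, state, gap), and state comparison reduces to DALM::compare_fragments on the last fragment. The chart state also keeps a reference to its ChartHypothesis instead of copying the source span and input size.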
@@ -66,6 +66,7 @@ StaticData::StaticData()
   ,m_lmEnableOOVFeature(false)
   ,m_isAlwaysCreateDirectTranslationOption(false)
   ,m_currentWeightSetting("default")
+  ,m_treeStructure(NULL)
 {
   m_xmlBrackets.first="<";
   m_xmlBrackets.second=">";
@@ -1184,5 +1185,52 @@ void StaticData::CheckLEGACYPT()
 }
 
 
+void StaticData::ResetWeights(const std::string &denseWeights, const std::string &sparseFile)
+{
+  m_allWeights = ScoreComponentCollection();
+
+  // dense weights
+  string name("");
+  vector<float> weights;
+  vector<string> toks = Tokenize(denseWeights);
+  for (size_t i = 0; i < toks.size(); ++i) {
+    const string &tok = toks[i];
+
+    if (tok.substr(tok.size() - 1, 1) == "=") {
+      // start of new feature
+
+      if (name != "") {
+        // save previous ff
+        const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
+        m_allWeights.Assign(&ff, weights);
+        weights.clear();
+      }
+
+      name = tok.substr(0, tok.size() - 1);
+    } else {
+      // a weight for curr ff
+      float weight = Scan<float>(toks[i]);
+      weights.push_back(weight);
+    }
+  }
+
+  const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
+  m_allWeights.Assign(&ff, weights);
+
+  // sparse weights
+  InputFileStream sparseStrme(sparseFile);
+  string line;
+  while (getline(sparseStrme, line)) {
+    vector<string> toks = Tokenize(line);
+    UTIL_THROW_IF2(toks.size() != 2, "Incorrect sparse weight format. Should be FFName_spareseName weight");
+
+    vector<string> names = Tokenize(toks[0], "_");
+    UTIL_THROW_IF2(names.size() != 2, "Incorrect sparse weight name. Should be FFName_spareseName");
+
+    const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(names[0]);
+    m_allWeights.Assign(&ff, names[1], Scan<float>(toks[1]));
+  }
+}
+
 } // namespace
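For reference, a hypothetical input pair that the parsing loop above accepts (feature names and values are illustrative, not from the commit). In the dense string a token ending in '=' starts a new feature and the tokens that follow are its weights; each sparse line is 'FFName_sparseName weight', with the name split on the underscore:

denseWeights:  LM0= 0.5 Distortion0= 0.3 TranslationModel0= 0.2 0.2 0.2 0.2
sparseFile:    PhrasePairFeature0_src~tgt 0.7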
@@ -221,6 +221,8 @@ protected:
   std::map<Word, std::set<Word> > m_soft_matches_map;
   std::map<Word, std::set<Word> > m_soft_matches_map_reverse;
 
+  const StatefulFeatureFunction* m_treeStructure;
+
 public:
 
   bool IsAlwaysCreateDirectTranslationOption() const {
@@ -756,6 +758,20 @@ public:
 
   bool AdjacentOnly() const
   { return m_adjacentOnly; }
+
+
+  void ResetWeights(const std::string &denseWeights, const std::string &sparseFile);
+
+
+  // need global access for output of tree structure
+  const StatefulFeatureFunction* GetTreeStructure() const {
+    return m_treeStructure;
+  }
+
+  void SetTreeStructure(const StatefulFeatureFunction* treeStructure) {
+    m_treeStructure = treeStructure;
+  }
+
 };
 
 }

@@ -43,10 +43,10 @@ public:
     alpha_[i] = i * log10(0.4);
   }
   ~OnlineRLM() {
-    if(alpha_) delete[] alpha_;
+    delete[] alpha_;
     if(bAdapting_) delete vocab_;
     else vocab_ = NULL;
-    if(cache_) delete cache_;
+    delete cache_;
     delete bPrefix_;
     delete bHit_;
   }

@@ -235,8 +235,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
 
   // SCORES ...
   string directScores, directSparseScores, indirectScores, indirectSparseScores;
-  breakdownCoreAndSparse( itemDirect[2], directScores, directSparseScores );
-  breakdownCoreAndSparse( itemIndirect[2], indirectScores, indirectSparseScores );
+  breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
+  breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );
 
   vector<string> directCounts = tokenize(itemDirect[4].c_str());
   vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
@@ -307,7 +307,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
   }
 
   // alignment
-  fileConsolidated << " ||| " << itemDirect[3];
+  fileConsolidated << " ||| " << itemDirect[2];
 
   // counts, for debugging
   fileConsolidated << "||| " << countE << " " << countF << " " << countEF;

@@ -1,909 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2009 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#include <sstream>
-#include <cstdio>
-#include <iostream>
-#include <fstream>
-#include <stdlib.h>
-#include <assert.h>
-#include <cstring>
-#include <map>
-#include <set>
-#include <vector>
-#include <algorithm>
-
-#include "SafeGetline.h"
-#include "ScoreFeature.h"
-#include "tables-core.h"
-#include "ExtractionPhrasePair.h"
-#include "score.h"
-#include "InputFileStream.h"
-#include "OutputFileStream.h"
-
-using namespace std;
-using namespace MosesTraining;
-
-#define LINE_MAX_LENGTH 100000
-
-namespace MosesTraining
-{
-LexicalTable lexTable;
-bool inverseFlag = false;
-bool hierarchicalFlag = false;
-bool pcfgFlag = false;
-bool treeFragmentsFlag = false;
-bool unpairedExtractFormatFlag = false;
-bool conditionOnTargetLhsFlag = false;
-bool wordAlignmentFlag = true;
-bool goodTuringFlag = false;
-bool kneserNeyFlag = false;
-bool logProbFlag = false;
-int negLogProb = 1;
-#define COC_MAX 10
-bool lexFlag = true;
-bool unalignedFlag = false;
-bool unalignedFWFlag = false;
-bool crossedNonTerm = false;
-int countOfCounts[COC_MAX+1];
-int totalDistinct = 0;
-float minCountHierarchical = 0;
-std::map<std::string,float> sourceLHSCounts;
-std::map<std::string, std::map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
-
-std::set<std::string> sourceLabelSet;
-std::map<std::string,size_t> sourceLabels;
-std::vector<std::string> sourceLabelsByIndex;
-
-Vocabulary vcbT;
-Vocabulary vcbS;
-
-} // namespace
-
-std::vector<std::string> tokenize( const char [] );
-
-void processLine( std::string line,
-                  int lineID, bool includeSentenceIdFlag, int &sentenceId,
-                  PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
-                  std::string &additionalPropertiesString,
-                  float &count, float &pcfgSum );
-void writeCountOfCounts( const std::string &fileNameCountOfCounts );
-void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
-                         const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
-void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
-double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
-double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
-set<std::string> functionWordList;
-void loadFunctionWords( const string &fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
-int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
-void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
-void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
-void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
-
-
-int main(int argc, char* argv[])
-{
-  std::cerr << "Score v2.1 -- "
-            << "scoring methods for extracted rules" << std::endl;
-
-  ScoreFeatureManager featureManager;
-  if (argc < 4) {
-    std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
-    std::cerr << featureManager.usage() << std::endl;
-    exit(1);
-  }
-  std::string fileNameExtract = argv[1];
-  std::string fileNameLex = argv[2];
-  std::string fileNamePhraseTable = argv[3];
-  std::string fileNameCountOfCounts;
-  std::string fileNameFunctionWords;
-  std::vector<std::string> featureArgs; // all unknown args passed to feature manager
-
-  for(int i=4; i<argc; i++) {
-    if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
-      inverseFlag = true;
-      std::cerr << "using inverse mode" << std::endl;
-    } else if (strcmp(argv[i],"--Hierarchical") == 0) {
-      hierarchicalFlag = true;
-      std::cerr << "processing hierarchical rules" << std::endl;
-    } else if (strcmp(argv[i],"--PCFG") == 0) {
-      pcfgFlag = true;
-      std::cerr << "including PCFG scores" << std::endl;
-    } else if (strcmp(argv[i],"--TreeFragments") == 0) {
-      treeFragmentsFlag = true;
-      std::cerr << "including tree fragment information from syntactic parse\n";
-    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
-      unpairedExtractFormatFlag = true;
-      std::cerr << "processing unpaired extract format" << std::endl;
-    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
-      conditionOnTargetLhsFlag = true;
-      std::cerr << "processing unpaired extract format" << std::endl;
-    } else if (strcmp(argv[i],"--NoWordAlignment") == 0) {
-      wordAlignmentFlag = false;
-      std::cerr << "omitting word alignment" << std::endl;
-    } else if (strcmp(argv[i],"--NoLex") == 0) {
-      lexFlag = false;
-      std::cerr << "not computing lexical translation score" << std::endl;
-    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
-      goodTuringFlag = true;
-      fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
-      std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
-    } else if (strcmp(argv[i],"--KneserNey") == 0) {
-      kneserNeyFlag = true;
-      fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
-      std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
-    } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
-      unalignedFlag = true;
-      std::cerr << "using unaligned word penalty" << std::endl;
-    } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
-      unalignedFWFlag = true;
-      if (i+1==argc) {
-        std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
-        exit(1);
-      }
-      fileNameFunctionWords = argv[++i];
-      std::cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << std::endl;
-    } else if (strcmp(argv[i],"--LogProb") == 0) {
-      logProbFlag = true;
-      std::cerr << "using log-probabilities" << std::endl;
-    } else if (strcmp(argv[i],"--NegLogProb") == 0) {
-      logProbFlag = true;
-      negLogProb = -1;
-      std::cerr << "using negative log-probabilities" << std::endl;
-    } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
-      minCountHierarchical = atof(argv[++i]);
-      std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
-      minCountHierarchical -= 0.00001; // account for rounding
-    } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
-      crossedNonTerm = true;
-      std::cerr << "crossed non-term reordering feature" << std::endl;
-    } else {
-      featureArgs.push_back(argv[i]);
-      ++i;
-      for (; i < argc && strncmp(argv[i], "--", 2); ++i) {
-        featureArgs.push_back(argv[i]);
-      }
-      if (i != argc) --i; //roll back, since we found another -- argument
-    }
-  }
-
-  MaybeLog maybeLogProb(logProbFlag, negLogProb);
-
-  // configure extra features
-  if (!inverseFlag) {
-    featureManager.configure(featureArgs);
-  }
-
-  // lexical translation table
-  if (lexFlag) {
-    lexTable.load( fileNameLex );
-  }
-
-  // function word list
-  if (unalignedFWFlag) {
-    loadFunctionWords( fileNameFunctionWords );
-  }
-
-  // compute count of counts for Good Turing discounting
-  if (goodTuringFlag || kneserNeyFlag) {
-    for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
-  }
-
-  // sorted phrase extraction file
-  Moses::InputFileStream extractFile(fileNameExtract);
-
-  if (extractFile.fail()) {
-    std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
-    exit(1);
-  }
-  istream &extractFileP = extractFile;
-
-  // output file: phrase translation table
-  ostream *phraseTableFile;
-
-  if (fileNamePhraseTable == "-") {
-    phraseTableFile = &std::cout;
-  } else {
-    Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
-    bool success = outputFile->Open(fileNamePhraseTable);
-    if (!success) {
-      std::cerr << "ERROR: could not open file phrase table file "
-                << fileNamePhraseTable << std::endl;
-      exit(1);
-    }
-    phraseTableFile = outputFile;
-  }
-
-  // loop through all extracted phrase translations
-  char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH];
-  lastLine[0] = '\0';
-  ExtractionPhrasePair *phrasePair = NULL;
-  std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
-  std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
-
-  int tmpSentenceId;
-  PHRASE *tmpPhraseSource, *tmpPhraseTarget;
-  ALIGNMENT *tmpTargetToSourceAlignment;
-  std::string tmpAdditionalPropertiesString;
-  float tmpCount=0.0f, tmpPcfgSum=0.0f;
-
-  int i=0;
-  SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
-  if ( !extractFileP.eof() ) {
-    ++i;
-    tmpPhraseSource = new PHRASE();
-    tmpPhraseTarget = new PHRASE();
-    tmpTargetToSourceAlignment = new ALIGNMENT();
-    processLine( std::string(line),
-                 i, featureManager.includeSentenceId(), tmpSentenceId,
-                 tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
-                 tmpAdditionalPropertiesString,
-                 tmpCount, tmpPcfgSum);
-    phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
-                                           tmpTargetToSourceAlignment,
-                                           tmpCount, tmpPcfgSum );
-    phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
-    featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
-    phrasePairsWithSameSource.push_back( phrasePair );
-    if ( hierarchicalFlag ) {
-      phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
-    }
-    strcpy( lastLine, line );
-    SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
-  }
-
-  while ( !extractFileP.eof() ) {
-
-    if ( ++i % 100000 == 0 ) {
-      std::cerr << "." << std::flush;
-    }
-
-    // identical to last line? just add count
-    if (strcmp(line,lastLine) == 0) {
-      phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
-      SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-      continue;
-    } else {
-      strcpy( lastLine, line );
-    }
-
-    tmpPhraseSource = new PHRASE();
-    tmpPhraseTarget = new PHRASE();
-    tmpTargetToSourceAlignment = new ALIGNMENT();
-    tmpAdditionalPropertiesString.clear();
-    processLine( std::string(line),
-                 i, featureManager.includeSentenceId(), tmpSentenceId,
-                 tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
-                 tmpAdditionalPropertiesString,
-                 tmpCount, tmpPcfgSum);
-
-    bool matchesPrevious = false;
-    bool sourceMatch = true; bool targetMatch = true; bool alignmentMatch = true; // be careful with these,
-    // ExtractionPhrasePair::Matches() checks them in order and does not continue with the others
-    // once the first of them has been found to have to be set to false
-
-    if ( hierarchicalFlag ) {
-      for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
-            iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
-        if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
-                               sourceMatch, targetMatch, alignmentMatch ) ) {
-          matchesPrevious = true;
-          phrasePair = (*iter);
-          break;
-        }
-      }
-    } else {
-      if ( phrasePair->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
-                                sourceMatch, targetMatch, alignmentMatch ) ) {
-        matchesPrevious = true;
-      }
-    }
-
-    if ( matchesPrevious ) {
-      delete tmpPhraseSource;
-      delete tmpPhraseTarget;
-      if ( !phrasePair->Add( tmpTargetToSourceAlignment,
-                             tmpCount, tmpPcfgSum ) ) {
-        delete tmpTargetToSourceAlignment;
-      }
-      phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
-      featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
-    } else {
-
-      if ( !phrasePairsWithSameSource.empty() &&
-           !sourceMatch ) {
-        processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
-        for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
-              iter!=phrasePairsWithSameSource.end(); ++iter) {
-          delete *iter;
-        }
-        phrasePairsWithSameSource.clear();
-        if ( hierarchicalFlag ) {
-          phrasePairsWithSameSourceAndTarget.clear();
-        }
-      }
-
-      if ( hierarchicalFlag ) {
-        if ( !phrasePairsWithSameSourceAndTarget.empty() &&
-             !targetMatch ) {
-          phrasePairsWithSameSourceAndTarget.clear();
-        }
-      }
-
-      phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
-                                             tmpTargetToSourceAlignment,
-                                             tmpCount, tmpPcfgSum );
-      phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
-      featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
-      phrasePairsWithSameSource.push_back(phrasePair);
-
-      if ( hierarchicalFlag ) {
-        phrasePairsWithSameSourceAndTarget.push_back(phrasePair);
-      }
-    }
-
-    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-
-  }
-
-  processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
-  for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
-        iter!=phrasePairsWithSameSource.end(); ++iter) {
-    delete *iter;
-  }
-  phrasePairsWithSameSource.clear();
-
-
-  phraseTableFile->flush();
-  if (phraseTableFile != &std::cout) {
-    delete phraseTableFile;
-  }
-
-  // output count of count statistics
-  if (goodTuringFlag || kneserNeyFlag) {
-    writeCountOfCounts( fileNameCountOfCounts );
-  }
-}
-
-
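An illustrative invocation of the deleted scorer, with hypothetical file names (the three positional arguments are the sorted extract file, the lexical table, and the output phrase table, as documented in the usage message above):

score extract.sorted lex.f2e phrase-table.half --GoodTuring --UnalignedPenalty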
-void processLine( std::string line,
-                  int lineID, bool includeSentenceIdFlag, int &sentenceId,
-                  PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
-                  std::string &additionalPropertiesString,
-                  float &count, float &pcfgSum )
-{
-  size_t foundAdditionalProperties = line.find("{{");
-  if (foundAdditionalProperties != std::string::npos) {
-    additionalPropertiesString = line.substr(foundAdditionalProperties);
-    line = line.substr(0,foundAdditionalProperties);
-  } else {
-    additionalPropertiesString.clear();
-  }
-
-  phraseSource->clear();
-  phraseTarget->clear();
-  targetToSourceAlignment->clear();
-
-  std::vector<std::string> token = tokenize( line.c_str() );
-  int item = 1;
-  for ( size_t j=0; j<token.size(); ++j ) {
-    if (token[j] == "|||") {
-      ++item;
-    } else if (item == 1) { // source phrase
-      phraseSource->push_back( vcbS.storeIfNew( token[j] ) );
-    } else if (item == 2) { // target phrase
-      phraseTarget->push_back( vcbT.storeIfNew( token[j] ) );
-    } else if (item == 3) { // alignment
-      int s,t;
-      sscanf(token[j].c_str(), "%d-%d", &s, &t);
-      if ((size_t)t >= phraseTarget->size() || (size_t)s >= phraseSource->size()) {
-        std::cerr << "WARNING: phrase pair " << lineID
-                  << " has alignment point (" << s << ", " << t << ")"
-                  << " out of bounds (" << phraseSource->size() << ", " << phraseTarget->size() << ")"
-                  << std::endl;
-      } else {
-        // first alignment point? -> initialize
-        if ( targetToSourceAlignment->size() == 0 ) {
-          size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
-          targetToSourceAlignment->resize(numberOfTargetSymbols);
-        }
-        // add alignment point
-        targetToSourceAlignment->at(t).insert(s);
-      }
-    } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
-      sscanf(token[j].c_str(), "%d", &sentenceId);
-    } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
-      sscanf(token[j].c_str(), "%f", &count);
-    } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
-      float pcfgScore = std::atof(token[j].c_str());
-      pcfgSum = pcfgScore * count;
-    }
-  }
-
-  if ( targetToSourceAlignment->size() == 0 ) {
-    size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
-    targetToSourceAlignment->resize(numberOfTargetSymbols);
-  }
-
-  if (item + (includeSentenceIdFlag?-1:0) == 3) {
-    count = 1.0;
-  }
-  if (item < 3 || item > 6) {
-    std::cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
-  }
-
-}
-
-
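A hypothetical line in the extract format that processLine() parses (illustrative only): source ||| target ||| alignment, optionally followed by a sentence id (when enabled), a count, and a PCFG score, with any extra properties attached as {{...}}:

das haus ||| the house ||| 0-0 1-1 ||| 2 ||| -1.5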
-void writeCountOfCounts( const string &fileNameCountOfCounts )
-{
-  // open file
-  Moses::OutputFileStream countOfCountsFile;
-  bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
-  if (!success) {
-    std::cerr << "ERROR: could not open count-of-counts file "
-              << fileNameCountOfCounts << std::endl;
-    return;
-  }
-
-  // Kneser-Ney needs the total number of phrase pairs
-  countOfCountsFile << totalDistinct << std::endl;
-
-  // write out counts
-  for(int i=1; i<=COC_MAX; i++) {
-    countOfCountsFile << countOfCounts[ i ] << std::endl;
-  }
-  countOfCountsFile.Close();
-}
-
-
-void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
-                         const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
-{
-  if (phrasePairsWithSameSource.size() == 0) {
-    return;
-  }
-
-  float totalSource = 0;
-
-  //std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
-
-  // loop through phrase pairs
-  for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
-        iter!=phrasePairsWithSameSource.end(); ++iter) {
-    // add to total count
-    totalSource += (*iter)->GetCount();
-  }
-
-  // output the distinct phrase pairs, one at a time
-  for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
-        iter!=phrasePairsWithSameSource.end(); ++iter) {
-    // add to total count
-    outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
-  }
-}
-
-void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
-                      float totalCount, int distinctCount,
-                      ostream &phraseTableFile,
-                      const ScoreFeatureManager& featureManager,
-                      const MaybeLog& maybeLogProb )
-{
-  assert(phrasePair.IsValid());
-
-  const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
-  float count = phrasePair.GetCount();
-
-  map< string, float > domainCount;
-
-  // collect count of count statistics
-  if (goodTuringFlag || kneserNeyFlag) {
-    totalDistinct++;
-    int countInt = count + 0.99999;
-    if (countInt <= COC_MAX)
-      countOfCounts[ countInt ]++;
-  }
-
-  // compute PCFG score
-  float pcfgScore = 0;
-  if (pcfgFlag && !inverseFlag) {
-    pcfgScore = phrasePair.GetPcfgScore() / count;
-  }
-
-  // output phrases
-  const PHRASE *phraseSource = phrasePair.GetSource();
-  const PHRASE *phraseTarget = phrasePair.GetTarget();
-
-  // do not output if hierarchical and count below threshold
-  if (hierarchicalFlag && count < minCountHierarchical) {
-    for(size_t j=0; j<phraseSource->size()-1; ++j) {
-      if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
-        return;
-    }
-  }
-
-  // source phrase (unless inverse)
-  if (!inverseFlag) {
-    printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
-    phraseTableFile << " ||| ";
-  }
-
-  // target phrase
-  printTargetPhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
-  phraseTableFile << " ||| ";
-
-  // source phrase (if inverse)
-  if (inverseFlag) {
-    printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
-    phraseTableFile << " ||| ";
-  }
-
-  // lexical translation probability
-  if (lexFlag) {
-    double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
-    phraseTableFile << maybeLogProb( lexScore );
-  }
-
-  // unaligned word penalty
-  if (unalignedFlag) {
-    double penalty = computeUnalignedPenalty( bestAlignmentT2S );
-    phraseTableFile << " " << maybeLogProb( penalty );
-  }
-
-  // unaligned function word penalty
-  if (unalignedFWFlag) {
-    double penalty = computeUnalignedFWPenalty( phraseTarget, bestAlignmentT2S );
-    phraseTableFile << " " << maybeLogProb( penalty );
-  }
-
-  if (crossedNonTerm && !inverseFlag) {
-    phraseTableFile << " " << calcCrossedNonTerm( phraseTarget, bestAlignmentT2S );
-  }
-
-  // target-side PCFG score
-  if (pcfgFlag && !inverseFlag) {
-    phraseTableFile << " " << maybeLogProb( pcfgScore );
-  }
-
-  // extra features
-  ScoreFeatureContext context(phrasePair, maybeLogProb);
-  std::vector<float> extraDense;
-  map<string,float> extraSparse;
-  featureManager.addFeatures(context, extraDense, extraSparse);
-  for (size_t i = 0; i < extraDense.size(); ++i) {
-    phraseTableFile << " " << extraDense[i];
-  }
-
-  for (map<string,float>::const_iterator i = extraSparse.begin();
-       i != extraSparse.end(); ++i) {
-    phraseTableFile << " " << i->first << " " << i->second;
-  }
-
-  phraseTableFile << " ||| ";
-
-  // output alignment info
-  if ( !inverseFlag ) {
-    if ( hierarchicalFlag ) {
-      // always output alignment if hiero style
-      assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
-      std::vector<std::string> alignment;
-      for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
-        if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
-          if ( bestAlignmentT2S->at(j).size() != 1 ) {
-            std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
-            phraseTableFile.flush();
-            assert(bestAlignmentT2S->at(j).size() == 1);
-          }
-          size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
-          //phraseTableFile << sourcePos << "-" << j << " ";
-          std::stringstream point;
-          point << sourcePos << "-" << j;
-          alignment.push_back(point.str());
-        } else {
-          for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
-                setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
-            size_t sourcePos = *setIter;
-            std::stringstream point;
-            point << sourcePos << "-" << j;
-            alignment.push_back(point.str());
-          }
-        }
-      }
-      // now print all alignments, sorted by source index
-      sort(alignment.begin(), alignment.end());
-      for (size_t i = 0; i < alignment.size(); ++i) {
-        phraseTableFile << alignment[i] << " ";
-      }
-    } else if (wordAlignmentFlag) {
-      // alignment info in pb model
-      for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
-        for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
-              setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
-          size_t sourcePos = *setIter;
-          phraseTableFile << sourcePos << "-" << j << " ";
-        }
-      }
-    }
-  }
-
-  // counts
-  phraseTableFile << " ||| " << totalCount << " " << count;
-  if (kneserNeyFlag)
-    phraseTableFile << " " << distinctCount;
-
-  if ((treeFragmentsFlag) &&
-      !inverseFlag) {
-    phraseTableFile << " |||";
-  }
-
-  // tree fragments
-  if (treeFragmentsFlag && !inverseFlag) {
-    const std::string *bestTreeFragment = phrasePair.FindBestPropertyValue("Tree");
-    if (bestTreeFragment) {
-      phraseTableFile << " {{Tree " << *bestTreeFragment << "}}";
-    }
-  }
-
-  phraseTableFile << std::endl;
-}
-
-
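For orientation, in the non-inverse case the function above emits one phrase-table line of the form: source ||| target ||| lexical score [unaligned penalties] [crossed-non-term] [PCFG] [extra features] ||| alignment ||| totalCount count [distinctCount] [||| {{Tree fragment}}].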
-bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource )
-{
-  for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) {
-    if (currTarget == targetPos) {
-      // skip
-    } else {
-      const std::set<size_t> &sourceSet = alignmentTargetToSource->at(currTarget);
-      for (std::set<size_t>::const_iterator iter = sourceSet.begin();
-           iter != sourceSet.end(); ++iter) {
-        size_t currSource = *iter;
-
-        if ((currTarget < targetPos && currSource > sourcePos)
-            || (currTarget > targetPos && currSource < sourcePos)
-           ) {
-          return true;
-        }
-      }
-
-    }
-  }
-
-  return false;
-}
-
-int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
-{
-  assert(phraseTarget->size() >= alignmentTargetToSource->size() );
-
-  for (size_t targetPos = 0; targetPos < alignmentTargetToSource->size(); ++targetPos) {
-
-    if ( isNonTerminal(vcbT.getWord( phraseTarget->at(targetPos) ))) {
-      const std::set<size_t> &alignmentPoints = alignmentTargetToSource->at(targetPos);
-      assert( alignmentPoints.size() == 1 );
-      size_t sourcePos = *alignmentPoints.begin();
-      bool ret = calcCrossedNonTerm(targetPos, sourcePos, alignmentTargetToSource);
-      if (ret)
-        return 1;
-    }
-  }
-
-  return 0;
-}
-
-
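In words: the alignment point (targetPos, sourcePos) counts as crossed exactly when some other point (t, s) satisfies t < targetPos and s > sourcePos, or t > targetPos and s < sourcePos; the integer overload returns 1 as soon as any aligned non-terminal is crossed.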
-double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
-{
-  // unaligned word counter
-  double unaligned = 1.0;
-  // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
-    const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
-    if (srcIndices.empty()) {
-      unaligned *= 2.718;
-    }
-  }
-  return unaligned;
-}
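The loop above multiplies by 2.718 (an approximation of e) once per unaligned target word, so the returned penalty is e^u for u unaligned words; with --LogProb the subsequent log reduces this to a plain count of unaligned words.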
-
-
-double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
-{
-  // unaligned word counter
-  double unaligned = 1.0;
-  // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
-    const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
-    if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
-      unaligned *= 2.718;
-    }
-  }
-  return unaligned;
-}
-
-void loadFunctionWords( const string &fileName )
-{
-  std::cerr << "Loading function word list from " << fileName;
-  ifstream inFile;
-  inFile.open(fileName.c_str());
-  if (inFile.fail()) {
-    std::cerr << " - ERROR: could not open file" << std::endl;
-    exit(1);
-  }
-  istream *inFileP = &inFile;
-
-  char line[LINE_MAX_LENGTH];
-  while(true) {
-    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (inFileP->eof()) break;
-    std::vector<string> token = tokenize( line );
-    if (token.size() > 0)
-      functionWordList.insert( token[0] );
-  }
-
-  std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
-  inFile.close();
-}
-
-
-double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
-{
-  // lexical translation probability
-  double lexScore = 1.0;
-  int null = vcbS.getWordID("NULL");
-  // all target words have to be explained
-  for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
-    const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
-    if (srcIndices.empty()) {
-      // explain unaligned word by NULL
-      lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
-    } else {
-      // go through all the aligned words to compute average
-      double thisWordScore = 0;
-      for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
-        thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
-      }
-      lexScore *= thisWordScore / (double)srcIndices.size();
-    }
-  }
-  return lexScore;
-}
-
-
-void LexicalTable::load( const string &fileName )
-{
-  std::cerr << "Loading lexical translation table from " << fileName;
-  ifstream inFile;
-  inFile.open(fileName.c_str());
-  if (inFile.fail()) {
-    std::cerr << " - ERROR: could not open file" << std::endl;
-    exit(1);
-  }
-  istream *inFileP = &inFile;
-
-  char line[LINE_MAX_LENGTH];
-
-  int i=0;
-  while(true) {
-    i++;
-    if (i%100000 == 0) std::cerr << "." << flush;
-    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (inFileP->eof()) break;
-
-    std::vector<string> token = tokenize( line );
-    if (token.size() != 3) {
-      std::cerr << "line " << i << " in " << fileName
-                << " has wrong number of tokens, skipping:" << std::endl
-                << token.size() << " " << token[0] << " " << line << std::endl;
-      continue;
-    }
-
-    double prob = atof( token[2].c_str() );
-    WORD_ID wordT = vcbT.storeIfNew( token[0] );
-    WORD_ID wordS = vcbS.storeIfNew( token[1] );
-    ltable[ wordS ][ wordT ] = prob;
-  }
-  std::cerr << std::endl;
-}
-
-
-void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
-                       const ALIGNMENT *targetToSourceAlignment, ostream &out)
-{
-  // get corresponding target non-terminal and output pair
-  ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT();
-  invertAlignment(phraseSource, phraseTarget, targetToSourceAlignment, sourceToTargetAlignment);
-  // output source symbols, except root, in rule table format
-  for (std::size_t i = 0; i < phraseSource->size()-1; ++i) {
-    const std::string &word = vcbS.getWord(phraseSource->at(i));
-    if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
-      out << word << " ";
-      continue;
-    }
-    const std::set<std::size_t> &alignmentPoints = sourceToTargetAlignment->at(i);
-    assert(alignmentPoints.size() == 1);
-    size_t j = *(alignmentPoints.begin());
-    if (inverseFlag) {
-      out << vcbT.getWord(phraseTarget->at(j)) << word << " ";
-    } else {
-      out << word << vcbT.getWord(phraseTarget->at(j)) << " ";
-    }
-  }
-  // output source root symbol
-  if (conditionOnTargetLhsFlag && !inverseFlag) {
-    out << "[X]";
-  } else {
-    out << vcbS.getWord(phraseSource->back());
-  }
-  delete sourceToTargetAlignment;
-}
-
-
-void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
-                       const ALIGNMENT *targetToSourceAlignment, ostream &out)
-{
-  // output target symbols, except root, in rule table format
-  for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {
-    const std::string &word = vcbT.getWord(phraseTarget->at(i));
-    if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
-      out << word << " ";
-      continue;
-    }
-    // get corresponding source non-terminal and output pair
-    std::set<std::size_t> alignmentPoints = targetToSourceAlignment->at(i);
-    assert(alignmentPoints.size() == 1);
-    int j = *(alignmentPoints.begin());
-    if (inverseFlag) {
-      out << word << vcbS.getWord(phraseSource->at(j)) << " ";
-    } else {
-      out << vcbS.getWord(phraseSource->at(j)) << word << " ";
-    }
-  }
-  // output target root symbol
-  if (conditionOnTargetLhsFlag) {
-    if (inverseFlag) {
-      out << "[X]";
-    } else {
-      out << vcbS.getWord(phraseSource->back());
-    }
-  } else {
-    out << vcbT.getWord(phraseTarget->back());
-  }
-}
-
-
-void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget,
-                     const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment) {
-  // typedef std::vector< std::set<size_t> > ALIGNMENT;
-
-  outSourceToTargetAlignment->clear();
-  size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size());
-  outSourceToTargetAlignment->resize(numberOfSourceSymbols);
-  // add alignment point
-  for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) {
-    for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin();
-          setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) {
-      size_t sourcePosition = *setIter;
-      outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition);
-    }
-  }
-}