Merge ../mosesdecoder into perf_moses2

This commit is contained in:
Hieu Hoang 2016-01-12 09:23:07 +00:00
commit d0a48e71ad
7 changed files with 229 additions and 16 deletions

View File

@ -148,7 +148,7 @@ function run_single_test () {
cd ..
touch giza-pp.ok
fi
./bjam $MCC_CONFIGURE_ARGS --with-giza="$(pwd)/giza-pp/bin" || err="bjam with-giza"
# Plain build without GIZA++; record the failing step in $err if bjam fails.
# $MCC_CONFIGURE_ARGS is left unquoted on purpose so it splits into separate arguments.
./bjam $MCC_CONFIGURE_ARGS || err="bjam"
srilm_dir=$(echo $MCC_CONFIGURE_ARGS | sed -r 's/.*--with-srilm=([^ ]+) .*/\1/')
mach_type=$($srilm_dir/sbin/machine-type)
mkdir -p "$WORKDIR/ems_workdir"

View File

@ -161,7 +161,9 @@ int
run_as_server()
{
#ifdef HAVE_XMLRPC_C
kill(getppid(),SIGALRM);
if (params.GetParam("daemon")) {
kill(getppid(),SIGALRM);
}
MosesServer::Server server(params);
return server.run(); // actually: don't return. see Server::run()
#else

View File

@ -6,6 +6,7 @@
#include "moses/PP/CountsPhraseProperty.h"
#include "moses/PP/SourceLabelsPhraseProperty.h"
#include "moses/PP/TargetPreferencesPhraseProperty.h"
#include "moses/PP/TreeStructurePhraseProperty.h"
#include "moses/PP/SpanLengthPhraseProperty.h"
#include "moses/PP/NonTermContextProperty.h"
@ -57,6 +58,7 @@ PhrasePropertyFactory::PhrasePropertyFactory()
MOSES_PNAME2("Counts", CountsPhraseProperty);
MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty);
MOSES_PNAME2("TargetPreferences", TargetPreferencesPhraseProperty);
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
MOSES_PNAME2("NonTermContext", NonTermContextProperty);

View File

@ -0,0 +1,123 @@
#include "moses/PP/TargetPreferencesPhraseProperty.h"
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <queue>
#include <assert.h>
#include <limits>
namespace Moses
{
void TargetPreferencesPhraseProperty::ProcessValue(const std::string &value)
{
std::istringstream tokenizer(value);
if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read number of non-terminals. Flawed property?");
}
assert( m_nNTs > 0 );
if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read overall rule count. Flawed property?");
}
assert( m_totalCount > 0.0 );
// read labelled rule items
std::priority_queue<float> ruleLabelledCountsPQ;
while (tokenizer.peek() != EOF) {
try {
TargetPreferencesPhrasePropertyItem item;
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
if (m_nNTs == 1) {
item.m_labelsRHSCount = m_totalCount;
} else { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
for (size_t i=0; i<m_nNTs-1; ++i) { // RHS non-terminal labels
size_t labelRHS;
if (! (tokenizer >> labelRHS) ) { // RHS non-terminal label
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read right-hand side label index. Flawed property?");
}
item.m_labelsRHS.push_back(labelRHS);
}
if (! (tokenizer >> item.m_labelsRHSCount)) {
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read right-hand side count. Flawed property?");
}
if (! (tokenizer >> numberOfLHSsGivenRHS)) {
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read number of left-hand sides. Flawed property?");
}
}
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS non-terminal labels seen with this RHS
size_t labelLHS;
if (! (tokenizer >> labelLHS)) { // LHS non-terminal label
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read left-hand side label index. Flawed property?");
}
float ruleLabelledCount;
if (! (tokenizer >> ruleLabelledCount)) {
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read count. Flawed property?");
}
item.m_labelsLHSList.push_back( std::make_pair(labelLHS,ruleLabelledCount) );
ruleLabelledCountsPQ.push(ruleLabelledCount);
}
m_labelItems.push_back(item);
} catch (const std::exception &e) {
UTIL_THROW2("TargetPreferencesPhraseProperty: Read error. Flawed property?");
}
}
// keep only top N label vectors
const size_t N=50;
if (ruleLabelledCountsPQ.size() > N) {
float topNRuleLabelledCount = std::numeric_limits<int>::max();
for (size_t i=0; !ruleLabelledCountsPQ.empty() && i<N; ++i) {
topNRuleLabelledCount = ruleLabelledCountsPQ.top();
ruleLabelledCountsPQ.pop();
}
size_t nKept=0;
std::list<TargetPreferencesPhrasePropertyItem>::iterator itemIter=m_labelItems.begin();
while (itemIter!=m_labelItems.end()) {
if (itemIter->m_labelsRHSCount < topNRuleLabelledCount) {
itemIter = m_labelItems.erase(itemIter);
} else {
std::list< std::pair<size_t,float> >::iterator itemLHSIter=(itemIter->m_labelsLHSList).begin();
while (itemLHSIter!=(itemIter->m_labelsLHSList).end()) {
if (itemLHSIter->second < topNRuleLabelledCount) {
itemLHSIter = (itemIter->m_labelsLHSList).erase(itemLHSIter);
} else {
if (nKept >= N) {
itemLHSIter = (itemIter->m_labelsLHSList).erase(itemLHSIter,(itemIter->m_labelsLHSList).end());
} else {
++nKept;
++itemLHSIter;
}
}
}
if ((itemIter->m_labelsLHSList).empty()) {
itemIter = m_labelItems.erase(itemIter);
} else {
++itemIter;
}
}
}
}
};
} // namespace Moses

View File

@ -0,0 +1,71 @@
#pragma once
#include "moses/PP/PhraseProperty.h"
#include "util/exception.hh"
#include <string>
#include <list>
namespace Moses
{
/**
 * One labelled entry of a TargetPreferences phrase property: a vector of
 * right-hand side non-terminal labels with its count, plus the left-hand side
 * labels (and their counts) stored for that right-hand side.
 * The fields are filled in by the friend class TargetPreferencesPhraseProperty.
 */
class TargetPreferencesPhrasePropertyItem
{
  friend class TargetPreferencesPhraseProperty;

public:
  // Start with a zero count so the getter never reads an indeterminate value.
  TargetPreferencesPhrasePropertyItem()
    : m_labelsRHSCount(0.0f)
  {}

  /** Count stored for this right-hand side label vector. */
  float GetTargetPreferencesRHSCount() const {
    return m_labelsRHSCount;
  }

  /** Right-hand side non-terminal label indices. */
  const std::list<size_t> &GetTargetPreferencesRHS() const {
    return m_labelsRHS;
  }

  /** (left-hand side label index, count) pairs for this right-hand side. */
  const std::list< std::pair<size_t,float> > &GetTargetPreferencesLHSList() const {
    return m_labelsLHSList;
  }

private:
  float m_labelsRHSCount;
  std::list<size_t> m_labelsRHS; // should be of size nNTs-1 (empty if initial rule, i.e. no right-hand side non-terminals)
  std::list< std::pair<size_t,float> > m_labelsLHSList; // list of left-hand sides for this right-hand side, with counts
};
/**
 * Phrase property holding target syntactic preference labels for a rule.
 * The value is parsed by ProcessValue(); the parsed result is exposed through
 * the accessors below. The raw value string is not retained.
 */
class TargetPreferencesPhraseProperty : public PhraseProperty
{
public:
  // Zero-initialise the scalar members so the accessors are well defined
  // even before ProcessValue() has been called.
  TargetPreferencesPhraseProperty()
    : m_nNTs(0)
    , m_totalCount(0.0f)
  {}

  /** Parses the property value and fills the members of this object. */
  virtual void ProcessValue(const std::string &value);

  /** Number of non-terminals read from the first token of the value. */
  size_t GetNumberOfNonTerminals() const {
    return m_nNTs;
  }

  /** Overall rule count read from the second token of the value. */
  float GetTotalCount() const {
    return m_totalCount;
  }

  /** Labelled items read from the value. */
  const std::list<TargetPreferencesPhrasePropertyItem> &GetTargetPreferencesItems() const {
    return m_labelItems;
  }

  /** Always throws: this property does not offer a value string. */
  virtual const std::string *GetValueString() const {
    UTIL_THROW2("TargetPreferencesPhraseProperty: value string not available in this phrase property");
    return NULL; // not reached; follows the throw above
  }

protected:
  size_t m_nNTs;
  float m_totalCount;
  std::list<TargetPreferencesPhrasePropertyItem> m_labelItems;
};
} // namespace Moses

View File

@ -697,14 +697,13 @@ void ExtractTask::saveTargetSyntacticPreference( const HoleCollection &holeColl,
const Hole &hole = *iterHoleList;
int labelI = labelIndex[ 2+holeCount ];
string targetLabel = "X";
int startT = hole.GetStart(1);
int endT = hole.GetEnd(1);
if (m_sentence.targetTree.HasNode(startT,endT)) {
rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelI]->label;
rule.targetSyntacticPreference += " ";
} else {
rule.targetSyntacticPreference += "X ";
rule.targetSyntacticPreference += "XRHS ";
}
++holeCount;
}
@ -815,7 +814,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
rule.targetSyntacticPreference += " ";
rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
} else {
rule.targetSyntacticPreference += " X";
rule.targetSyntacticPreference += " XLHS";
}
}
@ -1098,7 +1097,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
if (m_sentence.targetTree.HasNode(startT,endT)) {
rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else {
rule.targetSyntacticPreference += "X";
rule.targetSyntacticPreference += "XLHS";
}
}
@ -1233,14 +1232,30 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
{
ofstream grammarFile;
grammarFile.open(fileName.c_str());
std::string glueRulesPhraseProperty = "";
if (options.phraseOrientation) {
glueRulesPhraseProperty.append(" ||| ||| {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}");
glueRulesPhraseProperty.append(" {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}");
}
const size_t targetSyntacticPreferencesLabelGlueTop = 0;
const size_t targetSyntacticPreferencesLabelGlueX = 1;
if (!options.targetSyntax || options.targetSyntacticPreferences) {
grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0" << glueRulesPhraseProperty << endl
<< "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl;
grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0 ||| |||" << glueRulesPhraseProperty;
if (options.targetSyntacticPreferences) {
grammarFile << " {{TargetPreferences 1 1 " << targetSyntacticPreferencesLabelGlueTop << " 1}}";
}
grammarFile << std::endl;
grammarFile << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0 ||| |||" << glueRulesPhraseProperty;
if (options.targetSyntacticPreferences) {
grammarFile << " {{TargetPreferences 2 1 " << targetSyntacticPreferencesLabelGlueTop << " 1 1 " << targetSyntacticPreferencesLabelGlueTop << " 1}}";
}
grammarFile << std::endl;
grammarFile << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0 ||| |||" << glueRulesPhraseProperty;
if (options.targetSyntacticPreferences) {
grammarFile << " {{TargetPreferences 3 1 " << targetSyntacticPreferencesLabelGlueTop << " " << targetSyntacticPreferencesLabelGlueX << " 1 1 " << targetSyntacticPreferencesLabelGlueTop << " 1}}";
}
grammarFile << std::endl;
} else {
// choose a top label that is not already a label
string topLabel = "QQQQQQ";
@ -1251,21 +1266,21 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
}
}
// basic rules
grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| 0-0" << endl
<< "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1" << endl;
grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| 0-0" << std::endl
<< "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1" << std::endl;
// top rules
for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
i != targetTopLabelCollection.end(); i++ ) {
grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2" << endl;
grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2" << std::endl;
}
// glue rules
for( set<string>::const_iterator i = targetLabelCollection.begin();
i != targetLabelCollection.end(); i++ ) {
grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << endl;
grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << std::endl;
}
grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << endl; // glue rule for unknown word...
grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << std::endl; // glue rule for unknown word...
}
grammarFile.close();
}

View File

@ -314,7 +314,7 @@ if (!$inverse && defined($partsOfSpeechFile))
# merge target syntactic preferences labels files
if (!$inverse && defined($targetSyntacticPreferencesLabelsFile))
{
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $targetSyntacticPreferencesLabelsFile";
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $targetSyntacticPreferencesLabelsFile";
print STDERR "Merging target syntactic preferences labels files: $cmd \n";
`$cmd`;
}