source labels: integration into EMS

This commit is contained in:
Matthias Huck 2014-08-07 21:02:51 +01:00
parent cda9d1d5ae
commit c27cbf55ea
11 changed files with 324 additions and 41 deletions

View File

@ -9,6 +9,7 @@
#include "moses/PP/TreeStructurePhraseProperty.h" #include "moses/PP/TreeStructurePhraseProperty.h"
#include "moses/PP/SpanLengthPhraseProperty.h" #include "moses/PP/SpanLengthPhraseProperty.h"
#include "moses/PP/NonTermContextProperty.h" #include "moses/PP/NonTermContextProperty.h"
#include "moses/PP/OrientationPhraseProperty.h"
namespace Moses namespace Moses
{ {
@ -59,6 +60,7 @@ PhrasePropertyFactory::PhrasePropertyFactory()
MOSES_PNAME2("Tree",TreeStructurePhraseProperty); MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty); MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
MOSES_PNAME2("NonTermContext", NonTermContextProperty); MOSES_PNAME2("NonTermContext", NonTermContextProperty);
MOSES_PNAME2("Orientation", OrientationPhraseProperty);
} }
PhrasePropertyFactory::~PhrasePropertyFactory() PhrasePropertyFactory::~PhrasePropertyFactory()

View File

@ -16,12 +16,12 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
std::istringstream tokenizer(value); std::istringstream tokenizer(value);
if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side) if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property?"); UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
} }
assert( m_nNTs > 0 ); assert( m_nNTs > 0 );
if (! (tokenizer >> m_totalCount)) { // second token: overall rule count if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property?"); UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
} }
assert( m_totalCount > 0.0 ); assert( m_totalCount > 0.0 );
@ -32,7 +32,7 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
std::priority_queue<float> ruleLabelledCountsPQ; std::priority_queue<float> ruleLabelledCountsPQ;
while (tokenizer.peek() != EOF) { while (tokenizer.peek() != EOF) {
try { // try {
SourceLabelsPhrasePropertyItem item; SourceLabelsPhrasePropertyItem item;
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max(); size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
@ -46,28 +46,28 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
size_t sourceLabelRHS; size_t sourceLabelRHS;
if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property?"); UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
} }
item.m_sourceLabelsRHS.push_back(sourceLabelRHS); item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
} }
if (! (tokenizer >> item.m_sourceLabelsRHSCount)) { if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property?"); UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
} }
if (! (tokenizer >> numberOfLHSsGivenRHS)) { if (! (tokenizer >> numberOfLHSsGivenRHS)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property?"); UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
} }
} }
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
size_t sourceLabelLHS; size_t sourceLabelLHS;
if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property?"); UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
} }
float ruleSourceLabelledCount; float ruleSourceLabelledCount;
if (! (tokenizer >> ruleSourceLabelledCount)) { if (! (tokenizer >> ruleSourceLabelledCount)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property?"); UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
} }
item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) ); item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
ruleLabelledCountsPQ.push(ruleSourceLabelledCount); ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
@ -75,9 +75,9 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
m_sourceLabelItems.push_back(item); m_sourceLabelItems.push_back(item);
} catch (const std::exception &e) { // } catch (const std::exception &e) {
UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?"); // UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
} // }
} }
// keep only top N label vectors // keep only top N label vectors

View File

@ -0,0 +1,159 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "PropertiesConsolidator.h"
#include <sstream>
#include <limits>
#include <vector>
#include "moses/Util.h"
#include "phrase-extract/InputFileStream.h"
#include "phrase-extract/OutputFileStream.h"
namespace MosesTraining
{
// Reads the source label set from sourceLabelSetFile and switches on
// SourceLabels processing.
//
// Each non-blank line of the file is parsed as a label string followed by
// its numeric index. Any label set loaded earlier is discarded first.
//
// Raises an error (via UTIL_THROW2 / UTIL_THROW_IF2) if a line cannot be
// parsed as "label index" or if a label occurs more than once.
void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
{
  Moses::InputFileStream inFile(sourceLabelSetFile);

  // read source label set
  m_sourceLabels.clear();
  std::string line;
  while (getline(inFile, line)) {
    // tolerate blank lines (e.g. a trailing newline at the end of the file)
    if (line.find_first_not_of(" \t\r") == std::string::npos) {
      continue;
    }
    std::istringstream tokenizer(line);
    std::string label;
    size_t index;
    // Stream extraction reports failure through the stream state; it does not
    // throw unless exceptions() has been enabled on the stream. The result
    // therefore has to be tested explicitly, otherwise a malformed line would
    // be inserted with a meaningless index.
    if (! (tokenizer >> label >> index)) {
      UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
    }
    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
    UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
  }
  inFile.Close();

  m_sourceLabelsFlag = true;
}
// Rewrites the additional-properties field ("{{Key value}} {{Key value}} ...")
// of a rule table entry.
//
// Every property is written back as " {{Key value}}". Properties other than
// SourceLabels are copied unchanged, and so is SourceLabels while source
// label processing has not been activated. Otherwise each label string inside
// the SourceLabels value is replaced by its index from m_sourceLabels; counts
// and cardinalities are copied through.
//
// Returns the rewritten properties string; an empty input is returned as is.
//
// Raises an error (via UTIL_THROW2 / UTIL_THROW_IF2) if a property does not
// split into a key and a value, if the SourceLabels value is truncated or
// otherwise unreadable, or if it mentions a label that is not in the loaded
// label set.
std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
{
  if ( propertiesString.empty() ) {
    return propertiesString;
  }

  std::ostringstream out;
  std::vector<std::string> toks;
  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
  // Start at index 1: the first token (presumably whatever precedes the
  // first "{{") is not a property and is skipped.
  for (size_t i = 1; i < toks.size(); ++i) {
    std::string &tok = toks[i];
    if (tok.empty()) {
      continue;
    }

    // Cut the token just before the character preceding the last '}',
    // i.e. drop the closing "}}" of a well-formed property.
    size_t endPos = tok.rfind("}");
    tok = tok.substr(0, endPos - 1);

    std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
    // A runtime check instead of an assert: with assertions compiled out, a
    // property without a value would lead to an out-of-bounds access below.
    UTIL_THROW_IF2(keyValue.size() != 2, "Flawed property: expected a key and a value in \"" << tok << "\".");

    if ( keyValue[0].compare("SourceLabels") != 0 || !m_sourceLabelsFlag ) {
      // other additional property, or source labels processing is inactive:
      // output unchanged
      out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
      continue;
    }

    // SourceLabels additional property: replace strings with vocabulary indices
    out << " {{" << keyValue[0];

    std::istringstream tokenizer(keyValue[1]);

    size_t nNTs;
    double totalCount;

    if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
      UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
                  << "Flawed SourceLabels property?");
    }
    assert( nNTs > 0 );
    out << " " << nNTs;

    if (! (tokenizer >> totalCount)) { // second token: overall rule count
      UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
                  << "Flawed SourceLabels property?");
    }
    assert( totalCount > 0.0 );
    out << " " << totalCount;

    // Note: no try/catch around the item loop. Stream extraction does not
    // throw, so read failures are detected by testing the stream; and a
    // catch-all would replace the specific "label not found" error raised
    // below by a generic message.
    while (tokenizer.peek() != EOF) {

      size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();

      std::string token;

      if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
        for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
          if (! (tokenizer >> token)) { // RHS source non-terminal label
            UTIL_THROW2("Flawed item in SourceLabels property?");
          }
          std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
          UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
          out << " " << found->second;
        }

        if (! (tokenizer >> token)) { // sourceLabelsRHSCount
          UTIL_THROW2("Flawed item in SourceLabels property?");
        }
        out << " " << token;

        if (! (tokenizer >> numberOfLHSsGivenRHS)) {
          UTIL_THROW2("Flawed item in SourceLabels property?");
        }
        out << " " << numberOfLHSsGivenRHS;
      }

      for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
        if (! (tokenizer >> token)) { // LHS source non-terminal label
          UTIL_THROW2("Flawed item in SourceLabels property?");
        }
        std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
        UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
        out << " " << found->second;

        if (! (tokenizer >> token)) { // ruleSourceLabelledCount
          UTIL_THROW2("Flawed item in SourceLabels property?");
        }
        out << " " << token;
      }
    }

    out << "}}";
  }

  return out.str();
}
} // namespace MosesTraining

View File

@ -0,0 +1,48 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <string>
#include <map>
namespace MosesTraining
{
// Helper for the consolidation step: post-processes the additional
// properties string of a rule table entry. Source label processing is
// optional and stays off until ActivateSourceLabelsProcessing() is called.
class PropertiesConsolidator
{
public:
  // Creates a consolidator with source label processing switched off.
  PropertiesConsolidator()
    : m_sourceLabelsFlag(false)
  {}

  // Loads the source label set from the given file and switches source
  // label processing on.
  void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);

  // Returns the processed version of the given properties string.
  std::string ProcessPropertiesString(const std::string &propertiesString) const;

private:
  // true once a source label set has been loaded
  bool m_sourceLabelsFlag;

  // source label string -> numeric index
  std::map<std::string,size_t> m_sourceLabels;
};
} // namespace MosesTraining

View File

@ -28,6 +28,7 @@
#include "tables-core.h" #include "tables-core.h"
#include "InputFileStream.h" #include "InputFileStream.h"
#include "OutputFileStream.h" #include "OutputFileStream.h"
#include "PropertiesConsolidator.h"
using namespace std; using namespace std;
@ -37,13 +38,14 @@ bool phraseCountFlag = false;
bool lowCountFlag = false; bool lowCountFlag = false;
bool goodTuringFlag = false; bool goodTuringFlag = false;
bool kneserNeyFlag = false; bool kneserNeyFlag = false;
bool sourceLabelsFlag = false;
bool logProbFlag = false; bool logProbFlag = false;
inline float maybeLogProb( float a ) inline float maybeLogProb( float a )
{ {
return logProbFlag ? log(a) : a; return logProbFlag ? log(a) : a;
} }
void processFiles( char*, char*, char*, char* ); void processFiles( char*, char*, char*, char*, char* );
void loadCountOfCounts( char* ); void loadCountOfCounts( char* );
void breakdownCoreAndSparse( string combined, string &core, string &sparse ); void breakdownCoreAndSparse( string combined, string &core, string &sparse );
bool getLine( istream &fileP, vector< string > &item ); bool getLine( istream &fileP, vector< string > &item );
@ -57,13 +59,14 @@ int main(int argc, char* argv[])
<< "consolidating direct and indirect rule tables\n"; << "consolidating direct and indirect rule tables\n";
if (argc < 4) { if (argc < 4) {
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] \n"; cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] \n";
exit(1); exit(1);
} }
char* &fileNameDirect = argv[1]; char* &fileNameDirect = argv[1];
char* &fileNameIndirect = argv[2]; char* &fileNameIndirect = argv[2];
char* &fileNameConsolidated = argv[3]; char* &fileNameConsolidated = argv[3];
char* fileNameCountOfCounts; char* fileNameCountOfCounts;
char* fileNameSourceLabelSet;
for(int i=4; i<argc; i++) { for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) { if (strcmp(argv[i],"--Hierarchical") == 0) {
@ -114,13 +117,21 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--LogProb") == 0) { } else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true; logProbFlag = true;
cerr << "using log-probabilities\n"; cerr << "using log-probabilities\n";
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
sourceLabelsFlag = true;
if (i+1==argc) {
cerr << "ERROR: specify source label set file!\n";
exit(1);
}
fileNameSourceLabelSet = argv[++i];
cerr << "processing source labels property\n";
} else { } else {
cerr << "ERROR: unknown option " << argv[i] << endl; cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1); exit(1);
} }
} }
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts ); processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
} }
vector< float > countOfCounts; vector< float > countOfCounts;
@ -169,7 +180,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9; if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
} }
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts ) void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
{ {
if (goodTuringFlag || kneserNeyFlag) if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts ); loadCountOfCounts( fileNameCountOfCounts );
@ -198,6 +209,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
exit(1); exit(1);
} }
// create properties consolidator
// (in case any additional phrase property requires further processing)
MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
if (sourceLabelsFlag) {
propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
}
// loop through all extracted phrase translations // loop through all extracted phrase translations
int i=0; int i=0;
while(true) { while(true) {
@ -307,12 +325,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// counts, for debugging // counts, for debugging
fileConsolidated << "||| " << countE << " " << countF << " " << countEF; fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
// count bin feature (as a sparse feature) // sparse features
fileConsolidated << " |||"; fileConsolidated << " |||";
if (directSparseScores.compare("") != 0) if (directSparseScores.compare("") != 0)
fileConsolidated << " " << directSparseScores; fileConsolidated << " " << directSparseScores;
if (indirectSparseScores.compare("") != 0) if (indirectSparseScores.compare("") != 0)
fileConsolidated << " " << indirectSparseScores; fileConsolidated << " " << indirectSparseScores;
// count bin feature (as a sparse feature)
if (sparseCountBinFeatureFlag) { if (sparseCountBinFeatureFlag) {
bool foundBin = false; bool foundBin = false;
for(size_t i=0; i < countBin.size(); i++) { for(size_t i=0; i < countBin.size(); i++) {
@ -332,9 +351,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
} }
// arbitrary key-value pairs // arbitrary key-value pairs
fileConsolidated << " ||| "; fileConsolidated << " |||";
if (itemDirect.size() >= 6) { if (itemDirect.size() >= 6) {
fileConsolidated << itemDirect[5]; //if (sourceLabelsFlag) {
fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
//} else {
// fileConsolidated << itemDirect[5];
//}
} }
fileConsolidated << endl; fileConsolidated << endl;

View File

@ -617,9 +617,8 @@ void ExtractGHKM::WriteGlueGrammar(
} }
} }
std::string sourceTopLabel = "TOPLABEL"; size_t sourceLabelGlueTop = 0;
std::string sourceSLabel = "S"; size_t sourceLabelGlueX = 1;
std::string sourceSomeLabel = "SOMELABEL";
// basic rules // basic rules
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||"; out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||";
@ -627,7 +626,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " <s>]}}"; out << " {{Tree [" << topLabel << " <s>]}}";
} }
if (options.sourceLabels) { if (options.sourceLabels) {
out << " {{SourceLabels 1 1 " << sourceTopLabel << " 1}}"; out << " {{SourceLabels 1 1 " << sourceLabelGlueTop << " 1}}";
} }
out << std::endl; out << std::endl;
@ -636,7 +635,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}"; out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}";
} }
if (options.sourceLabels) { if (options.sourceLabels) {
out << " {{SourceLabels 2 1 " << sourceTopLabel << " 1 1 " << sourceTopLabel << " 1}}"; out << " {{SourceLabels 2 1 " << sourceLabelGlueTop << " 1 1 " << sourceLabelGlueTop << " 1}}";
} }
out << std::endl; out << std::endl;
@ -648,7 +647,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}"; out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}";
} }
if (options.sourceLabels) { if (options.sourceLabels) {
out << " {{SourceLabels 2 1 " << sourceSLabel << " 1 1 " << sourceTopLabel << " 1}}"; out << " {{SourceLabels 2 1 " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
} }
out << std::endl; out << std::endl;
} }
@ -661,7 +660,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}"; out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
} }
if (options.sourceLabels) { if (options.sourceLabels) {
out << " {{SourceLabels 3 2.718 " << sourceTopLabel << " " << sourceSomeLabel << " 2.718 1 " << sourceTopLabel << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL" out << " {{SourceLabels 3 2.718 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 2.718 1 " << sourceLabelGlueTop << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL"
} }
out << std::endl; out << std::endl;
} }
@ -672,7 +671,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}"; out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
} }
if (options.sourceLabels) { if (options.sourceLabels) {
out << " {{SourceLabels 3 1 " << sourceTopLabel << " " << sourceSomeLabel << " 1 1 " << sourceTopLabel << " 1}}"; // TODO: there should be better options than using "SOMELABEL" out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
} }
out << std::endl; out << std::endl;
} }

View File

@ -1860,7 +1860,7 @@ sub define_tuning_tune {
$cmd .= " --lambdas \"$lambda\"" if $lambda; $cmd .= " --lambdas \"$lambda\"" if $lambda;
$cmd .= " --continue" if $tune_continue; $cmd .= " --continue" if $tune_continue;
$cmd .= " --skip-decoder" if $skip_decoder; $cmd .= " --skip-decoder" if $skip_decoder;
$cmd .= " --inputtype $tune_inputtype" if $tune_inputtype; $cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
my $qsub_args = &get_qsub_args("TUNING"); my $qsub_args = &get_qsub_args("TUNING");
$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args); $cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
@ -2217,6 +2217,10 @@ sub define_training_extract_phrases {
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model","")); my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file "; $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
} }
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
}
} }
my $extract_settings = &get("TRAINING:extract-settings"); my $extract_settings = &get("TRAINING:extract-settings");
@ -2254,6 +2258,11 @@ sub define_training_build_ttable {
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model","")); my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file "; $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
} }
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
}
} }
&create_step($step_id,$cmd); &create_step($step_id,$cmd);
@ -2438,6 +2447,12 @@ sub define_training_create_config {
} }
} }
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
}
# sparse lexical features provide additional content for config file # sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features; $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
@ -3412,7 +3427,7 @@ sub check_backoff_and_get_array {
# the following two functions deal with getting information about # the following two functions deal with getting information about
# files that are passed between steps. this are either specified # files that are passed between steps. this are either specified
# in the meta file (default) or in the configuration file (here called # in the meta file (default) or in the configuration file (here called
# 'specified', in the step management refered to as 'given'). # 'specified', in the step management referred to as 'given').
sub get_specified_or_default_file { sub get_specified_or_default_file {
my ($specified_module,$specified_set,$specified_parameter, my ($specified_module,$specified_set,$specified_parameter,

View File

@ -219,14 +219,14 @@ foreach (@children) {
waitpid($_, 0); waitpid($_, 0);
} }
# glue rules # merge glue rules
if (defined($glueFile)) { if (defined($glueFile)) {
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile"; my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
print STDERR "Merging glue rules: $cmd \n"; print STDERR "Merging glue rules: $cmd \n";
print STDERR `$cmd`; print STDERR `$cmd`;
} }
# phrase orientation priors (GHKM extraction) # merge phrase orientation priors (GHKM extraction)
if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
print STDERR "Merging phrase orientation priors\n"; print STDERR "Merging phrase orientation priors\n";

View File

@ -27,10 +27,22 @@ my $scoreCmd = $ARGV[2];
my $extractFile = $ARGV[3]; # 1st arg of extract argument my $extractFile = $ARGV[3]; # 1st arg of extract argument
my $lexFile = $ARGV[4]; my $lexFile = $ARGV[4];
my $ptHalf = $ARGV[5]; # output my $ptHalf = $ARGV[5]; # output
my $inverse = 0;
my $sourceLabelsFile;
my $otherExtractArgs= ""; my $otherExtractArgs= "";
for (my $i = 6; $i < $#ARGV; ++$i) for (my $i = 6; $i < $#ARGV; ++$i)
{ {
if ($ARGV[$i] eq '--SourceLabels') {
$sourceLabelsFile = $ARGV[++$i];
$otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
next;
}
if ($ARGV[$i] eq '--Inverse') {
$inverse = 1;
$otherExtractArgs .= $ARGV[$i] ." ";
next;
}
$otherExtractArgs .= $ARGV[$i] ." "; $otherExtractArgs .= $ARGV[$i] ." ";
} }
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs #$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs
@ -258,6 +270,14 @@ if (-e $cocPath)
close(FHCOC); close(FHCOC);
} }
# merge source label files
if (!$inverse && defined($sourceLabelsFile))
{
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $sourceLabelsFile";
print STDERR "Merging source label files: $cmd \n";
`$cmd`;
}
$cmd = "rm -rf $TMPDIR \n"; $cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd; print STDERR $cmd;
systemCheck($cmd); systemCheck($cmd);

View File

@ -127,8 +127,8 @@ my $___NOCASE = 0;
# Use "--nonorm" to non normalize translation before computing scores # Use "--nonorm" to non normalize translation before computing scores
my $___NONORM = 0; my $___NONORM = 0;
# set 0 if input type is text, set 1 if input type is confusion network # set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
my $___INPUTTYPE = 0; my $___INPUTTYPE;
my $mertdir = undef; # path to new mert directory my $mertdir = undef; # path to new mert directory
@ -1228,14 +1228,18 @@ sub run_decoder {
if (defined $___JOBS && $___JOBS > 0) { if (defined $___JOBS && $___JOBS > 0) {
die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA; die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
} else { } else {
my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE"; my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
if ($___HG_MIRA) { if ($___HG_MIRA) {
safesystem("rm -rf $hypergraph_dir"); safesystem("rm -rf $hypergraph_dir");
$nbest_list_cmd = "-output-search-graph-hypergraph true gz"; $nbest_list_cmd = "-output-search-graph-hypergraph true gz";
} }
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out"; $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG";
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
} }
print STDERR "Executing: $decoder_cmd \n"; print STDERR "Executing: $decoder_cmd \n";
@ -1309,7 +1313,9 @@ sub get_featlist_from_moses {
print STDERR "Using cached features list: $featlistfn\n"; print STDERR "Using cached features list: $featlistfn\n";
} else { } else {
print STDERR "Asking moses for feature names and values from $___CONFIG\n"; print STDERR "Asking moses for feature names and values from $___CONFIG\n";
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn"; my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
$cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$cmd .= " -show-weights > $featlistfn";
print STDERR "Executing: $cmd\n"; print STDERR "Executing: $cmd\n";
safesystem($cmd) or die "Failed to run moses with the config $configfn"; safesystem($cmd) or die "Failed to run moses with the config $configfn";
} }

View File

@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE, $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE, $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS, $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE, $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
@ -112,6 +112,8 @@ $_HELP = 1
'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS, 'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION, 'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation 'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
'pcfg' => \$_PCFG, 'pcfg' => \$_PCFG,
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1, 'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2, 'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
@ -1427,10 +1429,15 @@ sub extract_phrase {
$cmd .= " --PCFG" if $_PCFG; $cmd .= " --PCFG" if $_PCFG;
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2; $cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1; $cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS; if (defined($_GHKM))
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION; {
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE); $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
if (!defined($_GHKM)) { $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
}
else
{
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX; $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX; $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
$cmd .= " --MaxSpan $max_length"; $cmd .= " --MaxSpan $max_length";
@ -1609,6 +1616,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS; $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION; $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE); $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " $DOMAIN" if $DOMAIN; $cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE; $cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
@ -1659,6 +1667,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN; $cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING; $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY; $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " | gzip -c > $ttable_file.gz"; $cmd .= " | gzip -c > $ttable_file.gz";
@ -2164,6 +2173,7 @@ sub create_ini {
print INI "WordPenalty\n"; print INI "WordPenalty\n";
print INI "PhrasePenalty\n"; print INI "PhrasePenalty\n";
print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE); print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
print INI "SoftSourceSyntacticConstraintsFeature sourceLabelSetFile=$_GHKM_SOURCE_LABELS_FILE\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
print INI $feature_spec; print INI $feature_spec;
print INI "\n# dense weights for feature functions\n"; print INI "\n# dense weights for feature functions\n";
@ -2171,6 +2181,7 @@ sub create_ini {
print INI "UnknownWordPenalty0= 1\n"; print INI "UnknownWordPenalty0= 1\n";
print INI "WordPenalty0= -1\n"; print INI "WordPenalty0= -1\n";
print INI "PhrasePenalty0= 0.2\n"; print INI "PhrasePenalty0= 0.2\n";
print INI "SoftSourceSyntacticConstraintsFeature0= 0.3 -0.3 -0.3\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
print INI $weight_spec; print INI $weight_spec;
close(INI); close(INI);
} }