From c27cbf55eacd4c72685507b9bab624437d9adb4b Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Thu, 7 Aug 2014 21:02:51 +0100 Subject: [PATCH] source labels: integration into EMS --- moses/PP/Factory.cpp | 2 + moses/PP/SourceLabelsPhraseProperty.cpp | 22 +-- phrase-extract/PropertiesConsolidator.cpp | 159 ++++++++++++++++++++ phrase-extract/PropertiesConsolidator.h | 48 ++++++ phrase-extract/consolidate-main.cpp | 37 ++++- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 15 +- scripts/ems/experiment.perl | 19 ++- scripts/generic/extract-parallel.perl | 4 +- scripts/generic/score-parallel.perl | 20 +++ scripts/training/mert-moses.pl | 18 ++- scripts/training/train-model.perl | 21 ++- 11 files changed, 324 insertions(+), 41 deletions(-) create mode 100644 phrase-extract/PropertiesConsolidator.cpp create mode 100644 phrase-extract/PropertiesConsolidator.h diff --git a/moses/PP/Factory.cpp b/moses/PP/Factory.cpp index 4e9bfbf0e..fd146005b 100644 --- a/moses/PP/Factory.cpp +++ b/moses/PP/Factory.cpp @@ -9,6 +9,7 @@ #include "moses/PP/TreeStructurePhraseProperty.h" #include "moses/PP/SpanLengthPhraseProperty.h" #include "moses/PP/NonTermContextProperty.h" +#include "moses/PP/OrientationPhraseProperty.h" namespace Moses { @@ -59,6 +60,7 @@ PhrasePropertyFactory::PhrasePropertyFactory() MOSES_PNAME2("Tree",TreeStructurePhraseProperty); MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty); MOSES_PNAME2("NonTermContext", NonTermContextProperty); + MOSES_PNAME2("Orientation", OrientationPhraseProperty); } PhrasePropertyFactory::~PhrasePropertyFactory() diff --git a/moses/PP/SourceLabelsPhraseProperty.cpp b/moses/PP/SourceLabelsPhraseProperty.cpp index bca5c9a30..8e6a5dd6d 100644 --- a/moses/PP/SourceLabelsPhraseProperty.cpp +++ b/moses/PP/SourceLabelsPhraseProperty.cpp @@ -16,12 +16,12 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value) std::istringstream tokenizer(value); if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side) - UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property?"); + UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value); } assert( m_nNTs > 0 ); if (! (tokenizer >> m_totalCount)) { // second token: overall rule count - UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property?"); + UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value); } assert( m_totalCount > 0.0 ); @@ -32,7 +32,7 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value) std::priority_queue ruleLabelledCountsPQ; while (tokenizer.peek() != EOF) { - try { +// try { SourceLabelsPhrasePropertyItem item; size_t numberOfLHSsGivenRHS = std::numeric_limits::max(); @@ -46,28 +46,28 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value) for (size_t i=0; i> sourceLabelRHS) ) { // RHS source non-terminal label - UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property?"); + UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value); } item.m_sourceLabelsRHS.push_back(sourceLabelRHS); } if (! (tokenizer >> item.m_sourceLabelsRHSCount)) { - UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property?"); + UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value); } if (! (tokenizer >> numberOfLHSsGivenRHS)) { - UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property?"); + UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value); } } for (size_t i=0; i> sourceLabelLHS)) { // LHS source non-terminal label - UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property?"); + UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value); } float ruleSourceLabelledCount; if (! (tokenizer >> ruleSourceLabelledCount)) { - UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property?"); + UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value); } item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) ); ruleLabelledCountsPQ.push(ruleSourceLabelledCount); @@ -75,9 +75,9 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value) m_sourceLabelItems.push_back(item); - } catch (const std::exception &e) { - UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?"); - } +// } catch (const std::exception &e) { +// UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?"); +// } } // keep only top N label vectors diff --git a/phrase-extract/PropertiesConsolidator.cpp b/phrase-extract/PropertiesConsolidator.cpp new file mode 100644 index 000000000..642c48672 --- /dev/null +++ b/phrase-extract/PropertiesConsolidator.cpp @@ -0,0 +1,159 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "PropertiesConsolidator.h" + +#include +#include +#include + +#include "moses/Util.h" +#include "phrase-extract/InputFileStream.h" +#include "phrase-extract/OutputFileStream.h" + + +namespace MosesTraining +{ + +void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile) +{ + Moses::InputFileStream inFile(sourceLabelSetFile); + + // read source label set + m_sourceLabels.clear(); + std::string line; + while (getline(inFile, line)) { + std::istringstream tokenizer(line); + std::string label; + size_t index; + try { + tokenizer >> label >> index; + } catch (const std::exception &e) { + UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " ."); + } + std::pair< std::map::iterator, bool > inserted = m_sourceLabels.insert( std::pair(label,index) ); + UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once."); + } + + inFile.Close(); + + m_sourceLabelsFlag = true; +} + + +std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const +{ + if ( propertiesString.empty() ) { + return propertiesString; + } + + std::ostringstream out; + std::vector toks; + Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{"); + for (size_t i = 1; i < toks.size(); ++i) { + std::string &tok = toks[i]; + if (tok.empty()) { + continue; + } + size_t endPos = tok.rfind("}"); + tok = tok.substr(0, endPos - 1); + std::vector keyValue = Moses::TokenizeFirstOnly(tok, " "); + assert(keyValue.size() == 2); + + if ( !keyValue[0].compare("SourceLabels") ) { + + if ( m_sourceLabelsFlag ) { + + // SourceLabels additional property: replace strings with vocabulary indices + out << " {{" << keyValue[0]; + + std::istringstream tokenizer(keyValue[1]); + + size_t nNTs; + double totalCount; + + if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side) + UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. " + << "Flawed SourceLabels property?"); + } + assert( nNTs > 0 ); + out << " " << nNTs; + + if (! (tokenizer >> totalCount)) { // second token: overall rule count + UTIL_THROW2("Not able to read overall rule count from SourceLabels property. " + << "Flawed SourceLabels property?"); + } + assert( totalCount > 0.0 ); + out << " " << totalCount; + + while (tokenizer.peek() != EOF) { + try { + + size_t numberOfLHSsGivenRHS = std::numeric_limits::max(); + + std::string token; + + if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule + for (size_t i=0; i> token; // RHS source non-terminal label + std::map::const_iterator found = m_sourceLabels.find(token); + UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set."); + out << " " << found->second; + } + + tokenizer >> token; // sourceLabelsRHSCount + out << " " << token; + + tokenizer >> numberOfLHSsGivenRHS; + out << " " << numberOfLHSsGivenRHS; + } + + for (size_t i=0; i> token; // LHS source non-terminal label + std::map::const_iterator found = m_sourceLabels.find(token); + UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set."); + out << " " << found->second; + + tokenizer >> token; // ruleSourceLabelledCount + out << " " << token; + } + + } catch (const std::exception &e) { + UTIL_THROW2("Flawed item in SourceLabels property?"); + } + } + + out << "}}"; + + } else { // don't process source labels additional property + out << " {{" << keyValue[0] << " " << keyValue[1] << "}}"; + } + + } else { + + // output other additional property + out << " {{" << keyValue[0] << " " << keyValue[1] << "}}"; + } + } + + return out.str(); +} + +} // namespace MosesTraining + diff --git a/phrase-extract/PropertiesConsolidator.h b/phrase-extract/PropertiesConsolidator.h new file mode 100644 index 000000000..cc6a7a835 --- /dev/null +++ b/phrase-extract/PropertiesConsolidator.h @@ -0,0 +1,48 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + + +#pragma once + +#include +#include + + +namespace MosesTraining +{ + +class PropertiesConsolidator +{ +public: + + PropertiesConsolidator() : m_sourceLabelsFlag(false) {}; + + void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile); + + std::string ProcessPropertiesString(const std::string &propertiesString) const; + +private: + + bool m_sourceLabelsFlag; + std::map m_sourceLabels; + +}; + +} // namespace MosesTraining + diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index a2174805c..10697a956 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -28,6 +28,7 @@ #include "tables-core.h" #include "InputFileStream.h" #include "OutputFileStream.h" +#include "PropertiesConsolidator.h" using namespace std; @@ -37,13 +38,14 @@ bool phraseCountFlag = false; bool lowCountFlag = false; bool goodTuringFlag = false; bool kneserNeyFlag = false; +bool sourceLabelsFlag = false; bool logProbFlag = false; inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; } -void processFiles( char*, char*, char*, char* ); +void processFiles( char*, char*, char*, char*, char* ); void loadCountOfCounts( char* ); void breakdownCoreAndSparse( string combined, string &core, string &sparse ); bool getLine( istream &fileP, vector< string > &item ); @@ -57,13 +59,14 @@ int main(int argc, char* argv[]) << "consolidating direct and indirect rule tables\n"; if (argc < 4) { - cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] \n"; + cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] \n"; exit(1); } char* &fileNameDirect = argv[1]; char* &fileNameIndirect = argv[2]; char* &fileNameConsolidated = argv[3]; char* fileNameCountOfCounts; + char* fileNameSourceLabelSet; for(int i=4; i countOfCounts; @@ -169,7 +180,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts ) if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9; } -void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts ) +void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet ) { if (goodTuringFlag || kneserNeyFlag) loadCountOfCounts( fileNameCountOfCounts ); @@ -198,6 +209,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC exit(1); } + // create properties consolidator + // (in case any additional phrase property requires further processing) + MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator(); + if (sourceLabelsFlag) { + propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet); + } + // loop through all extracted phrase translations int i=0; while(true) { @@ -307,12 +325,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC // counts, for debugging fileConsolidated << "||| " << countE << " " << countF << " " << countEF; - // count bin feature (as a sparse feature) + // sparse features fileConsolidated << " |||"; if (directSparseScores.compare("") != 0) fileConsolidated << " " << directSparseScores; if (indirectSparseScores.compare("") != 0) fileConsolidated << " " << indirectSparseScores; + // count bin feature (as a sparse feature) if (sparseCountBinFeatureFlag) { bool foundBin = false; for(size_t i=0; i < countBin.size(); i++) { @@ -332,9 +351,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC } // arbitrary key-value pairs - fileConsolidated << " ||| "; + fileConsolidated << " |||"; if (itemDirect.size() >= 6) { - fileConsolidated << itemDirect[5]; + //if (sourceLabelsFlag) { + fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]); + //} else { + // fileConsolidated << itemDirect[5]; + //} } fileConsolidated << endl; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 36dfee2e5..774416079 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -617,9 +617,8 @@ void ExtractGHKM::WriteGlueGrammar( } } - std::string sourceTopLabel = "TOPLABEL"; - std::string sourceSLabel = "S"; - std::string sourceSomeLabel = "SOMELABEL"; + size_t sourceLabelGlueTop = 0; + size_t sourceLabelGlueX = 1; // basic rules out << " [X] ||| [" << topLabel << "] ||| 1 ||| ||| ||| |||"; @@ -627,7 +626,7 @@ void ExtractGHKM::WriteGlueGrammar( out << " {{Tree [" << topLabel << " ]}}"; } if (options.sourceLabels) { - out << " {{SourceLabels 1 1 " << sourceTopLabel << " 1}}"; + out << " {{SourceLabels 1 1 " << sourceLabelGlueTop << " 1}}"; } out << std::endl; @@ -636,7 +635,7 @@ void ExtractGHKM::WriteGlueGrammar( out << " {{Tree [" << topLabel << " [" << topLabel << "] ]}}"; } if (options.sourceLabels) { - out << " {{SourceLabels 2 1 " << sourceTopLabel << " 1 1 " << sourceTopLabel << " 1}}"; + out << " {{SourceLabels 2 1 " << sourceLabelGlueTop << " 1 1 " << sourceLabelGlueTop << " 1}}"; } out << std::endl; @@ -648,7 +647,7 @@ void ExtractGHKM::WriteGlueGrammar( out << " {{Tree [" << topLabel << " [" << i->first << "] ]}}"; } if (options.sourceLabels) { - out << " {{SourceLabels 2 1 " << sourceSLabel << " 1 1 " << sourceTopLabel << " 1}}"; + out << " {{SourceLabels 2 1 " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}"; } out << std::endl; } @@ -661,7 +660,7 @@ void ExtractGHKM::WriteGlueGrammar( out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}"; } if (options.sourceLabels) { - out << " {{SourceLabels 3 2.718 " << sourceTopLabel << " " << sourceSomeLabel << " 2.718 1 " << sourceTopLabel << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL" + out << " {{SourceLabels 3 2.718 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 2.718 1 " << sourceLabelGlueTop << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL" } out << std::endl; } @@ -672,7 +671,7 @@ void ExtractGHKM::WriteGlueGrammar( out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}"; } if (options.sourceLabels) { - out << " {{SourceLabels 3 1 " << sourceTopLabel << " " << sourceSomeLabel << " 1 1 " << sourceTopLabel << " 1}}"; // TODO: there should be better options than using "SOMELABEL" + out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}"; // TODO: there should be better options than using "SOMELABEL" } out << std::endl; } diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 1108bec1b..59c064224 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1860,7 +1860,7 @@ sub define_tuning_tune { $cmd .= " --lambdas \"$lambda\"" if $lambda; $cmd .= " --continue" if $tune_continue; $cmd .= " --skip-decoder" if $skip_decoder; - $cmd .= " --inputtype $tune_inputtype" if $tune_inputtype; + $cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype); my $qsub_args = &get_qsub_args("TUNING"); $cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args); @@ -2217,6 +2217,10 @@ sub define_training_extract_phrases { my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model","")); $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file "; } + + if (&get("TRAINING:ghkm-source-labels")) { + $cmd .= "-ghkm-source-labels "; + } } my $extract_settings = &get("TRAINING:extract-settings"); @@ -2254,6 +2258,11 @@ sub define_training_build_ttable { my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model","")); $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file "; } + if (&get("TRAINING:ghkm-source-labels")) { + $cmd .= "-ghkm-source-labels "; + my $source_labels_file = &versionize(&long_file_name("source-labels","model","")); + $cmd .= "-ghkm-source-labels-file $source_labels_file "; + } } &create_step($step_id,$cmd); @@ -2438,6 +2447,12 @@ sub define_training_create_config { } } + if (&get("TRAINING:ghkm-source-labels")) { + $cmd .= "-ghkm-source-labels "; + my $source_labels_file = &versionize(&long_file_name("source-labels","model","")); + $cmd .= "-ghkm-source-labels-file $source_labels_file "; + } + # sparse lexical features provide additional content for config file $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features; @@ -3412,7 +3427,7 @@ sub check_backoff_and_get_array { # the following two functions deal with getting information about # files that are passed between steps. this are either specified # in the meta file (default) or in the configuration file (here called -# 'specified', in the step management refered to as 'given'). +# 'specified', in the step management referred to as 'given'). sub get_specified_or_default_file { my ($specified_module,$specified_set,$specified_parameter, diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index ff6a058b5..433e95b9d 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -219,14 +219,14 @@ foreach (@children) { waitpid($_, 0); } -# glue rules +# merge glue rules if (defined($glueFile)) { my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile"; print STDERR "Merging glue rules: $cmd \n"; print STDERR `$cmd`; } -# phrase orientation priors (GHKM extraction) +# merge phrase orientation priors (GHKM extraction) if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { print STDERR "Merging phrase orientation priors\n"; diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 213b9e90e..7835d3826 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -27,10 +27,22 @@ my $scoreCmd = $ARGV[2]; my $extractFile = $ARGV[3]; # 1st arg of extract argument my $lexFile = $ARGV[4]; my $ptHalf = $ARGV[5]; # output +my $inverse = 0; +my $sourceLabelsFile; my $otherExtractArgs= ""; for (my $i = 6; $i < $#ARGV; ++$i) { + if ($ARGV[$i] eq '--SourceLabels') { + $sourceLabelsFile = $ARGV[++$i]; + $otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet "; + next; + } + if ($ARGV[$i] eq '--Inverse') { + $inverse = 1; + $otherExtractArgs .= $ARGV[$i] ." "; + next; + } $otherExtractArgs .= $ARGV[$i] ." "; } #$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs @@ -258,6 +270,14 @@ if (-e $cocPath) close(FHCOC); } +# merge source label files +if (!$inverse && defined($sourceLabelsFile)) +{ + my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $sourceLabelsFile"; + print STDERR "Merging source label files: $cmd \n"; + `$cmd`; +} + $cmd = "rm -rf $TMPDIR \n"; print STDERR $cmd; systemCheck($cmd); diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index dd41538f5..b965fbdd5 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -127,8 +127,8 @@ my $___NOCASE = 0; # Use "--nonorm" to non normalize translation before computing scores my $___NONORM = 0; -# set 0 if input type is text, set 1 if input type is confusion network -my $___INPUTTYPE = 0; +# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree +my $___INPUTTYPE; my $mertdir = undef; # path to new mert directory @@ -1228,14 +1228,18 @@ sub run_decoder { if (defined $___JOBS && $___JOBS > 0) { die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA; - $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; + $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG"; + $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); + $decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; } else { - my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE"; + my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct"; if ($___HG_MIRA) { safesystem("rm -rf $hypergraph_dir"); $nbest_list_cmd = "-output-search-graph-hypergraph true gz"; } - $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out"; + $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG"; + $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); + $decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out"; } print STDERR "Executing: $decoder_cmd \n"; @@ -1309,7 +1313,9 @@ sub get_featlist_from_moses { print STDERR "Using cached features list: $featlistfn\n"; } else { print STDERR "Asking moses for feature names and values from $___CONFIG\n"; - my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn"; + my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn"; + $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); + $cmd .= " -show-weights > $featlistfn"; print STDERR "Executing: $cmd\n"; safesystem($cmd) or die "Failed to run moses with the config $configfn"; } diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index da8e677bc..8f661b812 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_ $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE, @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE, - $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS, + $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS, $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE, $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, @@ -112,6 +112,8 @@ $_HELP = 1 'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS, 'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION, 'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation + 'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS, + 'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE, 'pcfg' => \$_PCFG, 'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1, 'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2, @@ -1427,10 +1429,15 @@ sub extract_phrase { $cmd .= " --PCFG" if $_PCFG; $cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2; $cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1; - $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS; - $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION; - $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE); - if (!defined($_GHKM)) { + if (defined($_GHKM)) + { + $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS; + $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION; + $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE); + $cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS; + } + else + { $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX; $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX; $cmd .= " --MaxSpan $max_length"; @@ -1609,6 +1616,7 @@ sub score_phrase_phrase_extract { $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS; $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION; $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE); + $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE); $cmd .= " $DOMAIN" if $DOMAIN; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); $cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE; @@ -1659,6 +1667,7 @@ sub score_phrase_phrase_extract { $cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN; $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING; $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY; + $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE); $cmd .= " | gzip -c > $ttable_file.gz"; @@ -2164,6 +2173,7 @@ sub create_ini { print INI "WordPenalty\n"; print INI "PhrasePenalty\n"; print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE); + print INI "SoftSourceSyntacticConstraintsFeature sourceLabelSetFile=$_GHKM_SOURCE_LABELS_FILE\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE); print INI $feature_spec; print INI "\n# dense weights for feature functions\n"; @@ -2171,6 +2181,7 @@ sub create_ini { print INI "UnknownWordPenalty0= 1\n"; print INI "WordPenalty0= -1\n"; print INI "PhrasePenalty0= 0.2\n"; + print INI "SoftSourceSyntacticConstraintsFeature0= 0.3 -0.3 -0.3\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE); print INI $weight_spec; close(INI); }