bug fix to enable pruned search graph output by default

2024-09-11 11:25:40 +03:00 · 2012-09-03 07:23:32 +01:00 · 2012-09-03 07:23:32 +01:00 · 0e783dc529
commit 0e783dc529
parent d99f97297f
8 changed files with 155 additions and 1 deletions
--- a/moses/src/ChartHypothesis.cpp
+++ b/moses/src/ChartHypothesis.cpp
@ -272,6 +272,9 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
 		out << "->" << hypo.GetWinningHypothesis()->GetId();
 	}

+  if (StaticData::Instance().GetIncludeLHSInSearchGraph()) {
+    out << " " << hypo.GetTargetLHS() << "=>";
+  }
  out << " " << hypo.GetCurrTargetPhrase()
      //<< " " << outPhrase
      << " " << hypo.GetCurrSourceRange();
--- a/moses/src/Parameter.cpp
+++ b/moses/src/Parameter.cpp
@ -120,6 +120,7 @@ Parameter::Parameter()
  AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
  AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
  AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
+  AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
 #ifdef HAVE_PROTOBUF
  AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
 #endif
--- a/moses/src/StaticData.cpp
+++ b/moses/src/StaticData.cpp
@ -239,7 +239,8 @@ bool StaticData::LoadData(Parameter *parameter)
  } else
    m_outputSearchGraphPB = false;
 #endif
-  SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", true );
+  SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false );
+  SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );

  // include feature names in the n-best list
  SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );
--- a/moses/src/StaticData.h
+++ b/moses/src/StaticData.h
@ -195,6 +195,7 @@ protected:
  bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
 #endif
  bool m_unprunedSearchGraph; //! do not exclude dead ends (chart decoder only)
+  bool m_includeLHSInSearchGraph; //! include LHS of rules in search graph

  size_t m_cubePruningPopLimit;
  size_t m_cubePruningDiversity;
@ -562,6 +563,10 @@ public:
    return m_unprunedSearchGraph;
  }

+  bool GetIncludeLHSInSearchGraph() const { //! true when the rule's LHS label is to be included in chart search graph output
+    return m_includeLHSInSearchGraph;
+  }
+
  XmlInputType GetXmlInputType() const {
    return m_xmlInputType;
  }
--- a/phrase-extract/domain.cpp
+++ b/phrase-extract/domain.cpp
@ -0,0 +1,52 @@
+// $Id$
+//#include "beammain.h"
+#include "domain.h"
+#include "tables-core.h"
+#include "InputFileStream.h"
+#include "SafeGetline.h"
+
+#define TABLE_LINE_MAX_LENGTH 1000
+
+using namespace std;
+
+namespace MosesTraining
+{
+
+// handling of domain names: load database with sentence-id / domain name info
+//
+// Reads domainFileName line by line. Every line must tokenize into exactly
+// two fields, an integer line number followed by a domain name; otherwise an
+// error is printed to stderr and the program exits with status 1.
+// Each (line number, name) pair is appended to spec; a name seen for the
+// first time is appended to list and its index is recorded in name2id.
+// NOTE(review): eof is tested before the line just read is processed, so a
+// final line without a trailing newline looks like it is skipped -- confirm
+// against the definition of SAFE_GETLINE.
+void Domain::load( const std::string &domainFileName ) {
+  Moses::InputFileStream fileS( domainFileName );
+  istream *fileP = &fileS;
+  while(true) {
+    char line[TABLE_LINE_MAX_LENGTH];
+    SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
+    if (fileP->eof()) break;
+    // read
+    vector< string > domainSpecLine = tokenize( line );
+    int lineNumber = 0;
+    // sscanf returns the number of converted fields, or EOF (which is
+    // non-zero) on input failure, so require exactly one conversion
+    // instead of merely a non-zero return value
+    if (domainSpecLine.size() != 2 ||
+        sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber) != 1) {
+      cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
+      exit(1);
+    }
+    // store
+    const string &name = domainSpecLine[1];
+    spec.push_back( make_pair( lineNumber, name ));
+    if (name2id.find( name ) == name2id.end()) {
+      // first occurrence of this domain: its id is its position in list
+      name2id[ name ] = list.size();
+      list.push_back( name );
+    }
+  }
+}
+
+// get domain name based on sentence number:
+// scans spec in stored order and returns the name of the first entry whose
+// line number is not smaller than sentenceId; "undefined" if there is none
+string Domain::getDomainOfSentence( int sentenceId ) {
+  typedef vector< pair< int, string > >::const_iterator SpecIter;
+  for (SpecIter entry = spec.begin(); entry != spec.end(); ++entry) {
+    if (entry->first >= sentenceId) {
+      return entry->second;
+    }
+  }
+  return "undefined";
+}
+
+}
+
--- a/phrase-extract/domain.h
+++ b/phrase-extract/domain.h
@ -0,0 +1,32 @@
+// $Id$
+
+#ifndef _DOMAIN_H
+#define _DOMAIN_H
+
+#include <iostream>
+#include <fstream>
+#include <assert.h>
+#include <stdlib.h>
+#include <string>
+#include <queue>
+#include <map>
+#include <cmath>
+
+extern std::vector<std::string> tokenize( const char*);
+
+namespace MosesTraining
+{
+
+class Domain
+{
+public:
+  std::vector< std::pair< int, std::string > > spec; // (line number, domain name) pairs, one per line read by load()
+  std::vector< std::string > list; // distinct domain names, in order of first appearance
+  std::map< std::string, int > name2id; // domain name -> its index in list
+  void load( const std::string &fileName ); // fills spec, list and name2id from the given file
+  std::string getDomainOfSentence( int sentenceId ); // name of the first spec entry with line number >= sentenceId, else "undefined"
+};
+
+}
+
+#endif
--- a/scripts/ems/support/build-domain-file-from-subcorpora.perl
+++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl
@ -0,0 +1,38 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+# Create domain file from corpora
+# (helper for domain adaptation)
+
+# Creates a file with domain names and end line numbers for different domains
+# within the cleaned training corpus. This file is used by various domain 
+# adaptation methods.
+
+my ($extension,@SUBCORPORA) = @ARGV; # first argument: file extension; remaining arguments: sub corpus file stems
+
+my $line_count = 0; # running total of lines over all sub corpora processed so far
+my %UNIQUE_NAME; # corpus names that have already been handed out
+my $number = 1; # next fallback (numeric) name
+foreach (@SUBCORPORA) {
+  # get number of lines
+  if (!-e "$_.$extension" && -e "$_.$extension.gz") { # only a gzipped copy exists
+    $line_count += `zcat $_.$extension.gz | wc -l`;
+  }
+  elsif (-e "$_.$extension") {
+    $line_count += `wc -l < $_.$extension`;
+  }
+  else {
+    die("ERROR: could not open sub corpus file $_.$extension\n");
+  }
+
+  # construct name
+  my $name = $number++; # default: cardinal number
+  while(defined($UNIQUE_NAME{$name})) { $name = $number++; } # slightly paranoid
+  if (/\/([^\.\/]+)\.[^\/]+$/ && !defined($UNIQUE_NAME{$1})) { # reconstruct corpus name: last path component up to its first dot, if not used before
+    $name = $1;
+    $UNIQUE_NAME{$1}++;
+  }
+  print "$line_count $name\n"; # cumulative end line number, then the domain name
+}
+
--- a/scripts/ems/support/train-irstlm.perl
+++ b/scripts/ems/support/train-irstlm.perl
@ -0,0 +1,22 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+# wrapper for irstlm training
+#
+# usage: train-irstlm.perl IRSTLM-DIR [settings]
+# The settings use the switches -order / -text / -lm, which are rewritten
+# to -n / -i / -o before being handed to build-lm.sh. build-lm.sh is told
+# to write <lm>.iarpa.gz, which is then passed to compile-lm to produce <lm>.
+
+my $IRSTLM = shift @ARGV;
+die("ERROR: no IRSTLM directory specified\n") unless defined($IRSTLM);
+
+my $settings = join(" ",@ARGV);
+$settings =~ s/\-order/\-n/;
+$settings =~ s/\-text/\-i/;
+$settings =~ s/\-lm/\-o/;
+
+if ($settings !~ /\-o +(\S+)/) {
+  die("ERROR: no output file specified");
+}
+my $lm = $1; # output file name as requested by the caller
+$settings =~ s/(\-o +\S+)/$1.iarpa.gz/; # build-lm.sh writes the intermediate file
+
+# use compile-lm from the IRSTLM directory given on the command line
+# rather than a hard-coded path inside one user's home directory
+my $cmd = "IRSTLM=$IRSTLM $IRSTLM/scripts/build-lm.sh $settings ; $IRSTLM/bin/compile-lm --text yes $lm.iarpa.gz $lm";
+print STDERR $cmd."\n";
+print `$cmd`;