bug fix to enable pruned search graph output by default

This commit is contained in:
phikoehn 2012-09-03 07:23:32 +01:00
parent d99f97297f
commit 0e783dc529
8 changed files with 155 additions and 1 deletions

View File

@ -272,6 +272,9 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
out << "->" << hypo.GetWinningHypothesis()->GetId();
}
if (StaticData::Instance().GetIncludeLHSInSearchGraph()) {
out << " " << hypo.GetTargetLHS() << "=>";
}
out << " " << hypo.GetCurrTargetPhrase()
//<< " " << outPhrase
<< " " << hypo.GetCurrSourceRange();

View File

@ -120,6 +120,7 @@ Parameter::Parameter()
AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
#ifdef HAVE_PROTOBUF
AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
#endif

View File

@ -239,7 +239,8 @@ bool StaticData::LoadData(Parameter *parameter)
} else
m_outputSearchGraphPB = false;
#endif
SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", true );
SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false );
SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );
// include feature names in the n-best list
SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );

View File

@ -195,6 +195,7 @@ protected:
bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
#endif
bool m_unprunedSearchGraph; //! do not exclude dead ends (chart decoder only)
bool m_includeLHSInSearchGraph; //! include LHS of rules in search graph
size_t m_cubePruningPopLimit;
size_t m_cubePruningDiversity;
@ -562,6 +563,10 @@ public:
return m_unprunedSearchGraph;
}
//! whether the chart search graph output includes the label of the LHS of each rule
//! (value of the "include-lhs-in-search-graph" switch)
bool GetIncludeLHSInSearchGraph() const {
return m_includeLHSInSearchGraph;
}
//! accessor for the stored XML input type (m_xmlInputType)
XmlInputType GetXmlInputType() const {
return m_xmlInputType;
}

52
phrase-extract/domain.cpp Normal file
View File

@ -0,0 +1,52 @@
// $Id$
//#include "beammain.h"
#include "domain.h"
#include "tables-core.h"
#include "InputFileStream.h"
#include "SafeGetline.h"
#define TABLE_LINE_MAX_LENGTH 1000
using namespace std;
namespace MosesTraining
{
// handling of domain names: load database with sentence-id / domain name info
// Load the sentence-id / domain-name database from a domain specification file.
//
// Every line of the file must tokenize into exactly two fields: an integer
// line number followed by a domain name. Each (line number, name) pair is
// appended to `spec` in file order; each name seen for the first time is
// appended to `list` and its index in `list` is recorded in `name2id`.
//
// On a malformed line an error message is written to stderr and the process
// exits with status 1.
void Domain::load( const std::string &domainFileName ) {
  Moses::InputFileStream fileS( domainFileName );
  istream *fileP = &fileS;
  while(true) {
    char line[TABLE_LINE_MAX_LENGTH];
    SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
    // NOTE(review): presumably a last line without trailing newline sets eof
    // while it is being read and is therefore dropped here -- confirm against
    // the definition of SAFE_GETLINE
    if (fileP->eof()) break;
    // read
    vector< string > domainSpecLine = tokenize( line );
    int lineNumber;
    // sscanf returns the number of converted items, or a negative EOF on
    // input failure; negating the result would treat EOF as success and leave
    // lineNumber uninitialized, so require exactly one conversion
    if (domainSpecLine.size() != 2 ||
        sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber) != 1) {
      cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
      exit(1);
    }
    // store
    string &name = domainSpecLine[1];
    spec.push_back( make_pair( lineNumber, name ));
    // register the name only the first time it is seen
    if (name2id.find( name ) == name2id.end()) {
      name2id[ name ] = list.size();
      list.push_back( name );
    }
  }
}
// get domain name based on sentence number
// Look up the domain name for a sentence number.
//
// Scans `spec` front to back and returns the name of the first entry whose
// line number is not smaller than sentenceId; returns "undefined" when no
// entry qualifies (including when `spec` is empty).
// NOTE(review): this presumably relies on the entries of `spec` being stored
// in ascending line-number order -- confirm against the domain file producer.
string Domain::getDomainOfSentence( int sentenceId ) {
  typedef vector< pair< int, string > >::const_iterator SpecIterator;
  SpecIterator entry = spec.begin();
  // skip all domains that end before the requested sentence
  while (entry != spec.end() && entry->first < sentenceId) {
    ++entry;
  }
  if (entry == spec.end()) {
    return "undefined";
  }
  return entry->second;
}
}

32
phrase-extract/domain.h Normal file
View File

@ -0,0 +1,32 @@
// $Id$
#ifndef _DOMAIN_H
#define _DOMAIN_H
#include <iostream>
#include <fstream>
#include <assert.h>
#include <stdlib.h>
#include <string>
#include <queue>
#include <map>
#include <cmath>
extern std::vector<std::string> tokenize( const char*);
namespace MosesTraining
{
// Database that maps sentence numbers of a training corpus to domain names.
class Domain
{
public:
// (line number, domain name) pairs in the order they appear in the loaded file
std::vector< std::pair< int, std::string > > spec;
// distinct domain names in order of first appearance
std::vector< std::string > list;
// maps a domain name to its index in `list`
std::map< std::string, int > name2id;
// reads the domain specification file and fills spec, list and name2id;
// prints an error and exits the process on a malformed line
void load( const std::string &fileName );
// returns the name of the first entry in `spec` whose line number is
// >= sentenceId, or "undefined" if there is none
std::string getDomainOfSentence( int sentenceId );
};
}
#endif

View File

@ -0,0 +1,38 @@
#!/usr/bin/perl -w

use strict;

# Create domain file from corpora
# (helper for domain adaptation)
#
# Creates a file with domain names and end line numbers for different domains
# within the cleaned training corpus. This file is used by various domain
# adaptation methods.
#
# Arguments: EXTENSION SUBCORPUS_STEM [SUBCORPUS_STEM ...]
# For every stem the file STEM.EXTENSION is counted, or STEM.EXTENSION.gz if
# the plain file does not exist. One line "END_LINE_NUMBER NAME" is printed to
# STDOUT per sub corpus, where END_LINE_NUMBER is the cumulative line count.

my ($extension,@SUBCORPORA) = @ARGV;

# Count the newline characters in a file (the same number `wc -l` reports).
# The file is read directly, or through a list-form pipe from zcat, so the
# file name is never interpreted by a shell.
sub count_lines {
  my ($file,$gzipped) = @_;
  my $fh;
  if ($gzipped) {
    open($fh, "-|", "zcat", $file)
      or die("ERROR: could not run zcat on $file: $!\n");
  }
  else {
    open($fh, "<", $file)
      or die("ERROR: could not open sub corpus file $file: $!\n");
  }
  my $count = 0;
  my $buffer;
  while(read($fh, $buffer, 65536)) {
    $count += ($buffer =~ tr/\n//);
  }
  close($fh) or die("ERROR: could not read sub corpus file $file\n");
  return $count;
}

my $line_count = 0;
my %UNIQUE_NAME;
my $number = 1;
foreach my $corpus (@SUBCORPORA) {
  my $file = "$corpus.$extension";

  # get number of lines
  if (!-e $file && -e "$file.gz") {
    $line_count += count_lines("$file.gz",1);
  }
  elsif (-e $file) {
    $line_count += count_lines($file,0);
  }
  else {
    die("ERROR: could not open sub corpus file $file\n");
  }

  # construct name
  my $name = $number++; # default: cardinal number
  while(defined($UNIQUE_NAME{$name})) { $name = $number++; } # slightly paranoid
  if ($corpus =~ /\/([^\.\/]+)\.[^\/]+$/ && !defined($UNIQUE_NAME{$1})) { # reconstruct corpus name
    $name = $1;
  }
  # remember every name that was handed out (numeric defaults included),
  # so that a later sub corpus cannot be given the same name
  $UNIQUE_NAME{$name}++;
  print "$line_count $name\n";
}

View File

@ -0,0 +1,22 @@
#!/usr/bin/perl -w

use strict;

# wrapper for irstlm training
#
# Arguments: IRSTLM_DIR followed by the training settings. The switches
# -order, -text and -lm are translated into the -n, -i and -o switches passed
# to build-lm.sh; everything else is passed through unchanged.
# build-lm.sh writes OUTPUT.iarpa.gz, which compile-lm --text yes then
# converts into OUTPUT (the file given with -lm).

my $IRSTLM = shift @ARGV;
die("ERROR: no IRSTLM directory specified") unless defined($IRSTLM);

my $settings = join(" ",@ARGV);
# only rewrite whole switches: an unanchored match would also hit the same
# text inside a file name (e.g. "/data/en-lm/corpus")
$settings =~ s/(?<!\S)\-order(?!\S)/\-n/;
$settings =~ s/(?<!\S)\-text(?!\S)/\-i/;
$settings =~ s/(?<!\S)\-lm(?!\S)/\-o/;

if ($settings !~ /(?<!\S)\-o +(\S+)/) {
  die("ERROR: no output file specified");
}
my $lm = $1;
# let build-lm.sh write an intermediate file next to the final output
$settings =~ s/(?<!\S)(\-o +\S+)/$1.iarpa.gz/;

# compile-lm is taken from the given IRSTLM directory (not from a fixed home
# directory) and is only run if build-lm.sh succeeded
my $cmd = "IRSTLM=$IRSTLM $IRSTLM/scripts/build-lm.sh $settings && $IRSTLM/bin/compile-lm --text yes $lm.iarpa.gz $lm";
print STDERR $cmd."\n";
my $output = `$cmd`;
my $status = $?;
print $output if defined($output);
# propagate failure to the caller instead of silently reporting success
die("ERROR: IRSTLM training failed: $cmd\n") if $status != 0;