Support for the decoding of arbitrary word lattices. Must be given in the form of a "plf" file, which is a little tricky. I'll add documentation at some point; for now, refer to the example plf file in the "lattice-surface" regression test.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1359 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
redpony 2007-04-18 14:08:46 +00:00
parent d228a3c878
commit c80d8b8d47
26 changed files with 401 additions and 27 deletions

View File

@ -1,4 +1,5 @@
lib_LIBRARIES = libirstlm.a
AM_CPPFLAGS = -fPIC
libirstlm_a_SOURCES = \
dictionary.cpp \

View File

@ -121,7 +121,9 @@ InputType*IOStream::GetInput(InputType* inputType)
{
if(inputType->Read(*m_inputStream, m_inputFactorOrder))
{
inputType->SetTranslationId(m_translationId++);
if (long x = inputType->GetTranslationId()) { if (x>=m_translationId) m_translationId = x+1; }
else inputType->SetTranslationId(m_translationId++);
return inputType;
}
else

View File

@ -49,6 +49,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include "IOStream.h"
#include "Sentence.h"
#include "ConfusionNet.h"
#include "WordLattice.h"
#include "TranslationAnalysis.h"
#if HAVE_CONFIG_H
@ -63,9 +64,13 @@ using namespace std;
bool ReadInput(IOStream &ioStream, InputTypeEnum inputType, InputType*& source)
{
delete source;
source=ioStream.GetInput((inputType == SentenceInput ?
static_cast<InputType*>(new Sentence(Input)) :
static_cast<InputType*>(new ConfusionNet)));
switch(inputType)
{
case SentenceInput: source = ioStream.GetInput(new Sentence(Input)); break;
case ConfusionNetworkInput: source = ioStream.GetInput(new ConfusionNet); break;
case WordLatticeInput: source = ioStream.GetInput(new WordLattice); break;
default: TRACE_ERR("Unknown input type: " << inputType << "\n");
}
return (source ? true : false);
}

View File

@ -47,6 +47,12 @@ struct CNStats {
CNStats stats;
size_t ConfusionNet::GetColumnIncrement(size_t i, size_t j) const
{
(void) i;
(void) j;
return 1;
}
ConfusionNet::ConfusionNet()
: InputType()

View File

@ -15,7 +15,7 @@ class ConfusionNet : public InputType {
public:
typedef std::vector<std::pair<Word,float> > Column;
private:
protected:
std::vector<Column> data;
bool ReadFormat0(std::istream&,const std::vector<FactorType>& factorOrder);
@ -24,7 +24,7 @@ class ConfusionNet : public InputType {
public:
ConfusionNet();
~ConfusionNet();
virtual ~ConfusionNet();
ConfusionNet(Sentence const& s);
@ -33,13 +33,14 @@ class ConfusionNet : public InputType {
const Column& GetColumn(size_t i) const {assert(i<data.size());return data[i];}
const Column& operator[](size_t i) const {return GetColumn(i);}
virtual size_t GetColumnIncrement(size_t i, size_t j) const; //! returns 1 for CNs
bool Empty() const {return data.empty();}
size_t GetSize() const {return data.size();}
void Clear() {data.clear();}
bool ReadF(std::istream&,const std::vector<FactorType>& factorOrder,int format=0);
void Print(std::ostream&) const;
virtual void Print(std::ostream&) const;
int Read(std::istream& in,const std::vector<FactorType>& factorOrder);

View File

@ -60,7 +60,7 @@ pair<HypothesisStack::iterator, bool> HypothesisStack::Add(Hypothesis *hypo)
// this may also affect the worst score
if ( m_bestScore + m_beamThreshold > m_worstScore )
m_worstScore = m_bestScore + m_beamThreshold;
}
}
// Prune only if stack is twice as big as needed (lazy pruning)
VERBOSE(3,", now size " << m_hypos.size());

View File

@ -37,6 +37,8 @@ class InputType
{
protected:
long m_translationId; //< contiguous Id
bool m_hasMetaData;
long m_segId;
public:
@ -70,7 +72,7 @@ public:
//! return substring at a particular position. Only valid for Sentence class. TODO - get rid of this fn
virtual const Word& GetWord(size_t pos) const=0;
TO_STRING();
};

View File

@ -1,5 +1,5 @@
lib_LIBRARIES = libmoses.a
AM_CPPFLAGS = -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES -DUSE_HYPO_POOL
AM_CPPFLAGS = -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES
libmoses_a_SOURCES = \
ConfusionNet.cpp \
DecodeStep.cpp \
@ -17,6 +17,7 @@ libmoses_a_SOURCES = \
InputType.cpp \
InputFileStream.cpp \
LMList.cpp \
LVoc.cpp \
LanguageModel.cpp \
LanguageModelFactory.cpp \
LanguageModelMultiFactor.cpp \
@ -26,11 +27,10 @@ libmoses_a_SOURCES = \
LatticePathCollection.cpp \
LexicalReordering.cpp \
LexicalReorderingTable.cpp \
PrefixTreeMap.cpp \
LVoc.cpp \
Manager.cpp \
md5.cpp \
mempool.cpp \
PCNTools.cpp \
Parameter.cpp \
PartialTranslOptColl.cpp \
Phrase.cpp \
@ -40,6 +40,7 @@ libmoses_a_SOURCES = \
PhraseDictionaryTree.cpp \
PhraseDictionaryTreeAdaptor.cpp \
PhraseReference.cpp \
PrefixTreeMap.cpp \
ScoreComponentCollection.cpp \
ScoreIndexManager.cpp \
ScoreProducer.cpp \
@ -56,6 +57,7 @@ libmoses_a_SOURCES = \
Util.cpp \
Word.cpp \
WordsBitmap.cpp \
WordLattice.cpp \
WordsRange.cpp

130
moses/src/PCNTools.cpp Normal file
View File

@ -0,0 +1,130 @@
#include "PCNTools.h"

#include <cstdlib>  // atof, atoi
namespace PCN
{

const std::string chars = "'\\";
const char& quote = chars[0];
const char& slash = chars[1];

// Bounds-checked character access: returns '\0' for any out-of-range index,
// so scanners below can probe past the end of the buffer without crashing.
inline char get(const std::string& in, int c) {
if (c < 0 || c >= (int)in.size()) return 0;
else return in[(size_t)c];
}
// Advance c past any run of spaces.
inline void eatws(const std::string& in, int& c) {
while (get(in,c) == ' ') { c++; }
}
// Parse a single-quoted, backslash-escaped string: from 'foo' return foo.
// Returns the literal string "ERROR" if the opening quote is missing.
std::string getEscapedString(const std::string& in, int &c)
{
eatws(in,c);
if (get(in,c++) != quote) return "ERROR";
std::string res;
char cur = 0;
do {
cur = get(in,c++);
if (cur == slash) { res += get(in,c++); }    // escaped char taken verbatim
else if (cur != quote) { res += cur; }
} while (get(in,c) != quote && (c < (int)in.size()));
c++;       // consume the closing quote
eatws(in,c);
return res;
}
// Read a float token terminated by space, ')' or ',' (basically atof).
float getFloat(const std::string& in, int &c)
{
std::string tmp;
eatws(in,c);
while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') {
tmp += get(in,c++);
}
eatws(in,c);
return atof(tmp.c_str());
}
// Read an int token terminated by space, ')' or ',' (basically atoi).
int getInt(const std::string& in, int &c)
{
std::string tmp;
eatws(in,c);
while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') {
tmp += get(in,c++);
}
eatws(in,c);
return atoi(tmp.c_str());
}
// Parse one alternative: ('foo', 0.23) for a confusion net, or
// ('foo', 0.23, 2) for a word lattice, where the optional third field is
// the column increment (how many columns the arc spans; defaults to 1).
// Malformed input yields a default-constructed CNAlt.
CNAlt getCNAlt(const std::string& in, int &c)
{
if (get(in,c++) != '(') return CNAlt(); // throw "expected (";
std::string word = getEscapedString(in,c);
if (get(in,c++) != ',') return CNAlt(); // throw "expected , after string";
size_t cnNext = 1;
float prob = getFloat(in,c);
if (get(in,c) == ',') { // WORD LATTICE: optional column increment follows
c++;
int colIncr = getInt(in,c);
if (colIncr < 1) { colIncr = 1; //WARN: non-positive increments are clamped
}
cnNext = (size_t)colIncr;
}
if (get(in,c++) != ')') return CNAlt(); // throw "expected )";
eatws(in,c);
return CNAlt(std::pair<std::string, float>(word,prob), cnNext);
}
// Parse one column: (('foo', 0.23), ('bar', 0.77))
CNCol getCNCol(const std::string& in, int &c) {
CNCol res;
if (get(in,c++) != '(') return res; // error
eatws(in,c);
while (1) {
// '>=' (not '>'): once c reaches the end there is nothing left to read;
// the '>' test did one extra pass and appended a bogus empty alternative
// on truncated input.
if (c >= (int)in.size()) { break; }
if (get(in,c) == ')') {        // end of column
c++;
eatws(in,c);
break;
}
if (get(in,c) == ',' && get(in,c+1) == ')') {  // trailing comma before ')'
c+=2;
eatws(in,c);
break;
}
if (get(in,c) == ',') { c++; eatws(in,c); }
res.push_back(getCNAlt(in, c));
}
return res;
}
// Parse a whole network: ((('foo', 0.23), ('bar', 0.77)), (('a', 0.3), ('c', 0.7)))
CN parsePCN(const std::string& in)
{
CN res;
int c = 0;
// get() is bounds-safe; the previous in[c++] indexed past the end of an
// empty string (undefined behavior on a const std::string pre-C++11).
if (get(in,c++) != '(') return res; // error
while (1) {
// '>=' for the same reason as in getCNCol.
if (c >= (int)in.size()) { break; }
if (get(in,c) == ')') {        // end of network
c++;
eatws(in,c);
break;
}
if (get(in,c) == ',' && get(in,c+1) == ')') {  // trailing comma before ')'
c+=2;
eatws(in,c);
break;
}
if (get(in,c) == ',') { c++; eatws(in,c); }
res.push_back(getCNCol(in, c));
}
return res;
}
}

43
moses/src/PCNTools.h Normal file
View File

@ -0,0 +1,43 @@
// $Id: StaticData.h 992 2006-11-21 23:06:30Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <vector>
#include <string>
#include <utility>
/** A couple of utilities to read .pcn files. A python-compatible format
* for encoding confusion networks.
*/
namespace PCN {

/** A single alternative on an arc: ((word, probability), column-increment).
 *  The column increment is 1 for confusion networks; word-lattice arcs may
 *  skip ahead more than one column. */
typedef std::pair<std::pair<std::string, float>, size_t> CNAlt;

//! One column of the network: all alternatives leaving this position.
typedef std::vector<CNAlt> CNCol;

//! A whole confusion network / lattice: a sequence of columns.
typedef std::vector<CNCol> CN;

/** Given a string ((('foo',0.1),('bar',0.9)),...) representation of a
 * confusion net in PCN format, return a CN object
 */
CN parsePCN(const std::string& in);

}  // namespace PCN (no trailing ';' — a stray one after a namespace is an
   // empty declaration at best and a diagnostic on older compilers)

View File

@ -3,6 +3,8 @@
#pragma once
#include "StaticData.h" // needed for factor splitter
inline bool existsFile(const char* filePath) {
struct stat mystat;
return (stat(filePath,&mystat)==0);
@ -265,7 +267,7 @@ public:
for(size_t k=0;k<factorStrings.size();++k)
{
std::vector<std::string> factors=Tokenize(*factorStrings[k],"|");
std::vector<std::string> factors=TokenizeMultiCharSeparator(*factorStrings[k],StaticData::Instance().GetFactorDelimiter());
Word& w=targetPhrase.AddWord();
for(size_t l=0;l<m_output.size();++l)
w[m_output[l]]= factorCollection.AddFactor(Output, m_output[l], factors[l]);
@ -366,14 +368,14 @@ public:
if(nextP) // w is a word that should be considered
{
Range newRange(curr.begin(),curr.end()+1);
Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx));
float newScore=curr.GetScore()+currCol[colidx].second; // CN score
Phrase newSrc(curr.src);
if(!isEpsilon) newSrc.AddWord(w);
if(newRange.second<srcSize && newScore>LOWEST_SCORE)
{
// if there is more room to grow, add a new state onto the queue
// to be explored that represents [begin, curEnd+1)
// to be explored that represents [begin, curEnd+)
stack.push_back(State(newRange,nextP,newScore,newRealWords));
stack.back().src=newSrc;
}
@ -462,6 +464,7 @@ public:
CreateTargetPhrase(targetPhrase,j->first,scores.trans,scores.src);
costs.push_back(std::make_pair(targetPhrase.GetFutureScore(),tCands.size()));
tCands.push_back(targetPhrase);
//std::cerr << i->first.first << "-" << i->first.second << ": " << targetPhrase << std::endl;
}
TargetPhraseCollection *rv=PruneTargetCandidates(tCands,costs);

View File

@ -30,10 +30,14 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
std::string line;
if (getline(in, line, '\n').eof())
return 0;
line = Trim(line);
std::map<std::string, std::string> meta;
do
{
if (getline(in, line, '\n').eof()) return 0;
line = Trim(line);
meta = ProcessAndStripSGML(line);
} while (line == "");
if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); }
Phrase::CreateFromString(factorOrder, line, factorDelimiter);
return 1;
}

View File

@ -94,7 +94,10 @@ bool StaticData::LoadData(Parameter *parameter)
// input type has to be specified BEFORE loading the phrase tables!
if(m_parameter->GetParam("inputtype").size())
m_inputType= (InputTypeEnum) Scan<int>(m_parameter->GetParam("inputtype")[0]);
VERBOSE(2,"input type is: "<<(m_inputType?"confusion net":"text input")<<"\n");
std::string s_it = "text input";
if (m_inputType == 1) { s_it = "confusion net"; }
if (m_inputType == 2) { s_it = "word lattice"; }
VERBOSE(2,"input type is: "<<s_it<<"\n");
// factor delimiter
if (m_parameter->GetParam("factor-delimiter").size() > 0) {

View File

@ -166,7 +166,7 @@ void TranslationOptionCollection::ProcessUnknownWord(const std::vector < std::li
* \param factorCollection input sentence with all factors
*/
void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,
size_t sourcePos)
size_t sourcePos, size_t length)
{
// unknown word, add as trans opt
FactorCollection &factorCollection = FactorCollection::Instance();
@ -203,13 +203,12 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,
}
targetPhrase.SetScore();
transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, m_source, 0);
transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase, m_source, 0);
}
else
{ // drop source word. create blank trans opt
const TargetPhrase targetPhrase(Output);
transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, m_source, 0);
transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase, m_source, 0);
}
transOpt->CalcScore();

View File

@ -76,7 +76,7 @@ protected:
void ProcessUnknownWord(const std::vector < std::list <const DecodeStep* > *> &decodeStepVL);
//! special handling of ONE unknown words.
virtual void ProcessOneUnknownWord(const Word &sourceWord
, size_t sourcePos);
, size_t sourcePos, size_t length = 1);
//! pruning: only keep the top n (m_maxNoTransOptPerCoverage) elements */
void Prune();

View File

@ -24,8 +24,10 @@ void TranslationOptionCollectionConfusionNet::ProcessUnknownWord(
ConfusionNet const& source=dynamic_cast<ConfusionNet const&>(m_source);
ConfusionNet::Column const& coll=source.GetColumn(sourcePos);
for(ConfusionNet::Column::const_iterator i=coll.begin();i!=coll.end();++i)
ProcessOneUnknownWord(i->first ,sourcePos);
size_t j=0;
for(ConfusionNet::Column::const_iterator i=coll.begin();i!=coll.end();++i) {
ProcessOneUnknownWord(i->first ,sourcePos, source.GetColumnIncrement(sourcePos, j++));
}
}

View File

@ -128,6 +128,7 @@ enum InputTypeEnum
{
SentenceInput = 0
,ConfusionNetworkInput = 1
,WordLatticeInput = 2
};
enum DictionaryFind

View File

@ -141,3 +141,61 @@ void PrintUserTime(const std::string &message)
{
g_timer.check(message.c_str());
}
std::map<std::string, std::string> ProcessAndStripSGML(std::string &line)
{
std::map<std::string, std::string> meta;
std::string lline = ToLower(line);
if (lline.find("<seg")!=0) return meta;
size_t close = lline.find(">");
if (close == std::string::npos) return meta; // error
size_t end = lline.find("</seg>");
std::string seg = Trim(lline.substr(4, close-4));
std::string text = line.substr(close+1, end - close - 1);
for (size_t i = 1; i < seg.size(); i++) {
if (seg[i] == '=' && seg[i-1] == ' ') {
std::string less = seg.substr(0, i-1) + seg.substr(i);
seg = less; i = 0; continue;
}
if (seg[i] == '=' && seg[i+1] == ' ') {
std::string less = seg.substr(0, i+1);
if (i+2 < seg.size()) less += seg.substr(i+2);
seg = less; i = 0; continue;
}
}
line = Trim(text);
if (seg == "") return meta;
for (size_t i = 1; i < seg.size(); i++) {
if (seg[i] == '=') {
std::string label = seg.substr(0, i);
std::string val = seg.substr(i+1);
if (val[0] == '"') {
val = val.substr(1);
size_t close = val.find('"');
if (close == std::string::npos) {
TRACE_ERR("SGML parse error: missing \"\n");
seg = "";
i = 0;
} else {
seg = val.substr(close+1);
val = val.substr(0, close);
i = 0;
}
} else {
size_t close = val.find(' ');
if (close == std::string::npos) {
seg = "";
i = 0;
} else {
seg = val.substr(close+1);
val = val.substr(0, close);
}
}
label = Trim(label);
seg = Trim(seg);
meta[label] = val;
}
}
return meta;
}

View File

@ -279,3 +279,8 @@ const std::string ToLower(const std::string& str);
// A couple of utilities to measure decoding time
void ResetUserTime();
void PrintUserTime(const std::string &message);
// dump SGML parser for <seg> tags
#include <map>
std::map<std::string, std::string> ProcessAndStripSGML(std::string &line);

View File

@ -17,6 +17,7 @@ my @tests = qw (
basic-surface-binptable
multi-factor-binptable
nbest-multi-factor
lattice-surface
lexicalized-reordering
);
############################################################

View File

@ -0,0 +1,22 @@
#!/usr/bin/perl

# Filter a decoder stderr log into the regression test's KEY = VALUE form:
# model load times plus the model score of each best translation.

BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, "$script_dir/../perllib"; }

use RegTestUtils;

my $hypoCount = 0;
while (my $line = <>) {
  chomp $line;
  if ($line =~ /^Finished loading LanguageModels/) {
    my $elapsed = RegTestUtils::readTime($line);
    print "LMLOAD_TIME ~ $elapsed\n";
  }
  if ($line =~ /^Finished loading phrase tables/) {
    my $elapsed = RegTestUtils::readTime($line);
    print "PTLOAD_TIME ~ $elapsed\n";
  }
  if ($line =~ /^BEST TRANSLATION:/) {
    ++$hypoCount;
    my $score = RegTestUtils::readHypoScore($line);
    print "SCORE_$hypoCount = $score\n";
  }
}

View File

@ -0,0 +1,7 @@
#!/usr/bin/perl

# Echo each input line as TRANSLATION_<n>=<line>, numbering from 1.

my $lineNo = 0;
while (my $line = <>) {
  chomp $line;
  ++$lineNo;
  print "TRANSLATION_${lineNo}=$line\n";
}

View File

@ -0,0 +1,60 @@
# Moses configuration file
# automatic exodus from pharaoh.ini Wed Jul 12 18:24:14 EDT 2006
###########################
### PHARAOH CONFIG FILE ###
###########################
# phrase table f, n, p(n|f)
[ttable-file]
0 0 5 ${MODEL_PATH}/confusionNet-surface-only/phrase-table.0-0
# language model
[lmodel-file]
0 0 3 ${LM_PATH}/europarl.en.srilm.gz
# limit on how many phrase translations e for each phrase f are loaded
[ttable-limit]
#ttable element load limit 0 = all elements loaded
20
# distortion (reordering) weight
[weight-d]
0.141806519223522
# language model weight
[weight-l]
0.142658800199951
# translation model weight (phrase translation, lexical weighting)
[weight-t]
0.00402447059454402
0.0685647475075862
0.294089113124688
0.0328320356515851
-0.0426081987467227
# word penalty
[weight-w]
-0.273416114951401
[distortion-limit]
4
[beam-threshold]
0.03
[input-factors]
0
[mapping]
T 0
[inputtype]
2
[weight-i]
1.0
[verbose]
2

View File

@ -0,0 +1,3 @@
((('damit|PROADV',1.0),),(('ist|VSFIN',1.0),('war|VSFIN',1.0),('sei|VSFIN',1.0),),(('der|ART',1.0),('die|ART',1.0),('das|ART',1.0),),(('arbeitsplan|NN',1.0),),)
((('damit|PROADV',1.0),('dies|PROADV',1.0),),(('ist|VSFIN',1.0),('war|VSFIN',1.0),('sei|VSFIN',1.0),('ist|VVFIN',1.0),('war|VVFIN',1.0),('sei|VVFIN',1.0),),(('der|ART',1.0),('die|ART',1.0),('das|ART',1.0),('der|DT',1.0),('die|DT',1.0),('das|DT',1.0),),(('arbeitsplan|NN',1.0),),)
((('damit|PROADV',1.0),('dies|PROADV',0.0),),(('ist|VSFIN',1.0),('war|VSFIN',1.0),('sei|VSFIN',1.0),('dies|PROADV',0.0),('das|DT',0.0),),(('der|ART',1.0),('die|ART',1.0),('das|ART',1.0),('dies|PROADV',0.0),),(('arbeitsplan|NN',1.0),('dies|PROADV',0.0),),)

View File

@ -0,0 +1,9 @@
TRANSLATION_1=that is the order of business
TRANSLATION_2=this is the order of business
TRANSLATION_3=that is the order of business
LMLOAD_TIME ~ 10.00
PTLOAD_TIME ~ 10.00
SCORE_1 = -2.966
SCORE_2 = -2.565
SCORE_3 = -2.966
TOTAL_WALLTIME ~ 11

View File

@ -7,8 +7,13 @@ DS?=$(shell date '+%Y%m%d')
# set these 2 variables below to point to a directory where you want the
# compiled scripts to be copied to
TARGETDIR=/chomes/redpony/moses
BINDIR=/fs/cliplab/software/moses-scripts/giza
# NOTE(review): removed unresolved SVN merge-conflict markers (<<<<<<< .mine /
# ======= / >>>>>>> .r1358) that had been committed here and would break make.
# Kept the local (.mine) values; the r1358 side had
# TARGETDIR=/home/s0565741/terabyte/bin and BINDIR=/home/s0565741/terabyte/bin
# — confirm which install paths are intended before release.
MAIN_SCRIPTS_TARGET_DIR=$(TARGETDIR)
# MAIN_SCRIPTS_TARGET_DIR=$(shell echo `pwd`/temp)