Merge branch 'master' into dynamic-models

This commit is contained in:
Nicola Bertoldi 2014-04-30 08:32:46 +02:00
commit fe1ed42f81
43 changed files with 1907 additions and 261 deletions

View File

@ -55,7 +55,7 @@
# information also known as -g
# --notrace compiles without TRACE macros
#
# --enable-boost-pool uses Boost pools for the memory SCFG table
# --enable-boost-pool uses Boost pools for the memory SCFG tabgle
#
# --enable-mpi switch on mpi
# --without-libsegfault does not link with libSegFault
@ -148,9 +148,13 @@ if [ option.get "with-mm" : : "yes" ]
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
moses/TranslationModel/UG/mm//symal2mam
moses/TranslationModel/UG/mm//mam2symal
moses/TranslationModel/UG/mm//mam_verify
moses/TranslationModel/UG/mm//custom-pt
moses/TranslationModel/UG/mm//mmlex-build
moses/TranslationModel/UG/mm//mmlex-lookup
moses/TranslationModel/UG/mm//mtt-count-words
moses/TranslationModel/UG/mm//calc-coverage
moses/TranslationModel/UG//try-align
;
}

View File

@ -35,7 +35,7 @@ if $(build-moses-server) = true
xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --libs" ] ;
xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --cflags" ] ;
exe mosesserver : mosesserver.cpp ../../moses//moses ../../moses-cmd/IOWrapper.cpp ../../OnDiskPt//OnDiskPt : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt ../../moses-cmd/IOWrapper.cpp : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
} else {
alias mosesserver ;
}

View File

@ -10,6 +10,9 @@
#include "moses/StaticData.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#if PT_UG
#include "moses/TranslationModel/UG/mmsapt.h"
#endif
#include "moses/TreeInput.h"
#include "moses/LM/ORLM.h"
#include "moses-cmd/IOWrapper.h"
@ -43,10 +46,16 @@ public:
xmlrpc_c::value * const retvalP) {
const params_t params = paramList.getStruct(0);
breakOutParams(params);
#if PT_UG
Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
pdsa->add(source_,target_,alignment_);
#else
const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0];
PhraseDictionaryDynSuffixArray* pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
PhraseDictionaryDynSuffixArray*
pdsa = (PhraseDictionaryDynSuffixArray*) pdf;
cerr << "Inserting into address " << pdsa << endl;
pdsa->insertSnt(source_, target_, alignment_);
#endif
if(add2ORLM_) {
//updateORLM();
}
@ -54,7 +63,9 @@ public:
//PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
map<string, xmlrpc_c::value> retData;
//*retvalP = xmlrpc_c::value_struct(retData);
#ifndef PT_UG
pdf = 0;
#endif
pdsa = 0;
*retvalP = xmlrpc_c::value_string("Phrase table updated");
}
@ -211,8 +222,7 @@ public:
"Missing source text",
xmlrpc_c::fault::CODE_PARSE);
}
const string source(
(xmlrpc_c::value_string(si->second)));
const string source((xmlrpc_c::value_string(si->second)));
cerr << "Input: " << source << endl;
si = params.find("align");
@ -230,6 +240,9 @@ public:
si = params.find("nbest-distinct");
bool nbest_distinct = (si != params.end());
si = params.find("add-score-breakdown");
bool addScoreBreakdown = (si != params.end());
vector<float> multiModelWeights;
si = params.find("lambda");
if (si != params.end()) {
@ -258,8 +271,8 @@ public:
if (staticData.IsChart()) {
TreeInput tinput;
const vector<FactorType> &inputFactorOrder =
staticData.GetInputFactorOrder();
const vector<FactorType>&
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
tinput.Read(in,inputFactorOrder);
ChartManager manager(tinput);
@ -305,7 +318,8 @@ public:
insertTranslationOptions(manager,retData);
}
if (nbest_size>0) {
outputNBest(manager, retData, nbest_size, nbest_distinct, reportAllFactors, addAlignInfo);
outputNBest(manager, retData, nbest_size, nbest_distinct,
reportAllFactors, addAlignInfo, addScoreBreakdown);
}
}
pair<string, xmlrpc_c::value>
@ -330,8 +344,9 @@ public:
if (addAlignmentInfo) {
/**
* Add the alignment info to the array. This is in target order and consists of
* (tgt-start, src-start, src-end) triples.
* Add the alignment info to the array. This is in target
* order and consists of (tgt-start, src-start, src-end)
* triples.
**/
map<string, xmlrpc_c::value> phraseAlignInfo;
phraseAlignInfo["tgt-start"] = xmlrpc_c::value_int(hypo->GetCurrTargetWordsRange().GetStartPos());
@ -396,7 +411,8 @@ public:
const int n=100,
const bool distinct=false,
const bool reportAllFactors=false,
const bool addAlignmentInfo=false)
const bool addAlignmentInfo=false,
const bool addScoreBreakdown=false)
{
TrellisPathList nBestList;
manager.CalcNBest(n, nBestList, distinct);
@ -452,6 +468,14 @@ public:
}
}
if (addScoreBreakdown)
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
MosesCmd::OutputAllFeatureScores(path.GetScoreBreakdown(),buf);
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
// weighted score
nBestXMLItem["totalScore"] = xmlrpc_c::value_double(path.GetTotalScore());
nBestXml.push_back(xmlrpc_c::value_struct(nBestXMLItem));
@ -490,11 +514,55 @@ public:
}
retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
}
};
// Write one feature function's weights to /out/ as a single line of the
// form "<description>= w1 w2 ...", taking the current weight vector from
// the global StaticData instance.
static
void
PrintFeatureWeight(ostream& out, const FeatureFunction* ff)
{
  out << ff->GetScoreProducerDescription() << "=";
  const size_t nComps = ff->GetNumScoreComponents();
  const vector<float> w
    = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
  for (size_t k = 0; k < nComps; ++k)
    out << " " << w[k];
  out << endl;
}
// Dump the weights of all registered (stateful then stateless) feature
// functions to /out/, one line per feature; untuneable features are listed
// with the marker "UNTUNEABLE" instead of their weights.
static
void
ShowWeights(ostream& out)
{
// adapted from moses-cmd/Main.cpp
// Force fixed-point notation with 6 digits, remembering the previous
// stream state so it can be restored before returning.
std::ios::fmtflags old_flags = out.setf(std::ios::fixed);
size_t old_precision = out.precision(6);
const vector<const StatelessFeatureFunction*>&
slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>&
sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
// Stateful features first.
for (size_t i = 0; i < sff.size(); ++i) {
const StatefulFeatureFunction *ff = sff[i];
if (ff->IsTuneable()) {
PrintFeatureWeight(out,ff);
}
else {
out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
}
}
// Then stateless features (same logic as above).
for (size_t i = 0; i < slf.size(); ++i) {
const StatelessFeatureFunction *ff = slf[i];
if (ff->IsTuneable()) {
PrintFeatureWeight(out,ff);
}
else {
out << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
}
}
// Restore the caller's stream formatting state.
if (! (old_flags & std::ios::fixed))
out.unsetf(std::ios::fixed);
out.precision(old_precision);
}
int main(int argc, char** argv)
{
@ -542,11 +610,16 @@ int main(int argc, char** argv)
exit(1);
}
if (params->isParamSpecified("show-weights")) {
ShowWeights(cout);
exit(0);
}
//512 MB data limit (512KB is not enough for optimization)
xmlrpc_limit_set(XMLRPC_XML_SIZE_LIMIT_ID, 512*1024*1024);
xmlrpc_c::registry myRegistry;
xmlrpc_c::methodPtr const translator(new Translator);
xmlrpc_c::methodPtr const updater(new Updater);
xmlrpc_c::methodPtr const optimizer(new Optimizer);

View File

@ -253,14 +253,17 @@ public:
if ( appendSuffix ) {
fileName << "." << compression;
}
boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream;
boost::iostreams::filtering_ostream *file
= new boost::iostreams::filtering_ostream;
if ( compression == "gz" ) {
file->push( boost::iostreams::gzip_compressor() );
} else if ( compression == "bz2" ) {
file->push( boost::iostreams::bzip2_compressor() );
} else if ( compression != "txt" ) {
TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl);
TRACE_ERR("Unrecognized hypergraph compression format ("
<< compression
<< ") - using uncompressed plain txt" << std::endl);
compression = "txt";
}
@ -271,7 +274,10 @@ public:
manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
file -> flush();
} else {
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl);
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber
<< " because the output file " << fileName.str()
<< " is not open or not ready for writing"
<< std::endl);
}
file -> pop();
delete file;

View File

@ -95,7 +95,7 @@ namespace Moses
ConfusionNet::
ReadF(std::istream& in, const std::vector<FactorType>& factorOrder, int format)
{
VERBOSE(1, "read confusion net with format "<<format<<"\n");
VERBOSE(2, "read confusion net with format "<<format<<"\n");
switch(format) {
case 0:
return ReadFormat0(in,factorOrder);
@ -120,7 +120,9 @@ namespace Moses
return rv;
}
#if 0
// Deprecated due to code duplication;
// use Word::CreateFromString() instead
void
ConfusionNet::
String2Word(const std::string& s,Word& w,
@ -132,6 +134,7 @@ namespace Moses
FactorCollection::Instance().AddFactor
(Input,factorOrder[i], factorStrVector[i]));
}
#endif
bool
ConfusionNet::
@ -155,7 +158,8 @@ namespace Moses
Column col;
while(is>>word) {
Word w;
String2Word(word,w,factorOrder);
// String2Word(word,w,factorOrder);
w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
std::vector<float> probs(totalCount, 0.0);
for(size_t i=0; i < numInputScores; i++) {
double prob;
@ -216,7 +220,9 @@ namespace Moses
VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
data[i][j].second.denseScores[0]=0.0;
}
String2Word(word,data[i][j].first,factorOrder);
// String2Word(word,data[i][j].first,factorOrder);
Word& w = data[i][j].first;
w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
} else return 0;
}
return !data.empty();

View File

@ -14,11 +14,11 @@ InputFeature *InputFeature::s_instance = NULL;
InputFeature::InputFeature(const std::string &line)
: StatelessFeatureFunction(line)
, m_numInputScores(0)
, m_numRealWordCount(0)
{
m_numInputScores = this->m_numScoreComponents;
ReadParameters();
UTIL_THROW_IF2(s_instance, "Can only have 1 input feature");
s_instance = this;
}

View File

@ -5,7 +5,7 @@
#include "TypeDef.h"
#include "AlignmentInfo.h"
#include "util/exception.hh"
#include "TranslationModel/PhraseDictionary.h"
using namespace std;
namespace Moses
@ -18,9 +18,9 @@ InputPath(const Phrase &phrase, const NonTerminalSet &sourceNonTerms,
,m_phrase(phrase)
,m_range(range)
,m_inputScore(inputScore)
,m_nextNode(1)
,m_sourceNonTerms(sourceNonTerms)
,m_sourceNonTermArray(FactorCollection::Instance().GetNumNonTerminals(), false)
,m_nextNode(1)
{
for (NonTerminalSet::const_iterator iter = sourceNonTerms.begin(); iter != sourceNonTerms.end(); ++iter) {
size_t idx = (*iter)[0]->GetId();
@ -33,6 +33,14 @@ InputPath(const Phrase &phrase, const NonTerminalSet &sourceNonTerms,
InputPath::~InputPath()
{
// Since there is no way for the Phrase Dictionaries to tell in
// which (sentence) context phrases were looked up, we tell them
// now that the phrase isn't needed any more by this inputPath
typedef std::pair<const TargetPhraseCollection*, const void* > entry;
std::map<const PhraseDictionary*, entry>::const_iterator iter;
for (iter = m_targetPhrases.begin(); iter != m_targetPhrases.end(); ++iter)
iter->first->Release(iter->second.first);
delete m_inputScore;
}

View File

@ -40,7 +40,18 @@ current ?= "" ;
path-constant LM-LOG : bin/lm.log ;
update-if-changed $(LM-LOG) $(current) ;
obj FF_Factory.o : FF/Factory.cpp LM//macros headers ../lm//kenlm : <dependency>$(LM-LOG) ;
obj FF_Factory.o : FF/Factory.cpp LM//macros headers ../lm//kenlm mmlib : <dependency>$(LM-LOG) ;
if [ option.get "with-mm" : no : yes ] = yes
{
alias mmlib :
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)/moses/TranslationModel/UG/mm//mm
;
} else {
alias mmlib ;
}
lib moses :
[ glob
@ -62,12 +73,11 @@ lib moses :
]
headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool
..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt
$(TOP)//boost_iostreams
$(TOP)//boost_iostreams mmlib
:
<threading>single:<source>../util//rt
;
#generic//generic mm//mm
alias headers-to-install : [ glob-tree *.h ] ;

View File

@ -182,7 +182,11 @@ void Manager::printDivergentHypothesis(long translationId, const Hypothesis* hyp
}
void Manager::printThisHypothesis(long translationId, const Hypothesis* hypo, const vector <const TargetPhrase*> & remainingPhrases, float remainingScore, ostream& outputStream) const
void
Manager::
printThisHypothesis(long translationId, const Hypothesis* hypo,
const vector <const TargetPhrase*> & remainingPhrases,
float remainingScore, ostream& outputStream) const
{
outputStream << translationId << " ||| ";

View File

@ -50,6 +50,13 @@ PhraseDictionary::PhraseDictionary(const std::string &line)
s_staticColl.push_back(this);
}
// Default implementation: this phrase table does not support prefix
// checks (PrefixExists); subclasses that do must override this to return
// true.
bool
PhraseDictionary::
ProvidesPrefixCheck() const
{
return false;
}
const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
const TargetPhraseCollection *ret;
@ -129,6 +136,23 @@ SetFeaturesToApply()
}
}
// tell the Phrase Dictionary that the TargetPhraseCollection is not needed any more
// Default implementation is a no-op; phrase tables that hand out
// collections they must reclaim (e.g. dynamic/suffix-array-backed tables)
// override this.
void
PhraseDictionary::
Release(TargetPhraseCollection const* tpc) const
{
// do nothing by default
return;
}
// Default implementation: optimistically report that entries starting
// with /phrase/ may exist. NOTE(review): ProvidesPrefixCheck() defaults to
// false, so callers are expected to consult it before trusting this
// answer — returning true here is the safe "don't prune" default.
bool
PhraseDictionary::
PrefixExists(Phrase const& phrase) const
{
return true;
}
void
PhraseDictionary::
GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const

View File

@ -71,6 +71,8 @@ public:
class PhraseDictionary : public DecodeFeature
{
public:
virtual bool ProvidesPrefixCheck() const;
static const std::vector<PhraseDictionary*>& GetColl() {
return s_staticColl;
}
@ -85,6 +87,16 @@ public:
return m_tableLimit;
}
virtual
void
Release(TargetPhraseCollection const* tpc) const;
/// return true if phrase table entries starting with /phrase/
// exist in the table.
virtual
bool
PrefixExists(Phrase const& phrase) const;
// LEGACY!
// The preferred method is to override GetTargetPhraseCollectionBatch().
// See class PhraseDictionaryMemory or PhraseDictionaryOnDisk for details

View File

@ -0,0 +1,48 @@
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
namespace Moses
{
// Initialize the counter to zero.
ThreadSafeCounter::
ThreadSafeCounter()
: ctr(0)
{ }
// Pre-increment: atomically (w.r.t. this mutex) bump and return the new value.
size_t
ThreadSafeCounter::
operator++()
{
boost::lock_guard<boost::mutex> guard(this->lock);
return ++ctr;
}
// Post-increment: bump under the lock, return the value before the bump.
size_t
ThreadSafeCounter::
operator++(int foo)
{
boost::lock_guard<boost::mutex> guard(this->lock);
return ctr++;
}
// Conversion to size_t. NOTE(review): this reads ctr WITHOUT taking the
// lock (the mutex cannot be locked in a const member here), so the value
// may be slightly stale under concurrent updates — confirm this is
// acceptable for its uses (it appears to be diagnostics only).
ThreadSafeCounter::
operator size_t() const
{
return ctr;
}
// Pre-decrement: decrement under the lock, return the new value.
size_t
ThreadSafeCounter::
operator--()
{
boost::lock_guard<boost::mutex> guard(this->lock);
return --ctr;
}
// Post-decrement: decrement under the lock, return the value before it.
size_t
ThreadSafeCounter::
operator--(int foo)
{
boost::lock_guard<boost::mutex> guard(this->lock);
return ctr--;
}
}

View File

@ -0,0 +1,21 @@
#pragma once
#include <boost/thread.hpp>
namespace Moses
{
// A size_t counter whose increment/decrement operators are serialized by
// an internal mutex, so the counter can be shared between threads.
// All four ++/-- forms return the usual pre/post values.
class ThreadSafeCounter
{
size_t ctr;          // current count, guarded by /lock/ for modification
boost::mutex lock;   // serializes the ++/-- operators
public:
ThreadSafeCounter(); // starts at 0
size_t operator++();
size_t operator++(int);
size_t operator--();
size_t operator--(int);
// Read access; see .cpp — the read is not performed under the lock.
operator size_t() const;
};
}

View File

@ -1,3 +1,5 @@
external-lib bz2 ;
exe mmlex-build :
mmlex-build.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
@ -7,6 +9,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mmlex-lookup :
mmlex-lookup.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mtt-count-words :
mtt-count-words.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
@ -34,6 +45,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mam2symal :
mam2symal.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe symal2mam :
symal2mam.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
@ -43,17 +63,47 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe custom-pt :
custom-pt.cc
#$(TOP)/moses/generic//generic
exe mam_verify :
mam_verify.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe custom-pt :
custom-pt.cc
$(TOP)/moses//moses
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)/util//kenutil
;
install $(PREFIX)/bin : mtt-build mtt-dump mtt-count-words symal2mam custom-pt mmlex-build ;
exe calc-coverage :
calc-coverage.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
install $(PREFIX)/bin :
mtt-build
mtt-dump
mtt-count-words
symal2mam
mam2symal
custom-pt
mmlex-build
mmlex-lookup
mam_verify
calc-coverage
;
fakelib mm : [ glob ug_*.cc tpt_*.cc ] ;

View File

@ -76,7 +76,7 @@ endef
testprogs = test-dynamic-im-tsa
programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
programs += mtt-count-words
programs += mtt-count-words calc-coverage
all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
@echo $^

View File

@ -0,0 +1,56 @@
#include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
#include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
#include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
// using namespace Moses;
using namespace ugdiss;
typedef L2R_Token<SimpleWordId> Token;
TokenIndex V;
sptr<vector<vector<Token> > > C(new vector<vector<Token> >());
// Read /fname/ (transparently decompressed by open_input_stream), map each
// line to a token sequence via the global vocabulary V, and append the
// sequence as a new sentence to the global corpus C.
void
add_file(string fname)
{
  filtering_istream in;
  open_input_stream(fname,in);
  for (string line; getline(in,line); )
    {
      C->push_back(vector<Token>());
      fill_token_seq(V,line,C->back());
    }
}
// Interactive coverage checker: builds an in-memory suffix array over the
// text file given as argv[1], then for each line read from stdin prints,
// for every start position i, the longest prefixes found in the corpus
// together with their approximate occurrence counts.
int
main(int argc, char* argv[])
{
  // Guard against a missing file argument; the original dereferenced
  // argv[1] unconditionally and crashed when run without arguments.
  if (argc < 2)
    {
      cout << "usage: " << argv[0] << " <text file>" << endl;
      return 1;
    }
  V.setDynamic(true); // allow the vocabulary to grow as new words appear
  add_file(argv[1]);
  sptr<imTtrack<Token> > T(new imTtrack<Token>(C));
  imTSA<Token> I(T,NULL,NULL); // suffix array over the in-memory track
  string line;
  while (getline(cin,line))
    {
      vector<Token> seq; fill_token_seq<Token>(V,line,seq);
      // For each start position, extend the match token by token for as
      // long as the suffix array still contains the growing prefix.
      for (size_t i = 0; i < seq.size(); ++i)
        {
          TSA<Token>::tree_iterator m(&I);
          cout << V[seq[i].id()];
          for (size_t k = i; k < seq.size() && m.extend(seq[k]); ++k)
            {
              cout << " ";
              if (k > i) cout << V[seq[k].id()] << " ";
              cout << "[" << m.approxOccurrenceCount() << "]";
            }
          cout << endl;
        }
    }
}

View File

@ -53,7 +53,7 @@ nbest_phrasepairs(uint64_t const pid1,
pstats const& ps,
vector<PhrasePair> & nbest)
{
boost::unordered_map<uint64_t,jstats>::const_iterator m;
pstats::trg_map_t::const_iterator m;
vector<size_t> idx(nbest.size());
size_t i=0;
for (m = ps.trg.begin();

View File

@ -0,0 +1,98 @@
// -*- c++ -*-
// (c) 2008-2010 Ulrich Germann
#include <boost/program_options.hpp>
#include <iomanip>
#include "tpt_typedefs.h"
#include "ug_mm_ttrack.h"
#include "tpt_tokenindex.h"
#include "ug_deptree.h"
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
using namespace std;
using namespace ugdiss;
namespace po = boost::program_options;
string mamfile;
vector<string> range;
typedef L2R_Token<Conll_Sform> Token;
mmTtrack<char> MAM;
bool with_sids;
// Parse the command line: one positional mam file, optional sentence
// ranges, and -n/--numbers to prefix each printed sentence with its id.
// Prints usage and exits when --help is given or no mam file is provided.
void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
po::options_description o("Options");
o.add_options()
("help,h", "print this message")
("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token")
;
// Positional arguments are declared as hidden options so they do not
// clutter the --help output.
po::options_description h("Hidden Options");
h.add_options()
("mamfile", po::value<string>(&mamfile), "mamfile")
("range", po::value<vector<string> >(&range), "range")
;
po::positional_options_description a;
a.add("mamfile",1);
a.add("range",-1);
po::store(po::command_line_parser(ac,av)
.options(h.add(o))
.positional(a)
.run(),vm);
po::notify(vm); // IMPORTANT
if (vm.count("help") || mamfile.empty())
{
cout << "usage:\n\t"
<< av[0] << " track name [<range>]\n"
<< endl;
cout << o << endl;
exit(0);
}
}
// Print the alignments of sentences [start,stop) from the memory-mapped
// alignment track MAM, one sentence per line, as "s-t" pairs (source and
// target word positions decoded from the binary-packed representation).
void
printRangeMAM(size_t start, size_t stop)
{
for (;start < stop; start++)
{
// size_t i = 0;
char const* p = MAM.sntStart(start);
char const* q = MAM.sntEnd(start);
if (with_sids) cout << start << " ";  // optionally prefix the sentence id
ushort s,t;
// Each alignment point is stored as two binary-encoded shorts.
while (p < q)
{
p = binread(p,s);
p = binread(p,t);
cout << s << "-" << t << " ";
}
cout << endl;
}
}
// Open the mam file named on the command line and print either all
// sentences or only the requested ranges (given as "first" or
// "first-last").
int
main(int argc, char*argv[])
{
interpret_args(argc,argv);
MAM.open(mamfile);
if (!range.size()) printRangeMAM(0, MAM.size());
else
{
for (size_t i = 0; i < range.size(); i++)
{
// Parse "first" or "first-last"; a single number means a 1-sentence range.
istringstream buf(range[i]);
size_t first,last; uchar c;
buf>>first;
if (buf.peek() == '-') buf>>c>>last;
else last = first;
// Ranges extending past the end of the track are silently skipped.
if (last < MAM.size())
printRangeMAM(first,last+1);
}
}
}

View File

@ -0,0 +1,120 @@
// -*- c++ -*-
// (c) 2008-2010 Ulrich Germann
#include <boost/program_options.hpp>
#include <iomanip>
#include "tpt_typedefs.h"
#include "ug_mm_ttrack.h"
#include "tpt_tokenindex.h"
#include "ug_deptree.h"
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
using namespace std;
using namespace ugdiss;
namespace po = boost::program_options;
typedef L2R_Token<Conll_Sform> Token;
string bname,L1,L2;
mmTtrack<char> MAM;
mmTtrack<Token> T1,T2;
bool inv;
vector<string> range;
// Parse the command line: positional base name, L1, L2, and optional
// sentence ranges. Prints usage and exits when --help is given or the
// required L2 argument is missing.
// NOTE(review): the --inv/-i switch is parsed into /inv/ but the flag is
// not consulted anywhere in this file's visible code — confirm intent.
void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
po::options_description o("Options");
o.add_options()
("help,h", "print this message")
("inv,i", po::bool_switch(&inv), "inverse")
;
// Positionals declared as hidden options so --help output stays clean.
po::options_description h("Hidden Options");
h.add_options()
("bname", po::value<string>(&bname), "base name")
("L1", po::value<string>(&L1), "L1")
("L2", po::value<string>(&L2), "L2")
("range", po::value<vector<string> >(&range), "range")
;
po::positional_options_description a;
a.add("bname",1);
a.add("L1",1);
a.add("L2",1);
a.add("range",-1);
po::store(po::command_line_parser(ac,av)
.options(h.add(o))
.positional(a)
.run(),vm);
po::notify(vm); // IMPORTANT
if (vm.count("help") || L2.empty())
{
cout << "usage:\n\t"
<< av[0] << " <base name> <L1> <L2> \n"
<< endl;
cout << o << endl;
exit(0);
}
}
// Validate alignments of sentences [start,stop): report any alignment
// point whose source or target position is outside the corresponding
// sentence length in tracks T1/T2. Returns the number of sentence pairs
// that have no alignment points at all.
size_t
check_range(size_t start, size_t stop)
{
size_t noAln = 0;
for (size_t sid = start; sid < stop; ++sid)
{
char const* p = MAM.sntStart(sid);
char const* q = MAM.sntEnd(sid);
size_t slen = T1.sntLen(sid);
size_t tlen = T2.sntLen(sid);
if (p == q) ++noAln;  // empty alignment record for this sentence pair
ushort s,t;
// Decode binary-packed (s,t) alignment pairs and bounds-check each one.
while (p < q)
{
p = binread(p,s);
p = binread(p,t);
if (s >= slen || t >= tlen)
{
cout << "alignment out of bounds in sentence " << sid << ": "
<< s << "-" << t << " in " << slen << ":" << tlen << "."
<< endl;
break;  // one report per sentence is enough; move on
}
}
}
return noAln;
}
// Open the alignment track and both token tracks for the given base
// name/language pair, verify that their sizes agree, then check the
// requested sentence ranges (default: everything) and report how many
// sentence pairs carry no alignment.
int
main(int argc, char*argv[])
{
interpret_args(argc,argv);
MAM.open(bname+L1+"-"+L2+".mam");
T1.open(bname+L1+".mct");
T2.open(bname+L2+".mct");
if (T1.size() != T2.size() || T1.size() != MAM.size())
{
cout << "Track sizes don't match!" << endl;
exit(1);
}
size_t noAln;
if (!range.size())
noAln = check_range(0, MAM.size());
else
{
noAln = 0;
for (size_t i = 0; i < range.size(); i++)
{
// Parse "first" or "first-last"; a bare number is a 1-sentence range.
istringstream buf(range[i]);
size_t first,last; uchar c;
buf>>first;
if (buf.peek() == '-') buf>>c>>last;
else last = first;
// Out-of-range requests are silently ignored.
if (last < MAM.size())
noAln += check_range(first,last+1);
}
}
cout << noAln << " sentence pairs without alignment" << endl;
}

View File

@ -0,0 +1,149 @@
// -*- c++ -*-
// Program to extract word cooccurrence counts from a memory-mapped
// word-aligned bitext stores the counts lexicon in the format for
// mm2dTable<uint32_t> (ug_mm_2d_table.h)
//
// (c) 2010-2012 Ulrich Germann
// to do: multi-threading
#include <queue>
#include <iomanip>
#include <vector>
#include <iterator>
#include <sstream>
#include <algorithm>
#include <boost/program_options.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
#include "ug_mm_2d_table.h"
#include "ug_mm_ttrack.h"
#include "ug_corpus_token.h"
using namespace std;
using namespace ugdiss;
using namespace boost::math;
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
typedef SimpleWordId Token;
// DECLARATIONS
void interpret_args(int ac, char* av[]);
string swrd,twrd,L1,L2,bname;
TokenIndex V1,V2;
LEX_t LEX;
// Print the full lexicon row for source-word id /r/: the word and its
// marginal count, followed by all cooccurring target words sorted by
// descending joint count, each with p(t|s), p(s|t), and raw counts.
void
lookup_source(ostream& out, id_type r)
{
// Copy the row's cells so they can be sorted without touching the mmap.
vector<LEX_t::Cell> foo(LEX[r].start,LEX[r].stop);
sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
out << V1[r] << " " << LEX.m1(r) << endl;
BOOST_FOREACH(LEX_t::Cell const& c, foo)
{
// joint/src-marginal, joint/trg-marginal, target word, joint/trg-marginal
out << setw(10) << float(c.val)/LEX.m1(r) << " "
<< setw(10) << float(c.val)/LEX.m2(c.id) << " "
<< V2[c.id] << " " << c.val << "/" << LEX.m2(c.id) << endl;
}
}
// Print the lexicon column for target-word id /c/: the word and its
// marginal count, followed by all cooccurring source words sorted by
// descending joint count. Because the table is stored row-major, this
// scans every row (O(numRows)) to collect the column's nonzero cells.
void
lookup_target(ostream& out, id_type c)
{
vector<LEX_t::Cell> foo;
LEX_t::Cell cell;
for (size_t r = 0; r < LEX.numRows; ++r)
{
size_t j = LEX[r][c];
if (j)  // keep only source words that actually cooccur with /c/
{
cell.id = r;
cell.val = j;
foo.push_back(cell);
}
}
sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
out << V2[c] << " " << LEX.m2(c) << endl;
BOOST_FOREACH(LEX_t::Cell const& r, foo)
{
// joint/trg-marginal, joint/src-marginal, source word, joint/src-marginal
out << setw(10) << float(r.val)/LEX.m2(c) << " "
<< setw(10) << float(r.val)/LEX.m1(r.id) << " "
<< V1[r.id] << " " << r.val << "/" << LEX.m1(r.id) << endl;
}
}
// Dump the entire lexicon: one lookup_source() listing per source-word
// row, followed by a trailing blank line.
void
dump(ostream& out)
{
  size_t row = 0;
  while (row < LEX.numRows)
    lookup_source(out, row++);
  out << endl;
}
// Look up lexical cooccurrence counts. Depending on which of -s/-t were
// given: both -> print the single cell; only source -> print its row;
// only target -> print its column; neither -> dump the whole table.
int
main(int argc, char* argv[])
{
interpret_args(argc,argv);
// Append a '.' to the base name unless it already ends in a path or
// extension separator.
// NOTE(review): *bname.rbegin() is undefined if bname is empty — confirm
// interpret_args/get_options guarantees a non-empty base name.
char c = *bname.rbegin();
if (c != '/' && c != '.') bname += '.';
V1.open(bname+L1+".tdx");
V2.open(bname+L2+".tdx");
LEX.open(bname+L1+"-"+L2+".lex");
cout.precision(2);
id_type swid = V1[swrd];
id_type twid = V2[twrd];
// NOTE(review): id 1 appears to be the unknown-word id here, i.e.
// "!= 1" means "the word was found in the vocabulary" — confirm against
// TokenIndex's unkId convention.
if (swid != 1 && twid != 1)
{
cout << swrd << " " << twrd << " "
<< LEX.m1(swid) << " / "
<< LEX[swid][twid] << " / "
<< LEX.m2(twid) << endl;
}
else if (swid != 1)
lookup_source(cout,swid);
else if (twid != 1)
lookup_target(cout,twid);
else
dump(cout);
}
void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
po::variables_map vm;
po::options_description o("Options");
po::options_description h("Hidden Options");
po::positional_options_description a;
o.add_options()
("help,h", "print this message")
("source,s",po::value<string>(&swrd),"source word")
("target,t",po::value<string>(&swrd),"target word")
;
h.add_options()
("bname", po::value<string>(&bname), "base name")
("L1", po::value<string>(&L1),"L1 tag")
("L2", po::value<string>(&L2),"L2 tag")
;
a.add("bname",1);
a.add("L1",1);
a.add("L2",1);
get_options(ac,av,h.add(o),a,vm,"cfg");
}

View File

@ -24,7 +24,7 @@ mmTtrack<SimpleWordId> MCT;
bool sform;
bool have_mtt, have_mct;
bool with_sids;
bool with_positions;
void
interpret_args(int ac, char* av[])
{
@ -34,6 +34,7 @@ interpret_args(int ac, char* av[])
("help,h", "print this message")
("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token")
("sform,s", po::bool_switch(&sform), "sform only")
("with-positions,p", po::bool_switch(&with_positions), "show word positions")
;
po::options_description h("Hidden Options");
@ -68,10 +69,10 @@ printRangeMTT(size_t start, size_t stop)
for (;start < stop; start++)
{
size_t i = 0;
Token const* t = MTT.sntStart(start);
Token const* s = MTT.sntStart(start);
Token const* e = MTT.sntEnd(start);
if (with_sids) cout << start << " ";
for (;t < e; ++t)
for (Token const* t = s; t < e; ++t)
{
#if 0
uchar const* x = reinterpret_cast<uchar const*>(t);
@ -91,7 +92,11 @@ printRangeMTT(size_t start, size_t stop)
cout << i+t->parent << " ";
cout << DT[t->dtype] << endl;
}
else cout << SF[t->id()] << " ";
else
{
if (with_positions) cout << t-s << ":";
cout << SF[t->id()] << " ";
}
}
cout << endl;
}
@ -102,10 +107,15 @@ printRangeMCT(size_t start, size_t stop)
{
for (;start < stop; start++)
{
SimpleWordId const* t = MCT.sntStart(start);
SimpleWordId const* s = MCT.sntStart(start);
SimpleWordId const* t = s;
SimpleWordId const* e = MCT.sntEnd(start);
if (with_sids) cout << start << " ";
while (t < e) cout << SF[(t++)->id()] << " ";
while (t < e)
{
if (with_positions) cout << t-s << ":";
cout << SF[(t++)->id()] << " ";
}
cout << endl;
}
}

View File

@ -21,8 +21,8 @@
#include <boost/program_options.hpp>
#include <boost/scoped_ptr.hpp>
#include "headers-base/util/exception.hh"
#include "headers-base/util/check.hh"
#include "util/exception.hh"
// #include "headers-base/util/check.hh"
// NOTE TO SELF:
/* Program to filter out sentences that GIZA will skip or truncate,

View File

@ -44,16 +44,14 @@ namespace ugdiss
file.open(fname);
if (!file.is_open())
{
cerr << "Error opening file " << fname << endl;
assert(0);
ostringstream msg;
msg << "TokenIndex::open: Error opening file '" << fname << "'.";
throw std::runtime_error(msg.str().c_str());
}
// cout << "file is open" << endl;
this->numTokens = *(reinterpret_cast<uint32_t const*>(file.data()));
unkId = *(reinterpret_cast<id_type const*>(file.data()+4));
// cout << "tokenindex.open: unkId=" << unkId << endl;
startIdx = reinterpret_cast<Entry const*>(file.data()+4+sizeof(id_type));
endIdx = startIdx + numTokens;
comp.base = reinterpret_cast<char const*>(endIdx);
@ -143,13 +141,10 @@ namespace ugdiss
TokenIndex::
operator[](id_type id) const
{
if (!ridx.size())
if (!ridx.size())
{
cerr << "FATAL ERROR: You need to call iniReverseIndex() "
<< "on the TokenIndex class before using operator[](id_type id)."
<< endl;
assert(0);
exit(1);
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
if (id < ridx.size())
return ridx[id];
@ -163,7 +158,11 @@ namespace ugdiss
TokenIndex::
iniReverseIndex()
{
if (!ridx.size()) ridx = reverseIndex();
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
}
@ -171,7 +170,11 @@ namespace ugdiss
TokenIndex::
operator[](id_type id)
{
if (!ridx.size()) ridx = reverseIndex();
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
if (id < ridx.size())
return ridx[id];
boost::lock_guard<boost::mutex> lk(*this->lock);
@ -184,7 +187,11 @@ namespace ugdiss
TokenIndex::
toString(vector<id_type> const& v)
{
if (!ridx.size()) ridx = reverseIndex();
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
ostringstream buf;
for (size_t i = 0; i < v.size(); i++)
buf << (i ? " " : "") << (*this)[v[i]];
@ -195,7 +202,11 @@ namespace ugdiss
TokenIndex::
toString(vector<id_type> const& v) const
{
assert (ridx.size());
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
ostringstream buf;
for (size_t i = 0; i < v.size(); i++)
buf << (i ? " " : "") << (*this)[v[i]];
@ -206,7 +217,11 @@ namespace ugdiss
TokenIndex::
toString(id_type const* start, id_type const* const stop)
{
if (!ridx.size()) ridx = reverseIndex();
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
ostringstream buf;
if (start < stop)
buf << (*this)[*start];
@ -219,7 +234,11 @@ namespace ugdiss
TokenIndex::
toString(id_type const* start, id_type const* const stop) const
{
assert (ridx.size());
if (!ridx.size())
{
boost::lock_guard<boost::mutex> lk(*this->lock);
if (!ridx.size()) ridx = reverseIndex();
}
ostringstream buf;
if (start < stop)
buf << (*this)[*start];

View File

@ -28,7 +28,7 @@ namespace ugdiss
class TokenIndex
{
/** Reverse index: maps from ID to char const* */
vector<char const*> ridx;
mutable vector<char const*> ridx;
/** Label for the UNK token */
string unkLabel;
id_type unkId,numTokens;
@ -164,5 +164,12 @@ namespace ugdiss
write_tokenindex_to_disk(tok,ofile,unkToken);
}
template<typename Token>
void
fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest)
{
istringstream buf(line); string w;
while (buf>>w) dest.push_back(Token(V[w]));
}
}
#endif

View File

@ -10,6 +10,9 @@ namespace Moses
{
namespace bitext
{
ThreadSafeCounter pstats::active;
pstats::
pstats()
: raw_cnt (0)
@ -20,6 +23,14 @@ namespace Moses
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
// if (++active%5 == 0)
// cerr << size_t(active) << " active pstats at " << __FILE__ << ":" << __LINE__ << endl;
}
pstats::
~pstats()
{
--active;
}
void
@ -49,16 +60,13 @@ namespace Moses
uint32_t fwd_o,
uint32_t bwd_o)
{
this->lock.lock();
boost::lock_guard<boost::mutex> guard(this->lock);
jstats& entry = this->trg[pid];
this->lock.unlock();
entry.add(w,a,cnt2,fwd_o,bwd_o);
if (this->good < entry.rcnt())
{
this->lock.lock();
return false;
// UTIL_THROW(util::Exception, "more joint counts than good counts!"
// << entry.rcnt() << "/" << this->good);
UTIL_THROW(util::Exception, "more joint counts than good counts:"
<< entry.rcnt() << "/" << this->good << "!");
}
return true;
}
@ -338,6 +346,10 @@ namespace Moses
typedef L2R_Token<SimpleWordId> TKN;
assert(s1.size() == s2.size() && s1.size() == aln.size());
#ifndef NDEBUG
size_t first_new_snt = this->T1 ? this->T1->size() : 0;
#endif
sptr<imBitext<TKN> > ret;
{
lock_guard<mutex> guard(this->lock);
@ -346,30 +358,58 @@ namespace Moses
// we add the sentences in separate threads (so it's faster)
boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
thread1.join(); // for debugging
// thread1.join(); // for debugging
boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
BOOST_FOREACH(string const& a, aln)
{
istringstream ibuf(a);
ostringstream obuf;
uint32_t row,col; char c;
while (ibuf>>row>>c>>col)
while (ibuf >> row >> c >> col)
{
assert(c == '-');
binwrite(obuf,row);
binwrite(obuf,col);
}
char const* x = obuf.str().c_str();
vector<char> v(x,x+obuf.str().size());
// important: DO NOT replace the two lines below this comment by
// char const* x = obuf.str().c_str(), as the memory x is pointing
// to is freed immediately upon deconstruction of the string object.
string foo = obuf.str();
char const* x = foo.c_str();
vector<char> v(x,x+foo.size());
ret->myTx = append(ret->myTx, v);
}
thread1.join();
thread2.join();
ret->Tx = ret->myTx;
ret->T1 = ret->myT1;
ret->T2 = ret->myT2;
ret->I1 = ret->myI1;
ret->I2 = ret->myI2;
#ifndef NDEBUG
// sanity check
for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
{
size_t slen1 = ret->T1->sntLen(i);
size_t slen2 = ret->T2->sntLen(i);
char const* p = ret->Tx->sntStart(i);
char const* q = ret->Tx->sntEnd(i);
size_t k;
while (p < q)
{
p = binread(p,k);
assert(p);
assert(p < q);
assert(k < slen1);
p = binread(p,k);
assert(p);
assert(k < slen2);
}
}
#endif
return ret;
}

View File

@ -29,10 +29,12 @@
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
#include "moses/Util.h"
#include "moses/StaticData.h"
#include "headers-base/util/exception.hh"
#include "headers-base/util/check.hh"
#include "util/exception.hh"
// #include "util/check.hh"
#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"
@ -44,10 +46,12 @@
#include "tpt_pickler.h"
#include "ug_lexical_phrase_scorer2.h"
#define PSTATS_CACHE_THRESHOLD 50
using namespace ugdiss;
using namespace std;
namespace Moses {
class Mmsapt;
namespace bitext
{
using namespace ugdiss;
@ -122,6 +126,7 @@ namespace Moses {
struct
pstats
{
static ThreadSafeCounter active;
boost::mutex lock; // for parallel gathering of stats
boost::condition_variable ready; // consumers can wait for this data structure to be ready.
@ -133,8 +138,11 @@ namespace Moses {
uint32_t ofwd[po_other+1], obwd[po_other+1];
typename boost::unordered_map<uint64_t, jstats> trg;
pstats();
// typedef typename boost::unordered_map<uint64_t, jstats> trg_map_t;
typedef typename std::map<uint64_t, jstats> trg_map_t;
trg_map_t trg;
pstats();
~pstats();
void release();
void register_worker();
size_t count_workers() { return in_progress; }
@ -192,8 +200,8 @@ namespace Moses {
int index;
int num_feats;
public:
virtual
virtual
void
operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest)
const = 0;
@ -212,6 +220,7 @@ namespace Moses {
PScorePfwd : public PhraseScorer<Token>
{
float conf;
int denom;
public:
PScorePfwd()
{
@ -219,9 +228,10 @@ namespace Moses {
}
int
init(int const i, float const c)
init(int const i, float const c, int d=0)
{
conf = c;
conf = c;
denom = d;
this->index = i;
return i + this->num_feats;
}
@ -234,10 +244,20 @@ namespace Moses {
if (!dest) dest = &pp.fvals;
if (pp.joint > pp.good1)
{
cerr << bt.toString(pp.p1,0) << " ::: " << bt.toString(pp.p2,1) << endl;
cerr << pp.joint << "/" << pp.good1 << "/" << pp.raw2 << endl;
cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
}
switch (denom)
{
case 0:
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
break;
case 1:
(*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf));
break;
case 2:
(*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf));
}
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
}
};
@ -294,6 +314,7 @@ namespace Moses {
parse_pid(pp.p2, sid2, off2, len2);
#if 0
cout << len1 << " " << len2 << endl;
Token const* t1 = bt.T1->sntStart(sid1);
for (size_t i = off1; i < off1 + len1; ++i)
cout << (*bt.V1)[t1[i].id()] << " ";
@ -307,6 +328,7 @@ namespace Moses {
BOOST_FOREACH (int a, pp.aln)
cout << a << " " ;
cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
#endif
scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
bt.T2->sntStart(sid2)+off2,0,len2,
@ -371,8 +393,10 @@ namespace Moses {
template<typename TKN>
class Bitext
{
friend class Moses::Mmsapt;
protected:
mutable boost::mutex lock;
mutable boost::mutex cache_lock;
public:
typedef TKN Token;
typedef typename TSA<Token>::tree_iterator iter;
@ -409,14 +433,22 @@ namespace Moses {
bitvector* full_alignment,
bool const flip) const;
mutable boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2;
#if 1
typedef boost::unordered_map<uint64_t,sptr<pstats> > pcache_t;
#else
typedef map<uint64_t,sptr<pstats> > pcache_t;
#endif
mutable pcache_t cache1,cache2;
protected:
size_t default_sample_size;
size_t num_workers;
size_t m_pstats_cache_threshold;
private:
sptr<pstats>
prep2(iter const& phrase, size_t const max_sample) const;
public:
Bitext(size_t const max_sample=5000);
Bitext(size_t const max_sample =1000,
size_t const xnum_workers =16);
Bitext(Ttrack<Token>* const t1,
Ttrack<Token>* const t2,
@ -425,7 +457,8 @@ namespace Moses {
TokenIndex* const v2,
TSA<Token>* const i1,
TSA<Token>* const i2,
size_t const max_sample=5000);
size_t const max_sample=1000,
size_t const xnum_workers=16);
virtual void open(string const base, string const L1, string const L2) = 0;
@ -433,10 +466,13 @@ namespace Moses {
sptr<pstats> lookup(iter const& phrase) const;
sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
void prep(iter const& phrase) const;
void setDefaultSampleSize(size_t const max_samples);
void setDefaultSampleSize(size_t const max_samples);
size_t getDefaultSampleSize() const;
string toString(uint64_t pid, int isL2) const;
virtual size_t revision() const { return 0; }
};
template<typename Token>
@ -471,6 +507,7 @@ namespace Moses {
Bitext<Token>::
setDefaultSampleSize(size_t const max_samples)
{
boost::lock_guard<boost::mutex> guard(this->lock);
if (max_samples != default_sample_size)
{
cache1.clear();
@ -481,8 +518,10 @@ namespace Moses {
template<typename Token>
Bitext<Token>::
Bitext(size_t const max_sample)
Bitext(size_t const max_sample, size_t const xnum_workers)
: default_sample_size(max_sample)
, num_workers(xnum_workers)
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
{ }
template<typename Token>
@ -494,9 +533,12 @@ namespace Moses {
TokenIndex* const v2,
TSA<Token>* const i1,
TSA<Token>* const i2,
size_t const max_sample)
size_t const max_sample,
size_t const xnum_workers)
: Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
, default_sample_size(max_sample)
, num_workers(xnum_workers)
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
{ }
// agenda is a pool of jobs
@ -508,6 +550,7 @@ namespace Moses {
boost::mutex lock;
class job
{
static ThreadSafeCounter active;
boost::mutex lock;
friend class agenda;
public:
@ -525,8 +568,9 @@ namespace Moses {
bool done() const;
job(typename TSA<Token>::tree_iterator const& m,
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd);
~job();
};
public:
class
worker
{
@ -535,7 +579,7 @@ namespace Moses {
worker(agenda& a) : ag(a) {}
void operator()();
};
private:
list<sptr<job> > joblist;
vector<sptr<boost::thread> > workers;
bool shutdown;
@ -639,7 +683,7 @@ namespace Moses {
while (j->step(sid,offset))
{
aln.clear();
int po_fwd=5,po_bwd=5;
int po_fwd=po_other,po_bwd=po_other;
if (j->fwd)
{
if (!ag.bt.find_trg_phr_bounds
@ -669,20 +713,25 @@ namespace Moses {
// assert(b);
for (size_t i = e1; i <= e2; ++i)
{
if (!j->stats->add(b->getPid(),sample_weight,aln,
b->approxOccurrenceCount(),
po_fwd,po_bwd))
if (! j->stats->add(b->getPid(),sample_weight,aln,
b->approxOccurrenceCount(),
po_fwd,po_bwd))
{
cerr << "FATAL ERROR AT " << __FILE__
<< ":" << __LINE__ << endl;
assert(0);
ostringstream msg;
for (size_t z = 0; z < j->len; ++z)
{
id_type tid = ag.bt.T1->sntStart(sid)[offset+z].id();
cout << (*ag.bt.V1)[tid] << " ";
cerr << (*ag.bt.V1)[tid] << " ";
}
cout << endl;
cerr << endl;
for (size_t z = s; z <= i; ++z)
cout << (*ag.bt.V2)[(o+z)->id()] << " ";
cout << endl;
exit(1);
cerr << (*ag.bt.V2)[(o+z)->id()] << " ";
cerr << endl;
assert(0);
UTIL_THROW(util::Exception,"Error in sampling.");
}
if (i < e2)
{
@ -705,6 +754,16 @@ namespace Moses {
}
}
template<typename Token>
Bitext<Token>::
agenda::
job::
~job()
{
if (stats) stats.reset();
--active;
}
template<typename Token>
Bitext<Token>::
agenda::
@ -722,6 +781,9 @@ namespace Moses {
{
stats.reset(new pstats());
stats->raw_cnt = m.approxOccurrenceCount();
// if (++active%5 == 0)
++active;
// cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
}
template<typename Token>
@ -731,12 +793,12 @@ namespace Moses {
add_job(typename TSA<Token>::tree_iterator const& phrase,
size_t const max_samples)
{
boost::unique_lock<boost::mutex> lk(this->lock);
static boost::posix_time::time_duration nodelay(0,0,0,0);
bool fwd = phrase.root == bt.I1.get();
sptr<job> j(new job(phrase, fwd ? bt.I2 : bt.I1, max_samples, fwd));
sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd));
j->stats->register_worker();
boost::unique_lock<boost::mutex> lk(this->lock);
joblist.push_back(j);
if (joblist.size() == 1)
{
@ -770,7 +832,6 @@ namespace Moses {
// cerr << workers.size() << " workers on record" << endl;
sptr<job> ret;
if (this->shutdown) return ret;
// add_workers(0);
boost::unique_lock<boost::mutex> lock(this->lock);
if (this->doomed)
{
@ -840,7 +901,8 @@ namespace Moses {
i2.open(base+L2+".sfa", this->T2);
assert(this->T1->size() == this->T2->size());
}
template<typename TKN>
class imBitext : public Bitext<TKN>
{
@ -849,7 +911,9 @@ namespace Moses {
sptr<imTtrack<TKN> > myT2;
sptr<imTSA<TKN> > myI1;
sptr<imTSA<TKN> > myI2;
static ThreadSafeCounter my_revision;
public:
size_t revision() const { return my_revision; }
void open(string const base, string const L1, string L2);
imBitext(sptr<TokenIndex> const& V1,
sptr<TokenIndex> const& V2,
@ -867,6 +931,10 @@ namespace Moses {
};
template<typename TKN>
ThreadSafeCounter
imBitext<TKN>::my_revision;
template<typename TKN>
imBitext<TKN>::
imBitext(size_t max_sample)
@ -876,6 +944,7 @@ namespace Moses {
this->V2.reset(new TokenIndex());
this->V1->setDynamic(true);
this->V2->setDynamic(true);
++my_revision;
}
template<typename TKN>
@ -889,6 +958,7 @@ namespace Moses {
this->V2 = v2;
this->V1->setDynamic(true);
this->V2->setDynamic(true);
++my_revision;
}
@ -909,6 +979,8 @@ namespace Moses {
this->V1 = other.V1;
this->V2 = other.V2;
this->default_sample_size = other.default_sample_size;
this->num_workers = other.num_workers;
++my_revision;
}
template<typename TKN> class snt_adder;
@ -1050,7 +1122,6 @@ namespace Moses {
t1.open(base+L1+".mct");
t2.open(base+L2+".mct");
tx.open(base+L1+"-"+L2+".mam");
cerr << "DADA" << endl;
this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
@ -1063,19 +1134,27 @@ namespace Moses {
template<typename Token>
bool
Bitext<Token>::
find_trg_phr_bounds(size_t const sid, size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
int & po_fwd, int & po_bwd,
vector<uchar>* core_alignment,
bitvector* full_alignment,
bool const flip) const
find_trg_phr_bounds
(size_t const sid,
size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
int & po_fwd, int & po_bwd,
vector<uchar>* core_alignment, bitvector* full_alignment,
bool const flip) const
{
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
// a word on the core_alignment:
// since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 < e2, respectively)
// are be definition unaligned, we store only the core alignment in *core_alignment
// it is up to the calling function to shift alignment points over for start positions
// of extracted phrases that start with a fringe word
//
// since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1
// < e2, respectively) are be definition unaligned, we store
// only the core alignment in *core_alignment it is up to the
// calling function to shift alignment points over for start
// positions of extracted phrases that start with a fringe word
assert(T1);
assert(T2);
assert(Tx);
bitvector forbidden((flip ? T1 : T2)->sntLen(sid));
size_t slen1 = (*T1).sntLen(sid);
size_t slen2 = (*T2).sntLen(sid);
@ -1092,12 +1171,22 @@ namespace Moses {
char const* p = Tx->sntStart(sid);
char const* x = Tx->sntEnd(sid);
// cerr << "flip = " << flip << " " << __FILE__ << ":" << __LINE__ << endl;
while (p < x)
{
if (flip) { p = binread(p,trg); assert(p<x); p = binread(p,src); }
else { p = binread(p,src); assert(p<x); p = binread(p,trg); }
// cerr << sid << " " << src << "/" << slen1 << " " << trg << "/"
// << slen2 << endl;
if (src >= slen1 || trg >= slen2)
{
ostringstream buf;
buf << "Alignment range error at sentence " << sid << "!" << endl
<< src << "/" << slen1 << " " << trg << "/" << slen2 << endl;
cerr << buf.str() << endl;
UTIL_THROW(util::Exception, buf.str().c_str());
}
if (src < start || src >= stop)
forbidden.set(trg);
else
@ -1214,29 +1303,44 @@ namespace Moses {
Bitext<Token>::
prep2(iter const& phrase, size_t const max_sample) const
{
// boost::lock_guard<boost::mutex>(this->lock);
boost::lock_guard<boost::mutex> guard(this->lock);
if (!ag)
{
ag.reset(new agenda(*this));
// ag->add_workers(1);
ag->add_workers(20);
if (this->num_workers > 1)
ag->add_workers(this->num_workers);
}
typedef boost::unordered_map<uint64_t,sptr<pstats> > pcache_t;
sptr<pstats> ret;
if (max_sample == this->default_sample_size)
#if 1
// use pcache only for plain sentence input
if (StaticData::Instance().GetInputType() == SentenceInput &&
max_sample == this->default_sample_size &&
phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
{
uint64_t pid = phrase.getPid();
pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2);
pcache_t::value_type entry(pid,sptr<pstats>());
// need to test what a good caching threshold is
// is caching here the cause of the apparent memory leak in
// confusion network decoding ????
uint64_t pid = phrase.getPid();
pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2);
pcache_t::value_type entry(pid,sptr<pstats>());
pair<pcache_t::iterator,bool> foo;
{
// boost::lock_guard<boost::mutex>(this->lock);
foo = cache.emplace(entry);
}
if (foo.second) foo.first->second = ag->add_job(phrase, max_sample);
foo = cache.insert(entry);
if (foo.second)
{
// cerr << "NEW FREQUENT PHRASE: "
// << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount()
// << " at " << __FILE__ << ":" << __LINE__ << endl;
foo.first->second = ag->add_job(phrase, max_sample);
assert(foo.first->second);
}
assert(foo.first->second);
ret = foo.first->second;
}
else ret = ag->add_job(phrase, max_sample);
assert(ret);
}
else
#endif
ret = ag->add_job(phrase, max_sample);
assert(ret);
return ret;
}
@ -1245,13 +1349,17 @@ namespace Moses {
Bitext<Token>::
lookup(iter const& phrase) const
{
boost::lock_guard<boost::mutex>(this->lock);
sptr<pstats> ret;
ret = prep2(phrase, this->default_sample_size);
sptr<pstats> ret = prep2(phrase, this->default_sample_size);
assert(ret);
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
ret->ready.wait(lock);
boost::lock_guard<boost::mutex> guard(this->lock);
if (this->num_workers <= 1)
typename agenda::worker(*this->ag)();
else
{
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
ret->ready.wait(lock);
}
return ret;
}
@ -1260,11 +1368,16 @@ namespace Moses {
Bitext<Token>::
lookup(iter const& phrase, size_t const max_sample) const
{
boost::lock_guard<boost::mutex>(this->lock);
sptr<pstats> ret = prep2(phrase, max_sample);
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
ret->ready.wait(lock);
boost::lock_guard<boost::mutex> guard(this->lock);
if (this->num_workers <= 1)
typename agenda::worker(*this->ag)();
else
{
boost::unique_lock<boost::mutex> lock(ret->lock);
while (ret->in_progress)
ret->ready.wait(lock);
}
return ret;
}
@ -1297,6 +1410,12 @@ namespace Moses {
return (max_samples && stats->good >= max_samples) || next == stop;
}
template<typename TKN>
ThreadSafeCounter
Bitext<TKN>::
agenda::
job::active;
} // end of namespace bitext
} // end of namespace moses
#endif

View File

@ -151,6 +151,7 @@ namespace ugdiss
filter2.set();
filter = &filter2;
}
assert(filter);
// In the first iteration over the corpus, we obtain word counts.
// They allows us to
// a. allocate the exact amount of memory we need
@ -235,9 +236,10 @@ namespace ugdiss
imTSA<TOKEN>::
getLowerBound(id_type id) const
{
if (id >= this->index.size())
if (id >= this->index.size())
return NULL;
return reinterpret_cast<char const*>(&(this->sufa[index[id]]));
assert(index[id] <= this->sufa.size());
return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
}
template<typename TOKEN>
@ -245,9 +247,10 @@ namespace ugdiss
imTSA<TOKEN>::
getUpperBound(id_type id) const
{
if (id+1 >= this->index.size())
if (++id >= this->index.size())
return NULL;
return reinterpret_cast<char const*>(&(this->sufa[index[id+1]]));
assert(index[id] <= this->sufa.size());
return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
}
template<typename TOKEN>
@ -255,6 +258,8 @@ namespace ugdiss
imTSA<TOKEN>::
readSid(char const* p, char const* q, id_type& sid) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
sid = reinterpret_cast<cpos const*>(p)->sid;
return p;
}
@ -264,6 +269,8 @@ namespace ugdiss
imTSA<TOKEN>::
readSid(char const* p, char const* q, uint64_t& sid) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
sid = reinterpret_cast<cpos const*>(p)->sid;
return p;
}
@ -273,6 +280,8 @@ namespace ugdiss
imTSA<TOKEN>::
readOffset(char const* p, char const* q, uint16_t& offset) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
offset = reinterpret_cast<cpos const*>(p)->offset;
return p+sizeof(cpos);
}
@ -282,6 +291,8 @@ namespace ugdiss
imTSA<TOKEN>::
readOffset(char const* p, char const* q, uint64_t& offset) const
{
assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
offset = reinterpret_cast<cpos const*>(p)->offset;
return p+sizeof(cpos);
}
@ -363,6 +374,7 @@ namespace ugdiss
size_t n = 0;
BOOST_FOREACH(id_type sid, newsids)
{
assert(sid < crp->size());
for (size_t o = 0; o < (*crp)[sid].size(); ++o, ++n)
{ nidx[n].offset = o; nidx[n].sid = sid; }
}
@ -379,20 +391,22 @@ namespace ugdiss
size_t i = 0;
typename vector<cpos>::iterator k = this->sufa.begin();
this->index[0] = 0;
// cerr << newToks << " new items at "
// << __FILE__ << ":" << __LINE__ << endl;
for (size_t n = 0; n < nidx.size();)
{
id_type nid = crp->getToken(nidx[n])->id();
assert(nid >= i);
while (i < nid)
{
this->index[i] = k - this->sufa.begin();
if (++i < prior.index.size() && prior.index[i-1] < prior.index[i])
{
k = copy(prior.sufa.begin() + prior.index[i-1],
prior.sufa.begin() + prior.index[i], k);
}
this->index[i] = k - prior.sufa.begin();
}
this->index[i] = k - this->sufa.begin();
if (++i < prior.index.size() && prior.index[i] > prior.index[i-1])
{
size_t j = prior.index[i-1];
@ -418,6 +432,7 @@ namespace ugdiss
}
this->index[i] = k - this->sufa.begin();
}
this->index[i] = k - this->sufa.begin();
while (++i < this->index.size())
{
if (i < prior.index.size() && prior.index[i-1] < prior.index[i])
@ -425,6 +440,25 @@ namespace ugdiss
prior.sufa.begin() + prior.index[i], k);
this->index[i] = k - this->sufa.begin();
}
#if 0
// sanity checks
assert(this->sufa.size() == this->index.back());
BOOST_FOREACH(cpos const& x, this->sufa)
{
assert(x.sid < this->corpusSize);
assert(x.offset < this->corpus->sntLen(x.sid));
}
for (size_t i = 1; i < index.size(); ++i)
{
assert(index[i-1] <= index[i]);
assert(index[i] <= sufa.size());
for (size_t k = index[i-1]; k < index[i]; ++k)
assert(this->corpus->getToken(sufa[k])->id() == i-1);
}
assert(index[0] == 0);
assert(this->startArray == reinterpret_cast<char const*>(&(*this->sufa.begin())));
assert(this->endArray == reinterpret_cast<char const*>(&(*this->sufa.end())));
#endif
}
}

View File

@ -145,9 +145,9 @@ namespace ugdiss
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
{
myData = d;
numTokens = 0;
BOOST_FOREACH(vector<Token> const& v, d)
numTokens += v.size();
numToks = 0;
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
template<typename Token>

View File

@ -9,6 +9,7 @@
#include "tpt_typedefs.h"
#include "tpt_pickler.h"
#include "ug_typedefs.h"
#include "util/exception.hh"
namespace bio=boost::iostreams;
namespace ugdiss
{
@ -113,16 +114,21 @@ namespace ugdiss
// cout << "opening " << fname << " at " << __FILE__ << ":" << __LINE__ << endl;
if (access(fname.c_str(),R_OK))
{
cerr << "[" << __FILE__ << ":" << __LINE__ <<"] FATAL ERROR: "
<< "file '" << fname << " is not accessible." << endl;
exit(1);
ostringstream msg;
msg << "[" << __FILE__ << ":" << __LINE__ <<"] FATAL ERROR: "
<< "file '" << fname << " is not accessible." << endl;
string foo = msg.str();
UTIL_THROW(util::Exception,foo.c_str());
}
file.reset(new bio::mapped_file());
file->open(fname,ios::in|ios::out);
if (!file->is_open())
{
cerr << "Error opening file " << fname << endl;
assert(0);
ostringstream msg;
msg << "[" << __FILE__ << ":" << __LINE__ <<"] FATAL ERROR: "
<< "Opening file '" << fname << "' failed." << endl;
string foo = msg.str();
UTIL_THROW(util::Exception,foo.c_str());
}
char* p = file->data();
filepos_type offset = *reinterpret_cast<filepos_type*>(p);

View File

@ -59,6 +59,7 @@ namespace ugdiss
// TSA_tree_iterator(TSA_tree_iterator const& other);
TSA_tree_iterator(TSA<Token> const* s);
TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
@ -312,6 +313,17 @@ namespace ugdiss
: root(s)
{};
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator
(TSA<Token> const* r,
id_type const* s,
size_t const len)
: root(r)
{
for (id_type const* e = s + len; s < e && extend(*s); ++s);
};
// ---------------------------------------------------------------------------
#if 0

View File

@ -1,12 +1,26 @@
#include "mmsapt.h"
#include <boost/foreach.hpp>
#include <boost/tokenizer.hpp>
#include <algorithm>
namespace Moses
{
using namespace bitext;
using namespace std;
using namespace boost;
void
fillIdSeq(Phrase const& mophrase, size_t const ifactor,
TokenIndex const& V, vector<id_type>& dest)
{
dest.resize(mophrase.GetSize());
for (size_t i = 0; i < mophrase.GetSize(); ++i)
{
Factor const* f = mophrase.GetFactor(i,ifactor);
dest[i] = V[f->ToString()];
}
}
void
parseLine(string const& line, map<string,string> & params)
@ -23,6 +37,7 @@ namespace Moses
params[t.substr(i,j)] = t.substr(k);
}
}
#if 0
Mmsapt::
Mmsapt(string const& description, string const& line)
@ -35,7 +50,7 @@ namespace Moses
Mmsapt::
Mmsapt(string const& line)
// : PhraseDictionary("Mmsapt",line), ofactor(1,0)
: PhraseDictionary(line), ofactor(1,0)
: PhraseDictionary(line), ofactor(1,0), m_tpc_ctr(0)
{
this->init(line);
}
@ -53,36 +68,88 @@ namespace Moses
assert(L1.size());
assert(L2.size());
map<string,string>::const_iterator m;
m = param.find("pfwd_denom");
m_pfwd_denom = m != param.end() ? m->second[0] : 's';
m = param.find("smooth");
lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
m_lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
m = param.find("max-samples");
default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
m = param.find("workers");
m_workers = m != param.end() ? atoi(m->second.c_str()) : 8;
m_workers = min(m_workers,24UL);
m = param.find("cache-size");
m_history.reserve(m != param.end()
? max(1000,atoi(m->second.c_str()))
: 10000);
this->m_numScoreComponents = atoi(param["num-features"].c_str());
// num_features = 0;
m = param.find("ifactor");
input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
poolCounts = true;
m = param.find("extra");
if (m != param.end())
{
extra_data = m->second;
// cerr << "have extra data" << endl;
}
// keeps track of the most frequently used target phrase collections
// (to keep them cached even when not actively in use)
}
void
Mmsapt::
load_extra_data(string bname)
{
// TO DO: ADD CHECKS FOR ROBUSTNESS
// - file existence?
// - same number of lines?
// - sane word alignment?
vector<string> text1,text2,symal;
string line;
filtering_istream in1,in2,ina;
open_input_stream(bname+L1+".txt.gz",in1);
open_input_stream(bname+L2+".txt.gz",in2);
open_input_stream(bname+L1+"-"+L2+".symal.gz",ina);
while(getline(in1,line)) text1.push_back(line);
while(getline(in2,line)) text2.push_back(line);
while(getline(ina,line)) symal.push_back(line);
lock_guard<mutex> guard(this->lock);
btdyn = btdyn->add(text1,text2,symal);
assert(btdyn);
// cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl;
}
void
Mmsapt::
Load()
{
btfix.num_workers = this->m_workers;
btfix.open(bname, L1, L2);
btfix.setDefaultSampleSize(m_default_sample_size);
size_t num_feats;
// TO DO: should we use different lbop parameters
// for the relative-frequency based features?
num_feats = calc_pfwd_fix.init(0,lbop_parameter);
num_feats = calc_pbwd_fix.init(num_feats,lbop_parameter);
num_feats = calc_pfwd_fix.init(0,m_lbop_parameter);
num_feats = calc_pbwd_fix.init(num_feats,m_lbop_parameter);
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
num_feats = apply_pp.init(num_feats);
if (num_feats < this->m_numScoreComponents)
{
poolCounts = false;
num_feats = calc_pfwd_dyn.init(num_feats,lbop_parameter);
num_feats = calc_pbwd_dyn.init(num_feats,lbop_parameter);
num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter);
num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
}
btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2));
if (num_feats != this->m_numScoreComponents)
{
ostringstream buf;
@ -94,6 +161,11 @@ namespace Moses
// cerr << "MMSAPT provides " << num_feats << " features at "
// << __FILE__ << ":" << __LINE__ << endl;
btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2,m_default_sample_size));
btdyn->num_workers = this->m_workers;
if (extra_data.size()) load_extra_data(extra_data);
// currently not used
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
typedef LexicalPhraseScorer2<Token>::table_t::Cell cell_t;
wlex21.resize(COOC.numCols);
@ -128,7 +200,9 @@ namespace Moses
Token const* x = bt.T2->sntStart(sid) + off;
for (uint32_t k = 0; k < len; ++k)
{
// cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl;
StringPiece wrd = (*bt.V2)[x[k].id()];
assert(off+len <= bt.T2->sntLen(sid));
w.CreateFromString(Output,ofactor,wrd,false);
tp->AddWord(w);
}
@ -151,7 +225,7 @@ namespace Moses
PhrasePair pp;
pp.init(pid1, stats, this->m_numScoreComponents);
apply_pp(bt,pp);
boost::unordered_map<uint64_t,jstats>::const_iterator t;
pstats::trg_map_t::const_iterator t;
for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
{
pp.update(t->first,t->second);
@ -178,14 +252,14 @@ namespace Moses
if (statsa && statsb)
pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
else if (statsa)
pp.init(pid1b, *statsa, this->m_numScoreComponents);
pp.init(pid1a, *statsa, this->m_numScoreComponents);
else if (statsb)
pp.init(pid1b, *statsb, this->m_numScoreComponents);
else return false; // throw "no stats for pooling available!";
apply_pp(bta,pp);
boost::unordered_map<uint64_t,jstats>::const_iterator b;
boost::unordered_map<uint64_t,jstats>::iterator a;
pstats::trg_map_t::const_iterator b;
pstats::trg_map_t::iterator a;
if (statsb)
{
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
@ -222,7 +296,7 @@ namespace Moses
parse_pid(a->first, sid, off, len);
if (btb.T2)
{
Token const* x = btb.T2->sntStart(sid) + off;
Token const* x = bta.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len);
if (m.size() == len)
pp.update(a->first,m.approxOccurrenceCount(),a->second);
@ -258,8 +332,8 @@ namespace Moses
Word w;
if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
boost::unordered_map<uint64_t,jstats>::const_iterator b;
boost::unordered_map<uint64_t,jstats>::iterator a;
pstats::trg_map_t::const_iterator b;
pstats::trg_map_t::iterator a;
if (statsb)
{
pool.init(pid1b,*statsb,0);
@ -411,13 +485,35 @@ namespace Moses
// }
// }
Mmsapt::
TargetPhraseCollectionWrapper::
TargetPhraseCollectionWrapper(size_t r, uint64_t k)
: revision(r), key(k), refCount(0), idx(-1)
{ }
Mmsapt::
TargetPhraseCollectionWrapper::
~TargetPhraseCollectionWrapper()
{
assert(this->refCount == 0);
}
// This is not the most efficient way of phrase lookup!
TargetPhraseCollection const*
Mmsapt::
GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
TargetPhraseCollection* ret = new TargetPhraseCollection();
// map from Moses Phrase to internal id sequence
vector<id_type> sphrase;
fillIdSeq(src,input_factor,*btfix.V1,sphrase);
if (sphrase.size() == 0) return NULL;
// lookup in static bitext
TSA<Token>::tree_iterator mfix(btfix.I1.get(),&sphrase[0],sphrase.size());
// lookup in dynamic bitext
// Reserve a local copy of the dynamic bitext in its current form. /btdyn/
// is set to a new copy of the dynamic bitext every time a sentence pair
// is added. /dyn/ keeps the old bitext around as long as we need it.
@ -426,57 +522,77 @@ namespace Moses
boost::lock_guard<boost::mutex> guard(this->lock);
dyn = btdyn;
}
vector<id_type> sphrase(src.GetSize());
for (size_t i = 0; i < src.GetSize(); ++i)
{
Factor const* f = src.GetFactor(i,input_factor);
id_type wid = (*btfix.V1)[f->ToString()];
sphrase[i] = wid;
}
TSA<Token>::tree_iterator mfix(btfix.I1.get()), mdyn(dyn->I1.get());
for (size_t i = 0; mfix.size() == i && i < sphrase.size(); ++i)
mfix.extend(sphrase[i]);
assert(dyn);
TSA<Token>::tree_iterator mdyn(dyn->I1.get());
if (dyn->I1.get())
{
for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
mdyn.extend(sphrase[i]);
}
// phrase not found in either
if (mdyn.size() != sphrase.size() &&
mfix.size() != sphrase.size())
return NULL; // not found
// cache lookup:
uint64_t phrasekey;
if (mfix.size() == sphrase.size())
phrasekey = (mfix.getPid()<<1);
else
phrasekey = (mdyn.getPid()<<1)+1;
size_t revision = dyn->revision();
{
boost::lock_guard<boost::mutex> guard(this->lock);
tpc_cache_t::iterator c = m_cache.find(phrasekey);
if (c != m_cache.end() && c->second->revision == revision)
return encache(c->second);
}
// not found or not up to date
sptr<pstats> sfix,sdyn;
if (mfix.size() == sphrase.size())
{
// do we need this lock here?
// Is it used here to control the total number of running threads???
boost::lock_guard<boost::mutex> guard(this->lock);
sfix = btfix.lookup(mfix);
}
sfix = btfix.lookup(mfix);
if (mdyn.size() == sphrase.size())
sdyn = dyn->lookup(mdyn);
if (poolCounts)
TargetPhraseCollectionWrapper*
ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
if ((poolCounts &&
pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
|| combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
{
if (!pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
return NULL;
}
else if (!combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
return NULL;
ret->NthElement(m_tableLimit);
ret->NthElement(m_tableLimit);
#if 0
sort(ret->begin(), ret->end(), CompareTargetPhrase());
cout << "SOURCE PHRASE: " << src << endl;
size_t i = 0;
for (TargetPhraseCollection::iterator r = ret->begin(); r != ret->end(); ++r)
{
cout << ++i << " " << **r << endl;
}
sort(ret->begin(), ret->end(), CompareTargetPhrase());
cout << "SOURCE PHRASE: " << src << endl;
size_t i = 0;
for (TargetPhraseCollection::iterator r = ret->begin(); r != ret->end(); ++r)
{
cout << ++i << " " << **r << endl;
FVector fv = (*r)->GetScoreBreakdown().CreateFVector();
typedef pair<Moses::FName,float> item_t;
BOOST_FOREACH(item_t f, fv)
cout << f.first << ":" << f.second << " ";
cout << endl;
}
#endif
return ret;
}
boost::lock_guard<boost::mutex> guard(this->lock);
m_cache[phrasekey] = ret;
return encache(ret);
}
// Intentionally a no-op: this phrase table keeps no per-sentence state
// that needs explicit cleanup here.
void
Mmsapt::
CleanUpAfterSentenceProcessing(const InputType& source)
{ }
ChartRuleLookupManager*
Mmsapt::
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &)
@ -484,13 +600,177 @@ namespace Moses
throw "CreateRuleLookupManager is currently not supported in Mmsapt!";
}
template<typename Token>
void
fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest)
ChartRuleLookupManager*
Mmsapt::
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &,
size_t UnclearWhatThisVariableIsSupposedToAccomplishBecauseNobodyBotheredToDocumentItInPhraseTableDotHButIllTakeThisAsAnOpportunityToComplyWithTheMosesConventionOfRidiculouslyLongVariableAndClassNames)
{
istringstream buf(line); string w;
while (buf>>w) dest.push_back(Token(V[w]));
throw "CreateRuleLookupManager is currently not supported in Mmsapt!";
}
// Currently a no-op: no per-input setup is performed.
void
Mmsapt::
InitializeForInput(InputType const& source)
{
  // assert(0);
}
// Lexicographic ordering on (tv_sec, tv_nsec): /a/ sorts before /b/
// iff it denotes an earlier point in time.
bool operator<(timespec const& a, timespec const& b)
{
  if (a.tv_sec == b.tv_sec)
    return a.tv_nsec < b.tv_nsec;
  return a.tv_sec < b.tv_sec;
}
// Complement of operator<: /a/ is not earlier than /b/.
bool operator>=(timespec const& a, timespec const& b)
{
  if (a.tv_sec == b.tv_sec)
    return a.tv_nsec >= b.tv_nsec;
  return a.tv_sec > b.tv_sec;
}
// Restore the min-heap property (keyed on tstamp, oldest entry at the
// root) along the path from position /k/ towards the root, after v[k]
// was inserted or its key decreased. v[x]->idx mirrors each entry's
// position in /v/ and is kept in sync with every swap.
void
bubble_up(vector<Mmsapt::TargetPhraseCollectionWrapper*>& v, size_t k)
{
  if (k >= v.size()) return;
  // Children of node i live at 2*i+1 and 2*i+2 (cf. bubble_down and
  // Mmsapt::encache), so the parent of node k is (k-1)/2.
  // Bug fix: this used k/2 as the parent index, which is wrong for even
  // k (e.g. the parent of node 2 is 0, but k/2 yields 1) and could
  // leave the history heap in an inconsistent state.
  while (k)
  {
    size_t const parent = (k - 1) / 2;
    if (!(v[k]->tstamp < v[parent]->tstamp)) break;
    std::swap(v[k],v[parent]);
    std::swap(v[k]->idx,v[parent]->idx);
    k = parent;
  }
}
// Move the entry at position /k/ of the timestamp min-heap /v/ down
// towards the leaves until the heap property is restored. Children of
// node i are stored at 2*i+1 and 2*i+2; v[x]->idx mirrors each entry's
// position in /v/ and is updated on every swap.
void
bubble_down(vector<Mmsapt::TargetPhraseCollectionWrapper*>& v, size_t k)
{
  // /j/ starts at the right child (2*k+2); the first test below
  // switches to the left child (j-1) when the right child does not
  // exist (j == v.size()) or the left one has the older timestamp.
  for (size_t j = 2*(k+1); j <= v.size(); j = 2*((k=j)+1))
  {
    if (j == v.size() || (v[j-1]->tstamp < v[j]->tstamp)) --j;
    if (v[j]->tstamp >= v[k]->tstamp) break;
    std::swap(v[k],v[j]);
    v[k]->idx = k;
    v[j]->idx = j;
  }
}
// Destroy a cached phrase collection once it is neither referenced by
// any client (refCount == 0) nor tracked in the LRU history heap
// (idx >= 0 would mean it still is).
// NOTE(review): callers appear to rely on this->lock being held
// (encache/Release lock before calling) -- confirm before adding new
// call sites.
void
Mmsapt::
decache(TargetPhraseCollectionWrapper* ptr) const
{
  if (ptr->refCount || ptr->idx >= 0) return;
  // the clock readings below feed only the disabled diagnostic block
  timespec t; clock_gettime(CLOCK_MONOTONIC,&t);
  timespec r; clock_getres(CLOCK_MONOTONIC,&r);
  // if (t.tv_nsec < v[0]->tstamp.tv_nsec)
#if 0
  float delta = t.tv_sec - ptr->tstamp.tv_sec;
  cerr << "deleting old cache entry after "
       << delta << " seconds."
       << " clock resolution is " << r.tv_sec << ":" << r.tv_nsec
       << " at " << __FILE__ << ":" << __LINE__ << endl;
#endif
  // drop the cache entry only if it still maps to /ptr/ -- a newer
  // wrapper may have been cached under the same phrase key since
  tpc_cache_t::iterator m = m_cache.find(ptr->key);
  if (m != m_cache.end())
    if (m->second == ptr)
      m_cache.erase(m);
  delete ptr;
  --m_tpc_ctr;
}
// Register /ptr/ with the cache bookkeeping: bump its reference count,
// refresh its timestamp, and (re-)position it in the LRU history heap
// /m_history/, evicting the oldest unreferenced entry when the heap is
// full. Returns /ptr/ for caller convenience (may be NULL).
Mmsapt::
TargetPhraseCollectionWrapper*
Mmsapt::
encache(TargetPhraseCollectionWrapper* ptr) const
{
  // Calling process must lock for thread safety!!
  if (!ptr) return NULL;
  ++ptr->refCount;
  ++m_tpc_ctr;
  clock_gettime(CLOCK_MONOTONIC, &ptr->tstamp);
  // update history
  if (m_history.capacity() > 1)
  {
    vector<TargetPhraseCollectionWrapper*>& v = m_history;
    if (ptr->idx >= 0) // ptr is already in history
    {
      assert(ptr == v[ptr->idx]);
      // ptr's timestamp just increased, so it may need to move towards
      // the leaves; re-examine both children (at 2*idx+1 and 2*idx+2)
      size_t k = 2 * (ptr->idx + 1);
      if (k < v.size()) bubble_up(v,k--);
      if (k < v.size()) bubble_up(v,k);
    }
    else if (v.size() < v.capacity())
    {
      // room left in the heap: append and sift the new entry upwards
      size_t k = ptr->idx = v.size();
      v.push_back(ptr);
      bubble_up(v,k);
    }
    else
    {
      // heap full: evict the oldest entry (the root) and take its place
      v[0]->idx = -1;
      decache(v[0]);
      v[0] = ptr;
      bubble_down(v,0);
    }
  }
  return ptr;
}
// Return true iff /phrase/ occurs as a (phrase) prefix in the static
// or the dynamic bitext; used by the decoder to prune hopeless phrase
// expansions early.
bool
Mmsapt::
PrefixExists(Moses::Phrase const& phrase) const
{
  if (phrase.GetSize() == 0) return false;

  // map the Moses phrase onto the internal vocabulary
  vector<id_type> ids;
  fillIdSeq(phrase,input_factor,*btfix.V1,ids);

  // first try the static (memory-mapped) bitext
  TSA<Token>::tree_iterator fixedIter(btfix.I1.get(),&ids[0],ids.size());
  if (fixedIter.size() == ids.size())
    return true;

  // take a stable snapshot of the dynamic bitext; /btdyn/ is replaced
  // wholesale whenever a sentence pair is added
  sptr<imBitext<Token> > snapshot;
  { // braces are needed for scoping mutex lock guard!
    boost::lock_guard<boost::mutex> guard(this->lock);
    snapshot = btdyn;
  }
  assert(snapshot);

  TSA<Token>::tree_iterator dynIter(snapshot->I1.get());
  if (snapshot->I1.get())
  {
    size_t i = 0;
    while (i < ids.size() && dynIter.size() == i)
    {
      dynIter.extend(ids[i]);
      ++i;
    }
  }
  return dynIter.size() == ids.size();
}
// Drop one client reference to /tpc/; once nobody references the
// collection and it is no longer in the LRU history, destroy it.
void
Mmsapt::
Release(TargetPhraseCollection const* tpc) const
{
  if (tpc == NULL) return;
  boost::lock_guard<boost::mutex> guard(this->lock);
  // Collections handed out by this table are always
  // TargetPhraseCollectionWrapper instances, so the cast is safe here.
  TargetPhraseCollectionWrapper* w
  = reinterpret_cast<TargetPhraseCollectionWrapper*>
    (const_cast<TargetPhraseCollection*>(tpc));
  if (--w->refCount == 0 && w->idx < 0)
    decache(w);
}
// Advertise that this phrase table implements PrefixExists(), so the
// translation option collection can prune phrase expansions whose
// prefix cannot match any table entry.
bool
Mmsapt::
ProvidesPrefixCheck() const
{
  return true;
}
}

View File

@ -3,7 +3,9 @@
// Design and code by Ulrich Germann.
#pragma once
#include <time.h>
#include <boost/thread.hpp>
#include <boost/scoped_ptr.hpp>
#include "moses/TypeDef.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
@ -52,11 +54,16 @@ namespace Moses
private:
mmbitext btfix;
sptr<imbitext> btdyn;
string bname;
string bname,extra_data;
string L1;
string L2;
float lbop_parameter;
size_t default_sample_size;
float m_lbop_parameter;
size_t m_default_sample_size;
size_t m_workers; // number of worker threads for sampling the bitexts
char m_pfwd_denom; // denominator for computation of fwd phrase score:
// 'r' - divide by raw count
// 's' - divide by sample count
// 'g' - devide by number of "good" (i.e. coherent) samples
// size_t num_features;
size_t input_factor;
size_t output_factor; // we can actually return entire Tokens!
@ -70,6 +77,33 @@ namespace Moses
bool poolCounts;
vector<FactorType> ofactor;
public:
// typedef boost::unordered_map<uint64_t, sptr<TargetPhraseCollection> > tpcoll_cache_t;
// TargetPhraseCollection augmented with the bookkeeping Mmsapt needs
// for caching and LRU eviction of phrase lookups.
class TargetPhraseCollectionWrapper
: public TargetPhraseCollection
{
public:
size_t const revision; // time stamp from dynamic bitext
uint64_t const key; // phrase key
uint32_t refCount; // reference count
timespec tstamp; // last use
int idx; // position in history heap (-1: not in history)
TargetPhraseCollectionWrapper(size_t r, uint64_t const k);
~TargetPhraseCollectionWrapper();
};
private:
TargetPhraseCollectionWrapper*
encache(TargetPhraseCollectionWrapper* const ptr) const;
void
decache(TargetPhraseCollectionWrapper* ptr) const;
typedef map<uint64_t, TargetPhraseCollectionWrapper*> tpc_cache_t;
mutable tpc_cache_t m_cache;
mutable vector<TargetPhraseCollectionWrapper*> m_history;
// phrase table feature weights for alignment:
vector<float> feature_weights;
@ -118,6 +152,10 @@ namespace Moses
TargetPhraseCollection* tpcoll
) const;
void
load_extra_data(string bname);
mutable size_t m_tpc_ctr;
public:
// Mmsapt(string const& description, string const& line);
Mmsapt(string const& line);
@ -130,6 +168,10 @@ namespace Moses
//! Create a sentence-specific manager for SCFG rule lookup.
ChartRuleLookupManager*
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &);
ChartRuleLookupManager*
CreateRuleLookupManager
(const ChartParser &, const ChartCellCollectionBase &, std::size_t);
#endif
void add(string const& s1, string const& s2, string const& a);
@ -139,6 +181,23 @@ namespace Moses
align(string const& src, string const& trg) const;
void setWeights(vector<float> const& w);
void
CleanUpAfterSentenceProcessing(const InputType& source);
void
InitializeForInput(InputType const& source);
void
Release(TargetPhraseCollection const* tpc) const;
bool
ProvidesPrefixCheck() const;
/// return true if prefix /phrase/ exists
bool
PrefixExists(Phrase const& phrase) const;
private:
};
} // end namespace

View File

@ -98,7 +98,7 @@ namespace Moses
typedef pair<uint32_t, uint32_t> span;
typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
typedef boost::unordered_map<uint64_t,jstats> jStatsTable;
typedef pstats::trg_map_t jStatsTable;
Mmsapt const& PT;
vector<id_type> s,t;

View File

@ -1,6 +1,7 @@
// $Id$
#include <list>
#include <vector>
#include "TranslationOptionCollectionConfusionNet.h"
#include "ConfusionNet.h"
#include "DecodeGraph.h"
@ -10,6 +11,7 @@
#include "FF/InputFeature.h"
#include "TranslationModel/PhraseDictionaryTreeAdaptor.h"
#include "util/exception.hh"
#include <boost/foreach.hpp>
using namespace std;
@ -17,11 +19,21 @@ namespace Moses
{
/** constructor; just initialize the base class */
TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet(
const ConfusionNet &input
, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
TranslationOptionCollectionConfusionNet::
TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
size_t maxNoTransOptPerCoverage,
float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage,
translationOptionThreshold)
{
// Prefix checkers are phrase dictionaries that provide a prefix check
// to indicate that a phrase table entry with a given prefix exists.
// If no entry with the given prefix exists, there is no point in
// expanding it further.
vector<PhraseDictionary*> prefixCheckers;
BOOST_FOREACH(PhraseDictionary* pd, PhraseDictionary::GetColl())
if (pd->ProvidesPrefixCheck()) prefixCheckers.push_back(pd);
const InputFeature &inputFeature = InputFeature::Instance();
UTIL_THROW_IF2(&inputFeature == NULL, "Input feature must be specified");
@ -91,6 +103,11 @@ TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet
Phrase subphrase(prevPhrase);
subphrase.AddWord(word);
bool OK = prefixCheckers.size() == 0;
for (size_t k = 0; !OK && k < prefixCheckers.size(); ++k)
OK = prefixCheckers[k]->PrefixExists(subphrase);
if (!OK) continue;
const ScorePair &scores = col[i].second;
ScorePair *inputScore = new ScorePair(*prevInputScore);
inputScore->PlusEquals(scores);
@ -105,6 +122,9 @@ TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet
} // for (iterPath = prevPaths.begin(); iterPath != prevPaths.end(); ++iterPath) {
}
}
// cerr << "HAVE " << m_inputPathQueue.size()
// << " input paths of max. length "
// << maxSizePhrase << "." << endl;
}
InputPathList &TranslationOptionCollectionConfusionNet::GetInputPathList(size_t startPos, size_t endPos)
@ -229,7 +249,9 @@ void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRangeLE
// go thru each intermediate trans opt just created
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
for (iterPartialTranslOpt = partTransOptList.begin();
iterPartialTranslOpt != partTransOptList.end();
++iterPartialTranslOpt) {
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
if (transStep) {

View File

@ -37,14 +37,17 @@ TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
const std::vector<size_t> &nextNodes = input.GetNextNodes(startPos);
WordsRange range(startPos, startPos);
const NonTerminalSet &labels = input.GetLabelSet(startPos, startPos);
const ConfusionNet::Column &col = input.GetColumn(startPos);
for (size_t i = 0; i < col.size(); ++i) {
const Word &word = col[i].first;
UTIL_THROW_IF2(word.IsEpsilon(), "Epsilon not supported");
size_t nextNode = nextNodes[i];
size_t endPos = startPos + nextNode - 1;
WordsRange range(startPos, endPos);
const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);
Phrase subphrase;
subphrase.AddWord(word);
@ -53,9 +56,7 @@ TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
InputPath *path = new InputPath(subphrase, labels, range, NULL, inputScore);
size_t nextNode = nextNodes[i];
path->SetNextNode(nextNode);
m_inputPathQueue.push_back(path);
}
}
@ -135,7 +136,7 @@ void TranslationOptionCollectionLattice::CreateTranslationOptions()
const WordsRange &range = path.GetWordsRange();
if (tpColl && tpColl->GetSize()) {
TargetPhraseCollection::const_iterator iter;
TargetPhraseCollection::const_iterator iter;
for (iter = tpColl->begin(); iter != tpColl->end(); ++iter) {
const TargetPhrase &tp = **iter;
TranslationOption *transOpt = new TranslationOption(range, tp);

View File

@ -98,19 +98,45 @@ StringPiece Word::GetString(FactorType factorType) const
class StrayFactorException : public util::Exception {};
void Word::CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const StringPiece &str
, bool isNonTerminal)
void
Word::
CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const StringPiece &str
, bool isNonTerminal
, bool strict)
{
FactorCollection &factorCollection = FactorCollection::Instance();
util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());
for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit, isNonTerminal);
}
UTIL_THROW_IF(fit, StrayFactorException, "You have configured " << factorOrder.size() << " factors but the word " << str << " contains factor delimiter " << StaticData::Instance().GetFactorDelimiter() << " too many times.");
vector<StringPiece> bits(MAX_NUM_FACTORS);
util::TokenIter<util::MultiCharacter>
fit(str, StaticData::Instance().GetFactorDelimiter());
size_t i = 0;
for (; i < MAX_NUM_FACTORS && fit; ++i,++fit)
bits[i] = *fit;
if (i == MAX_NUM_FACTORS)
UTIL_THROW_IF(fit, StrayFactorException,
"The hard limit for factors is " << MAX_NUM_FACTORS
<< ". The word " << str << " contains factor delimiter "
<< StaticData::Instance().GetFactorDelimiter()
<< " too many times.");
if (strict)
UTIL_THROW_IF(fit, StrayFactorException,
"You have configured " << factorOrder.size()
<< " factors but the word " << str
<< " contains factor delimiter "
<< StaticData::Instance().GetFactorDelimiter()
<< " too many times.");
UTIL_THROW_IF(i < factorOrder.size(),util::Exception,
"Too few factors in string '" << str << "'.");
for (size_t k = 0; k < factorOrder.size(); ++k)
{
UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
"Factor order out of bounds.");
m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
}
// assume term/non-term same for all factors
m_isNonTerminal = isNonTerminal;
}

View File

@ -151,7 +151,8 @@ public:
void CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const StringPiece &str
, bool isNonTerminal);
, bool isNonTerminal
, bool strict = true);
void CreateUnknownWord(const Word &sourceWord);

View File

@ -49,7 +49,12 @@ void WordLattice::Print(std::ostream& out) const
out<<"\n\n";
}
int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<FactorType>& factorOrder, const std::string& debug_line)
int
WordLattice::
InitializeFromPCNDataType
(const PCN::CN& cn,
const std::vector<FactorType>& factorOrder,
const std::string& debug_line)
{
const StaticData &staticData = StaticData::Instance();
const InputFeature &inputFeature = InputFeature::Instance();
@ -73,14 +78,20 @@ int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<
//check for correct number of link parameters
if (alt.m_denseFeatures.size() != numInputScores) {
TRACE_ERR("ERROR: need " << numInputScores << " link parameters, found " << alt.m_denseFeatures.size() << " while reading column " << i << " from " << debug_line << "\n");
TRACE_ERR("ERROR: need " << numInputScores
<< " link parameters, found "
<< alt.m_denseFeatures.size()
<< " while reading column " << i
<< " from " << debug_line << "\n");
return false;
}
//check each element for bounds
std::vector<float>::const_iterator probsIterator;
data[i][j].second = std::vector<float>(0);
for(probsIterator = alt.m_denseFeatures.begin(); probsIterator < alt.m_denseFeatures.end(); probsIterator++) {
for(probsIterator = alt.m_denseFeatures.begin();
probsIterator < alt.m_denseFeatures.end();
probsIterator++) {
IFVERBOSE(1) {
if (*probsIterator < 0.0f) {
TRACE_ERR("WARN: neg probability: " << *probsIterator << "\n");
@ -102,7 +113,9 @@ int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<
float value = (alt.m_word=="" || alt.m_word==EPSILON) ? 0.0f : -1.0f;
data[i][j].second.denseScores.push_back(value);
}
String2Word(alt.m_word, data[i][j]. first, factorOrder);
Word& w = data[i][j].first;
w.CreateFromString(Input,factorOrder,StringPiece(alt.m_word),false);
// String2Word(alt.m_word, data[i][j]. first, factorOrder);
next_nodes[i][j] = alt.m_next;
if(next_nodes[i][j] > maxSizePhrase) {

View File

@ -119,14 +119,21 @@ sub exec_moses {
my ($decoder, $conf, $input, $results) = @_;
my $start_time = time;
my ($o, $ec, $sig);
my $cmd;
if ($NBEST > 0){
print STDERR "Nbest output file is $results/run.nbest\n";
print STDERR "Nbest size is $NBEST\n";
($o, $ec, $sig) = run_command("$decoder -f $conf -i $input -n-best-list $results/run.nbest $NBEST 1> $results/run.stdout 2> $results/run.stderr");
$cmd = "$decoder -f $conf -i $input -n-best-list $results/run.nbest $NBEST 1> $results/run.stdout 2> $results/run.stderr";
}
else{
($o, $ec, $sig) = run_command("$decoder -f $conf -i $input 1> $results/run.stdout 2> $results/run.stderr");
$cmd = "$decoder -f $conf -i $input 1> $results/run.stdout 2> $results/run.stderr";
}
open CMD, ">$results/cmd_line";
print CMD "$cmd\n";
close CMD;
($o, $ec, $sig) = run_command($cmd);
my $elapsed = time - $start_time;
return ($o, $elapsed, $ec, $sig);
}

View File

@ -21,11 +21,14 @@ while (@ARGV) {
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
/^-b$/ && ($|++, next); # no output buffering
}
if ($HELP) {
print "Usage ./split-sentences.perl (-l [en|de|...]) < textfile > splitfile\n";
exit;
print "Usage ./split-sentences.perl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n";
print "-q: quiet mode\n";
print "-b: no output buffering (for use in bidirectional pipes)\n";
exit;
}
if (!$QUIET) {
print STDERR "Sentence Splitter v3\n";

View File

@ -38,9 +38,17 @@ while(<STDIN>) {
print " " if $i;
print $$MARKUP[$i];
$$WORD[$i] =~ /^([^\|]+)(.*)/;
my $word = $1;
my $otherfactors = $2;
my ($word,$otherfactors);
if ($$WORD[$i] =~ /^([^\|]+)(.*)/)
{
$word = $1;
$otherfactors = $2;
}
else
{
$word = $$WORD[$i];
$otherfactors = "";
}
if ($sentence_start && defined($BEST{lc($word)})) {
print $BEST{lc($word)}; # truecase sentence start

291
scripts/server/sim-pe.py Executable file
View File

@ -0,0 +1,291 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Written by Ulrich Germann on the basis of contrib/server/client.py.
# This script simulates post-editing of MT output and incrementally
# updates the dynamic phrase tables in the moses server.
import xmlrpclib,datetime,argparse,sys,os,time
from subprocess import *
# We must perform some custom argument processing, as moses parameter
# specifications do not comply with the standards used in standard
# argument parsing packages; an isolated double dash separates script
# arguments from moses arguments
MosesProcess = None
NBestFile = None
def shutdown():
    """Terminate the moses server child process, if this script spawned one."""
    if MosesProcess:
        if args.debug:
            print >>sys.stderr,"shutting down moses server"
            pass
        MosesProcess.terminate()
        pass
    return
def find_free_port(p):
    """
    Find a free port, starting at /p/ and scanning upward (at most 20
    candidates, checked against netstat's list of listening TCP ports).
    Return the free port, or False if none found.
    """
    ret = p
    while ret - p < 20:
        devnull = open(os.devnull,"w")
        n = Popen(["netstat","-tnp"],stdout=PIPE,stderr=devnull)
        if n.communicate()[0].find(":%d "%ret) < 0:
            # nothing is listening on port /ret/
            # bug fix: this used to "return p", discarding the scan and
            # reporting the starting port even when it was busy
            return ret
        ret += 1
        pass
    return False
def launch_moses(mo_args):
    """
    Spawn a moses server process. Return URL of said process.
    Exits the script on failure.
    """
    global MosesProcess
    # ensure a --server-port argument is present, defaulting to 7777
    try:
        port_index = mo_args.index("--server-port") + 1
    except:
        mo_args.extend(["--server-port","7777"])
        port_index = len(mo_args) - 1
        pass
    # the requested port may be taken; scan for a free one
    port = find_free_port(int(mo_args[port_index]))
    if not port:
        print >>sys.stderr, "FATAL ERROR: No available port for moses server!"
        sys.exit(1)
        pass
    # NOTE(review): /port/ is not written back into mo_args[port_index],
    # so if the original port was busy the server is still started on the
    # busy port -- confirm whether this is intended.
    if args.debug:
        MosesProcess = Popen([args.servercmd] + mo_args)
    else:
        # suppress server output unless debugging
        devnull = open(os.devnull,"w")
        MosesProcess = Popen([args.servercmd] + mo_args,
                             stderr=devnull, stdout=devnull)
    if MosesProcess.poll():
        print >>sys.stderr, "FATAL ERROR: Could not launch moses server!"
        sys.exit(1)
        pass
    if args.debug:
        print >>sys.stderr,"MOSES port is %d."%port
        print >>sys.stderr,"Moses poll status is", MosesProcess.poll()
        pass
    return "http://localhost:%d"%port
def split_args(all_args):
    """
    Split argument list all_args into script-specific
    and moses-specific arguments (separated by an isolated '--'),
    translating some moses options into their script equivalents.
    Returns the pair (my_args, mo_args).
    """
    my_args = []
    mo_args = []
    try:
        i = all_args.index("--")
        my_args = all_args[:i]
        mo_args = all_args[i+1:]
    except:
        # no separator: everything belongs to moses
        my_args = []
        mo_args = all_args[:]
        pass

    # IMPORTANT: the code below must be coordinated with
    # - the evolution of moses command line arguments
    # - mert-moses.pl
    i = 0
    while i < len(mo_args):
        if mo_args[i] == "-i" or mo_args[i] == "-input-file":
            # bug fix: this used to read "m_args[i+1]" (a NameError) and
            # mapped to "--src", an option interpret_args() does not
            # define; the script's source-file option is "--input".
            my_args.extend(["--input",mo_args[i+1]])
            mo_args[i:i+2] = []
        elif mo_args[i] == "-inputtype":
            if mo_args[i+1] != "0":
                # not yet supported! Therefore:
                errmsg = "FATAL ERROR: "
                errmsg += "%s only supports plain text input at this point."
                print >>sys.stderr,errmsg%sys.argv[0]
                sys.exit(1)
                pass
            # NOTE(review): interpret_args() defines no "--input-type"
            # option, so this mapping still fails downstream; kept as-is
            # for visibility.
            my_args.extend(["--input-type",mo_args[i+1]])
            mo_args[i:i+2] = []
        elif mo_args[i] == "-lattice-samples":
            my_args.extend(["--lattice-sample",mo_args[i+2]])
            my_args.extend(["--lattice-sample-file",mo_args[i+1]])
            mo_args[i:i+3] = []
            # not yet supported! Therefore:
            errmsg = "FATAL ERROR: "
            errmsg += "%s does not yet support lattice sampling."
            print >>sys.stderr,errmsg%sys.argv[0]
            sys.exit(1)
        elif mo_args[i] == "-n-best-list":
            my_args.extend(["--nbest",mo_args[i+2]])
            my_args.extend(["--nbest-file",mo_args[i+1]])
            mo_args[i:i+3] = []
        elif mo_args[i] == "-n-best-distinct":
            my_args.extend(["-U"])
            mo_args[i:i+1] = []
        else:
            i += 1
            pass
        pass
    return my_args,mo_args
def interpret_args(my_args):
    """
    Parse script-specific argument list.
    """
    ap = argparse.ArgumentParser()

    # interfacing with moses
    # ap.add_argument("-m","--moses-cmd",default="moses",dest="mosescmd",
    #                 help="path to standard moses command")
    ap.add_argument("-s","--server-cmd",default="mosesserver",
                    dest="servercmd", help="path to moses server command")
    ap.add_argument("-u","--url",help="URL of external moses server.")

    # input / output
    ap.add_argument("-i","--input",help="source file",default="-")
    ap.add_argument("-r","--ref",help="reference translation",default=None)
    ap.add_argument("-a","--aln",help="alignment",default=None)
    ap.add_argument("-o","--output",default="-",help="output file")
    ap.add_argument("-d","--debug",action="store_true",help="debug mode")

    # moses reporting options
    # NOTE(review): type=bool below does not parse "True"/"False" -- any
    # non-empty argument is truthy; kept as-is for compatibility.
    ap.add_argument("-A","--with-alignment", dest="A",
                    help="include alignment in output", action="store_true")
    ap.add_argument("-G","--with-graph",type=bool, default=False, dest="G",
                    help="include search graph info in output")
    ap.add_argument("-T","--with-transopt",type=bool, default=False, dest="T",
                    help="include translation options info in output")
    ap.add_argument("-F","--report-all-factors", action="store_true", dest="F",
                    help="report all factors")
    ap.add_argument("-n","--nbest",type=int, dest="nbest", default=0,
                    help="size of nbest list")
    ap.add_argument("-N","--nbest-file", dest="nbestFile", default=0,
                    help="output file for nbest list")
    ap.add_argument("-U","--nbest-distinct",type=bool, dest="U", default=False,
                    help="report all factors")
    return ap.parse_args(my_args)
def translate(proxy,args,s):
    """
    Ask the moses server behind /proxy/ to translate sentence /s/, with
    reporting options taken from /args/. Returns the server's response
    dict, or None if the RPC call failed.
    """
    param = {'text': s.strip()}
    for enabled, key in ((args.A, 'align'),
                         (args.T, 'topt'),
                         (args.F, 'report-all-factors'),
                         (args.U, 'nbest-distinct')):
        if enabled:
            param[key] = True
    if args.nbest:
        param['nbest'] = int(args.nbest)
        param['add-score-breakdown'] = True
    try:
        return proxy.translate(param)
    except:
        # best-effort: treat any RPC failure as "no result"
        return None
def read_data(fname):
    """
    Read and return data (source, target or alignment) from file
    /fname/ as a list of stripped lines. Files ending in .gz are
    transparently decompressed via zcat.
    """
    if fname.endswith(".gz"):
        lines = Popen(["zcat",fname],stdout=PIPE)\
                .communicate()[0]\
                .strip().split('\n')
    else:
        lines = [line.strip() for line in open(fname)]
    return lines
def repack_result(id,result):
    """
    Print the server response /result/ for sentence number /id/ to
    stdout (and its n-best entries to NBestFile, when requested).
    """
    global args
    if args.nbest:
        if not NBestFile:
            # nbest output requested but no destination file was opened
            shutdown()
            assert NBestFile
            sys.exit(1)
        for h in result['nbest']:
            fields = (id,h['hyp'],h['fvals'],h['totalScore'])
            print >>NBestFile,"%d ||| %s ||| %s ||| %f"%fields
            pass
        pass
    if 'align' in result:
        # interleave target text with source span markers |src-start src-end|
        t = result['text'].split()
        span = ''
        i = 0
        k = 0
        for a in result['align']:
            k = a['tgt-start']
            if k: print " ".join(t[i:k]),span,
            i = k
            span = "|%d %d|"%(a['src-start'],a['src-end'])
            pass
        print " ".join(t[k:]),span
        pass
    else:
        print result['text']
        pass
    return
if __name__ == "__main__":
my_args, mo_args = split_args(sys.argv[1:])
global args
args = interpret_args(my_args)
if "-show-weights" in mo_args:
devnull = open(os.devnull,"w")
mo = Popen([args.servercmd] + mo_args,stdout=PIPE,stderr=devnull)
print mo.communicate()[0].strip()
sys.exit(0)
pass
if args.nbest:
if args.nbestFile:
NBestFile = open(args.nbestFile,"w")
else:
NBestFile = sys.stdout
pass
pass
if "url" not in args or not args.url:
url = launch_moses(mo_args)
else:
url = args.url
pass
if url[:4] != "http": url = "http://%s"%url
if url[-5:] != "/RPC2": url += "/RPC2"
proxy = xmlrpclib.ServerProxy(url)
ret = None
aln = None
if args.ref: ref = read_data(args.ref)
if args.aln: aln = read_data(args.aln)
if (args.input == "-"):
line = sys.stdin.readline()
id = 0
while line:
result = translate(proxy,args,line)
repack_result(id,result)
line = sys.stdin.readline()
id += 1
pass
pass
else:
src = read_data(args.src)
for i in xrange(len(src)):
if ref and aln:
result = proxy.updater({'source' : src[i],
'target' : ref[i],
'alignment' : aln[i]})
repack_result(i,result)
pass
pass
pass
pass
shutdown()

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -W
#!/usr/bin/perl -W
# script for preprocessing language data prior to tokenization
# Start by Ulrich Germann, after noticing systematic preprocessing errors
# in some of the English Europarl data.
@ -12,21 +12,21 @@ binmode(STDOUT, ":utf8");
sub usage
{
print "Script for preprocessing of raw language data prior to tokenization\n";
print "Usage: $0 -l <language tag>\n";
print "Usage: $0 -l <language tag> [-b]\n";
print " -b: no buffering\n";
}
my %args;
getopt('l=s h',\%args);
getopt('l=s h b',\%args);
usage() && exit(0) if $args{'h'};
$|++ if $args{'b'};
if ($args{'l'} eq "en")
{
while (<>)
while (<>)
{
s/([[:alpha:]]\') s\b/$1s/g;
print;
s/([[:alpha:]]\') s\b/$1s/g;
print;
}
}
elsif ($args{'l'} eq "fr")
{
@ -38,6 +38,5 @@ elsif ($args{'l'} eq "fr")
}
else
{
print while <>;
}