Major overhaul of Mmsapt. Reorganization of old and addition of new features in phrase tables. Many critical bug fixes.

This commit is contained in:
Ulrich Germann 2014-07-09 02:41:28 +01:00
parent 28d64e2339
commit 4d41211c2c
42 changed files with 2370 additions and 1308 deletions

1
.gitignore vendored
View File

@ -79,3 +79,4 @@ nbproject/
mingw/MosesGUI/MosesGUI.e4p
mingw/MosesGUI/_eric4project/
contrib/m4m/merge-sorted

View File

@ -152,13 +152,15 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
moses/TranslationModel/UG//spe-check-coverage2
moses/TranslationModel/UG//ptable-lookup
moses/TranslationModel/UG//sim-pe
moses/TranslationModel/UG//spe-check-coverage
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
moses/TranslationModel/UG/mm//symal2mam
moses/TranslationModel/UG/mm//mam2symal
moses/TranslationModel/UG/mm//mam_verify
moses/TranslationModel/UG/mm//custom-pt
moses/TranslationModel/UG/mm//mmlex-build
moses/TranslationModel/UG/mm//mmlex-lookup
moses/TranslationModel/UG/mm//mtt-count-words

View File

@ -22,7 +22,7 @@ int main(int argc, char **argv)
{
int tableLimit = 20;
std::string ttable = "";
bool useAlignments = false;
// bool useAlignments = false;
for(int i = 1; i < argc; i++) {
if(!strcmp(argv[i], "-tlimit")) {

View File

@ -4,6 +4,7 @@
#include <algorithm>
#include "moses/Util.h"
#include "moses/ChartManager.h"
#include "moses/Hypothesis.h"
#include "moses/Manager.h"
@ -59,7 +60,7 @@ public:
if(add2ORLM_) {
//updateORLM();
}
cerr << "Done inserting\n";
XVERBOSE(1,"Done inserting\n");
//PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
map<string, xmlrpc_c::value> retData;
//*retvalP = xmlrpc_c::value_struct(retData);
@ -120,17 +121,17 @@ public:
if(si == params.end())
throw xmlrpc_c::fault("Missing source sentence", xmlrpc_c::fault::CODE_PARSE);
source_ = xmlrpc_c::value_string(si->second);
cerr << "source = " << source_ << endl;
XVERBOSE(1,"source = " << source_ << endl);
si = params.find("target");
if(si == params.end())
throw xmlrpc_c::fault("Missing target sentence", xmlrpc_c::fault::CODE_PARSE);
target_ = xmlrpc_c::value_string(si->second);
cerr << "target = " << target_ << endl;
XVERBOSE(1,"target = " << target_ << endl);
si = params.find("alignment");
if(si == params.end())
throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE);
alignment_ = xmlrpc_c::value_string(si->second);
cerr << "alignment = " << alignment_ << endl;
XVERBOSE(1,"alignment = " << alignment_ << endl);
si = params.find("bounded");
bounded_ = (si != params.end());
si = params.find("updateORLM");
@ -224,7 +225,7 @@ public:
}
const string source((xmlrpc_c::value_string(si->second)));
cerr << "Input: " << source << endl;
XVERBOSE(1,"Input: " << source << endl);
si = params.find("align");
bool addAlignInfo = (si != params.end());
si = params.find("word-align");
@ -287,13 +288,13 @@ public:
}
} else {
Sentence sentence;
const vector<FactorType> &inputFactorOrder =
staticData.GetInputFactorOrder();
const vector<FactorType> &
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
sentence.Read(in,inputFactorOrder);
size_t lineNumber = 0; // TODO: Include sentence request number here?
Manager manager(lineNumber, sentence, staticData.GetSearchAlgorithm());
manager.ProcessSentence();
manager.ProcessSentence();
const Hypothesis* hypo = manager.GetBestHypothesis();
vector<xmlrpc_c::value> alignInfo;
@ -331,7 +332,7 @@ public:
pair<string, xmlrpc_c::value>
text("text", xmlrpc_c::value_string(out.str()));
retData.insert(text);
cerr << "Output: " << out.str() << endl;
XVERBOSE(1,"Output: " << out.str() << endl);
*retvalP = xmlrpc_c::value_struct(retData);
}
@ -574,7 +575,7 @@ int main(int argc, char** argv)
{
//Extract port and log, send other args to moses
char** mosesargv = new char*[argc+2];
char** mosesargv = new char*[argc+2]; // why "+2" [UG]
int mosesargc = 0;
int port = 8080;
const char* logfile = "/dev/null";
@ -634,11 +635,11 @@ int main(int argc, char** argv)
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
xmlrpc_c::serverAbyss myAbyssServer(
myRegistry,
port, // TCP port on which to listen
logfile
);
xmlrpc_c::serverAbyss myAbyssServer(
myRegistry,
port, // TCP port on which to listen
logfile
);
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
xmlrpc_c::serverAbyss myAbyssServer(
xmlrpc_c::serverAbyss::constrOpt()
@ -648,12 +649,10 @@ int main(int argc, char** argv)
.allowOrigin("*")
);
*/
cerr << "Listening on port " << port << endl;
XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {
while(1) {
myAbyssServer.runOnce();
}
while(1) myAbyssServer.runOnce();
} else {
myAbyssServer.run();
}

View File

@ -3,4 +3,11 @@ alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z
exe moses : Main.cpp deps ;
exe lmbrgrid : LatticeMBRGrid.cpp deps ;
alias programs : moses lmbrgrid ;
exe simulate-pe :
simulate-pe.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_program_options
deps
;
alias programs : moses lmbrgrid simulate-pe ;

View File

@ -161,13 +161,17 @@ BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
}
if (m_translations.size() > 1) {
UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
"Non-monotonic future score");
UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
"Non-monotonic future score: "
<< m_translations.Get(0)->GetFutureScore() << " vs. "
<< m_translations.Get(1)->GetFutureScore());
}
if (m_hypotheses.size() > 1) {
UTIL_THROW_IF2(m_hypotheses[0]->GetTotalScore() < m_hypotheses[1]->GetTotalScore(),
"Non-monotonic total score");
"Non-monotonic total score"
<< m_hypotheses[0]->GetTotalScore() << " vs. "
<< m_hypotheses[1]->GetTotalScore());
}
HypothesisScoreOrdererWithDistortion orderer (&transOptRange);
@ -442,7 +446,9 @@ BitmapContainer::ProcessBestHypothesis()
if (!Empty()) {
HypothesisQueueItem *check = Dequeue(true);
UTIL_THROW_IF2(item->GetHypothesis()->GetTotalScore() < check->GetHypothesis()->GetTotalScore(),
"Non-monotonic total score");
"Non-monotonic total score: "
<< item->GetHypothesis()->GetTotalScore() << " vs. "
<< check->GetHypothesis()->GetTotalScore());
}
// Logging for the criminally insane

View File

@ -105,7 +105,9 @@ void Manager::ProcessSentence()
// some reporting on how long this took
IFVERBOSE(1) {
GetSentenceStats().StopTimeCollectOpts();
TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took " << GetSentenceStats().GetTimeCollectOpts() << " seconds" << endl);
TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took "
<< GetSentenceStats().GetTimeCollectOpts() << " seconds at "
<< __FILE__ << ":" << __LINE__ << endl);
}
// search for best translation with the specified algorithm

View File

@ -20,6 +20,39 @@ $(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
exe sim-pe :
sim-pe.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
exe spe-check-coverage :
spe-check-coverage.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
exe spe-check-coverage2 :
spe-check-coverage2.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
install $(PREFIX)/bin : try-align ;
fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ;
fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ;

View File

@ -0,0 +1,50 @@
//-*- c++ -*-
#include "ug_splice_arglist.h"
#include "moses/Util.h"
#include "util/exception.hh"
#include <boost/foreach.hpp>
namespace Moses {
void
filter_arguments(int const argc_in, char const* const* const argv_in,
int & argc_moses, char*** argv_moses,
int & argc_other, char*** argv_other,
vector<pair<string,int> > const& filter)
{
*argv_moses = new char*[argc_in];
*argv_other = new char*[argc_in];
(*argv_moses)[0] = new char[strlen(argv_in[0])+1];
strcpy((*argv_moses)[0], argv_in[0]);
argc_moses = 1;
argc_other = 0;
typedef pair<string,int> option;
int i = 1;
while (i < argc_in)
{
BOOST_FOREACH(option const& o, filter)
{
if (o.first == argv_in[i])
{
(*argv_other)[argc_other] = new char[strlen(argv_in[i])+1];
strcpy((*argv_other)[argc_other++],argv_in[i]);
for (int k = 0; k < o.second; ++k)
{
UTIL_THROW_IF2(++i >= argc_in || argv_in[i][0] == '-',
"[" << HERE << "] Missing argument for "
<< "parameter " << o.first << "!");
(*argv_other)[argc_other] = new char[strlen(argv_in[i])+1];
strcpy((*argv_other)[argc_other++],argv_in[i]);
}
if (++i >= argc_in) break;
}
}
if (i >= argc_in) break;
(*argv_moses)[argc_moses] = new char[strlen(argv_in[i])+1];
strcpy((*argv_moses)[argc_moses++], argv_in[i++]);
}
}
} // namespace Moses

View File

@ -0,0 +1,18 @@
//-*- c++ -*-
#pragma once
#include <vector>
#include <string>
namespace Moses {
using namespace std;
// Function to splice the argument list (e.g. before handing it over to
// the Moses LoadParam() function). /filter/ is a vector of argument names,
// each paired with the number of arguments that follow it.
//
// Arguments named in /filter/ (and the arguments belonging to them) are
// copied to /argv_other/; all remaining arguments, including argv_in[0],
// are copied to /argv_moses/. /argc_moses/ and /argc_other/ receive the
// respective counts. The output vectors and the strings in them are
// allocated with new[]; the function does not free them.
// An error is raised (UTIL_THROW_IF2) if an argument required by a
// filtered option is missing or starts with '-'.
void
filter_arguments(int const argc_in, char const* const* const argv_in,
int & argc_moses, char*** argv_moses,
int & argc_other, char*** argv_other,
vector<pair<string,int> > const& filter);
} // namespace Moses

View File

@ -72,15 +72,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe custom-pt :
custom-pt.cc
$(TOP)/moses//moses
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)/util//kenutil
;
# exe custom-pt :
# custom-pt.cc
# $(TOP)/moses//moses
# $(TOP)//boost_iostreams
# $(TOP)//boost_program_options
# $(TOP)/moses/TranslationModel/UG/mm//mm
# $(TOP)/moses/TranslationModel/UG/generic//generic
# $(TOP)/util//kenutil
# ;
exe calc-coverage :
@ -98,7 +98,6 @@ mtt-dump
mtt-count-words
symal2mam
mam2symal
custom-pt
mmlex-build
mmlex-lookup
mam_verify

View File

@ -1,6 +1,6 @@
// build a phrase table for the given input
// #include "ug_lexical_phrase_scorer2.h"
#if 0
#include <stdint.h>
#include <string>
#include <vector>
@ -25,7 +25,7 @@
#include "ug_bitext.h"
#include "../mmsapt_phrase_scorers.h"
#include "ug_lexical_phrase_scorer2.h"
#include "../sapt_phrase_scorers.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
@ -109,6 +109,7 @@ nbest_phrasepairs(uint64_t const pid1,
int main(int argc, char* argv[])
{
// assert(argc == 4);
#if 0
#if 0
string base = argv[1];
string L1 = argv[2];
@ -182,7 +183,7 @@ int main(int argc, char* argv[])
}
}
}
#endif
exit(0);
}
#endif

View File

@ -158,99 +158,25 @@ namespace Moses
jstats::
invalidate()
{
my_rcnt = 0;
if (my_wcnt > 0)
my_wcnt *= -1;
}
void
jstats::
validate()
{
if (my_wcnt < 0)
my_wcnt *= -1;
}
bool
jstats::
valid()
{
return my_rcnt != 0;
return my_wcnt >= 0;
}
bool
PhrasePair::
operator<=(PhrasePair const& other) const
{
return this->score <= other.score;
}
bool
PhrasePair::
operator>=(PhrasePair const& other) const
{
return this->score >= other.score;
}
bool
PhrasePair::
operator<(PhrasePair const& other) const
{
return this->score < other.score;
}
bool
PhrasePair::
operator>(PhrasePair const& other) const
{
return this->score > other.score;
}
PhrasePair::
PhrasePair() {}
PhrasePair::
PhrasePair(PhrasePair const& o)
: p1(o.p1),
p2(o.p2),
raw1(o.raw1),
raw2(o.raw2),
sample1(o.sample1),
sample2(o.sample2),
good1(o.good1),
good2(o.good2),
joint(o.joint),
fvals(o.fvals),
aln(o.aln),
score(o.score)
{
for (size_t i = 0; i <= po_other; ++i)
{
dfwd[i] = o.dfwd[i];
dbwd[i] = o.dbwd[i];
}
}
void
PhrasePair::
init(uint64_t const pid1, pstats const& ps, size_t const numfeats)
{
p1 = pid1;
p2 = 0;
raw1 = ps.raw_cnt;
sample1 = ps.sample_cnt;
sample2 = 0;
good1 = ps.good;
good2 = 0;
raw2 = 0;
fvals.resize(numfeats);
}
void
PhrasePair::
init(uint64_t const pid1,
pstats const& ps1,
pstats const& ps2,
size_t const numfeats)
{
p1 = pid1;
raw1 = ps1.raw_cnt + ps2.raw_cnt;
sample1 = ps1.sample_cnt + ps2.sample_cnt;
sample2 = 0;
good1 = ps1.good + ps2.good;
good2 = 0;
fvals.resize(numfeats);
}
float
lbop(size_t const tries, size_t const succ, float const confidence)
@ -261,85 +187,6 @@ namespace Moses
find_lower_bound_on_p(tries, succ, confidence)));
}
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, jstats const& js)
{
p2 = pid2;
raw2 = js.cnt2();
joint = js.rcnt();
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
float total_fwd = 0, total_bwd = 0;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
total_fwd += js.dcnt_fwd(po)+1;
total_bwd += js.dcnt_bwd(po)+1;
}
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
}
return *this;
}
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, jstats const& js1, jstats const& js2)
{
p2 = pid2;
raw2 = js1.cnt2() + js2.cnt2();
joint = js1.rcnt() + js2.rcnt();
assert(js1.aln().size() || js2.aln().size());
if (js1.aln().size())
aln = js1.aln()[0].second;
else if (js2.aln().size())
aln = js2.aln()[0].second;
for (int i = po_first; i < po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
}
return *this;
}
PhrasePair const&
PhrasePair::
update(uint64_t const pid2,
size_t const raw2extra,
jstats const& js)
{
p2 = pid2;
raw2 = js.cnt2() + raw2extra;
joint = js.rcnt();
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
}
return *this;
}
float
PhrasePair::
eval(vector<float> const& w)
{
assert(w.size() == this->fvals.size());
this->score = 0;
for (size_t i = 0; i < w.size(); ++i)
this->score += w[i] * this->fvals[i];
return this->score;
}
template<>
sptr<imBitext<L2R_Token<SimpleWordId> > >
imBitext<L2R_Token<SimpleWordId> >::
@ -371,7 +218,8 @@ namespace Moses
uint32_t row,col; char c;
while (ibuf >> row >> c >> col)
{
assert(c == '-');
UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
<< "Error in alignment information:\n" << a);
binwrite(obuf,row);
binwrite(obuf,col);
}
@ -639,7 +487,6 @@ namespace Moses
cout << string(90,'-') << endl;
}
PhraseOrientation
find_po_fwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
@ -654,13 +501,13 @@ namespace Moses
ushort ns1,ne1,ne2;
if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
{
return po_other;
}
return po_other;
if (ns1 >= e1)
{
for (ushort j = e1; j < ns1; ++j)
if (a1[j].size()) return po_jfwd;
if (a1[j].size())
return po_jfwd;
return po_mono;
}
else

View File

@ -56,6 +56,7 @@ namespace Moses {
class Mmsapt;
namespace bitext
{
template<typename TKN> class Bitext;
using namespace ugdiss;
template<typename TKN> class Bitext;
@ -120,6 +121,7 @@ namespace Moses {
void add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient);
void invalidate();
void validate();
bool valid();
uint32_t dcnt_fwd(PhraseOrientation const idx) const;
uint32_t dcnt_bwd(PhraseOrientation const idx) const;
@ -157,43 +159,6 @@ namespace Moses {
uint32_t fwd_o, uint32_t bwd_o);
};
class
PhrasePair
{
public:
uint64_t p1, p2;
uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
vector<float> fvals;
float dfwd[po_other+1];
float dbwd[po_other+1];
vector<uchar> aln;
// float avlex12,avlex21; // average lexical probs (Moses std)
// float znlex1,znlex2; // zens-ney lexical smoothing
// float colex1,colex2; // based on raw lexical occurrences
float score;
PhrasePair();
PhrasePair(PhrasePair const& o);
bool operator<(PhrasePair const& other) const;
bool operator>(PhrasePair const& other) const;
bool operator<=(PhrasePair const& other) const;
bool operator>=(PhrasePair const& other) const;
void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
size_t const numfeats);
PhrasePair const&
update(uint64_t const pid2, jstats const& js);
PhrasePair const&
update(uint64_t const pid2, jstats const& js1, jstats const& js2);
PhrasePair const&
update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
float eval(vector<float> const& w);
};
template<typename TKN>
class Bitext

View File

@ -16,6 +16,9 @@
#include "tpt_tokenindex.h"
#include "ug_ttrack_base.h"
#include "tpt_tokenindex.h"
#include "util/exception.hh"
#include "moses/Util.h"
// #include "ug_vocab.h"
// define the corpus buffer size (in sentences) and the
@ -49,6 +52,8 @@ namespace ugdiss
typename boost::shared_ptr<imTtrack<Token> >
append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
void m_check_token_count(); // debugging function
public:
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d);
@ -69,6 +74,22 @@ namespace ugdiss
};
// Debugging aid: recounts the tokens of all stored sentences and raises an
// error (UTIL_THROW_IF2) if the sum disagrees with the cached numToks.
template<typename Token>
void
imTtrack<Token>::
m_check_token_count()
{
  typedef typename vector<vector<Token> >::const_iterator sent_iter;
  size_t counted = 0;
  for (sent_iter s = myData->begin(); s != myData->end(); ++s)
    counted += s->size();
  UTIL_THROW_IF2(counted != this->numToks, "[" << HERE << "]"
                 << " Wrong token count after appending sentence!"
                 << " Counted " << counted << " but expected "
                 << this->numToks << " in a total of " << myData->size()
                 << " sentences.");
}
template<typename Token>
Token const*
imTtrack<Token>::
@ -111,9 +132,9 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL)
: numToks(0)
{
myData.reset(new vector<vector<Token> >());
numToks = 0;
string line,w;
size_t linectr=0;
boost::unordered_map<string,id_type> H;
@ -135,6 +156,7 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(size_t reserve)
: numToks(0)
{
myData.reset(new vector<vector<Token> >());
if (reserve) myData->reserve(reserve);
@ -143,9 +165,9 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
: numToks(0)
{
myData = d;
numToks = 0;
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
@ -171,6 +193,9 @@ namespace ugdiss
shared_ptr<imTtrack<TOKEN> >
append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
{
#if 1
if (crp) crp->m_check_token_count();
#endif
shared_ptr<imTtrack<TOKEN> > ret;
if (crp == NULL)
{
@ -185,6 +210,11 @@ namespace ugdiss
}
else ret = crp;
ret->myData->push_back(snt);
ret->numToks += snt.size();
#if 1
ret->m_check_token_count();
#endif
return ret;
}

View File

@ -27,7 +27,6 @@ namespace ugdiss
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
table_t COOC;
void open(string const& fname);
template<typename someint>
void
score(TKN const* snt1, size_t const s1, size_t const e1,
@ -104,7 +103,19 @@ namespace ugdiss
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
<< ": alpha parameter must be >= 0");
return float(COOC[s][t]+alpha)/(COOC.m1(s)+alpha);
float ret = COOC[s][t]+alpha;
ret = (ret?ret:1.)/(COOC.m1(s)+alpha);
UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
<< ": result not > 0 and <= 1. alpha = " << alpha << "; "
<< COOC[s][t] << "/" << COOC.m1(s));
#if 0
cerr << "[" << s << "," << t << "] "
<< COOC.m1(s) << "/"
<< COOC[s][t] << "/"
<< COOC.m2(t) << endl;
#endif
return ret;
}
template<typename TKN>
@ -115,7 +126,11 @@ namespace ugdiss
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
<< ": alpha parameter must be >= 0");
return float(COOC[s][t]+alpha)/(COOC.m2(t)+alpha);
float ret = float(COOC[s][t]+alpha);
ret = (ret?ret:1.)/(COOC.m2(t)+alpha);
UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
<< ": result not > 0 and <= 1.");
return ret;
}
template<typename TKN>

View File

@ -0,0 +1,97 @@
#include "ug_phrasepair.h"
namespace Moses {
namespace bitext
{
// Everything between the #if 0 / #endif pair below is compiled out.
// NOTE(review): these are non-template `PhrasePair::` member definitions
// that assign to p1/p2; presumably they are stale relative to the class
// template in ug_phrasepair.h -- verify before re-enabling any of them.
#if 0
// Reset all ids and counts to zero.
void
PhrasePair::
init()
{
p1 = p2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
}
// Initialise side 1 from the SUM of two phrase statistics records and
// size the feature vector to /numfeats/. raw2 is not touched here.
void
PhrasePair::
init(uint64_t const pid1,
pstats const& ps1,
pstats const& ps2,
size_t const numfeats)
{
p1 = pid1;
raw1 = ps1.raw_cnt + ps2.raw_cnt;
sample1 = ps1.sample_cnt + ps2.sample_cnt;
sample2 = 0;
good1 = ps1.good + ps2.good;
good2 = 0;
joint = 0;
fvals.resize(numfeats);
}
// Fill in side 2 from the sum of two joint statistics records; the
// alignment is taken from js1 if it has one, otherwise from js2.
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, jstats const& js1, jstats const& js2)
{
p2 = pid2;
raw2 = js1.cnt2() + js2.cnt2();
joint = js1.rcnt() + js2.rcnt();
assert(js1.aln().size() || js2.aln().size());
if (js1.aln().size())
aln = js1.aln()[0].second;
else if (js2.aln().size())
aln = js2.aln()[0].second;
// NOTE(review): this loop stops BEFORE po_other, whereas the
// three-argument update() further down uses `<= po_other`; entry
// po_other of dfwd/dbwd is therefore not written here -- confirm
// whether that is intended.
for (int i = po_first; i < po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
}
return *this;
}
// Set side 2 to phrase /pid2/ with raw count /r2/ and a joint count of 0.
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, size_t r2)
{
p2 = pid2;
raw2 = r2;
joint = 0;
return *this;
}
// Like the jstats-based update, but adds /raw2extra/ to the raw count
// of side 2.
PhrasePair const&
PhrasePair::
update(uint64_t const pid2,
size_t const raw2extra,
jstats const& js)
{
p2 = pid2;
raw2 = js.cnt2() + raw2extra;
joint = js.rcnt();
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
}
return *this;
}
// Compute, store and return the dot product of the weight vector /w/
// and the feature values; /w/ must have as many entries as fvals.
float
PhrasePair::
eval(vector<float> const& w)
{
assert(w.size() == this->fvals.size());
this->score = 0;
for (size_t i = 0; i < w.size(); ++i)
this->score += w[i] * this->fvals[i];
return this->score;
}
#endif
} // namespace bitext
} // namespace Moses

View File

@ -0,0 +1,243 @@
//-*- c++ -*-
#pragma once
#include "ug_bitext.h"
using namespace ugdiss;
using namespace std;
namespace Moses {
namespace bitext
{
// Renders the phrase of /len/ tokens starting at /x/ as a string of
// space-separated vocabulary entries, looked up in /V/ by token id.
// Returns the empty string for len == 0. Raises an error (UTIL_THROW_IF2)
// if the token chain ends before /len/ tokens have been written.
template<typename Token>
string
toString(TokenIndex const& V, Token const* x, size_t const len)
{
  if (len == 0) return "";
  UTIL_THROW_IF2(x == NULL, HERE << ": Unexpected end of phrase!");
  ostringstream out;
  out << V[x->id()];
  size_t written = 1;
  x = x->next();
  while (x && written < len)
    {
      out << " " << V[x->id()];
      ++written;
      x = x->next();
    }
  UTIL_THROW_IF2(written != len, HERE << ": Unexpected end of phrase!");
  return out.str();
}
template<typename Token>
class
PhrasePair
{
public:
Token const* start1;
Token const* start2;
uint32_t len1;
uint32_t len2;
// uint64_t p1, p2;
uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
vector<float> fvals;
float dfwd[po_other+1]; // distortion counts // counts or probs?
float dbwd[po_other+1]; // distortion counts
vector<uchar> aln;
float score;
PhrasePair() { };
PhrasePair(PhrasePair const& o);
PhrasePair const& operator+=(PhrasePair const& other);
bool operator<(PhrasePair const& other) const;
bool operator>(PhrasePair const& other) const;
bool operator<=(PhrasePair const& other) const;
bool operator>=(PhrasePair const& other) const;
void init();
void init(Token const* x, uint32_t const len,
pstats const* ps = NULL, size_t const numfeats=0);
// void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
// void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
// size_t const numfeats);
// PhrasePair const&
// update(uint64_t const pid2, size_t r2 = 0);
PhrasePair const&
update(Token const* x, uint32_t const len, jstats const& js);
// PhrasePair const&
// update(uint64_t const pid2, jstats const& js1, jstats const& js2);
// PhrasePair const&
// update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
// float
// eval(vector<float> const& w);
class SortByTargetIdSeq
{
public:
int cmp(PhrasePair const& a, PhrasePair const& b) const;
bool operator()(PhrasePair const& a, PhrasePair const& b) const;
};
};
// Sets phrase 1 to the /len/ tokens starting at /x/ and (re)initialises
// the counts: the side-1 counts are taken from /ps/ if it is given and are
// zero otherwise; the joint and side-2 counts are always cleared. The
// feature vector is resized to /numfeats/ entries.
template<typename Token>
void
PhrasePair<Token>::
init(Token const* x, uint32_t const len,
     pstats const* ps, size_t const numfeats)
{
  start1 = x;
  len1   = len;

  // start from a clean slate ...
  raw1 = sample1 = good1 = 0;
  raw2 = sample2 = good2 = joint = 0;

  // ... and fill in the side-1 statistics when available
  if (ps != NULL)
    {
      raw1    = ps->raw_cnt;
      sample1 = ps->sample_cnt;
      good1   = ps->good;
    }
  fvals.resize(numfeats);
}
// Sets phrase 2 to the /len/ tokens starting at /x/ and copies the joint
// statistics from /js/: raw count of side 2, joint count, the first stored
// alignment, and the orientation counts. The orientation counts are
// converted to add-one smoothed relative frequencies, separately for the
// forward and the backward direction. Returns *this.
template<typename Token>
PhrasePair<Token> const&
PhrasePair<Token>::
update(Token const* x, uint32_t const len, jstats const& js)
{
  start2 = x;
  len2   = len;
  raw2   = js.cnt2();
  joint  = js.rcnt();

  assert(js.aln().size());
  if (js.aln().size())
    aln = js.aln()[0].second;

  // pass 1: store the smoothed counts and accumulate their totals
  float fwd_sum = 0, bwd_sum = 0;
  for (int k = po_first; k <= po_other; ++k)
    {
      PhraseOrientation const o = static_cast<PhraseOrientation>(k);
      dfwd[k] = float(js.dcnt_fwd(o)+1);
      dbwd[k] = float(js.dcnt_bwd(o)+1);
      fwd_sum += dfwd[k];
      bwd_sum += dbwd[k];
    }

  // pass 2: normalise
  for (int k = po_first; k <= po_other; ++k)
    {
      dfwd[k] = dfwd[k]/fwd_sum;
      dbwd[k] = dbwd[k]/bwd_sum;
    }
  return *this;
}
// true iff this phrase pair has a lower score than /other/
template<typename Token>
bool
PhrasePair<Token>::
operator<(PhrasePair const& other) const
{
  return other.score > score;
}
// true iff this phrase pair has a higher score than /other/
template<typename Token>
bool
PhrasePair<Token>::
operator>(PhrasePair const& other) const
{
  return other.score < score;
}
// true iff the score of this phrase pair does not exceed that of /other/
template<typename Token>
bool
PhrasePair<Token>::
operator<=(PhrasePair const& other) const
{
  return other.score >= score;
}
// true iff the score of this phrase pair is at least that of /other/
template<typename Token>
bool
PhrasePair<Token>::
operator>=(PhrasePair const& other) const
{
  return other.score <= score;
}
// Adds the seven count fields of /o/ to the corresponding fields of this
// phrase pair. Phrase pointers, feature values, orientation values,
// alignment and score are left untouched. Returns *this.
template<typename Token>
PhrasePair<Token> const&
PhrasePair<Token>::
operator+=(PhrasePair const& o)
{
  // side 1
  raw1    += o.raw1;
  sample1 += o.sample1;
  good1   += o.good1;
  // side 2
  raw2    += o.raw2;
  sample2 += o.sample2;
  good2   += o.good2;
  // both sides together
  joint   += o.joint;
  return *this;
}
// Copy constructor: member-wise copy, including both orientation arrays.
template<typename Token>
PhrasePair<Token>::
PhrasePair(PhrasePair<Token> const& o)
  : start1(o.start1), start2(o.start2)
  , len1(o.len1), len2(o.len2)
  , raw1(o.raw1), raw2(o.raw2)
  , sample1(o.sample1), sample2(o.sample2)
  , good1(o.good1), good2(o.good2)
  , joint(o.joint)
  , fvals(o.fvals)
  , aln(o.aln)
  , score(o.score)
{
  // the raw arrays cannot go into the initialiser list; copy them by hand
  size_t k = 0;
  while (k <= po_other)
    {
      dfwd[k] = o.dfwd[k];
      dbwd[k] = o.dbwd[k];
      ++k;
    }
}
// Three-way lexicographic comparison of the token id sequences of the
// second phrase of /a/ and /b/. Returns -1 if a sorts first, 1 if b sorts
// first and 0 if both sequences are identical. A phrase that is a proper
// prefix of the other one sorts first.
template<typename Token>
int
PhrasePair<Token>::
SortByTargetIdSeq::
cmp(PhrasePair const& a, PhrasePair const& b) const
{
  size_t const shared = a.len2 < b.len2 ? a.len2 : b.len2;
  Token const* ta = a.start2;
  Token const* tb = b.start2;
  for (size_t k = 0; k < shared; ++k, ta = ta->next(), tb = tb->next())
    {
      if (ta->id() != tb->id())
        return ta->id() < tb->id() ? -1 : 1;
    }
  // no difference within the shared prefix: the shorter phrase wins
  if (a.len2 == b.len2) return 0;
  return a.len2 < b.len2 ? -1 : 1;
}
// "Less than" predicate built on cmp(): true iff /a/ sorts before /b/.
template<typename Token>
bool
PhrasePair<Token>::
SortByTargetIdSeq::
operator()(PhrasePair const& a, PhrasePair const& b) const
{
  int const order = this->cmp(a,b);
  return order < 0;
}
// Resets both phrase references (pointer and length) and all seven count
// fields. Feature values, orientation values, alignment and score are not
// touched.
template<typename Token>
void
PhrasePair<Token>::
init()
{
  start1 = NULL;
  start2 = NULL;
  len1 = len2 = 0;
  raw1 = raw2 = 0;
  sample1 = sample2 = 0;
  good1 = good2 = 0;
  joint = 0;
}
} // namespace bitext
} // namespace Moses

View File

@ -7,6 +7,8 @@
#include "ug_typedefs.h"
#include "tpt_tokenindex.h"
#include <iostream>
#include "util/exception.hh"
#include "moses/Util.h"
//#include <cassert>
// #include "ug_bv_iter.h"
@ -60,8 +62,13 @@ namespace ugdiss
// TSA_tree_iterator(TSA_tree_iterator const& other);
TSA_tree_iterator(TSA<Token> const* s);
TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other);
TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
size_t const len,
bool full_match_only=true);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
Token const* kend,
@ -150,9 +157,12 @@ namespace ugdiss
double approxOccurrenceCount(int p=-1) const
{
assert(root);
if (p < 0) p += lower.size();
double ret = arrayByteSpanSize(p)/root->aveIndexEntrySize();
assert(ret < root->corpus->numTokens());
if (ret < 25) ret = rawCnt(p);
UTIL_THROW_IF2(ret > root->corpus->numTokens(), "[" << HERE << "] "
<< "Word count mismatch.");
assert(ret <= root->corpus->numTokens());
return ret;
}
@ -318,6 +328,18 @@ namespace ugdiss
: root(s)
{};
// Builds an iterator over the index /s/ by replaying the token sequence
// of /other/: the new iterator is extended token by token until either
// all other.size() tokens have been consumed or an extension fails.
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other)
  : root(s)
{
  Token const* tok = other.getToken(0);
  size_t done = 0;
  while (done < other.size() && this->extend(tok->id()))
    {
      tok = tok->next();
      ++done;
    }
}
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator
@ -382,6 +404,25 @@ namespace ugdiss
#endif
// Builds an iterator over the index /s/ for the phrase of /len/ tokens
// starting at /kstart/. Does nothing if /s/ is NULL. The iterator is
// extended token by token until /len/ tokens are consumed, the token chain
// ends, or an extension fails. If /full_match_only/ is set and fewer than
// /len/ tokens could be matched, the iterator is emptied again.
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
                  size_t const len, bool full_match_only)
  : root(s)
{
  if (root == NULL) return;
  size_t matched = 0;
  while (matched < len && kstart && extend(*kstart))
    {
      kstart = kstart->next();
      ++matched;
    }
  bool const partial = (matched != len);
  if (full_match_only && partial)
    {
      lower.clear();
      upper.clear();
    }
}
// DEPRECATED: DO NOT USE. Use the one that takes the length
// instead of kend.
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
@ -561,8 +602,7 @@ namespace ugdiss
TSA_tree_iterator<Token>::
rawCnt(int p) const
{
if (p < 0)
p = lower.size()+p;
if (p < 0) p += lower.size();
assert(p>=0);
if (lower.size() == 0) return root->getCorpusSize();
return root->rawCnt(lower[p],upper[p]);

File diff suppressed because it is too large Load Diff

View File

@ -19,6 +19,7 @@
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
#include "moses/InputFileStream.h"
@ -29,7 +30,8 @@
#include <map>
#include "moses/TranslationModel/PhraseDictionary.h"
#include "mmsapt_phrase_scorers.h"
#include "mmsapt_phrase_scorers.h" // deprecated
#include "sapt_phrase_scorers.h"
// TO DO:
// - make lexical phrase scorer take addition to the "dynamic overlay" into account
@ -47,47 +49,68 @@ namespace Moses
#endif
{
friend class Alignment;
map<string,string> param;
public:
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
typedef imBitext<Token> imbitext;
typedef Bitext<Token> bitext;
typedef TSA<Token> tsa;
typedef PhraseScorer<Token> pscorer;
private:
// vector<sptr<bitext> > shards;
mmbitext btfix;
sptr<imbitext> btdyn;
sptr<imbitext> btdyn;
string bname,extra_data;
string L1;
string L2;
float m_lbop_parameter;
float m_lex_alpha;
float m_lbop_conf; // confidence level for lbop smoothing
float m_lex_alpha; // alpha paramter (j+a)/(m+a) for lexical smoothing
// alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha)
// must be > 0 if dynamic
size_t m_default_sample_size;
size_t m_workers; // number of worker threads for sampling the bitexts
// deprecated!
char m_pfwd_denom; // denominator for computation of fwd phrase score:
// 'r' - divide by raw count
// 's' - divide by sample count
// 'g' - divide by number of "good" (i.e. coherent) samples
// size_t num_features;
// // deprecated!
// char m_pfwd_denom; // denominator for computation of fwd phrase score:
// // 'r' - divide by raw count
// // 's' - divide by sample count
// // 'g' - divide by number of "good" (i.e. coherent) samples
// // size_t num_features;
size_t input_factor;
size_t output_factor; // we can actually return entire Tokens!
bool withLogCountFeatures; // add logs of counts as features?
bool withCoherence;
string m_pfwd_features; // which pfwd functions to use
string m_pbwd_features; // which pbwd functions to use
// bool withLogCountFeatures; // add logs of counts as features?
// bool withCoherence;
// string m_pfwd_features; // which pfwd functions to use
// string m_pbwd_features; // which pbwd functions to use
// for display for human inspection (ttable dumps):
vector<string> m_feature_names; // names of features activated
vector<bool> m_is_logval; // keeps track of which features are log valued
vector<bool> m_is_integer; // keeps track of which features are integer valued
vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (dyn)
size_t
add_corpus_specific_features
(vector<sptr<pscorer > >& ffvec, size_t num_feats);
void
register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry);
template<typename fftype>
void
check_ff(string const ffname,vector<sptr<pscorer> >* registry = NULL);
// add feature function if specified
template<typename fftype>
void
check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry = NULL);
// add feature function if specified
void
add_corpus_specific_features(vector<sptr<pscorer > >& ffvec);
// built-in feature functions
// PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
@ -140,12 +163,24 @@ namespace Moses
mm2dtable_t COOCraw;
TargetPhrase*
createTargetPhrase
mkTPhrase(Phrase const& src,
Moses::bitext::PhrasePair<Token>* fix,
Moses::bitext::PhrasePair<Token>* dyn,
sptr<Bitext<Token> > const& dynbt) const;
// template<typename Token>
// void
// expand(typename Bitext<Token>::iter const& m, Bitext<Token> const& bt,
// pstats const& pstats, vector<PhrasePair<Token> >& dest);
#if 0
TargetPhrase*
mkTPhrase
(Phrase const& src,
Bitext<Token> const& bt,
bitext::PhrasePair const& pp
Moses::bitext::PhrasePair const& pp
) const;
#endif
void
process_pstats
(Phrase const& src,
@ -180,7 +215,7 @@ namespace Moses
) const;
void
load_extra_data(string bname);
load_extra_data(string bname, bool locking);
mutable size_t m_tpc_ctr;
public:
@ -231,8 +266,14 @@ namespace Moses
vector<string> const&
GetFeatureNames() const;
void
ScorePPfix(bitext::PhrasePair& pp) const;
// void
// ScorePPfix(bitext::PhrasePair& pp) const;
bool
isLogVal(int i) const;
bool
isInteger(int i) const;
private:
};

View File

@ -1,335 +1,336 @@
#include "mmsapt.h"
// currently broken
namespace Moses
{
using namespace bitext;
using namespace std;
using namespace boost;
// namespace Moses
// {
// using namespace bitext;
// using namespace std;
// using namespace boost;
struct PPgreater
{
bool operator()(PhrasePair const& a, PhrasePair const& b)
{
return a.score > b.score;
}
};
// struct PPgreater
// {
// bool operator()(PhrasePair const& a, PhrasePair const& b)
// {
// return a.score > b.score;
// }
// };
void
Mmsapt::
setWeights(vector<float> const & w)
{
assert(w.size() == this->m_numScoreComponents);
this->feature_weights = w;
}
// void
// Mmsapt::
// setWeights(vector<float> const & w)
// {
// assert(w.size() == this->m_numScoreComponents);
// this->feature_weights = w;
// }
struct PhraseAlnHyp
{
PhrasePair pp;
ushort s1,e1,s2,e2; // start and end positions
int prev; // preceding alignment hypothesis
float score;
bitvector scov; // source coverage
PhraseAlnHyp(PhrasePair const& ppx, int slen,
pair<uint32_t,uint32_t> const& sspan,
pair<uint32_t,uint32_t> const& tspan)
: pp(ppx), prev(-1), score(ppx.score), scov(slen)
{
s1 = sspan.first; e1 = sspan.second;
s2 = tspan.first; e2 = tspan.second;
for (size_t i = s1; i < e1; ++i)
scov.set(i);
}
// struct PhraseAlnHyp
// {
// PhrasePair pp;
// ushort s1,e1,s2,e2; // start and end positions
// int prev; // preceding alignment hypothesis
// float score;
// bitvector scov; // source coverage
// PhraseAlnHyp(PhrasePair const& ppx, int slen,
// pair<uint32_t,uint32_t> const& sspan,
// pair<uint32_t,uint32_t> const& tspan)
// : pp(ppx), prev(-1), score(ppx.score), scov(slen)
// {
// s1 = sspan.first; e1 = sspan.second;
// s2 = tspan.first; e2 = tspan.second;
// for (size_t i = s1; i < e1; ++i)
// scov.set(i);
// }
bool operator<(PhraseAlnHyp const& other) const
{
return this->score < other.score;
}
// bool operator<(PhraseAlnHyp const& other) const
// {
// return this->score < other.score;
// }
bool operator>(PhraseAlnHyp const& other) const
{
return this->score > other.score;
}
// bool operator>(PhraseAlnHyp const& other) const
// {
// return this->score > other.score;
// }
PhraseOrientation
po_bwd(PhraseAlnHyp const* prev) const
{
if (s2 == 0) return po_first;
assert(prev);
assert(prev->e2 <= s2);
if (prev->e2 < s2) return po_other;
if (prev->e1 == s1) return po_mono;
if (prev->e1 < s1) return po_jfwd;
if (prev->s1 == e1) return po_swap;
if (prev->s1 > e1) return po_jbwd;
return po_other;
}
// PhraseOrientation
// po_bwd(PhraseAlnHyp const* prev) const
// {
// if (s2 == 0) return po_first;
// assert(prev);
// assert(prev->e2 <= s2);
// if (prev->e2 < s2) return po_other;
// if (prev->e1 == s1) return po_mono;
// if (prev->e1 < s1) return po_jfwd;
// if (prev->s1 == e1) return po_swap;
// if (prev->s1 > e1) return po_jbwd;
// return po_other;
// }
PhraseOrientation
po_fwd(PhraseAlnHyp const* next) const
{
if (!next) return po_last;
assert(next->s2 >= e2);
if (next->s2 < e2) return po_other;
if (next->e1 == s1) return po_swap;
if (next->e1 < s1) return po_jbwd;
if (next->s1 == e1) return po_mono;
if (next->s1 > e1) return po_jfwd;
return po_other;
}
// PhraseOrientation
// po_fwd(PhraseAlnHyp const* next) const
// {
// if (!next) return po_last;
// assert(next->s2 >= e2);
// if (next->s2 < e2) return po_other;
// if (next->e1 == s1) return po_swap;
// if (next->e1 < s1) return po_jbwd;
// if (next->s1 == e1) return po_mono;
// if (next->s1 > e1) return po_jfwd;
// return po_other;
// }
float
dprob_fwd(PhraseAlnHyp const& next)
{
return pp.dfwd[po_fwd(&next)];
}
// float
// dprob_fwd(PhraseAlnHyp const& next)
// {
// return pp.dfwd[po_fwd(&next)];
// }
float
dprob_bwd(PhraseAlnHyp const& prev)
{
return pp.dbwd[po_bwd(&prev)];
}
// float
// dprob_bwd(PhraseAlnHyp const& prev)
// {
// return pp.dbwd[po_bwd(&prev)];
// }
};
// };
class Alignment
{
typedef L2R_Token<SimpleWordId> Token;
typedef TSA<Token> tsa;
typedef pair<uint32_t, uint32_t> span;
typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
typedef pstats::trg_map_t jStatsTable;
// class Alignment
// {
// typedef L2R_Token<SimpleWordId> Token;
// typedef TSA<Token> tsa;
// typedef pair<uint32_t, uint32_t> span;
// typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
// typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
// typedef pstats::trg_map_t jStatsTable;
Mmsapt const& PT;
vector<id_type> s,t;
pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
pid2span_t spid2span,tpid2span;
vector<vector<sptr<pstats> > > spstats;
// Mmsapt const& PT;
// vector<id_type> s,t;
// pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
// pid2span_t spid2span,tpid2span;
// vector<vector<sptr<pstats> > > spstats;
vector<PhrasePair> PP;
// position-independent phrase pair info
public:
vector<PhraseAlnHyp> PAH;
vector<vector<int> > tpos2ahyp;
// maps from target start positions to PhraseAlnHyps starting at
// that position
// vector<PhrasePair> PP;
// // position-independent phrase pair info
// public:
// vector<PhraseAlnHyp> PAH;
// vector<vector<int> > tpos2ahyp;
// // maps from target start positions to PhraseAlnHyps starting at
// // that position
sptr<pstats> getPstats(span const& sspan);
void fill_tspan_maps();
void fill_sspan_maps();
public:
Alignment(Mmsapt const& pt, string const& src, string const& trg);
void show(ostream& out);
void show(ostream& out, PhraseAlnHyp const& ah);
};
// sptr<pstats> getPstats(span const& sspan);
// void fill_tspan_maps();
// void fill_sspan_maps();
// public:
// Alignment(Mmsapt const& pt, string const& src, string const& trg);
// void show(ostream& out);
// void show(ostream& out, PhraseAlnHyp const& ah);
// };
void
Alignment::
show(ostream& out, PhraseAlnHyp const& ah)
{
#if 0
LexicalPhraseScorer2<Token>::table_t const&
COOCjnt = PT.calc_lex.scorer.COOC;
// void
// Alignment::
// show(ostream& out, PhraseAlnHyp const& ah)
// {
// #if 0
// LexicalPhraseScorer2<Token>::table_t const&
// COOCjnt = PT.calc_lex.scorer.COOC;
out << setw(10) << exp(ah.score) << " "
<< PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
<< " <=> "
<< PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
vector<uchar> const& a = ah.pp.aln;
// BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
for (size_t u = 0; u+1 < a.size(); u += 2)
out << " " << int(a[u+1]) << "-" << int(a[u]);
// out << setw(10) << exp(ah.score) << " "
// << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
// << " <=> "
// << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
// vector<uchar> const& a = ah.pp.aln;
// // BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
// for (size_t u = 0; u+1 < a.size(); u += 2)
// out << " " << int(a[u+1]) << "-" << int(a[u]);
if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
<< "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
<< "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
out << endl;
// float const* ofwdj = ah.pp.dfwd;
// float const* obwdj = ah.pp.dbwd;
// uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
// uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
// out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
// << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
// << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
// << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
// << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
// << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
// << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
// << "]" << endl
// << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
// << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
// << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
// << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
// << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
// << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
// << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
// << "]" << endl;
#endif
}
// if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
// out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
// << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
// << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
// out << endl;
// // float const* ofwdj = ah.pp.dfwd;
// // float const* obwdj = ah.pp.dbwd;
// // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
// // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
// // out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
// // << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
// // << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
// // << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
// // << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
// // << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
// // << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
// // << "]" << endl
// // << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
// // << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
// // << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
// // << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
// // << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
// // << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
// // << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
// // << "]" << endl;
// #endif
// }
void
Alignment::
show(ostream& out)
{
// show what we have so far ...
for (size_t s2 = 0; s2 < t.size(); ++s2)
{
VectorIndexSorter<PhraseAlnHyp> foo(PAH);
sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
show(out,PAH[tpos2ahyp[s2][h]]);
}
}
// void
// Alignment::
// show(ostream& out)
// {
// // show what we have so far ...
// for (size_t s2 = 0; s2 < t.size(); ++s2)
// {
// VectorIndexSorter<PhraseAlnHyp> foo(PAH);
// sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
// for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
// show(out,PAH[tpos2ahyp[s2][h]]);
// }
// }
sptr<pstats>
Alignment::
getPstats(span const& sspan)
{
size_t k = sspan.second - sspan.first - 1;
if (k < spstats[sspan.first].size())
return spstats[sspan.first][k];
else return sptr<pstats>();
}
// sptr<pstats>
// Alignment::
// getPstats(span const& sspan)
// {
// size_t k = sspan.second - sspan.first - 1;
// if (k < spstats[sspan.first].size())
// return spstats[sspan.first][k];
// else return sptr<pstats>();
// }
void
Alignment::
fill_tspan_maps()
{
tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
for (size_t i = 0; i < t.size(); ++i)
{
tsa::tree_iterator m(PT.btfix.I2.get());
for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
{
uint64_t pid = m.getPid();
tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
tspan2pid[i][k] = pid;
}
}
}
// void
// Alignment::
// fill_tspan_maps()
// {
// tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
// for (size_t i = 0; i < t.size(); ++i)
// {
// tsa::tree_iterator m(PT.btfix.I2.get());
// for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
// {
// uint64_t pid = m.getPid();
// tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
// tspan2pid[i][k] = pid;
// }
// }
// }
void
Alignment::
fill_sspan_maps()
{
sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
spstats.resize(s.size());
for (size_t i = 0; i < s.size(); ++i)
{
tsa::tree_iterator m(PT.btfix.I1.get());
for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
{
uint64_t pid = m.getPid();
sspan2pid[i][k] = pid;
pid2span_t::iterator p = spid2span.find(pid);
if (p != spid2span.end())
{
int x = p->second[0].first;
int y = p->second[0].second-1;
spstats[i].push_back(spstats[x][y-x]);
}
else
{
spstats[i].push_back(PT.btfix.lookup(m));
cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
<< spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
<< endl;
}
spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
}
}
}
// void
// Alignment::
// fill_sspan_maps()
// {
// sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
// spstats.resize(s.size());
// for (size_t i = 0; i < s.size(); ++i)
// {
// tsa::tree_iterator m(PT.btfix.I1.get());
// for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
// {
// uint64_t pid = m.getPid();
// sspan2pid[i][k] = pid;
// pid2span_t::iterator p = spid2span.find(pid);
// if (p != spid2span.end())
// {
// int x = p->second[0].first;
// int y = p->second[0].second-1;
// spstats[i].push_back(spstats[x][y-x]);
// }
// else
// {
// spstats[i].push_back(PT.btfix.lookup(m));
// cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
// << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
// << endl;
// }
// spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
// }
// }
// }
Alignment::
Alignment(Mmsapt const& pt, string const& src, string const& trg)
: PT(pt)
{
PT.btfix.V1->fillIdSeq(src,s);
PT.btfix.V2->fillIdSeq(trg,t);
// Alignment::
// Alignment(Mmsapt const& pt, string const& src, string const& trg)
// : PT(pt)
// {
// PT.btfix.V1->fillIdSeq(src,s);
// PT.btfix.V2->fillIdSeq(trg,t);
// LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
// BOOST_FOREACH(id_type i, t)
// {
// cout << (*PT.btfix.V2)[i];
// if (i < PT.wlex21.size())
// {
// BOOST_FOREACH(id_type k, PT.wlex21[i])
// {
// size_t j = COOC[k][i];
// size_t m1 = COOC.m1(k);
// size_t m2 = COOC.m2(i);
// if (j*1000 > m1 && j*1000 > m2)
// cout << " " << (*PT.btfix.V1)[k];
// }
// }
// cout << endl;
// }
// // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
// // BOOST_FOREACH(id_type i, t)
// // {
// // cout << (*PT.btfix.V2)[i];
// // if (i < PT.wlex21.size())
// // {
// // BOOST_FOREACH(id_type k, PT.wlex21[i])
// // {
// // size_t j = COOC[k][i];
// // size_t m1 = COOC.m1(k);
// // size_t m2 = COOC.m2(i);
// // if (j*1000 > m1 && j*1000 > m2)
// // cout << " " << (*PT.btfix.V1)[k];
// // }
// // }
// // cout << endl;
// // }
fill_tspan_maps();
fill_sspan_maps();
tpos2ahyp.resize(t.size());
// now fill the association score table
PAH.reserve(1000000);
typedef pid2span_t::iterator psiter;
for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
{
if (!L->second.size()) continue; // should never happen anyway
int i = L->second[0].first;
int k = L->second[0].second - i -1;
sptr<pstats> ps = spstats[i][k];
PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
jStatsTable & J = ps->trg;
for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
{
psiter R = tpid2span.find(y->first);
if (R == tpid2span.end()) continue;
pp.update(y->first, y->second);
PT.ScorePPfix(pp);
pp.eval(PT.feature_weights);
PP.push_back(pp);
BOOST_FOREACH(span const& sspan, L->second)
{
BOOST_FOREACH(span const& tspan, R->second)
{
tpos2ahyp[tspan.first].push_back(PAH.size());
PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
}
}
}
}
}
// fill_tspan_maps();
// fill_sspan_maps();
// tpos2ahyp.resize(t.size());
// // now fill the association score table
// PAH.reserve(1000000);
// typedef pid2span_t::iterator psiter;
// for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
// {
// if (!L->second.size()) continue; // should never happen anyway
// int i = L->second[0].first;
// int k = L->second[0].second - i -1;
// sptr<pstats> ps = spstats[i][k];
// PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
// jStatsTable & J = ps->trg;
// for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
// {
// psiter R = tpid2span.find(y->first);
// if (R == tpid2span.end()) continue;
// pp.update(y->first, y->second);
// PT.ScorePPfix(pp);
// pp.eval(PT.feature_weights);
// PP.push_back(pp);
// BOOST_FOREACH(span const& sspan, L->second)
// {
// BOOST_FOREACH(span const& tspan, R->second)
// {
// tpos2ahyp[tspan.first].push_back(PAH.size());
// PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
// }
// }
// }
// }
// }
int
extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
{
if ((PAH[edge].scov & PAH[next].scov).count())
return -1;
int ret = PAH.size();
PAH.push_back(PAH[next]);
PhraseAlnHyp & h = PAH.back();
h.prev = edge;
h.scov |= PAH[edge].scov;
h.score += log(PAH[edge].dprob_fwd(PAH[next]));
h.score += log(PAH[next].dprob_bwd(PAH[edge]));
return ret;
}
// int
// extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
// {
// if ((PAH[edge].scov & PAH[next].scov).count())
// return -1;
// int ret = PAH.size();
// PAH.push_back(PAH[next]);
// PhraseAlnHyp & h = PAH.back();
// h.prev = edge;
// h.scov |= PAH[edge].scov;
// h.score += log(PAH[edge].dprob_fwd(PAH[next]));
// h.score += log(PAH[next].dprob_bwd(PAH[edge]));
// return ret;
// }
sptr<vector<int> >
Mmsapt::
align(string const& src, string const& trg) const
{
// For the time being, we consult only the fixed bitext.
// We might also consider the dynamic bitext. => TO DO.
Alignment A(*this,src,trg);
VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
vector<size_t> o; foo.GetOrder(o);
BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
sptr<vector<int> > aln;
return aln;
}
}
// sptr<vector<int> >
// Mmsapt::
// align(string const& src, string const& trg) const
// {
// // For the time being, we consult only the fixed bitext.
// // We might also consider the dynamic bitext. => TO DO.
// Alignment A(*this,src,trg);
// VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
// vector<size_t> o; foo.GetOrder(o);
// BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
// sptr<vector<int> > aln;
// return aln;
// }
// }

View File

@ -1,268 +1,17 @@
// -*- c++ -*-
// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
#include "boost/format.hpp"
#include "sapt_pscore_base.h"
// DEPRECATED CODE: Word and phrase penalties are now
// added by the decoder.
namespace Moses {
namespace bitext
{
// Abstract interface shared by all phrase scoring functions.
//
// A scorer uses m_num_feats consecutive feature slots starting at
// m_index. fname() takes an absolute slot index and maps it back to
// the name stored for that slot.
template<typename Token>
class
PhraseScorer
{
protected:
  int m_index;                    // index of this scorer's first feature slot
  int m_num_feats;                // number of feature slots used
  vector<string> m_feature_names; // feature names, addressed relative to m_index
public:
  // This is a polymorphic base class (pure virtual operator()), so the
  // destructor is virtual to make destruction through a base-class
  // pointer well defined.
  virtual
  ~PhraseScorer() {}

  // Compute the feature value(s) for /pp/ and store them in /dest/;
  // implementations fall back to pp.fvals when /dest/ is NULL.
  virtual
  void
  operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest=NULL)
    const = 0;

  // number of features provided by this scorer
  int
  fcnt() const
  { return m_num_feats; }

  // names of all features provided by this scorer
  vector<string> const &
  fnames() const
  { return m_feature_names; }

  // name of the feature stored in absolute slot /i/;
  // throws if /i/ is outside [m_index, m_index + m_num_feats)
  string const &
  fname(int i) const
  {
    UTIL_THROW_IF2((i < m_index || i >= m_index + m_num_feats),
                   "Feature name index out of range at "
                   << __FILE__ << ":" << __LINE__);
    return m_feature_names.at(i - m_index);
  }

  // index of the first feature slot
  int
  getIndex() const
  { return m_index; }
};
////////////////////////////////////////////////////////////////////////////////
// Forward phrase score: log of lbop() applied to one of the source-side
// counts (chosen by the denominator code) and the joint count.
template<typename Token>
class
PScorePfwd : public PhraseScorer<Token>
{
  float conf;  // third argument handed to lbop()
  char denom;  // 'g': pp.good1, 's': pp.sample1, 'r': pp.raw1
public:
  PScorePfwd()
  {
    this->m_num_feats = 1;
  }

  // Store the parameters, register the feature name and return the
  // index that follows this scorer's feature slot.
  int
  init(int const i, float const c, char d)
  {
    conf  = c;
    denom = d;
    this->m_index = i;
    ostringstream label;
    label << format("pfwd-%c%.3f") % denom % c;
    this->m_feature_names.push_back(label.str());
    return i + this->m_num_feats;
  }

  // Write the score into slot m_index of /dest/ (pp.fvals if NULL).
  // Nothing is written when the denominator code is not 'g', 's' or 'r'.
  void
  operator()(Bitext<Token> const& bt, PhrasePair & pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;

    // diagnostic output when the joint count exceeds the good count
    if (pp.joint > pp.good1)
      {
        cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
        cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
      }

    if (denom == 'g')
      (*dest)[this->m_index] = log(lbop(pp.good1, pp.joint, conf));
    else if (denom == 's')
      (*dest)[this->m_index] = log(lbop(pp.sample1, pp.joint, conf));
    else if (denom == 'r')
      (*dest)[this->m_index] = log(lbop(pp.raw1, pp.joint, conf));
  }
};
////////////////////////////////////////////////////////////////////////////////
// Backward phrase score: log of lbop() applied to the (optionally
// rescaled) raw target-side count and the joint count.
template<typename Token>
class
PScorePbwd : public PhraseScorer<Token>
{
  float conf;  // third argument handed to lbop()
  char denom;  // 'g' / 's' rescale the target count; anything else leaves it raw
public:
  PScorePbwd()
  {
    this->m_num_feats = 1;
  }

  // Store the parameters, register the feature name and return the
  // index that follows this scorer's feature slot.
  int
  init(int const i, float const c, char d)
  {
    conf  = c;
    denom = d;
    this->m_index = i;
    ostringstream label;
    label << format("pbwd-%c%.3f") % denom % c;
    this->m_feature_names.push_back(label.str());
    return i + this->m_num_feats;
  }

  // Write the score into slot m_index of /dest/ (pp.fvals if NULL).
  void
  operator()(Bitext<Token> const& bt, PhrasePair& pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;

    // The denominator specification is used to scale the raw counts on
    // the target side; the clean way would be to counter-sample.
    uint32_t r2 = pp.raw2;
    switch (denom)
      {
      case 'g':
        r2 = round(r2 * float(pp.good1) / pp.raw1);
        break;
      case 's':
        r2 = round(r2 * float(pp.sample1) / pp.raw1);
        break;
      default:
        break;
      }
    (*dest)[this->m_index] = log(lbop(max(r2, pp.joint),pp.joint,conf));
  }
};
////////////////////////////////////////////////////////////////////////////////
// Coherence feature: log(good1) - log(sample1).
template<typename Token>
class
PScoreCoherence : public PhraseScorer<Token>
{
public:
  PScoreCoherence()
  {
    this->m_num_feats = 1;
  }

  // Set the feature slot, register the feature name and return the
  // index that follows this scorer's slot.
  int
  init(int const i)
  {
    this->m_index = i;
    string const label("coherence");
    this->m_feature_names.push_back(label);
    return i + this->m_num_feats;
  }

  // Write the score into slot m_index of /dest/ (pp.fvals if NULL).
  void
  operator()(Bitext<Token> const& bt, PhrasePair& pp,
             vector<float> * dest = NULL) const
  {
    vector<float> * out = dest;
    if (!out) out = &pp.fvals;
    (*out)[this->m_index] = log(pp.good1) - log(pp.sample1);
  }
};
////////////////////////////////////////////////////////////////////////////////
// Five count-based features: -log(raw1), -log(sample1), -log(good1),
// +log(joint) and -log(raw2), stored in consecutive slots.
template<typename Token>
class
PScoreLogCounts : public PhraseScorer<Token>
{
  float conf;
public:
  PScoreLogCounts()
  {
    this->m_num_feats = 5;
  }

  // Set the first feature slot, register the five feature names and
  // return the index that follows this scorer's last slot.
  int
  init(int const i)
  {
    static char const* const labels[] =
      { "log-r1", "log-s1", "log-g1", "log-j", "log-r2" };
    this->m_index = i;
    for (size_t k = 0; k < sizeof(labels)/sizeof(labels[0]); ++k)
      this->m_feature_names.push_back(labels[k]);
    return i + this->m_num_feats;
  }

  // Write the five values into /dest/ (pp.fvals if NULL), starting at
  // slot m_index. All counts are asserted to be non-zero.
  void
  operator()(Bitext<Token> const& bt, PhrasePair& pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;
    size_t const base = this->m_index;
    assert(pp.raw1);
    assert(pp.sample1);
    assert(pp.good1);
    assert(pp.joint);
    assert(pp.raw2);
    (*dest)[base]     = -log(pp.raw1);
    (*dest)[base + 1] = -log(pp.sample1);
    (*dest)[base + 2] = -log(pp.good1);
    (*dest)[base + 3] = +log(pp.joint);
    (*dest)[base + 4] = -log(pp.raw2);
  }
};
// Lexical phrase scores (features "lexfwd" and "lexbwd"), computed by
// a LexicalPhraseScorer2 from the word alignment of the phrase pair.
template<typename Token>
class
PScoreLex : public PhraseScorer<Token>
{
  float const m_alpha; // passed through to scorer.score()
public:
  LexicalPhraseScorer2<Token> scorer;

  PScoreLex(float const a)
    : m_alpha(a)
  {
    this->m_num_feats = 2;
  }

  // Open the lexicon file /fname/, set the first feature slot, register
  // the two feature names and return the index after the last slot.
  int
  init(int const i, string const& fname)
  {
    scorer.open(fname);
    this->m_index = i;
    this->m_feature_names.push_back("lexfwd");
    this->m_feature_names.push_back("lexbwd");
    return i + this->m_num_feats;
  }

  // Write both scores into slots m_index and m_index+1 of /dest/
  // (pp.fvals if NULL).
  void
  operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;

    // decode both phrase IDs into sentence id, offset and length
    uint32_t sid1=0, off1=0, len1=0;
    uint32_t sid2=0, off2=0, len2=0;
    parse_pid(pp.p1, sid1, off1, len1);
    parse_pid(pp.p2, sid2, off2, len2);

#if 0
    cout << len1 << " " << len2 << endl;
    Token const* t1 = bt.T1->sntStart(sid1);
    for (size_t i = off1; i < off1 + len1; ++i)
      cout << (*bt.V1)[t1[i].id()] << " ";
    cout << __FILE__ << ":" << __LINE__ << endl;
    Token const* t2 = bt.T2->sntStart(sid2);
    for (size_t i = off2; i < off2 + len2; ++i)
      cout << (*bt.V2)[t2[i].id()] << " ";
    cout << __FILE__ << ":" << __LINE__ << endl;
    BOOST_FOREACH (int a, pp.aln)
      cout << a << " " ;
    cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
#endif

    scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
                 bt.T2->sntStart(sid2)+off2,0,len2,
                 pp.aln, m_alpha,
                 (*dest)[this->m_index],
                 (*dest)[this->m_index+1]);
  }
};
/// Word penalty
template<typename Token>
class
@ -280,7 +29,8 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
operator()(Bitext<Token> const& bt, PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
uint32_t sid2=0,off2=0,len2=0;
@ -307,7 +57,8 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
operator()(Bitext<Token> const& bt, PhrasePair<Token>& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = 1;

View File

@ -106,15 +106,11 @@ int main(int argc, char* argv[])
cout << " ";
for (size_t k = idx.first; k < idx.second; ++k)
{
if (mmsapt && fname[k-idx.first].substr(0,3) == "log")
{
if(scores[k] < 0)
cout << " " << format("%10d") % round(exp(-scores[k]));
else
cout << " " << format("%10d") % round(exp(scores[k]));
}
else
cout << " " << format("%10.8f") % exp(scores[k]);
size_t j = k-idx.first;
float f = (mmsapt ? mmsapt->isLogVal(j) ? exp(scores[k]) : scores[k]
: scores[k] < 0 ? exp(scores[k]) : scores[k]);
string fmt = (mmsapt && mmsapt->isInteger(j)) ? "%10d" : "%10.8f";
cout << " " << format(fmt) % f;
}
cout << endl;
}

View File

@ -0,0 +1,13 @@
//-*- c++ -*-
#pragma once
#include <stdint.h>
using namespace std;
namespace sapt
{
using namespace Moses;
using namespace std;
}

View File

@ -0,0 +1,12 @@
// -*- c++ -*-
// Phrase scoring functions for suffix array-based phrase tables
// written by Ulrich Germann
#pragma once
#include "sapt_pscore_unaligned.h" // count # of unaligned words
#include "sapt_pscore_provenance.h" // reward for joint phrase occ. per corpus
#include "sapt_pscore_rareness.h" // penalty for rare occurrences (global?)
#include "sapt_pscore_logcnt.h" // logs of observed counts
#include "sapt_pscore_lex1.h" // plain vanilla Moses lexical scores
#include "sapt_pscore_pfwd.h" // fwd phrase prob
#include "sapt_pscore_pbwd.h" // bwd phrase prob
#include "sapt_pscore_coherence.h" // coherence feature: good/sample-size

View File

@ -0,0 +1,103 @@
// -*- c++ -*-
// Base classes for suffix array-based phrase scorers
// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
#include "util/exception.hh"
#include "boost/format.hpp"
namespace Moses {
namespace bitext
{
// abstract base class that defines the common API for phrase scorers
// Abstract base class that defines the common API for phrase scorers.
//
// A scorer contributes m_num_feats features; m_index is the position of
// its first feature in the destination vector (see setIndex()).
template<typename Token>
class
PhraseScorer
{
protected:
  int m_index;                    // first feature slot of this scorer
  int m_num_feats;                // number of features provided
  string m_tag;                   // tag identifying the scorer (family)
  vector<string> m_feature_names; // names of the features provided
public:
  // This is a polymorphic base class, so the destructor is virtual to
  // make destruction through a base-class pointer well defined.
  virtual
  ~PhraseScorer() {}

  // Compute the feature value(s) for /pp/ and store them in /dest/;
  // implementations fall back to pp.fvals when /dest/ is NULL.
  virtual
  void
  operator()(Bitext<Token> const& pt,
             PhrasePair<Token>& pp,
             vector<float> * dest=NULL)
    const = 0;

  void
  setIndex(int const i) { m_index = i; }

  int
  getIndex() const { return m_index; }

  int
  fcnt() const { return m_num_feats; }

  vector<string> const &
  fnames() const { return m_feature_names; }

  // Name of the i-th feature of this scorer (relative index).
  // Negative values count from the end; throws if out of range.
  string const &
  fname(int i) const
  {
    if (i < 0) i += m_num_feats;
    UTIL_THROW_IF2(i < 0 || i >= m_num_feats,
                   "Feature name index out of range at " << HERE);
    return m_feature_names.at(i);
  }

  virtual
  bool
  isLogVal(int i) const { return true; }
  // is this feature log valued?

  virtual
  bool
  isIntegerValued(int i) const { return false; }
  // is this feature integer valued (e.g., count features)?

  virtual
  bool
  allowPooling() const { return true; }
  // does this feature function allow pooling of counts if
  // there are no occurrences in the respective corpus?
};
// base class for 'families' of phrase scorers that have a single
// real-valued parameter (one feature per parameter value in the specs)
// Scorer family in which every feature is determined by one real-valued
// parameter; init() creates one feature per value in the specification.
template<typename Token>
class
SingleRealValuedParameterPhraseScorerFamily
  : public PhraseScorer<Token>
{
protected:
  vector<float> m_x; // parameter values, one per feature

  // Parse /specs/ as a list of numbers, each followed by one separator
  // character, and register a feature named "<m_tag>-<value>" for each.
  // Throws if m_tag is empty, if /specs/ is empty, or if feature names
  // have already been registered.
  virtual
  void
  init(string const specs)
  {
    using namespace boost;
    UTIL_THROW_IF2(this->m_tag.size() == 0,
                   "m_tag must be initialized in constructor");
    UTIL_THROW_IF2(specs.size() == 0,"empty specification string!");
    UTIL_THROW_IF2(this->m_feature_names.size(),
                   "PhraseScorer can only be initialized once!");
    this->m_index = -1;

    istringstream in(specs);
    float val;
    char sep;
    while (in >> val)
      {
        this->m_x.push_back(val);
        this->m_feature_names.push_back
          ((format("%s-%.2f") % this->m_tag % val).str());
        in >> sep; // consume the separator that follows the number
      }
    this->m_num_feats = this->m_x.size();
  }
};
} // namespace bitext
} // namespace moses

View File

@ -0,0 +1,33 @@
// -*- c++ -*-
// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
#include "boost/format.hpp"
namespace Moses {
namespace bitext
{
// Coherence feature: log(good1) - log(sample1).
template<typename Token>
class
PScoreCoherence : public PhraseScorer<Token>
{
public:
  // The argument is not used; the feature slot starts out as -1.
  PScoreCoherence(string const dummy)
  {
    this->m_num_feats = 1;
    this->m_index = -1;
    string const label("coherence");
    this->m_feature_names.push_back(label);
  }

  // Write the score into slot m_index of /dest/ (pp.fvals if NULL).
  void
  operator()(Bitext<Token> const& bt,
             PhrasePair<Token>& pp,
             vector<float> * dest = NULL) const
  {
    vector<float> * out = dest;
    if (!out) out = &pp.fvals;
    (*out)[this->m_index] = log(pp.good1) - log(pp.sample1);
  }
};
}
}

View File

@ -0,0 +1,70 @@
// -*- c++ -*-
// Phrase scorer that computes lexical translation scores (lexfwd, lexbwd)
// written by Ulrich Germann
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
namespace Moses {
namespace bitext
{
// Lexical phrase scores (features "lexfwd" and "lexbwd"), computed by
// a LexicalPhraseScorer2 from the word alignment of the phrase pair.
template<typename Token>
class
PScoreLex1 : public PhraseScorer<Token>
{
  float m_alpha; // passed through to scorer.score()
public:
  LexicalPhraseScorer2<Token> scorer;

  // /alpaspec/ is converted to m_alpha with atof(); /lexfile/ is opened
  // by the underlying scorer.
  PScoreLex1(string const& alpaspec, string const& lexfile)
    : m_alpha(atof(alpaspec.c_str()))
  {
    this->m_index = -1;
    this->m_num_feats = 2;
    this->m_feature_names.reserve(2);
    this->m_feature_names.push_back("lexfwd");
    this->m_feature_names.push_back("lexbwd");
    scorer.open(lexfile);
  }

  // Write both scores into slots m_index and m_index+1 of /dest/
  // (pp.fvals if NULL).
  void
  operator()(Bitext<Token> const& bt,
             PhrasePair<Token>& pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;
    // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
    // parse_pid(pp.p1, sid1, off1, len1);
    // parse_pid(pp.p2, sid2, off2, len2);
#if 0
    cout << len1 << " " << len2 << endl;
    Token const* t1 = bt.T1->sntStart(sid1);
    for (size_t i = off1; i < off1 + len1; ++i)
      cout << (*bt.V1)[t1[i].id()] << " ";
    cout << __FILE__ << ":" << __LINE__ << endl;
    Token const* t2 = bt.T2->sntStart(sid2);
    for (size_t i = off2; i < off2 + len2; ++i)
      cout << (*bt.V2)[t2[i].id()] << " ";
    cout << __FILE__ << ":" << __LINE__ << endl;
    BOOST_FOREACH (int a, pp.aln)
      cout << a << " " ;
    cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
    scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
                 bt.T2->sntStart(sid2)+off2,0,len2,
                 pp.aln, m_alpha,
                 (*dest)[this->m_index],
                 (*dest)[this->m_index+1]);
#endif
    // the phrase pair carries its own token pointers and lengths
    scorer.score(pp.start1, 0, pp.len1,
                 pp.start2, 0, pp.len2,
                 pp.aln, m_alpha,
                 (*dest)[this->m_index],
                 (*dest)[this->m_index+1]);
  }
};
} //namespace bitext
} // namespace Moses

View File

@ -0,0 +1,65 @@
// -*- c++ -*-
// Phrase scorer that reports the logarithms of selected phrase pair counts
// (r1: raw source phrase count, s1: L1 sample size, g1: coherent phrases,
// j: joint count, r2: raw target phrase count), as chosen by the specification string
// written by Ulrich Germann
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
using namespace std;
namespace Moses {
namespace bitext {
// Phrase scorer that writes the logarithm of selected phrase pair counts.
// The specification string selects the counts; every selector found in it
// adds one feature, always in the order r1, s1, g1, j, r2.
template<typename Token>
class
PScoreLogCnt : public PhraseScorer<Token>
{
  string m_specs; // copy of the specification string given to the constructor
public:
  // specs: string searched for the selectors "r1", "s1", "g1", "j" and "r2".
  // One feature name ("log-<selector>") is registered per selector found,
  // and m_num_feats is set to the number of registered names.
  PScoreLogCnt(string const specs)
  {
    this->m_index = -1;
    this->m_specs = specs;
    if (specs.find("r1") != string::npos) // raw source phrase counts
      this->m_feature_names.push_back("log-r1");
    if (specs.find("s1") != string::npos) // L1 sample size
      this->m_feature_names.push_back("log-s1");
    if (specs.find("g1") != string::npos) // coherent phrases
      this->m_feature_names.push_back("log-g1");
    if (specs.find("j") != string::npos) // joint counts
      this->m_feature_names.push_back("log-j");
    if (specs.find("r2") != string::npos) // raw target phrase counts
      this->m_feature_names.push_back("log-r2");
    this->m_num_feats = this->m_feature_names.size();
  }

  bool
  isIntegerValued(int i) const { return true; }

  // Writes log(count) for every selected count into consecutive slots of
  // *dest, starting at m_index; when dest is NULL the values go to pp.fvals.
  // The write order must mirror the registration order in the constructor.
  void
  operator()(Bitext<Token> const& bt,
             PhrasePair<Token>& pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;
    assert(pp.raw1);
    assert(pp.sample1);
    assert(pp.good1);
    assert(pp.joint);
    assert(pp.raw2);
    size_t i = this->m_index;
    if (m_specs.find("r1") != string::npos)
      (*dest)[i++] = log(pp.raw1);
    if (m_specs.find("s1") != string::npos)
      (*dest)[i++] = log(pp.sample1);
    if (m_specs.find("g1") != string::npos)
      (*dest)[i++] = log(pp.good1);
    if (m_specs.find("j") != string::npos)
      (*dest)[i++] = log(pp.joint);
    // Post-increment like all the others: a pre-increment here would skip
    // this feature's own slot and write one position beyond it.
    if (m_specs.find("r2") != string::npos)
      (*dest)[i++] = log(pp.raw2);
  }
};
} // namespace bitext
} // namespace Moses

View File

@ -0,0 +1,58 @@
//-*- c++ -*-
// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
#include "boost/format.hpp"
#include "boost/foreach.hpp"
namespace Moses {
namespace bitext
{
// Phrase scorer for backward phrase probabilities. The denominator
// specification is a string of the selectors 'g', 's' and 'r', optionally
// separated by '+'; each selector yields one feature.
template<typename Token>
class
PScorePbwd : public PhraseScorer<Token>
{
  float conf;   // confidence parameter handed to lbop()
  string denom; // denominator specification as given to the constructor
public:
  // c: confidence value, also embedded in the feature names
  // d: denominator specification; '+' is accepted as a separator, and any
  //    character other than 'g', 's', 'r' or '+' makes the constructor throw.
  PScorePbwd(float const c, string d)
  {
    this->m_index = -1;
    conf = c;
    denom = d;
    size_t checksum = d.size(); // expected feature count: length minus separators
    BOOST_FOREACH(char const& x, denom)
      {
        if (x == '+') { --checksum; continue; }
        if (x != 'g' && x != 's' && x != 'r') continue;
        string s = (format("pbwd-%c%.3f") % x % c).str();
        this->m_feature_names.push_back(s);
      }
    this->m_num_feats = this->m_feature_names.size();
    UTIL_THROW_IF2(this->m_feature_names.size() != checksum,
                   "Unknown parameter in specification '"
                   << d << "' for Pbwd phrase scorer at " << HERE);
  }

  // Writes one value per selector in denom into consecutive slots of *dest,
  // starting at m_index; when dest is NULL the values go to pp.fvals.
  void
  operator()(Bitext<Token> const& bt,
             PhrasePair<Token>& pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;
    // we use the denominator specification to scale the raw counts on the
    // target side; the clean way would be to counter-sample
    size_t i = this->m_index;
    BOOST_FOREACH(char const& x, denom)
      {
        // Only 'g', 's' and 'r' own a feature slot. A '+' separator must not
        // produce a value, otherwise more slots are written than registered.
        if (x != 'g' && x != 's' && x != 'r') continue;
        uint32_t m2 = pp.raw2;
        if (x == 'g') m2 = round(m2 * float(pp.good1) / pp.raw1);
        else if (x == 's') m2 = round(m2 * float(pp.sample1) / pp.raw1);
        (*dest)[i++] = log(lbop(max(m2, pp.joint),pp.joint,conf));
      }
  }
};
} // namespace bitext
} // namespace Moses

View File

@ -0,0 +1,70 @@
// -*- c++ -*-
// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
#include "boost/format.hpp"
#include "boost/foreach.hpp"
namespace Moses {
namespace bitext
{
// Phrase scorer for forward phrase probabilities. The denominator
// specification is a string of the selectors 'g', 's' and 'r', optionally
// separated by '+'; each selector yields one feature.
template<typename Token>
class
PScorePfwd : public PhraseScorer<Token>
{
  float conf;   // confidence parameter handed to lbop()
  string denom; // denominator specification as given to the constructor
public:
  // c: confidence value, also embedded in the feature names
  // d: denominator specification; throws if it contains anything other
  //    than 'g', 's', 'r' or the separator '+'.
  PScorePfwd(float const c, string d)
  {
    this->m_index = -1;
    conf  = c;
    denom = d;
    size_t expected = d.size(); // specification length minus separators
    BOOST_FOREACH(char const& x, denom)
      {
        if (x == '+')
          --expected;
        else if (x == 'g' || x == 's' || x == 'r')
          this->m_feature_names.push_back((format("pfwd-%c%.3f") % x % c).str());
      }
    this->m_num_feats = this->m_feature_names.size();
    UTIL_THROW_IF2(this->m_feature_names.size() != expected,
                   "Unknown parameter in specification '"
                   << d << "' for Pfwd phrase scorer at " << HERE);
  }

  // Writes one value per selector in denom into consecutive slots of *dest,
  // starting at m_index; when dest is NULL the values go to pp.fvals.
  // Note that pp.joint is capped at pp.good1 as a side effect.
  void
  operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;
    if (pp.joint > pp.good1)
      pp.joint = pp.good1;
    size_t slot = this->m_index;
    BOOST_FOREACH(char const& c, this->denom)
      {
        if (c == 'g')
          (*dest)[slot++] = log(lbop(pp.good1, pp.joint, conf));
        else if (c == 's')
          (*dest)[slot++] = log(lbop(pp.sample1, pp.joint, conf));
        else if (c == 'r')
          (*dest)[slot++] = log(lbop(pp.raw1, pp.joint, conf));
      }
  }
};
}
}

View File

@ -0,0 +1,47 @@
// -*- c++ -*-
// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
// with the asymptotic function j/(j+x) where x > 0 is a function
// parameter that determines the steepness of the rewards curve
// written by Ulrich Germann
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
using namespace std;
namespace Moses {
namespace bitext {
// asymptotic provenance feature n/(n+x)
// Provenance reward: for every parameter x in m_x the feature value is
// joint/(x + joint), computed from the phrase pair's joint count.
template<typename Token>
class
PScoreProvenance : public SingleRealValuedParameterPhraseScorerFamily<Token>
{
public:
  // spec is forwarded to init() of the base class after the tag is set.
  PScoreProvenance(string const& spec)
  {
    this->m_tag = "prov";
    this->init(spec);
  }

  bool
  isLogVal(int i) const { return false; }

  // Fills consecutive slots of *dest (bounds-checked), starting at m_index;
  // when dest is NULL the values go to pp.fvals.
  void
  operator()(Bitext<Token> const& bt,
             PhrasePair<Token>& pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;
    size_t slot = this->m_index;
    BOOST_FOREACH(float const x, this->m_x)
      {
        (*dest).at(slot) = pp.joint/(x + pp.joint);
        ++slot;
      }
  }

  bool
  allowPooling() const { return false; }
};
} // namespace bitext
} // namespace Moses

View File

@ -0,0 +1,41 @@
// -*- c++ -*-
// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
// with the asymptotic function x/(j+x) where x > 0 is a function
// parameter that determines the steepness of the rewards curve
// written by Ulrich Germann
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
using namespace std;
namespace Moses {
namespace bitext {
// rareness penalty: x/(n+x)
// Rareness penalty: for every parameter x in m_x the feature value is
// x/(x + joint), computed from the phrase pair's joint count.
template<typename Token>
class
PScoreRareness : public SingleRealValuedParameterPhraseScorerFamily<Token>
{
public:
  // spec is forwarded to init() of the base class after the tag is set.
  PScoreRareness(string const spec)
  {
    this->m_tag = "rare";
    this->init(spec);
  }

  bool
  isLogVal(int i) const { return false; }

  // Fills consecutive slots of *dest (bounds-checked), starting at m_index;
  // when dest is NULL the values go to pp.fvals.
  void
  operator()(Bitext<Token> const& bt,
             PhrasePair<Token>& pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;
    size_t slot = this->m_index;
    BOOST_FOREACH(float const x, this->m_x)
      {
        (*dest).at(slot) = x/(x + pp.joint);
        ++slot;
      }
  }
};
} // namespace bitext
} // namespace Moses

View File

@ -0,0 +1,67 @@
// -*- c++ -*-
// Phrase scorer that counts the number of unaligned words in the phrase
// written by Ulrich Germann
#include "sapt_pscore_base.h"
#include <boost/dynamic_bitset.hpp>
namespace Moses {
namespace bitext
{
// Phrase scorer that counts the positions of a phrase pair that no alignment
// point covers. Depending on the specification it reports either one
// combined count ("unal") or one count per side ("unal-s", "unal-t").
template<typename Token>
class
PScoreUnaligned : public PhraseScorer<Token>
{
  typedef boost::dynamic_bitset<uint64_t> bitvector;
public:
  // spec: textual feature count; must convert (via atoi) to 1 or 2.
  PScoreUnaligned(string const spec)
  {
    this->m_index = -1;
    this->m_num_feats = atoi(spec.c_str());
    int f = this->m_num_feats;
    UTIL_THROW_IF2(f != 1 && f != 2,"unal parameter must be 1 or 2 at "<<HERE);
    this->m_feature_names.resize(f);
    if (f == 2)
      {
        this->m_feature_names[0] = "unal-s";
        this->m_feature_names[1] = "unal-t";
      }
    else
      this->m_feature_names[0] = "unal";
  }

  bool
  isLogVal(int i) const { return false; }

  bool
  isIntegerValued(int i) const { return true; }

  // pp.aln is read as consecutive pairs: the first entry of each pair marks
  // a position on side 1, the second a position on side 2. Results start at
  // (*dest)[m_index]; when dest is NULL they go to pp.fvals.
  void
  operator()(Bitext<Token> const& bt,
             PhrasePair<Token>& pp,
             vector<float> * dest = NULL) const
  {
    if (!dest) dest = &pp.fvals;
    bitvector covered1(pp.len1), covered2(pp.len2);
    size_t k = 0;
    while (k < pp.aln.size())
      {
        covered1.set(pp.aln[k]);
        covered2.set(pp.aln.at(k+1)); // bounds-checked second half of the pair
        k += 2;
      }
    // The side-1 count always lands in the first slot ...
    (*dest)[this->m_index] = pp.len1 - covered1.count();
    // ... and the side-2 count is either added to it or stored separately.
    if (this->m_num_feats == 1)
      (*dest)[this->m_index] += pp.len2 - covered2.count();
    else
      (*dest)[this->m_index+1] = pp.len2 - covered2.count();
  }
};
} // namespace bitext
} // namespace Moses

View File

@ -0,0 +1,83 @@
#include "mmsapt.h"
#include "moses/Manager.h"
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
#include <boost/foreach.hpp>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
#include <boost/shared_ptr.hpp>
#include <algorithm>
#include <iostream>
using namespace Moses;
using namespace bitext;
using namespace std;
using namespace boost;
vector<FactorType> fo(1,FactorType(0));
// Writes the target-side words (factor 0) of the hypothesis chain that ends
// in x to out, earliest hypothesis first, separated by single blanks.
// A NULL x writes nothing.
ostream&
operator<<(ostream& out, Hypothesis const* x)
{
  // Collect the chain by following GetPrevHypo(); H.back() is then the
  // earliest hypothesis.
  vector<const Hypothesis*> H;
  for (const Hypothesis* h = x; h; h = h->GetPrevHypo())
    H.push_back(h);

  // Put the blank *before* every word except the very first one. The former
  // test on H.size() was always true inside the loop and therefore left a
  // trailing blank after the last word.
  bool first = true;
  for (; H.size(); H.pop_back())
    {
      Phrase const& p = H.back()->GetCurrTargetPhrase();
      for (size_t pos = 0 ; pos < p.GetSize() ; pos++)
        {
          if (!first) out << " ";
          out << *p.GetFactor(pos, 0);
          first = false;
        }
    }
  return out;
}
vector<FactorType> ifo;
size_t lineNumber;
// Decodes one source sentence and returns the best hypothesis rendered via
// operator<< for Hypothesis pointers. Reads the file-level variables ifo
// (input factor order) and lineNumber.
string
translate(string const& source)
{
  StaticData const& config = StaticData::Instance();

  // Sentence::Read() is given a stream holding the source text plus a newline.
  Sentence input;
  istringstream in(source+"\n");
  input.Read(in,ifo);

  Manager decoder(lineNumber, input, config.GetSearchAlgorithm());
  decoder.ProcessSentence();

  const Hypothesis* best = decoder.GetBestHypothesis();
  ostringstream rendered;
  rendered << best;
  return rendered.str();
}
int main(int argc, char* argv[])
{
Parameter params;
if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(&params, argv[0]))
exit(1);
StaticData const& global = StaticData::Instance();
global.SetVerboseLevel(0);
ifo = global.GetInputFactorOrder();
lineNumber = 0; // TODO: Include sentence request number here?
string source, target, alignment;
while (getline(cin,source))
{
getline(cin,target);
getline(cin,alignment);
cout << "[S] " << source << endl;
cout << "[H] " << translate(source) << endl;
cout << "[T] " << target << endl;
Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
pdsa->add(source,target,alignment);
cout << "[X] " << translate(source) << endl;
cout << endl;
}
exit(0);
}

View File

@ -2,32 +2,33 @@
using namespace std;
using namespace Moses;
// currently broken
Mmsapt* PT;
// Builds an Mmsapt from a configuration line assembled out of argv[1]
// (base), argv[2] (L1) and argv[3] (L2), loads it, sets five hard-coded
// weights, and then calls PT->align() on every pair of lines read from
// stdin, echoing both lines first.
// NOTE(review): argv[1..3] are read without checking argc.
// NOTE(review): the active statements are followed by a commented-out copy
// of the same body; together with the "currently broken" remark above this
// looks like the body is meant to be disabled -- confirm which is intended.
int main(int argc, char* argv[])
{
string base = argv[1];
string L1 = argv[2];
string L2 = argv[3];
ostringstream buf;
buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
<< base << " L1=" << L1 << " L2=" << L2;
string configline = buf.str();
PT = new Mmsapt(configline);
PT->Load();
float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 };
vector<float> weights(w,w+5);
PT->setWeights(weights);
// these values are taken from a moses.ini file;
// is there a convenient way of accessing them from within mmsapt ???
string eline,fline;
// TokenIndex V; V.open("crp/trn/mm/de.tdx");
// Stops as soon as a complete pair of lines can no longer be read.
while (getline(cin,eline) && getline(cin,fline))
{
cout << eline << endl;
cout << fline << endl;
PT->align(eline,fline);
}
delete PT;
// string base = argv[1];
// string L1 = argv[2];
// string L2 = argv[3];
// ostringstream buf;
// buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
// << base << " L1=" << L1 << " L2=" << L2;
// string configline = buf.str();
// PT = new Mmsapt(configline);
// PT->Load();
// float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 };
// vector<float> weights(w,w+5);
// PT->setWeights(weights);
// // these values are taken from a moses.ini file;
// // is there a convenient way of accessing them from within mmsapt ???
// string eline,fline;
// // TokenIndex V; V.open("crp/trn/mm/de.tdx");
// while (getline(cin,eline) && getline(cin,fline))
// {
// cout << eline << endl;
// cout << fline << endl;
// PT->align(eline,fline);
// }
// delete PT;
}

View File

@ -345,10 +345,10 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// find the best matches according to letter sed
string best_path = "";
int best_match = -1;
int best_letter_cost;
unsigned int best_letter_cost;
if (lsed_flag) {
best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
for(int si=0; si<best_tm.size(); si++) {
for(size_t si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );

View File

@ -59,7 +59,11 @@ const size_t DEFAULT_MAX_HYPOSTACK_SIZE = 200;
const size_t DEFAULT_MAX_TRANS_OPT_CACHE_SIZE = 10000;
const size_t DEFAULT_MAX_TRANS_OPT_SIZE = 5000;
const size_t DEFAULT_MAX_PART_TRANS_OPT_SIZE = 10000;
const size_t DEFAULT_MAX_PHRASE_LENGTH = 20;
#ifdef PT_UG
const size_t DEFAULT_MAX_PHRASE_LENGTH = -1;
#else
const size_t DEFAULT_MAX_PHRASE_LENGTH = 20;
#endif
const size_t DEFAULT_MAX_CHART_SPAN = 10;
const size_t ARRAY_SIZE_INCR = 10; //amount by which a phrase gets resized when necessary
const float LOWEST_SCORE = -100.0f;

View File

@ -56,8 +56,12 @@ namespace Moses
/** verbose macros
* */
#define VERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR(str); } }
#define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level)
#define XVERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR("[" << __FILE__ << ":" << __LINE__ << "] ");TRACE_ERR(str); } }
#define HERE __FILE__ << ":" << __LINE__
#if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2)
// gcc nth_element() bug

View File

@ -152,7 +152,7 @@ def find_free_port(p):
class MosesServer(ProcessWrapper):
def __init__(self,args=["-fd", "\n"]):
def __init__(self,args=[]):
self.process = None
mserver_cmd = moses_root+"/bin/mosesserver"
self.cmd = [mserver_cmd] + args
@ -175,7 +175,10 @@ class MosesServer(ProcessWrapper):
self.cmd.extend(["--server-port", "%d"%self.port])
if debug:
print >>sys.stderr,self.cmd
self.process = Popen(self.cmd,stderr = sys.stderr)
# self.stderr = open("mserver.%d.stderr"%self.port,'w')
# self.stdout = open("mserver.%d.stdout"%self.port,'w')
# self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout)
self.process = Popen(self.cmd)
else:
devnull = open(os.devnull,"w")
self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
@ -216,10 +219,13 @@ class MosesServer(ProcessWrapper):
elif type(input) is list:
return [self.translate(x) for x in input]
elif type(input) is dict:
return self.proxy.translate(input)
else:
raise Exception("Can't handle input of this type!")
except:
attempts += 1
print >>sys.stderr, "WAITING", attempts

View File

@ -127,13 +127,40 @@ def translate(proxy, args, line):
param['nbest-distinct'] = True
pass
attempts = 0
while attempts < 120:
while attempts < 20:
t1 = time.time()
try:
return proxy.translate(param)
except:
print >>sys.stderr, "Waiting", proxy
attempts += 1
return proxy.translate(param)
# except xmlrpclib.Fault as e:
# except xmlrpclib.ProtocolError as e:
# except xmlrpclib.ResponseError as e:
except xmlrpclib.Error as e:
time.sleep(2) # give all the stderr stuff a chance to be flushed
print >>sys.stderr," XMLRPC error:",e
print >>sys.stderr, "Input was"
print >>sys.stderr, param
sys.exit(1)
except IOError as e:
print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror)
time.sleep(5)
except:
serverstatus = mserver.process.poll()
if serverstatus == None:
print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1)
attempts += 1
if attempts > 10:
time.sleep(10)
else:
time.sleep(5)
pass
else:
print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\
%(serverstatus/256,serverstatus%256)
pass
pass
pass
raise Exception("Exception: could not reach translation server.")
@ -210,17 +237,25 @@ if __name__ == "__main__":
pass
pass
if args.url:
mserver.connect(args.url)
else:
mserver.start(args=mo_args,port=args.port,debug=args.debug)
pass
ref = None
aln = None
if args.ref: ref = read_data(args.ref)
if args.aln: aln = read_data(args.aln)
if ref and aln:
try:
mo_args.index("--serial")
except:
mo_args.append("--serial")
pass
pass
if args.url:
mserver.connect(args.url)
else:
mserver.start(args=mo_args, port=args.port, debug=args.debug)
pass
if (args.input == "-"):
line = sys.stdin.readline()
idx = 0