Merge branch 'dynamic-phrase-tables'

Ulrich Germann 2014-03-11 14:17:42 +00:00
commit f1449cd7fe
39 changed files with 3053 additions and 679 deletions

Jamroot

@@ -108,6 +108,8 @@ if [ option.get "enable-mpi" : : "yes" ] {
requirements += [ option.get "notrace" : <define>TRACE_ENABLE=1 ] ;
requirements += [ option.get "enable-boost-pool" : : <define>USE_BOOST_POOL ] ;
requirements += [ option.get "with-mm" : : <define>PT_UG ] ;
requirements += [ option.get "with-mm" : : <define>MAX_NUM_FACTORS=4 ] ;
if [ option.get "with-cmph" ] {
requirements += <define>HAVE_CMPH ;
@@ -137,6 +139,23 @@ project : requirements
#Add directories here if you want their incidental targets too (i.e. tests).
build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses-chart-cmd mira scripts regression-testing ;
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
moses/TranslationModel/UG/mm//symal2mam
moses/TranslationModel/UG/mm//custom-pt
moses/TranslationModel/UG/mm//mmlex-build
moses/TranslationModel/UG/mm//mtt-count-words
moses/TranslationModel/UG//try-align
;
}
else
{
alias mm ;
}
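# 'mm' bundles the memory-mapped suffix-array tools when built with
# --with-mm; otherwise it is an empty alias, so the 'programs' list
# below resolves either way.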
alias programs :
lm//programs
moses-chart-cmd//moses_chart
@@ -154,12 +173,10 @@ phrase-extract//pcfg-score
biconcor
mira//mira
contrib/server//mosesserver
#moses/mm//mtt-build
#moses/mm//mtt-dump
#moses/mm//symal2mam
#moses/mm//custom-pt
mm
;
install-bin-libs programs ;
install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
install-headers headers-moses : moses//headers-to-install : moses ;


@@ -182,7 +182,9 @@ void IOWrapper::Initialization(const std::vector<FactorType> &/*inputFactorOrde
}
InputType*IOWrapper::GetInput(InputType* inputType)
InputType*
IOWrapper::
GetInput(InputType* inputType)
{
if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
if (long x = inputType->GetTranslationId()) {
@@ -605,7 +607,7 @@ void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solu
bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
{
delete source;
if (source) delete source;
switch(inputType) {
case SentenceInput:
source = ioWrapper.GetInput(new Sentence);
@@ -618,6 +620,7 @@ bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source
break;
default:
TRACE_ERR("Unknown input type: " << inputType << "\n");
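// null out the pointer (it was deleted above) so the return below reports failure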
source = NULL;
}
return (source ? true : false);
}


@@ -14,268 +14,293 @@
namespace Moses
{
struct CNStats {
size_t created,destr,read,colls,words;
CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
~CNStats() {
print(std::cerr);
}
void createOne() {
++created;
}
void destroyOne() {
++destr;
}
void collect(const ConfusionNet& cn) {
++read;
colls+=cn.GetSize();
for(size_t i=0; i<cn.GetSize(); ++i)
words+=cn[i].size();
}
void print(std::ostream& out) const {
if(created>0) {
out<<"confusion net statistics:\n"
" created:\t"<<created<<"\n"
" destroyed:\t"<<destr<<"\n"
" succ. read:\t"<<read<<"\n"
" columns:\t"<<colls<<"\n"
" words:\t"<<words<<"\n"
" avg. word/column:\t"<<words/(1.0*colls)<<"\n"
" avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
"\n\n";
struct CNStats {
size_t created,destr,read,colls,words;
CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
~CNStats() {
print(std::cerr);
}
}
};
CNStats stats;
size_t ConfusionNet::GetColumnIncrement(size_t i, size_t j) const
{
(void) i;
(void) j;
return 1;
}
ConfusionNet::ConfusionNet()
: InputType()
{
stats.createOne();
const StaticData& staticData = StaticData::Instance();
if (staticData.IsChart()) {
m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
}
UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
}
ConfusionNet::~ConfusionNet()
{
stats.destroyOne();
}
ConfusionNet::ConfusionNet(Sentence const& s)
{
data.resize(s.GetSize());
for(size_t i=0; i<s.GetSize(); ++i) {
ScorePair scorePair;
std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
data[i].push_back(temp);
}
}
bool ConfusionNet::ReadF(std::istream& in,
const std::vector<FactorType>& factorOrder,
int format)
{
VERBOSE(1, "read confusion net with format "<<format<<"\n");
switch(format) {
case 0:
return ReadFormat0(in,factorOrder);
case 1:
return ReadFormat1(in,factorOrder);
default:
std::stringstream strme;
strme << "ERROR: unknown format '"<<format
<<"' in ConfusionNet::Read";
UserMessage::Add(strme.str());
}
return false;
}
int ConfusionNet::Read(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
int rv=ReadF(in,factorOrder,0);
if(rv) stats.collect(*this);
return rv;
}
void ConfusionNet::String2Word(const std::string& s,Word& w,
const std::vector<FactorType>& factorOrder)
{
std::vector<std::string> factorStrVector = Tokenize(s, "|");
for(size_t i=0; i<factorOrder.size(); ++i)
w.SetFactor(factorOrder[i],
FactorCollection::Instance().AddFactor(Input,factorOrder[i],
factorStrVector[i]));
}
bool ConfusionNet::ReadFormat0(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
Clear();
const StaticData &staticData = StaticData::Instance();
const InputFeature &inputFeature = InputFeature::Instance();
size_t numInputScores = inputFeature.GetNumInputScores();
size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
size_t totalCount = numInputScores + numRealWordCount;
bool addRealWordCount = (numRealWordCount > 0);
std::string line;
while(getline(in,line)) {
std::istringstream is(line);
std::string word;
Column col;
while(is>>word) {
Word w;
String2Word(word,w,factorOrder);
std::vector<float> probs(totalCount, 0.0);
for(size_t i=0; i < numInputScores; i++) {
double prob;
if (!(is>>prob)) {
TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
return false;
}
if(prob<0.0) {
VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
prob=0.0;
} else if (prob>1.0) {
VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
prob=1.0;
}
probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
void createOne() {
++created;
}
void destroyOne() {
++destr;
}
void collect(const ConfusionNet& cn) {
++read;
colls+=cn.GetSize();
for(size_t i=0; i<cn.GetSize(); ++i)
words+=cn[i].size();
}
void print(std::ostream& out) const {
if(created>0) {
out<<"confusion net statistics:\n"
" created:\t"<<created<<"\n"
" destroyed:\t"<<destr<<"\n"
" succ. read:\t"<<read<<"\n"
" columns:\t"<<colls<<"\n"
" words:\t"<<words<<"\n"
" avg. word/column:\t"<<words/(1.0*colls)<<"\n"
" avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
"\n\n";
}
//store the 'real' word count in the last feature if we have one more weight than arc scores and the word is not epsilon
if (addRealWordCount && word!=EPSILON && word!="")
probs.back() = -1.0;
ScorePair scorePair(probs);
col.push_back(std::make_pair(w,scorePair));
}
if(col.size()) {
data.push_back(col);
ShrinkToFit(data.back());
} else break;
};
CNStats stats;
size_t
ConfusionNet::
GetColumnIncrement(size_t i, size_t j) const
{
(void) i;
(void) j;
return 1;
}
return !data.empty();
}
bool ConfusionNet::ReadFormat1(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
Clear();
std::string line;
if(!getline(in,line)) return 0;
size_t s;
if(getline(in,line)) s=atoi(line.c_str());
else return 0;
data.resize(s);
for(size_t i=0; i<data.size(); ++i) {
ConfusionNet::
ConfusionNet()
: InputType()
{
stats.createOne();
const StaticData& staticData = StaticData::Instance();
if (staticData.IsChart()) {
m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
}
UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
}
ConfusionNet::
~ConfusionNet()
{
stats.destroyOne();
}
ConfusionNet::
ConfusionNet(Sentence const& s)
{
data.resize(s.GetSize());
for(size_t i=0; i<s.GetSize(); ++i) {
ScorePair scorePair;
std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
data[i].push_back(temp);
}
}
bool
ConfusionNet::
ReadF(std::istream& in, const std::vector<FactorType>& factorOrder, int format)
{
VERBOSE(1, "read confusion net with format "<<format<<"\n");
switch(format) {
case 0:
return ReadFormat0(in,factorOrder);
case 1:
return ReadFormat1(in,factorOrder);
default:
std::stringstream strme;
strme << "ERROR: unknown format '"<<format
<<"' in ConfusionNet::Read";
UserMessage::Add(strme.str());
}
return false;
}
int
ConfusionNet::
Read(std::istream& in,
const std::vector<FactorType>& factorOrder)
{
int rv=ReadF(in,factorOrder,0);
if(rv) stats.collect(*this);
return rv;
}
void
ConfusionNet::
String2Word(const std::string& s,Word& w,
const std::vector<FactorType>& factorOrder)
{
std::vector<std::string> factorStrVector = Tokenize(s, "|");
for(size_t i=0; i<factorOrder.size(); ++i)
w.SetFactor(factorOrder[i],
FactorCollection::Instance().AddFactor
(Input,factorOrder[i], factorStrVector[i]));
}
bool
ConfusionNet::
ReadFormat0(std::istream& in, const std::vector<FactorType>& factorOrder)
{
Clear();
const StaticData &staticData = StaticData::Instance();
const InputFeature &inputFeature = InputFeature::Instance();
size_t numInputScores = inputFeature.GetNumInputScores();
size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
size_t totalCount = numInputScores + numRealWordCount;
bool addRealWordCount = (numRealWordCount > 0);
std::string line;
while(getline(in,line)) {
std::istringstream is(line);
std::string word;
Column col;
while(is>>word) {
Word w;
String2Word(word,w,factorOrder);
std::vector<float> probs(totalCount, 0.0);
for(size_t i=0; i < numInputScores; i++) {
double prob;
if (!(is>>prob)) {
TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
return false;
}
if(prob<0.0) {
VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
prob=0.0;
} else if (prob>1.0) {
VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
prob=1.0;
}
probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
}
//store the 'real' word count in the last feature if we have one more weight than arc scores and the word is not epsilon
if (addRealWordCount && word!=EPSILON && word!="")
probs.back() = -1.0;
ScorePair scorePair(probs);
col.push_back(std::make_pair(w,scorePair));
}
if(col.size()) {
data.push_back(col);
ShrinkToFit(data.back());
} else break;
}
return !data.empty();
}
bool
ConfusionNet::
ReadFormat1(std::istream& in, const std::vector<FactorType>& factorOrder)
{
Clear();
std::string line;
if(!getline(in,line)) return 0;
std::istringstream is(line);
if(!(is>>s)) return 0;
std::string word;
double prob;
data[i].resize(s);
for(size_t j=0; j<s; ++j)
if(is>>word>>prob) {
//TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
data[i][j].second.denseScores = std::vector<float> (1);
data[i][j].second.denseScores.push_back((float) log(prob));
if(data[i][j].second.denseScores[0]<0) {
VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
data[i][j].second.denseScores[0]=0.0;
}
String2Word(word,data[i][j].first,factorOrder);
} else return 0;
}
return !data.empty();
}
void ConfusionNet::Print(std::ostream& out) const
{
out<<"conf net: "<<data.size()<<"\n";
for(size_t i=0; i<data.size(); ++i) {
out<<i<<" -- ";
for(size_t j=0; j<data[i].size(); ++j) {
out<<"("<<data[i][j].first.ToString()<<", ";
// dense
std::vector<float>::const_iterator iterDense;
for(iterDense = data[i][j].second.denseScores.begin(); iterDense < data[i][j].second.denseScores.end(); ++iterDense) {
out<<", "<<*iterDense;
}
// sparse
std::map<StringPiece, float>::const_iterator iterSparse;
for(iterSparse = data[i][j].second.sparseScores.begin(); iterSparse != data[i][j].second.sparseScores.end(); ++iterSparse) {
out << ", " << iterSparse->first << "=" << iterSparse->second;
}
out<<") ";
size_t s;
if(getline(in,line)) s=atoi(line.c_str());
else return 0;
data.resize(s);
for(size_t i=0; i<data.size(); ++i) {
if(!getline(in,line)) return 0;
std::istringstream is(line);
if(!(is>>s)) return 0;
std::string word;
double prob;
data[i].resize(s);
for(size_t j=0; j<s; ++j)
if(is>>word>>prob) {
//TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
data[i][j].second.denseScores = std::vector<float> (1);
data[i][j].second.denseScores.push_back((float) log(prob));
if(data[i][j].second.denseScores[0]<0) {
VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
data[i][j].second.denseScores[0]=0.0;
}
String2Word(word,data[i][j].first,factorOrder);
} else return 0;
}
out<<"\n";
return !data.empty();
}
void ConfusionNet::Print(std::ostream& out) const
{
out<<"conf net: "<<data.size()<<"\n";
for(size_t i=0; i<data.size(); ++i) {
out<<i<<" -- ";
for(size_t j=0; j<data[i].size(); ++j) {
out<<"("<<data[i][j].first.ToString()<<", ";
// dense
std::vector<float>::const_iterator iterDense;
for(iterDense = data[i][j].second.denseScores.begin();
iterDense < data[i][j].second.denseScores.end();
++iterDense) {
out<<", "<<*iterDense;
}
// sparse
std::map<StringPiece, float>::const_iterator iterSparse;
for(iterSparse = data[i][j].second.sparseScores.begin();
iterSparse != data[i][j].second.sparseScores.end();
++iterSparse) {
out << ", " << iterSparse->first << "=" << iterSparse->second;
}
out<<") ";
}
out<<"\n";
}
out<<"\n\n";
}
out<<"\n\n";
}
#ifdef _WIN32
#pragma warning(disable:4716)
#endif
Phrase ConfusionNet::GetSubString(const WordsRange&) const
{
UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
//return Phrase(Input);
}
Phrase
ConfusionNet::
GetSubString(const WordsRange&) const
{
UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
//return Phrase(Input);
}
std::string ConfusionNet::GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
{
TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
return "";
}
std::string
ConfusionNet::
GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
{
TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
return "";
}
#ifdef _WIN32
#pragma warning(disable:4716)
#endif
const Word& ConfusionNet::GetWord(size_t) const
{
UTIL_THROW2("ERROR: call to ConfusionNet::GetFactorArray\n");
}
const Word& ConfusionNet::GetWord(size_t) const
{
UTIL_THROW2("ERROR: call to ConfusionNet::GetFactorArray\n");
}
#ifdef _WIN32
#pragma warning(default:4716)
#endif
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
{
cn.Print(out);
return out;
}
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
{
cn.Print(out);
return out;
}
TranslationOptionCollection*
ConfusionNet::CreateTranslationOptionCollection() const
{
size_t maxNoTransOptPerCoverage = StaticData::Instance().GetMaxNoTransOptPerCoverage();
float translationOptionThreshold = StaticData::Instance().GetTranslationOptionThreshold();
TranslationOptionCollection *rv= new TranslationOptionCollectionConfusionNet(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
assert(rv);
return rv;
}
TranslationOptionCollection*
ConfusionNet::
CreateTranslationOptionCollection() const
{
size_t maxNoTransOptPerCoverage
= StaticData::Instance().GetMaxNoTransOptPerCoverage();
float translationOptionThreshold
= StaticData::Instance().GetTranslationOptionThreshold();
TranslationOptionCollection *rv
= new TranslationOptionCollectionConfusionNet
(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
assert(rv);
return rv;
}
}


@@ -47,7 +47,7 @@
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#endif
#ifdef PT_UG
#include "moses/TranslationModel/mmsapt.h"
#include "moses/TranslationModel/UG/mmsapt.h"
#endif
#include "moses/LM/Ken.h"


@@ -13,7 +13,9 @@ namespace Moses
InputFeature *InputFeature::s_instance = NULL;
InputFeature::InputFeature(const std::string &line)
:StatelessFeatureFunction(line)
: StatelessFeatureFunction(line)
, m_numInputScores(0)
, m_numRealWordCount(0)
{
ReadParameters();
@@ -23,6 +25,7 @@ InputFeature::InputFeature(const std::string &line)
void InputFeature::Load()
{
const PhraseDictionary *pt = PhraseDictionary::GetColl()[0];
const PhraseDictionaryTreeAdaptor *ptBin = dynamic_cast<const PhraseDictionaryTreeAdaptor*>(pt);


@@ -54,6 +54,7 @@ lib moses :
[ glob
*.cpp
TranslationModel/*.cpp
TranslationModel/UG/*.cpp
TranslationModel/fuzzy-match/*.cpp
TranslationModel/DynSAInclude/*.cpp
TranslationModel/RuleTable/*.cpp
@@ -70,11 +71,11 @@ lib moses :
]
headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool rt
..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt
TranslationModel/UG/generic//generic TranslationModel/UG/mm//mm
$(TOP)//boost_iostreams ;
#generic//generic mm//mm
alias headers-to-install : [ glob-tree *.h ] ;
alias headers-to-install : [ glob-tree [^.]*.h ] ;
import testing ;


@@ -0,0 +1,14 @@
exe try-align :
try-align.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
install $(PREFIX)/bin : try-align ;
fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ;


@@ -0,0 +1,116 @@
# Some systems apparently distinguish between shell
# variables and environment variables. The latter are
# visible to the make utility, the former apparently not,
# so we need to set them if they are not defined yet
# ===============================================================================
# COMPILATION PREFERENCES
# ===============================================================================
# CCACHE: if set to ccache, use ccache to speed up compilation
# OPTI: optimization level
# PROF: profiler switches
CCACHE = ccache
OPTI = 3
EXE_TAG = exe
PROF =
# PROF = -g -pg
# ===============================================================================
SHELL = bash
MAKEFLAGS += --warn-undefined-variables
.DEFAULT_GOAL = all
.SUFFIXES:
# ===============================================================================
# COMPILATION 'LOCALIZATION'
HOST ?= $(shell hostname)
HOSTTYPE ?= $(shell uname -m)
KERNEL = $(shell uname -r)
MOSES_ROOT = ${HOME}/code/mosesdecoder
WDIR = build/${HOSTTYPE}/${KERNEL}/${OPTI}
VPATH = ${HOME}/code/mosesdecoder/
CXXFLAGS = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES}
CXXFLAGS += -DMAX_NUM_FACTORS=4
CXXFLAGS += -DKENLM_MAX_ORDER=5
modirs := $(addprefix -I,$(shell find ${MOSES_ROOT}/moses ${MOSES_ROOT}/contrib -type d))
CXXFLAGS += -I${MOSES_ROOT}
INCLUDES =
BZLIB =
BOOSTLIBTAG =
lzma = lzma
#lzma =
REQLIBS = m z pthread dl ${lzma} ${BZLIB} \
boost_thread${BOOSTLIBTAG} \
boost_program_options${BOOSTLIBTAG} \
boost_system${BOOSTLIBTAG} \
boost_filesystem${BOOSTLIBTAG} \
boost_iostreams${BOOSTLIBTAG} z bz2
# icuuc icuio icui18n \
LIBS = $(addprefix -l, moses ${REQLIBS})
LIBS = $(addprefix -l, ${REQLIBS})
LIBDIRS = -L${HOME}/code/mosesdecoder/lib
LIBDIRS += -L${HOME}/lib
PREFIX ?= .
BINDIR ?= ${PREFIX}/bin
ifeq "$(OPTI)" "0"
BINPREF = debug.
else
BINPREF =
endif
OBJ2 :=
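# The 'compile' template below emits one explicit object rule per source
# file under ${WDIR} and registers the generated .d file in DEP;
# -MD -MP makes the compiler write those dependency files as a side effect.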
define compile
DEP += ${WDIR}/$(basename $(notdir $1)).d
${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h)
@echo -e "COMPILING $1"
@mkdir -p $$(@D)
${CXX} ${CXXFLAGS} -MD -MP -c $$(abspath $$<) -o $$@
endef
testprogs = test-dynamic-im-tsa try-align
programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
programs += mtt-count-words
all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
@echo $^
clean:
rm -f ${WDIR}/*.o ${WDIR}/*.d
custom-pt: ${BINDIR}/${BINPREF}custom-pt
echo $^
INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp)
#INMOMM = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/mm/*.cc)
#INMOMM += $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/mm/*.cpp)
OBJ = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h))))
OBJ += $(patsubst %.cpp,%.o,${INMOGEN})
#OBJ += $(patsubst %.cpp,%.o,${INMOMM})
#OBJ += $(patsubst %.cc,%.o,${INMOMM})
EXE = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc)))
$(foreach cpp,${INMOGEN},$(eval $(call compile,${cpp})))
$(foreach cpp,$(wildcard *.cc),$(eval $(call compile,${cpp})))
$(addprefix ${BINDIR}/${BINPREF}, $(programs)): $(addprefix ${WDIR}/,$(notdir ${OBJ}))
$(addprefix ${BINDIR}/${BINPREF}, $(programs)): ${MOSES_ROOT}/lib/libmoses.a
${BINDIR}/${BINPREF}%: ${WDIR}/%.o ${WDIR}/mmsapt_align.o
@mkdir -p ${BINDIR}
echo PREREQS: $^
$(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} ${LIBS}
#try-align: ${WDIR}/try-align.o ${WDIR}/tpt_tokenindex.o
# $(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS}
.SECONDARY:
-include $(DEP)


@@ -1,41 +1,59 @@
exe mtt-build :
mtt-build.cc
$(TOP)/moses/generic//generic
exe mmlex-build :
mmlex-build.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/mm//mm
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mtt-count-words :
mtt-count-words.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mtt-build :
mtt-build.cc
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe mtt-dump :
mtt-dump.cc
$(TOP)/moses/generic//generic
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/mm//mm
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe symal2mam :
symal2mam.cc
$(TOP)/moses/generic//generic
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/mm//mm
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
exe custom-pt :
custom-pt.cc
$(TOP)/moses/generic//generic
#$(TOP)/moses/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/mm//mm
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
install $(PREFIX)/bin : mtt-build mtt-dump symal2mam custom-pt ;
install $(PREFIX)/bin : mtt-build mtt-dump mtt-count-words symal2mam custom-pt mmlex-build ;
fakelib mm : [ glob ug_*.cc tpt_*.cc ] ;


@@ -27,10 +27,11 @@ MAKEFLAGS += --warn-undefined-variables
# COMPILATION 'LOCALIZATION'
HOST ?= $(shell hostname)
HOSTTYPE ?= $(shell uname -m)
KERNEL = $(shell uname -r)
MOSES_ROOT = ${HOME}/code/moses/master/mosesdecoder
WDIR = build/${HOSTTYPE}/${OPTI}
VPATH = ${HOME}/code/moses/master/mosesdecoder/
MOSES_ROOT = ${HOME}/code/mosesdecoder
WDIR = build/${HOSTTYPE}/${KERNEL}/${OPTI}
VPATH = ${HOME}/code/mosesdecoder/
CXXFLAGS = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES}
CXXFLAGS += -DMAX_NUM_FACTORS=4
CXXFLAGS += -DKENLM_MAX_ORDER=5
@@ -50,8 +51,10 @@ REQLIBS = m z pthread lzma ${BZLIB} \
# icuuc icuio icui18n \
LIBS = $(addprefix -l, ${REQLIBS} moses)
LIBDIRS = -L${HOME}/code/moses/master/mosesdecoder/lib
BINDIR = bin
LIBDIRS = -L${HOME}/code/mosesdecoder/lib
LIBDIRS += -L${HOME}/lib
PREFIX ?= .
BINDIR ?= ${PREFIX}/bin
ifeq "$(OPTI)" "0"
BINPREF = debug.
else
@@ -71,7 +74,9 @@ ${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h)
endef
programs = mtt-build mtt-dump symam2mam custom-pt mmlex-build
testprogs = test-dynamic-im-tsa
programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
programs += mtt-count-words
all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
@echo $^
@@ -81,7 +86,7 @@ clean:
custom-pt: ${BINDIR}/${BINPREF}custom-pt
echo $^
INMOGEN = $(wildcard ${MOSES_ROOT}/moses/generic/*/*.cpp)
INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp)
OBJ = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h))))
OBJ += $(patsubst %.cpp,%.o,${INMOGEN})
EXE = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc)))


@@ -8,9 +8,9 @@
#include <iomanip>
#include <algorithm>
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>


@@ -1,27 +1,32 @@
// -*- c++ -*-
// Program to extract word cooccurrence counts from a memory-mapped word-aligned bitext
// stores the counts lexicon in the format for mm2dTable<uint32_t> (ug_mm_2d_table.h)
// Program to extract word cooccurrence counts from a memory-mapped
// word-aligned bitext; stores the counts lexicon in the format for
// mm2dTable<uint32_t> (ug_mm_2d_table.h)
//
// (c) 2010-2012 Ulrich Germann
// to do: multi-threading
#include <queue>
#include <iomanip>
#include <vector>
#include <iterator>
#include <sstream>
#include <algorithm>
#include <boost/program_options.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include "moses/generic/program_options/ug_get_options.h"
// #include "ug_translation_finder.h"
// #include "ug_sorters.h"
// #include "ug_corpus_sampling.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
#include "ug_mm_2d_table.h"
#include "ug_mm_ttrack.h"
#include "ug_corpus_token.h"
#include "ug_corpus_token.h"
using namespace std;
using namespace ugdiss;
@@ -30,116 +35,296 @@ using namespace boost::math;
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
typedef SimpleWordId Token;
vector<uint32_t> m1; // marginals L1
vector<uint32_t> m2; // marginals L2
id_type first_rare_id=500;
vector<vector<uint32_t> > JFREQ; // joint count table for frequent L1 words
vector<map<id_type,uint32_t> > JRARE; // joint count table for rare L1 words
// DECLARATIONS
void interpret_args(int ac, char* av[]);
mmTtrack<Token> T1,T2;
mmTtrack<char> Tx;
TokenIndex V1,V2;
string bname,cfgFile,L1,L2,oname;
typedef pair<id_type,id_type> wpair;
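// Two counts per word pair: 'a' counts actual alignment links; 'c' adds,
// once per distinct pair and sentence, the product of the two words'
// within-sentence frequencies, i.e. raw co-occurrences.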
struct Count
{
uint32_t a;
uint32_t c;
Count() : a(0), c(0) {};
Count(uint32_t ax, uint32_t cx) : a(ax), c(cx) {}
};
// DECLARATIONS
void interpret_args(int ac, char* av[]);
bool
operator<(pair<id_type,Count> const& a,
pair<id_type,Count> const& b)
{
return a.first < b.first;
}
typedef boost::unordered_map<wpair,Count> countmap_t;
typedef vector<vector<pair<id_type,Count> > > countlist_t;
vector<countlist_t> XLEX;
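// Each Counter worker processes sentences offset, offset+skip, ... and
// finally flushes its hash map CNT into LEX: one sorted vector of
// (L2 id, Count) entries per L1 word id.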
class Counter
{
public:
countmap_t CNT;
countlist_t & LEX;
size_t offset;
size_t skip;
Counter(countlist_t& lex, size_t o, size_t s)
: LEX(lex), offset(o), skip(s) {}
void processSentence(id_type sid);
void operator()();
};
string bname,cfgFile,L1,L2,oname,cooc;
int verbose;
size_t truncat;
size_t num_threads;
void
Counter::
operator()()
{
for (size_t sid = offset; sid < min(truncat,T1.size()); sid += skip)
processSentence(sid);
LEX.resize(V1.ksize());
for (countmap_t::const_iterator c = CNT.begin(); c != CNT.end(); ++c)
{
pair<id_type,Count> foo(c->first.second,c->second);
LEX.at(c->first.first).push_back(foo);
}
typedef vector<pair<id_type,Count> > v_t;
BOOST_FOREACH(v_t& v, LEX)
sort(v.begin(),v.end());
}
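// Heap comparator for merging the per-thread count lists: an entry
// (thread, position) is ranked by the L2 word id it points to, with the
// comparison inverted so pop_heap yields ids in ascending order.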
struct lexsorter
{
vector<countlist_t> const& v;
id_type wid;
lexsorter(vector<countlist_t> const& vx, id_type widx)
: v(vx),wid(widx) {}
bool operator()(pair<uint32_t,uint32_t> const& a,
pair<uint32_t,uint32_t> const& b) const
{
return (v.at(a.first).at(wid).at(a.second).first >
v.at(b.first).at(wid).at(b.second).first);
}
};
void
writeTableHeader(ostream& out)
{
filepos_type idxOffset=0;
numwrite(out,idxOffset); // blank for the time being
numwrite(out,id_type(V1.ksize()));
numwrite(out,id_type(V2.ksize()));
}
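// k-way merge of the per-thread count lists, one L1 word at a time,
// summing counts for identical (L1,L2) pairs; writes cells first, then
// the row index and marginals, and finally backpatches the index offset
// into the file header.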
void writeTable(ostream* aln_out, ostream* coc_out)
{
vector<uint32_t> m1a(V1.ksize(),0); // marginals L1
vector<uint32_t> m2a(V2.ksize(),0); // marginals L2
vector<uint32_t> m1c(V1.ksize(),0); // marginals L1
vector<uint32_t> m2c(V2.ksize(),0); // marginals L2
vector<id_type> idxa(V1.ksize()+1,0);
vector<id_type> idxc(V1.ksize()+1,0);
if (aln_out) writeTableHeader(*aln_out);
if (coc_out) writeTableHeader(*coc_out);
size_t CellCountA=0,CellCountC=0;
for (size_t id1 = 0; id1 < V1.ksize(); ++id1)
{
idxa[id1] = CellCountA;
idxc[id1] = CellCountC;
lexsorter sorter(XLEX,id1);
vector<pair<uint32_t,uint32_t> > H; H.reserve(num_threads);
for (size_t i = 0; i < num_threads; ++i)
{
if (id1 < XLEX.at(i).size() && XLEX[i][id1].size())
H.push_back(pair<uint32_t,uint32_t>(i,0));
}
if (!H.size()) continue;
make_heap(H.begin(),H.end(),sorter);
while (H.size())
{
id_type id2 = XLEX[H[0].first][id1][H[0].second].first;
uint32_t aln = XLEX[H[0].first][id1][H[0].second].second.a;
uint32_t coc = XLEX[H[0].first][id1][H[0].second].second.c;
pop_heap(H.begin(),H.end(),sorter);
++H.back().second;
if (H.back().second == XLEX[H.back().first][id1].size())
H.pop_back();
else
push_heap(H.begin(),H.end(),sorter);
while (H.size() &&
XLEX[H[0].first][id1].at(H[0].second).first == id2)
{
aln += XLEX[H[0].first][id1][H[0].second].second.a;
coc += XLEX[H[0].first][id1][H[0].second].second.c;
pop_heap(H.begin(),H.end(),sorter);
++H.back().second;
if (H.back().second == XLEX[H.back().first][id1].size())
H.pop_back();
else
push_heap(H.begin(),H.end(),sorter);
}
if (aln_out)
{
++CellCountA;
numwrite(*aln_out,id2);
numwrite(*aln_out,aln);
m1a[id1] += aln;
m2a[id2] += aln;
}
if (coc_out && coc)
{
++CellCountC;
numwrite(*coc_out,id2);
numwrite(*coc_out,coc);
m1c[id1] += coc;
m2c[id2] += coc;
}
}
}
idxa.back() = CellCountA;
idxc.back() = CellCountC;
if (aln_out)
{
filepos_type idxOffsetA = aln_out->tellp();
BOOST_FOREACH(id_type foo, idxa)
numwrite(*aln_out,foo);
aln_out->write(reinterpret_cast<char const*>(&m1a[0]),m1a.size()*4);
aln_out->write(reinterpret_cast<char const*>(&m2a[0]),m2a.size()*4);
aln_out->seekp(0);
numwrite(*aln_out,idxOffsetA);
}
if (coc_out)
{
filepos_type idxOffsetC = coc_out->tellp();
BOOST_FOREACH(id_type foo, idxc)
numwrite(*coc_out,foo);
coc_out->write(reinterpret_cast<char const*>(&m1c[0]),m1c.size()*4);
coc_out->write(reinterpret_cast<char const*>(&m2c[0]),m2c.size()*4);
coc_out->seekp(0);
numwrite(*coc_out,idxOffsetC);
}
}
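// Per sentence pair: count alignment links (and, once per distinct word
// pair, co-occurrences); words still unaligned afterwards are paired
// with the reserved id 0 on the other side.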
void
Counter::
processSentence(id_type sid)
{
Token const* s1 = T1.sntStart(sid);
Token const* e1 = T1.sntEnd(sid);
Token const* s2 = T2.sntStart(sid);
char const* p = Tx.sntStart(sid);
char const* q = Tx.sntEnd(sid);
ushort r,c;
bitvector check1(T1.sntLen(sid)), check2(T2.sntLen(sid));
check1.set();
check2.set();
Token const* e2 = T2.sntEnd(sid);
vector<ushort> cnt1(V1.ksize(),0);
vector<ushort> cnt2(V2.ksize(),0);
for (Token const* x = s1; x < e1; ++x)
++cnt1.at(x->id());
for (Token const* x = s2; x < e2; ++x)
++cnt2.at(x->id());
boost::unordered_set<wpair> seen;
bitvector check1(T1.sntLen(sid)); check1.set();
bitvector check2(T2.sntLen(sid)); check2.set();
// count links
char const* p = Tx.sntStart(sid);
char const* q = Tx.sntEnd(sid);
ushort r,c;
// cout << sid << " " << q-p << endl;
while (p < q)
{
p = binread(p,r);
p = binread(p,c);
// cout << sid << " " << r << "-" << c << endl;
assert(r < check1.size());
assert(c < check2.size());
assert(s1+r < e1);
assert(s2+c < e2);
check1.reset(r);
check2.reset(c);
id_type id1 = (s1+r)->id();
if (id1 < first_rare_id) JFREQ[id1][(s2+c)->id()]++;
else JRARE[id1][(s2+c)->id()]++;
id_type id2 = (s2+c)->id();
wpair k(id1,id2);
Count& cnt = CNT[k];
cnt.a++;
if (seen.insert(k).second)
cnt.c += cnt1[id1] * cnt2[id2];
}
// count unaligned words
for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i))
{
id_type id1 = (s1+i)->id();
if (id1 < first_rare_id) JFREQ[id1][0]++;
else JRARE[id1][0]++;
}
for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i))
JFREQ[0][(s2+i)->id()]++;
for (size_t i = check1.find_first();
i < check1.size();
i = check1.find_next(i))
CNT[wpair((s1+i)->id(),0)].a++;
for (size_t i = check2.find_first();
i < check2.size();
i = check2.find_next(i))
CNT[wpair(0,(s2+i)->id())].a++;
}
void
makeTable(string ofname)
{
ofstream out(ofname.c_str());
filepos_type idxOffset=0;
m1.resize(max(first_rare_id,V1.getNumTokens()),0);
m2.resize(V2.getNumTokens(),0);
JFREQ.resize(first_rare_id,vector<uint32_t>(m2.size(),0));
JRARE.resize(m1.size());
for (size_t sid = 0; sid < T1.size(); ++sid)
processSentence(sid);
// void
// writeTable(string ofname,
// vector<vector<uint32_t> >& FREQ,
// vector<map<id_type,uint32_t> >& RARE)
// {
// ofstream out(ofname.c_str());
// filepos_type idxOffset=0;
vector<id_type> index(V1.getNumTokens()+1,0);
numwrite(out,idxOffset); // blank for the time being
numwrite(out,id_type(m1.size()));
numwrite(out,id_type(m2.size()));
// vector<uint32_t> m1; // marginals L1
// vector<uint32_t> m2; // marginals L2
// m1.resize(max(first_rare_id,V1.getNumTokens()),0);
// m2.resize(V2.getNumTokens(),0);
// vector<id_type> index(V1.getNumTokens()+1,0);
// numwrite(out,idxOffset); // blank for the time being
// numwrite(out,id_type(m1.size()));
// numwrite(out,id_type(m2.size()));
id_type cellCount=0;
id_type stop = min(first_rare_id,id_type(m1.size()));
for (id_type id1 = 0; id1 < stop; ++id1)
{
index[id1] = cellCount;
vector<uint32_t> const& v = JFREQ[id1];
for (id_type id2 = 0; id2 < id_type(v.size()); ++id2)
{
if (!v[id2]) continue;
cellCount++;
numwrite(out,id2);
out.write(reinterpret_cast<char const*>(&v[id2]),sizeof(uint32_t));
m1[id1] += v[id2];
m2[id2] += v[id2];
}
}
for (id_type id1 = stop; id1 < id_type(m1.size()); ++id1)
{
index[id1] = cellCount;
map<id_type,uint32_t> const& M = JRARE[id1];
for (map<id_type,uint32_t>::const_iterator m = M.begin(); m != M.end(); ++m)
{
if (m->second == 0) continue;
cellCount++;
numwrite(out,m->first);
out.write(reinterpret_cast<char const*>(&m->second),sizeof(float));
m1[id1] += m->second;
m2[m->first] += m->second;
}
}
index[m1.size()] = cellCount;
idxOffset = out.tellp();
for (size_t i = 0; i < index.size(); ++i)
numwrite(out,index[i]);
out.write(reinterpret_cast<char const*>(&m1[0]),m1.size()*sizeof(float));
out.write(reinterpret_cast<char const*>(&m2[0]),m2.size()*sizeof(float));
// id_type cellCount=0;
// id_type stop = min(first_rare_id,id_type(m1.size()));
// for (id_type id1 = 0; id1 < stop; ++id1)
// {
// index[id1] = cellCount;
// vector<uint32_t> const& v = FREQ[id1];
// for (id_type id2 = 0; id2 < id_type(v.size()); ++id2)
// {
// if (!v[id2]) continue;
// cellCount++;
// numwrite(out,id2);
// out.write(reinterpret_cast<char const*>(&v[id2]),sizeof(uint32_t));
// m1[id1] += v[id2];
// m2[id2] += v[id2];
// }
// }
// for (id_type id1 = stop; id1 < id_type(m1.size()); ++id1)
// {
// index[id1] = cellCount;
// map<id_type,uint32_t> const& M = RARE[id1];
// for (map<id_type,uint32_t>::const_iterator m = M.begin(); m != M.end(); ++m)
// {
// if (m->second == 0) continue;
// cellCount++;
// numwrite(out,m->first);
// out.write(reinterpret_cast<char const*>(&m->second),sizeof(float));
// m1[id1] += m->second;
// m2[m->first] += m->second;
// }
// }
// index[m1.size()] = cellCount;
// idxOffset = out.tellp();
// for (size_t i = 0; i < index.size(); ++i)
// numwrite(out,index[i]);
// out.write(reinterpret_cast<char const*>(&m1[0]),m1.size()*sizeof(float));
// out.write(reinterpret_cast<char const*>(&m2[0]),m2.size()*sizeof(float));
// re-write the file header
out.seekp(0);
numwrite(out,idxOffset);
out.close();
}
// // re-write the file header
// out.seekp(0);
// numwrite(out,idxOffset);
// out.close();
// }
int
main(int argc, char* argv[])
@@ -152,8 +337,21 @@ main(int argc, char* argv[])
Tx.open(bname+L1+"-"+L2+".mam");
V1.open(bname+L1+".tdx");
V2.open(bname+L2+".tdx");
makeTable(oname);
exit(0);
if (!truncat) truncat = T1.size();
XLEX.resize(num_threads);
vector<boost::shared_ptr<boost::thread> > workers(num_threads);
for (size_t i = 0; i < num_threads; ++i)
workers[i].reset(new boost::thread(Counter(XLEX[i],i,num_threads)));
for (size_t i = 0; i < workers.size(); ++i)
workers[i]->join();
// cerr << "done counting" << endl;
ofstream aln_out,coc_out;
if (oname.size()) aln_out.open(oname.c_str());
if (cooc.size()) coc_out.open(cooc.c_str());
writeTable(oname.size() ? &aln_out : NULL,
cooc.size() ? &coc_out : NULL);
if (oname.size()) aln_out.close();
if (cooc.size()) coc_out.close();
}
void
@@ -169,6 +367,14 @@ interpret_args(int ac, char* av[])
("help,h", "print this message")
("cfg,f", po::value<string>(&cfgFile),"config file")
("oname,o", po::value<string>(&oname),"output file name")
("cooc,c", po::value<string>(&cooc),
"file name for raw co-occurrence counts")
("verbose,v", po::value<int>(&verbose)->default_value(0)->implicit_value(1),
"verbosity level")
("threads,t", po::value<size_t>(&num_threads)->default_value(4),
"count in <N> parallel threads")
("truncate,n", po::value<size_t>(&truncat)->default_value(0),
"truncate corpus to <N> sentences (for debugging)")
;
h.add_options()
@@ -181,12 +387,14 @@ interpret_args(int ac, char* av[])
a.add("L2",1);
get_options(ac,av,h.add(o),a,vm,"cfg");
if (vm.count("help") || bname.empty() || oname.empty())
if (vm.count("help") || bname.empty() || (oname.empty() && cooc.empty()))
{
cout << "usage:\n\t" << av[0] << " <basename> <L1 tag> <L2 tag> -o <output file>\n" << endl;
cout << "usage:\n\t" << av[0] << " <basename> <L1 tag> <L2 tag> [-o <output file>] [-c <output file>]\n" << endl;
cout << "at least one of -o / -c must be specified." << endl;
cout << o << endl;
exit(0);
}
num_threads = min(num_threads,24UL);
}


@@ -25,12 +25,13 @@
#include "ug_mm_ttrack.h"
#include "tpt_pickler.h"
#include "ug_deptree.h"
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "ug_im_tsa.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/mm/ug_im_tsa.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
using namespace boost;
namespace po=boost::program_options;
int with_pfas;
@@ -360,10 +361,10 @@ build_mmTSA(string infile, string outfile)
{
size_t mypid = fork();
if(mypid) return mypid;
mmTtrack<Token> T(infile);
shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile));
bdBitset filter;
filter.resize(T.size(),true);
imTSA<Token> S(&T,filter,(quiet?NULL:&cerr));
filter.resize(T->size(),true);
imTSA<Token> S(T,&filter,(quiet?NULL:&cerr));
S.save_as_mm_tsa(outfile);
exit(0);
}


@@ -0,0 +1,69 @@
// count words in a memory-mapped corpus
#include "ug_mm_ttrack.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include <string>
#include <vector>
#include <cassert>
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include <iomanip>
#include "ug_typedefs.h"
#include "tpt_pickler.h"
// #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
// #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
// #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <algorithm>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
typedef L2R_Token<SimpleWordId> Token;
// typedef mmTSA<Token>::tree_iterator iter;
typedef boost::unordered_map<pair<size_t,size_t>,size_t> phrase_counter_t;
#define CACHING_THRESHOLD 1000
mmTtrack<Token> T; // token tracks
TokenIndex V; // vocabs
// mmTSA<Token> I; // suffix arrays
void interpret_args(int ac, char* av[]);
string bname;
bool echo;
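// Scan the memory-mapped token track once, tally token frequencies, and
// print one "<word> <count>" line per vocabulary entry (ids 0 and 1,
// presumably reserved, are skipped).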
int main(int argc, char* argv[])
{
interpret_args(argc,argv);
T.open(bname+".mct");
V.open(bname+".tdx");
vector<size_t> cnt(V.ksize(),0);
for (size_t sid = 0; sid < T.size(); ++sid)
{
Token const* stop = T.sntEnd(sid);
for (Token const* t = T.sntStart(sid); t < stop; ++cnt[(t++)->id()]);
}
for (size_t wid = 2; wid < V.ksize(); ++wid)
cout << V[wid] << " " << cnt[wid] << endl;
exit(0);
}
void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
po::variables_map vm;
po::options_description o("Options");
po::options_description h("Hidden Options");
po::positional_options_description a;
o.add_options()
("help,h", "print this message")
;
h.add_options()
("bname", po::value<string>(&bname), "base name")
;
a.add("bname",1);
get_options(ac,av,h.add(o),a,vm);
}


@@ -11,11 +11,11 @@
#include <iomanip>
#include "ug_typedefs.h"
#include "tpt_pickler.h"
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <algorithm>
#include "moses/generic/program_options/ug_get_options.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
using namespace std;
using namespace ugdiss;


@@ -11,8 +11,8 @@
#include "ug_deptree.h"
#include "tpt_tokenindex.h"
#include "tpt_pickler.h"
#include "moses/generic/program_options/ug_get_options.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <iostream>
#include <string>
@@ -21,8 +21,8 @@
#include <boost/program_options.hpp>
#include <boost/scoped_ptr.hpp>
#include "util/exception.hh"
#include "util/check.hh"
#include "headers-base/util/exception.hh"
#include "headers-base/util/check.hh"
// NOTE TO SELF:
/* Program to filter out sentences that GIZA will skip or truncate,


@@ -0,0 +1,64 @@
// -*- c++ -*-
// test program for dynamic tsas
#include <boost/program_options.hpp>
#include <boost/program_options/options_description.hpp>
#include <boost/program_options/parsers.hpp>
#include <boost/program_options/variables_map.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <vector>
#include <string>
#include <sys/types.h>
#include <sys/wait.h>
#include "ug_conll_record.h"
#include "tpt_tokenindex.h"
#include "ug_mm_ttrack.h"
#include "tpt_pickler.h"
#include "ug_deptree.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "ug_im_ttrack.h"
#include "ug_bitext.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
using namespace boost;
using namespace Moses::bitext;
namespace po=boost::program_options;
typedef L2R_Token<SimpleWordId> L2R;
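// Smoke test: read interleaved source/target/alignment lines from stdin,
// add them to an in-memory bitext, then walk the target-side suffix
// array and print every suffix it indexes.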
int main()
{
sptr<imBitext<L2R> > bt(new imBitext<L2R>());
string s1,s2,aln;
vector<string> S1,S2,ALN;
while (getline(cin,s1) && getline(cin,s2) && getline(cin,aln))
{
S1.push_back(s1);
S2.push_back(s2);
ALN.push_back(aln);
}
bt = bt->add(S1,S2,ALN);
TSA<L2R>::tree_iterator m(bt->I2.get());
m.down();
do {
char const* p = m.lower_bound(-1);
tsa::ArrayEntry I(p);
do {
m.root->readEntry(I.next,I);
L2R const* stop = m.root->getCorpus()->sntEnd(I.sid);
for (L2R const* t = m.root->getCorpus()->getToken(I); t < stop; ++t)
cout << (*bt->V2)[t->id()] << " ";
cout << endl;
} while (I.next < m.upper_bound(-1));
} while (m.over());
}


@@ -138,7 +138,7 @@ namespace ugdiss
void
mkTokenIndex(string ofile,MYMAP const& M,string unkToken)
{
typedef pair<uint32_t,id_type> IndexEntry; // offset and id
// typedef pair<uint32_t,id_type> IndexEntry; // offset and id
typedef pair<string,uint32_t> Token; // token and id


@@ -17,7 +17,10 @@ namespace Moses
, good (0)
, sum_pairs (0)
, in_progress (0)
{}
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
}
void
pstats::
@@ -38,28 +41,34 @@ namespace Moses
this->lock.unlock();
}
void
bool
pstats::
add(uint64_t pid, float const w,
vector<uchar> const& a,
uint32_t const cnt2)
uint32_t const cnt2,
uint32_t fwd_o,
uint32_t bwd_o)
{
this->lock.lock();
jstats& entry = this->trg[pid];
this->lock.unlock();
entry.add(w,a,cnt2);
entry.add(w,a,cnt2,fwd_o,bwd_o);
if (this->good < entry.rcnt())
{
this->lock.lock();
UTIL_THROW(util::Exception, "more joint counts than good counts!"
<< entry.rcnt() << "/" << this->good);
return false;
// UTIL_THROW(util::Exception, "more joint counts than good counts!"
// << entry.rcnt() << "/" << this->good);
}
return true;
}
jstats::
jstats()
: my_rcnt(0), my_wcnt(0), my_cnt2(0)
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
my_aln.reserve(1);
}
@@ -69,11 +78,33 @@ namespace Moses
my_rcnt = other.rcnt();
my_wcnt = other.wcnt();
my_aln = other.aln();
for (int i = po_first; i <= po_other; i++)
{
ofwd[i] = other.ofwd[i];
obwd[i] = other.obwd[i];
}
}
uint32_t
jstats::
dcnt_fwd(PhraseOrientation const idx) const
{
assert(idx <= po_other);
return ofwd[idx];
}
uint32_t
jstats::
dcnt_bwd(PhraseOrientation const idx) const
{
assert(idx <= po_other);
return obwd[idx];
}
void
jstats::
add(float w, vector<uchar> const& a, uint32_t const cnt2)
add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient)
{
boost::lock_guard<boost::mutex> lk(this->lock);
my_rcnt += 1;
@@ -90,6 +121,8 @@ namespace Moses
if (my_aln[i].first > my_aln[i/2].first)
push_heap(my_aln.begin(),my_aln.begin()+i+1);
}
++ofwd[fwd_orient];
++obwd[bwd_orient];
}
uint32_t
@@ -112,6 +145,34 @@ namespace Moses
aln() const
{ return my_aln; }
void
jstats::
invalidate()
{
my_rcnt = 0;
}
bool
jstats::
valid()
{
return my_rcnt != 0;
}
bool
PhrasePair::
operator<=(PhrasePair const& other) const
{
return this->score <= other.score;
}
bool
PhrasePair::
operator>=(PhrasePair const& other) const
{
return this->score >= other.score;
}
bool
PhrasePair::
operator<(PhrasePair const& other) const
@@ -126,7 +187,30 @@ namespace Moses
return this->score > other.score;
}
PhrasePair::PhrasePair() {}
PhrasePair::
PhrasePair() {}
PhrasePair::
PhrasePair(PhrasePair const& o)
: p1(o.p1),
p2(o.p2),
raw1(o.raw1),
raw2(o.raw2),
sample1(o.sample1),
sample2(o.sample2),
good1(o.good1),
good2(o.good2),
joint(o.joint),
fvals(o.fvals),
aln(o.aln),
score(o.score)
{
for (size_t i = 0; i <= po_other; ++i)
{
dfwd[i] = o.dfwd[i];
dbwd[i] = o.dbwd[i];
}
}
void
PhrasePair::
@@ -140,6 +224,22 @@ namespace Moses
good2 = 0;
fvals.resize(numfeats);
}
void
PhrasePair::
init(uint64_t const pid1,
pstats const& ps1,
pstats const& ps2,
size_t const numfeats)
{
p1 = pid1;
raw1 = ps1.raw_cnt + ps2.raw_cnt;
sample1 = ps1.sample_cnt + ps2.sample_cnt;
sample2 = 0;
good1 = ps1.good + ps2.good;
good2 = 0;
fvals.resize(numfeats);
}
float
lbop(size_t const tries, size_t const succ, float const confidence)
@@ -149,7 +249,7 @@ namespace Moses
find_lower_bound_on_p(tries, succ, confidence);
}
void
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, jstats const& js)
{
@@ -159,8 +259,64 @@ namespace Moses
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
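// Turn raw orientation counts into add-one-smoothed distributions over
// the orientation classes (hence the +1 on every count).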
float total_fwd = 0, total_bwd = 0;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
total_fwd += js.dcnt_fwd(po)+1;
total_bwd += js.dcnt_bwd(po)+1;
}
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
}
return *this;
}
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, jstats const& js1, jstats const& js2)
{
p2 = pid2;
raw2 = js1.cnt2() + js2.cnt2();
joint = js1.rcnt() + js2.rcnt();
assert(js1.aln().size() || js2.aln().size());
if (js1.aln().size())
aln = js1.aln()[0].second;
else if (js2.aln().size())
aln = js2.aln()[0].second;
for (int i = po_first; i < po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
}
return *this;
}
PhrasePair const&
PhrasePair::
update(uint64_t const pid2,
size_t const raw2extra,
jstats const& js)
{
p2 = pid2;
raw2 = js.cnt2() + raw2extra;
joint = js.rcnt();
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
}
return *this;
}
float
PhrasePair::
eval(vector<float> const& w)
@@ -172,5 +328,331 @@ namespace Moses
return this->score;
}
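// Extend the in-memory bitext: copy the current snapshot under lock,
// append the new sentences on both sides (each side on its own thread),
// and binary-encode the "row-col" alignment pairs into the alignment track.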
template<>
sptr<imBitext<L2R_Token<SimpleWordId> > >
imBitext<L2R_Token<SimpleWordId> >::
add(vector<string> const& s1,
vector<string> const& s2,
vector<string> const& aln) const
{
typedef L2R_Token<SimpleWordId> TKN;
assert(s1.size() == s2.size() && s1.size() == aln.size());
sptr<imBitext<TKN> > ret;
{
lock_guard<mutex> guard(this->lock);
ret.reset(new imBitext<TKN>(*this));
}
// we add the sentences in separate threads (so it's faster)
boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
thread1.join(); // for debugging
boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
BOOST_FOREACH(string const& a, aln)
{
istringstream ibuf(a);
ostringstream obuf;
uint32_t row,col; char c;
while (ibuf>>row>>c>>col)
{
assert(c == '-');
binwrite(obuf,row);
binwrite(obuf,col);
}
char const* x = obuf.str().c_str();
vector<char> v(x,x+obuf.str().size());
ret->myTx = append(ret->myTx, v);
}
thread1.join();
thread2.join();
ret->Tx = ret->myTx;
ret->T1 = ret->myT1;
ret->T2 = ret->myT2;
ret->I1 = ret->myI1;
ret->I2 = ret->myI2;
return ret;
}
// template<>
void
snt_adder<L2R_Token<SimpleWordId> >::
operator()()
{
vector<id_type> sids;
sids.reserve(snt.size());
BOOST_FOREACH(string const& foo, snt)
{
sids.push_back(track ? track->size() : 0);
istringstream buf(foo);
string w;
vector<L2R_Token<SimpleWordId > > s;
s.reserve(100);
while (buf >> w)
s.push_back(L2R_Token<SimpleWordId>(V[w]));
track = append(track,s);
}
if (index)
index.reset(new imTSA<L2R_Token<SimpleWordId> >(*index,track,sids,V.tsize()));
else
index.reset(new imTSA<L2R_Token<SimpleWordId> >(track,NULL,NULL));
}
snt_adder<L2R_Token<SimpleWordId> >::
snt_adder(vector<string> const& s, TokenIndex& v,
sptr<imTtrack<L2R_Token<SimpleWordId> > >& t,
sptr<imTSA<L2R_Token<SimpleWordId> > >& i)
: snt(s), V(v), track(t), index(i)
{ }
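// Starting from target word s2, grow the smallest consistent phrase pair
// by chasing alignment links with a worklist; fails if the expansion
// reaches back before s2 or overlaps the previous source span [L1,R1).
// On success, [s1,e1) x [s2,e2) is that minimal pair.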
bool
expand_phrase_pair
(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
ushort const s2, // next word on in target side
ushort const L1, ushort const R1, // limits of previous phrase
ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg
{
if (a2[s2].size() == 0)
{
cout << __FILE__ << ":" << __LINE__ << endl;
return false;
}
bitvector done1(a1.size());
bitvector done2(a2.size());
vector <pair<ushort,ushort> > agenda;
// x.first: side (1 or 2)
// x.second: word position
agenda.reserve(a1.size() + a2.size());
agenda.push_back(pair<ushort,ushort>(2,s2));
e2 = s2;
s1 = e1 = a2[s2].front();
if (s1 >= L1 && s1 < R1)
{
cout << __FILE__ << ":" << __LINE__ << endl;
return false;
}
agenda.push_back(pair<ushort,ushort>(2,s2));
while (agenda.size())
{
ushort side = agenda.back().first;
ushort p = agenda.back().second;
agenda.pop_back();
if (side == 1)
{
done1.set(p);
BOOST_FOREACH(ushort i, a1[p])
{
if (i < s2)
{
// cout << __FILE__ << ":" << __LINE__ << endl;
return false;
}
if (done2[i]) continue;
for (;e2 <= i;++e2)
if (!done2[e2])
agenda.push_back(pair<ushort,ushort>(2,e2));
}
}
else
{
done2.set(p);
BOOST_FOREACH(ushort i, a2[p])
{
if ((e1 < L1 && i >= L1) || (s1 >= R1 && i < R1) || (i >= L1 && i < R1))
{
// cout << __FILE__ << ":" << __LINE__ << " "
// << L1 << "-" << R1 << " " << i << " "
// << s1 << "-" << e1<< endl;
return false;
}
if (e1 < i)
{
for (; e1 <= i; ++e1)
if (!done1[e1])
agenda.push_back(pair<ushort,ushort>(1,e1));
}
else if (s1 > i)
{
for (; i <= s1; ++i)
if (!done1[i])
agenda.push_back(pair<ushort,ushort>(1,i));
}
}
}
}
++e1;
++e2;
return true;
}
// s1 = seed;
// e1 = seed;
// s2 = e2 = a1[seed].front();
// BOOST_FOREACH(ushort k, a1[seed])
// {
// if (s2 < k) s2 = k;
// if (e2 > k) e2 = k;
// }
// for (ushort j = s2; j <= e2; ++j)
// {
// if (a2[j].size() == 0) continue;
// done2.set(j);
// agenda.push_back(pair<ushort,ushort>(j,1));
// }
// while (agenda.size())
// {
// ushort side = agenda[0].second;
// ushort i = agenda[0].first;
// agenda.pop_back();
// if (side)
// {
// BOOST_FOREACH(ushort k, a2[i])
// {
// if (k < L1 || k > R1)
// return false;
// if (done1[k])
// continue;
// while (s1 > k)
// {
// --s1;
// if (done1[s1] || !a1[s1].size())
// continue;
// done1.set(s1);
// agenda.push_back(pair<ushort,ushort>(s1,0));
// }
// while (e1 < k)
// {
// ++e1;
// if (done1[e1] || !a1[e1].size())
// continue;
// done1.set(e1);
// agenda.push_back(pair<ushort,ushort>(e1,0));
// }
// }
// }
// else
// {
// BOOST_FOREACH(ushort k, a1[i])
// {
// if (k < L2 || k > R2)
// return false;
// if (done2[k])
// continue;
// while (s2 > k)
// {
// --s2;
// if (done2[s2] || !a2[s2].size())
// continue;
// done1.set(s2);
// agenda.push_back(pair<ushort,ushort>(s2,1));
// }
// while (e2 < k)
// {
// ++e2;
// if (done1[e2] || !a1[e2].size())
// continue;
// done2.set(e2);
// agenda.push_back(pair<ushort,ushort>(e2,1));
// }
// }
// }
// }
// ++e1;
// ++e2;
// return true;
// }
void
print_amatrix(vector<vector<ushort> > a1, uint32_t len2,
ushort b1, ushort e1, ushort b2, ushort e2)
{
vector<bitvector> M(a1.size(),bitvector(len2));
for (ushort j = 0; j < a1.size(); ++j)
{
BOOST_FOREACH(ushort k, a1[j])
M[j].set(k);
}
cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << endl;
cout << " ";
for (size_t c = 0; c < len2;++c)
cout << c%10;
cout << endl;
for (size_t r = 0; r < M.size(); ++r)
{
cout << setw(3) << r << " ";
for (size_t c = 0; c < M[r].size(); ++c)
{
if ((b1 <= r) && (r < e1) && b2 <= c && c < e2)
cout << (M[r][c] ? 'x' : '-');
else cout << (M[r][c] ? 'o' : '.');
}
cout << endl;
}
cout << string(90,'-') << endl;
}
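// Classify the forward orientation of a phrase ending at (e1,e2): expand
// a phrase pair around the next aligned target word and report mono /
// swap / jump-forward / jump-backward relative to the source span;
// po_last if nothing aligned follows, po_other if no consistent pair exists.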
PhraseOrientation
find_po_fwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2)
{
size_t n2 = e2;
while (n2 < a2.size() && a2[n2].size() == 0) ++n2;
if (n2 == a2.size())
return po_last;
ushort ns1,ne1,ne2;
if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
{
return po_other;
}
if (ns1 >= e1)
{
for (ushort j = e1; j < ns1; ++j)
if (a1[j].size()) return po_jfwd;
return po_mono;
}
else
{
for (ushort j = ne1; j < b1; ++j)
if (a1[j].size()) return po_jbwd;
return po_swap;
}
}
PhraseOrientation
find_po_bwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2)
{
int p2 = b2-1;
while (p2 >= 0 && !a2[p2].size()) --p2;
if (p2 < 0) return po_first;
ushort ps1,pe1,pe2;
if (!expand_phrase_pair(a1,a2,p2,b1,e1,ps1,pe1,pe2))
return po_other;
if (pe1 < b1)
{
for (ushort j = pe1; j < b1; ++j)
if (a1[j].size()) return po_jfwd;
return po_mono;
}
else
{
for (ushort j = e1; j < ps1; ++j)
if (a1[j].size()) return po_jbwd;
return po_swap;
}
}
}
}


@@ -26,13 +26,13 @@
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "moses/Util.h"
#include "util/exception.hh"
#include "util/check.hh"
#include "headers-base/util/exception.hh"
#include "headers-base/util/check.hh"
#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"
@@ -54,6 +54,29 @@ namespace Moses {
template<typename TKN> class Bitext;
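// Orientation classes for the distortion counts: po_first / po_last mark
// phrases with no aligned neighbor on the respective side; po_mono,
// po_swap, po_jfwd and po_jbwd are the four hierarchical reordering
// cases; po_other covers everything that cannot be classified.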
enum PhraseOrientation
{
po_first,
po_mono,
po_jfwd,
po_swap,
po_jbwd,
po_last,
po_other
};
PhraseOrientation
find_po_fwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2);
PhraseOrientation
find_po_bwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
size_t b1, size_t e1,
size_t b2, size_t e2);
template<typename sid_t, typename off_t, typename len_t>
void
parse_pid(uint64_t const pid, sid_t & sid,
@@ -79,6 +102,7 @@ namespace Moses {
float my_wcnt; // weighted count
uint32_t my_cnt2;
vector<pair<size_t, vector<uchar> > > my_aln;
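// distortion counts by orientation class (indexed po_first .. po_other)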
uint32_t ofwd[7], obwd[7];
public:
jstats();
jstats(jstats const& other);
@@ -87,7 +111,12 @@ namespace Moses {
float wcnt() const;
vector<pair<size_t, vector<uchar> > > const & aln() const;
void add(float w, vector<uchar> const& a, uint32_t const cnt2);
void add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient);
void invalidate();
bool valid();
uint32_t dcnt_fwd(PhraseOrientation const idx) const;
uint32_t dcnt_bwd(PhraseOrientation const idx) const;
};
struct
@ -101,14 +130,21 @@ namespace Moses {
size_t good; // number of selected instances with valid word alignments
size_t sum_pairs;
size_t in_progress; // keeps track of how many threads are currently working on this
uint32_t ofwd[po_other+1], obwd[po_other+1];
typename boost::unordered_map<uint64_t, jstats> trg;
pstats();
void release();
void register_worker();
size_t count_workers() { return in_progress; }
void add(uint64_t const pid, float const w,
vector<uchar> const& a, uint32_t const cnt2);
bool
add(uint64_t const pid,
float const w,
vector<uchar> const& a,
uint32_t const cnt2,
uint32_t fwd_o, uint32_t bwd_o);
};
class
@ -117,19 +153,34 @@ namespace Moses {
public:
uint64_t p1, p2;
uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
uint32_t mono,swap,left,right;
vector<float> fvals;
float dfwd[po_other+1];
float dbwd[po_other+1];
vector<uchar> aln;
// float avlex12,avlex21; // average lexical probs (Moses std)
// float znlex1,znlex2; // zens-ney lexical smoothing
// float colex1,colex2; // based on raw lexical occurrences
float score;
PhrasePair();
PhrasePair(PhrasePair const& o);
bool operator<(PhrasePair const& other) const;
bool operator>(PhrasePair const& other) const;
void init(uint64_t const pid1, pstats const& ps,
bool operator<=(PhrasePair const& other) const;
bool operator>=(PhrasePair const& other) const;
void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
size_t const numfeats);
void update(uint64_t const pid2, jstats const& js);
PhrasePair const&
update(uint64_t const pid2, jstats const& js);
PhrasePair const&
update(uint64_t const pid2, jstats const& js1, jstats const& js2);
PhrasePair const&
update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
float eval(vector<float> const& w);
};
@ -144,10 +195,16 @@ namespace Moses {
virtual
void
operator()(Bitext<Token> const& pt, PhrasePair& pp) const = 0;
operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest)
const = 0;
int
fcnt() const { return num_feats; }
fcnt() const
{ return num_feats; }
int
getIndex() const
{ return index; }
};
template<typename Token>
@ -170,14 +227,17 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt,
PhrasePair & pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
if (pp.joint > pp.good1)
{
cerr << bt.toString(pp.p1,0) << " ::: " << bt.toString(pp.p2,1) << endl;
cerr << pp.joint << "/" << pp.good1 << "/" << pp.raw2 << endl;
}
pp.fvals[this->index] = log(lbop(pp.good1, pp.joint, conf));
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
}
};
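// (lbop(total, successes, conf), defined outside this diff, apparently
// computes a confidence-based lower bound on the success probability, so
// phrase pairs with few samples are scored conservatively)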
@ -201,9 +261,10 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& pt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
{
pp.fvals[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
if (!dest) dest = &pp.fvals;
(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
}
};
@ -211,8 +272,8 @@ namespace Moses {
class
PScoreLex : public PhraseScorer<Token>
{
LexicalPhraseScorer2<Token> scorer;
public:
LexicalPhraseScorer2<Token> scorer;
PScoreLex() { this->num_feats = 2; }
@ -225,8 +286,9 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
parse_pid(pp.p1, sid1, off1, len1);
parse_pid(pp.p2, sid2, off2, len2);
@ -248,8 +310,8 @@ namespace Moses {
#endif
scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
bt.T2->sntStart(sid2)+off2,0,len2,
pp.aln, pp.fvals[this->index],
pp.fvals[this->index+1]);
pp.aln, (*dest)[this->index],
(*dest)[this->index+1]);
}
};
@ -271,11 +333,12 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
uint32_t sid2=0,off2=0,len2=0;
parse_pid(pp.p2, sid2, off2, len2);
pp.fvals[this->index] = len2;
(*dest)[this->index] = len2;
}
};
@ -297,9 +360,10 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
{
pp.fvals[this->index] = 1;
if (!dest) dest = &pp.fvals;
(*dest)[this->index] = 1;
}
};
@ -307,6 +371,7 @@ namespace Moses {
template<typename TKN>
class Bitext
{
protected:
mutable boost::mutex lock;
public:
typedef TKN Token;
@ -322,13 +387,13 @@ namespace Moses {
// each other's way.
mutable sptr<agenda> ag;
sptr<Ttrack<char> > const Tx; // word alignments
sptr<Ttrack<Token> > const T1; // token track
sptr<Ttrack<Token> > const T2; // token track
sptr<TokenIndex> const V1; // vocab
sptr<TokenIndex> const V2; // vocab
sptr<TSA<Token> > const I1; // indices
sptr<TSA<Token> > const I2; // indices
sptr<Ttrack<char> > Tx; // word alignments
sptr<Ttrack<Token> > T1; // token track
sptr<Ttrack<Token> > T2; // token track
sptr<TokenIndex> V1; // vocab
sptr<TokenIndex> V2; // vocab
sptr<TSA<Token> > I1; // indices
sptr<TSA<Token> > I2; // indices
/// given the source phrase sid[start:stop]
// find the possible start (s1 .. s2) and end (e1 .. e2)
@ -339,14 +404,20 @@ namespace Moses {
find_trg_phr_bounds
(size_t const sid, size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
vector<uchar> * core_alignment, bool const flip) const;
int& po_fwd, int& po_bwd,
vector<uchar> * core_alignment,
bitvector* full_alignment,
bool const flip) const;
mutable boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2;
private:
protected:
size_t default_sample_size;
private:
sptr<pstats>
prep2(iter const& phrase, size_t const max_sample) const;
public:
Bitext(size_t const max_sample=5000);
Bitext(Ttrack<Token>* const t1,
Ttrack<Token>* const t2,
Ttrack<char>* const tx,
@ -358,6 +429,7 @@ namespace Moses {
virtual void open(string const base, string const L1, string const L2) = 0;
// sptr<pstats> lookup(Phrase const& phrase, size_t factor) const;
sptr<pstats> lookup(iter const& phrase) const;
sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
void prep(iter const& phrase) const;
@ -407,6 +479,12 @@ namespace Moses {
}
}
template<typename Token>
Bitext<Token>::
Bitext(size_t const max_sample)
: default_sample_size(max_sample)
{ }
template<typename Token>
Bitext<Token>::
Bitext(Ttrack<Token>* const t1,
@ -557,16 +635,27 @@ namespace Moses {
{
j->stats->register_worker();
vector<uchar> aln;
bitvector full_alignment(100*100);
while (j->step(sid,offset))
{
aln.clear();
if (!ag.bt.find_trg_phr_bounds
(sid, offset, offset + j->len, s1, s2, e1, e2,
j->fwd?&aln:NULL, !j->fwd))
int po_fwd=5,po_bwd=5;
if (j->fwd)
{
if (!ag.bt.find_trg_phr_bounds
(sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
&aln,&full_alignment,false))
continue;
}
else if (!ag.bt.find_trg_phr_bounds
(sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
NULL,NULL,true))
continue;
j->stats->lock.lock();
j->stats->good += 1;
j->stats->sum_pairs += (s2-s1+1)*(e2-e1+1);
++j->stats->ofwd[po_fwd];
++j->stats->obwd[po_bwd];
j->stats->lock.unlock();
for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
aln[k] += s2 - s1;
@ -580,8 +669,21 @@ namespace Moses {
// assert(b);
for (size_t i = e1; i <= e2; ++i)
{
j->stats->add(b->getPid(),sample_weight,aln,b->approxOccurrenceCount());
if (!j->stats->add(b->getPid(),sample_weight,aln,
b->approxOccurrenceCount(),
po_fwd,po_bwd))
{
for (size_t z = 0; z < j->len; ++z)
{
id_type tid = ag.bt.T1->sntStart(sid)[offset+z].id();
cout << (*ag.bt.V1)[tid] << " ";
}
cout << endl;
for (size_t z = s; z <= i; ++z)
cout << (*ag.bt.V2)[(o+z)->id()] << " ";
cout << endl;
exit(1);
}
if (i < e2)
{
#ifndef NDEBUG
@ -734,59 +836,239 @@ namespace Moses {
this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
i1.open(base+L1+".sfa", this->T1.get());
i2.open(base+L2+".sfa", this->T2.get());
i1.open(base+L1+".sfa", this->T1);
i2.open(base+L2+".sfa", this->T2);
assert(this->T1->size() == this->T2->size());
}
template<typename TKN>
class imBitext : public Bitext<TKN>
{
sptr<imTtrack<char> > myTx;
sptr<imTtrack<TKN> > myT1;
sptr<imTtrack<TKN> > myT2;
sptr<imTSA<TKN> > myI1;
sptr<imTSA<TKN> > myI2;
public:
void open(string const base, string const L1, string L2);
imBitext();
imBitext(sptr<TokenIndex> const& V1,
sptr<TokenIndex> const& V2,
size_t max_sample = 5000);
imBitext(size_t max_sample = 5000);
imBitext(imBitext const& other);
// sptr<imBitext<TKN> >
// add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
sptr<imBitext<TKN> >
add(vector<string> const& s1,
vector<string> const& s2,
vector<string> const& a) const;
};
template<typename TKN>
imBitext<TKN>::
imBitext()
: Bitext<TKN>(new imTtrack<TKN>(),
new imTtrack<TKN>(),
new imTtrack<char>(),
new TokenIndex(),
new TokenIndex(),
new imTSA<TKN>(),
new imTSA<TKN>())
{}
imBitext(size_t max_sample)
{
this->default_sample_size = max_sample;
this->V1.reset(new TokenIndex());
this->V2.reset(new TokenIndex());
this->V1->setDynamic(true);
this->V2->setDynamic(true);
}
template<typename TKN>
imBitext<TKN>::
imBitext(sptr<TokenIndex> const& v1,
sptr<TokenIndex> const& v2,
size_t max_sample)
{
this->default_sample_size = max_sample;
this->V1 = v1;
this->V2 = v2;
this->V1->setDynamic(true);
this->V2->setDynamic(true);
}
template<typename TKN>
imBitext<TKN>::
imBitext(imBitext<TKN> const& other)
{
this->myTx = other.myTx;
this->myT1 = other.myT1;
this->myT2 = other.myT2;
this->myI1 = other.myI1;
this->myI2 = other.myI2;
this->Tx = this->myTx;
this->T1 = this->myT1;
this->T2 = this->myT2;
this->I1 = this->myI1;
this->I2 = this->myI2;
this->V1 = other.V1;
this->V2 = other.V2;
this->default_sample_size = other.default_sample_size;
}
template<typename TKN> class snt_adder;
template<> class snt_adder<L2R_Token<SimpleWordId> >;
template<>
class snt_adder<L2R_Token<SimpleWordId> >
{
typedef L2R_Token<SimpleWordId> TKN;
vector<string> const & snt;
TokenIndex & V;
sptr<imTtrack<TKN> > & track;
sptr<imTSA<TKN > > & index;
public:
snt_adder(vector<string> const& s, TokenIndex& v,
sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
void operator()();
};
// template<typename TKN>
// void
// imBitext<TKN>::
// open(string const base, string const L1, string L2)
// class snt_adder
// {
// mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
// mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
// mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
// t1.open(base+L1+".mct");
// t2.open(base+L2+".mct");
// tx.open(base+L1+"-"+L2+".mam");
// cerr << "DADA" << endl;
// this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
// this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
// mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
// mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
// i1.open(base+L1+".sfa", this->T1.get());
// i2.open(base+L2+".sfa", this->T2.get());
// assert(this->T1->size() == this->T2->size());
// vector<string> const & snt;
// TokenIndex & V;
// sptr<imTtrack<TKN> > & track;
// sptr<imTSA<TKN > > & index;
// public:
// snt_adder(vector<string> const& s, TokenIndex& v,
// sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
// template<typename T>
// void operator()();
// };
// // template<>
// void
// snt_adder<L2R_Token<SimpleWordId> >::
// operator()();
// template<>
// void
// snt_adder<char>::
// operator()()
// {
// vector<id_type> sids;
// sids.reserve(snt.size());
// BOOST_FOREACH(string const& s, snt)
// {
// sids.push_back(track ? track->size() : 0);
// istringstream buf(s);
// string w;
// vector<char> s;
// s.reserve(100);
// while (buf >> w)
// s.push_back(vector<char>(V[w]));
// track = append(track,s);
// }
// index.reset(new imTSA<char>(*index,track,sids,V.tsize()));
// }
// template<typename TKN>
// snt_adder<TKN>::
// snt_adder(vector<string> const& s, TokenIndex& v,
// sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i)
// : snt(s), V(v), track(t), index(i)
// {
// throw "Not implemented yet.";
// }
template<>
sptr<imBitext<L2R_Token<SimpleWordId> > >
imBitext<L2R_Token<SimpleWordId> >::
add(vector<string> const& s1,
vector<string> const& s2,
vector<string> const& aln) const;
template<typename TKN>
sptr<imBitext<TKN> >
imBitext<TKN>::
add(vector<string> const& s1,
vector<string> const& s2,
vector<string> const& aln) const
{
throw "Not yet implemented";
}
// template<typename TKN>
// sptr<imBitext<TKN> >
// imBitext<TKN>::
// add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a)
// {
// boost::lock_guard<boost::mutex> guard(this->lock);
// sptr<imBitext<TKN> > ret(new imBitext<TKN>());
// vector<id_type> sids(1,this->myT1.size()-1);
// ret->myT1 = add(this->myT1,s1);
// ret->myT2 = add(this->myT2,s2);
// size_t v1size = this->V1.tsize();
// size_t v2size = this->V2.tsize();
// BOOST_FOREACH(TKN const& t, s1) { if (t->id() >= v1size) v1size = t->id() + 1; }
// BOOST_FOREACH(TKN const& t, s2) { if (t->id() >= v2size) v2size = t->id() + 1; }
// ret->myI1.reset(new imTSA<TKN>(*this->I1,ret->myT1,sids,v1size));
// ret->myI2.reset(new imTSA<TKN>(*this->I2,ret->myT2,sids,v2size));
// ostringstream abuf;
// BOOST_FOREACH(ushort x, a) binwrite(abuf,x);
// vector<char> foo(abuf.str().begin(),abuf.str().end());
// ret->myTx = add(this->myTx,foo);
// ret->T1 = ret->myT1;
// ret->T2 = ret->myT2;
// ret->Tx = ret->myTx;
// ret->I1 = ret->myI1;
// ret->I2 = ret->myI2;
// ret->V1 = this->V1;
// ret->V2 = this->V2;
// return ret;
// }
// template<typename TKN>
// imBitext<TKN>::
// imBitext()
// : Bitext<TKN>(new imTtrack<TKN>(),
// new imTtrack<TKN>(),
// new imTtrack<char>(),
// new TokenIndex(),
// new TokenIndex(),
// new imTSA<TKN>(),
// new imTSA<TKN>())
// {}
template<typename TKN>
void
imBitext<TKN>::
open(string const base, string const L1, string L2)
{
mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
t1.open(base+L1+".mct");
t2.open(base+L2+".mct");
tx.open(base+L1+"-"+L2+".mam");
cerr << "DADA" << endl;
this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
i1.open(base+L1+".sfa", this->T1);
i2.open(base+L2+".sfa", this->T2);
assert(this->T1->size() == this->T2->size());
}
template<typename Token>
bool
Bitext<Token>::
find_trg_phr_bounds(size_t const sid, size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
vector<uchar>* core_alignment, bool const flip) const
int & po_fwd, int & po_bwd,
vector<uchar>* core_alignment,
bitvector* full_alignment,
bool const flip) const
{
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
// a word on the core_alignment:
@ -795,10 +1077,18 @@ namespace Moses {
// it is up to the calling function to shift alignment points over for start positions
// of extracted phrases that start with a fringe word
bitvector forbidden((flip ? T1 : T2)->sntLen(sid));
size_t slen1 = (*T1).sntLen(sid);
size_t slen2 = (*T2).sntLen(sid);
if (full_alignment)
{
if (slen1*slen2 > full_alignment->size())
full_alignment->resize(slen1*slen2*2);
full_alignment->reset();
}
size_t src,trg;
size_t lft = forbidden.size();
size_t rgt = 0;
vector<vector<ushort> > aln((*T1).sntLen(sid));
vector<vector<ushort> > aln1(slen1),aln2(slen2);
char const* p = Tx->sntStart(sid);
char const* x = Tx->sntEnd(sid);
@ -814,11 +1104,24 @@ namespace Moses {
{
lft = min(lft,trg);
rgt = max(rgt,trg);
if (core_alignment)
}
if (core_alignment)
{
if (flip)
{
if (flip) aln[trg].push_back(src);
else aln[src].push_back(trg);
aln1[trg].push_back(src);
aln2[src].push_back(trg);
}
else
{
aln1[src].push_back(trg);
aln2[trg].push_back(src);
}
}
if (full_alignment)
{
if (flip) full_alignment->set(trg*slen2 + src);
else full_alignment->set(src*slen2 + trg);
}
}
@ -837,8 +1140,8 @@ namespace Moses {
{
for (size_t i = lft; i <= rgt; ++i)
{
sort(aln[i].begin(),aln[i].end());
BOOST_FOREACH(ushort x, aln[i])
sort(aln1[i].begin(),aln1[i].end());
BOOST_FOREACH(ushort x, aln1[i])
{
core_alignment->push_back(i-lft);
core_alignment->push_back(x-start);
@ -849,14 +1152,25 @@ namespace Moses {
{
for (size_t i = start; i < stop; ++i)
{
BOOST_FOREACH(ushort x, aln[i])
BOOST_FOREACH(ushort x, aln1[i])
{
core_alignment->push_back(i-start);
core_alignment->push_back(x-lft);
}
}
}
// now determine fwd and bwd phrase orientation
if (flip)
{
po_fwd = find_po_fwd(aln2,aln1,start,stop,s1,e2);
po_bwd = find_po_bwd(aln2,aln1,start,stop,s1,e2);
}
else
{
po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
}
#if 0
// if (e1 - s1 > 3)
{
@ -898,17 +1212,14 @@ namespace Moses {
template<typename Token>
sptr<pstats>
Bitext<Token>::
prep2(iter const& phrase, size_t const max_sample) const
prep2(iter const& phrase, size_t const max_sample) const
{
// boost::lock_guard<boost::mutex>(this->lock);
if (!ag)
{
// boost::lock_guard<boost::mutex>(this->lock);
if (!ag)
{
ag.reset(new agenda(*this));
ag->add_workers(20);
}
ag.reset(new agenda(*this));
// ag->add_workers(1);
ag->add_workers(20);
}
typedef boost::unordered_map<uint64_t,sptr<pstats> > pcache_t;
sptr<pstats> ret;
@ -928,7 +1239,7 @@ namespace Moses {
else ret = ag->add_job(phrase, max_sample);
return ret;
}
template<typename Token>
sptr<pstats>
Bitext<Token>::

View File

@ -11,6 +11,7 @@
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/foreach.hpp>
#include "tpt_tightindex.h"
#include "tpt_tokenindex.h"
@ -20,13 +21,17 @@
namespace ugdiss
{
using namespace std;
using namespace boost;
namespace bio=boost::iostreams;
//-----------------------------------------------------------------------
// template<typename TOKEN> class imBitext<TOKEN>;
//-----------------------------------------------------------------------
template<typename TOKEN>
class imTSA : public TSA<TOKEN>
{
typedef typename Ttrack<TOKEN>::Position cpos;
// friend class imBitext<TOKEN>;
public:
class tree_iterator;
friend class tree_iterator;
@ -35,7 +40,6 @@ namespace ugdiss
vector<cpos> sufa; // stores the actual array
vector<filepos_type> index; /* top-level index into regions in sufa
* (for faster access) */
private:
char const*
index_jump(char const* a, char const* z, float ratio) const;
@ -48,8 +52,14 @@ namespace ugdiss
public:
imTSA();
imTSA(Ttrack<TOKEN> const* c, bdBitset const& filt, ostream* log = NULL);
imTSA(shared_ptr<Ttrack<TOKEN> const> c,
bdBitset const* filt,
ostream* log = NULL);
imTSA(imTSA<TOKEN> const& prior,
shared_ptr<imTtrack<TOKEN> const> const& crp,
vector<id_type> const& newsids, size_t const vsize);
count_type
sntCnt(char const* p, char const * const q) const;
@ -78,6 +88,9 @@ namespace ugdiss
void
save_as_mm_tsa(string fname) const;
/// add a sentence to the database
// shared_ptr<imTSA<TOKEN> > add(vector<TOKEN> const& snt) const;
};
template<typename TOKEN>
@ -115,12 +128,11 @@ namespace ugdiss
imTSA<TOKEN>::
imTSA()
{
this->corpus = NULL;
this->indexSize = 0;
this->data = NULL;
this->indexSize = 0;
// this->data = NULL;
this->startArray = NULL;
this->endArray = NULL;
this->corpusSize=0;
this->endArray = NULL;
this->corpusSize = 0;
this->BitSetCachingThreshold=4096;
};
@ -128,11 +140,17 @@ namespace ugdiss
// specified in filter
template<typename TOKEN>
imTSA<TOKEN>::
imTSA(Ttrack<TOKEN> const* c, bdBitset const& filter, ostream* log)
imTSA(shared_ptr<Ttrack<TOKEN> const> c, bdBitset const* filter, ostream* log)
{
assert(c);
this->corpus = c;
bdBitset filter2;
if (!filter)
{
filter2.resize(c->size());
filter2.set();
filter = &filter2;
}
// In the first iteration over the corpus, we obtain word counts.
// They allow us to
// a. allocate the exact amount of memory we need
@ -160,9 +178,9 @@ namespace ugdiss
// Now dump all token positions into the right place in sufa
this->corpusSize = 0;
for (id_type sid = filter.find_first();
sid < filter.size();
sid = filter.find_next(sid))
for (id_type sid = filter->find_first();
sid < filter->size();
sid = filter->find_next(sid))
{
TOKEN const* k = c->sntStart(sid);
TOKEN const* const stop = c->sntEnd(sid);
@ -181,7 +199,7 @@ namespace ugdiss
// Now sort the array
if (log) *log << "sorting ...." << endl;
index.resize(wcnt.size()+1,0);
typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(c);
typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(c.get());
for (size_t i = 0; i < wcnt.size(); i++)
{
if (log && wcnt[i] > 5000)
@ -284,7 +302,7 @@ namespace ugdiss
getCounts(char const* p, char const* const q,
count_type& sids, count_type& raw) const
{
id_type sid; uint16_t off;
id_type sid; // uint16_t off;
bdBitset check(this->corpus->size());
cpos const* xp = reinterpret_cast<cpos const*>(p);
cpos const* xq = reinterpret_cast<cpos const*>(q);
@ -292,7 +310,7 @@ namespace ugdiss
for (;xp < xq;xp++)
{
sid = xp->sid;
off = xp->offset;
// off = xp->offset;
check.set(sid);
}
sids = check.count();
@ -323,8 +341,92 @@ namespace ugdiss
for (size_t i = 0; i < mmIndex.size(); i++)
numwrite(out,mmIndex[i]-mmIndex[0]);
out.seekp(0);
numwrite(out,idxStart);
numwrite(out,idxStart);
out.close();
}
template<typename TOKEN>
imTSA<TOKEN>::
imTSA(imTSA<TOKEN> const& prior,
shared_ptr<imTtrack<TOKEN> const> const& crp,
vector<id_type> const& newsids, size_t const vsize)
{
typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(crp.get());
// count how many tokens will be added to the TSA
// and index the new additions to the corpus
size_t newToks = 0;
BOOST_FOREACH(id_type sid, newsids)
newToks += crp->sntLen(sid);
vector<cpos> nidx(newToks); // new array entries
size_t n = 0;
BOOST_FOREACH(id_type sid, newsids)
{
for (size_t o = 0; o < (*crp)[sid].size(); ++o, ++n)
{ nidx[n].offset = o; nidx[n].sid = sid; }
}
sort(nidx.begin(),nidx.end(),sorter);
// create the new suffix array
this->numTokens = newToks + prior.sufa.size();
this->sufa.resize(this->numTokens);
this->startArray = reinterpret_cast<char const*>(&(*this->sufa.begin()));
this->endArray = reinterpret_cast<char const*>(&(*this->sufa.end()));
this->corpusSize = crp->size();
this->corpus = crp;
this->index.resize(vsize+1);
size_t i = 0;
typename vector<cpos>::iterator k = this->sufa.begin();
this->index[0] = 0;
for (size_t n = 0; n < nidx.size();)
{
id_type nid = crp->getToken(nidx[n])->id();
assert(nid >= i);
while (i < nid)
{
if (++i < prior.index.size() && prior.index[i-1] < prior.index[i])
{
k = copy(prior.sufa.begin() + prior.index[i-1],
prior.sufa.begin() + prior.index[i], k);
}
this->index[i] = k - this->sufa.begin();
}
if (++i < prior.index.size() && prior.index[i] > prior.index[i-1])
{
size_t j = prior.index[i-1];
while (j < prior.index[i] && n < nidx.size()
&& crp->getToken(nidx[n])->id() < i)
{
assert(k < this->sufa.end());
if (sorter(prior.sufa[j],nidx[n]))
*k++ = prior.sufa[j++];
else
*k++ = nidx[n++];
}
while (j < prior.index[i])
{
assert(k < this->sufa.end());
*k++ = prior.sufa[j++];
}
}
while (n < nidx.size() && this->corpus->getToken(nidx[n])->id() < i)
{
assert(k < this->sufa.end());
*k++ = nidx[n++];
}
this->index[i] = k - this->sufa.begin();
}
while (++i < this->index.size())
{
if (i < prior.index.size() && prior.index[i-1] < prior.index[i])
k = copy(prior.sufa.begin() + prior.index[i-1],
prior.sufa.begin() + prior.index[i], k);
this->index[i] = k - this->sufa.begin();
}
}
}
#endif

View File

@ -1,4 +1,4 @@
// -*- c++-mode -*-
// -*- c++ -*-
// In-memory corpus track
// (c) 2006-2012 Ulrich Germann.
@ -10,6 +10,7 @@
#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include "tpt_typedefs.h"
#include "tpt_tokenindex.h"
@ -17,24 +18,44 @@
#include "tpt_tokenindex.h"
// #include "ug_vocab.h"
// define the corpus buffer size (in sentences) and the increment size
// for adding additional sentences:
#define IMTTRACK_INCREMENT_SIZE 100000
#define IMTSA_INCREMENT_SIZE 1000000
namespace ugdiss
{
using namespace std;
using namespace boost;
namespace bio=boost::iostreams;
template<typename Token=id_type>
template<typename Token> class imTSA;
template<typename Token> class imTtrack;
template<typename TOKEN>
typename boost::shared_ptr<imTtrack<TOKEN> >
append(typename boost::shared_ptr<imTtrack<TOKEN> > const & crp, vector<TOKEN> const & snt);
template<typename Token>
class imTtrack : public Ttrack<Token>
{
private:
size_t numToks;
boost::shared_ptr<vector<vector<Token> > > myData; // pointer to corpus data
friend class imTSA<Token>;
friend
typename boost::shared_ptr<imTtrack<Token> >
append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
public:
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d);
imTtrack(istream& in, TokenIndex const& V, ostream* log);
imTtrack();
imTtrack(size_t reserve = 0);
// imTtrack(istream& in, Vocab& V);
/** return pointer to beginning of sentence */
Token const* sntStart(size_t sid) const;
@ -65,7 +86,7 @@ namespace ugdiss
{
assert(sid < size());
if ((*myData)[sid].size() == 0) return NULL;
return &(*myData)[sid].back();
return &(*myData)[sid].back()+1;
}
template<typename Token>
@ -76,7 +97,7 @@ namespace ugdiss
// we assume that myIndex has pointers to both the beginning of the
// first sentence and the end point of the last, so there's one more
// offset in the myIndex than there are sentences
return myData.size();
return myData->size();
}
template<typename Token>
@ -113,9 +134,10 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack()
imTtrack(size_t reserve)
{
myData.reset(new vector<vector<Token> >());
if (reserve) myData->reserve(reserve);
}
template<typename Token>
@ -123,8 +145,11 @@ namespace ugdiss
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
{
myData = d;
numToks = 0;
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
template<typename Token>
id_type
imTtrack<Token>::
@ -141,5 +166,27 @@ namespace ugdiss
return i;
}
/// add a sentence to the database
template<typename TOKEN>
shared_ptr<imTtrack<TOKEN> >
append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
{
shared_ptr<imTtrack<TOKEN> > ret;
if (!crp)
{
ret.reset(new imTtrack<TOKEN>());
ret->myData->reserve(IMTTRACK_INCREMENT_SIZE);
}
else if (crp->myData->capacity() == crp->size())
{
ret.reset(new imTtrack<TOKEN>());
ret->myData->reserve(crp->size() + IMTTRACK_INCREMENT_SIZE);
ret->myData->insert(ret->myData->end(),
crp->myData->begin(),crp->myData->end());
}
else ret = crp;
ret->myData->push_back(snt);
return ret;
}
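// Usage sketch (hypothetical caller, not part of this commit): append()
// returns a fresh track whenever the current buffer is full, so callers
// must always continue with the returned pointer.
template<typename TOKEN>
shared_ptr<imTtrack<TOKEN> >
add_sentences(shared_ptr<imTtrack<TOKEN> > trk,
vector<vector<TOKEN> > const& sents)
{
BOOST_FOREACH(vector<TOKEN> const& snt, sents)
trk = append(trk, snt); // reassign: trk may now be a new track
return trk;
}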
}
#endif

View File

@ -5,7 +5,7 @@
#ifndef __ug_lexical_phrase_scorer_h
#define __ug_lexical_phrase_scorer_h
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "tpt_tokenindex.h"
#include <string>
#include <boost/unordered_map.hpp>
@ -19,9 +19,9 @@ namespace ugdiss
class
LexicalPhraseScorer2
{
public:
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
table_t COOC;
public:
void open(string const& fname);
template<typename someint>
@ -96,8 +96,8 @@ namespace ugdiss
{
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
// if (!COOC[s][t]) cout << s << " " << t << endl;
assert(COOC[s][t]);
return float(COOC[s][t])/COOC.m1(s);
// assert(COOC[s][t]);
return float(COOC[s][t]+1)/(COOC.m1(s)+1);
}
template<typename TKN>
@ -106,8 +106,8 @@ namespace ugdiss
plup_bwd(id_type const s, id_type const t) const
{
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
assert(COOC[s][t]);
return float(COOC[s][t])/COOC.m2(t);
// assert(COOC[s][t]);
return float(COOC[s][t]+1)/(COOC.m2(t)+1);
}
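// (both lookups now apply add-one smoothing, (count+1)/(marginal+1), so an
// unseen word pair gets a small non-zero probability instead of tripping
// the former assert)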
template<typename TKN>

View File

@ -44,7 +44,7 @@ namespace ugdiss
public:
mmTSA();
mmTSA(string fname, Ttrack<TOKEN> const* c);
void open(string fname, Ttrack<TOKEN> const* c);
void open(string fname, typename boost::shared_ptr<Ttrack<TOKEN> const> c);
count_type
sntCnt(char const* p, char const * const q) const;
@ -100,7 +100,6 @@ namespace ugdiss
mmTSA<TOKEN>::
mmTSA()
{
this->corpus = NULL;
this->startArray = NULL;
this->endArray = NULL;
this->BitSetCachingThreshold=4096;
@ -120,7 +119,7 @@ namespace ugdiss
template<typename TOKEN>
void
mmTSA<TOKEN>::
open(string fname, Ttrack<TOKEN> const* c)
open(string fname, typename boost::shared_ptr<Ttrack<TOKEN> const> c)
{
this->bsc.reset(new BitSetCache<TSA<TOKEN> >(this));
if (access(fname.c_str(),F_OK))

View File

@ -34,6 +34,8 @@ namespace ugdiss
typedef TKN Token;
private:
id_type numSent;
id_type numWords;
bio::mapped_file_source file;
Token const* data; // pointer to first word of first sentence
id_type const* index; /* pointer to index (change data type for corpora

View File

@ -20,9 +20,9 @@
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"

View File

@ -1,6 +1,6 @@
#include "ug_tsa_array_entry.h"
#include "ug_ttrack_position.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
// (c) 2007-2010 Ulrich Germann

View File

@ -8,9 +8,10 @@
#include <string>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/shared_ptr.hpp>
#include "tpt_tokenindex.h"
#include "ug_ttrack_base.h"
#include "ug_im_ttrack.h"
#include "ug_corpus_token.h"
#include "ug_tsa_tree_iterator.h"
#include "ug_tsa_array_entry.h"
@ -44,7 +45,6 @@ namespace ugdiss
template<typename TKN>
class TSA
{
public:
virtual ~TSA() {};
typedef TSA_tree_iterator<TKN> tree_iterator;
@ -62,9 +62,9 @@ namespace ugdiss
friend class TSA_tree_iterator<TKN>;
protected:
Ttrack<TKN> const* corpus; // pointer to the underlying corpus
char const* startArray; // beginning ...
char const* endArray; // ... and end ...
shared_ptr<Ttrack<TKN> const> corpus; // pointer to the underlying corpus
char const* startArray; // beginning ...
char const* endArray; // ... and end ...
// of memory block storing the actual TSA
size_t corpusSize;
@ -737,7 +737,7 @@ namespace ugdiss
TSA<TKN>::
getCorpus() const
{
return corpus;
return corpus.get();
}
//---------------------------------------------------------------------------

View File

@ -19,21 +19,25 @@ namespace ugdiss
template<typename T>
void display(T const* x, string label)
{
cout << label << ":"; for (;x;x=next(x)) cout << " " << x->lemma; cout << endl;
cout << label << ":";
for (;x;x=next(x)) cout << " " << x->lemma;
cout << endl;
}
#endif
template<typename T> class TSA;
// CLASS DEFINITION
// The TSA_tree_iterator allows traversal of a Token Sequence Array as if it was a trie.
// The TSA_tree_iterator allows traversal of a Token Sequence Array
// as if it was a trie.
//
// down(): go to first child
// over(): go to next sibling
// up(): go to parent
// extend(id): go to a specific child node
// all four functions return true if successful, false otherwise
// lower_bound() and upper_bound() give the range of entries in the array covered by the
// "virtual trie node".
// lower_bound() and upper_bound() give the range of entries in the
// array covered by the "virtual trie node".
template<typename TKN>
class
TSA_tree_iterator
@ -50,12 +54,16 @@ namespace ugdiss
virtual ~TSA_tree_iterator() {};
TSA<Token> const* root;
// TO BE DONE: make the pointer private and add a const function to return the pointer
// TO BE DONE: make the pointer private and add a const function
// to return the pointer
// TSA_tree_iterator(TSA_tree_iterator const& other);
TSA_tree_iterator(TSA<Token> const* s);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
// TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, Token const* kend);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
Token const* kend,
bool full_match_only=true);
// TSA_tree_iterator(TSA<Token> const* s,
// TokenIndex const& V,
// string const& key);
@ -354,21 +362,24 @@ namespace ugdiss
// ---------------------------------------------------------------------------
#endif
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, Token const* kend)
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
Token const* kend, bool full_match_only)
: root(s)
{
for (;kstart != kend; kstart = kstart->next())
if (!extend(*kstart))
break;
if (kstart != kend)
if (full_match_only && kstart != kend)
{
lower.clear();
upper.clear();
}
};
#endif
// ---------------------------------------------------------------------------
// EXTEND
// ---------------------------------------------------------------------------
@ -449,6 +460,7 @@ namespace ugdiss
TSA_tree_iterator<Token>::
getPid(int p) const
{
if (this->size() == 0) return 0;
if (p < 0) p += upper.size();
char const* lb = lower_bound(p);
char const* ub = upper_bound(p);
@ -845,8 +857,9 @@ namespace ugdiss
size_t m=0; // number of samples selected so far
typename Token::ArrayEntry I(lower.at(level));
char const* stop = upper.at(level);
while (m < N && I.next < stop)
while (m < N && (I.next) < stop)
{
root->readEntry(I.next,I);
@ -860,9 +873,9 @@ namespace ugdiss
}
}
ret->resize(m);
return ret;
}
} // end of namespace ugdiss
#endif

View File

@ -28,10 +28,6 @@ namespace ugdiss
template<typename TKN=id_type>
class Ttrack
{
protected:
id_type numSent;
id_type numWords;
public:
virtual ~Ttrack() {};
@ -92,13 +88,15 @@ namespace ugdiss
* Currently only defined for Ttrack<id_type> */
string str(id_type sid, TokenIndex const& T) const;
string pid2str(TokenIndex const* V, uint64_t pid) const;
// /** @return string representation of sentence /sid/
// * Currently only defined for Ttrack<id_type> */
// string str(id_type sid, Vocab const& V) const;
/** counts the tokens in the corpus; used for example in the construction of
* token sequence arrays */
count_type count_tokens(vector<count_type>& cnt, bdBitset const& filter,
count_type count_tokens(vector<count_type>& cnt, bdBitset const* filter,
int lengthCutoff=0, ostream* log=NULL) const;
// static id_type toID(TKN const& t);
@ -145,16 +143,27 @@ namespace ugdiss
template<typename TKN>
count_type
Ttrack<TKN>::
count_tokens(vector<count_type>& cnt, bdBitset const& filter,
int lengthCutoff, ostream* log) const
count_tokens(vector<count_type>& cnt, bdBitset const* filter,
int lengthCutoff, ostream* log) const
{
bdBitset filter2;
if (!filter)
{
filter2.resize(this->size());
filter2.set();
filter = &filter2;
}
cnt.clear();
cnt.reserve(500000);
count_type totalCount=0;
int64_t expectedTotal=numTokens();
for (size_t sid = filter.find_first();
sid < filter.size();
sid = filter.find_next(sid))
int64_t expectedTotal=0;
for (size_t sid = 0; sid < this->size(); ++sid)
expectedTotal += this->sntLen(sid);
for (size_t sid = filter->find_first();
sid < filter->size();
sid = filter->find_next(sid))
{
TKN const* k = sntStart(sid);
TKN const* const stop = sntEnd(sid);
@ -177,7 +186,7 @@ namespace ugdiss
}
}
}
if (this->size() == filter.count())
if (this->size() == filter->count())
{
if (totalCount != expectedTotal)
cerr << "OOPS: expected " << expectedTotal
@ -344,5 +353,36 @@ namespace ugdiss
return Position(this->size(),0);
}
template<typename TKN>
string
Ttrack<TKN>::
pid2str(TokenIndex const* V, uint64_t pid) const
{
uint32_t len = pid % (1<<16);
pid >>= 16;
uint32_t off = pid % (1<<16);
uint32_t sid = pid>>16;
ostringstream buf;
TKN const* t = sntStart(sid) + off;
TKN const* stop = t + len;
if (V)
{
while (t < stop)
{
buf << (*V)[t->id()];
if ((t = t->next()) != stop) buf << " ";
}
}
else
{
while (t < stop)
{
buf << t->id();
if ((t = t->next()) != stop) buf << " ";
}
}
return buf.str();
}
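// The decoding above implies a packed 64-bit layout: sid in the high 32
// bits, offset in the next 16, length in the low 16. A minimal sketch of
// the matching encoder (hypothetical helper; the commit's own encoder is
// not shown in this diff):
inline uint64_t
make_pid(uint64_t sid, uint64_t off, uint64_t len)
{
return (sid << 32) | ((off & 0xffff) << 16) | (len & 0xffff);
}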
}
#endif

View File

@ -23,17 +23,19 @@ namespace Moses
params[t.substr(i,j)] = t.substr(k);
}
}
#if 0
Mmsapt::
Mmsapt(string const& description, string const& line)
: PhraseDictionary(description,line)
: PhraseDictionary(description,line), ofactor(1,0)
{
this->init(line);
}
#endif
Mmsapt::
Mmsapt(string const& line)
: PhraseDictionary("Mmsapt",line)
// : PhraseDictionary("Mmsapt",line), ofactor(1,0)
: PhraseDictionary(line), ofactor(1,0)
{
this->init(line);
}
@ -59,80 +61,409 @@ namespace Moses
// num_features = 0;
m = param.find("ifactor");
input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
poolCounts = true;
}
void
Mmsapt::
Load()
{
bt.open(bname, L1, L2);
btfix.open(bname, L1, L2);
size_t num_feats;
num_feats = calc_pfwd.init(0,lbop_parameter);
num_feats = calc_pbwd.init(num_feats,lbop_parameter);
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
num_feats = apply_pp.init(num_feats);
assert (num_feats == this->m_numScoreComponents);
// TO DO: should we use different lbop parameters
// for the relative-frequency based features?
num_feats = calc_pfwd_fix.init(0,lbop_parameter);
num_feats = calc_pbwd_fix.init(num_feats,lbop_parameter);
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
num_feats = apply_pp.init(num_feats);
if (num_feats < this->m_numScoreComponents)
{
poolCounts = false;
num_feats = calc_pfwd_dyn.init(num_feats,lbop_parameter);
num_feats = calc_pbwd_dyn.init(num_feats,lbop_parameter);
}
btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2));
if (num_feats != this->m_numScoreComponents)
{
ostringstream buf;
buf << "At " << __FILE__ << ":" << __LINE__
<< ": number of feature values provided by Phrase table"
<< " does not match number specified in Moses config file!";
throw buf.str();
}
// cerr << "MMSAPT provides " << num_feats << " features at "
// << __FILE__ << ":" << __LINE__ << endl;
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
typedef LexicalPhraseScorer2<Token>::table_t::Cell cell_t;
wlex21.resize(COOC.numCols);
for (size_t r = 0; r < COOC.numRows; ++r)
for (cell_t const* c = COOC[r].start; c < COOC[r].stop; ++c)
wlex21[c->id].push_back(r);
COOCraw.open(bname + L1 + "-" + L2 + ".coc");
}
void
Mmsapt::
add(string const& s1, string const& s2, string const& a)
{
vector<string> S1(1,s1);
vector<string> S2(1,s2);
vector<string> ALN(1,a);
boost::lock_guard<boost::mutex> guard(this->lock);
btdyn = btdyn->add(S1,S2,ALN);
}
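// Usage sketch (hypothetical; the alignment string is assumed to use the
// symal-style "srcpos-trgpos" format consumed elsewhere in this code):
//
//   Mmsapt pt(line); // line = the Moses config line for this feature
//   pt.Load();
//   pt.add("ein kleines haus", "a small house", "0-0 1-1 2-2");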
TargetPhrase*
Mmsapt::
createTargetPhrase(Phrase const& src,
Bitext<Token> const& bt,
PhrasePair const& pp) const
{
Word w; uint32_t sid,off,len;
TargetPhrase* tp = new TargetPhrase();
parse_pid(pp.p2, sid, off, len);
Token const* x = bt.T2->sntStart(sid) + off;
for (uint32_t k = 0; k < len; ++k)
{
StringPiece wrd = (*bt.V2)[x[k].id()];
w.CreateFromString(Output,ofactor,wrd,false);
tp->AddWord(w);
}
tp->GetScoreBreakdown().Assign(this, pp.fvals);
tp->Evaluate(src);
return tp;
}
// process phrase stats from a single parallel corpus
void
Mmsapt::
process_pstats
(Phrase const& src,
uint64_t const pid1,
pstats const& stats,
Bitext<Token> const & bt,
TargetPhraseCollection* tpcoll
) const
{
PhrasePair pp;
pp.init(pid1, stats, this->m_numScoreComponents);
apply_pp(bt,pp);
boost::unordered_map<uint64_t,jstats>::const_iterator t;
for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
{
pp.update(t->first,t->second);
calc_lex(bt,pp);
calc_pfwd_fix(bt,pp);
calc_pbwd_fix(bt,pp);
tpcoll->Add(createTargetPhrase(src,bt,pp));
}
}
// process phrase stats from a single parallel corpus
bool
Mmsapt::
pool_pstats(Phrase const& src,
uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll) const
{
PhrasePair pp;
if (statsa && statsb)
pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
else if (statsa)
pp.init(pid1b, *statsa, this->m_numScoreComponents);
else if (statsb)
pp.init(pid1b, *statsb, this->m_numScoreComponents);
else return false; // throw "no stats for pooling available!";
apply_pp(bta,pp);
boost::unordered_map<uint64_t,jstats>::const_iterator b;
boost::unordered_map<uint64_t,jstats>::iterator a;
if (statsb)
{
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
{
uint32_t sid,off,len;
parse_pid(b->first, sid, off, len);
Token const* x = bta.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
if (m.size() == len)
{
if (statsa && ((a = statsa->trg.find(m.getPid()))
!= statsa->trg.end()))
{
pp.update(b->first,a->second,b->second);
a->second.invalidate();
}
else
pp.update(b->first,m.approxOccurrenceCount(),
b->second);
}
else pp.update(b->first,b->second);
calc_lex(btb,pp);
calc_pfwd_fix(btb,pp);
calc_pbwd_fix(btb,pp);
tpcoll->Add(createTargetPhrase(src,btb,pp));
}
}
if (!statsa) return statsb != NULL;
for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
{
uint32_t sid,off,len;
if (!a->second.valid()) continue;
parse_pid(a->first, sid, off, len);
if (btb.T2)
{
Token const* x = btb.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len);
if (m.size() == len)
pp.update(a->first,m.approxOccurrenceCount(),a->second);
else
pp.update(a->first,a->second);
}
else
pp.update(a->first,a->second);
calc_lex(bta,pp);
calc_pfwd_fix(bta,pp);
calc_pbwd_fix(bta,pp);
tpcoll->Add(createTargetPhrase(src,bta,pp));
}
return true;
}
// this is not the most efficient way of phrase lookup!
// process phrase stats from a single parallel corpus
bool
Mmsapt::
combine_pstats
(Phrase const& src,
uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll
) const
{
PhrasePair ppfix,ppdyn,pool;
Word w;
if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
boost::unordered_map<uint64_t,jstats>::const_iterator b;
boost::unordered_map<uint64_t,jstats>::iterator a;
if (statsb)
{
pool.init(pid1b,*statsb,0);
apply_pp(btb,ppdyn);
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
{
ppdyn.update(b->first,b->second);
calc_pfwd_dyn(btb,ppdyn);
calc_pbwd_dyn(btb,ppdyn);
calc_lex(btb,ppdyn);
uint32_t sid,off,len;
parse_pid(b->first, sid, off, len);
Token const* x = bta.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
if (m.size() && statsa &&
((a = statsa->trg.find(m.getPid()))
!= statsa->trg.end()))
{
ppfix.update(a->first,a->second);
calc_pfwd_fix(bta,ppfix,&ppdyn.fvals);
calc_pbwd_fix(btb,ppfix,&ppdyn.fvals);
a->second.invalidate();
}
else
{
if (m.size())
pool.update(b->first,m.approxOccurrenceCount(),
b->second);
else
pool.update(b->first,b->second);
calc_pfwd_fix(btb,pool,&ppdyn.fvals);
calc_pbwd_fix(btb,pool,&ppdyn.fvals);
}
tpcoll->Add(createTargetPhrase(src,btb,ppdyn));
}
}
if (statsa)
{
pool.init(pid1a,*statsa,0);
apply_pp(bta,ppfix);
for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
{
if (!a->second.valid()) continue; // done above
ppfix.update(a->first,a->second);
calc_pfwd_fix(bta,ppfix);
calc_pbwd_fix(bta,ppfix);
calc_lex(bta,ppfix);
uint32_t sid,off,len;
parse_pid(a->first, sid, off, len);
Token const* x = btb.T2->sntStart(sid) + off;
TSA<Token>::tree_iterator m(btb.I2.get(),x,x+len);
if (m.size())
pool.update(a->first,m.approxOccurrenceCount(),a->second);
else
pool.update(a->first,a->second);
calc_pfwd_dyn(bta,pool,&ppfix.fvals);
calc_pbwd_dyn(bta,pool,&ppfix.fvals);
}
tpcoll->Add(createTargetPhrase(src,bta,ppfix));
}
return (statsa || statsb);
}
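// In short: pool_pstats() merges the counts of both bitexts into a single
// set of relative-frequency features (calc_*_fix only), while
// combine_pstats() keeps separate fixed and dynamic estimates, writing the
// counterpart's scores into the same feature vector via the dest argument.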
// // phrase statistics combination treating the two knowledge
// // sources separately with backoff to pooling when only one
// // of the two knowledge sources contains the phrase pair in
// // question
// void
// Mmsapt::
// process_pstats(uint64_t const mypid1,
// uint64_t const otpid1,
// pstats const& mystats, // my phrase stats
// pstats const* otstats, // other phrase stats
// Bitext<Token> const & mybt, // my bitext
// Bitext<Token> const * otbt, // other bitext
// PhraseScorer<Token> const& mypfwd,
// PhraseScorer<Token> const& mypbwd,
// PhraseScorer<Token> const* otpfwd,
// PhraseScorer<Token> const* otpbwd,
// TargetPhraseCollection* tpcoll)
// {
// boost::unordered_map<uint64_t,jstats>::const_iterator t;
// vector<FactorType> ofact(1,0);
// PhrasePair mypp,otpp,combo;
// mypp.init(mypid1, mystats, this->m_numScoreComponents);
// if (otstats)
// {
// otpp.init(otpid1, *otstats, 0);
// combo.init(otpid1, mystats, *otstats, 0);
// }
// else combo = mypp;
// for (t = mystats.trg.begin(); t != mystats.trg.end(); ++t)
// {
// if (!t->second.valid()) continue;
// // we dealt with this phrase pair already;
// // see j->second.invalidate() below;
// uint32_t sid,off,len; parse_pid(t->first,sid,off,len);
// mypp.update(t->first,t->second);
// apply_pp(mybt,mypp);
// calc_lex (mybt,mypp);
// mypfwd(mybt,mypp);
// mypbwd(mybt,mypp);
// if (otbt) // it's a dynamic phrase table
// {
// assert(otpfwd);
// assert(otpbwd);
// boost::unordered_map<uint64_t,jstats>::iterator j;
// // look up the current target phrase in the other bitext
// Token const* x = mybt.T2->sntStart(sid) + off;
// TSA<TOKEN>::tree_iterator m(otbt->I2.get(),x,x+len);
// if (otstats // source phrase exists in other bitext
// && m.size() // target phrase exists in other bitext
// && ((j = otstats->trg.find(m.getPid()))
// != otstats->trg.end())) // phrase pair found in other bitext
// {
// otpp.update(j->first,j->second);
// j->second.invalidate(); // mark the phrase pair as seen
// otpfwd(*otbt,otpp,&mypp.fvals);
// otpbwd(*otbt,otpp,&mypp.fvals);
// }
// else
// {
// if (m.size()) // target phrase seen in other bitext, but not the phrase pair
// combo.update(t->first,m.approxOccurrenceCount(),t->second);
// else
// combo.update(t->first,t->second);
// (*otpfwd)(mybt,combo,&mypp.fvals);
// (*otpbwd)(mybt,combo,&mypp.fvals);
// }
// }
// // now add the phrase pair to the TargetPhraseCollection:
// TargetPhrase* tp = new TargetPhrase();
// for (size_t k = off; k < stop; ++k)
// {
// StringPiece wrd = (*mybt.V2)[x[k].id()];
// Word w; w.CreateFromString(Output,ofact,wrd,false);
// tp->AddWord(w);
// }
// tp->GetScoreBreakdown().Assign(this,mypp.fvals);
// tp->Evaluate(src);
// tpcoll->Add(tp);
// }
// }
// This is not the most efficient way of phrase lookup!
TargetPhraseCollection const*
Mmsapt::
GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
TSA<Token>::tree_iterator m(bt.I1.get());
TargetPhraseCollection* ret = new TargetPhraseCollection();
// Reserve a local copy of the dynamic bitext in its current form. /btdyn/
// is set to a new copy of the dynamic bitext every time a sentence pair
// is added. /dyn/ keeps the old bitext around as long as we need it.
sptr<imBitext<Token> > dyn;
{ // braces are needed for scoping mutex lock guard!
boost::lock_guard<boost::mutex> guard(this->lock);
dyn = btdyn;
}
vector<id_type> sphrase(src.GetSize());
for (size_t i = 0; i < src.GetSize(); ++i)
{
Factor const* f = src.GetFactor(i,input_factor);
id_type wid = (*bt.V1)[f->ToString()];
// cout << (*bt.V1)[wid] << " ";
if (!m.extend(wid)) break;
id_type wid = (*btfix.V1)[f->ToString()];
sphrase[i] = wid;
}
#if 0
cout << endl;
Token const* sphrase = m.getToken(0);
for (size_t i = 0; i < m.size(); ++i)
cout << (*bt.V1)[sphrase[i].id()] << " ";
cout << endl;
#endif
sptr<pstats> s;
if (m.size() < src.GetSize()) return NULL;
{
boost::lock_guard<boost::mutex> guard(this->lock);
s = bt.lookup(m);
}
PhrasePair pp; pp.init(m.getPid(), *s, this->m_numScoreComponents);
TargetPhraseCollection* ret = new TargetPhraseCollection();
vector<FactorType> ofact(1,0);
boost::unordered_map<uint64_t,jstats>::const_iterator t;
for (t = s->trg.begin(); t != s->trg.end(); ++t)
TSA<Token>::tree_iterator mfix(btfix.I1.get()), mdyn(dyn->I1.get());
for (size_t i = 0; mfix.size() == i && i < sphrase.size(); ++i)
mfix.extend(sphrase[i]);
if (dyn->I1.get())
{
pp.update(t->first,t->second);
calc_pfwd(bt,pp);
calc_pbwd(bt,pp);
calc_lex (bt,pp);
apply_pp (bt,pp);
uint32_t sid,off,len;
parse_pid(t->first,sid,off,len);
size_t stop = off + len;
Token const* x = bt.T2->sntStart(sid);
TargetPhrase* tp = new TargetPhrase();
for (size_t k = off; k < stop; ++k)
{
StringPiece wrd = (*bt.V2)[x[k].id()];
Word w; w.CreateFromString(Output,ofact,wrd,false);
tp->AddWord(w);
}
tp->GetScoreBreakdown().Assign(this,pp.fvals);
tp->Evaluate(src);
ret->Add(tp);
for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
mdyn.extend(sphrase[i]);
}
sptr<pstats> sfix,sdyn;
if (mfix.size() == sphrase.size())
{
// do we need this lock here?
// Is it used here to control the total number of running threads???
boost::lock_guard<boost::mutex> guard(this->lock);
sfix = btfix.lookup(mfix);
}
if (mdyn.size() == sphrase.size())
sdyn = dyn->lookup(mdyn);
if (poolCounts)
{
if (!pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
return NULL;
}
else if (!combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
mdyn.getPid(),sdyn.get(),*dyn,ret))
return NULL;
ret->NthElement(m_tableLimit);
#if 0
sort(ret->begin(), ret->end(), CompareTargetPhrase());
@ -150,7 +481,16 @@ namespace Moses
Mmsapt::
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &)
{
throw "CreateRuleLookupManager is currently not supported in Moses!";
throw "CreateRuleLookupManager is currently not supported in Mmsapt!";
}
template<typename Token>
void
fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest)
{
istringstream buf(line); string w;
while (buf>>w) dest.push_back(Token(V[w]));
}
}

View File

@ -5,18 +5,19 @@
#include <boost/thread.hpp>
#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"
#include "moses/TypeDef.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "moses/mm/ug_mm_ttrack.h"
#include "moses/mm/ug_mm_tsa.h"
#include "moses/mm/tpt_tokenindex.h"
#include "moses/mm/ug_corpus_token.h"
#include "moses/mm/ug_typedefs.h"
#include "moses/mm/tpt_pickler.h"
#include "moses/mm/ug_bitext.h"
#include "moses/mm/ug_lexical_phrase_scorer2.h"
#include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
#include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
#include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
@ -25,20 +26,32 @@
#include "moses/TargetPhraseCollection.h"
#include <map>
#include "PhraseDictionary.h"
#include "moses/TranslationModel/PhraseDictionary.h"
// TO DO:
// - make lexical phrase scorer take additions to the "dynamic overlay" into account
// - switch to pool of sapts, where each sapt has its own provenance feature
// RESEARCH QUESTION: is this more effective than having multiple phrase tables,
// each with its own set of features?
using namespace std;
namespace Moses
{
using namespace bitext;
class Mmsapt : public PhraseDictionary
class Mmsapt
#ifndef NO_MOSES
: public PhraseDictionary
#endif
{
friend class Alignment;
public:
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
mmbitext bt;
// string description;
typedef imBitext<Token> imbitext;
typedef TSA<Token> tsa;
private:
mmbitext btfix;
sptr<imbitext> btdyn;
string bname;
string L1;
string L2;
@ -48,25 +61,84 @@ namespace Moses
size_t input_factor;
size_t output_factor; // we can actually return entire Tokens!
// built-in feature functions
PScorePfwd<Token> calc_pfwd;
PScorePbwd<Token> calc_pbwd;
PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually
PScorePP<Token> apply_pp; // apply phrase penalty
void init(string const& line);
mutable boost::mutex lock;
bool poolCounts;
vector<FactorType> ofactor;
// phrase table feature weights for alignment:
vector<float> feature_weights;
vector<vector<id_type> > wlex21;
// word translation lexicon (without counts, get these from calc_lex.COOC)
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> mm2dtable_t;
mm2dtable_t COOCraw;
TargetPhrase*
createTargetPhrase
(Phrase const& src,
Bitext<Token> const& bt,
bitext::PhrasePair const& pp
) const;
void
process_pstats
(Phrase const& src,
uint64_t const pid1,
pstats const& stats,
Bitext<Token> const & bt,
TargetPhraseCollection* tpcoll
) const;
bool
pool_pstats
(Phrase const& src,
uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll
) const;
bool
combine_pstats
(Phrase const& src,
uint64_t const pid1a,
pstats * statsa,
Bitext<Token> const & bta,
uint64_t const pid1b,
pstats const* statsb,
Bitext<Token> const & btb,
TargetPhraseCollection* tpcoll
) const;
public:
Mmsapt(string const& description, string const& line);
// Mmsapt(string const& description, string const& line);
Mmsapt(string const& line);
void
Load();
#ifndef NO_MOSES
TargetPhraseCollection const*
GetTargetPhraseCollectionLEGACY(const Phrase& src) const;
//! Create a sentence-specific manager for SCFG rule lookup.
ChartRuleLookupManager*
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &);
#endif
void add(string const& s1, string const& s2, string const& a);
// align two new sentences
sptr<vector<int> >
align(string const& src, string const& trg) const;
void setWeights(vector<float> const& w);
private:
};
} // end namespace

View File

@ -0,0 +1,334 @@
#include "mmsapt.h"
namespace Moses
{
using namespace bitext;
using namespace std;
using namespace boost;
struct PPgreater
{
bool operator()(PhrasePair const& a, PhrasePair const& b)
{
return a.score > b.score;
}
};
void
Mmsapt::
setWeights(vector<float> const & w)
{
assert(w.size() == this->m_numScoreComponents);
this->feature_weights = w;
}
struct PhraseAlnHyp
{
PhrasePair pp;
ushort s1,e1,s2,e2; // start and end positions
int prev; // preceding alignment hypothesis
float score;
bitvector scov; // source coverage
PhraseAlnHyp(PhrasePair const& ppx, int slen,
pair<uint32_t,uint32_t> const& sspan,
pair<uint32_t,uint32_t> const& tspan)
: pp(ppx), prev(-1), score(ppx.score), scov(slen)
{
s1 = sspan.first; e1 = sspan.second;
s2 = tspan.first; e2 = tspan.second;
for (size_t i = s1; i < e1; ++i)
scov.set(i);
}
bool operator<(PhraseAlnHyp const& other) const
{
return this->score < other.score;
}
bool operator>(PhraseAlnHyp const& other) const
{
return this->score > other.score;
}
PhraseOrientation
po_bwd(PhraseAlnHyp const* prev) const
{
if (s2 == 0) return po_first;
assert(prev);
assert(prev->e2 <= s2);
if (prev->e2 < s2) return po_other;
if (prev->e1 == s1) return po_mono;
if (prev->e1 < s1) return po_jfwd;
if (prev->s1 == e1) return po_swap;
if (prev->s1 > e1) return po_jbwd;
return po_other;
}
PhraseOrientation
po_fwd(PhraseAlnHyp const* next) const
{
if (!next) return po_last;
assert(next->s2 >= e2);
if (next->s2 < e2) return po_other;
if (next->e1 == s1) return po_swap;
if (next->e1 < s1) return po_jbwd;
if (next->s1 == e1) return po_mono;
if (next->s1 > e1) return po_jfwd;
return po_other;
}
float
dprob_fwd(PhraseAlnHyp const& next)
{
return pp.dfwd[po_fwd(&next)];
}
float
dprob_bwd(PhraseAlnHyp const& prev)
{
return pp.dbwd[po_bwd(&prev)];
}
};
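// (po_fwd/po_bwd above mirror the corpus-level find_po_* classifiers, but
// operate on the spans of two adjacent alignment hypotheses; dprob_fwd and
// dprob_bwd then look the resulting orientation up in the phrase pair's
// distortion distributions dfwd/dbwd)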
class Alignment
{
typedef L2R_Token<SimpleWordId> Token;
typedef TSA<Token> tsa;
typedef pair<uint32_t, uint32_t> span;
typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
typedef boost::unordered_map<uint64_t,jstats> jStatsTable;
Mmsapt const& PT;
vector<id_type> s,t;
pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
pid2span_t spid2span,tpid2span;
vector<vector<sptr<pstats> > > spstats;
vector<PhrasePair> PP;
// position-independent phrase pair info
public:
vector<PhraseAlnHyp> PAH;
vector<vector<int> > tpos2ahyp;
// maps from target start positions to PhraseAlnHyps starting at
// that position
sptr<pstats> getPstats(span const& sspan);
void fill_tspan_maps();
void fill_sspan_maps();
public:
Alignment(Mmsapt const& pt, string const& src, string const& trg);
void show(ostream& out);
void show(ostream& out, PhraseAlnHyp const& ah);
};
void
Alignment::
show(ostream& out, PhraseAlnHyp const& ah)
{
LexicalPhraseScorer2<Token>::table_t const&
COOCjnt = PT.calc_lex.scorer.COOC;
out << setw(10) << exp(ah.score) << " "
<< PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
<< " <=> "
<< PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
vector<uchar> const& a = ah.pp.aln;
// BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
for (size_t u = 0; u+1 < a.size(); u += 2)
out << " " << int(a[u+1]) << "-" << int(a[u]);
if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
<< "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
<< "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
out << endl;
// float const* ofwdj = ah.pp.dfwd;
// float const* obwdj = ah.pp.dbwd;
// uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
// uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
// out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
// << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
// << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
// << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
// << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
// << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
// << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
// << "]" << endl
// << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
// << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
// << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
// << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
// << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
// << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
// << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
// << "]" << endl;
}
void
Alignment::
show(ostream& out)
{
// show what we have so far ...
for (size_t s2 = 0; s2 < t.size(); ++s2)
{
VectorIndexSorter<PhraseAlnHyp> foo(PAH);
sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
show(out,PAH[tpos2ahyp[s2][h]]);
}
}
sptr<pstats>
Alignment::
getPstats(span const& sspan)
{
size_t k = sspan.second - sspan.first - 1;
if (k < spstats[sspan.first].size())
return spstats[sspan.first][k];
else return sptr<pstats>();
}
void
Alignment::
fill_tspan_maps()
{
tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
for (size_t i = 0; i < t.size(); ++i)
{
tsa::tree_iterator m(PT.btfix.I2.get());
for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
{
uint64_t pid = m.getPid();
tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
tspan2pid[i][k] = pid;
}
}
}
void
Alignment::
fill_sspan_maps()
{
sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
spstats.resize(s.size());
for (size_t i = 0; i < s.size(); ++i)
{
tsa::tree_iterator m(PT.btfix.I1.get());
for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
{
uint64_t pid = m.getPid();
sspan2pid[i][k] = pid;
pid2span_t::iterator p = spid2span.find(pid);
if (p != spid2span.end())
{
int x = p->second[0].first;
int y = p->second[0].second-1;
spstats[i].push_back(spstats[x][y-x]);
}
else
{
spstats[i].push_back(PT.btfix.lookup(m));
cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
<< spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
<< endl;
}
spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
}
}
}
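Note the caching in the loop above: when the same phrase id recurs at a later start position, the previously created pstats pointer is reused, so each distinct source phrase is sampled at most once. Below is a minimal stand-alone sketch of that memoization pattern, with expensive_lookup standing in for Bitext::lookup; all names here are illustrative, not Moses API.

#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
#include <iostream>
#include <stdint.h>

struct Stats { int good; };

// placeholder for the expensive sampling step (Bitext::lookup in the code above)
boost::shared_ptr<Stats> expensive_lookup(uint64_t pid)
{ boost::shared_ptr<Stats> s(new Stats); s->good = int(pid); return s; }

int main()
{
  boost::unordered_map<uint64_t, boost::shared_ptr<Stats> > cache;
  uint64_t pids[] = { 42, 7, 42 };
  for (int i = 0; i < 3; ++i) {
    boost::shared_ptr<Stats>& s = cache[pids[i]];
    if (!s) s = expensive_lookup(pids[i]); // sample only on first sight
    std::cout << pids[i] << " -> " << s->good << "\n";
  }
}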
Alignment::
Alignment(Mmsapt const& pt, string const& src, string const& trg)
: PT(pt)
{
PT.btfix.V1->fillIdSeq(src,s);
PT.btfix.V2->fillIdSeq(trg,t);
// LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
// BOOST_FOREACH(id_type i, t)
// {
// cout << (*PT.btfix.V2)[i];
// if (i < PT.wlex21.size())
// {
// BOOST_FOREACH(id_type k, PT.wlex21[i])
// {
// size_t j = COOC[k][i];
// size_t m1 = COOC.m1(k);
// size_t m2 = COOC.m2(i);
// if (j*1000 > m1 && j*1000 > m2)
// cout << " " << (*PT.btfix.V1)[k];
// }
// }
// cout << endl;
// }
fill_tspan_maps();
fill_sspan_maps();
tpos2ahyp.resize(t.size());
// now fill the association score table
PAH.reserve(1000000);
typedef pid2span_t::iterator psiter;
for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
{
if (!L->second.size()) continue; // should never happen anyway
int i = L->second[0].first;
int k = L->second[0].second - i -1;
sptr<pstats> ps = spstats[i][k];
PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
jStatsTable & J = ps->trg;
for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
{
psiter R = tpid2span.find(y->first);
if (R == tpid2span.end()) continue;
pp.update(y->first, y->second);
PT.calc_lex(PT.btfix,pp);
PT.calc_pfwd_fix(PT.btfix,pp);
PT.calc_pbwd_fix(PT.btfix,pp);
pp.eval(PT.feature_weights);
PP.push_back(pp);
BOOST_FOREACH(span const& sspan, L->second)
{
BOOST_FOREACH(span const& tspan, R->second)
{
tpos2ahyp[tspan.first].push_back(PAH.size());
PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
}
}
}
}
}
int
extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
{
if ((PAH[edge].scov & PAH[next].scov).count())
return -1;
int ret = PAH.size();
PAH.push_back(PAH[next]);
PhraseAlnHyp & h = PAH.back();
h.prev = edge;
h.scov |= PAH[edge].scov;
h.score += log(PAH[edge].dprob_fwd(PAH[next]));
h.score += log(PAH[next].dprob_bwd(PAH[edge]));
return ret;
}
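extend() is the basic combination step of the alignment search: two hypotheses are compatible iff their source coverage vectors are disjoint, and the combined hypothesis adds the forward and backward distortion log-probabilities. It is not yet called from align() below. Here is a self-contained sketch of the same step, with boost::dynamic_bitset standing in for bitvector; the names are illustrative, not Moses API.

#include <boost/dynamic_bitset.hpp>
#include <cmath>
#include <iostream>
#include <vector>

struct Hyp {
  boost::dynamic_bitset<> scov; // source positions covered
  int prev;                     // back-pointer to the preceding hypothesis
  float score;                  // log score of this phrase pair
};

// Combine 'edge' and 'next' iff their source coverage is disjoint. As in the
// code above, the predecessor's score stays behind the back-pointer; only the
// distortion log-probs are added here. Returns the new index, or -1.
int extend(std::vector<Hyp>& H, int edge, int next, float dfwd, float dbwd)
{
  if ((H[edge].scov & H[next].scov).any()) return -1; // overlap: incompatible
  Hyp h = H[next];
  h.prev  = edge;
  h.scov |= H[edge].scov;
  h.score += std::log(dfwd) + std::log(dbwd);
  H.push_back(h);
  return int(H.size()) - 1;
}

int main()
{
  std::vector<Hyp> H(2);
  H[0].scov.resize(4); H[0].scov.set(0); H[0].prev = -1; H[0].score = -0.2f;
  H[1].scov.resize(4); H[1].scov.set(1); H[1].prev = -1; H[1].score = -0.5f;
  int k = extend(H, 0, 1, 0.8f, 0.7f); // hypothetical distortion probabilities
  std::cout << (k >= 0 ? "combined\n" : "overlap\n");
}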
sptr<vector<int> >
Mmsapt::
align(string const& src, string const& trg) const
{
// For the time being, we consult only the fixed bitext.
// TODO: also consult the dynamic bitext.
Alignment A(*this,src,trg);
VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
vector<size_t> o; foo.GetOrder(o);
BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
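// NOTE: the hypotheses are only ranked and printed so far; no alignment
// is assembled yet, so an empty pointer is returned below.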
sptr<vector<int> > aln;
return aln;
}
}

View File

@ -0,0 +1,33 @@
#include "mmsapt.h"
using namespace std;
using namespace Moses;
Mmsapt* PT;
int main(int argc, char* argv[])
{
string base = argv[1];
string L1 = argv[2];
string L2 = argv[3];
ostringstream buf;
buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
<< base << " L1=" << L1 << " L2=" << L2;
string configline = buf.str();
PT = new Mmsapt(configline);
PT->Load();
float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 };
vector<float> weights(w,w+5);
PT->setWeights(weights);
// these values are taken from a moses.ini file;
// is there a convenient way of accessing them from within mmsapt ???
string eline,fline;
// TokenIndex V; V.open("crp/trn/mm/de.tdx");
while (getline(cin,eline) && getline(cin,fline))
{
cout << eline << endl;
cout << fline << endl;
PT->align(eline,fline);
}
delete PT;
}
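For reference, the driver above expects three positional arguments (model base path, L1, L2) and reads alternating source/target sentence lines from stdin. A hypothetical invocation, assuming the binary is built under the name try-align: try-align /path/to/model/base de en < sentence-pairs.txt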

View File

@ -35,7 +35,10 @@ ModelScore* ModelScore::createModelScore(const string& modeltype)
} else if (modeltype.compare("leftright") == 0) {
return new ModelScoreLR();
} else {
cerr << "Illegal model type given for lexical reordering model scoring: " << modeltype << ". The allowed types are: mslr, msd, monotonicity, leftright" << endl;
cerr << "Illegal model type given for lexical reordering model scoring: "
<< modeltype
<< ". The allowed types are: mslr, msd, monotonicity, leftright"
<< endl;
exit(1);
}
}

View File

@ -555,51 +555,6 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
phraseTableFile << " ||| ";
}
// alignment
if ( hierarchicalFlag ) {
// always output alignment if hiero style
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
std::vector<std::string> alignment;
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
if ( bestAlignmentT2S->at(j).size() != 1 ) {
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
phraseTableFile.flush();
assert(bestAlignmentT2S->at(j).size() == 1);
}
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
//phraseTableFile << sourcePos << "-" << j << " ";
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
} else {
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
size_t sourcePos = *setIter;
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
}
}
}
// now print all alignments, sorted by source index
sort(alignment.begin(), alignment.end());
for (size_t i = 0; i < alignment.size(); ++i) {
phraseTableFile << alignment[i] << " ";
}
} else if ( !inverseFlag && wordAlignmentFlag) {
// alignment info in pb model
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
size_t sourcePos = *setIter;
phraseTableFile << sourcePos << "-" << j << " ";
}
}
}
phraseTableFile << " ||| ";
// lexical translation probability
if (lexFlag) {
double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
@ -641,6 +596,53 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
phraseTableFile << " " << i->first << " " << i->second;
}
phraseTableFile << " ||| ";
// output alignment info
if ( !inverseFlag ) {
if ( hierarchicalFlag ) {
// always output alignment if hiero style
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
std::vector<std::string> alignment;
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
if ( bestAlignmentT2S->at(j).size() != 1 ) {
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
phraseTableFile.flush();
assert(bestAlignmentT2S->at(j).size() == 1);
}
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
//phraseTableFile << sourcePos << "-" << j << " ";
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
} else {
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
size_t sourcePos = *setIter;
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
}
}
}
// now print all alignments, sorted by source index
sort(alignment.begin(), alignment.end());
for (size_t i = 0; i < alignment.size(); ++i) {
phraseTableFile << alignment[i] << " ";
}
} else if (wordAlignmentFlag) {
// alignment info in pb model
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
size_t sourcePos = *setIter;
phraseTableFile << sourcePos << "-" << j << " ";
}
}
}
}
// counts
phraseTableFile << " ||| " << totalCount << " " << count;
if (kneserNeyFlag)

View File

@ -0,0 +1,43 @@
#!/usr/bin/perl -W
# script for preprocessing language data prior to tokenization
# Start by Ulrich Germann, after noticing systematic preprocessing errors
# in some of the English Europarl data.
use strict;
use Getopt::Std;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
sub usage
{
print "Script for preprocessing of raw language data prior to tokenization\n";
print "Usage: $0 -l <language tag>\n";
}
my %args;
getopts('l:h',\%args);
usage() && exit(0) if $args{'h'};
if (defined $args{'l'} && $args{'l'} eq "en")
{
while (<>)
{
s/([[:alpha:]]\') s\b/$1s/g;
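# e.g. "the country' s budget" -> "the country's budget"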
print;
}
}
elsif (defined $args{'l'} && $args{'l'} eq "fr")
{
while (<>)
{
s/\b([[:alpha:]]\')\s+(?=[[:alpha:]])/$1/g;
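# e.g. "l' homme" -> "l'homme"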
print;
}
}
else
{
print while <>;
}

View File

@ -33,7 +33,7 @@ my $TIMING = 0;
my $NUM_THREADS = 1;
my $NUM_SENTENCES_PER_THREAD = 2000;
my $PENN = 0;
my $NO_ESCAPING = 0;
while (@ARGV)
{
$_ = shift;
@ -49,6 +49,7 @@ while (@ARGV)
/^-threads$/ && ($NUM_THREADS = int(shift), next);
/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
/^-penn$/ && ($PENN = 1, next);
/^-no-escape/ && ($NO_ESCAPING = 1, next);
}
# for time calculation
@ -69,6 +70,7 @@ if ($HELP)
print " -time ... enable processing time calculation.\n";
print " -penn ... use Penn treebank-like tokenization.\n";
print "  -protected FILE  ... specify file with patterns to be protected in tokenization.\n";
print "  -no-escape  ... don't perform HTML escaping on apostrophes, quotes, etc.\n";
exit;
}
@ -246,7 +248,7 @@ sub tokenize
# aggressive hyphen splitting
if ($AGGRESSIVE)
{
$text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
$text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
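# the lookahead leaves the right-hand character unconsumed, so chains with
# single-letter parts (e.g. "one-of-a-kind") now split at every hyphen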
}
#multi-dots stay together
@ -345,14 +347,17 @@ sub tokenize
$text =~ s/DOTMULTI/./g;
#escape special chars
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
if (!$NO_ESCAPING)
{
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
}
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;

View File

@ -315,7 +315,9 @@ print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";
# path of script for filtering phrase tables and running the decoder
$filtercmd = File::Spec->catfile($SCRIPTS_ROOTDIR, "training", "filter-model-given-input.pl") if !defined $filtercmd;
if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) {
# WHY ... ! ___FILTER_PHRASE_TABLE ??? This doesn't make sense! [UG]
# if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) {
if ( ! -x $filtercmd && $___FILTER_PHRASE_TABLE) {
warn "Filtering command not found: $filtercmd.";
warn "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table";
exit 1;
@ -409,7 +411,7 @@ if ($___ACTIVATE_FEATURES) {
}
my ($just_cmd_filtercmd, $x) = split(/ /, $filtercmd);
die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd;
die "Not executable: $just_cmd_filtercmd" if $___FILTER_PHRASE_TABLE && ! -x $just_cmd_filtercmd;
die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
die "Not executable: $___DECODER" if ! -x $___DECODER;