Merge branch 'master' into miramerge

Conflicts:
	moses-chart-cmd/src/Main.cpp
	moses-cmd/src/IOWrapper.cpp
	moses-cmd/src/Main.cpp
	moses/src/DummyScoreProducers.cpp
	moses/src/DummyScoreProducers.h
	moses/src/GenerationDictionary.cpp
	moses/src/GenerationDictionary.h
	moses/src/GlobalLexicalModel.h
	moses/src/LMList.h
	moses/src/LanguageModel.cpp
	moses/src/LanguageModel.h
	moses/src/LanguageModelImplementation.h
	moses/src/LanguageModelKen.h
	moses/src/LanguageModelMultiFactor.cpp
	moses/src/LanguageModelMultiFactor.h
	moses/src/LanguageModelSingleFactor.cpp
	moses/src/LanguageModelSingleFactor.h
	moses/src/LexicalReordering.h
	moses/src/PhraseDictionary.cpp
	moses/src/PhraseDictionary.h
	moses/src/ScoreIndexManager.cpp
	moses/src/ScoreProducer.h
	moses/src/StaticData.cpp
	moses/src/StaticData.h
	moses/src/TranslationSystem.cpp
This commit is contained in:
bhaddow 2011-09-20 11:23:38 +01:00
commit 5b7c5ebdb5
180 changed files with 27179 additions and 33753 deletions

27
cruise-control/README Normal file
View File

@ -0,0 +1,27 @@
A simple regular testing of Moses codebase, aka cruise control
Started by Ondrej Bojar
2011-08-28
Usage:
1. Checkout this directory somewhere on the computer where you want to run the
cruise control.
2. Create as many config files as you wish, an example is ondrej-nb.config
...hardcode all paths to your preferred external tools like LM toolkits.
3. Run ./test_all_new_commits.sh <CONFIGFILE>
TODO / KNOWN BUGS
- regression tests are not run yet
- regression tests always require SRILM, but we need to test all LMs that have
been compiled in
=> add separate regression tests, one for each LM?
=> modify regression tests to actually loop over all LMs?
- final status is FAIL if any regression test fails, but we should actually
allow to expect failures for the given set of ./configure parameters
(e.g. regression test requiring irstlm is bound to fail if we're not linking
against irstlm)

View File

@ -0,0 +1,128 @@
#!/bin/bash
# given a config file runs tests on all untested commits of the scanned branches
# storing detailed logs to logs/CONFIGNAME/commit
# and extending the file brief.log
#
# A commit is assumed to be tested, if logs/CONFIGNAME/commit exists
#
# Ondrej Bojar, 2011
# warn MESSAGE... : print a message on stderr and keep going
warn() { echo "$@" 1>&2; }
# die MESSAGE... : print a message on stderr and abort with exit code 1
die() { warn "$@"; exit 1; }
set -o pipefail # safer pipes
configf="$1"
[ -e "$configf" ] || die "usage: $0 configfile"
configname=$(basename $configf | sed 's/\.config$//')
source "$configf"
[ -z "$MCC_SCAN_BRANCHES" ] \
&& die "Bad config $configf; does not define MCC_SCAN_BRANCHES"
# use the given tempdir or make subdir tmp here
USE_TEMPDIR=$MCC_TEMPDIR
[ -d "$USE_TEMPDIR" ] || USE_TEMPDIR=./tmp
LOGDIR=$MCC_LOGDIR
[ -d "$LOGDIR" ] || LOGDIR=.
# ensure full path for logdir
LOGDIR=$(readlink -f "$LOGDIR")
[ -d "$LOGDIR" ] || die "Fatal: confusing readlink for $LOGDIR"
# this is where moses is cloned into
WORKDIR=$MCC_WORKDIR
[ -d "$WORKDIR" ] || WORKDIR=$USE_TEMPDIR/workdir
# this is where moses is taken from
GITREPO=$MCC_GITREPO
[ -d "$GITREPO" ] || GITREPO=/home/obo/moses-at-google-code
# Make sure we have a moses clone in $WORKDIR: clone on first run,
# otherwise just fetch the latest refs from $GITREPO.
if [ ! -d "$WORKDIR" ]; then
  # -p: the parent may already exist (e.g. ./tmp); plain mkdir would fail then
  mkdir -p "$(dirname "$WORKDIR")" || die "Failed to create workdir $WORKDIR"
  warn "Cloning $GITREPO into $WORKDIR"
  git clone "$GITREPO" "$WORKDIR" \
    || die "Failed to git clone into workdir $WORKDIR"
else
  ( cd "$WORKDIR" && git fetch ) \
    || die "Failed to update our clone at $WORKDIR"
fi
# per-config log directory; one log file per tested commit lives here
mkdir -p "$LOGDIR/logs/$configname" \
  || die "Failed to create dir $LOGDIR/logs/$configname"
#### How is one test performed
# run_single_test COMMIT
#   Checks out COMMIT in $WORKDIR, runs regenerate-makefiles / configure /
#   make, writes a detailed log to $LOGDIR/logs/$configname/COMMIT and
#   appends a one-line summary to $LOGDIR/brief.log.
#   A commit whose long log already exists is considered tested and skipped.
function run_single_test () {
  commit=$1
  longlog="$LOGDIR/logs/$configname/$commit"
  if [ -e "$longlog" ]; then
    # Commit already tested
    return
  fi
  warn "Testing commit $commit"

  # Get the version of this script (svn checkout, git checkout, or unknown)
  ccversion=$(svnversion 2>/dev/null)
  [ ! -z "$ccversion" ] || ccversion=$(git show 2>&1 | head -n 1)
  [ ! -z "$ccversion" ] || ccversion="unknown"

  # Create log header with computer details:
  echo "#### Moses Cruise Control Log for commit $commit" > $longlog
  date >> $longlog
  echo "## Cruise Control version" >> $longlog
  echo $ccversion >> $longlog
  echo "## Parameters" >> $longlog
  cat $configf >> $longlog
  # fixed typo in the log header (was "Envinronment")
  echo "## Environment" >> $longlog
  uname -a >> $longlog
  env >> $longlog

  pushd $WORKDIR 2>/dev/null >/dev/null || die "Failed to chdir to $WORKDIR"
  git checkout --force $commit 2>/dev/null || die "Failed to checkout commit $commit"

  # $err remembers the FIRST failing step; later steps are skipped.
  err=""
  echo "## regenerate-makefiles.sh" >> $longlog
  ./regenerate-makefiles.sh >> $longlog 2>&1 || err="regenerate-makefiles"

  echo "## make clean" >> $longlog
  make clean >> $longlog 2>&1 || warn "make clean failed, suspicious"

  echo "## ./configure $MCC_CONFIGURE_ARGS" >> $longlog
  # Use explicit if-blocks here: the previous '[ -z "$err" ] && step || err=...'
  # chains overwrote $err whenever an EARLIER step had failed, so brief.log
  # always blamed the last step instead of the first failing one.
  if [ -z "$err" ]; then
    ./configure $MCC_CONFIGURE_ARGS >> $longlog 2>&1 || err="configure"
  fi

  echo "## make" >> $longlog
  if [ -z "$err" ]; then
    make >> $longlog 2>&1 || err="make"
  fi

  cd regression-testing
  echo "## Not running any regression tests yet." >> $longlog

  echo "## Finished" >> $longlog
  date >> $longlog
  if [ -z "$err" ]; then
    status="OK"
  else
    status="FAIL:$err"
  fi
  echo "## Status: $status" >> $longlog
  nicedate=$(date +"%Y%m%d-%H%M%S")
  echo "$commit $status $configname $ccversion $nicedate" \
    >> "$LOGDIR/brief.log"
  popd > /dev/null 2> /dev/null
}
#### Main loop over all commits
# List every commit reachable from the configured branches and test each one;
# run_single_test itself skips commits that already have a log file.
# NOTE(review): the while loop runs in a pipeline subshell, so 'die' (exit 1)
# terminates the loop but presumably not the whole script -- confirm whether
# a failure here is meant to abort everything or just stop iterating.
( cd "$WORKDIR" && git rev-list $MCC_SCAN_BRANCHES ) \
| while read commit; do
run_single_test $commit || die "Testing failed, stopping the loop."
done

View File

@ -368,6 +368,10 @@
GCC_ENABLE_FIX_AND_CONTINUE = YES;
GCC_MODEL_TUNING = G5;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
_LARGE_FILES,
"_FILE_OFFSET_BITS=64",
);
INSTALL_PATH = /usr/local/lib;
PRODUCT_NAME = kenlm;
};
@ -379,6 +383,10 @@
ALWAYS_SEARCH_USER_PATHS = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
GCC_MODEL_TUNING = G5;
GCC_PREPROCESSOR_DEFINITIONS = (
_LARGE_FILES,
"_FILE_OFFSET_BITS=64",
);
INSTALL_PATH = /usr/local/lib;
PRODUCT_NAME = kenlm;
};

View File

@ -6,14 +6,18 @@
*
*/
#include <cassert>
#include <fstream>
#include "Scorer.h"
#include "ScorerFactory.h"
#include "Data.h"
#include "Util.h"
Data::Data(Scorer& ptr):
theScorer(&ptr)
theScorer(&ptr),
_sparse_flag(false)
{
score_type = (*theScorer).getName();
TRACE_ERR("Data::score_type " << score_type << std::endl);
@ -40,7 +44,6 @@ void Data::loadnbest(const std::string &file)
std::string theSentence;
std::string::size_type loc;
while (getline(inp,stringBuf,'\n')) {
if (stringBuf.empty()) continue;
@ -56,16 +59,15 @@ void Data::loadnbest(const std::string &file)
featentry.reset();
scoreentry.clear();
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
scoredata->add(scoreentry, sentence_index);
getNextPound(stringBuf, substring, "|||"); //third field
// examine first line for name of features
if (!existsFeatureNames()) {
std::string stringsupport=substring;
// adding feature names
std::string features="";
std::string tmpname="";
@ -75,10 +77,17 @@ void Data::loadnbest(const std::string &file)
getNextPound(stringsupport, subsubstring);
// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
features+=tmpname+"_"+stringify(tmpidx)+" ";
tmpidx++;
} else {
}
// ignore sparse feature name
else if (subsubstring.find("_") != string::npos) {
// also ignore its value
getNextPound(stringsupport, subsubstring);
}
// update current feature name
else {
tmpidx=0;
tmpname=subsubstring.substr(0,subsubstring.size() - 1);
}
@ -87,22 +96,36 @@ void Data::loadnbest(const std::string &file)
featdata->setFeatureMap(features);
}
// adding features
// adding features
while (!substring.empty()) {
// TRACE_ERR("Decompounding: " << substring << std::endl);
getNextPound(substring, subsubstring);
// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
// no ':' -> feature value that needs to be stored
if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
featentry.add(ATOFST(subsubstring.c_str()));
}
// sparse feature name? store as well
else if (subsubstring.find("_") != string::npos) {
std::string name = subsubstring;
getNextPound(substring, subsubstring);
featentry.addSparse( name, atof(subsubstring.c_str()) );
_sparse_flag = true;
}
}
//cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
featdata->add(featentry,sentence_index);
}
inp.close();
}
// TODO: implement sparse-feature merging for classic MERT.
// Stub: classic MERT cannot train sparse features, so instead of merging
// this reports the limitation and terminates the program.
void Data::mergeSparseFeatures() {
std::cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";
exit(1);
}
// really not the right place...
float sentenceLevelBleuPlusOne( ScoreStats &stats ) {
float logbleu = 0.0;
@ -144,7 +167,7 @@ public:
};
void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
void Data::sampleRankedPairs( const std::string &rankedpairfile ) {
cout << "Sampling ranked pairs." << endl;
ofstream *outFile = new ofstream();
@ -187,20 +210,15 @@ void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
for(unsigned int i=0; i<samples.size() && collected < n_samples; i++) {
if (samples[i]->getDiff() >= min_diff) {
collected++;
FeatureStats &f1 = featdata->get(S,samples[i]->getTranslation1());
FeatureStats &f2 = featdata->get(S,samples[i]->getTranslation2());
*out << "1";
for(unsigned int j=0; j<f1.size(); j++)
if (abs(f1.get(j)-f2.get(j)) > 0.00001)
*out << " F" << j << " " << (f1.get(j)-f2.get(j));
*out << endl;
outputSample( *out, featdata->get(S,samples[i]->getTranslation1()),
featdata->get(S,samples[i]->getTranslation2()) );
*out << endl;
*out << "0";
for(unsigned int j=0; j<f1.size(); j++)
if (abs(f1.get(j)-f2.get(j)) > 0.00001)
*out << " F" << j << " " << (f2.get(j)-f1.get(j));
*out << endl;
outputSample( *out, featdata->get(S,samples[i]->getTranslation2()),
featdata->get(S,samples[i]->getTranslation1()) );
*out << endl;
}
delete samples[i];
}
@ -209,3 +227,77 @@ void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
out->flush();
outFile->close();
}
// Write the feature-value differences (f1 - f2) of one sampled translation
// pair to 'out', for use as a PRO training example. Dense features are
// emitted as "F<index> <diff>", sparse features under their own names;
// differences below the 0.00001 threshold are suppressed.
void Data::outputSample( ostream &out, const FeatureStats &f1, const FeatureStats &f2 )
{
// difference in score in regular features
// NOTE(review): plain 'abs' on a floating-point difference may resolve to
// the integer overload depending on headers -- confirm fabs is not needed.
for(unsigned int j=0; j<f1.size(); j++)
if (abs(f1.get(j)-f2.get(j)) > 0.00001)
out << " F" << j << " " << (f1.get(j)-f2.get(j));
if (!hasSparseFeatures())
return;
// sparse features
const sparse_featstats_t &s1 = f1.getSparse();
const sparse_featstats_t &s2 = f2.getSparse();
// features present in f1: emit value (if missing in f2) or the difference
for( sparse_featstats_t::const_iterator i=s1.begin(); i!=s1.end(); i++) {
if (s2.find(i->first) == s2.end())
out << " " << i->first << " " << i->second;
else {
float diff = i->second - s2.find(i->first)->second;
if (abs(diff) > 0.00001)
out << " " << i->first << " " << diff;
}
}
// features only present in f2 contribute their negated value
for( sparse_featstats_t::const_iterator i=s2.begin(); i!=s2.end(); i++) {
if (s1.find(i->first) == s1.end())
out << " " << i->first << " " << (- i->second);
}
}
// Split this Data object into shard_count shards (see the header comment:
// shard_size == 0 gives equal-size disjoint shards that exhaust the data;
// 0 < shard_size <= 1 samples that proportion of sentences with replacement).
void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
std::vector<Data>& shards)
{
assert(shard_count);
assert(shard_size >=0);
assert(shard_size <= 1);
size_t data_size = scoredata->size();
assert(data_size == featdata->size());
// from here on shard_size is an absolute sample count, not a proportion
shard_size *= data_size;
for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
vector<size_t> shard_contents;
if (shard_size == 0) {
//split into roughly equal size shards
size_t shard_start = floor(0.5 + shard_id * (float)data_size / shard_count);
size_t shard_end = floor(0.5 + (shard_id+1) * (float)data_size / shard_count);
for (size_t i = shard_start; i < shard_end; ++i) {
shard_contents.push_back(i);
}
} else {
//create shards by randomly sampling
for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
shard_contents.push_back(rand() % data_size);
}
}
// each shard gets its own scorer instance
// NOTE(review): 'scorer' is heap-allocated and only referenced by the shard;
// nothing visible here deletes it -- confirm who owns it (possible leak).
ScorerFactory SF;
Scorer* scorer = SF.getScorer(score_type, scorerconfig);
shards.push_back(Data(*scorer));
shards.back().score_type = score_type;
shards.back().number_of_scores = number_of_scores;
shards.back()._sparse_flag = _sparse_flag;
// copy the selected sentences' feature and score entries into the shard
for (size_t i = 0; i < shard_contents.size(); ++i) {
shards.back().featdata->add(featdata->get(shard_contents[i]));
shards.back().scoredata->add(scoredata->get(shard_contents[i]));
}
//cerr << endl;
}
}

View File

@ -31,10 +31,10 @@ private:
Scorer* theScorer;
std::string score_type;
size_t number_of_scores; //number of scores
bool _sparse_flag;
public:
Data(Scorer& sc);
~Data() {};
inline void clear() {
@ -49,6 +49,10 @@ public:
return featdata;
};
Scorer* getScorer() {
return theScorer;
}
inline size_t NumberOfFeatures() const {
return featdata->NumberOfFeatures();
}
@ -62,11 +66,16 @@ public:
featdata->Features(f);
}
inline bool hasSparseFeatures() const { return _sparse_flag; }
void mergeSparseFeatures();
void loadnbest(const std::string &file);
void load(const std::string &featfile,const std::string &scorefile) {
featdata->load(featfile);
scoredata->load(scorefile);
if (featdata->hasSparseFeatures())
_sparse_flag = true;
}
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
@ -90,8 +99,17 @@ public:
return featdata->getFeatureIndex(name);
};
void sample_ranked_pairs( const std::string &rankedPairFile );
void sampleRankedPairs( const std::string &rankedPairFile );
void outputSample( std::ostream &out, const FeatureStats &f1, const FeatureStats &f2 );
/**
* Create shard_count shards. If shard_size == 0, then the shards are non-overlapping
* and exhaust the data. If 0 < shard_size <= 1, then shards are chosen by sampling
* the data (with replacement) and shard_size is interpreted as the proportion
* of the total size.
*/
void createShards(size_t shard_count, float shard_size, const std::string& scorerconfig,
std::vector<Data>& shards);
};
#endif

View File

@ -11,7 +11,7 @@
#include "Util.h"
FeatureArray::FeatureArray(): idx("")
FeatureArray::FeatureArray(): idx(""), _sparse_flag(false)
{};
void FeatureArray::savetxt(std::ofstream& outFile)
@ -69,6 +69,8 @@ void FeatureArray::loadtxt(ifstream& inFile, size_t n)
for (size_t i=0 ; i < n; i++) {
entry.loadtxt(inFile);
add(entry);
if (entry.getSparse().size()>0)
_sparse_flag = true;
}
}

View File

@ -30,6 +30,7 @@ protected:
featarray_t array_;
size_t number_of_features;
std::string features;
bool _sparse_flag;
private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
@ -43,6 +44,10 @@ public:
array_.clear();
}
inline bool hasSparseFeatures() const {
return _sparse_flag;
}
inline std::string getIndex() {
return idx;
}

View File

@ -51,9 +51,12 @@ void FeatureData::load(ifstream& inFile)
if (entry.size() == 0)
break;
if (size() == 0) {
if (size() == 0)
setFeatureMap(entry.Features());
}
if (entry.hasSparseFeatures())
_sparse_flag = true;
add(entry);
}
}

View File

@ -26,10 +26,10 @@ protected:
idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array
private:
size_t number_of_features;
std::string features;
bool _sparse_flag;
map<std::string, size_t> featname2idx_; //map from name to index of features
map<size_t, std::string> idx2featname_; //map from index to name of features
@ -43,6 +43,9 @@ public:
array_.clear();
}
inline bool hasSparseFeatures() const {
return _sparse_flag;
}
inline FeatureArray get(const std::string& idx) {
return array_.at(getIndex(idx));
}

View File

@ -21,7 +21,7 @@ FeatureStats::FeatureStats()
FeatureStats::~FeatureStats()
{
delete array_;
delete[] array_;
};
FeatureStats::FeatureStats(const FeatureStats &stats)
@ -30,6 +30,7 @@ FeatureStats::FeatureStats(const FeatureStats &stats)
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
map_ = stats.getSparse();
};
FeatureStats::FeatureStats(const size_t size)
@ -61,6 +62,11 @@ void FeatureStats::add(FeatureStatsType v)
array_[entries_++]=v;
}
void FeatureStats::addSparse(string name, FeatureStatsType v)
{
map_[name]=v;
}
void FeatureStats::set(std::string &theString)
{
std::string substring, stringBuf;
@ -68,7 +74,15 @@ void FeatureStats::set(std::string &theString)
while (!theString.empty()) {
getNextPound(theString, substring);
add(ATOFST(substring.c_str()));
// regular feature
if (substring.find(":") == string::npos) {
add(ATOFST(substring.c_str()));
}
// sparse feature
else {
size_t separator = substring.find_last_of(":");
addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) );
}
}
}
@ -123,6 +137,7 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
map_ = stats.getSparse();
return *this;
}
@ -131,7 +146,14 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
/**write the whole object to a stream*/
ostream& operator<<(ostream& o, const FeatureStats& e)
{
for (size_t i=0; i< e.size(); i++)
// print regular features
for (size_t i=0; i< e.size(); i++) {
o << e.get(i) << " ";
}
// sparse features
const sparse_featstats_t &sparse = e.getSparse();
for(sparse_featstats_t::const_iterator i = sparse.begin(); i != sparse.end(); i++) {
o << i->first << i->second << " ";
}
return o;
}

View File

@ -26,6 +26,7 @@ class FeatureStats
{
private:
featstats_t array_;
sparse_featstats_t map_;
size_t entries_;
size_t available_;
@ -43,9 +44,11 @@ public:
}
void expand();
void add(FeatureStatsType v);
void addSparse(string name, FeatureStatsType v);
inline void clear() {
memset((void*) array_,0,featbytes_);
map_.clear();
}
inline FeatureStatsType get(size_t i) {
@ -57,6 +60,9 @@ public:
inline featstats_t getArray() const {
return array_;
}
inline sparse_featstats_t getSparse() const {
return map_;
}
void set(std::string &theString);

View File

@ -1,97 +1,36 @@
lib_LTLIBRARIES = libmert.la
bin_PROGRAMS = mert extractor evaluator
mert_SOURCES = Util.cpp \
Timer.cpp \
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
Data.cpp \
BleuScorer.cpp \
Point.cpp \
PerScorer.cpp \
Scorer.cpp \
Optimizer.cpp \
mert.cpp \
TERsrc/alignmentStruct.cpp \
TERsrc/hashMap.cpp \
TERsrc/hashMapStringInfos.cpp \
TERsrc/segmentStructure.cpp \
TERsrc/stringHasher.cpp \
TERsrc/terAlignment.cpp \
TERsrc/terShift.cpp \
TERsrc/tinyxml.cpp \
TERsrc/tinyxmlparser.cpp \
TERsrc/documentStructure.cpp \
TERsrc/hashMapInfos.cpp \
TERsrc/infosHasher.cpp \
TERsrc/stringInfosHasher.cpp \
TERsrc/tercalc.cpp \
TERsrc/tinystr.cpp \
TERsrc/tinyxmlerror.cpp \
TERsrc/tools.cpp \
TerScorer.cpp \
CderScorer.cpp
AM_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE $(BOOST_CPPFLAGS)
extractor_SOURCES = Util.cpp \
Timer.cpp \
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
Data.cpp \
BleuScorer.cpp \
Point.cpp \
PerScorer.cpp \
Scorer.cpp \
Optimizer.cpp \
extractor.cpp \
TERsrc/alignmentStruct.cpp \
TERsrc/hashMap.cpp \
TERsrc/hashMapStringInfos.cpp \
TERsrc/segmentStructure.cpp \
TERsrc/stringHasher.cpp \
TERsrc/terAlignment.cpp \
TERsrc/terShift.cpp \
TERsrc/tinyxml.cpp \
TERsrc/tinyxmlparser.cpp \
TERsrc/documentStructure.cpp \
TERsrc/hashMapInfos.cpp \
TERsrc/infosHasher.cpp \
TERsrc/stringInfosHasher.cpp \
TERsrc/tercalc.cpp \
TERsrc/tinystr.cpp \
TERsrc/tinyxmlerror.cpp \
TERsrc/tools.cpp \
TerScorer.cpp \
CderScorer.cpp
evaluator_SOURCES = Util.cpp \
evaluator.cpp \
libmert_la_SOURCES = \
Util.cpp \
Timer.cpp \
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
Data.cpp \
BleuScorer.cpp \
Point.cpp \
PerScorer.cpp \
Scorer.cpp \
Optimizer.cpp \
TERsrc/alignmentStruct.cpp \
TERsrc/hashMap.cpp \
TERsrc/hashMapStringInfos.cpp \
TERsrc/segmentStructure.cpp \
TERsrc/stringHasher.cpp \
TERsrc/terAlignment.cpp \
TERsrc/terShift.cpp \
TERsrc/tinyxml.cpp \
TERsrc/tinyxmlparser.cpp \
TERsrc/documentStructure.cpp \
TERsrc/hashMapInfos.cpp \
TERsrc/infosHasher.cpp \
TERsrc/stringInfosHasher.cpp \
TERsrc/tercalc.cpp \
TERsrc/tinystr.cpp \
TERsrc/tinyxmlerror.cpp \
TERsrc/tools.cpp \
TerScorer.cpp \
CderScorer.cpp
mert_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE
extractor_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE
evaluator_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE
mert_SOURCES = mert.cpp $(top_builddir)/moses/src/ThreadPool.cpp
extractor_SOURCES = extractor.cpp
evaluator_SOURCES = evaluator.cpp
extractor_LDADD = -lm -lz
mert_LDADD = -lm -lz
evaluator_LDADD = -lm -lz
extractor_LDADD = libmert.la -lm -lz
mert_LDADD = libmert.la -lm -lz $(BOOST_THREAD_LDFLAGS) $(BOOST_THREAD_LIBS)
evaluator_LDADD = libmert.la -lm -lz

View File

@ -100,6 +100,15 @@ Point Point::operator+(const Point& p2)const
return Res;
};
// Element-wise in-place addition of another point's weights.
// Both points must have the same dimensionality.
void Point::operator+=(const Point& p2)
{
assert(p2.size()==size());
for(unsigned i=0; i<size(); i++)
operator[](i)+=p2[i];
// the cached score no longer matches the new weights; mark it stale
score=numeric_limits<statscore_t>::max();
};
Point Point::operator*(float l)const
{
Point Res(*this);

View File

@ -33,6 +33,12 @@ public:
static unsigned int getpdim() {
return pdim;
}
static void setpdim(size_t pd) {
pdim = pd;
}
static void setdim(size_t d) {
dim = d;
}
static bool OptimizeAll() {
return fixedweights.empty();
};
@ -46,6 +52,7 @@ public:
double operator*(const FeatureStats&)const;//compute the feature function
Point operator+(const Point&)const;
void operator+=(const Point&);
Point operator*(float)const;
/**write the Whole featureweight to a stream (ie pdim float)*/
friend ostream& operator<<(ostream& o,const Point& P);

View File

@ -21,7 +21,7 @@ ScoreStats::ScoreStats()
ScoreStats::~ScoreStats()
{
delete array_;
delete[] array_;
};
ScoreStats::ScoreStats(const ScoreStats &stats)

View File

@ -1,181 +0,0 @@
#include "documentStructure.h"
using namespace std;
namespace TERCpp
{
string documentStructure::toString()
{
stringstream s;
// s << "nword : " << vectorToString(nwords)<<endl;
// s << "alignment" << vectorToString(alignment)<<endl;
// s << "afterShift" << vectorToString(alignment)<<endl;
s << "Nothing to be printed" << endl;
return s.str();
}
string documentStructure::getDocId()
{
return docId;
}
vector< segmentStructure >* documentStructure::getSegments()
{
return &seg;
}
string documentStructure::getSysId()
{
return sysId;
}
// float documentStructure::getAverageLength()
// {
// return averageLength;
// }
// void documentStructure::setAverageLength(float f)
// {
// averageLength=f;
// }
void documentStructure::addSegments ( segmentStructure s )
{
seg.push_back ( s );
}
void documentStructure::addSegments ( string id, string text )
{
segmentStructure tmp_seg ( id, text );
seg.push_back ( tmp_seg );
}
segmentStructure* documentStructure::getLastSegments()
{
return & seg.at ( ( int ) seg.size() - 1 );
}
void documentStructure::setDocId ( string s )
{
docId = s;
}
void documentStructure::setSysId ( string s )
{
sysId = s;
}
segmentStructure* documentStructure::getSegment ( string id )
{
for ( int i = 0; i < ( int ) seg.size(); i++ ) {
if ( id.compare ( seg.at ( i ).getSegId() ) == 0 ) {
return & ( seg.at ( i ) );
}
}
cerr << "ERROR : documentStructure::getSegment : Segment " << id << " does not exist" <<endl;
cerr << "Segment size " << seg.size()<< endl;
for (int i=0; i<(int)seg.size(); i++) {
cerr <<seg.at(i).getSegId()<<endl;
}
exit(0);
}
int documentStructure::getSize()
{
return ( int ) seg.size();
}
// documentStructure::documentStructure()
// {
// // vector<string> ref;
// // vector<string> hyp;
// // vector<string> aftershift;
//
// // documentStructure[] allshifts = null;
//
// numEdits=0;
// numWords=0;
// bestRef="";
//
// numIns=0;
// numDel=0;
// numSub=0;
// numSft=0;
// numWsf=0;
// }
// documentStructure::documentStructure ()
// {
// start = 0;
// end = 0;
// moveto = 0;
// newloc = 0;
// cost=1.0;
// }
// documentStructure::documentStructure (int _start, int _end, int _moveto, int _newloc)
// {
// start = _start;
// end = _end;
// moveto = _moveto;
// newloc = _newloc;
// cost=1.0;
// }
// documentStructure::documentStructure (int _start, int _end, int _moveto, int _newloc, vector<string> _shifted)
// {
// start = _start;
// end = _end;
// moveto = _moveto;
// newloc = _newloc;
// shifted = _shifted;
// cost=1.0;
// }
// string documentStructure::vectorToString(vector<string> vec)
// {
// string retour("");
// for (vector<string>::iterator vecIter=vec.begin();vecIter!=vec.end(); vecIter++)
// {
// retour+=(*vecIter)+"\t";
// }
// return retour;
// }
// string documentStructure::toString()
// {
// stringstream s;
// s.str("");
// s << "[" << start << ", " << end << ", " << moveto << "/" << newloc << "]";
// if ((int)shifted.size() > 0)
// {
// s << " (" << vectorToString(shifted) << ")";
// }
// return s.str();
// }
/* The distance of the shift. */
// int documentStructure::distance()
// {
// if (moveto < start)
// {
// return start - moveto;
// }
// else if (moveto > end)
// {
// return moveto - end;
// }
// else
// {
// return moveto - start;
// }
// }
//
// bool documentStructure::leftShift()
// {
// return (moveto < start);
// }
//
// int documentStructure::size()
// {
// return (end - start) + 1;
// }
// documentStructure documentStructure::operator=(documentStructure t)
// {
//
// return t;
// }
}

View File

@ -1,60 +0,0 @@
#ifndef __DOCUMENTSTRUCTURE_H__
#define __DOCUMENTSTRUCTURE_H__
#include <vector>
#include <stdio.h>
#include <string>
#include <sstream>
#include "tools.h"
#include "segmentStructure.h"
using namespace std;
using namespace Tools;
namespace TERCpp
{
class documentStructure
{
private:
string docId;
string sysId;
vector<segmentStructure> seg;
public:
string getDocId();
string getSysId();
vector<segmentStructure>* getSegments();
segmentStructure* getLastSegments();
void setDocId ( string s );
void setSysId ( string s );
void addSegments ( segmentStructure s );
void addSegments ( string id, string text );
segmentStructure* getSegment ( string id );
int getSize();
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
// alignmentStruct (int _start, int _end, int _moveto, int _newloc, vector<string> _shifted);
// string toString();
// int distance() ;
// bool leftShift();
// int size();
// alignmentStruct operator=(alignmentStruct t);
// string vectorToString(vector<string> vec);
// int start;
// int end;
// int moveto;
// int newloc;
// vector<string> nwords; // The words we shifted
// vector<char> alignment ; // for pra_more output
// vector<vecInt> aftershift; // for pra_more output
// This is used to store the cost of a shift, so we don't have to
// calculate it multiple times.
// double cost;
string toString();
};
}
#endif

View File

@ -3,7 +3,6 @@
*/
#ifndef __HASHMAP_H__
#define __HASHMAP_H__
#include <boost/functional/hash.hpp>
#include "stringHasher.h"
#include <vector>
#include <string>

View File

@ -3,7 +3,6 @@
*/
#ifndef __HASHMAPINFOS_H__
#define __HASHMAPINFOS_H__
#include <boost/functional/hash.hpp>
#include "infosHasher.h"
#include <vector>
#include <string>

View File

@ -3,7 +3,6 @@
*/
#ifndef __HASHMAPSTRINGINFOS_H__
#define __HASHMAPSTRINGINFOS_H__
#include <boost/functional/hash.hpp>
#include "stringInfosHasher.h"
#include <vector>
#include <string>

View File

@ -1,332 +0,0 @@
#include "multiEvaluation.h"
// #include <iostream>
// #include <boost/filesystem/fstream.hpp>
// #include <boost/archive/xml_oarchive.hpp>
// #include <boost/archive/xml_iarchive.hpp>
// #include <boost/serialization/nvp.hpp>
// helper functions to allow us to load and save sandwiches to/from xml
namespace TERCpp
{
multiEvaluation::multiEvaluation()
{
evalParameters.debugMode = false;
evalParameters.caseOn = false;
evalParameters.noPunct = false;
evalParameters.normalize = false;
evalParameters.tercomLike = false;
evalParameters.sgmlInputs = false;
evalParameters.noTxtIds = false;
// referencesTxt=new multiTxtDocument();
// hypothesisTxt=new documentStructure();
}
multiEvaluation::multiEvaluation ( param p )
{
evalParameters.debugMode = false;
evalParameters.caseOn = false;
evalParameters.noPunct = false;
evalParameters.normalize = false;
evalParameters.tercomLike = false;
evalParameters.sgmlInputs = false;
evalParameters.noTxtIds = false;
evalParameters = Tools::copyParam ( p );
// referencesTxt=new multiTxtDocument();
// hypothesisTxt=new documentStructure();
}
void multiEvaluation::addReferences()
{
referencesTxt.loadRefFiles ( evalParameters );
}
// void multiEvaluation::addReferences(vector< string > vecRefecrences)
// {
// for (int i=0; i< (int) vecRefecrences.size(); i++)
// {
// referencesTxt.loadFile(vecRefecrences.at(i));
// }
// }
void multiEvaluation::setHypothesis()
{
multiTxtDocument l_multiTxtTmp;
l_multiTxtTmp.loadHypFile ( evalParameters );
hypothesisTxt = (*(l_multiTxtTmp.getDocument ( "0" )));
}
void multiEvaluation::setParameters ( param p )
{
evalParameters = Tools::copyParam ( p );
}
void multiEvaluation::launchTxtEvaluation()
{
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchTxtEvaluation : before testing references and hypothesis size "<<endl<<"END DEBUG"<<endl;
}
if ( referencesTxt.getSize() == 0 ) {
cerr << "ERROR : multiEvaluation::launchTxtEvaluation : there is no references" << endl;
exit ( 0 );
}
if ( hypothesisTxt.getSize() == 0 ) {
cerr << "ERROR : multiEvaluation::launchTxtEvaluation : there is no hypothesis" << endl;
exit ( 0 );
}
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchTxtEvaluation : testing references and hypothesis size "<<endl<<" number of references : "<< referencesTxt.getSize()<<endl;
vector <string> s =referencesTxt.getListDocuments();
cerr << " avaiable ids : ";
for (vector <string>::iterator iterS=s.begin(); iterS!=s.end(); iterS++) {
cerr << " " << (*iterS);
}
cerr << endl;
for (vector <string>::iterator iterSBis=s.begin(); iterSBis!=s.end(); iterSBis++) {
cerr << " reference : "+(*iterSBis)+"; size : "<< (referencesTxt.getDocument((*iterSBis)))->getSize() << endl;
}
cerr << " hypothesis size : "<< hypothesisTxt.getSize() << endl<<"END DEBUG"<<endl;
}
int incDocRefences = 0;
stringstream l_stream;
vector<float> editsResults;
vector<float> wordsResults;
int tot_ins = 0;
int tot_del = 0;
int tot_sub = 0;
int tot_sft = 0;
int tot_wsf = 0;
float tot_err = 0;
float tot_wds = 0;
// vector<stringInfosHasher> setOfHypothesis = hashHypothesis.getHashMap();
ofstream outputSum ( ( evalParameters.hypothesisFile + ".output.sum.log" ).c_str() );
outputSum << "Hypothesis File: " + evalParameters.hypothesisFile + "\nReference File: " + evalParameters.referenceFile + "\n" + "Ave-Reference File: " << endl;
char outputCharBuffer[200];
sprintf ( outputCharBuffer, "%19s | %4s | %4s | %4s | %4s | %4s | %6s | %8s | %8s", "Sent Id", "Ins", "Del", "Sub", "Shft", "WdSh", "NumEr", "AvNumWd", "TER");
outputSum << outputCharBuffer << endl;
outputSum << "-------------------------------------------------------------------------------------" << endl;
vector <string> referenceList =referencesTxt.getListDocuments();
for (vector <string>::iterator referenceListIter=referenceList.begin(); referenceListIter!=referenceList.end(); referenceListIter++) {
// cerr << " " << (*referenceListIter);
documentStructure l_reference = (*(referencesTxt.getDocument ( (*referenceListIter) )));
evaluate ( l_reference, hypothesisTxt );
// evaluate ( l_reference);
}
// for ( incDocRefences = 0; incDocRefences < referencesTxt.getSize();incDocRefences++ )
// {
// l_stream.str ( "" );
// l_stream << incDocRefences;
// }
for ( vector<segmentStructure>::iterator segHypIt = hypothesisTxt.getSegments()->begin(); segHypIt != hypothesisTxt.getSegments()->end(); segHypIt++ ) {
terAlignment l_result = segHypIt->getAlignment();
string bestDocId = segHypIt->getBestDocId();
string l_id=segHypIt->getSegId();
editsResults.push_back(l_result.numEdits);
wordsResults.push_back(l_result.numWords);
l_result.scoreDetails();
tot_ins += l_result.numIns;
tot_del += l_result.numDel;
tot_sub += l_result.numSub;
tot_sft += l_result.numSft;
tot_wsf += l_result.numWsf;
tot_err += l_result.numEdits;
tot_wds += l_result.averageWords;
char outputCharBufferTmp[200];
sprintf(outputCharBufferTmp, "%19s | %4d | %4d | %4d | %4d | %4d | %6.1f | %8.3f | %8.3f",(l_id+":"+bestDocId).c_str(), l_result.numIns, l_result.numDel, l_result.numSub, l_result.numSft, l_result.numWsf, l_result.numEdits, l_result.averageWords, l_result.scoreAv()*100.0);
outputSum<< outputCharBufferTmp<<endl;
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchTxtEvaluation : Evaluation "<<endl<< l_result.toString() <<endl<<"END DEBUG"<<endl;
}
}
cout << "Total TER: " << scoreTER ( editsResults, wordsResults );
char outputCharBufferTmp[200];
outputSum << "-------------------------------------------------------------------------------------" << endl;
sprintf ( outputCharBufferTmp, "%19s | %4d | %4d | %4d | %4d | %4d | %6.1f | %8.3f | %8.3f", "TOTAL", tot_ins, tot_del, tot_sub, tot_sft, tot_wsf, tot_err, tot_wds, tot_err*100.0 / tot_wds );
outputSum << outputCharBufferTmp << endl;
outputSum.close();
}
/// Align every hypothesis segment against the segment with the same id in
/// one reference document, and keep per hypothesis segment the best
/// (lowest-scoring) TER alignment seen so far across all reference
/// documents, together with the id of the reference document it came from.
/// @param docStructReference one reference document
/// @param docStructhypothesis the hypothesis document (updated in place)
void multiEvaluation::evaluate ( documentStructure& docStructReference, documentStructure& docStructhypothesis )
{
  if (evalParameters.debugMode) {
    cerr <<"DEBUG tercpp : multiEvaluation::evaluate : launching evaluate on "<<endl<<" references size : "<< docStructReference.getSize() << endl << " hypothesis size : "<< docStructhypothesis.getSize() << endl<<"END DEBUG"<<endl;
  }
  if (evalParameters.debugMode) {
    cerr <<"DEBUG tercpp : multiEvaluation::evaluate : testing hypothesis "<<endl;
    cerr <<" segId : "<< docStructhypothesis.getSegments()->at(0).getSegId() << endl<<"END DEBUG"<<endl;
  }
  for ( vector<segmentStructure>::iterator segHypIt = docStructhypothesis.getSegments()->begin(); segHypIt != docStructhypothesis.getSegments()->end(); segHypIt++ ) {
    // BUG FIX: the calculator used to be heap-allocated with `new` and was
    // never deleted, leaking one terCalc per hypothesis segment.  A stack
    // instance has identical behaviour without the leak.
    terCalc l_evalTER;
    segmentStructure * l_segRef = docStructReference.getSegment ( segHypIt->getSegId() );
    terAlignment l_result = l_evalTER.TER ( segHypIt->getContent(), l_segRef->getContent() );
    // The TER denominator is the (average) reference length, not the
    // hypothesis length.
    l_result.averageWords = l_segRef->getAverageLength();
    if (l_result.averageWords==0.0) {
      cerr << "ERROR : tercpp : multiEvaluation::evaluate : averageWords is equal to zero" <<endl;
      exit(0);
    }
    l_segRef->setAlignment ( l_result );
    // A hypothesis segment with no alignment yet (0 words and 0 edits)
    // adopts this result unconditionally; afterwards a result only
    // replaces the stored one when its score is strictly better (lower).
    if ((segHypIt->getAlignment().numWords == 0) && (segHypIt->getAlignment().numEdits == 0 )) {
      segHypIt->setAlignment ( l_result );
      segHypIt->setBestDocId ( docStructReference.getDocId() );
    } else if ( l_result.scoreAv() < segHypIt->getAlignment().scoreAv() ) {
      segHypIt->setAlignment ( l_result );
      segHypIt->setBestDocId ( docStructReference.getDocId() );
    }
    if (evalParameters.debugMode) {
      cerr <<"DEBUG tercpp : multiEvaluation::evaluate : testing "<<endl<<" hypothesis : "<< segHypIt->getSegId() <<endl;
      cerr << "hypothesis score : "<< segHypIt->getAlignment().scoreAv() <<endl;
      cerr << "BestDoc Id : "<< segHypIt->getBestDocId() <<endl;
      cerr << "new score : "<< l_result.scoreAv() <<endl;
      cerr << "new BestDoc Id : "<< docStructReference.getDocId() <<endl;
      cerr << endl<<"END DEBUG"<<endl;
    }
  }
  if (evalParameters.debugMode) {
    cerr <<"DEBUG tercpp : multiEvaluation::evaluate : "<<endl<<"End of function"<<endl<<"END DEBUG"<<endl;
  }
}
/// Compute the corpus-level TER: sum of edit counts divided by the sum of
/// (average) word counts, formatted as "score (edits/words)\n".
/// A zero word count yields 1 when there are edits and 0 otherwise, so the
/// division never happens with a zero denominator.
/// @param numEdits per-segment edit counts
/// @param numWords per-segment word counts (parallel to numEdits)
/// @return the formatted score string
string multiEvaluation::scoreTER ( vector<float> numEdits, vector<float> numWords )
{
  if ( numWords.size() != numEdits.size() ) {
    cerr << "ERROR : tercpp:score, diffrent size of hyp and ref" << endl;
    exit ( 0 );
  }
  double totalEdits = 0.0;
  double totalWords = 0.0;
  for ( size_t i = 0; i < numEdits.size(); ++i ) {
    totalEdits += numEdits[i];
    totalWords += numWords[i];
  }
  // Pick the ratio first, then format once.
  double ratio;
  if ( totalWords > 0.0 ) {
    ratio = totalEdits / totalWords;
  } else if ( totalEdits > 0.0 ) {
    ratio = 1.0;
  } else {
    ratio = 0.0;
  }
  stringstream formatted;
  formatted << ratio << " (" << totalEdits << "/" << totalWords << ")" << endl;
  return formatted.str();
}
/// Run the TER evaluation for SGML input: score every hypothesis segment
/// against each reference document, then write a per-segment summary table
/// to "<hypothesisFile>.output.sum.log" and print the total TER to stdout.
void multiEvaluation::launchSGMLEvaluation()
{
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchSGMLEvaluation : before testing references and hypothesis size "<<endl<<"END DEBUG"<<endl;
}
// Both sides must have been loaded beforehand (addSGMLReferences /
// setSGMLHypothesis); abort otherwise.
if ( referencesSGML.getSize() == 0 ) {
cerr << "ERROR : multiEvaluation::launchSGMLEvaluation : there is no references" << endl;
exit ( 0 );
}
if ( hypothesisSGML.getSize() == 0 ) {
cerr << "ERROR : multiEvaluation::launchSGMLEvaluation : there is no hypothesis" << endl;
exit ( 0 );
}
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchSGMLEvaluation : testing references and hypothesis size "<<endl<<" references size : "<< referencesSGML.getSize() << endl << " hypothesis size : "<< hypothesisSGML.getSize() << endl<<"END DEBUG"<<endl;
}
int incDocRefences = 0;
stringstream l_stream;
// Per-segment inputs for the corpus-level score.
vector<float> editsResults;
vector<float> wordsResults;
// Corpus-wide accumulators for the TOTAL line of the summary table.
int tot_ins = 0;
int tot_del = 0;
int tot_sub = 0;
int tot_sft = 0;
int tot_wsf = 0;
float tot_err = 0;
float tot_wds = 0;
// vector<stringInfosHasher> setOfHypothesis = hashHypothesis.getHashMap();
ofstream outputSum ( ( evalParameters.hypothesisFile + ".output.sum.log" ).c_str() );
outputSum << "Hypothesis File: " + evalParameters.hypothesisFile + "\nReference File: " + evalParameters.referenceFile + "\n" + "Ave-Reference File: " << endl;
char outputCharBuffer[200];
sprintf ( outputCharBuffer, "%19s | %4s | %4s | %4s | %4s | %4s | %6s | %8s | %8s", "Sent Id", "Ins", "Del", "Sub", "Shft", "WdSh", "NumEr", "AvNumWd", "TER");
outputSum << outputCharBuffer << endl;
outputSum << "-------------------------------------------------------------------------------------" << endl;
// SGML reference documents are keyed by their insertion index ("0", "1", ...).
for ( incDocRefences = 0; incDocRefences < referencesSGML.getSize(); incDocRefences++ ) {
l_stream.str ( "" );
l_stream << incDocRefences;
documentStructure l_reference = (*(referencesSGML.getDocument ( l_stream.str() )));
evaluate ( l_reference, hypothesisSGML );
}
// After all references were seen, each hypothesis segment holds its best
// alignment; emit one table row per segment and accumulate the totals.
for ( vector<segmentStructure>::iterator segHypIt = hypothesisSGML.getSegments()->begin(); segHypIt != hypothesisSGML.getSegments()->end(); segHypIt++ ) {
terAlignment l_result = segHypIt->getAlignment();
string bestDocId = segHypIt->getBestDocId();
string l_id=segHypIt->getSegId();
editsResults.push_back(l_result.numEdits);
// NOTE(review): this pushes averageWords while the txt path pushes
// numWords here — confirm which denominator is intended.
wordsResults.push_back(l_result.averageWords);
l_result.scoreDetails();
tot_ins += l_result.numIns;
tot_del += l_result.numDel;
tot_sub += l_result.numSub;
tot_sft += l_result.numSft;
tot_wsf += l_result.numWsf;
tot_err += l_result.numEdits;
tot_wds += l_result.averageWords;
char outputCharBufferTmp[200];
sprintf(outputCharBufferTmp, "%19s | %4d | %4d | %4d | %4d | %4d | %6.1f | %8.3f | %8.3f",(l_id+":"+bestDocId).c_str(), l_result.numIns, l_result.numDel, l_result.numSub, l_result.numSft, l_result.numWsf, l_result.numEdits, l_result.averageWords, l_result.scoreAv()*100.0);
outputSum<< outputCharBufferTmp<<endl;
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchSGMLEvaluation : Evaluation "<<endl<< l_result.toString() <<endl<<"END DEBUG"<<endl;
}
}
cout << "Total TER: " << scoreTER ( editsResults, wordsResults );
char outputCharBufferTmp[200];
outputSum << "-------------------------------------------------------------------------------------" << endl;
sprintf ( outputCharBufferTmp, "%19s | %4d | %4d | %4d | %4d | %4d | %6.1f | %8.3f | %8.3f", "TOTAL", tot_ins, tot_del, tot_sub, tot_sft, tot_wsf, tot_err, tot_wds, tot_err*100.0 / tot_wds );
outputSum << outputCharBufferTmp << endl;
outputSum.close();
}
void multiEvaluation::addSGMLReferences()
{
xmlStructure refStruct;
refStruct.xmlParams=copyParam(evalParameters);
referencesSGML=refStruct.dump_to_SGMLDocument(evalParameters.referenceFile);
}
void multiEvaluation::setSGMLHypothesis()
{
SGMLDocument sgmlHyp;
xmlStructure hypStruct;
hypStruct.xmlParams=copyParam(evalParameters);
hypStruct.xmlParams.tercomLike=false;
sgmlHyp=hypStruct.dump_to_SGMLDocument(evalParameters.hypothesisFile);
hypothesisSGML=(*(sgmlHyp.getFirstDocument()));
}
}

View File

@ -1,44 +0,0 @@
#ifndef __MULTIEVAL_DOCUMENT_H__
#define __MULTIEVAL_DOCUMENT_H__
#include "multiTxtDocument.h"
#include "tools.h"
#include <iostream>
#include <string>
#include "xmlStructure.h"
#include "sgmlDocument.h"
using namespace Tools;
namespace TERCpp
{
/// Drives a TER evaluation of one hypothesis against one or more reference
/// documents, for plain-text or SGML input.  Typical use: load references
/// and hypothesis, then call launchTxtEvaluation() or launchSGMLEvaluation().
class multiEvaluation
{
public:
multiEvaluation();
multiEvaluation(param p );
// void addReferences(string s);
// void addReferences(vector<string> vecRefecrences);
// void addReferences(documentStructure doc);
// void setHypothesis(string s);
// void setHypothesis(documentStructure doc);
// Load the reference/hypothesis file(s) named in the parameters (txt input).
void addReferences();
void setHypothesis();
// Same, for SGML input.
void addSGMLReferences();
void setSGMLHypothesis();
void setParameters ( param p );
// Run the full evaluation and write the per-segment summary log.
void launchTxtEvaluation();
void launchSGMLEvaluation();
// Score every hypothesis segment against one reference document, keeping
// the best alignment per segment.
void evaluate ( documentStructure & docStructReference, documentStructure & docStructhypothesis );
// Corpus-level TER string: sum(edits)/sum(words) plus the raw counts.
string scoreTER ( vector<float> numEdits, vector<float> numWords );
private:
param evalParameters;
multiTxtDocument referencesTxt;
documentStructure hypothesisTxt;
SGMLDocument referencesSGML;
documentStructure hypothesisSGML;
};
}
#endif //SANDWICH_DEFINED

View File

@ -1,347 +0,0 @@
#include "multiTxtDocument.h"
// #include <iostream>
// #include <boost/filesystem/fstream.hpp>
// #include <boost/archive/xml_oarchive.hpp>
// #include <boost/archive/xml_iarchive.hpp>
// #include <boost/serialization/nvp.hpp>
// helper functions to allow us to load and save sandwiches to/from xml
namespace TERCpp
{
// Default constructor: nothing to initialise, the document list starts
// empty.  The commented fields belonged to an older SGML-oriented version.
multiTxtDocument::multiTxtDocument()
{
// docType="";
// setId="";
// srcLang="";
// tgtLang="";
}
// multiTxtDocument::multiTxtDocument ( string FileName )
// {
// this=xmlStruct.copy_to_multiTxtDocument(FileName);
// }
// xmlStructure multiTxtDocument::getStructure()
// {
// return xmlStruct;
// }
// string multiTxtDocument::getDocType()
// {
// return docType;
// }
// string multiTxtDocument::getSetId()
// {
// return setId;
// }
// string multiTxtDocument::getSrcLang()
// {
// return srcLang;
// }
// string multiTxtDocument::getTgtLang()
// {
// return tgtLang;
// }
// void multiTxtDocument::setDocType ( string s )
// {
// docType=s;
// }
// void multiTxtDocument::setSetId ( string s )
// {
// setId=s;
// }
// void multiTxtDocument::setSrcLang ( string s )
// {
// srcLang=s;
// }
// void multiTxtDocument::setTgtLang ( string s )
// {
// tgtLang=s;
// }
// Append a document (taken by value) to the collection.
void multiTxtDocument::addDocument ( documentStructure doc )
{
documents.push_back ( doc );
}
// Pointer to the most recently added document.
// NOTE(review): with an empty list, (int)size()-1 is -1, which wraps to a
// huge index and makes at() throw std::out_of_range rather than invoke UB.
documentStructure* multiTxtDocument::getLastDocument()
{
return & ( documents.at ( ( int ) documents.size() - 1 ) );
}
// Return a copy of the whole document list.
vector< documentStructure > multiTxtDocument::getDocuments()
{
return documents;
}
/// Collect the ids of all loaded documents, in load order.
/// @return one id string per document
vector< string > multiTxtDocument::getListDocuments()
{
  vector< string > docIds;
  docIds.reserve ( documents.size() );
  for ( size_t docIdx = 0; docIdx < documents.size(); ++docIdx ) {
    docIds.push_back ( documents[docIdx].getDocId() );
  }
  return docIds;
}
/// Look up a document by id (linear search).  Aborts the program with an
/// error message when the id is unknown.
/// @param docId the id to search for
/// @return pointer to the stored document
documentStructure* multiTxtDocument::getDocument ( string docId )
{
  for ( vector<documentStructure>::iterator docIt = documents.begin(); docIt != documents.end(); ++docIt ) {
    if ( docIt->getDocId() == docId ) {
      return & ( *docIt );
    }
  }
  cerr << "ERROR : multiTxtDocument::getDocument : document " << docId << " does not exist !" << endl;
  exit ( 0 );
}
/// Load one plain-text file as a new document: one segment per line,
/// optionally normalised (tokenization, lower-casing, punctuation removal).
/// Segment ids are either sequential numbers (noTxtIds) or parsed from a
/// trailing "(id)" marker on each line.  The new document's id is its index
/// in the collection.  Aborts on a missing id marker or unreadable file.
void multiTxtDocument::loadFile ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike )
{
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::loadFile : loading files " << endl << fileName << endl << "END DEBUG" << endl;
cerr << "DEBUG tercpp : multiTxtDocument::loadFile : testing params " << endl << Tools::printParams ( multiTxtDocumentParams ) << endl << "END DEBUG" << endl;
cerr << "DEBUG tercpp : multiTxtDocument::loadFile : testing others params " << endl << "caseOn : " << caseOn << endl << "noPunct : " << noPunct << endl << "debugMode : " << debugMode << endl << "noTxtIds : " << noTxtIds << endl << "tercomLike : " << tercomLike << endl << "END DEBUG" << endl;
}
ifstream fichierLoad ( fileName.c_str(), ios::in );
string line;
documentStructure l_doc;
if ( fichierLoad ) {
int l_ids = 1;
stringstream l_stream;
while ( getline ( fichierLoad, line ) ) {
string l_key;
string line_mod;
l_stream.str ( "" );
if ( noTxtIds ) {
// No ids in the file: number segments sequentially starting at 1.
l_stream << l_ids;
l_key = l_stream.str();
line_mod = line;
l_ids++;
} else {
// Ids expected as a trailing "(id)"; abort if the marker is missing.
if ((int)line.rfind ( "(" )==-1) {
cerr << "ERROR : multiTxtDocument::loadFile : Id not found, maybe you should use the --noTxtIds Option ? " << endl;
exit ( 0 );
}
l_key = line.substr ( line.rfind ( "(" ), line.size() - 1 );
line_mod = line.substr ( 0, line.rfind ( "(" ) - 1 );
}
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG multiTxtDocument::loadFile : line NOT tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
}
// Normalisation pipeline: punctuation tokenization (unless tercom-like),
// then optional lower-casing, then optional punctuation removal.
if ( !tercomLike ) {
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "TERCOM AT FALSE " << endl << "END DEBUG" << endl;
}
line_mod = tokenizePunct ( line_mod );
}
if ( !caseOn ) {
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "CASEON AT FALSE " << endl << "END DEBUG" << endl;
}
line_mod = lowerCase ( line_mod );
}
if ( noPunct ) {
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "NOPUNCT AT TRUE " << endl << "END DEBUG" << endl;
}
if ( !tercomLike ) {
line_mod = removePunctTercom ( line_mod );
} else {
line_mod = removePunct ( line_mod );
}
}
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG multiTxtDocument::loadFile : line tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
}
// Split the normalised line on spaces and store it as one segment.
vector<string> vecDocLine = stringToVector ( line_mod, " " );
// string l_key;
// hashHypothesis.addValue(l_key,vecDocLine);
// l_key=(string)vecDocLine.at((int)vecDocLine.size()-1);
// vecDocLine.pop_back();
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp multiTxtDocument::loadFile : " << l_key << "|" << vectorToString ( vecDocLine ) << "|" << endl << "Vector Size : " << vecDocLine.size() << endl << "Line length : " << ( int ) line_mod.length() << endl << "END DEBUG" << endl;
}
// hashHypothesis.addValue(l_key,vecDocLine);
segmentStructure l_seg ( l_key, vecDocLine );
l_doc.addSegments ( l_seg );
}
// Ref=line;
// getline ( fichierHyp, line );
// Hyp=line;
fichierLoad.close(); // close the input file
l_stream.str ( "" );
l_stream << ( int ) documents.size();
// The document id is its position in the collection.
l_doc.setDocId ( l_stream.str() );
addDocument ( l_doc );
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG multiTxtDocument::loadFile : document " << l_doc.getDocId() << " added !!!" << endl << "END DEBUG" << endl;
}
} else { // otherwise the file could not be opened
cerr << "ERROR : multiTxtDocument::loadFile : can't open file : " + fileName + " !" << endl;
exit ( 0 );
}
}
// void save_sandwich(const multiTxtDocument &sw, const std::string &file_name);
// multiTxtDocument load_sandwich(const std::string &file_name);
// int callmultiTxtDocument()
// {
// // xml filename
// const std::string fn="JasonsSarnie.xml";
//
// // create a new sandwich and lets take a look at it!
// multiTxtDocument *s = new multiTxtDocument("Granary", "Brie", "Bacon", false); // mmmmm, Brie and bacon! ;)
// std::cout << "Created the following sandwich:" << std::endl;
// s->output();
//
// // Now lets save the sandwich out to an XML file....
// std::cout << std::endl << "Saving the sandwich to xml...." << std::endl;
// save_sandwich(*s, fn);
//
// // And then load it into another multiTxtDocument variable and take a look at what we've got
// std::cout << "Attempting to load the saved sandwich..." << std::endl;
// multiTxtDocument s2 = load_sandwich(fn);
// std::cout << "Contents of loaded multiTxtDocument:" << std::endl;
// s2.output();
//
// delete s;
// std::string dummy;
// std::getline(std::cin, dummy);
//
// }
/*
// Save a multiTxtDocument to XML...
void save_sandwich(const multiTxtDocument &sw, const std::string &file_name)
{
// Create a filestream object
boost::filesystem::fstream ofs(file_name, std::ios::trunc | std::ios::out);
// Now create an XML output file using our filestream
boost::archive::xml_oarchive xml(ofs);
// call serialization::make_nvp, passing our sandwich.
// make_nvp will eventually call the sandwich instance (sw) serialize function
// causing the contents of sw to be output to the xml file
xml << boost::serialization::make_nvp("multiTxtDocument", sw);
}
// The load function works in almost the exact same way as save_sandwich,
// The only differences are:
// 1. we create an XML input stream - the original example in AD's link created another xml_oarchive, causing a runtime error...doh!
// 2. the call to make_nvp populates the sandwich instance(sw) which is then returned...
multiTxtDocument load_sandwich(const std::string &file_name)
{
multiTxtDocument sw;
boost::filesystem::fstream ifs(file_name, std::ios::binary | std::ios::in);
boost::archive::xml_iarchive xml(ifs);
xml >> boost::serialization::make_nvp("multiTxtDocument", sw);
return sw;
}*/
/// For every segment id (taken from the first document), compute the mean
/// segment length across all documents and store it on the matching
/// segment of every document.  Segments whose average is already non-zero
/// are skipped.
void multiTxtDocument::setAverageLength()
{
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : Starting calculate Average length " << endl << "END DEBUG" << endl;
}
vecFloat l_avLength((*documents.begin()).getSize(),0.0);
vector< documentStructure >::iterator iter=documents.begin();
// for (vector< documentStructure >::iterator iter=documents.begin(); iter!=documents.end(); iter++)
// {
// string l_id=(*iter).getDocId();
// to_return.push_back(l_id);
// Iterate over the segments of the FIRST document only; every document is
// assumed to contain the same segment ids.
vector< segmentStructure > * l_vecSeg=(*iter).getSegments();
// vector< segmentStructure >::iterator iterSeg=l_vecSeg->begin();
for (vector< segmentStructure >::iterator iterSeg=l_vecSeg->begin(); iterSeg!=l_vecSeg->end(); iterSeg++) {
// NOTE(review): l_seg is a local COPY of the segment; writes to it below
// do not touch the stored data (the pointer-based loop does that).
segmentStructure l_seg=(*iterSeg);
// if ( multiTxtDocumentParams.debugMode )
// {
// cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : Average length: " << l_seg.getAverageLength() << endl << "END DEBUG" << endl;
// }
if (l_seg.getAverageLength()==0.0) {
// Mean of this segment's size over all documents.
float l_average=0.0;
for (int l_iter =0; l_iter < (int)documents.size(); l_iter++) {
l_average+=(float)(documents.at(l_iter).getSegment(l_seg.getSegId()))->getSize();
}
l_average=l_average/(float)documents.size();
l_seg.setAverageLength(l_average);
// Write the computed average through to the real segment in every document.
for (iter=documents.begin(); iter!=documents.end(); iter++) {
// if ( multiTxtDocumentParams.debugMode )
// {
// cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : average length BEFORE assignation: DocId, SegId, Average: " << (*iter).getDocId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getSegId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getAverageLength() << endl << "END DEBUG" << endl;
// }
(*iter).getSegment(l_seg.getSegId())->setAverageLength(l_average);
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : average length AFTER assignation: DocId, SegId, Average: " << (*iter).getDocId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getSegId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getAverageLength() << endl << "END DEBUG" << endl;
}
}
}
// Reset the document iterator for the next segment of the first document.
iter=documents.begin();
// if ( multiTxtDocumentParams.debugMode )
// {
// cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : average length verification: DocId, SegId, Average: " << (*iter).getDocId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getSegId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getAverageLength() << endl << "END DEBUG" << endl;
// }
}
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : End calculate Average length " << endl << "END DEBUG" << endl;
}
// }
}
/// Load one or more plain-text files (comma-separated list) as separate
/// documents, then compute the per-segment average lengths across them.
void multiTxtDocument::loadFiles ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike )
{
  if ( multiTxtDocumentParams.debugMode ) {
    cerr << "DEBUG tercpp : multiTxtDocument::loadFiles : loading files " << endl << fileName << endl << "END DEBUG" << endl;
  }
  vector<string> fileList = stringToVector ( fileName, "," );
  for ( vector<string>::iterator fileIt = fileList.begin(); fileIt != fileList.end(); ++fileIt ) {
    loadFile ( *fileIt, caseOn, noPunct, debugMode, noTxtIds, tercomLike );
  }
  setAverageLength();
}
// Load a single reference file using the file name and options from p.
void multiTxtDocument::loadRefFile ( param p )
{
multiTxtDocumentParams = Tools::copyParam ( p );
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::loadRefFile : loading references " << endl << multiTxtDocumentParams.referenceFile << endl << "END DEBUG" << endl;
}
loadFile ( multiTxtDocumentParams.referenceFile, multiTxtDocumentParams.caseOn, multiTxtDocumentParams.noPunct, multiTxtDocumentParams.debugMode, multiTxtDocumentParams.noTxtIds, multiTxtDocumentParams.tercomLike );
}
// Load a comma-separated list of reference files (and compute averages).
void multiTxtDocument::loadRefFiles ( param p )
{
multiTxtDocumentParams = Tools::copyParam ( p );
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::loadRefFiles : loading references " << endl << multiTxtDocumentParams.referenceFile << endl << "END DEBUG" << endl;
}
loadFiles ( multiTxtDocumentParams.referenceFile, multiTxtDocumentParams.caseOn, multiTxtDocumentParams.noPunct, multiTxtDocumentParams.debugMode, multiTxtDocumentParams.noTxtIds, multiTxtDocumentParams.tercomLike );
}
// Load a single hypothesis file; tercom-like tokenization is always
// disabled for hypotheses.
void multiTxtDocument::loadHypFile ( param p )
{
multiTxtDocumentParams = Tools::copyParam ( p );
multiTxtDocumentParams.tercomLike = false;
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::loadHypFile : loading hypothesis " << endl << multiTxtDocumentParams.hypothesisFile << endl << "END DEBUG" << endl;
}
loadFile ( multiTxtDocumentParams.hypothesisFile, multiTxtDocumentParams.caseOn, multiTxtDocumentParams.noPunct, multiTxtDocumentParams.debugMode, multiTxtDocumentParams.noTxtIds, multiTxtDocumentParams.tercomLike );
}
/// Load one or more hypothesis files (comma-separated list in
/// p.hypothesisFile); tercom-like tokenization is always disabled for
/// hypotheses.
/// BUG FIX: this plural loader previously called loadFile(), so a
/// comma-separated list was treated as one file name and
/// setAverageLength() was never run; loadFiles() mirrors loadRefFiles().
void multiTxtDocument::loadHypFiles ( param p )
{
  multiTxtDocumentParams = Tools::copyParam ( p );
  multiTxtDocumentParams.tercomLike = false;
  if ( multiTxtDocumentParams.debugMode ) {
    cerr << "DEBUG tercpp : multiTxtDocument::loadHypFiles : loading hypothesis " << endl << multiTxtDocumentParams.hypothesisFile << endl << "END DEBUG" << endl;
  }
  loadFiles ( multiTxtDocumentParams.hypothesisFile, multiTxtDocumentParams.caseOn, multiTxtDocumentParams.noPunct, multiTxtDocumentParams.debugMode, multiTxtDocumentParams.noTxtIds, multiTxtDocumentParams.tercomLike );
}
// Number of documents currently loaded.
int multiTxtDocument::getSize()
{
return ( int ) documents.size();
}
}

View File

@ -1,81 +0,0 @@
#ifndef __MULTITXT_DOCUMENT_H__
#define __MULTITXT_DOCUMENT_H__
#include "documentStructure.h"
#include "tools.h"
// #include "xmlStructure.h"
#include <iostream>
#include <string>
namespace TERCpp
{
/// A collection of plain-text documents (one per reference/hypothesis
/// file), each holding one segment per input line.  Provides loaders for
/// single files and comma-separated file lists, and computes per-segment
/// average lengths across the documents.
class multiTxtDocument
{
public:
multiTxtDocument();
// multiTxtDocument(string FileName);
// multiTxtDocument(const std::string &bread, const std::string &cheese, const std::string &meat, const bool pickle):
// m_bread(bread), m_cheese(cheese), m_meat(meat), m_pickle(pickle){};
// ~multiTxtDocument(){};
// void output()
// {
// std::cout << "Bread = " << m_bread << ", Cheese = " << m_cheese <<
// ", Meat = " << m_meat << ", Has Pickle = " << m_pickle << std::endl;
//
// }
// void setDocType(string s);
// void setSetId(string s);
// void setSrcLang(string s);
// void setTgtLang(string s);
// string getDocType();
// string getSetId();
// string getSrcLang();
// string getTgtLang();
// xmlStructure getStructure();
// Append a document to the collection.
void addDocument ( documentStructure doc );
// Accessors: last document, document by id (aborts on unknown id),
// all documents (copy), and the list of document ids.
documentStructure* getLastDocument();
documentStructure* getDocument ( string docId );
vector<documentStructure> getDocuments ();
vector<string> getListDocuments ();
// Load one file / a comma-separated list of files as new document(s).
void loadFile ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike );
void loadFiles ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike );
// Convenience loaders driven by a param struct.
void loadRefFile ( param p );
void loadRefFiles ( param p );
void loadHypFile ( param p );
void loadHypFiles ( param p );
// Store, on every segment, its mean length across all documents.
void setAverageLength();
// Number of documents loaded.
int getSize();
private:
// string docType;
// string setId;
// string srcLang;
// string tgtLang;
// xmlStructure xmlStruct;
param multiTxtDocumentParams;
vector<documentStructure> documents;
// vector<string> bestDocumentId;
// std::string m_bread, m_cheese, m_meat;
// bool m_pickle;
//
// // declare the boost::serialization::access class as a friend of multiTxtDocument
// friend class boost::serialization::access;
// // Create a serialize function for serialization::access to use, I guess you could regard this as a kind of callback function!
// template<class archive>
// void serialize(archive& ar, const unsigned int version)
// {
// // Note: As explained in the original tut. the & operator is overridden in boost to use
// // << or >> depending on the direction of the data (read/write)
// using boost::serialization::make_nvp;
// ar & make_nvp("Bread", m_bread);
// ar & make_nvp("Cheese", m_cheese);
// ar & make_nvp("Meats", m_meat);
// ar & make_nvp("HasPickle", m_pickle);
// // Also note: strings in the first parameter of make_nvp cannot contain spaces!
// }
};
}
#endif //SANDWICH_DEFINED

View File

@ -1,82 +0,0 @@
#include "segmentStructure.h"
using namespace std;
namespace TERCpp
{
// The segment's tokens (copy).
vecString segmentStructure::getContent()
{
return content;
}
// The segment's id.
string segmentStructure::getSegId()
{
return segId;
}
// NOTE(review): always returns the empty string; the real rendering is
// commented out.
string segmentStructure::toString()
{
// return vectorToString(content);
return "";
}
// Replace the token content and reset the average length.
void segmentStructure::addContent ( vecString vecS )
{
content = vecS;
averageLength=0.0;
}
// Set the segment id.
void segmentStructure::setSegId ( string s )
{
segId = s;
}
// Construct from an id and a ready-made token vector.
segmentStructure::segmentStructure ( string id, vecString vecS )
{
segId = id;
content = vecS;
averageLength=0.0;
}
// Construct from an id and a raw text line (split on spaces).
segmentStructure::segmentStructure ( string id, string txt )
{
segId = id;
content = stringToVector ( txt, " " );
averageLength=0.0;
}
// Replace the content from a raw text line (split on spaces) and reset the
// average length.
void segmentStructure::addContent ( string s )
{
content = stringToVector ( s, " " );
averageLength=0.0;
}
/// Default constructor: empty id and no content.
/// BUG FIX: averageLength is a plain float member and was left
/// uninitialized here, unlike in the other constructors; reading it before
/// a set would be undefined behaviour.  Initialise it to 0.0 as elsewhere.
segmentStructure::segmentStructure()
{
  segId = "";
  averageLength = 0.0;
}
// The stored TER alignment for this segment (copy).
terAlignment segmentStructure::getAlignment()
{
return evaluation;
}
// Store a TER alignment for this segment.
void segmentStructure::setAlignment ( terAlignment& l_align )
{
evaluation = l_align;
}
// Id of the reference document that produced the best alignment.
string segmentStructure::getBestDocId()
{
return bestDocId;
}
void segmentStructure::setBestDocId ( string s )
{
bestDocId = s;
}
// Mean length of this segment across all reference documents.
float segmentStructure::getAverageLength()
{
return averageLength;
}
void segmentStructure::setAverageLength(float f)
{
averageLength=f;
}
// Number of tokens in this segment.
int segmentStructure::getSize()
{
return (int)content.size();
}
}

View File

@ -1,73 +0,0 @@
#ifndef __SEGMENTSTRUCTURE_H__
#define __SEGMENTSTRUCTURE_H__
#include <vector>
#include <stdio.h>
#include <string>
#include <sstream>
#include "tools.h"
#include "tercalc.h"
using namespace std;
using namespace Tools;
namespace TERCpp
{
/// One sentence of a document: an id, its tokens, the TER alignment chosen
/// for it, the id of the reference document that produced that alignment,
/// and the mean segment length across references (TER denominator).
class segmentStructure
{
private:
string segId;
vecString content;
terAlignment evaluation;
string bestDocId;
float averageLength;
public:
segmentStructure();
segmentStructure ( string id, vecString vecS );
segmentStructure ( string id, string txt );
void setAverageLength(float f);
float getAverageLength();
string getSegId();
terAlignment getAlignment();
void setAlignment(terAlignment& l_align);
void setSegId ( string s );
void setBestDocId ( string s );
string getBestDocId();
// Replace the content (vector form, or raw line split on spaces).
void addContent ( vecString vecS );
void addContent ( string s );
// Number of tokens.
int getSize();
// {
// return segId;
// }
vecString getContent();
// {
// return content;
// }
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
// alignmentStruct (int _start, int _end, int _moveto, int _newloc, vector<string> _shifted);
// string toString();
// int distance() ;
// bool leftShift();
// int size();
// alignmentStruct operator=(alignmentStruct t);
// string vectorToString(vector<string> vec);
// int start;
// int end;
// int moveto;
// int newloc;
// NOTE(review): the public fields below look copied from alignmentStruct
// (see the comments above); nothing in the visible code uses them here.
vector<string> nwords; // The words we shifted
vector<char> alignment ; // for pra_more output
vector<vecInt> aftershift; // for pra_more output
// This is used to store the cost of a shift, so we don't have to
// calculate it multiple times.
double cost;
string toString();
};
}
#endif

View File

@ -1,149 +0,0 @@
#include "sgmlDocument.h"
// #include <iostream>
// #include <boost/filesystem/fstream.hpp>
// #include <boost/archive/xml_oarchive.hpp>
// #include <boost/archive/xml_iarchive.hpp>
// #include <boost/serialization/nvp.hpp>
// helper functions to allow us to load and save sandwiches to/from xml
namespace TERCpp
{
// Default constructor: all metadata fields start empty.
SGMLDocument::SGMLDocument()
{
docType="";
setId="";
srcLang="";
tgtLang="";
}
// SGMLDocument::SGMLDocument ( string FileName )
// {
// this=xmlStruct.copy_to_SGMLDocument(FileName);
// }
// xmlStructure SGMLDocument::getStructure()
// {
// return xmlStruct;
// }
// Metadata accessors taken from the SGML header attributes.
string SGMLDocument::getDocType()
{
return docType;
}
string SGMLDocument::getSetId()
{
return setId;
}
string SGMLDocument::getSrcLang()
{
return srcLang;
}
string SGMLDocument::getTgtLang()
{
return tgtLang;
}
// Metadata setters for the SGML header attributes.
void SGMLDocument::setDocType ( string s )
{
docType=s;
}
void SGMLDocument::setSetId ( string s )
{
setId=s;
}
void SGMLDocument::setSrcLang ( string s )
{
srcLang=s;
}
void SGMLDocument::setTgtLang ( string s )
{
tgtLang=s;
}
// Append a document (by value) to the set.
void SGMLDocument::addDocument ( documentStructure doc )
{
documents.push_back(doc);
}
// Pointer to the most recently added document.
// NOTE(review): with an empty set, (int)size()-1 wraps to a huge index and
// at() throws std::out_of_range.
documentStructure* SGMLDocument::getLastDocument()
{
return &(documents.at((int)documents.size()-1));
}
// Pointer to the first document; at(0) throws if the set is empty.
documentStructure* SGMLDocument::getFirstDocument()
{
return &(documents.at(0));
}
// Number of documents in the set.
int SGMLDocument::getSize()
{
return (int)documents.size();
}
/// Look up a document by id (linear search).  Aborts the program with an
/// error message when the id is unknown.
/// @param docId the id to search for
/// @return pointer to the stored document
documentStructure* SGMLDocument::getDocument(string docId)
{
  for ( vector<documentStructure>::iterator docIt = documents.begin(); docIt != documents.end(); ++docIt ) {
    if ( docIt->getDocId() == docId ) {
      return & ( *docIt );
    }
  }
  cerr << "ERROR : SGMLDocument::getDocument : document " << docId << " does not exist !" << endl;
  exit ( 0 );
}
// void save_sandwich(const SGMLDocument &sw, const std::string &file_name);
// SGMLDocument load_sandwich(const std::string &file_name);
// int callSGMLDocument()
// {
// // xml filename
// const std::string fn="JasonsSarnie.xml";
//
// // create a new sandwich and lets take a look at it!
// SGMLDocument *s = new SGMLDocument("Granary", "Brie", "Bacon", false); // mmmmm, Brie and bacon! ;)
// std::cout << "Created the following sandwich:" << std::endl;
// s->output();
//
// // Now lets save the sandwich out to an XML file....
// std::cout << std::endl << "Saving the sandwich to xml...." << std::endl;
// save_sandwich(*s, fn);
//
// // And then load it into another SGMLDocument variable and take a look at what we've got
// std::cout << "Attempting to load the saved sandwich..." << std::endl;
// SGMLDocument s2 = load_sandwich(fn);
// std::cout << "Contents of loaded SGMLDocument:" << std::endl;
// s2.output();
//
// delete s;
// std::string dummy;
// std::getline(std::cin, dummy);
//
// }
/*
// Save a SGMLDocument to XML...
void save_sandwich(const SGMLDocument &sw, const std::string &file_name)
{
// Create a filestream object
boost::filesystem::fstream ofs(file_name, std::ios::trunc | std::ios::out);
// Now create an XML output file using our filestream
boost::archive::xml_oarchive xml(ofs);
// call serialization::make_nvp, passing our sandwich.
// make_nvp will eventually call the sandwich instance (sw) serialize function
// causing the contents of sw to be output to the xml file
xml << boost::serialization::make_nvp("SGMLDocument", sw);
}
// The load function works in almost the exact same way as save_sandwich,
// The only differences are:
// 1. we create an XML input stream - the original example in AD's link created another xml_oarchive, causing a runtime error...doh!
// 2. the call to make_nvp populates the sandwich instance(sw) which is then returned...
SGMLDocument load_sandwich(const std::string &file_name)
{
SGMLDocument sw;
boost::filesystem::fstream ifs(file_name, std::ios::binary | std::ios::in);
boost::archive::xml_iarchive xml(ifs);
xml >> boost::serialization::make_nvp("SGMLDocument", sw);
return sw;
}*/
}

View File

@ -1,69 +0,0 @@
#ifndef __SGML_DOCUMENT_H__
#define __SGML_DOCUMENT_H__
#include "documentStructure.h"
// #include "xmlStructure.h"
#include <iostream>
#include <string>
namespace TERCpp
{
// In-memory representation of an SGML evaluation file (a refset or tstset):
// the file-level attributes (doc type, set id, source/target language) plus
// the list of documents it contains.
class SGMLDocument
{
public:
SGMLDocument();
// SGMLDocument(string FileName);
// SGMLDocument(const std::string &bread, const std::string &cheese, const std::string &meat, const bool pickle):
// m_bread(bread), m_cheese(cheese), m_meat(meat), m_pickle(pickle){};
// ~SGMLDocument(){};
// void output()
// {
// std::cout << "Bread = " << m_bread << ", Cheese = " << m_cheese <<
// ", Meat = " << m_meat << ", Has Pickle = " << m_pickle << std::endl;
//
// }
// Setters for the file-level SGML attributes.
void setDocType ( string s );
void setSetId ( string s );
void setSrcLang ( string s );
void setTgtLang ( string s );
// Getters for the file-level SGML attributes.
string getDocType();
string getSetId();
string getSrcLang();
string getTgtLang();
// xmlStructure getStructure();
// Appends a document (by value) to the container.
void addDocument ( documentStructure doc );
// Pointers returned below reference elements of `documents`; they are
// invalidated when the vector reallocates (e.g. on addDocument).
documentStructure* getLastDocument();
documentStructure* getFirstDocument();
// Number of documents currently stored.
int getSize();
// Lookup by docid; see the .cpp for the error behaviour on a miss.
documentStructure* getDocument(string docId);
private:
string docType;
string setId;
string srcLang;
string tgtLang;
// xmlStructure xmlStruct;
vector<documentStructure> documents;
// std::string m_bread, m_cheese, m_meat;
// bool m_pickle;
//
// // declare the boost::serialization::access class as a friend of SGMLDocument
// friend class boost::serialization::access;
// // Create a serialize function for serialization::access to use, I guess you could regard this as a kind of callback function!
// template<class archive>
// void serialize(archive& ar, const unsigned int version)
// {
// // Note: As explained in the original tut. the & operator is overridden in boost to use
// // << or >> depending on the direction of the data (read/write)
// using boost::serialization::make_nvp;
// ar & make_nvp("Bread", m_bread);
// ar & make_nvp("Cheese", m_cheese);
// ar & make_nvp("Meats", m_meat);
// ar & make_nvp("HasPickle", m_pickle);
// // Also note: strings in the first parameter of make_nvp cannot contain spaces!
// }
};
}
#endif // __SGML_DOCUMENT_H__

View File

@ -1,40 +0,0 @@
/*
 * TinyXML-based helpers: dump an XML tree to stdout, or convert a parsed
 * SGML/XML file into a SGMLDocument.
 * (The previous header comment, "Generic hashmap manipulation functions",
 * was copied from another file and did not describe this header.)
 */
#ifndef __XMLSTRUCTURE_H__
#define __XMLSTRUCTURE_H__
#include "sgmlDocument.h"
#include "documentStructure.h"
#include "stdio.h"
#include <iostream>
#include <string>
#include "tinyxml.h"
using namespace std;
namespace TERCpp
{
class xmlStructure
{
private:
// Number of spaces per indentation level used by the dump helpers.
unsigned int NUM_INDENTS_PER_SPACE;
// void dump_attribs_to_SGMLDocuments ( SGMLDocument* arg1, const TiXmlElement* arg2 );
// Copies the attributes of `pElement` into the right part of `sgmlDoc`,
// dispatching on the tree depth `indent`.
void dump_attribs_to_SGMLDocuments ( SGMLDocument* sgmlDoc, TiXmlElement* pElement, unsigned int indent );
public:
xmlStructure();
// Indentation strings for pretty-printing (with and without a "+" marker).
const char * getIndent( unsigned int numIndents );
const char * getIndentAlt( unsigned int numIndents );
// Debug printers: dump attributes / a node subtree / a whole file to stdout.
int dump_attribs_to_stdout(TiXmlElement* pElement, unsigned int indent);
void dump_to_stdout( TiXmlNode* pParent, unsigned int indent );
void dump_to_stdout(const char* pFilename);
// Recursively copies a parsed TinyXML tree into a SGMLDocument.
void copy_to_SGMLDocument(SGMLDocument* sgmlDoc ,TiXmlNode* pParent, unsigned int indent );
// Parses `FileName` and returns the resulting SGMLDocument.
SGMLDocument dump_to_SGMLDocument(string FileName);
};
}
#endif // __XMLSTRUCTURE_H__

View File

@ -1,111 +0,0 @@
/*
www.sourceforge.net/projects/tinyxml
Original file by Yves Berquin.
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.
Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
/*
* THIS FILE WAS ALTERED BY Tyge Løvset, 7. April 2005.
*/
#ifndef TIXML_USE_STL
#include "tinystr.h"
// Error value for the find primitives: the largest possible size_type,
// mirroring std::string::npos.
const TiXmlString::size_type TiXmlString::npos = static_cast< TiXmlString::size_type >(-1);
// Shared representation for all empty strings (size 0, capacity 0, "").
TiXmlString::Rep TiXmlString::nullrep_ = { 0, 0, { '\0' } };
// Grows the internal buffer to at least `cap` bytes, preserving the current
// contents.  Never shrinks: a no-op when cap <= capacity().
void TiXmlString::reserve (size_type cap)
{
if (cap > capacity()) {
// Build a fresh string with the same length but larger capacity, copy
// the payload across, then take ownership of it via swap (RAII frees
// the old rep when `tmp` goes out of scope).
TiXmlString tmp;
tmp.init(length(), cap);
memcpy(tmp.start(), data(), length());
swap(tmp);
}
}
// Replaces the contents with the first `len` bytes of `str`.
// Reallocates when the buffer is too small, or when it is grossly
// oversized (cap > 3*(len+8)) so memory is returned on big shrinks;
// otherwise reuses the buffer in place.
TiXmlString& TiXmlString::assign(const char* str, size_type len)
{
size_type cap = capacity();
if (len > cap || cap > 3*(len + 8)) {
TiXmlString tmp;
tmp.init(len);
memcpy(tmp.start(), str, len);
swap(tmp);
} else {
// memmove (not memcpy): `str` may point into this string's own buffer.
memmove(start(), str, len);
set_size(len);
}
return *this;
}
// Appends the first `len` bytes of `str` to this string.
TiXmlString& TiXmlString::append(const char* str, size_type len)
{
size_type newsize = length() + len;
if (newsize > capacity()) {
// Grow geometrically (new size + old capacity) to amortise repeated appends.
reserve (newsize + capacity());
}
// memmove (not memcpy): `str` may alias this string's own buffer.
memmove(finish(), str, len);
set_size(newsize);
return *this;
}
// Concatenates two TiXmlStrings into a freshly allocated result.
TiXmlString operator + (const TiXmlString & a, const TiXmlString & b)
{
TiXmlString result;
result.reserve(a.length() + b.length());
result.append(a.data(), a.length());
result.append(b.data(), b.length());
return result;
}
// Concatenates a TiXmlString and a C string into a new TiXmlString.
TiXmlString operator + (const TiXmlString & a, const char* b)
{
const TiXmlString::size_type rhsLen = static_cast<TiXmlString::size_type>( strlen(b) );
TiXmlString result;
result.reserve(a.length() + rhsLen);
result.append(a.data(), a.length());
result.append(b, rhsLen);
return result;
}
// Concatenates a C string and a TiXmlString into a new TiXmlString.
TiXmlString operator + (const char* a, const TiXmlString & b)
{
const TiXmlString::size_type lhsLen = static_cast<TiXmlString::size_type>( strlen(a) );
TiXmlString result;
result.reserve(lhsLen + b.length());
result.append(a, lhsLen);
result.append(b.data(), b.length());
return result;
}
#endif // TIXML_USE_STL

View File

@ -1,337 +0,0 @@
/*
www.sourceforge.net/projects/tinyxml
Original file by Yves Berquin.
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.
Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
/*
* THIS FILE WAS ALTERED BY Tyge Lovset, 7. April 2005.
*
* - completely rewritten. compact, clean, and fast implementation.
* - sizeof(TiXmlString) = pointer size (4 bytes on 32-bit systems)
* - fixed reserve() to work as per specification.
* - fixed buggy compares operator==(), operator<(), and operator>()
* - fixed operator+=() to take a const ref argument, following spec.
* - added "copy" constructor with length, and most compare operators.
* - added swap(), clear(), size(), capacity(), operator+().
*/
#ifndef TIXML_USE_STL
#ifndef TIXML_STRING_INCLUDED
#define TIXML_STRING_INCLUDED
#include <assert.h>
#include <string.h>
/* The support for explicit isn't that universal, and it isn't really
required - it is used to check that the TiXmlString class isn't incorrectly
used. Be nice to old compilers and macro it here:
*/
#if defined(_MSC_VER) && (_MSC_VER >= 1200 )
// Microsoft visual studio, version 6 and higher.
#define TIXML_EXPLICIT explicit
#elif defined(__GNUC__) && (__GNUC__ >= 3 )
// GCC version 3 and higher.
#define TIXML_EXPLICIT explicit
#else
#define TIXML_EXPLICIT
#endif
/*
TiXmlString is an emulation of a subset of the std::string template.
Its purpose is to allow compiling TinyXML on compilers with no or poor STL support.
Only the member functions relevant to the TinyXML project have been implemented.
The buffer allocation is made by a simplistic power of 2 like mechanism : if we increase
a string and there's no more room, we allocate a buffer twice as big as we need.
*/
// Minimal std::string replacement used when TinyXML is built without STL.
// A single heap block (Rep) stores size, capacity and the characters; all
// empty strings share the static `nullrep_` and therefore never allocate.
class TiXmlString
{
public :
// The size type used
typedef size_t size_type;
// Error value for find primitive
static const size_type npos; // = -1;
// TiXmlString empty constructor — points at the shared null rep, no allocation.
TiXmlString () : rep_(&nullrep_) {
}
// TiXmlString copy constructor
TiXmlString ( const TiXmlString & copy) : rep_(0) {
init(copy.length());
memcpy(start(), copy.data(), length());
}
// TiXmlString constructor, based on a NUL-terminated C string
TIXML_EXPLICIT TiXmlString ( const char * copy) : rep_(0) {
init( static_cast<size_type>( strlen(copy) ));
memcpy(start(), copy, length());
}
// TiXmlString constructor, based on a buffer and explicit length
TIXML_EXPLICIT TiXmlString ( const char * str, size_type len) : rep_(0) {
init(len);
memcpy(start(), str, len);
}
// TiXmlString destructor — releases the rep unless it is the shared null rep.
~TiXmlString () {
quit();
}
// = operator
TiXmlString& operator = (const char * copy) {
return assign( copy, (size_type)strlen(copy));
}
// = operator
TiXmlString& operator = (const TiXmlString & copy) {
return assign(copy.start(), copy.length());
}
// += operator. Maps to append
TiXmlString& operator += (const char * suffix) {
return append(suffix, static_cast<size_type>( strlen(suffix) ));
}
// += operator. Maps to append
TiXmlString& operator += (char single) {
return append(&single, 1);
}
// += operator. Maps to append
TiXmlString& operator += (const TiXmlString & suffix) {
return append(suffix.data(), suffix.length());
}
// Convert a TiXmlString into a null-terminated char *
const char * c_str () const {
return rep_->str;
}
// Convert a TiXmlString into a char * (need not be null terminated).
const char * data () const {
return rep_->str;
}
// Return the length of a TiXmlString
size_type length () const {
return rep_->size;
}
// Alias for length()
size_type size () const {
return rep_->size;
}
// Checks if a TiXmlString is empty
bool empty () const {
return rep_->size == 0;
}
// Return capacity of string
size_type capacity () const {
return rep_->capacity;
}
// single char extraction (bounds checked only via assert in debug builds)
const char& at (size_type index) const {
assert( index < length() );
return rep_->str[ index ];
}
// [] operator
char& operator [] (size_type index) const {
assert( index < length() );
return rep_->str[ index ];
}
// find a char in a string. Return TiXmlString::npos if not found
size_type find (char lookup) const {
return find(lookup, 0);
}
// find a char in a string from an offset. Return TiXmlString::npos if not found
size_type find (char tofind, size_type offset) const {
if (offset >= length()) return npos;
for (const char* p = c_str() + offset; *p != '\0'; ++p) {
if (*p == tofind) return static_cast< size_type >( p - c_str() );
}
return npos;
}
// Resets to the empty string, freeing any owned buffer.
void clear () {
//Lee:
//The original was just too strange, though correct:
// TiXmlString().swap(*this);
//Instead use the quit & re-init:
quit();
init(0,0);
}
/* Function to reserve a big amount of data when we know we'll need it. Be aware that this
function DOES NOT clear the content of the TiXmlString if any exists.
*/
void reserve (size_type cap);
TiXmlString& assign (const char* str, size_type len);
TiXmlString& append (const char* str, size_type len);
// O(1) exchange of the two reps; never throws.
void swap (TiXmlString& other) {
Rep* r = rep_;
rep_ = other.rep_;
other.rep_ = r;
}
private:
void init(size_type sz) {
init(sz, sz);
}
// Stores the new size and writes the terminating NUL in one step.
void set_size(size_type sz) {
rep_->str[ rep_->size = sz ] = '\0';
}
char* start() const {
return rep_->str;
}
char* finish() const {
return rep_->str + rep_->size;
}
// Heap layout: header followed by the characters (str is over-allocated
// past its declared [1] size — see init()).
struct Rep {
size_type size, capacity;
char str[1];
};
void init(size_type sz, size_type cap) {
if (cap) {
// Lee: the original form:
// rep_ = static_cast<Rep*>(operator new(sizeof(Rep) + cap));
// doesn't work in some cases of new being overloaded. Switching
// to the normal allocation, although use an 'int' for systems
// that are overly picky about structure alignment.
const size_type bytesNeeded = sizeof(Rep) + cap;
const size_type intsNeeded = ( bytesNeeded + sizeof(int) - 1 ) / sizeof( int );
rep_ = reinterpret_cast<Rep*>( new int[ intsNeeded ] );
rep_->str[ rep_->size = sz ] = '\0';
rep_->capacity = cap;
} else {
// Zero capacity: share the static empty rep, nothing to allocate.
rep_ = &nullrep_;
}
}
void quit() {
if (rep_ != &nullrep_) {
// The rep_ is really an array of ints. (see the allocator, above).
// Cast it back before delete, so the compiler won't incorrectly call destructors.
delete [] ( reinterpret_cast<int*>( rep_ ) );
}
}
Rep * rep_;
static Rep nullrep_;
} ;
// Comparison operators.  ==, != compare content; <, >, <=, >= use strcmp
// (byte-wise) ordering.  All are defined in terms of == and < below.
inline bool operator == (const TiXmlString & a, const TiXmlString & b)
{
return ( a.length() == b.length() ) // optimization on some platforms
&& ( strcmp(a.c_str(), b.c_str()) == 0 ); // actual compare
}
inline bool operator < (const TiXmlString & a, const TiXmlString & b)
{
return strcmp(a.c_str(), b.c_str()) < 0;
}
inline bool operator != (const TiXmlString & a, const TiXmlString & b)
{
return !(a == b);
}
inline bool operator > (const TiXmlString & a, const TiXmlString & b)
{
return b < a;
}
inline bool operator <= (const TiXmlString & a, const TiXmlString & b)
{
return !(b < a);
}
inline bool operator >= (const TiXmlString & a, const TiXmlString & b)
{
return !(a < b);
}
// Mixed TiXmlString / C-string equality comparisons.
inline bool operator == (const TiXmlString & a, const char* b)
{
return strcmp(a.c_str(), b) == 0;
}
inline bool operator == (const char* a, const TiXmlString & b)
{
return b == a;
}
inline bool operator != (const TiXmlString & a, const char* b)
{
return !(a == b);
}
inline bool operator != (const char* a, const TiXmlString & b)
{
return !(b == a);
}
// Concatenation; implemented in tinystr.cpp.
TiXmlString operator + (const TiXmlString & a, const TiXmlString & b);
TiXmlString operator + (const TiXmlString & a, const char* b);
TiXmlString operator + (const char* a, const TiXmlString & b);
/*
TiXmlOutStream is an emulation of std::ostream. It is based on TiXmlString.
Only the operators that we need for TinyXML have been developped.
*/
// Emulation of std::ostream built on TiXmlString: operator<< simply appends.
class TiXmlOutStream : public TiXmlString
{
public :
// TiXmlOutStream << operator.
TiXmlOutStream & operator << (const TiXmlString & in) {
*this += in;
return *this;
}
// TiXmlOutStream << operator.
TiXmlOutStream & operator << (const char * in) {
*this += in;
return *this;
}
} ;
#endif // TIXML_STRING_INCLUDED
#endif // TIXML_USE_STL

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,52 +0,0 @@
/*
www.sourceforge.net/projects/tinyxml
Original code (2.0 and earlier )copyright (c) 2000-2006 Lee Thomason (www.grinninglizard.com)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.
Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
#include "tinyxml.h"
// The goal of the separate error file is to make the first
// step towards localization. tinyxml (currently) only supports
// English error messages, but they could now be translated.
//
// It also cleans up the code a bit.
//
// English message for each TinyXML error code.
// NOTE(review): the order must stay in sync with the error-code enum that
// defines TIXML_ERROR_STRING_COUNT (declared in tinyxml.h) — the code
// value is used directly as an index into this array.
const char* TiXmlBase::errorString[ TIXML_ERROR_STRING_COUNT ] = {
"No error",
"Error",
"Failed to open file",
"Memory allocation failed.",
"Error parsing Element.",
"Failed to read Element name",
"Error reading Element value.",
"Error reading Attributes.",
"Error: empty tag.",
"Error reading end tag.",
"Error parsing Unknown.",
"Error parsing Comment.",
"Error parsing Declaration.",
"Error document empty.",
"Error null (0) or unexpected EOF found in input stream.",
"Error parsing CDATA.",
"Error when TiXmlDocument added to document, because TiXmlDocument can only be at the root.",
};

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,6 @@
#include "tools.h"
using namespace std;
using namespace boost::xpressive;
namespace Tools
{
@ -179,6 +178,8 @@ string lowerCase ( string str )
}
return str;
}
/*
string removePunctTercom ( string str )
{
string str_mod = str;
@ -504,6 +505,7 @@ string normalizeStd ( string str )
return str_mod;
}
*/
param copyParam ( param p )
{

View File

@ -8,7 +8,6 @@
#include <stdlib.h>
#include <string>
#include <sstream>
#include <boost/xpressive/xpressive.hpp>
using namespace std;
@ -63,4 +62,4 @@ string printParams(param p);
// };
param copyParam(param p);
}
#endif
#endif

View File

@ -1,332 +0,0 @@
#include "xmlStructure.h"
// The following class defines a hash function for strings
using namespace std;
namespace TERCpp
{
// tutorial demo program
// ----------------------------------------------------------------------
// STDOUT dump and indenting utility functions
// ----------------------------------------------------------------------
// const unsigned int NUM_INDENTS_PER_SPACE=2;
// Default constructor: two spaces per indentation level for the dumpers.
xmlStructure::xmlStructure()
{
NUM_INDENTS_PER_SPACE = 2;
}
// Returns an indentation string ending in "+ " for `numIndents` levels.
// Implemented by returning a pointer into a constant string, so the depth
// is silently capped at the string's length — no allocation ever happens.
const char * xmlStructure::getIndent ( unsigned int numIndents )
{
static const char * pINDENT = " + ";
static const unsigned int LENGTH = strlen ( pINDENT );
unsigned int n = numIndents * NUM_INDENTS_PER_SPACE;
if ( n > LENGTH )
n = LENGTH;
return &pINDENT[ LENGTH-n ];
}
// Same as getIndent but produces spaces only (no "+" marker at the end).
const char * xmlStructure::getIndentAlt ( unsigned int numIndents )
{
static const char * pINDENT = " ";
static const unsigned int LENGTH = strlen ( pINDENT );
unsigned int n = numIndents * NUM_INDENTS_PER_SPACE;
if ( n > LENGTH )
n = LENGTH;
return &pINDENT[ LENGTH-n ];
}
// Prints every attribute of `pElement` (name, raw value, and its int/double
// interpretation when the value parses as such), indented `indent` levels.
// Returns the number of attributes printed; 0 for a null element.
int xmlStructure::dump_attribs_to_stdout ( TiXmlElement* pElement, unsigned int indent )
{
if ( !pElement )
return 0;
TiXmlAttribute* pAttrib = pElement->FirstAttribute();
int i = 0;
int ival;
double dval;
const char* pIndent = getIndent ( indent );
printf ( "\n" );
while ( pAttrib ) {
printf ( "%s%s: value=[%s]", pIndent, pAttrib->Name(), pAttrib->Value() );
if ( pAttrib->QueryIntValue ( &ival ) == TIXML_SUCCESS )
printf ( " int=%d", ival );
if ( pAttrib->QueryDoubleValue ( &dval ) == TIXML_SUCCESS )
printf ( " d=%1.1f", dval );
printf ( "\n" );
i++;
pAttrib = pAttrib->Next();
}
return i;
}
// Recursively pretty-prints the subtree rooted at `pParent` to stdout,
// one node per line, dispatching on the TinyXML node type.
// Depth-first: each child is printed with indent + 1.
void xmlStructure::dump_to_stdout ( TiXmlNode* pParent, unsigned int indent = 0 )
{
if ( !pParent )
return;
TiXmlNode* pChild;
TiXmlText* pText;
int t = pParent->Type();
printf ( "%s", getIndent ( indent ) );
int num;
switch ( t ) {
case TiXmlNode::DOCUMENT:
printf ( "Document" );
break;
case TiXmlNode::ELEMENT:
printf ( "Element [%s]", pParent->Value() );
// Also print the element's attributes and report how many there were.
num = dump_attribs_to_stdout ( pParent->ToElement(), indent + 1 );
switch ( num ) {
case 0:
printf ( " (No attributes)" );
break;
case 1:
printf ( "%s1 attribute", getIndentAlt ( indent ) );
break;
default:
printf ( "%s%d attributes", getIndentAlt ( indent ), num );
break;
}
break;
case TiXmlNode::COMMENT:
printf ( "Comment: [%s]", pParent->Value() );
break;
case TiXmlNode::UNKNOWN:
printf ( "Unknown" );
break;
case TiXmlNode::TEXT:
pText = pParent->ToText();
printf ( "Text: [%s]", pText->Value() );
break;
case TiXmlNode::DECLARATION:
printf ( "Declaration" );
break;
default:
break;
}
printf ( "\n" );
// Recurse into all children.
for ( pChild = pParent->FirstChild(); pChild != 0; pChild = pChild->NextSibling() ) {
dump_to_stdout ( pChild, indent + 1 );
}
}
// Loads the named file with TinyXML and dumps its structure to STDOUT;
// prints a message (but does not abort) if the file cannot be loaded.
void xmlStructure::dump_to_stdout ( const char* pFilename )
{
TiXmlDocument doc ( pFilename );
bool loadOkay = doc.LoadFile();
if ( loadOkay ) {
printf ( "\n%s:\n", pFilename );
dump_to_stdout ( &doc ); // defined later in the tutorial
} else {
printf ( "Failed to load file \"%s\"\n", pFilename );
}
}
// Loads the XML file `FileName` with TinyXML and converts the parsed tree
// into a SGMLDocument via copy_to_SGMLDocument().
// @param FileName  path of the SGML/XML file to parse
// @return the populated SGMLDocument, by value
// Terminates the process when the file cannot be loaded/parsed.
SGMLDocument xmlStructure::dump_to_SGMLDocument ( string FileName )
{
TiXmlDocument doc ( FileName.c_str() );
SGMLDocument to_return;
bool isLoaded = doc.LoadFile();
if ( isLoaded ) {
copy_to_SGMLDocument ( &to_return, &doc, ( unsigned int ) 0 );
} else {
cerr << "ERROR : xmlStructure::dump_to_SGMLDocument : Failed to load file " << FileName << endl;
// Bug fix: exit(0) signals success to the calling shell/script even though
// this is a fatal error; exit with a failure status instead.
exit ( 1 );
}
return to_return;
}
// Recursively copies the TinyXML subtree `pParent` into `sgmlDoc`.
// Elements named refset/tstset set the document type; each <doc> adds a
// documentStructure; each <seg> adds a segmentStructure; attribute copying
// is delegated to dump_attribs_to_SGMLDocuments().  Text nodes found at
// tree depth 5 are treated as segment content (assumes the fixed layout
// root/set/doc/p?/seg/text — TODO confirm) and are normalised according to
// xmlParams (tokenisation, lower-casing, punctuation removal) before being
// stored.  `indent` is the current recursion depth, starting at 0.
void xmlStructure::copy_to_SGMLDocument ( SGMLDocument* sgmlDoc, TiXmlNode* pParent, unsigned int indent )
{
if ( !pParent )
return;
TiXmlNode* pChild;
TiXmlText* pText;
int t = pParent->Type();
// printf ( "%s", getIndent ( indent ) );
// int num;
string elementValue;
switch ( t ) {
case TiXmlNode::DOCUMENT:
// printf ( "Document" );
break;
case TiXmlNode::ELEMENT:
printf ( "Element [%s]", pParent->Value() );
elementValue = pParent->Value();
// Root element: remember whether this is a reference or a test set.
if ( ( ( int ) elementValue.compare ( "refset" ) == 0 ) || ( ( int ) elementValue.compare ( "tstset" ) == 0 ) ) {
sgmlDoc->setDocType ( elementValue );
} else if ( ( int ) elementValue.compare ( "doc" ) == 0 ) {
// New <doc>: append an empty documentStructure to fill in below.
documentStructure tmp_doc;
sgmlDoc->addDocument ( tmp_doc );
} else if ( ( int ) elementValue.compare ( "seg" ) == 0 ) {
// New <seg>: append an empty segment to the current document.
segmentStructure tmp_seg;
( sgmlDoc->getLastDocument() )->addSegments ( tmp_seg );
}
dump_attribs_to_SGMLDocuments ( sgmlDoc, pParent->ToElement(), indent + 1 );
// num = dump_attribs_to_stdout ( pParent->ToElement(), indent + 1 );
// switch ( num )
// {
// case 0:
// printf ( " (No attributes)" );
// break;
// case 1:
// printf ( "%s1 attribute", getIndentAlt ( indent ) );
// break;
// default:
// printf ( "%s%d attributes", getIndentAlt ( indent ), num );
// break;
// }
break;
// case TiXmlNode::COMMENT:
// printf ( "Comment: [%s]", pParent->Value() );
// break;
//
// case TiXmlNode::UNKNOWN:
// printf ( "Unknown" );
// break;
case TiXmlNode::TEXT:
pText = pParent->ToText();
// printf ( "Text: [%s]", pText->Value() );
if ( indent == 5 ) {
documentStructure * l_tmp_doc = sgmlDoc->getLastDocument();
segmentStructure * l_tmp_seg = l_tmp_doc->getLastSegments();
string l_text = pText->Value();
string line_mod=l_text;
// Unless tercom-compatible mode is on, tokenize punctuation.
if ( !xmlParams.tercomLike ) {
if ( xmlParams.debugMode ) {
cerr << "DEBUG xmlStructure::copy_to_SGMLDocument : line NOT tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
}
if ( xmlParams.debugMode ) {
cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "TERCOM AT FALSE " << endl << "END DEBUG" << endl;
}
line_mod = tokenizePunct ( line_mod );
}
// Case-insensitive scoring: lower-case the segment text.
if ( !xmlParams.caseOn ) {
if ( xmlParams.debugMode ) {
cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "CASEON AT FALSE " << endl << "END DEBUG" << endl;
}
line_mod = lowerCase ( line_mod );
}
// Optionally strip punctuation (tercom-style or TERCpp-style).
if ( xmlParams.noPunct ) {
if ( xmlParams.debugMode ) {
cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "NOPUNCT AT TRUE " << endl << "END DEBUG" << endl;
}
if ( !xmlParams.tercomLike ) {
line_mod = removePunctTercom ( line_mod );
} else {
line_mod = removePunct ( line_mod );
}
}
if ( xmlParams.debugMode ) {
cerr << "DEBUG xmlStructure::copy_to_SGMLDocument : line tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
}
l_tmp_seg->addContent ( line_mod );
}
break;
// case TiXmlNode::DECLARATION:
// printf ( "Declaration" );
// break;
default:
break;
}
// printf ( "\n" );
// Depth-first recursion over all children.
for ( pChild = pParent->FirstChild(); pChild != 0; pChild = pChild->NextSibling() ) {
copy_to_SGMLDocument ( sgmlDoc, pChild, indent + 1 );
}
}
// Copies the attributes of `pElement` into the appropriate part of
// `sgmlDoc`, dispatching on the tree depth `indent`:
//   1 -> root element attributes (setid, srclang, tgtlang)
//   2 -> <doc> attributes (docid, sysid) on the last added document
//   4 -> <seg> attributes (id) on the last added segment
// Attributes at other depths are ignored.
void xmlStructure::dump_attribs_to_SGMLDocuments ( SGMLDocument * sgmlDoc, TiXmlElement* pElement, unsigned int indent )
{
if ( !pElement )
return;
TiXmlAttribute* pAttrib = pElement->FirstAttribute();
// int i = 0;
// int ival;
// double dval;
// const char* pIndent = getIndent ( indent );
// printf ( "\n" );
while ( pAttrib ) {
string attribut = pAttrib->Name();
switch ( indent ) {
case 1 : {
if ( attribut.compare ( "setid" ) == 0 ) {
sgmlDoc->setSetId ( pAttrib->Value() );
}
if ( attribut.compare ( "srclang" ) == 0 ) {
sgmlDoc->setSrcLang ( pAttrib->Value() );
}
if ( attribut.compare ( "tgtlang" ) == 0 ) {
sgmlDoc->setTgtLang ( pAttrib->Value() );
}
}
break;
case 2: {
documentStructure * tmp_doc_bis = sgmlDoc->getLastDocument();
if ( attribut.compare ( "docid" ) == 0 ) {
tmp_doc_bis->setDocId ( pAttrib->Value() );
}
if ( attribut.compare ( "sysid" ) == 0 ) {
tmp_doc_bis->setSysId ( pAttrib->Value() );
}
}
break;
case 4: {
documentStructure * l_tmp_doc = sgmlDoc->getLastDocument();
segmentStructure * l_tmp_seg = l_tmp_doc->getLastSegments();
if ( attribut.compare ( "id" ) == 0 ) {
l_tmp_seg->setSegId ( pAttrib->Value() );
}
// else
// if (attribut.compare("Text")==0)
// {
// tmp_seg.addContent(pAttrib->Value());
// }
}
break;
default:
break;
}
// printf ( "%s%s: value=[%s]", pIndent, pAttrib->Name(), pAttrib->Value() );
// if ( pAttrib->QueryIntValue ( &ival ) == TIXML_SUCCESS )
// printf ( " int=%d", ival );
// if ( pAttrib->QueryDoubleValue ( &dval ) == TIXML_SUCCESS )
// printf ( " d=%1.1f", dval );
// printf ( "\n" );
// i++;
pAttrib = pAttrib->Next();
}
// return i;
}
// std::size_t hashValue(std::string key){}
}

View File

@ -1,40 +0,0 @@
/*
 * TinyXML-based helpers: dump an XML tree to stdout, or convert a parsed
 * SGML/XML file into a SGMLDocument.
 * (The previous header comment, "Generic hashmap manipulation functions",
 * was copied from another file and did not describe this header.)
 */
#ifndef __XMLSTRUCTURE_H__
#define __XMLSTRUCTURE_H__
#include "sgmlDocument.h"
#include "documentStructure.h"
#include "stdio.h"
#include <iostream>
#include <string>
#include "tinyxml.h"
using namespace std;
namespace TERCpp
{
class xmlStructure
{
private:
// Number of spaces per indentation level used by the dump helpers.
unsigned int NUM_INDENTS_PER_SPACE;
// void dump_attribs_to_SGMLDocuments ( SGMLDocument* arg1, const TiXmlElement* arg2 );
// Copies the attributes of `pElement` into the right part of `sgmlDoc`,
// dispatching on the tree depth `indent`.
void dump_attribs_to_SGMLDocuments ( SGMLDocument* sgmlDoc, TiXmlElement* pElement, unsigned int indent );
public:
xmlStructure();
// Indentation strings for pretty-printing (with and without a "+" marker).
const char * getIndent ( unsigned int numIndents );
const char * getIndentAlt ( unsigned int numIndents );
// Debug printers: dump attributes / a node subtree / a whole file to stdout.
int dump_attribs_to_stdout ( TiXmlElement* pElement, unsigned int indent );
void dump_to_stdout ( TiXmlNode* pParent, unsigned int indent );
void dump_to_stdout ( const char* pFilename );
// Recursively copies a parsed TinyXML tree into a SGMLDocument.
void copy_to_SGMLDocument ( SGMLDocument* sgmlDoc , TiXmlNode* pParent, unsigned int indent );
// Parses `FileName` and returns the resulting SGMLDocument.
SGMLDocument dump_to_SGMLDocument ( string FileName );
// Normalisation options (tokenisation, case, punctuation, debug) applied
// to segment text while copying; public so the caller can configure them.
param xmlParams;
};
}
#endif // __XMLSTRUCTURE_H__

View File

@ -78,7 +78,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
}
ostringstream stats;
stats << result.numEdits << " " << result.averageWords << " " << result.scoreAv() << " " ;
stats << result.numEdits*100.0 << " " << result.averageWords*100.0 << " " << result.scoreAv()*100.0 << " " ;
string stats_str = stats.str();
entry.set ( stats_str );
}
@ -94,6 +94,8 @@ float TerScorer::calculateScore ( const vector<int>& comps )
return (1.0+(num / denom));
}
}
/*
float TerScorer::calculateScore ( const vector<float>& comps )
{
float denom = 1.0 * comps[1];
@ -105,3 +107,4 @@ float TerScorer::calculateScore ( const vector<float>& comps )
return (1.0+(num / denom));
}
}
*/

View File

@ -43,9 +43,9 @@ public:
};
// protected:
protected:
float calculateScore(const vector<int>& comps);
float calculateScore(const vector<float>& comps);
// float calculateScore(const vector<float>& comps);
private:
string javaEnv;

View File

@ -26,6 +26,7 @@ typedef vector<statscore_t> statscores_t;
typedef float FeatureStatsType;
typedef FeatureStatsType* featstats_t;
typedef map<string,FeatureStatsType> sparse_featstats_t;
//typedef vector<FeatureStatsType> featstats_t;
typedef vector<FeatureStats> featarray_t;
typedef vector<FeatureArray> featdata_t;

View File

@ -23,12 +23,14 @@
#include "Timer.h"
#include "Util.h"
#include "../moses/src/ThreadPool.h"
float min_interval = 1e-3;
using namespace std;
void usage(void)
void usage(int ret)
{
cerr<<"usage: mert -d <dimensions> (mandatory )"<<endl;
cerr<<"[-n] retry ntimes (default 1)"<<endl;
@ -42,9 +44,14 @@ void usage(void)
cerr<<"[--scfile|-S] comma separated list of scorer data files (default score.data)"<<endl;
cerr<<"[--ffile|-F] comma separated list of feature data files (default feature.data)"<<endl;
cerr<<"[--ifile|-i] the starting point data file (default init.opt)"<<endl;
#ifdef WITH_THREADS
cerr<<"[--threads|-T] use multiple threads (default 1)"<<endl;
#endif
cerr<<"[--shard-count] Split data into shards, optimize for each shard and average"<<endl;
cerr<<"[--shard-size] Shard size as proportion of data. If 0, use non-overlapping shards"<<endl;
cerr<<"[-v] verbose level"<<endl;
cerr<<"[--help|-h] print this message and exit"<<endl;
exit(1);
exit(ret);
}
static struct option long_options[] = {
@ -60,12 +67,48 @@ static struct option long_options[] = {
{"scfile",1,0,'S'},
{"ffile",1,0,'F'},
{"ifile",1,0,'i'},
#ifdef WITH_THREADS
{"threads", required_argument,0,'T'},
#endif
{"shard-count", required_argument, 0, 'a'},
{"shard-size", required_argument, 0, 'b'},
{"verbose",1,0,'v'},
{"help",no_argument,0,'h'},
{0, 0, 0, 0}
};
int option_index;
/**
 * Runs an optimisation, or a random restart: wraps one Optimizer::Run call
 * (from the starting point m_point) so it can be scheduled on a thread pool.
 **/
class OptimizationTask : public Moses::Task
{
public:
// Takes a non-owning pointer to the optimizer and copies the starting point.
OptimizationTask(Optimizer* optimizer, const Point& point) :
m_optimizer(optimizer), m_point(point) {}
// The creator keeps ownership and reads the result afterwards, so the
// pool must not delete the task.
// NOTE(review): presumably overrides a virtual in Moses::Task — confirm.
bool DeleteAfterExecution() {
return false;
}
// Executes the optimisation and stores the resulting score.
void Run() {
m_score = m_optimizer->Run(m_point);
}
// Score produced by Run(); only meaningful after the task has executed.
statscore_t getScore() const {
return m_score;
}
const Point& getPoint() const {
return m_point;
}
private:
Optimizer* m_optimizer;
Point m_point;
statscore_t m_score;
};
int main (int argc, char **argv)
{
@ -83,6 +126,11 @@ int main (int argc, char **argv)
int nrandom=0;
int seed=0;
bool hasSeed = false;
#ifdef WITH_THREADS
size_t threads=1;
#endif
float shard_size = 0;
size_t shard_count = 0;
string type("powell");
string scorertype("BLEU");
string scorerconfig("");
@ -140,12 +188,37 @@ int main (int argc, char **argv)
case 'v':
setverboselevel(strtol(optarg,NULL,10));
break;
#ifdef WITH_THREADS
case 'T':
threads = strtol(optarg, NULL, 10);
if (threads < 1) threads = 1;
break;
#endif
case 'a':
shard_count = strtof(optarg,NULL);
break;
case 'b':
shard_size = strtof(optarg,NULL);
break;
case 'h':
usage(0);
break;
default:
usage();
usage(1);
}
}
if (pdim < 0)
usage();
usage(1);
cerr << "shard_size = " << shard_size << " shard_count = " << shard_count << endl;
if (shard_size && !shard_count) {
cerr << "Error: shard-size provided without shard-count" << endl;
exit(1);
}
if (shard_size > 1 || shard_size < 0) {
cerr << "Error: shard-size should be between 0 and 1" << endl;
exit(1);
}
if (hasSeed) {
cerr << "Seeding random numbers with " << seed << endl;
@ -230,6 +303,12 @@ int main (int argc, char **argv)
PrintUserTime("Data loaded");
// starting point score over latest n-best, accumulative n-best
//vector<unsigned> bests;
//compute bests with sparse features needs to be implemented
//currently sparse weights are not even loaded
//statscore_t score = TheScorer->score(bests);
if (tooptimizestr.length() > 0) {
cerr << "Weights to optimize: " << tooptimizestr << endl;
@ -257,63 +336,116 @@ int main (int argc, char **argv)
}
if (pairedrankfile.compare("") != 0) {
D.sample_ranked_pairs(pairedrankfile);
D.sampleRankedPairs(pairedrankfile);
PrintUserTime("Stopping...");
exit(0);
}
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start_list[0],type,nrandom);
O->SetScorer(TheScorer);
O->SetFData(D.getFeatureData());
// run with specified starting points
stringstream oss;
statscore_t best=0, mean=0, var=0;
Point bestP;
for(int i=0;i<start_list.size();i++) {
Point P(start_list[i], min, max);//Generate from the full feature set. Warning: must be done after Optimizer initialization
statscore_t score=O->Run(P);
oss.str("");
oss << "Specified starting point number " << (1+i) << ", score: " << score;
if (i==0 || score>best) {
best=score;
bestP=P;
oss << " (new best)";
}
mean+=score;
var+=(score*score);
PrintUserTime(oss.str());
// treat sparse features just like regular features
if (D.hasSparseFeatures()) {
D.mergeSparseFeatures();
}
// run with random starting points
for(int i=0; i<ntry; i++) {
Point P(start_list[0], min, max);
P.Randomize(); // randomize within min and max as given to the constructor
statscore_t score=O->Run(P);
oss.str("");
oss << "Randomized starting point number " << (1+i) << ", score: " << score;
if(score>best) {
best=score;
bestP=P;
oss << " (new best)";
}
mean+=score;
var+=(score*score);
PrintUserTime(oss.str());
#ifdef WITH_THREADS
cerr << "Creating a pool of " << threads << " threads" << endl;
Moses::ThreadPool pool(threads);
#endif
Point::setpdim(pdim);
Point::setdim(tooptimize.size());
//starting points consist of specified points and random restarts
vector<Point> startingPoints;
for (size_t i = 0; i < start_list.size(); ++i) {
startingPoints.push_back(Point(start_list[i],min,max));
}
mean/=(float)ntry;
var/=(float)ntry;
var=sqrt(abs(var-mean*mean));
for (int i = 0; i < ntry; ++i) {
startingPoints.push_back(Point(start_list[0],min,max));
startingPoints.back().Randomize();
}
vector<vector<OptimizationTask*> > allTasks(1);
//optional sharding
vector<Data> shards;
if (shard_count) {
D.createShards(shard_count, shard_size, scorerconfig, shards);
allTasks.resize(shard_count);
}
//launch tasks
for (size_t i = 0 ; i < allTasks.size(); ++i) {
Data& data = D;
if (shard_count) data = shards[i]; //use the sharded data if it exists
vector<OptimizationTask*>& tasks = allTasks[i];
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start_list[0],type,nrandom);
O->SetScorer(data.getScorer());
O->SetFData(data.getFeatureData());
//A task for each start point
for (size_t j = 0; j < startingPoints.size(); ++j) {
OptimizationTask* task = new OptimizationTask(O,startingPoints[j]);
tasks.push_back(task);
#ifdef WITH_THREADS
pool.Submit(task);
#else
task->Run();
#endif
}
}
//wait for all threads to finish
#ifdef WITH_THREADS
pool.Stop(true);
#endif
statscore_t total = 0;
Point totalP;
//collect results
for (size_t i = 0; i < allTasks.size(); ++i) {
statscore_t best=0, mean=0, var=0;
Point bestP;
for (size_t j = 0; j < allTasks[i].size(); ++j) {
statscore_t score = allTasks[i][j]->getScore();
mean += score;
var += score*score;
if (score > best) {
bestP = allTasks[i][j]->getPoint();
best = score;
}
delete allTasks[i][j];
}
mean/=(float)ntry;
var/=(float)ntry;
var=sqrt(abs(var-mean*mean));
if (verboselevel()>1)
cerr<<"shard " << i << " best score: "<< best << " variance of the score (for "<<ntry<<" try): "<<var<<endl;
totalP += bestP;
total += best;
if (verboselevel()>1)
cerr << "bestP " << bestP << endl;
}
//cerr << "totalP: " << totalP << endl;
Point finalP = totalP * (1.0 / allTasks.size());
statscore_t final = total / allTasks.size();
if (verboselevel()>1)
cerr<<"best score: "<< best << " variance of the score (for "<<ntry<<" try): "<<var<<endl;
cerr << "bestP: " << finalP << endl;
// L1-Normalization of the best Point
if ((int)tooptimize.size() == pdim)
bestP.NormalizeL1();
finalP.NormalizeL1();
cerr << "Best point: " << bestP << " => " << best << endl;
cerr << "Best point: " << finalP << " => " << final << endl;
ofstream res("weights.txt");
res<<bestP<<endl;
res<<finalP<<endl;
PrintUserTime("Stopping...");
}

View File

@ -1,46 +0,0 @@
package MertRegressionTesting;
use strict;
# if your tests need a new version of the test data, increment this
# and make sure that a moses-regression-tests-vX.Y is available
use constant TESTING_DATA_VERSION => '0.1';
# find the data directory in a few likely locations and make sure
# that it is the correct version
sub find_data_directory
{
my ($test_script_root, $data_dir) = @_;
my $data_version = TESTING_DATA_VERSION;
my @ds = ();
my $mrtp = "mert-reg-test-data-$data_version";
push @ds, $data_dir if defined $data_dir;
push @ds, "$test_script_root/$mrtp";
push @ds, "/tmp/$mrtp";
push @ds, "/var/tmp/$mrtp";
foreach my $d (@ds) {
next unless (-d $d);
return $d;
}
print STDERR<<EOT;
You do not appear to have the regression testing data installed.
You may either specify a non-standard location when running
the test suite with the --data-dir option,
or, you may install it in any one of the following
standard locations: $test_script_root, /tmp, or /var/tmp with these
commands:
cd <DESIRED_INSTALLATION_DIRECTORY>
MODIFY ACCORDING TO IRSTLM
wget http://www.statmt.org/moses/reg-testing/mert-regression-tests-v$data_version.tar
tar xf mert-regression-tests-v$data_version.tar
rm mert-regression-tests-v$data_version.tar
EOT
exit 1;
}
1;

View File

@ -1,88 +0,0 @@
#!/usr/bin/perl -w
use strict;
my ($results, $truth) = @ARGV;
my ($report, $pass, $fail) = compare_results("$results/results.dat", "$truth/results.dat");
open OUT, ">$results/Summary";
print OUT $report;
print $report;
close OUT;
if ($fail > 0) {
print <<EOT;
There were failures in this test run. Please analyze the results carefully.
EOT
exit 1;
}
exit 0;
sub compare_results {
my ($testf, $truthf) = @_;
my $test = read_results($testf);
my $truth = read_results($truthf);
my $ct1 = delete $truth->{'COMPARISON_TYPE'};
my $ct2 = delete $test->{'COMPARISON_TYPE'};
my $pass = 0;
my $fail = 0;
my $report = '';
foreach my $k (sort keys %$truth) {
$report .= "test-name=$k\tresult=";
if (!exists $test->{$k}) {
$report .= "missing from test results\n";
$fail++;
next;
}
my $truthv = $truth->{$k} || '';
my $testv = delete $test->{$k} || '';
if ($ct1->{$k} eq '=') {
if ($truthv eq $testv) {
$report .= "pass\n";
$pass++;
} else {
$report .= "fail\n\tTRUTH=$truthv\n\t TEST=$testv\n";
$fail++;
}
} else { # numeric difference
$testv=$testv?$testv:0;
$truthv=$truthv?$truthv:0;
my $diff = $testv - $truthv;
if ($diff == 0) { $report .= "identical\n"; next; }
$report .= "BASELINE=$truthv, TEST=$testv\t DELTA=$diff";
if ($truthv != 0) {
my $pct = $diff/$truthv;
my $t = sprintf "\t PCT CHANGE=%4.2f", $pct*100;
$report .= $t;
}
$report .= "\n";
}
}
foreach my $k (sort keys %$test) {
$fail++;
$report .= "test-name=$k\tfound in TEST but not in TRUTH.\n";
}
$report .= "\nTESTS PASSED=$pass\nTESTS FAILED=$fail\n";
return $report, $pass, $fail;
}
sub read_results {
my ($file) = @_;
open IN, "<$file" or die "Could not open $file!";
my %res;
while (my $l = <IN>) {
if ($l =~ /^([A-Za-z0-9_]+)\s*([=~])\s*(.+)$/) {
my ($key, $comparison_type, $value) = ($1, $2, $3);
$res{$key} = $value;
$res{'COMPARISON_TYPE'}->{$key}=$comparison_type;
}
}
close IN;
return \%res;
}

View File

@ -1,105 +0,0 @@
#!/usr/bin/perl -w
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use Getopt::Long;
############################################################
my @tests = qw (
mert-basic
extractor-txt
extractor-bin
);
my @qsubtests = qw (
);
if (@qsubtests){
my $cmd=&getQsubCmd();
if (!defined($cmd)){
print STDERR "Regression tests (@qsubtests) can not run on $ENV{HOST}\nbecause SGE is not installed\n\n";
}else{
push @tests, @qsubtests;
}
}
###########################################################
use MertRegressionTesting;
use File::Temp qw ( tempfile );
use POSIX qw ( strftime );
my $test_dir;
my $BIN_TEST = $script_dir;
my $data_dir;
my $mert_scripts_dir;
GetOptions("data-dir=s" => \$data_dir,
"mert-scripts-dir=s"=> \$mert_scripts_dir,
) or exit 1;
$data_dir = MertRegressionTesting::find_data_directory($BIN_TEST, $data_dir);
my $test_run = "$BIN_TEST/run-single-test.pl --data-dir=$data_dir";
$test_dir = $script_dir . "/tests";
$test_run .= " --test-dir=$test_dir" if $test_dir;
$test_run .= " --mert-scripts-dir=$mert_scripts_dir" if $mert_scripts_dir;
print "Data directory: $data_dir\n";
print "Running tests: @tests\n\n";
print "TEST NAME STATUS PATH TO RESULTS\n";
my $lb = "---------------------------------------------------------------------------------------------------------\n";
print $lb;
my $fail = 0;
my @failed;
foreach my $test (@tests) {
my $cmd = "$test_run --test=$test";
my ($res, $output, $results_path) = do_test($cmd);
format STDOUT =
@<<<<<<<<<<<<<<<<<<<<<< @<<<<<<<<< @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$test, $res, $results_path
.
write;
if ($res eq 'FAIL') {
print "$lb$output$lb";
$fail++;
push @failed, $test;
} else {
# TOTAL_WALLTIME result=BASELINE=11, TEST=12 DELTA=1 PCT CHANGE=9.09
if ($output =~ /TOTAL_WALLTIME\s+result\s*=\s*([^\n]+)/o) {
print "\t\tTiming statistics: $1\n";
}
}
}
my $total = scalar @tests;
my $fail_percentage = int(100 * $fail / $total);
my $pass_percentage = int(100 * ($total-$fail) / $total);
print "\n$pass_percentage% of the tests passed.\n";
print "$fail_percentage% of the tests failed.\n";
if ($fail_percentage>0) { print "\nPLEASE INVESTIGATE THESE FAILED TESTS: @failed\n"; }
sub do_test {
my ($test) = @_;
my $o = `$test 2>&1`;
my $res = 'PASS';
$res = 'FAIL' if ($? > 0);
my $od = '';
if ($o =~ /RESULTS AVAILABLE IN: (.*)$/m) {
$od = $1;
$o =~ s/^RESULTS AVAIL.*$//mo;
}
return ($res, $o, $od);
}
sub getQsubCmd {
my $a =`which qsub | head -1 | awk '{print \$1}'`;
chomp($a);
if ($a && -e $a){ return $a; }
else{ return undef; }
}

View File

@ -1,9 +0,0 @@
#! /bin/sh -w
bin=$1; shift
testdir=$1; shift
cd $testdir
$bin/mert --scfile data/SCORESTAT.txt --ffile data/FEATSTAT.txt --ifile data/INIT -d 14 -n 20 -r 1000 2>&1 | grep -i "^Best"
$bin/mert --scfile data/SCORESTAT.bin --ffile data/FEATSTAT.bin --ifile data/INIT -d 14 -n 20 -r 1000 2>&1 | grep -i "^Best"

View File

@ -1,220 +0,0 @@
#!/usr/bin/env python
#
# Mert test suite.
# Created by Barry Haddow
#
# This script downloads data from www.statmt.org, and runs tests of mert,
# comparing weights against expected and producing timing information.
#
import ConfigParser
import logging
import optparse
import os
import os.path
import re
import string
import subprocess
import sys
import time
import urllib
import warnings
warnings.filterwarnings(action="ignore",message="tmpnam")
log = logging.getLogger("testmert")
dataurl = "http://www.statmt.org/moses/reg-testing/mert/"
def getMertDirectory():
scriptdir = os.path.dirname(__file__)
if not os.path.isabs(scriptdir):
scriptdir = os.path.join(os.getcwd(),scriptdir)
scriptdir = os.path.normpath(scriptdir)
return os.path.dirname(scriptdir)
class Mert:
"""Controls operation of mert loop"""
def __init__(self,weightfile,reffile,scorertype="BLEU",retries="20"):
self.reffile = reffile
self.scorertype = scorertype
self.workingdir = os.tmpnam()
os.mkdir(self.workingdir)
self.mertdir = getMertDirectory()
self.iteration = 1 # iteration number of inner loop
self.retries = retries
self.extractortimes = []
self.merttimes = []
os.system("cp %s %s" % \
(weightfile,self.getFileName("weights",self.iteration-1)))
# calculate dimension from weight file
weightfh = open(weightfile)
line = weightfh.readline()
self.dimension = repr(len(line.split()))
weightfh.close()
def innerLoop(self, nbestfile):
"""Perform iteration of the inner loop. Returns location of
weights file"""
log.debug("Inner loop: %d" % self.iteration)
# run extractor
scorefile = self.getFileName("scores",self.iteration)
featurefile = self.getFileName("features",self.iteration)
weightinfile = self.getFileName("weights",self.iteration-1)
cmd = [os.path.join(self.mertdir,"extractor"),"--reference",
self.reffile, "--nbest",nbestfile,"--sctype",self.scorertype,\
"--scfile", scorefile,"--ffile",featurefile]
if self.iteration > 1:
prevscorefile = self.getFileName("scores",self.iteration-1)
prevfeaturefile = self.getFileName("features",self.iteration-1)
cmd = cmd + ["--prev-scfile",prevscorefile , "--prev-ffile", prevfeaturefile ]
log.debug("Running: " + string.join(cmd))
start = time.time()
ret = subprocess.call(cmd)
self.extractortimes.append(time.time()-start)
if ret != 0:
raise RuntimeError("Failed to execute extractor: return code %d" % ret)
# run mert
cmd = [os.path.join(self.mertdir,"mert"),"--sctype",\
self.scorertype, "--scfile", scorefile, "--ffile", featurefile,\
"--ifile",weightinfile, "-d", self.dimension,"-n",self.retries]
log.debug("Running: " + string.join(cmd))
start = time.time()
ret = subprocess.call(cmd, cwd=self.workingdir)
self.merttimes.append(time.time()-start)
if ret != 0:
raise RuntimeError("Failed to execute mert: return code %d" % ret)
weightoutfile = self.getFileName("weights",self.iteration)
os.system("mv %s %s" % (os.path.join(self.workingdir,\
"weights.txt"), weightoutfile))
self.iteration = self.iteration + 1
return weightoutfile
def getFileName(self,stem,iteration):
return os.path.join(self.workingdir,stem+"."+repr(iteration))
def cleanup(self):
os.system("rm -rf %s" % self.workingdir)
class Test:
"""A mert test"""
def __init__(self,datadir):
self.datadir = datadir
config = ConfigParser.ConfigParser()
config.read(os.path.join(datadir,"config"))
self.iterations = config.getint("test","iterations")
log.debug("Test iterations: %d" % self.iterations)
self.tolerance = 0.00001
def run(self):
"""Run the test, return a boolean indicating success or failure"""
weightfile = os.path.join(self.datadir,"weights.0")
reffile = os.path.join(self.datadir,"reference")
self.mert = Mert(weightfile,reffile)
self.diffs = []
for i in range(self.iterations):
nbestfile = os.path.join(self.datadir,"nbest." + repr(i+1) + ".gz")
weightfile = self.mert.innerLoop(nbestfile)
expectedweightfile = os.path.join(self.datadir,"weights."+repr(i+1))
expectedweights = self.getWeights(expectedweightfile)
weights = self.getWeights(weightfile)
log.debug("Expected weights: " + repr(expectedweights))
log.debug("Actual weights: " + repr(weights))
diff = False
for j in range(len(weights)):
if abs(weights[j]-expectedweights[j]) > self.tolerance:
log.debug("Weight %d does not match: " % j)
diff = True
break
else:
log.debug("Weights match expected")
self.diffs.append(diff)
self.mert.cleanup()
def getWeights(self,weightfile):
"""Load a weight set from a file"""
weightfh = open(weightfile)
line = weightfh.readline()
weights = [float(w) for w in line.split()]
weightfh.close()
return weights
def printSummary(self):
"""Print a summary of the results"""
print "RESULTS: ", self.datadir
print "Weights matching expected: ",
for diff in self.diffs:
print not diff,
print
print "Extractor times: ",
for etime in self.mert.extractortimes:
print "%7.3f" % etime,
print "ave: %7.3f" % (sum(self.mert.extractortimes)/self.iterations)
print "Optimisation times: ",
for mtime in self.mert.merttimes:
print "%7.3f" % mtime,
print "ave: %7.3f" % (sum(self.mert.merttimes)/self.iterations)
def getTestList():
listfh = urllib.urlopen(os.path.join(dataurl,"tests.txt"))
tests = []
for line in listfh:
tests.append(line[:-1])
listfh.close()
return tests
def list():
"""List all available tests"""
tests = getTestList()
print "Available tests:"
for test in tests:
print test
def runAll(datadir):
"""Run all available tests"""
for test in getTestList():
runTest(test,datadir)
def runTest(testname,datadir):
log.info("Test started: " + testname)
if not os.path.isdir(datadir):
os.mkdir(datadir)
testdir = os.path.join(datadir,testname)
# Check if the test exists, download if necessary
if os.path.isdir(testdir):
log.debug("Directory %s already exists: not downloading" % testdir)
else:
testurl = os.path.join(dataurl,testname + ".tgz")
log.debug("Retrieving test data from " + testurl)
(arname,headers) = urllib.urlretrieve(testurl)
os.system("cd %s; tar zxf %s" % (datadir,arname))
log.debug("Done")
if not os.path.isdir(testdir):
raise RuntimeError("Test %s did not unpack properly" % testname)
test = Test(testdir)
test.run()
test.printSummary()
log.info("Test ended: " + testname)
def main():
logging.basicConfig(level = logging.DEBUG)
parser = optparse.OptionParser("usage: %prog [options] list|run|runall [testname]")
parser.add_option("-d", "--datadir", action="store", default="data",
dest="datadir", help="Data directory to use", metavar="DIR")
(options,args) = parser.parse_args()
if len(args) < 1:
parser.error("Need to specify an action")
if args[0] == "list":
list()
else:
datadir = options.datadir
if args[0] == "runall":
runAll(datadir)
elif args[0] == "run":
if len(args) < 2:
parser.error("The run action requires a test name")
runTest(args[1],datadir)
if __name__ == "__main__":
main()

View File

@ -297,6 +297,11 @@
GCC_ENABLE_FIX_AND_CONTINUE = YES;
GCC_MODEL_TUNING = G5;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
TRACE_ENABLE,
_LARGE_FILES,
"_FILE_OFFSET_BITS=64",
);
HEADER_SEARCH_PATHS = ../moses/src;
INSTALL_PATH = /usr/local/bin;
LIBRARY_SEARCH_PATHS = (
@ -325,6 +330,11 @@
ALWAYS_SEARCH_USER_PATHS = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
GCC_MODEL_TUNING = G5;
GCC_PREPROCESSOR_DEFINITIONS = (
TRACE_ENABLE,
_LARGE_FILES,
"_FILE_OFFSET_BITS=64",
);
HEADER_SEARCH_PATHS = ../moses/src;
INSTALL_PATH = /usr/local/bin;
LIBRARY_SEARCH_PATHS = (

View File

@ -361,77 +361,55 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
}
}
std::string lastName = "";
// translation components
if (StaticData::Instance().GetInputType()==SentenceInput) {
// translation components for text input
vector<PhraseDictionaryFeature*> pds = system->GetPhraseDictionaries();
if (pds.size() > 0) {
if (labeledOutput)
out << "tm: ";
vector<PhraseDictionaryFeature*>::iterator iter;
for (iter = pds.begin(); iter != pds.end(); ++iter) {
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
for (size_t j = 0; j<scores.size(); ++j)
out << scores[j] << " ";
}
}
} else {
// translation components for Confusion Network input
// first translation component has GetNumInputScores() scores from the input Confusion Network
// at the beginning of the vector
vector<PhraseDictionaryFeature*> pds = system->GetPhraseDictionaries();
if (pds.size() > 0) {
vector<PhraseDictionaryFeature*>::iterator iter;
const vector<PhraseDictionaryFeature*>& pds = system->GetPhraseDictionaries();
if (pds.size() > 0) {
iter = pds.begin();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
for( size_t i=0; i<pds.size(); i++ ) {
size_t pd_numinputscore = pds[i]->GetNumInputScores();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
for (size_t j = 0; j<scores.size(); ++j){
size_t pd_numinputscore = (*iter)->GetNumInputScores();
if (pd_numinputscore) {
if (labeledOutput)
out << "I: ";
for (size_t j = 0; j < pd_numinputscore; ++j)
out << scores[j] << " ";
}
for (iter = pds.begin() ; iter != pds.end(); ++iter) {
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
size_t pd_numinputscore = (*iter)->GetNumInputScores();
if (iter == pds.begin() && labeledOutput)
out << "tm: ";
for (size_t j = pd_numinputscore; j < scores.size() ; ++j)
out << scores[j] << " ";
}
if (labeledOutput && (i == 0) ){
if ((j == 0) || (j == pd_numinputscore)){
lastName = pds[i]->GetScoreProducerWeightShortName(j);
out << " " << lastName << ":";
}
}
out << " " << scores[j];
}
}
}
// word penalty
if (labeledOutput)
out << "w: ";
out << " w: ";
out << path.GetScoreBreakdown().GetScoreForProducer(system->GetWordPenaltyProducer()) << " ";
// generation
const vector<GenerationDictionary*> gds = system->GetGenerationDictionaries();
const vector<GenerationDictionary*>& gds = system->GetGenerationDictionaries();
if (gds.size() > 0) {
if (labeledOutput)
out << "g: ";
vector<GenerationDictionary*>::const_iterator iter;
for (iter = gds.begin(); iter != gds.end(); ++iter) {
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
for (size_t j = 0; j<scores.size(); j++) {
out << scores[j] << " ";
}
for( size_t i=0; i<gds.size(); i++ ) {
size_t pd_numinputscore = gds[i]->GetNumInputScores();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
for (size_t j = 0; j<scores.size(); ++j){
if (labeledOutput && (i == 0) ){
if ((j == 0) || (j == pd_numinputscore)){
lastName = gds[i]->GetScoreProducerWeightShortName(j);
out << " " << lastName << ":";
}
}
out << " " << scores[j];
}
}
}
// total
out << "||| " << path.GetTotalScore();

View File

@ -207,22 +207,40 @@ void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<Fa
}
}
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
{
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
AlignVec alignments = ai.GetSortedAlignments();
AlignVec::const_iterator it;
for (it = alignments.begin(); it != alignments.end(); ++it) {
const std::pair<size_t,size_t> &alignment = **it;
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
}
}
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
{
ostringstream out;
size_t targetOffset = 0;
for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
const Hypothesis &edge = *edges[currEdge];
const TargetPhrase &tp = edge.GetCurrTargetPhrase();
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
AlignmentInfo::const_iterator it;
for (it = tp.GetAlignmentInfo().begin(); it != tp.GetAlignmentInfo().end(); ++it) {
out << it->first + sourceOffset << "-" << it->second + targetOffset << " ";
}
OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
targetOffset += tp.GetSize();
}
out << std::endl;
}
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;
OutputAlignment(out, edges);
collector->Write(lineNo,out.str());
}
@ -364,68 +382,45 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
// print scores with feature names
OutputAllFeatureScores( out, system, path );
string lastName;
// translation components
if (StaticData::Instance().GetInputType()==SentenceInput) {
// translation components for text input
vector<PhraseDictionaryFeature*> pds = system->GetPhraseDictionaries();
if (pds.size() > 0) {
if (labeledOutput)
out << " tm:";
vector<PhraseDictionaryFeature*>::iterator iter;
for (iter = pds.begin(); iter != pds.end(); ++iter) {
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
for (size_t j = 0; j<scores.size(); ++j)
out << " " << scores[j];
}
}
} else {
// translation components for Confusion Network input
// first translation component has GetNumInputScores() scores from the input Confusion Network
// at the beginning of the vector
vector<PhraseDictionaryFeature*> pds = system->GetPhraseDictionaries();
if (pds.size() > 0) {
vector<PhraseDictionaryFeature*>::iterator iter;
const vector<PhraseDictionaryFeature*>& pds = system->GetPhraseDictionaries();
if (pds.size() > 0) {
iter = pds.begin();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
for( size_t i=0; i<pds.size(); i++ ) {
size_t pd_numinputscore = pds[i]->GetNumInputScores();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
for (size_t j = 0; j<scores.size(); ++j){
size_t pd_numinputscore = (*iter)->GetNumInputScores();
if (pd_numinputscore) {
if (labeledOutput)
out << " I:";
for (size_t j = 0; j < pd_numinputscore; ++j)
out << " " << scores[j];
}
for (iter = pds.begin() ; iter != pds.end(); ++iter) {
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
size_t pd_numinputscore = (*iter)->GetNumInputScores();
if (iter == pds.begin() && labeledOutput)
out << " tm:";
for (size_t j = pd_numinputscore; j < scores.size() ; ++j)
out << " " << scores[j];
}
if (labeledOutput && (i == 0) ){
if ((j == 0) || (j == pd_numinputscore)){
lastName = pds[i]->GetScoreProducerWeightShortName(j);
out << " " << lastName << ":";
}
}
out << " " << scores[j];
}
}
}
// generation
const vector<GenerationDictionary*> gds = system->GetGenerationDictionaries();
const vector<GenerationDictionary*>& gds = system->GetGenerationDictionaries();
if (gds.size() > 0) {
if (labeledOutput)
out << " g: ";
vector<GenerationDictionary*>::const_iterator iter;
for (iter = gds.begin(); iter != gds.end(); ++iter) {
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
for (size_t j = 0; j<scores.size(); j++) {
out << scores[j] << " ";
}
for( size_t i=0; i<gds.size(); i++ ) {
size_t pd_numinputscore = gds[i]->GetNumInputScores();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
for (size_t j = 0; j<scores.size(); ++j){
if (labeledOutput && (i == 0) ){
if ((j == 0) || (j == pd_numinputscore)){
lastName = gds[i]->GetScoreProducerWeightShortName(j);
out << " " << lastName << ":";
}
}
out << " " << scores[j];
}
}
}
@ -451,18 +446,17 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
}
if (includeWordAlignment) {
out << " |||";
out << " ||| ";
for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
const Hypothesis &edge = *edges[currEdge];
const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
WordsRange targetRange = path.GetTargetWordsRange(edge);
const int sourceOffset = sourceRange.GetStartPos();
const int targetOffset = targetRange.GetStartPos();
const AlignmentInfo AI = edge.GetCurrTargetPhrase().GetAlignmentInfo();
AlignmentInfo::const_iterator iter;
for (iter = AI.begin(); iter != AI.end(); ++iter) {
out << " " << iter->first+sourceOffset << "-" << iter->second+targetOffset;
}
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
OutputAlignment(out, ai, sourceOffset, targetOffset);
}
}

View File

@ -341,6 +341,7 @@ int main(int argc, char** argv)
exit(1);
}
// create threadpool, if using multi-threaded decoding
// note: multi-threading is done on sentence-level,
// each thread translates one sentence

View File

@ -16,10 +16,11 @@
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <algorithm>
#include <cassert>
#include "AlignmentInfo.h"
#include "TypeDef.h"
#include "StaticData.h"
namespace Moses
{
@ -41,8 +42,47 @@ void AlignmentInfo::BuildNonTermIndexMap()
for (p = begin(); p != end(); ++p) {
m_nonTermIndexMap[p->second] = i++;
}
}
bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b) {
if(a->second < b->second) return true;
if(a->second == b->second) return (a->first < b->first);
return false;
}
std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const
{
std::vector< const std::pair<size_t,size_t>* > ret;
CollType::const_iterator iter;
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter)
{
const std::pair<size_t,size_t> &alignPair = *iter;
ret.push_back(&alignPair);
}
const StaticData &staticData = StaticData::Instance();
WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort();
switch (wordAlignmentSort)
{
case NoSort:
break;
case TargetOrder:
std::sort(ret.begin(), ret.end(), compare_target);
break;
default:
assert(false);
}
return ret;
}
std::ostream& operator<<(std::ostream &out, const AlignmentInfo &alignmentInfo)
{
AlignmentInfo::const_iterator iter;

View File

@ -51,6 +51,8 @@ class AlignmentInfo
return m_nonTermIndexMap;
}
std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const;
private:
// AlignmentInfo objects should only be created by an AlignmentInfoCollection
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs)

View File

@ -67,7 +67,7 @@ public:
return "BleuScoreFeature";
}
std::string GetScoreProducerWeightShortName() const
std::string GetScoreProducerWeightShortName(unsigned) const
{
return "bl";
}

View File

@ -190,7 +190,6 @@ void ChartCell::GetSearchGraph(long translationId, std::ostream &outputSearchGra
const ChartHypothesisCollection &coll = iterOutside->second;
coll.GetSearchGraph(translationId, outputSearchGraphStream, reachable);
}
}
std::ostream& operator<<(std::ostream &out, const ChartCell &cell)

View File

@ -259,7 +259,8 @@ void ChartHypothesisCollection::GetSearchGraph(long translationId, std::ostream
HCType::const_iterator iter;
for (iter = m_hypos.begin() ; iter != m_hypos.end() ; ++iter) {
ChartHypothesis &mainHypo = **iter;
if (reachable.find(mainHypo.GetId()) != reachable.end()) {
if (StaticData::Instance().GetUnprunedSearchGraph() ||
reachable.find(mainHypo.GetId()) != reachable.end()) {
outputSearchGraphStream << translationId << " " << mainHypo << endl;
}

View File

@ -254,7 +254,7 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
delete tpcollBerkeleyDb;
m_cache[tpCollFilePos] = targetPhraseCollection;
} else {
// jsut get out of cache
// just get out of cache
targetPhraseCollection = iterCache->second;
}

View File

@ -44,7 +44,7 @@ size_t DistortionScoreProducer::GetNumScoreComponents() const
return 1;
}
std::string DistortionScoreProducer::GetScoreProducerWeightShortName() const
std::string DistortionScoreProducer::GetScoreProducerWeightShortName(unsigned) const
{
return "d";
}
@ -105,7 +105,7 @@ size_t WordPenaltyProducer::GetNumScoreComponents() const
return 1;
}
std::string WordPenaltyProducer::GetScoreProducerWeightShortName() const
std::string WordPenaltyProducer::GetScoreProducerWeightShortName(unsigned) const
{
return "w";
}
@ -126,7 +126,7 @@ size_t UnknownWordPenaltyProducer::GetNumScoreComponents() const
}
std::string UnknownWordPenaltyProducer::GetScoreProducerWeightShortName() const
std::string UnknownWordPenaltyProducer::GetScoreProducerWeightShortName(unsigned) const
{
return "u";
}

View File

@ -21,7 +21,7 @@ public:
const WordsRange &prev, const WordsRange &curr, const int FirstGapPosition) const;
size_t GetNumScoreComponents() const;
std::string GetScoreProducerWeightShortName() const;
std::string GetScoreProducerWeightShortName(unsigned) const;
size_t GetNumInputScores() const;
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
@ -49,7 +49,7 @@ public:
WordPenaltyProducer() : StatelessFeatureFunction("WordPenalty") {}
size_t GetNumScoreComponents() const;
std::string GetScoreProducerWeightShortName() const;
std::string GetScoreProducerWeightShortName(unsigned) const;
size_t GetNumInputScores() const;
virtual void Evaluate(
@ -64,7 +64,7 @@ public:
UnknownWordPenaltyProducer() : StatelessFeatureFunction("!UnknownWordPenalty") {}
size_t GetNumScoreComponents() const;
std::string GetScoreProducerWeightShortName() const;
std::string GetScoreProducerWeightShortName(unsigned) const;
size_t GetNumInputScores() const;
virtual bool ComputeValueInTranslationOption() const;

View File

@ -60,7 +60,7 @@ void DynSuffixArray::BuildAuxArrays()
int DynSuffixArray::Rank(unsigned word, unsigned idx)
{
/* use Gerlach's code to make rank faster */
// the number of word in L[0..i]
// the number of words in L[0..i] (minus 1 which is why 'i < idx', not '<=')
int r(0);
for(unsigned i=0; i < idx; ++i)
if(m_L->at(i) == word) ++r;
@ -140,25 +140,33 @@ void DynSuffixArray::Insert(vuint_t* newSent, unsigned newIndex)
}
// Begin stage 4
Reorder(true_pos, LastFirstFunc(kprime)); // actual position vs computed position of cycle (newIndex-1)
cerr << "GETS HERE 13\n";
}
void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
{
//cerr << "j=" << j << "\tj'=" << jprime << endl;
set<pair<unsigned, unsigned> > seen;
while(j != jprime) {
// this 'seenit' check added for data with many loops. will remove after double
// checking.
bool seenit = seen.insert(std::make_pair(j, jprime)).second;
if(seenit) {
for(int i=1; i < m_SA->size(); ++i) {
if(m_corpus->at(m_SA->at(i)) < m_corpus->at(m_SA->at(i-1))) {
cerr << "PROBLEM WITH SUFFIX ARRAY REORDERING. EXITING...\n";
exit(1);
}
}
return;
}
//cerr << "j=" << j << "\tj'=" << jprime << endl;
int tmp, isaIdx(-1);
int isaIdx(-1);
int new_j = LastFirstFunc(j);
cerr << "new_j = " << new_j << endl;
// for SA, L, and F, the element at pos j is moved to j'
tmp = m_L->at(j); // L
m_L->at(j) = m_L->at(jprime);
m_L->at(jprime) = tmp;
tmp = m_SA->at(j); // SA
m_SA->at(j) = m_SA->at(jprime);
m_SA->at(jprime) = tmp;
assert(j <= jprime);
// for SA and L, the element at pos j is moved to pos j'
m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
m_L->erase(m_L->begin() + j);
m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
m_SA->erase(m_SA->begin() + j);
// all ISA values between (j...j'] decremented
for(size_t i = 0; i < m_ISA->size(); ++i) {
if((m_ISA->at(i) == j) && (isaIdx == -1))
@ -180,8 +188,8 @@ void DynSuffixArray::Delete(unsigned index, unsigned num2del)
int true_pos = LastFirstFunc(m_ISA->at(index)); // track cycle shift (newIndex - 1)
for(size_t q = 0; q < num2del; ++q) {
int row = m_ISA->at(index); // gives the position of index in SA and m_F
std::cerr << "row = " << row << std::endl;
std::cerr << "SA[r]/index = " << m_SA->at(row) << "/" << index << std::endl;
//std::cerr << "row = " << row << std::endl;
//std::cerr << "SA[r]/index = " << m_SA->at(row) << "/" << index << std::endl;
true_pos -= (row <= true_pos ? 1 : 0); // track changes
m_L->erase(m_L->begin() + row);
m_F->erase(m_F->begin() + row);
@ -198,7 +206,7 @@ void DynSuffixArray::Delete(unsigned index, unsigned num2del)
}
m_L->at(m_ISA->at(index))= ltmp;
Reorder(LastFirstFunc(m_ISA->at(index)), true_pos);
PrintAuxArrays();
//PrintAuxArrays();
}
void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */)

View File

@ -71,7 +71,7 @@ public:
bool Load(const std::string &filePath, FactorDirection direction);
size_t GetNumScoreComponents() const;
std::string GetScoreProducerWeightShortName() const
std::string GetScoreProducerWeightShortName(unsigned) const
{
return "g";
}

View File

@ -57,7 +57,7 @@ public:
return 1;
};
virtual std::string GetScoreProducerWeightShortName() const {
virtual std::string GetScoreProducerWeightShortName(unsigned) const {
return "lex";
};

View File

@ -276,17 +276,13 @@ void Hypothesis::CalcScore(const SquareMatrix &futureScore)
// cached in the translation option-- there is no principled distinction
const vector<const StatelessFeatureFunction*>& sfs =
m_manager.GetTranslationSystem()->GetStatelessFeatureFunctions();
VERBOSE(3,"There are " << sfs.size() << " stateless feature functions" << endl);
for (unsigned i = 0; i < sfs.size(); ++i) {
VERBOSE(3,"\tStateless score producer:\t" << sfs[i]->GetScoreProducerDescription() << endl);
sfs[i]->Evaluate(m_targetPhrase, &m_scoreBreakdown);
}
const vector<const StatefulFeatureFunction*>& ffs =
m_manager.GetTranslationSystem()->GetStatefulFeatureFunctions();
VERBOSE(3,"There are " << ffs.size() << " stateful feature functions" << endl);
for (unsigned i = 0; i < ffs.size(); ++i) {
VERBOSE(3,"\tStateful score producer:\t" << ffs[i]->GetScoreProducerDescription() << endl);
m_ffStates[i] = ffs[i]->Evaluate(
*this,
m_prevHypo ? m_prevHypo->m_ffStates[i] : NULL,

View File

@ -39,22 +39,34 @@ void LMList::CleanUp()
RemoveAllInColl(m_coll);
}
void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGramScore, ScoreComponentCollection* breakdown) const
void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGramScore, float &retOOVScore, ScoreComponentCollection* breakdown) const
{
const_iterator lmIter;
for (lmIter = begin(); lmIter != end(); ++lmIter) {
const LanguageModel &lm = **lmIter;
const float weightLM = lm.GetWeight();
const float oovWeightLM = lm.GetOOVWeight();
float fullScore, nGramScore;
float fullScore, nGramScore;
size_t oovCount;
// do not process, if factors not defined yet (happens in partial translation options)
if (!lm.Useable(phrase))
continue;
lm.CalcScore(phrase, fullScore, nGramScore);
lm.CalcScore(phrase, fullScore, nGramScore, oovCount);
if (StaticData::Instance().GetLMEnableOOVFeature()) {
vector<float> scores(2);
scores[0] = nGramScore;
scores[1] = oovCount;
breakdown->Assign(&lm, scores);
retOOVScore += oovCount * oovWeightLM;
} else {
breakdown->Assign(&lm, nGramScore); // I'm not sure why += doesn't work here- it should be 0.0 right?
}
breakdown->Assign(&lm, nGramScore); // I'm not sure why += doesn't work here- it should be 0.0 right?
retFullScore += fullScore * weightLM;
retNGramScore += nGramScore * weightLM;
}

View File

@ -34,7 +34,7 @@ public:
void CleanUp();
~LMList();
void CalcScore(const Phrase &phrase, float &retFullScore, float &retNGramScore, ScoreComponentCollection* breakdown) const;
void CalcScore(const Phrase &phrase, float &retFullScore, float &retNGramScore, float &retOOVScore, ScoreComponentCollection* breakdown) const;
void CalcAllLMScores(const Phrase &phrase
, ScoreComponentCollection &nGramOnly
@ -45,7 +45,6 @@ public:
size_t GetMaxNGramOrder() const
{ return m_maxNGramOrder; }
};
}

View File

@ -46,6 +46,7 @@ LanguageModel::LanguageModel(LanguageModelImplementation *implementation) :
StatefulFeatureFunction("LM"),
m_implementation(implementation)
{
m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
#ifndef WITH_THREADS
// ref counting handled by boost otherwise
m_implementation->IncrementReferenceCount();
@ -56,6 +57,7 @@ LanguageModel::LanguageModel(LanguageModel *loadedLM) :
StatefulFeatureFunction("LM"),
m_implementation(loadedLM->m_implementation)
{
m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
#ifndef WITH_THREADS
// ref counting handled by boost otherwise
m_implementation->IncrementReferenceCount();
@ -73,16 +75,23 @@ LanguageModel::~LanguageModel()
// don't inline virtual funcs...
size_t LanguageModel::GetNumScoreComponents() const
{
return 1;
if (m_enableOOVFeature) {
return 2;
} else {
return 1;
}
}
void LanguageModel::CalcScore(const Phrase &phrase
, float &fullScore
, float &ngramScore) const
, float &ngramScore
, size_t &oovCount) const
{
fullScore = 0;
ngramScore = 0;
oovCount = 0;
size_t phraseSize = phrase.GetSize();
if (!phraseSize) return;
@ -110,10 +119,13 @@ void LanguageModel::CalcScore(const Phrase &phrase
// do nothing, don't include prob for <s> unigram
assert(currPos == 0);
} else {
float partScore = m_implementation->GetValueGivenState(contextFactor, *state).score;
fullScore += partScore;
LMResult result = m_implementation->GetValueGivenState(contextFactor, *state);
fullScore += result.score;
if (contextFactor.size() == GetNGramOrder())
ngramScore += partScore;
ngramScore += result.score;
if (contextFactor.size() == 1 && result.unknown)
++oovCount;
}
}
@ -219,7 +231,16 @@ FFState* LanguageModel::Evaluate(
m_implementation->GetState(contextFactor, *res);
}
}
out->PlusEquals(this, lmScore);
if (m_enableOOVFeature) {
vector<float> scores(2);
scores[0] = lmScore;
scores[1] = 0;
out->PlusEquals(this, scores);
} else {
out->PlusEquals(this, lmScore);
}
IFVERBOSE(2) {
hypo.GetManager().GetSentenceStats().AddTimeCalcLM( clock()-t );
}
@ -227,7 +248,15 @@ FFState* LanguageModel::Evaluate(
}
float LanguageModel::GetWeight() const {
return StaticData::Instance().GetAllWeights().GetScoreForProducer(this);
return StaticData::Instance().GetAllWeights().GetScoresForProducer(this)[0];
}
float LanguageModel::GetOOVWeight() const {
if (m_enableOOVFeature) {
return StaticData::Instance().GetAllWeights().GetScoresForProducer(this)[1];
} else {
return 0;
}
}
FFState* LanguageModel::EvaluateChart(

View File

@ -52,6 +52,8 @@ protected:
#else
LanguageModelImplementation *m_implementation;
#endif
bool m_enableOOVFeature;
public:
@ -85,11 +87,13 @@ public:
* Useable() should be called beforehand on the phrase
* \param fullScore scores of all unigram, bigram... of contiguous n-gram of the phrase
* \param ngramScore score of only n-gram of order m_nGramOrder
* \param oovCount number of LM OOVs
*/
void CalcScore(
const Phrase &phrase,
float &fullScore,
float &ngramScore) const;
float &ngramScore,
size_t &oovCount) const;
void CalcScoreChart(
const Phrase &phrase,
@ -104,8 +108,9 @@ public:
float GetWeight() const;
float GetOOVWeight() const;
std::string GetScoreProducerWeightShortName() const
std::string GetScoreProducerWeightShortName(unsigned) const
{
return "lm";
}

View File

@ -97,7 +97,7 @@ float LanguageModelDMapLM::GetValue(
return score;
}
FFState* LanguageModelDMapLM::GetNullContextState() const {
const FFState* LanguageModelDMapLM::GetNullContextState() const {
DMapLMState* state = new DMapLMState();
state->m_last_succeeding_order = GetNGramOrder();
return state;
@ -109,7 +109,7 @@ FFState* LanguageModelDMapLM::GetNewSentenceState() const {
return state;
}
FFState* LanguageModelDMapLM::GetBeginSentenceState() const {
const FFState* LanguageModelDMapLM::GetBeginSentenceState() const {
DMapLMState* state = new DMapLMState();
state->m_last_succeeding_order = GetNGramOrder();
return state;

View File

@ -37,9 +37,9 @@ public:
LMResult GetValueGivenState(const std::vector<const Word*>&, FFState&) const;
LMResult GetValueForgotState(const std::vector<const Word*>&, FFState&) const;
float GetValue(const std::vector<const Word*>&, size_t, size_t*) const;
FFState* GetNullContextState() const;
const FFState* GetNullContextState() const;
FFState* GetNewSentenceState() const;
FFState* GetBeginSentenceState() const;
const FFState* GetBeginSentenceState() const;
FFState* NewState(const FFState*) const;
void CleanUpAfterSentenceProcessing();
void InitializeBeforeSentenceProcessing();

View File

@ -92,8 +92,8 @@ public:
// This is here so models can implement a shortcut to GetValueAndState.
virtual void GetState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
virtual FFState *GetNullContextState() const = 0;
virtual FFState *GetBeginSentenceState() const = 0;
virtual const FFState *GetNullContextState() const = 0;
virtual const FFState *GetBeginSentenceState() const = 0;
virtual FFState *NewState(const FFState *from = NULL) const = 0;
//! max n-gram order of LM
@ -109,6 +109,11 @@ public:
return m_sentenceEndArray;
}
std::string GetScoreProducerWeightShortName(unsigned) const {
return "lm";
}
//! overrideable funtions for IRST LM to cleanup. Maybe something to do with on demand/cache loading/unloading
virtual void InitializeBeforeSentenceProcessing() {};
virtual void CleanUpAfterSentenceProcessing() {};

View File

@ -119,11 +119,11 @@ public:
return ret;
}
FFState *GetNullContextState() const {
const FFState *GetNullContextState() const {
return m_lmImpl->GetNullContextState();
}
FFState *GetBeginSentenceState() const {
const FFState *GetBeginSentenceState() const {
return m_lmImpl->GetBeginSentenceState();
}

View File

@ -22,6 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <cassert>
#include <cstring>
#include <iostream>
#include <stdlib.h>
#include "lm/binary_format.hh"
#include "lm/enumerate_vocab.hh"
#include "lm/model.hh"
@ -40,6 +41,8 @@ using namespace std;
namespace Moses
{
LanguageModelKenBase::~LanguageModelKenBase() {}
namespace
{
@ -78,14 +81,14 @@ struct KenLMState : public FFState {
/** Implementation of single factor LM using Ken's code.
*/
template <class Model> class LanguageModelKen : public LanguageModelSingleFactor
template <class Model> class LanguageModelKen : public LanguageModelKenBase
{
private:
Model *m_ngram;
std::vector<lm::WordIndex> m_lmIdLookup;
bool m_lazy;
FFState *m_nullContextState;
FFState *m_beginSentenceState;
KenLMState m_nullContextState;
KenLMState m_beginSentenceState;
void TranslateIDs(const std::vector<const Word*> &contextFactor, lm::WordIndex *indices) const;
@ -97,12 +100,20 @@ public:
, FactorType factorType
, size_t nGramOrder);
LMResult GetValueGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const;
LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
LMResult GetValueGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const {
return GetKenFullScoreGivenState(contextFactor, state);
}
LMKenResult GetKenFullScoreGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const;
LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const {
return GetKenFullScoreForgotState(contextFactor, outState);
}
LMKenResult GetKenFullScoreForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
void GetState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
FFState *GetNullContextState() const;
FFState *GetBeginSentenceState() const;
const FFState *GetNullContextState() const;
const FFState *GetBeginSentenceState() const;
FFState *NewState(const FFState *from = NULL) const;
lm::WordIndex GetLmID(const std::string &str) const;
@ -159,24 +170,26 @@ template <class Model> bool LanguageModelKen<Model>::Load(const std::string &fil
config.enumerate_vocab = &builder;
config.load_method = m_lazy ? util::LAZY : util::POPULATE_OR_READ;
m_ngram = new Model(filePath.c_str(), config);
try {
m_ngram = new Model(filePath.c_str(), config);
} catch (std::exception &e) {
std::cerr << e.what() << std::endl;
abort();
}
m_nGramOrder = m_ngram->Order();
KenLMState *tmp = new KenLMState();
tmp->state = m_ngram->NullContextState();
m_nullContextState = tmp;
tmp = new KenLMState();
tmp->state = m_ngram->BeginSentenceState();
m_beginSentenceState = tmp;
m_nullContextState.state = m_ngram->NullContextState();
m_beginSentenceState.state = m_ngram->BeginSentenceState();
return true;
}
template <class Model> LMResult LanguageModelKen<Model>::GetValueGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const
template <class Model> LMKenResult LanguageModelKen<Model>::GetKenFullScoreGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const
{
LMResult result;
LMKenResult result;
if (contextFactor.empty()) {
result.score = 0.0;
result.unknown = false;
result.ngram_length = 0;
return result;
}
lm::ngram::State &realState = static_cast<KenLMState&>(state).state;
@ -187,16 +200,18 @@ template <class Model> LMResult LanguageModelKen<Model>::GetValueGivenState(cons
result.score = TransformLMScore(ret.prob);
result.unknown = (new_word == 0);
result.ngram_length = ret.ngram_length;
return result;
}
template <class Model> LMResult LanguageModelKen<Model>::GetValueForgotState(const vector<const Word*> &contextFactor, FFState &outState) const
template <class Model> LMKenResult LanguageModelKen<Model>::GetKenFullScoreForgotState(const vector<const Word*> &contextFactor, FFState &outState) const
{
LMResult result;
LMKenResult result;
if (contextFactor.empty()) {
static_cast<KenLMState&>(outState).state = m_ngram->NullContextState();
result.score = 0.0;
result.unknown = false;
result.ngram_length = 0;
return result;
}
@ -207,6 +222,7 @@ template <class Model> LMResult LanguageModelKen<Model>::GetValueForgotState(con
result.score = TransformLMScore(ret.prob);
result.unknown = (indices[0] == 0);
result.ngram_length = ret.ngram_length;
return result;
}
@ -221,14 +237,14 @@ template <class Model> void LanguageModelKen<Model>::GetState(const std::vector<
m_ngram->GetState(indices, indices + contextFactor.size(), static_cast<KenLMState&>(outState).state);
}
template <class Model> FFState *LanguageModelKen<Model>::GetNullContextState() const
template <class Model> const FFState *LanguageModelKen<Model>::GetNullContextState() const
{
return m_nullContextState;
return &m_nullContextState;
}
template <class Model> FFState *LanguageModelKen<Model>::GetBeginSentenceState() const
template <class Model> const FFState *LanguageModelKen<Model>::GetBeginSentenceState() const
{
return m_beginSentenceState;
return &m_beginSentenceState;
}
template <class Model> FFState *LanguageModelKen<Model>::NewState(const FFState *from) const

View File

@ -28,10 +28,26 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
// Doesn't actually load; moses wants the Load method for that. It needs the file to autodetect binary format.
// kenlm specific score value
struct LMKenResult : public LMResult {
unsigned char ngram_length;
};
// base-class for the actual LanguageModelKen; only here to provide a specific behaviour without exposing the implementation
class LanguageModelKenBase : public LanguageModelSingleFactor {
public:
virtual ~LanguageModelKenBase();
// scoring functions which provide more info than the common interface of LanguageModel
virtual LMKenResult GetKenFullScoreGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const = 0;
virtual LMKenResult GetKenFullScoreForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const = 0;
};
// Doesn't actually load; moses wants the Load method for that. It needs the file to autodetect binary format.
LanguageModelSingleFactor *ConstructKenLM(const std::string &file, bool lazy);
};
}
#endif

View File

@ -286,12 +286,12 @@ FFState *LanguageModelParallelBackoff::NewState(const FFState * /*from*/) const
return NULL;
}
FFState *LanguageModelParallelBackoff::GetNullContextState() const
const FFState *LanguageModelParallelBackoff::GetNullContextState() const
{
return NULL;
}
FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
{
return NULL;
}

View File

@ -90,8 +90,8 @@ public:
void CreateFactors();
LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
FFState *GetNullContextState() const;
FFState *GetBeginSentenceState() const;
const FFState *GetNullContextState() const;
const FFState *GetBeginSentenceState() const;
FFState *NewState(const FFState *from) const;
};

View File

@ -61,12 +61,12 @@ LanguageModelPointerState::LanguageModelPointerState()
LanguageModelPointerState::~LanguageModelPointerState() {}
FFState *LanguageModelPointerState::GetNullContextState() const
const FFState *LanguageModelPointerState::GetNullContextState() const
{
return m_nullContextState;
}
FFState *LanguageModelPointerState::GetBeginSentenceState() const
const FFState *LanguageModelPointerState::GetBeginSentenceState() const
{
return m_beginSentenceState;
}

View File

@ -83,8 +83,8 @@ protected:
virtual ~LanguageModelPointerState();
virtual FFState *GetNullContextState() const;
virtual FFState *GetBeginSentenceState() const;
virtual const FFState *GetNullContextState() const;
virtual const FFState *GetBeginSentenceState() const;
virtual FFState *NewState(const FFState *from = NULL) const;
virtual LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const;

View File

@ -69,11 +69,11 @@ public:
return m_lmImpl->Load(filePath, m_factorType, nGramOrder);
}
FFState *GetNullContextState() const {
const FFState *GetNullContextState() const {
return m_lmImpl->GetNullContextState();
}
FFState *GetBeginSentenceState() const {
const FFState *GetBeginSentenceState() const {
return m_lmImpl->GetBeginSentenceState();
}

View File

@ -42,7 +42,7 @@ public:
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
std::string GetScoreProducerWeightShortName() const {
std::string GetScoreProducerWeightShortName(unsigned) const {
return "d";
};

View File

@ -70,6 +70,7 @@ libmoses_la_HEADERS = \
LanguageModelRemote.h \
LanguageModelSingleFactor.h \
LanguageModelSkip.h \
LanguageModelKen.h \
LexicalReordering.h \
LexicalReorderingState.h \
LexicalReorderingTable.h \

View File

@ -57,6 +57,7 @@ Parameter::Parameter()
AddParam("include-alignment-in-n-best", "include word alignment in the n-best list. default is false");
AddParam("lmodel-file", "location and properties of the language models");
AddParam("lmodel-dub", "dictionary upper bounds of language models");
AddParam("lmodel-oov-feature", "add language model oov feature, one per model");
AddParam("mapping", "description of decoding steps");
AddParam("max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");
AddParam("max-trans-opt-per-coverage", "maximum number of translation options per input span (after applying mapping steps)");
@ -123,6 +124,7 @@ Parameter::Parameter()
AddParam("time-out", "seconds after which is interrupted (-1=no time-out, default is -1)");
AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
#ifdef HAVE_PROTOBUF
AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
#endif
@ -155,6 +157,7 @@ Parameter::Parameter()
AddParam("translation-systems", "specify multiple translation systems, each consisting of an id, followed by a set of models ids, eg '0 T1 R1 L0'");
AddParam("show-weights", "print feature weights and exit");
AddParam("alignment-output-file", "print output word alignments into given file");
AddParam("sort-word-alignment", "Sort word alignments for more consistent display. 0=no sort (default), 1=target order");
}
Parameter::~Parameter()
@ -314,7 +317,8 @@ bool Parameter::Validate()
}
}
if (m_setting["lmodel-file"].size() != m_setting["weight-l"].size()) {
if (m_setting["lmodel-file"].size() * (m_setting.find("lmodel-oov-feature") != m_setting.end() ? 2 : 1)
!= m_setting["weight-l"].size()) {
stringstream errorMsg("");
errorMsg << "Config and parameters specify "
<< static_cast<int>(m_setting["lmodel-file"].size())
@ -322,6 +326,7 @@ bool Parameter::Validate()
<< static_cast<int>(m_setting["weight-l"].size())
<< " weights (weight-l)";
errorMsg << endl << "You might be giving '-lmodel-file TYPE FACTOR ORDER FILENAME' but you should be giving these four as a single argument, i.e. '-lmodel-file \"TYPE FACTOR ORDER FILENAME\"'";
errorMsg << endl << "You should also remember that each language model requires 2 weights, if and only if lmodel-oov-feature is on.";
UserMessage::Add(errorMsg.str());
noErrorFlag = false;
}

View File

@ -27,7 +27,7 @@ size_t PhraseBoundaryFeature::GetNumScoreComponents() const
return ScoreProducer::unlimited;
}
string PhraseBoundaryFeature::GetScoreProducerWeightShortName() const
string PhraseBoundaryFeature::GetScoreProducerWeightShortName(unsigned) const
{
return "pb";
}

View File

@ -34,7 +34,7 @@ public:
PhraseBoundaryFeature(const FactorList& sourceFactors, const FactorList& targetFactors);
size_t GetNumScoreComponents() const;
std::string GetScoreProducerWeightShortName() const;
std::string GetScoreProducerWeightShortName(unsigned) const;
size_t GetNumInputScores() const;
virtual const FFState* EmptyHypothesisState(const InputType &) const;

View File

@ -224,6 +224,14 @@ PhraseDictionaryFeature::~PhraseDictionaryFeature()
{}
std::string PhraseDictionaryFeature::GetScoreProducerWeightShortName(unsigned idx) const
{
if (idx < GetNumInputScores()){
return "I";
}else{
return "tm";
}
}
size_t PhraseDictionaryFeature::GetNumScoreComponents() const
{

View File

@ -113,9 +113,8 @@ public:
virtual bool ComputeValueInTranslationOption() const;
std::string GetScoreProducerWeightShortName() const {
return "tm";
}
std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
size_t GetNumScoreComponents() const;
size_t GetNumInputScores() const;

View File

@ -68,7 +68,7 @@ public:
const LMList& languageModels,
const WordPenaltyProducer* wpProducer);
std::string GetScoreProducerDescription() const {
std::string GetScoreProducerDescription(unsigned) const {
return "BerkeleyPt";
}

View File

@ -66,7 +66,7 @@ public:
}
virtual ~PhraseDictionarySCFG();
std::string GetScoreProducerDescription() const {
std::string GetScoreProducerDescription(unsigned) const {
return "Hieu's Reordering Model";
}

View File

@ -661,7 +661,7 @@ GetTargetCandidates(PrefixPtr p,
imp->ConvertTgtCand(tcands,rv,wa);
}
std::string PhraseDictionaryTree::GetScoreProducerDescription() const
std::string PhraseDictionaryTree::GetScoreProducerDescription(unsigned) const
{
return "PhraseDictionaryTree";
}

View File

@ -122,8 +122,8 @@ public:
// print target candidates for a given prefix pointer to a stream, mainly
// for debugging
void PrintTargetCandidates(PrefixPtr p,std::ostream& out) const;
std::string GetScoreProducerDescription() const;
std::string GetScoreProducerWeightShortName() const {
std::string GetScoreProducerDescription(unsigned) const;
std::string GetScoreProducerWeightShortName(unsigned) const {
return "tm";
}
};

View File

@ -105,11 +105,21 @@ size_t PhraseDictionaryTreeAdaptor::GetNumInputScores() const
return imp->GetNumInputScores();
}
std::string PhraseDictionaryTreeAdaptor::GetScoreProducerDescription() const
std::string PhraseDictionaryTreeAdaptor::GetScoreProducerDescription(unsigned idx) const{
if (idx < imp->GetNumInputScores()){
return "InputScore";
}else{
return "PhraseModel";
}
}
std::string PhraseDictionaryTreeAdaptor::GetScoreProducerWeightShortName(unsigned idx) const
{
return "PhraseModel";
if (idx < imp->GetNumInputScores()){
return "I";
}else{
return "tm";
}
}
}

View File

@ -60,10 +60,8 @@ public:
// this function can be only used for UNKNOWN source phrases
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase);
std::string GetScoreProducerDescription() const;
std::string GetScoreProducerWeightShortName() const {
return "tm";
}
std::string GetScoreProducerDescription(unsigned idx=0) const;
std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
size_t GetNumInputScores() const;
virtual void InitializeForInput(InputType const& source);

View File

@ -25,7 +25,7 @@ public:
// basic properties
size_t GetNumScoreComponents() const { return ScoreProducer::unlimited; }
std::string GetScoreProducerWeightShortName() const { return "pl"; }
std::string GetScoreProducerWeightShortName(unsigned) const { return "pl"; }
size_t GetNumInputScores() const { return 0; }
};

View File

@ -19,7 +19,7 @@ size_t PhrasePairFeature::GetNumScoreComponents() const
return ScoreProducer::unlimited;
}
string PhrasePairFeature::GetScoreProducerWeightShortName() const
string PhrasePairFeature::GetScoreProducerWeightShortName(unsigned) const
{
return "pp";
}

View File

@ -23,7 +23,7 @@ class PhrasePairFeature: public StatelessFeatureFunction {
size_t GetNumScoreComponents() const;
std::string GetScoreProducerWeightShortName() const;
std::string GetScoreProducerWeightShortName(unsigned) const;
size_t GetNumInputScores() const;

View File

@ -43,7 +43,7 @@ public:
const std::string& GetScoreProducerDescription() const {return m_description;}
//! returns the weight parameter name of this producer (used in n-best list)
virtual std::string GetScoreProducerWeightShortName() const = 0;
virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const = 0;
//! returns the number of scores gathered from the input (0 by default)
virtual size_t GetNumInputScores() const {

View File

@ -31,7 +31,7 @@ public:
// basic properties
size_t GetNumScoreComponents() const { return ScoreProducer::unlimited; }
std::string GetScoreProducerWeightShortName() const { return "swd"; }
std::string GetScoreProducerWeightShortName(unsigned) const { return "swd"; }
size_t GetNumInputScores() const { return 0; }
};

View File

@ -95,6 +95,7 @@ StaticData::StaticData()
,m_detailedTranslationReportingFilePath()
,m_onlyDistinctNBest(false)
,m_factorDelimiter("|") // default delimiter between factors
,m_lmEnableOOVFeature(false)
,m_isAlwaysCreateDirectTranslationOption(false)
{
@ -145,6 +146,10 @@ bool StaticData::LoadData(Parameter *parameter)
}
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
// factor delimiter
if (m_parameter->GetParam("factor-delimiter").size() > 0) {
m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@ -223,6 +228,7 @@ bool StaticData::LoadData(Parameter *parameter)
} else
m_outputSearchGraphPB = false;
#endif
SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", true );
// include feature names in the n-best list
SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );
@ -359,6 +365,8 @@ bool StaticData::LoadData(Parameter *parameter)
// unknown word processing
SetBooleanParameter( &m_dropUnknown, "drop-unknown", false );
SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false);
// minimum Bayes risk decoding
SetBooleanParameter( &m_mbr, "minimum-bayes-risk", false );
m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ?
@ -472,19 +480,19 @@ bool StaticData::LoadData(Parameter *parameter)
if (m_parameter->GetParam("report-sparse-features").size() > 0) {
for(size_t i=0; i<m_parameter->GetParam("report-sparse-features").size(); i++) {
const std::string &name = m_parameter->GetParam("report-sparse-features")[i];
if (m_targetBigramFeature && name.compare(m_targetBigramFeature->GetScoreProducerWeightShortName()) == 0)
if (m_targetBigramFeature && name.compare(m_targetBigramFeature->GetScoreProducerWeightShortName(0)) == 0)
m_targetBigramFeature->SetSparseFeatureReporting();
if (m_phrasePairFeature && name.compare(m_phrasePairFeature->GetScoreProducerWeightShortName()) == 0)
if (m_phrasePairFeature && name.compare(m_phrasePairFeature->GetScoreProducerWeightShortName(0)) == 0)
m_phrasePairFeature->SetSparseFeatureReporting();
if (m_phraseBoundaryFeature && name.compare(m_phraseBoundaryFeature->GetScoreProducerWeightShortName()) == 0)
if (m_phraseBoundaryFeature && name.compare(m_phraseBoundaryFeature->GetScoreProducerWeightShortName(0)) == 0)
m_phraseBoundaryFeature->SetSparseFeatureReporting();
if (m_phraseLengthFeature && name.compare(m_phraseLengthFeature->GetScoreProducerWeightShortName()) == 0)
if (m_phraseLengthFeature && name.compare(m_phraseLengthFeature->GetScoreProducerWeightShortName(0)) == 0)
m_phraseLengthFeature->SetSparseFeatureReporting();
if (m_targetWordInsertionFeature && name.compare(m_targetWordInsertionFeature->GetScoreProducerWeightShortName()) == 0)
if (m_targetWordInsertionFeature && name.compare(m_targetWordInsertionFeature->GetScoreProducerWeightShortName(0)) == 0)
m_targetWordInsertionFeature->SetSparseFeatureReporting();
if (m_sourceWordDeletionFeature && name.compare(m_sourceWordDeletionFeature->GetScoreProducerWeightShortName()) == 0)
if (m_sourceWordDeletionFeature && name.compare(m_sourceWordDeletionFeature->GetScoreProducerWeightShortName(0)) == 0)
m_sourceWordDeletionFeature->SetSparseFeatureReporting();
if (m_wordTranslationFeature && name.compare(m_wordTranslationFeature->GetScoreProducerWeightShortName()) == 0)
if (m_wordTranslationFeature && name.compare(m_wordTranslationFeature->GetScoreProducerWeightShortName(0)) == 0)
m_wordTranslationFeature->SetSparseFeatureReporting();
}
}

View File

@ -187,6 +187,7 @@ protected:
float m_lmbrMapWeight; //! Weight given to the map solution. See Kumar et al 09 for details
size_t m_lmcache_cleanup_threshold; //! number of translations after which LM claenup is performed (0=never, N=after N translations; default is 1)
bool m_lmEnableOOVFeature;
bool m_timeout; //! use timeout
size_t m_timeout_threshold; //! seconds after which time out is activated
@ -208,6 +209,7 @@ protected:
#ifdef HAVE_PROTOBUF
bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
#endif
bool m_unprunedSearchGraph; //! do not exclude dead ends (chart decoder only)
size_t m_cubePruningPopLimit;
size_t m_cubePruningDiversity;
@ -220,7 +222,7 @@ protected:
Word m_inputDefaultNonTerminal, m_outputDefaultNonTerminal;
SourceLabelOverlap m_sourceLabelOverlap;
UnknownLHSList m_unknownLHS;
WordAlignmentSort m_wordAlignmentSort;
StaticData();
@ -568,6 +570,10 @@ public:
return m_lmcache_cleanup_threshold;
}
bool GetLMEnableOOVFeature() const {
return m_lmEnableOOVFeature;
}
bool GetOutputSearchGraph() const {
return m_outputSearchGraph;
}
@ -582,6 +588,9 @@ public:
return m_outputSearchGraphPB;
}
#endif
bool GetUnprunedSearchGraph() const {
return m_unprunedSearchGraph;
}
XmlInputType GetXmlInputType() const {
return m_xmlInputType;
@ -627,7 +636,6 @@ public:
return 999999; /* TODO wtf! */
}
bool ContinuePartialTranslation() const {
return m_continuePartialTranslation;
}
@ -639,6 +647,9 @@ public:
}
void SetAllWeightsScoreComponentCollection(const ScoreComponentCollection &weightsScoreComponentCollection);
WordAlignmentSort GetWordAlignmentSort() const {
return m_wordAlignmentSort;
}
};
}

View File

@ -36,11 +36,11 @@ namespace Moses
return m_NumScoreComponents;
}
std::string SyntacticLanguageModel::GetScoreProducerDescription() const {
std::string SyntacticLanguageModel::GetScoreProducerDescription(unsigned) const {
return "Syntactic Language Model";
}
std::string SyntacticLanguageModel::GetScoreProducerWeightShortName() const {
std::string SyntacticLanguageModel::GetScoreProducerWeightShortName(unsigned) const {
return "slm";
}

Some files were not shown because too many files have changed in this diff Show More