Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Ian Johnson 2013-07-03 10:55:30 +01:00
commit e20fbc0754
123 changed files with 3041 additions and 1535 deletions

View File

@ -108,6 +108,10 @@ project : default-build
<link>static
;
#Apparently OS X likes to link against iconv for fgetsUTF8.
lib iconv ;
requirements += <os>MACOSX:<library>iconv ;
project : requirements
<threading>multi:<define>WITH_THREADS
<threading>multi:<library>boost_thread

View File

@ -1121,6 +1121,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/InputFeature.h</locationURI>
</link>
<link>
<name>FF/OSM-Feature</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>FF/PhraseBasedFeatureContext.cpp</name>
<type>1</type>
@ -1166,6 +1171,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhrasePairFeature.h</locationURI>
</link>
<link>
<name>FF/PhrasePenalty.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhrasePenalty.cpp</locationURI>
</link>
<link>
<name>FF/PhrasePenalty.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhrasePenalty.h</locationURI>
</link>
<link>
<name>FF/SourceWordDeletionFeature.cpp</name>
<type>1</type>
@ -1556,6 +1571,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/WordCoocTable.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.cpp</locationURI>
</link>
<link>
<name>TranslationModel/WordCoocTable.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.h</locationURI>
</link>
<link>
<name>TranslationModel/fuzzy-match</name>
<type>2</type>
@ -1581,6 +1606,26 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/lm.log</locationURI>
</link>
<link>
<name>FF/OSM-Feature/OpSequenceModel.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/OpSequenceModel.cpp</locationURI>
</link>
<link>
<name>FF/OSM-Feature/OpSequenceModel.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/OpSequenceModel.h</locationURI>
</link>
<link>
<name>FF/OSM-Feature/osmHyp.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/osmHyp.cpp</locationURI>
</link>
<link>
<name>FF/OSM-Feature/osmHyp.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/osmHyp.h</locationURI>
</link>
<link>
<name>TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp</name>
<type>1</type>

View File

@ -33,6 +33,8 @@ int main(int argc, char *argv[]) {
po::options_description options("Language model building options");
lm::builder::PipelineConfig pipeline;
std::string text, arpa;
options.add_options()
("order,o", po::value<std::size_t>(&pipeline.order)
#if BOOST_VERSION >= 104200
@ -47,18 +49,21 @@ int main(int argc, char *argv[]) {
("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout");
if (argc == 1) {
std::cerr <<
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
"Please cite:\n"
"@inproceedings{kenlm,\n"
"author = {Kenneth Heafield},\n"
"title = {{KenLM}: Faster and Smaller Language Model Queries},\n"
"booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
"month = {July}, year={2011},\n"
"address = {Edinburgh, UK},\n"
"publisher = {Association for Computational Linguistics},\n"
"@inproceedings{Heafield-estimate,\n"
" author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
" title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
" year = {2013},\n"
" month = {8},\n"
" booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
" address = {Sofia, Bulgaria},\n"
" url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
"}\n\n"
"Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
"the model (-o) is the only mandatory option. As this is an on-disk program,\n"
@ -91,9 +96,17 @@ int main(int argc, char *argv[]) {
initial.adder_out.block_count = 2;
pipeline.read_backoffs = initial.adder_out;
util::scoped_fd in(0), out(1);
if (vm.count("text")) {
in.reset(util::OpenReadOrThrow(text.c_str()));
}
if (vm.count("arpa")) {
out.reset(util::CreateOrThrow(arpa.c_str()));
}
// Read from stdin
try {
lm::builder::Pipeline(pipeline, 0, 1);
lm::builder::Pipeline(pipeline, in.release(), out.release());
} catch (const util::MallocException &e) {
std::cerr << e.what() << std::endl;
std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;

View File

@ -53,7 +53,7 @@ class NGram {
Payload &Value() { return *reinterpret_cast<Payload *>(end_); }
uint64_t &Count() { return Value().count; }
const uint64_t Count() const { return Value().count; }
uint64_t Count() const { return Value().count; }
std::size_t Order() const { return end_ - begin_; }

View File

@ -304,5 +304,26 @@ template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::DontBhiks
template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::ArrayBhiksha>, SortedVocabulary>;
} // namespace detail
// Autodetect the on-disk format of `file_name` and construct the matching
// concrete KenLM model behind the virtual base class.
// NOTE(review): RecognizeBinary presumably overwrites model_type from the
// binary file header when the file is binary (model_type is the caller's
// fallback for ARPA input) — confirm against RecognizeBinary's declaration.
base::Model *LoadVirtual(const char *file_name, const Config &config, ModelType model_type) {
RecognizeBinary(file_name, model_type);
// Dispatch on the detected type; each branch heap-allocates the concrete
// model, and the caller takes ownership of the returned pointer.
switch (model_type) {
case PROBING:
return new ProbingModel(file_name, config);
case REST_PROBING:
return new RestProbingModel(file_name, config);
case TRIE:
return new TrieModel(file_name, config);
case QUANT_TRIE:
return new QuantTrieModel(file_name, config);
case ARRAY_TRIE:
return new ArrayTrieModel(file_name, config);
case QUANT_ARRAY_TRIE:
return new QuantArrayTrieModel(file_name, config);
default:
// Unrecognized enum value: refuse rather than guess a format.
UTIL_THROW(FormatLoadException, "Confused by model type " << model_type);
}
}
} // namespace ngram
} // namespace lm

View File

@ -67,7 +67,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
/* Get the state for a context. Don't use this if you can avoid it. Use
* BeginSentenceState or EmptyContextState and extend from those. If
* BeginSentenceState or NullContextState and extend from those. If
* you're only going to use this state to call FullScore once, use
* FullScoreForgotState.
* To use this function, make an array of WordIndex containing the context
@ -153,6 +153,11 @@ LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<Separat
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model;
/* Autorecognize the file type, load, and return the virtual base class. Don't
* use the virtual base class if you can avoid it. Instead, use the above
* classes as template arguments to your own virtual feature function.*/
base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING);
} // namespace ngram
} // namespace lm

View File

@ -54,7 +54,7 @@ template <class Weights> class ActivateUnigram {
Weights *modify_;
};
// Find the lower order entry, inserting blanks along the way as necessary.
// Find the lower order entry, inserting blanks along the way as necessary.
template <class Value> void FindLower(
const std::vector<uint64_t> &keys,
typename Value::Weights &unigram,
@ -64,7 +64,7 @@ template <class Value> void FindLower(
typename Value::ProbingEntry entry;
// Backoff will always be 0.0. We'll get the probability and rest in another pass.
entry.value.backoff = kNoExtensionBackoff;
// Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
// Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
for (int lower = keys.size() - 2; ; --lower) {
if (lower == -1) {
between.push_back(&unigram);
@ -77,11 +77,11 @@ template <class Value> void FindLower(
}
}
// Between usually has single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here.
// Between usually has single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here.
template <class Added, class Build> void AdjustLower(
const Added &added,
const Build &build,
std::vector<typename Build::Value::Weights *> &between,
std::vector<typename Build::Value::Weights *> &between,
const unsigned int n,
const std::vector<WordIndex> &vocab_ids,
typename Build::Value::Weights *unigrams,
@ -93,14 +93,14 @@ template <class Added, class Build> void AdjustLower(
}
typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
float prob = -fabs(between.back()->prob);
// Order of the n-gram on which probabilities are based.
// Order of the n-gram on which probabilities are based.
unsigned char basis = n - between.size();
assert(basis != 0);
typename Build::Value::Weights **change = &between.back();
// Skip the basis.
--change;
if (basis == 1) {
// Hallucinate a bigram based on a unigram's backoff and a unigram probability.
// Hallucinate a bigram based on a unigram's backoff and a unigram probability.
float &backoff = unigrams[vocab_ids[1]].backoff;
SetExtension(backoff);
prob += backoff;
@ -128,14 +128,14 @@ template <class Added, class Build> void AdjustLower(
typename std::vector<typename Value::Weights *>::const_iterator i(between.begin());
build.MarkExtends(**i, added);
const typename Value::Weights *longer = *i;
// Everything has probability but is not marked as extending.
// Everything has probability but is not marked as extending.
for (++i; i != between.end(); ++i) {
build.MarkExtends(**i, *longer);
longer = *i;
}
}
// Continue marking lower entries even they know that they extend left. This is used for upper/lower bounds.
// Continue marking lower entries even they know that they extend left. This is used for upper/lower bounds.
template <class Build> void MarkLower(
const std::vector<uint64_t> &keys,
const Build &build,
@ -144,15 +144,15 @@ template <class Build> void MarkLower(
int start_order,
const typename Build::Value::Weights &longer) {
if (start_order == 0) return;
typename util::ProbingHashTable<typename Build::Value::ProbingEntry, util::IdentityHash>::MutableIterator iter;
// Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
// Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
for (int even_lower = start_order - 2 /* index in middle */; ; --even_lower) {
if (even_lower == -1) {
build.MarkExtends(unigram, longer);
return;
}
middle[even_lower].UnsafeMutableFind(keys[even_lower], iter);
if (!build.MarkExtends(iter->value, longer)) return;
if (!build.MarkExtends(
middle[even_lower].UnsafeMutableMustFind(keys[even_lower])->value,
longer)) return;
}
}
@ -168,7 +168,6 @@ template <class Build, class Activate, class Store> void ReadNGrams(
Store &store,
PositiveProbWarn &warn) {
typedef typename Build::Value Value;
typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
assert(n >= 2);
ReadNGramHeader(f, n);
@ -186,7 +185,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
for (unsigned int h = 1; h < n - 1; ++h) {
keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
}
// Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0.
// Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0.
util::SetSign(entry.value.prob);
entry.key = keys[n-2];
@ -203,7 +202,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
} // namespace
namespace detail {
template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
std::size_t allocated = Unigram::Size(counts[0]);
unigram_ = Unigram(start, counts[0], allocated);

View File

@ -71,7 +71,7 @@ template <class Value> class HashedSearch {
static const bool kDifferentRest = Value::kDifferentRest;
static const unsigned int kVersion = 0;
// TODO: move probing_multiplier here with next binary file format update.
// TODO: move probing_multiplier here with next binary file format update.
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
@ -102,14 +102,9 @@ template <class Value> class HashedSearch {
return ret;
}
#pragma GCC diagnostic ignored "-Wuninitialized"
MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const {
node = extend_pointer;
typename Middle::ConstIterator found;
bool got = middle_[extend_length - 2].Find(extend_pointer, found);
assert(got);
(void)got;
return MiddlePointer(found->value);
return MiddlePointer(middle_[extend_length - 2].MustFind(extend_pointer)->value);
}
MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const {
@ -126,14 +121,14 @@ template <class Value> class HashedSearch {
}
LongestPointer LookupLongest(WordIndex word, const Node &node) const {
// Sign bit is always on because longest n-grams do not extend left.
// Sign bit is always on because longest n-grams do not extend left.
typename Longest::ConstIterator found;
if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer();
return LongestPointer(found->value.prob);
}
// Generate a node without necessarily checking that it actually exists.
// Optionally return false if it's know to not exist.
// Generate a node without necessarily checking that it actually exists.
// Optionally return false if it's know to not exist.
bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const {
assert(begin != end);
node = static_cast<Node>(*begin);
@ -144,7 +139,7 @@ template <class Value> class HashedSearch {
}
private:
// Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
// Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
void DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn);
template <class Build> void ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build);
@ -153,7 +148,7 @@ template <class Value> class HashedSearch {
public:
Unigram() {}
Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
unigram_(static_cast<typename Value::Weights*>(start))
#ifdef DEBUG
, count_(count)

View File

@ -6,6 +6,7 @@
#include "util/string_piece.hh"
#include <string>
#include <string.h>
namespace lm {
namespace base {
@ -119,7 +120,9 @@ class Model {
size_t StateSize() const { return state_size_; }
const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); }
const void *NullContextMemory() const { return null_context_memory_; }
void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }
// Requires in_state != out_state
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;

View File

@ -65,7 +65,7 @@ int main(int argc, char **argv)
sourcePhrase.CreateFromString(Input, input, line, "||dummy_string||", NULL);
TargetPhraseVectorPtr decodedPhraseColl
= pdc.GetTargetPhraseCollectionRaw(sourcePhrase);
= pdc.GetTargetPhraseCollectionRaw(sourcePhrase);
if(decodedPhraseColl != NULL) {
if(reportCounts)

View File

@ -61,6 +61,9 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
Phrase* unksrc = new Phrase(1);
unksrc->AddWord() = sourceWord;
Word &newWord = unksrc->GetWord(0);
newWord.SetIsOOV(true);
m_unksrcs.push_back(unksrc);
//TranslationOption *transOpt;

View File

@ -34,16 +34,6 @@ DecodeFeature::DecodeFeature( const std::string& description
: StatelessFeatureFunction(description, line)
{
VERBOSE(2,"DecodeFeature:" << std::endl);
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
}
DecodeFeature::DecodeFeature( const std::string& description
@ -67,7 +57,7 @@ DecodeFeature::DecodeFeature(const std::string& description
VERBOSE(2,"DecodeFeature: input=" << m_inputFactors << " output=" << m_outputFactors << std::endl);
}
bool DecodeFeature::SetParameter(const std::string& key, const std::string& value)
void DecodeFeature::SetParameter(const std::string& key, const std::string& value)
{
if (key == "input-factor") {
m_input =Tokenize<FactorType>(value, ",");
@ -76,9 +66,8 @@ bool DecodeFeature::SetParameter(const std::string& key, const std::string& valu
m_output =Tokenize<FactorType>(value, ",");
m_outputFactors = FactorMask(m_output);
} else {
return StatelessFeatureFunction::SetParameter(key, value);
StatelessFeatureFunction::SetParameter(key, value);
}
return true;
}

View File

@ -61,7 +61,7 @@ public:
const std::vector<FactorType>& GetOutput() const;
bool IsUseable(const FactorMask &mask) const;
virtual bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
protected:
std::vector<FactorType> m_input;

View File

@ -4,6 +4,8 @@
#include "moses/WordsRange.h"
#include "moses/StaticData.h"
using namespace std;
namespace Moses
{
struct DistortionState_traditional : public FFState {
@ -19,6 +21,12 @@ struct DistortionState_traditional : public FFState {
}
};
DistortionScoreProducer::DistortionScoreProducer(const std::string &line)
: StatefulFeatureFunction("Distortion", 1, line)
{
ReadParameters();
}
const FFState* DistortionScoreProducer::EmptyHypothesisState(const InputType &input) const
{
// fake previous translated phrase start and end

View File

@ -18,10 +18,7 @@ class WordsRange;
class DistortionScoreProducer : public StatefulFeatureFunction
{
public:
DistortionScoreProducer(const std::string &line)
: StatefulFeatureFunction("Distortion", 1, line) {
CHECK(m_args.size() == 0);
}
DistortionScoreProducer(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;

View File

@ -49,17 +49,6 @@ void FeatureFunction::Initialize(const std::string& description, const std::stri
{
ParseLine(description, line);
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
if (m_description == "") {
size_t index = description_counts.count(description);
@ -91,29 +80,33 @@ void FeatureFunction::ParseLine(const std::string& description, const std::strin
pair<set<string>::iterator,bool> ret = keys.insert(args[0]);
UTIL_THROW_IF(!ret.second, util::Exception, "Duplicate key in line " << line);
m_args.push_back(args);
if (args[0] == "num-features") {
m_numScoreComponents = Scan<size_t>(args[1]);
} else if (args[0] == "name") {
m_description = args[1];
} else {
m_args.push_back(args);
}
}
}
bool FeatureFunction::SetParameter(const std::string& key, const std::string& value)
void FeatureFunction::SetParameter(const std::string& key, const std::string& value)
{
if (key == "num-features") {
m_numScoreComponents = Scan<size_t>(value);
} else if (key == "name") {
m_description = value;
} else if (key == "tuneable") {
if (key == "tuneable") {
m_tuneable = Scan<bool>(value);
} else {
return false;
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
}
return true;
}
void FeatureFunction::OverrideParameter(const std::string& key, const std::string& value)
void FeatureFunction::ReadParameters()
{
bool ret = SetParameter(key, value);
UTIL_THROW_IF(!ret, util::Exception, "Unknown argument" << key);
while (!m_args.empty()) {
const vector<string> &args = m_args[0];
SetParameter(args[0], args[1]);
m_args.erase(m_args.begin());
}
}
}

View File

@ -106,8 +106,8 @@ public:
, ScoreComponentCollection &scoreBreakdown) const {
}
virtual bool SetParameter(const std::string& key, const std::string& value);
virtual void OverrideParameter(const std::string& key, const std::string& value);
virtual void SetParameter(const std::string& key, const std::string& value);
virtual void ReadParameters();
};
}

View File

@ -13,18 +13,7 @@ GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
: StatelessFeatureFunction("GlobalLexicalModel",1, line)
{
std::cerr << "Creating global lexical model...\n";
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
// define bias word
FactorCollection &factorCollection = FactorCollection::Instance();
@ -34,7 +23,7 @@ GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
}
bool GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value)
void GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value)
{
if (key == "file") {
m_filePath = value;
@ -43,9 +32,8 @@ bool GlobalLexicalModel::SetParameter(const std::string& key, const std::string&
} else if (key == "outputFactors") {
m_outputFactorsVec = Tokenize<FactorType>(value,",");
} else {
return StatelessFeatureFunction::SetParameter(key, value);
StatelessFeatureFunction::SetParameter(key, value);
}
return true;
}
GlobalLexicalModel::~GlobalLexicalModel()

View File

@ -77,7 +77,7 @@ public:
ScoreComponentCollection* accumulator) const {
throw std::logic_error("GlobalLexicalModel not supported in chart decoder, yet");
}
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
};

View File

@ -0,0 +1,305 @@
#include <fstream>
#include "OpSequenceModel.h"
#include "osmHyp.h"
#include "util/check.hh"
#include "moses/Util.h"
using namespace std;
using namespace lm::ngram;
namespace Moses
{
// Operation Sequence Model stateful feature function with 5 score
// components.  Feature arguments from `line` are consumed via
// ReadParameters(), which routes each key/value to SetParameter().
OpSequenceModel::OpSequenceModel(const std::string &line)
:StatefulFeatureFunction("OpSequenceModel", 5, line )
{
ReadParameters();
}
// Load the operation-sequence language model and cache the score of the
// unknown-translation operation token ("_TRANS_SLF_").
// NOTE(review): the lmFile parameter is only used by the commented-out SRILM
// path; the live KenLM branch reads m_lmPath directly.  The `new Model` is
// never deleted here — presumably freed at process exit; confirm ownership.
void OpSequenceModel :: readLanguageModel(const char *lmFile)
{
string unkOp = "_TRANS_SLF_";
/*
// Code for SRILM
vector <int> numbers;
int nonWordFlag = 0;
ptrOp = new Api;
ptrOp -> read_lm(lmFile,lmOrder);
numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
*/
// Code to load KenLM
OSM = new Model(m_lmPath.c_str());
// Score the unknown operation from the null context so OOV operations get a
// consistent fallback probability in GetFutureScores().
State startState = OSM->NullContextState();
State endState;
unkOpProb = OSM->Score(startState,OSM->GetVocabulary().Index(unkOp),endState);
}
// Load-time initialization: reads the operation-sequence KenLM model.
// The commented-out block is a disabled future-cost table loader kept for
// reference (it populated m_futureCost from m_featurePath).
void OpSequenceModel::Load()
{
/*
// load future cost
//vector <string> input;
ifstream sr (m_featurePath.c_str());
char* tmp;
CHECK(sr.is_open());
vector<FactorType> factorOrder;
factorOrder.push_back(0);
string line;
while (std::getline(sr, line))
{
std::vector<std::string> tokens;
tokens = TokenizeMultiCharSeparator(line, "|||");
CHECK(tokens.size() == 3);
Phrase source, target;
source.CreateFromString(Input, factorOrder, tokens[0], "|", NULL);
target.CreateFromString(Output, factorOrder, tokens[1], "|", NULL);
ParallelPhrase pp(source, target);
Scores scores = Tokenize<float>(tokens[2], " ");
m_futureCost[pp] = scores;
// m_coll[pp] = scores;
}
*/
readLanguageModel(m_lmPath.c_str());
}
// Precompute an estimated-future-score contribution for a phrase pair at
// translation-option creation time.  Builds an osmHypothesis from a null LM
// context, the phrase-internal alignment, and string forms of both phrases,
// then folds the resulting 5 OSM scores into estimatedFutureScore.
// NOTE(review): scoreBreakdown is unused here — confirm that is intentional
// (only the future-score estimate is meant to be populated).
void OpSequenceModel:: Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{
osmHypothesis obj;
obj.setState(OSM->NullContextState());
// Empty coverage bitmap: scoring the phrase pair in isolation.
WordsBitmap myBitmap(source.GetSize());
vector <string> mySourcePhrase;
vector <string> myTargetPhrase;
vector<float> scores(5);
vector <int> alignments;
int startIndex = 0;
int endIndex = source.GetSize();
// Flatten the term alignment into (source, target) index pairs.
const AlignmentInfo &align = targetPhrase.GetAlignTerm();
AlignmentInfo::const_iterator iter;
for (iter = align.begin(); iter != align.end(); ++iter)
{
alignments.push_back(iter->first);
alignments.push_back(iter->second);
}
// OOV target words are mapped to the unknown-operation token.
for (int i = 0; i < targetPhrase.GetSize(); i++)
{
if (targetPhrase.GetWord(i).IsOOV())
myTargetPhrase.push_back("_TRANS_SLF_");
else
myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(0)->GetString().as_string());
}
for (int i = 0; i < source.GetSize(); i++)
{
mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
}
// Build cepts, compute the OSM operation sequence, and score it with KenLM.
obj.setPhrases(mySourcePhrase , myTargetPhrase);
obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize());
obj.computeOSMFeature(startIndex,myBitmap);
obj.calculateOSMProb(*OSM);
obj.populateScores(scores);
estimatedFutureScore.PlusEquals(this, scores);
}
// Stateful evaluation during phrase-based search: extends the predecessor
// OSM state over the newly translated phrase, adds the 5 OSM scores to the
// accumulator, and returns the new search state (heap-allocated; the decoder
// takes ownership).
FFState* OpSequenceModel::Evaluate(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
const WordsBitmap &bitmap = cur_hypo.GetWordsBitmap();
// Work on a copy so the hypothesis' own coverage bitmap is not mutated.
WordsBitmap myBitmap = bitmap;
const Manager &manager = cur_hypo.GetManager();
const InputType &source = manager.GetSource();
// NOTE(review): sourceSentence is never used below — dead local.
const Sentence &sourceSentence = static_cast<const Sentence&>(source);
osmHypothesis obj;
vector <string> mySourcePhrase;
vector <string> myTargetPhrase;
vector<float> scores(5);
//target.GetWord(0)
//cerr << target <<" --- "<<target.GetSourcePhrase()<< endl; // English ...
//cerr << align << endl; // Alignments ...
//cerr << cur_hypo.GetCurrSourceWordsRange() << endl;
//cerr << source <<endl;
// int a = sourceRange.GetStartPos();
// cerr << source.GetWord(a);
//cerr <<a<<endl;
//const Sentence &sentence = static_cast<const Sentence&>(curr_hypo.GetManager().GetSource());
const WordsRange & sourceRange = cur_hypo.GetCurrSourceWordsRange();
int startIndex = sourceRange.GetStartPos();
int endIndex = sourceRange.GetEndPos();
const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
// NOTE(review): statePtr is declared but never used — dead local.
osmState * statePtr;
// Flatten the term alignment into (source, target) index pairs.
vector <int> alignments;
AlignmentInfo::const_iterator iter;
for (iter = align.begin(); iter != align.end(); ++iter) {
//cerr << iter->first << "----" << iter->second << " ";
alignments.push_back(iter->first);
alignments.push_back(iter->second);
}
//cerr<<bitmap<<endl;
//cerr<<startIndex<<" "<<endIndex<<endl;
// Collect the source words of this phrase, clearing their coverage in the
// local bitmap so computeOSMFeature sees them as untranslated.
for (int i = startIndex; i <= endIndex; i++)
{
myBitmap.SetValue(i,0); // resetting coverage of this phrase ...
mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
// cerr<<mySourcePhrase[i]<<endl;
}
// OOV target words are mapped to the unknown-operation token.
for (int i = 0; i < target.GetSize(); i++)
{
if (target.GetWord(i).IsOOV())
myTargetPhrase.push_back("_TRANS_SLF_");
else
myTargetPhrase.push_back(target.GetWord(i).GetFactor(0)->GetString().as_string());
}
//cerr<<myBitmap<<endl;
// Seed from the predecessor state, build cepts for this phrase, compute the
// operation sequence and score it with the operation-sequence KenLM model.
obj.setState(prev_state);
obj.constructCepts(alignments,startIndex,endIndex,target.GetSize());
obj.setPhrases(mySourcePhrase , myTargetPhrase);
obj.computeOSMFeature(startIndex,myBitmap);
obj.calculateOSMProb(*OSM);
obj.populateScores(scores);
/*
if (bitmap.GetFirstGapPos() == NOT_FOUND)
{
int xx;
cerr<<bitmap<<endl;
int a = bitmap.GetFirstGapPos();
obj.print();
cin>>xx;
}
*/
/*
vector<float> scores(5);
scores[0] = 0.343423f;
scores[1] = 1.343423f;
scores[2] = 2.343423f;
scores[3] = 3.343423f;
scores[4] = 4.343423f;
*/
accumulator->PlusEquals(this, scores);
return obj.saveState();
//return statePtr;
// return NULL;
}
// Chart (hierarchical) decoding is not supported by the OSM feature; any
// call is a configuration error and terminates the process via abort().
FFState* OpSequenceModel::EvaluateChart(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const
{
abort();
}
// Initial search state: wraps the KenLM begin-of-sentence state in an
// osmState.  The returned object is heap-allocated; the decoder owns it.
const FFState* OpSequenceModel::EmptyHypothesisState(const InputType &input) const
{
// NOTE(review): unconditional trace output on every sentence — consider
// guarding with VERBOSE.
cerr << "OpSequenceModel::EmptyHypothesisState()" << endl;
State startState = OSM->BeginSentenceState();
return new osmState(startState);
}
// Short weight name used in moses.ini weight sections; the same name is
// shared by all 5 score components (idx is ignored).
std::string OpSequenceModel::GetScoreProducerWeightShortName(unsigned idx) const
{
return "osm";
}
std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const Phrase &target) const
{
ParallelPhrase pp(source, target);
std::map<ParallelPhrase, Scores>::const_iterator iter;
iter = m_futureCost.find(pp);
//iter = m_coll.find(pp);
if (iter == m_futureCost.end()) {
vector<float> scores(5, 0);
scores[0] = unkOpProb;
return scores;
}
else {
const vector<float> &scores = iter->second;
return scores;
}
}
// Consume one feature argument.  Recognised keys:
//   feature-path — future-cost table file (m_featurePath)
//   path         — operation-sequence KenLM model file (m_lmPath)
//   order        — language model order (lmOrder)
// Unrecognised keys are delegated to the base class.
void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "feature-path") {
    m_featurePath = value;
    return;
  }
  if (key == "path") {
    m_lmPath = value;
    return;
  }
  if (key == "order") {
    lmOrder = Scan<int>(value);
    return;
  }
  StatefulFeatureFunction::SetParameter(key, value);
}
} // namespace

View File

@ -0,0 +1,69 @@
#pragma once
#include <string>
#include <map>
#include <vector>
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/Manager.h"
#include "moses/FF/OSM-Feature/osmHyp.h"
#include "lm/model.hh"
namespace Moses
{
// Operation Sequence Model (OSM) feature: a stateful feature function that
// scores the sequence of translation/reordering operations of a hypothesis
// with a KenLM model over operation tokens (5 score components).
class OpSequenceModel : public StatefulFeatureFunction
{
public:
// Operation-sequence KenLM model, allocated in readLanguageModel().
lm::ngram::Model * OSM;
// Order of the operation-sequence LM (set via the "order" argument).
int lmOrder;
// Cached LM score of the unknown-operation token "_TRANS_SLF_".
float unkOpProb;
OpSequenceModel(const std::string &line);
void readLanguageModel(const char *);
void Load();
// Stateful evaluation during search; returns a new heap-allocated osmState.
FFState* Evaluate(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
// Phrase-pair pre-scoring; contributes to estimatedFutureScore only.
void Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
// Not supported: aborts if called (OSM is phrase-based only).
virtual FFState* EvaluateChart(
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection* accumulator) const;
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
// Looks up m_futureCost; falls back to unkOpProb for unseen pairs.
std::vector<float> GetFutureScores(const Phrase &source, const Phrase &target) const;
void SetParameter(const std::string& key, const std::string& value);
// OSM works on surface factor 0 only, so any factor mask is usable.
bool IsUseable(const FactorMask &mask) const
{ return true; }
protected:
typedef std::pair<Phrase, Phrase> ParallelPhrase;
typedef std::vector<float> Scores;
// Pre-computed future-cost table keyed by (source, target) phrase pair.
std::map<ParallelPhrase, Scores> m_futureCost;
// Cept structure of the current phrase: (source positions, target positions).
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
// Target positions aligned to nothing (deletion operations).
std::set <int> targetNullWords;
// Paths from the "feature-path" and "path" arguments respectively.
std::string m_featurePath, m_lmPath;
};
} // namespace

View File

@ -0,0 +1,650 @@
#include "osmHyp.h"
#include <sstream>
using namespace std;
using namespace lm::ngram;
namespace Moses
{
// Construct a search state wrapping a KenLM state, with coverage position
// (j) and translated-word count (E) starting at zero.  Uses the member
// initializer list instead of assignment in the body.
osmState::osmState(const State & val)
  :j(0)
  ,E(0)
  ,lmState(val)
{
}
// Record the coverage position (jVal), translated-source-word count (eVal)
// and the open-gap map for this state.  Map assignment replaces the previous
// contents, so the redundant gap.clear() the original did first is dropped.
void osmState::saveState(int jVal, int eVal, map <int , string> & gapVal)
{
  gap = gapVal;
  j = jVal;
  E = eVal;
}
int osmState::Compare(const FFState& otherBase) const
{
const osmState &other = static_cast<const osmState&>(otherBase);
if (j != other.j)
return (j < other.j) ? -1 : +1;
if (E != other.E)
return (E < other.E) ? -1 : +1;
if (gap != other.gap)
return (gap < other.gap) ? -1 : +1;
if (lmState.length < other.lmState.length) return -1;
if (lmState.length > other.lmState.length) return 1;
return 0;
}
// Fixed diagnostic label for this state type.
std::string osmState :: getName() const
{
  static const std::string name("done");
  return name;
}
//////////////////////////////////////////////////
// Default-construct an empty OSM hypothesis: all counters and positions zeroed,
// no gap history. (Original body assigned `gapCount = 0` twice; the duplicate
// is dropped and the assignments are moved to the initializer list. `gap` is
// default-constructed empty, so the explicit clear() was redundant.)
// NOTE: initializer order follows the member declaration order in osmHyp.h.
osmHypothesis :: osmHypothesis()
  : j(0)
  , E(0)
  , gapCount(0)
  , deletionCount(0)
  , openGapCount(0)
  , gapWidth(0)
  , opProb(0)
{
}
void osmHypothesis :: setState(const FFState* prev_state)
{
if(prev_state != NULL)
{
j = static_cast <const osmState *> (prev_state)->getJ();
E = static_cast <const osmState *> (prev_state)->getE();
gap = static_cast <const osmState *> (prev_state)->getGap();
lmState = static_cast <const osmState *> (prev_state)->getLMState();
}
}
// Package the current hypothesis bookkeeping into a freshly allocated osmState.
// Ownership of the returned pointer passes to the caller (Moses FFState API).
osmState * osmHypothesis :: saveState()
{
  osmState * snapshot = new osmState(lmState);
  snapshot->saveState(j, E, gap);
  return snapshot;
}
// Return 1 if operations[x] is a translation operation (_TRANS_/_INS_/_DEL_),
// 0 if it is a reordering/bookkeeping operation (jump, gap insertion, cept
// continuation).
// Fix: std::string::find returns std::string::size_type; comparing it against
// the literal -1 relied on an implicit signed->unsigned conversion happening
// to equal npos. Compare against std::string::npos explicitly.
int osmHypothesis :: isTranslationOperation(int x)
{
  if (operations[x].find("_JMP_BCK_") != std::string::npos)
    return 0;

  if (operations[x].find("_JMP_FWD_") != std::string::npos)
    return 0;

  if (operations[x].find("_CONT_CEPT_") != std::string::npos)
    return 0;

  if (operations[x].find("_INS_GAP_") != std::string::npos)
    return 0;

  return 1;
}
// Strip all reordering/bookkeeping operations from `operations`, keeping only
// translation operations, and reset the reordering counters.
// Fixes: loop index was a signed int compared against the unsigned
// operations.size(); the filtered sequence is now swapped in rather than
// cleared-and-copied.
void osmHypothesis :: removeReorderingOperations()
{
  gapCount = 0;
  deletionCount = 0;
  openGapCount = 0;
  gapWidth = 0;

  std::vector <std::string> tupleSequence;
  for (size_t x = 0; x < operations.size(); x++) {
    if (isTranslationOperation(static_cast<int>(x)) == 1) {
      tupleSequence.push_back(operations[x]);
    }
  }

  operations.swap(tupleSequence);
}
// Score the accumulated operation sequence with the KenLM operation model.
// Starts from the LM state carried over from the previous hypothesis, feeds
// each operation token through the model (threading the state via temp ->
// currState), sums the log-probabilities into opProb, and stores the final
// LM state back into lmState for the next hypothesis extension.
void osmHypothesis :: calculateOSMProb(Model & ptrOp)
{
opProb = 0;
State currState = lmState;
State temp;
for (int i = 0; i<operations.size(); i++)
{
// Score(in_state, word_index, out_state): temp holds the state before this
// token; currState receives the state after it.
temp = currState;
opProb += ptrOp.Score(temp,ptrOp.GetVocabulary().Index(operations[i]),currState);
}
lmState = currState;
//print();
}
// Return the index of the first uncovered (value 0) source position in the
// coverage vector, or -1 when every position is covered.
// Fix: loop index was a signed int compared against the unsigned size();
// early return replaces the redundant firstOG bookkeeping.
int osmHypothesis :: firstOpenGap(vector <int> & coverageVector)
{
  for (size_t nd = 0; nd < coverageVector.size(); nd++) {
    if (coverageVector[nd] == 0)
      return static_cast<int>(nd);
  }
  return -1;
}
// Render an integer as its decimal string form (C++03-compatible; the
// codebase predates std::to_string).
string osmHypothesis :: intToString(int num)
{
  ostringstream buffer;
  buffer << num;
  return buffer.str();
}
// Emit the operation sequence needed to generate the source word at position
// j1 (sentence-absolute index), updating the operation list, coverage vector,
// gap history and counters as a side effect. `contFlag` selects the operation
// kind: 0 = first word of a (possibly multi-word) cept, 1 = continuation of a
// cept, 2 = deletion/insertion of an unaligned source word. `english`/`german`
// are the target/source surface strings of the current cept. May recurse (via
// the tail call at the bottom) to handle unaligned source words that follow j1.
// NOTE(review): statement order here mirrors the original OSM paper's
// operation semantics and is order-sensitive — code left byte-identical.
void osmHypothesis :: generateOperations(int & startIndex , int j1 , int contFlag , WordsBitmap & coverageVector , string english , string german , set <int> & targetNullWords , vector <string> & currF)
{
int gFlag = 0;
int gp = 0;
int ans;
if ( j < j1) // j1 is the index of the source word we are about to generate ...
{
//if(coverageVector[j]==0) // if source word at j is not generated yet ...
if(coverageVector.GetValue(j)==0) // if source word at j is not generated yet ...
{
// Leaving an ungenerated word behind: open a gap at j before moving on.
operations.push_back("_INS_GAP_");
gFlag++;
gap[j]="Unfilled";
}
if (j == E)
{
// Already at the frontier: just advance the position marker.
j = j1;
}
else
{
// Jump forward to the rightmost generated position first.
operations.push_back("_JMP_FWD_");
j=E;
}
}
if (j1 < j)
{
// Generating to the left of the current position: jump backward.
// if(j < E && coverageVector[j]==0)
if(j < E && coverageVector.GetValue(j)==0)
{
operations.push_back("_INS_GAP_");
gFlag++;
gap[j]="Unfilled";
}
// Land on the open gap closest to j1; gp receives its 1-based rank from the right.
j=closestGap(gap,j1,gp);
operations.push_back("_JMP_BCK_"+ intToString(gp));
//cout<<"I am j "<<j<<endl;
//cout<<"I am j1 "<<j1<<endl;
if(j==j1)
gap[j]="Filled";
}
if (j < j1)
{
// Jumped back into a gap that starts before j1: open a gap and advance.
operations.push_back("_INS_GAP_");
gap[j] = "Unfilled";
gFlag++;
j=j1;
}
if(contFlag == 0) // First words of the multi-word cept ...
{
if(english == "_TRANS_SLF_") // Unknown word ...
{
operations.push_back("_TRANS_SLF_");
}
else
{
operations.push_back("_TRANS_" + english + "_TO_" + german);
}
//ans = firstOpenGap(coverageVector);
ans = coverageVector.GetFirstGapPos();
if (ans != -1)
gapWidth += j - ans;
}
else if (contFlag == 2)
{
// Unaligned source word: emit an insertion operation and count the deletion.
operations.push_back("_INS_" + german);
ans = coverageVector.GetFirstGapPos();
if (ans != -1)
gapWidth += j - ans;
deletionCount++;
}
else
{
// Continuation of a multi-word cept.
operations.push_back("_CONT_CEPT_");
}
//coverageVector[j]=1;
coverageVector.SetValue(j,1);
j+=1;
if(E<j)
E=j;
if (gFlag > 0)
gapCount++;
openGapCount += getOpenGaps();
// If the next source position is unaligned, generate it immediately (recursive
// tail handles runs of unaligned words).
//if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end())
if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end())
{
j1 = j;
german = currF[j1-startIndex];
english = "_INS_";
generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF);
}
}
// Debug dump: the operation sequence followed by the five feature counters,
// written to stderr.
void osmHypothesis :: print()
{
  for (size_t idx = 0; idx < operations.size(); idx++) {
    cerr << operations[idx] << " ";
  }
  cerr << endl << endl;

  cerr << "Operation Probability " << opProb << endl;
  cerr << "Gap Count " << gapCount << endl;
  cerr << "Open Gap Count " << openGapCount << endl;
  cerr << "Gap Width " << gapWidth << endl;
  cerr << "Deletion Count " << deletionCount << endl;
  cerr << "_______________" << endl;
}
// Find the open ("Unfilled") gap closest to (and at or before) position j1,
// scanning the gap map from highest position to lowest. On return, gp holds
// the 1-based rank of the chosen gap counted from the right (used to build the
// _JMP_BCK_<gp> operation). Returns the gap's position, j1 itself if j1 is an
// open gap, or -1 if no suitable gap exists.
// Fix: the original did `iter = gap.end(); iter--;` unconditionally, which is
// undefined behaviour when the map is empty — guard added. (The map is passed
// by value to match the declaration in osmHyp.h.)
int osmHypothesis :: closestGap(map <int,string> gap, int j1, int & gp)
{
  int dist = 1172;   // sentinel "infinity" for the distance search
  int value = -1;
  int temp = 0;
  gp = 0;
  int opGap = 0;

  if (gap.empty())   // decrementing end() of an empty map is UB
    return value;

  map <int,string> :: iterator iter = gap.end();
  do {
    iter--;
    //cout<<"Trapped "<<iter->first<<endl;
    if (iter->first == j1 && iter->second == "Unfilled") {
      // Exact match: j1 itself is an open gap.
      opGap++;
      gp = opGap;
      return j1;
    }

    if (iter->second == "Unfilled") {
      opGap++;
      temp = iter->first - j1;
      if (temp < 0)
        temp = temp * -1;
      // Only gaps strictly before j1 are candidates.
      if (dist > temp && iter->first < j1) {
        dist = temp;
        value = iter->first;
        gp = opGap;
      }
    }
  } while (iter != gap.begin());

  return value;
}
int osmHypothesis :: getOpenGaps()
{
map <int,string> :: iterator iter;
int nd = 0;
for (iter = gap.begin(); iter!=gap.end(); iter++)
{
if(iter->second == "Unfilled")
nd++;
}
return nd;
}
// Emit a _DEL_ operation for the unaligned target word `english` at
// currTargetIndex, then recurse to delete any run of following unaligned
// target words. Positions already consumed by a cept (doneTargetIndexes) are
// skipped. Reads the member sets sourceNullWords and currE.
void osmHypothesis :: generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes)
{
operations.push_back("_DEL_" + english);
currTargetIndex++;
// Skip target positions that were already generated as part of a cept.
while(doneTargetIndexes.find(currTargetIndex) != doneTargetIndexes.end())
{
currTargetIndex++;
}
// If the next position is also unaligned, delete it too.
if (sourceNullWords.find(currTargetIndex) != sourceNullWords.end())
{
english = currE[currTargetIndex];
generateDeleteOperations(english,currTargetIndex,doneTargetIndexes);
}
}
// Drive the operation-sequence generation for one phrase pair: walks the cepts
// extracted by constructCepts() in target order, emitting translate/jump/gap
// operations via generateOperations() and _DEL_ operations for unaligned
// target words via generateDeleteOperations(). `startIndex` is the absolute
// source position of the phrase; `coverageVector` is the sentence coverage
// bitmap updated in place. Relies on members ceptsInPhrase, targetNullWords,
// sourceNullWords, currE and currF set up beforehand.
void osmHypothesis :: computeOSMFeature(int startIndex , WordsBitmap & coverageVector)
{
set <int> doneTargetIndexes;
set <int> eSide;
set <int> fSide;
set <int> :: iterator iter;
string english;
string source;
int j1;
int start = 0;
int targetIndex = 0;
doneTargetIndexes.clear();
if (targetNullWords.size() != 0) // Source words to be deleted in the start of this phrase ...
{
iter = targetNullWords.begin();
if (*iter == startIndex)
{
j1 = startIndex;
source = currF[j1-startIndex];
english = "_INS_";
generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF);
}
}
if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) // first word has to be deleted ...
{
english = currE[targetIndex];
generateDeleteOperations(english,targetIndex, doneTargetIndexes);
}
// One iteration per cept, in the order the cepts were extracted.
for (int i = 0; i < ceptsInPhrase.size(); i++)
{
source = "";
english = "";
fSide = ceptsInPhrase[i].first;
eSide = ceptsInPhrase[i].second;
// Join the target words of the cept with "^_^"; positions that are not
// contiguous with the first one are recorded in doneTargetIndexes so the
// unaligned-word scan below skips them.
iter = eSide.begin();
targetIndex = *iter;
english += currE[*iter];
iter++;
for (; iter != eSide.end(); iter++)
{
if(*iter == targetIndex+1)
targetIndex++;
else
doneTargetIndexes.insert(*iter);
english += "^_^";
english += currE[*iter];
}
// Join the source words of the cept with "^_^".
iter = fSide.begin();
source += currF[*iter];
iter++;
for (; iter != fSide.end(); iter++)
{
source += "^_^";
source += currF[*iter];
}
// First source word of the cept (contFlag 0), then its continuations (contFlag 1).
iter = fSide.begin();
j1 = *iter + startIndex;
iter++;
generateOperations(startIndex, j1, 0 , coverageVector , english , source , targetNullWords , currF);
for (; iter != fSide.end(); iter++)
{
j1 = *iter + startIndex;
generateOperations(startIndex, j1, 1 , coverageVector , english , source , targetNullWords , currF);
}
targetIndex++; // Check whether the next target word is unaligned ...
while(doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end())
{
targetIndex++;
}
if(sourceNullWords.find(targetIndex) != sourceNullWords.end())
{
english = currE[targetIndex];
generateDeleteOperations(english,targetIndex, doneTargetIndexes);
}
}
//removeReorderingOperations();
//print();
}
// Grow (eSide, fSide) to the transitive closure of the alignment links: add
// every source word aligned to a word in eSide, then every target word aligned
// to a word in fSide, and recurse until eSide stops growing. tS maps target
// index -> aligned source indices; sT maps source index -> aligned target
// indices. Note: operator[] on tS/sT may insert empty entries for queried
// keys, matching the original behaviour that computeOSMFeature relies on.
// Fix: the growth check compared the unsigned eSide.size() against a signed
// int snapshot; the snapshot is now a size_t (inner loop indices likewise).
void osmHypothesis :: getMeCepts ( set <int> & eSide , set <int> & fSide , map <int , vector <int> > & tS , map <int , vector <int> > & sT)
{
  set <int> :: iterator iter;
  size_t sz = eSide.size();   // size before this expansion round
  vector <int> t;

  for (iter = eSide.begin(); iter != eSide.end(); iter++) {
    t = tS[*iter];
    for (size_t i = 0; i < t.size(); i++) {
      fSide.insert(t[i]);
    }
  }

  for (iter = fSide.begin(); iter != fSide.end(); iter++) {
    t = sT[*iter];
    for (size_t i = 0; i < t.size(); i++) {
      eSide.insert(t[i]);
    }
  }

  if (eSide.size() > sz) {
    // New target words were pulled in; keep closing over the alignment links.
    getMeCepts(eSide, fSide, tS, sT);
  }
}
// Partition the word alignment of one phrase pair into minimal translation
// units (cepts). `align` is a flat [src,tgt,src,tgt,...] list of phrase-local
// alignment points; startIndex/endIndex are the absolute source span of the
// phrase; targetPhraseLength is the number of target words. Side effects:
// fills ceptsInPhrase with (source-set, target-set) pairs, targetNullWords
// with absolute positions of unaligned source words, and sourceNullWords with
// phrase-local positions of unaligned target words.
void osmHypothesis :: constructCepts(vector <int> & align , int startIndex , int endIndex, int targetPhraseLength)
{
std::map <int , vector <int> > sT;
std::map <int , vector <int> > tS;
std::set <int> eSide;
std::set <int> fSide;
std::set <int> :: iterator iter;
std :: map <int , vector <int> > :: iterator iter2;
std :: pair < set <int> , set <int> > cept;
int src;
int tgt;
// Build the two alignment adjacency maps from the flat pair list.
for (int i = 0; i < align.size(); i+=2)
{
src = align[i];
tgt = align[i+1];
tS[tgt].push_back(src);
sT[src].push_back(tgt);
}
for (int i = startIndex; i<= endIndex; i++) // What are unaligned source words in this phrase ...
{
if (sT.find(i-startIndex) == sT.end())
{
targetNullWords.insert(i);
}
}
for (int i = 0; i < targetPhraseLength; i++) // What are unaligned target words in this phrase ...
{
if (tS.find(i) == tS.end())
{
sourceNullWords.insert(i);
}
}
// Repeatedly seed a cept from the lowest remaining target word, close it over
// the alignment links (getMeCepts), and remove its words from both maps.
while (tS.size() != 0 && sT.size() != 0)
{
iter2 = tS.begin();
eSide.clear();
fSide.clear();
eSide.insert (iter2->first);
getMeCepts(eSide, fSide, tS , sT);
for (iter = eSide.begin(); iter != eSide.end(); iter++)
{
iter2 = tS.find(*iter);
tS.erase(iter2);
}
for (iter = fSide.begin(); iter != fSide.end(); iter++)
{
iter2 = sT.find(*iter);
sT.erase(iter2);
}
cept = make_pair (fSide , eSide);
ceptsInPhrase.push_back(cept);
}
/*
cerr<<"Extracted Cepts "<<endl;
for (int i = 0; i < ceptsInPhrase.size(); i++)
{
fSide = ceptsInPhrase[i].first;
eSide = ceptsInPhrase[i].second;
for (iter = eSide.begin(); iter != eSide.end(); iter++)
{
cerr<<*iter<<" ";
}
cerr<<"<---> ";
for (iter = fSide.begin(); iter != fSide.end(); iter++)
{
cerr<<*iter<<" ";
}
cerr<<endl;
}
cerr<<endl;
cerr<<"Unaligned Target Words"<<endl;
for (iter = sourceNullWords.begin(); iter != sourceNullWords.end(); iter++)
cerr<<*iter<<"<--->"<<endl;
cerr<<"Unaligned Source Words"<<endl;
for (iter = targetNullWords.begin(); iter != targetNullWords.end(); iter++)
cerr<<*iter<<"<--->"<<endl;
*/
}
// Export the five OSM feature values (operation LM score, gap width, gap
// count, open gap count, deletion count) in their fixed order.
void osmHypothesis :: populateScores(vector <float> & scores)
{
  const float featureValues[] = {
    static_cast<float>(opProb),
    static_cast<float>(gapWidth),
    static_cast<float>(gapCount),
    static_cast<float>(openGapCount),
    static_cast<float>(deletionCount)
  };
  scores.assign(featureValues, featureValues + 5);
}
} // namespace

View File

@ -0,0 +1,89 @@
#pragma once
# include "moses/FF/FFState.h"
# include "moses/Manager.h"
#include "lm/model.hh"
# include <set>
# include <map>
# include <string>
# include <vector>
namespace Moses
{
// Recombination state for the Operation Sequence Model feature: the decoder
// positions (j, E), the gap history, and the KenLM state of the operation LM.
class osmState : public FFState
{
public:
osmState(const lm::ngram::State & val);
int Compare(const FFState& other) const;
// Store the decoder bookkeeping into this state.
void saveState(int jVal, int eVal, std::map <int , std::string> & gapVal);
int getJ()const {return j;}
int getE()const {return E;}
std::map <int , std::string> getGap() const { return gap;}
lm::ngram::State getLMState() const {return lmState;}
void print() const;
std::string getName() const;
protected:
int j, E;                           // position after last / rightmost generated source word
std::map <int,std::string> gap;     // gap position -> "Filled"/"Unfilled"
lm::ngram::State lmState;           // operation-LM state
};
// Working object that converts one phrase-pair extension into an OSM operation
// sequence and scores it; not a decoder hypothesis itself, but the scratch
// state used while evaluating one.
class osmHypothesis
{
private:
std::vector <std::string> operations; // List of operations required to generated this hyp ...
std::map <int,std::string> gap; // Maintains gap history ...
int j; // Position after the last source word generated ...
int E; // Position after the right most source word so far generated ...
lm::ngram::State lmState; // KenLM's Model State ...
int gapCount; // Number of gaps inserted ...
int deletionCount;
int openGapCount;
int gapWidth;
double opProb;
std::vector <std::string> currE;  // target words of the current phrase
std::vector <std::string> currF;  // source words of the current phrase
// One (source-index-set, target-index-set) pair per extracted cept.
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
std::set <int> targetNullWords;   // unaligned source positions (absolute)
std::set <int> sourceNullWords;   // unaligned target positions (phrase-local)
// Nearest open gap at or before j1; gp gets its rank for _JMP_BCK_<gp>.
int closestGap(std::map <int,std::string> gap,int j1, int & gp);
int firstOpenGap(std::vector <int> & coverageVector);
std::string intToString(int);
int getOpenGaps();
int isTranslationOperation(int j);
void removeReorderingOperations();
// Transitive closure of alignment links to grow a cept.
void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT);
public:
osmHypothesis();
~osmHypothesis(){};
// Emit the operations that generate the source word at j1 (see osmHyp.cpp).
void generateOperations(int & startIndex, int j1 , int contFlag , WordsBitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF);
void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes);
// Score the accumulated operation sequence with the KenLM operation model.
void calculateOSMProb(lm::ngram::Model & ptrOp);
void computeOSMFeature(int startIndex , WordsBitmap & coverageVector);
// Partition the phrase-pair alignment into cepts and null-word sets.
void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength);
void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2){currF = val1; currE = val2;}
void setState(const FFState* prev_state);
osmState * saveState();
void print();
void populateScores(std::vector <float> & scores);
void setState(const lm::ngram::State & val){lmState = val;}
};
} // namespace

View File

@ -19,31 +19,18 @@ PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line)
: StatefulFeatureFunction("PhraseBoundaryFeature", 0, line)
{
std::cerr << "Initializing source word deletion feature.." << std::endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
}
bool PhraseBoundaryFeature::SetParameter(const std::string& key, const std::string& value)
void PhraseBoundaryFeature::SetParameter(const std::string& key, const std::string& value)
{
if (key == "source") {
m_sourceFactors = Tokenize<FactorType>(value, ",");
} else if (key == "target") {
m_targetFactors = Tokenize<FactorType>(value, ",");
} else {
return StatefulFeatureFunction::SetParameter(key, value);
StatefulFeatureFunction::SetParameter(key, value);
}
return true;
}
const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const

View File

@ -52,7 +52,7 @@ public:
ScoreComponentCollection* ) const {
throw std::logic_error("PhraseBoundaryState not supported in chart decoder, yet");
}
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
private:
void AddFeatures(

View File

@ -12,7 +12,7 @@ using namespace std;
PhraseLengthFeature::PhraseLengthFeature(const std::string &line)
:StatelessFeatureFunction("PhraseLengthFeature", 0, line)
{
CHECK(m_args.size() == 0);
ReadParameters();
}
void PhraseLengthFeature::Evaluate(const Phrase &source

View File

@ -17,18 +17,7 @@ PhrasePairFeature::PhrasePairFeature(const std::string &line)
:StatelessFeatureFunction("PhrasePairFeature", 0, line)
{
std::cerr << "Initializing PhrasePairFeature.." << std::endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
if (m_simple == 1) std::cerr << "using simple phrase pairs.. ";
if (m_sourceContext == 1) std::cerr << "using source context.. ";
@ -43,7 +32,7 @@ PhrasePairFeature::PhrasePairFeature(const std::string &line)
}
}
bool PhrasePairFeature::SetParameter(const std::string& key, const std::string& value)
void PhrasePairFeature::SetParameter(const std::string& key, const std::string& value)
{
if (key == "input-factor") {
m_sourceFactorId = Scan<FactorType>(value);
@ -62,10 +51,8 @@ bool PhrasePairFeature::SetParameter(const std::string& key, const std::string&
} else if (key == "ignore-punctuation") {
m_filePathSource = value;
} else {
return StatelessFeatureFunction::SetParameter(key, value);
StatelessFeatureFunction::SetParameter(key, value);
}
return true;
}
void PhrasePairFeature::Load()

View File

@ -46,7 +46,7 @@ public:
}
void Load();
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
};

View File

@ -0,0 +1,22 @@
#include "PhrasePenalty.h"
#include "moses/ScoreComponentCollection.h"
namespace Moses
{
// Construct the phrase-penalty feature (one dense score) from its moses.ini
// line; ReadParameters() consumes any key=value arguments on the line.
PhrasePenalty::PhrasePenalty(const std::string &line)
: StatelessFeatureFunction("PhrasePenalty",1, line)
{
ReadParameters();
}
// Assign the constant penalty of 1 to every phrase pair; the weight attached
// to this feature turns it into the classic phrase-count penalty.
void PhrasePenalty::Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{
scoreBreakdown.Assign(this, 1.0f);
}
} // namespace

24
moses/FF/PhrasePenalty.h Normal file
View File

@ -0,0 +1,24 @@
#pragma once
#include "StatelessFeatureFunction.h"
namespace Moses
{
// Stateless feature that fires the constant value 1 per phrase pair,
// implementing the standard phrase-count penalty.
class PhrasePenalty : public StatelessFeatureFunction
{
public:
PhrasePenalty(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;
}
virtual void Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const;
};
} //namespace

View File

@ -22,30 +22,18 @@ SourceWordDeletionFeature::SourceWordDeletionFeature(const std::string &line)
m_unrestricted(true)
{
std::cerr << "Initializing source word deletion feature.." << std::endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
}
bool SourceWordDeletionFeature::SetParameter(const std::string& key, const std::string& value)
void SourceWordDeletionFeature::SetParameter(const std::string& key, const std::string& value)
{
if (key == "factor") {
m_factorType = Scan<FactorType>(value);
} else if (key == "path") {
m_filename = value;
} else {
return StatelessFeatureFunction::SetParameter(key, value);
StatelessFeatureFunction::SetParameter(key, value);
}
return true;
}
void SourceWordDeletionFeature::Load()

View File

@ -37,7 +37,7 @@ public:
const TargetPhrase& targetPhrase,
ScoreComponentCollection* accumulator,
const AlignmentInfo &alignmentInfo) const;
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
};

View File

@ -21,18 +21,7 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line)
:StatefulFeatureFunction("TargetBigramFeature", 0, line)
{
std::cerr << "Initializing target bigram feature.." << std::endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
FactorCollection& factorCollection = FactorCollection::Instance();
const Factor* bosFactor =
@ -41,7 +30,7 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line)
}
bool TargetBigramFeature::SetParameter(const std::string& key, const std::string& value)
void TargetBigramFeature::SetParameter(const std::string& key, const std::string& value)
{
if (key == "factor") {
m_factorType = Scan<FactorType>(value);
@ -50,7 +39,6 @@ bool TargetBigramFeature::SetParameter(const std::string& key, const std::string
} else {
StatefulFeatureFunction::SetParameter(key, value);
}
return true;
}
void TargetBigramFeature::Load()

View File

@ -47,7 +47,7 @@ public:
ScoreComponentCollection* ) const {
abort();
}
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
private:
FactorType m_factorType;

View File

@ -41,22 +41,10 @@ TargetNgramFeature::TargetNgramFeature(const std::string &line)
:StatefulFeatureFunction("TargetNgramFeature", 0, line)
{
std::cerr << "Initializing target ngram feature.." << std::endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
}
bool TargetNgramFeature::SetParameter(const std::string& key, const std::string& value)
void TargetNgramFeature::SetParameter(const std::string& key, const std::string& value)
{
if (key == "factor") {
m_factorType = Scan<FactorType>(value);
@ -65,9 +53,8 @@ bool TargetNgramFeature::SetParameter(const std::string& key, const std::string&
} else if (key == "lower-ngrams") {
m_lower_ngrams = Scan<bool>(value);
} else {
return StatefulFeatureFunction::SetParameter(key, value);
StatefulFeatureFunction::SetParameter(key, value);
}
return true;
}
bool TargetNgramFeature::Load(const std::string &filePath)

View File

@ -191,7 +191,7 @@ public:
virtual FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureId,
ScoreComponentCollection* accumulator) const;
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
private:
FactorType m_factorType;

View File

@ -20,30 +20,18 @@ TargetWordInsertionFeature::TargetWordInsertionFeature(const std::string &line)
m_unrestricted(true)
{
std::cerr << "Initializing target word insertion feature.." << std::endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
}
bool TargetWordInsertionFeature::SetParameter(const std::string& key, const std::string& value)
void TargetWordInsertionFeature::SetParameter(const std::string& key, const std::string& value)
{
if (key == "factor") {
m_factorType = Scan<FactorType>(value);
} else if (key == "path") {
m_filename = value;
} else {
return StatelessFeatureFunction::SetParameter(key, value);
StatelessFeatureFunction::SetParameter(key, value);
}
return true;
}
void TargetWordInsertionFeature::Load()

View File

@ -37,7 +37,7 @@ public:
const TargetPhrase& targetPhrase,
ScoreComponentCollection* accumulator,
const AlignmentInfo &alignmentInfo) const;
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
};

View File

@ -1,7 +1,17 @@
#include "UnknownWordPenaltyProducer.h"
#include <vector>
#include <string>
using namespace std;
namespace Moses
{
UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(const std::string &line)
: StatelessFeatureFunction("UnknownWordPenalty",1, line)
{
m_tuneable = false;
ReadParameters();
}
}

View File

@ -15,11 +15,7 @@ class WordsRange;
class UnknownWordPenaltyProducer : public StatelessFeatureFunction
{
public:
UnknownWordPenaltyProducer(const std::string &line)
: StatelessFeatureFunction("UnknownWordPenalty",1, line) {
m_tuneable = false;
CHECK(m_args.size() == 0);
}
UnknownWordPenaltyProducer(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;

View File

@ -2,8 +2,16 @@
#include "moses/TargetPhrase.h"
#include "moses/ScoreComponentCollection.h"
using namespace std;
namespace Moses
{
WordPenaltyProducer::WordPenaltyProducer(const std::string &line)
: StatelessFeatureFunction("WordPenalty",1, line)
{
ReadParameters();
}
void WordPenaltyProducer::Evaluate(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown

View File

@ -12,10 +12,7 @@ class ScoreComponentCollection;
class WordPenaltyProducer : public StatelessFeatureFunction
{
public:
WordPenaltyProducer(const std::string &line)
: StatelessFeatureFunction("WordPenalty",1, line) {
CHECK(m_args.size() == 0);
}
WordPenaltyProducer(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;

View File

@ -26,18 +26,7 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line)
,m_domainTrigger(false)
{
std::cerr << "Initializing word translation feature.. " << endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
if (m_simple == 1) std::cerr << "using simple word translations.. ";
if (m_sourceContext == 1) std::cerr << "using source context.. ";
@ -71,7 +60,7 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line)
}
bool WordTranslationFeature::SetParameter(const std::string& key, const std::string& value)
void WordTranslationFeature::SetParameter(const std::string& key, const std::string& value)
{
if (key == "input-factor") {
m_factorTypeSource = Scan<FactorType>(value);
@ -94,9 +83,8 @@ bool WordTranslationFeature::SetParameter(const std::string& key, const std::str
} else if (key == "target-path") {
m_filePathTarget = value;
} else {
return StatelessFeatureFunction::SetParameter(key, value);
StatelessFeatureFunction::SetParameter(key, value);
}
return true;
}
void WordTranslationFeature::Load()

View File

@ -52,7 +52,7 @@ public:
void EvaluateChart(const ChartBasedFeatureContext& context,
ScoreComponentCollection* accumulator) const;
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
};
}

View File

@ -38,14 +38,7 @@ namespace Moses
GenerationDictionary::GenerationDictionary(const std::string &line)
: DecodeFeature("Generation", line)
{
for (size_t i = 0; i < m_args.size(); ++i) {
const vector<string> &args = m_args[i];
if (args[0] == "path") {
m_filePath = args[1];
}
}
ReadParameters();
}
void GenerationDictionary::Load()
@ -133,5 +126,14 @@ const OutputWordCollection *GenerationDictionary::FindWord(const Word &word) con
return ret;
}
void GenerationDictionary::SetParameter(const std::string& key, const std::string& value)
{
if (key == "path") {
m_filePath = value;
} else {
DecodeFeature::SetParameter(key, value);
}
}
}

View File

@ -68,6 +68,7 @@ public:
* Or NULL if the input word isn't found. The search function used is the WordComparer functor
*/
const OutputWordCollection *FindWord(const Word &word) const;
void SetParameter(const std::string& key, const std::string& value);
};

View File

@ -11,7 +11,6 @@ if $(with-dlib) {
}
alias headers : ../util//kenutil : : : $(max-factors) $(dlib) ;
alias ThreadPool : ThreadPool.cpp ;
if [ option.get "with-synlm" : no : yes ] = yes
@ -41,6 +40,7 @@ lib moses :
TranslationModel/Scope3Parser/*.cpp
TranslationModel/CYKPlusParser/*.cpp
FF/*.cpp
FF/OSM-Feature/*.cpp
: #exceptions
ThreadPool.cpp
SyntacticLanguageModel.cpp

View File

@ -383,9 +383,10 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
try {
lm::ngram::ModelType model_type;
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
switch(model_type) {
case lm::ngram::PROBING:
return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
case lm::ngram::REST_PROBING:
return new LanguageModelKen<lm::ngram::RestProbingModel>(description, line, file, factorType, lazy);
case lm::ngram::TRIE:

View File

@ -38,7 +38,7 @@ public:
}
LabelId add(const Key& k) {
std::pair<typename M::iterator,bool> p
=m.insert(std::make_pair(k,data.size()));
=m.insert(std::make_pair(k,data.size()));
if(p.second) data.push_back(k);
CHECK(static_cast<size_t>(p.first->second)<data.size());
return p.first->second;

View File

@ -68,6 +68,7 @@ Manager::~Manager()
{
delete m_transOptColl;
delete m_search;
// this is a comment ...
StaticData::Instance().CleanUpAfterSentenceProcessing(m_source);
}

View File

@ -275,13 +275,15 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// overwrite parameters with values from switches
for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++) {
for(PARAM_STRING::const_iterator iterParam = m_description.begin();
iterParam != m_description.end(); iterParam++) {
const string paramName = iterParam->first;
OverwriteParam("-" + paramName, paramName, argc, argv);
}
// ... also shortcuts
for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++) {
for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
iterParam != m_abbreviation.end(); iterParam++) {
const string paramName = iterParam->first;
const string paramShortName = iterParam->second;
OverwriteParam("-" + paramShortName, paramName, argc, argv);
@ -294,7 +296,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
verbose = Scan<int>(m_setting["verbose"][0]);
if (verbose >= 1) { // only if verbose
TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) {
for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ;
iterParam != m_setting.end(); iterParam++) {
TRACE_ERR( "\t" << iterParam->first << ": ");
for ( size_t i = 0; i < iterParam->second.size(); i++ )
TRACE_ERR( iterParam->second[i] << " ");
@ -303,7 +306,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// convert old weights args to new format
if (!isParamSpecified("feature"))
// WHAT IS GOING ON HERE??? - UG
if (!isParamSpecified("feature")) // UG
ConvertWeightArgs();
CreateWeightsMap();
WeightOverwrite();
@ -331,11 +335,11 @@ std::vector<float> &Parameter::GetWeights(const std::string &name)
{
std::vector<float> &ret = m_weights[name];
cerr << "WEIGHT " << name << "=";
for (size_t i = 0; i < ret.size(); ++i) {
cerr << ret[i] << ",";
}
cerr << endl;
// cerr << "WEIGHT " << name << "=";
// for (size_t i = 0; i < ret.size(); ++i) {
// cerr << ret[i] << ",";
// }
// cerr << endl;
return ret;
}
@ -357,7 +361,10 @@ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<floa
newWeights.push_back(line);
}
void Parameter::AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights)
void
Parameter::
AddWeight(const std::string &name, size_t ind,
const std::vector<float> &weights)
{
PARAM_VEC &newWeights = m_setting["weight"];
@ -478,6 +485,12 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
case Compact:
ptType = "PhraseDictionaryCompact";
break;
case SuffixArray:
ptType = "PhraseDictionarySuffixArray";
break;
case DSuffixArray:
ptType = "PhraseDictionaryDynSuffixArray";
break;
default:
break;
}
@ -502,6 +515,9 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
++currOldInd;
}
// cerr << weights.size() << " PHRASE TABLE WEIGHTS "
// << __FILE__ << ":" << __LINE__ << endl;
AddWeight(ptType, ptInd, weights);
// actual pt
@ -527,7 +543,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
ptLine << "num-features=" << numScoreComponent << " ";
ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";
if (implementation == SuffixArray) {
if (implementation == SuffixArray || implementation == DSuffixArray) {
ptLine << "target-path=" << token[5] << " ";
ptLine << "alignment-path=" << token[6] << " ";
}

View File

@ -61,6 +61,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/FF/DistortionScoreProducer.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "moses/FF/InputFeature.h"
#include "moses/FF/PhrasePenalty.h"
#include "moses/FF/OSM-Feature/OpSequenceModel.h"
#include "LM/Ken.h"
#ifdef LM_IRST
@ -691,6 +693,14 @@ bool StaticData::LoadData(Parameter *parameter)
PhraseDictionaryDynSuffixArray* model = new PhraseDictionaryDynSuffixArray(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
} else if (feature == "OpSequenceModel") {
OpSequenceModel* model = new OpSequenceModel(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
} else if (feature == "PhrasePenalty") {
PhrasePenalty* model = new PhrasePenalty(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
}
#ifdef HAVE_SYNLM
@ -938,7 +948,7 @@ const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGra
boost::mutex::scoped_lock lock(m_transOptCacheMutex);
#endif
std::map<std::pair<std::pair<size_t, std::string>, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter
= m_transOptCache.find(key);
= m_transOptCache.find(key);
if (iter == m_transOptCache.end())
return NULL;
iter->second.second = clock(); // update last used time
@ -1166,7 +1176,6 @@ void StaticData::LoadFeatureFunctions()
}
}
// load phrase table
for (size_t i = 0; i < m_phraseDictionary.size(); ++i) {
PhraseDictionary *pt = m_phraseDictionary[i];
pt->Load();

View File

@ -673,7 +673,7 @@ public:
return false;
}
std::map< std::string, std::set< std::string > >::const_iterator lookupIgnoreFF
= m_weightSettingIgnoreFF.find( m_currentWeightSetting );
= m_weightSettingIgnoreFF.find( m_currentWeightSetting );
if (lookupIgnoreFF == m_weightSettingIgnoreFF.end()) {
return false;
}
@ -691,7 +691,7 @@ public:
return false;
}
std::map< std::string, std::set< size_t > >::const_iterator lookupIgnoreDP
= m_weightSettingIgnoreDP.find( m_currentWeightSetting );
= m_weightSettingIgnoreDP.find( m_currentWeightSetting );
if (lookupIgnoreDP == m_weightSettingIgnoreDP.end()) {
return false;
}

View File

@ -35,11 +35,11 @@ struct CompareTargetPhrase {
void TargetPhraseCollection::NthElement(size_t tableLimit)
{
vector<TargetPhrase*>::iterator
iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit) ?m_collection.end() : m_collection.begin() + tableLimit;
//std::sort(m_collection.begin(), m_collection.end(), CompareTargetPhrase());
std::nth_element(m_collection.begin(), iterMiddle, m_collection.end(), CompareTargetPhrase());
vector<TargetPhrase*>::iterator nth;
nth = (tableLimit && tableLimit <= m_collection.size()
? m_collection.begin() + tableLimit
: m_collection.end());
std::nth_element(m_collection.begin(), nth, m_collection.end(), CompareTargetPhrase());
}
void TargetPhraseCollection::Prune(bool adhereTableLimit, size_t tableLimit)

File diff suppressed because it is too large Load Diff

View File

@ -5,23 +5,29 @@
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/TranslationModel/WordCoocTable.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include "moses/TargetPhraseCollection.h"
#include <map>
using namespace std;
namespace Moses
{
class PhraseDictionaryDynSuffixArray;
/** @todo ask Abbey Levenberg
*/
class SAPhrase
{
public:
std::vector<wordID_t> words;
vector<wordID_t> words;
SAPhrase(size_t phraseSize)
:words(phraseSize) {
}
:words(phraseSize)
{}
void SetId(size_t pos, wordID_t id) {
CHECK(pos < words.size());
@ -43,12 +49,16 @@ public:
, m_endTarget(endTarget)
, m_startSource(startSource)
, m_endSource(endSource)
, m_sntIndex(sntIndex) {
}
, m_sntIndex(sntIndex)
{}
size_t GetTargetSize() const {
return m_endTarget - m_startTarget + 1;
}
size_t GetSourceSize() const {
return m_endSource - m_startSource + 1;
}
};
/** @todo ask Abbey Levenberg
@ -58,32 +68,43 @@ class SentenceAlignment
public:
SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex;
std::vector<wordID_t>* trgSnt;
std::vector<wordID_t>* srcSnt;
std::vector<int> numberAligned;
std::vector< std::vector<int> > alignedList;
bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
vector<wordID_t>* trgSnt;
vector<wordID_t>* srcSnt;
vector<int> numberAligned;
vector< vector<int> > alignedList;
bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret,
int startSource, int endSource) const;
};
class ScoresComp
{
public:
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
ScoresComp(const vector<float>& weights): m_weights(weights) {}
bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
/*float score1(0), score2(0);
int idx1(0), idx2(0);
for (Scores::const_iterator itr = s1.begin();
itr != s1.end(); ++itr) {
score1 += log(*itr * m_weights.at(idx1++));
}
for (Scores::const_iterator itr = s2.begin();
itr != s2.end(); ++itr) {
score2 += log(*itr * m_weights.at(idx2++));
}
return score1 < score2;*/
// float score1(0), score2(0);
// int idx1(0), idx2(0);
// for (Scores::const_iterator itr = s1.begin();
// itr != s1.end(); ++itr) {
// score1 += log(*itr * m_weights.at(idx1++));
// }
// for (Scores::const_iterator itr = s2.begin();
// itr != s2.end(); ++itr) {
// score2 += log(*itr * m_weights.at(idx2++));
// }
// return score1 < score2;
}
private:
const std::vector<float>& m_weights;
const vector<float>& m_weights;
};
struct BetterPhrase {
ScoresComp const& cmp;
BetterPhrase(ScoresComp const& sc);
// bool operator()(pair<Scores, TargetPhrase const*> const& a,
// pair<Scores, TargetPhrase const*> const& b) const;
bool operator()(pair<Scores, SAPhrase const*> const& a,
pair<Scores, SAPhrase const*> const& b) const;
};
/** @todo ask Abbey Levenberg
@ -93,66 +114,70 @@ class BilingualDynSuffixArray
public:
BilingualDynSuffixArray();
~BilingualDynSuffixArray();
bool Load( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputTactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
bool LoadTM( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputTactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void addSntPair(string& source, string& target, string& alignment);
private:
DynSuffixArray* m_srcSA;
DynSuffixArray* m_trgSA;
std::vector<wordID_t>* m_srcCorpus;
std::vector<wordID_t>* m_trgCorpus;
std::vector<FactorType> m_inputFactors;
std::vector<FactorType> m_outputFactors;
bool Load( const vector<FactorType>& inputFactors,
const vector<FactorType>& outputTactors,
string source, string target, string alignments,
const vector<float> &weight);
// bool LoadTM( const vector<FactorType>& inputFactors,
// const vector<FactorType>& outputTactors,
// string source, string target, string alignments,
// const vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const;
std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
void CleanUp(const InputType& source);
void addSntPair(string& source, string& target, string& alignment);
pair<float,float>
GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const;
TargetPhrase*
GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
private:
mutable WordCoocTable m_wrd_cooc;
DynSuffixArray * m_srcSA;
DynSuffixArray * m_trgSA;
vector<wordID_t>* m_srcCorpus;
vector<wordID_t>* m_trgCorpus;
vector<FactorType> m_inputFactors;
vector<FactorType> m_outputFactors;
vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp;
std::vector<SentenceAlignment> m_alignments;
std::vector<std::vector<short> > m_rawAlignments;
vector<SentenceAlignment> m_alignments;
vector<vector<short> > m_rawAlignments;
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached;
mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache;
mutable set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize;
int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
std::vector<wordID_t>&, std::vector<wordID_t>&,
const size_t m_maxPTEntries;
int LoadCorpus(FactorDirection direction,
InputFileStream&, const vector<FactorType>& factors,
vector<wordID_t>&, vector<wordID_t>&,
Vocab*);
int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs);
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
int SampleSelection(std::vector<unsigned>&, int = 300) const;
int SampleSelection(vector<unsigned>&, int = 300) const;
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
pair<float, float> GetLexicalWeight(const PhrasePair&) const;
int GetSourceSentenceSize(size_t sentenceId) const;
int GetTargetSentenceSize(size_t sentenceId) const;
int GetSourceSentenceSize(size_t sentenceId) const {
return (sentenceId==m_srcSntBreaks.size()-1) ?
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
}
int GetTargetSentenceSize(size_t sentenceId) const {
return (sentenceId==m_trgSntBreaks.size()-1) ?
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
}
};
} // end namespace
#endif

View File

@ -234,12 +234,12 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
std::vector<float> weightT = staticData.GetWeights(&m_dictionary);
targetPhraseCollection
= tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
,m_outputFactorsVec
,m_dictionary
,weightT
,m_filePath
, m_dbWrapper.GetVocab());
= tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
,m_outputFactorsVec
,m_dictionary
,weightT
,m_filePath
, m_dbWrapper.GetVocab());
delete tpcollBerkeleyDb;
m_cache[tpCollFilePos] = targetPhraseCollection;

View File

@ -428,7 +428,7 @@ void CompressionTaskReordering::operator()()
while(scoresNum < m_encodedScores.size()) {
std::string scores = m_encodedScores[scoresNum];
std::string compressedScores
= m_creator.CompressEncodedScores(scores);
= m_creator.CompressEncodedScores(scores);
std::string dummy;
PackedItem packedItem(scoresNum, dummy, compressedScores, 0);

View File

@ -61,7 +61,7 @@ PhraseDecoder::~PhraseDecoder()
inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
= m_sourceSymbolsMap.find(symbol);
= m_sourceSymbolsMap.find(symbol);
if(it != m_sourceSymbolsMap.end())
return it->second;
@ -200,7 +200,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &
if(m_coding == PREnc) {
std::pair<TargetPhraseVectorPtr, size_t> cachedPhraseColl
= m_decodingCache.Retrieve(sourcePhrase);
= m_decodingCache.Retrieve(sourcePhrase);
// Has been cached and is complete or does not need to be completed
if(cachedPhraseColl.first != NULL && (!topLevel || cachedPhraseColl.second == 0))
@ -255,7 +255,7 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
if(m_coding == REnc) {
for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
std::string sourceWord
= sourcePhrase.GetWord(i).GetString(*m_input, false);
= sourcePhrase.GetWord(i).GetString(*m_input, false);
unsigned idx = GetSourceSymbolId(sourceWord);
sourceWords.push_back(idx);
}

View File

@ -41,6 +41,17 @@ using namespace std;
namespace Moses
{
PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line)
:PhraseDictionary("PhraseDictionaryCompact", line)
,m_inMemory(true)
,m_useAlignmentInfo(true)
,m_hash(10, 16)
,m_phraseDecoder(0)
,m_weight(0)
{
ReadParameters();
}
void PhraseDictionaryCompact::Load()
{
const StaticData &staticData = StaticData::Instance();
@ -106,7 +117,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c
// Retrieve target phrase collection from phrase table
TargetPhraseVectorPtr decodedPhraseColl
= m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
= m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));

View File

@ -68,14 +68,7 @@ protected:
std::vector<float> m_weight;
public:
PhraseDictionaryCompact(const std::string &line)
:PhraseDictionary("PhraseDictionaryCompact", line)
,m_inMemory(true)
,m_useAlignmentInfo(true)
,m_hash(10, 16)
,m_phraseDecoder(0)
,m_weight(0) {
}
PhraseDictionaryCompact(const std::string &line);
~PhraseDictionaryCompact();

View File

@ -426,7 +426,7 @@ void PhraseTableCreator::AddTargetSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
= m_sourceSymbolsMap.find(symbol);
= m_sourceSymbolsMap.find(symbol);
if(it != m_sourceSymbolsMap.end())
return it->second;
@ -437,7 +437,7 @@ unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
= m_targetSymbolsMap.find(symbol);
= m_targetSymbolsMap.find(symbol);
if(it != m_targetSymbolsMap.end())
return it->second;
@ -451,7 +451,7 @@ unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol)
boost::mutex::scoped_lock lock(m_mutex);
#endif
boost::unordered_map<std::string, unsigned>::iterator it
= m_targetSymbolsMap.find(symbol);
= m_targetSymbolsMap.find(symbol);
if(it != m_targetSymbolsMap.end())
return it->second;
@ -1200,7 +1200,7 @@ void CompressionTask::operator()()
while(collectionNum < m_encodedCollections.size()) {
std::string collection = m_encodedCollections[collectionNum];
std::string compressedCollection
= m_creator.CompressEncodedCollection(collection);
= m_creator.CompressEncodedCollection(collection);
std::string dummy;
PackedItem packedItem(collectionNum, dummy, compressedCollection, 0);

View File

@ -143,7 +143,7 @@ public:
return data;
else {
typename std::vector<DataType>::iterator it
= std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
= std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
if(it != m_bestVec.end())
return *it;
else

View File

@ -1,5 +1,6 @@
#include "DynSuffixArray.h"
#include <iostream>
#include <boost/foreach.hpp>
using namespace std;
@ -215,8 +216,37 @@ void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */
return;
}
ComparePosition::
ComparePosition(vuint_t const& crp, vuint_t const& sfa)
: m_crp(crp), m_sfa(sfa) { }
bool
ComparePosition::
operator()(unsigned const& i, vector<wordID_t> const& phrase) const
{
unsigned const* x = &m_crp.at(i);
unsigned const* e = &m_crp.back();
size_t k = 0;
for (; k < phrase.size() && x < e; ++k, ++x)
if (*x != phrase[k]) return *x < phrase[k];
return (x == e && k < phrase.size());
}
bool
ComparePosition::
operator()(vector<wordID_t> const& phrase, unsigned const& i) const
{
unsigned const* x = &m_crp.at(i);
unsigned const* e = &m_crp.back();
size_t k = 0;
for (; k < phrase.size() && x < e; ++k, ++x)
if (*x != phrase[k]) return phrase[k] < *x;
return false; // (k == phrase.size() && x < e);
}
bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
{
// DOES THIS EVEN WORK WHEN A DynSuffixArray has been saved and reloaded????
pair<vuint_t::iterator,vuint_t::iterator> bounds;
indices->clear();
size_t phrasesize = phrase->size();
@ -251,6 +281,16 @@ bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
return (indices->size() > 0);
}
size_t
DynSuffixArray::
GetCount(vuint_t const& phrase) const
{
ComparePosition cmp(*m_corpus, *m_SA);
vuint_t::const_iterator lb = lower_bound(m_SA->begin(), m_SA->end(), phrase, cmp);
vuint_t::const_iterator ub = upper_bound(m_SA->begin(), m_SA->end(), phrase, cmp);
return ub-lb;
}
void DynSuffixArray::Save(FILE* fout)
{
fWriteVector(fout, *m_SA);

View File

@ -11,9 +11,25 @@
namespace Moses
{
using namespace std;
typedef std::vector<unsigned> vuint_t;
/// compare position /i/ in the suffix array /m_sfa/ into corpus /m_crp/
/// against reference phrase /phrase/
// added by Ulrich Germann
class ComparePosition
{
vuint_t const& m_crp;
vuint_t const& m_sfa;
public:
ComparePosition(vuint_t const& crp, vuint_t const& sfa);
bool operator()(unsigned const& i, vector<wordID_t> const& phrase) const;
bool operator()(vector<wordID_t> const& phrase, unsigned const& i) const;
};
/** @todo ask Abbey Levenberg
*/
class DynSuffixArray
@ -30,6 +46,8 @@ public:
void Delete(unsigned, unsigned);
void Substitute(vuint_t*, unsigned);
size_t GetCount(vuint_t const& phrase) const;
private:
vuint_t* m_SA;
vuint_t* m_ISA;
@ -46,10 +64,10 @@ private:
void PrintAuxArrays() {
std::cerr << "SA\tISA\tF\tL\n";
for(size_t i=0; i < m_SA->size(); ++i)
std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t"
<< m_F->at(i) << "\t" << m_L->at(i) << std::endl;
}
};
} //end namespace
#endif

View File

@ -34,16 +34,6 @@ PhraseDictionary::PhraseDictionary(const std::string &description, const std::st
:DecodeFeature(description, line)
,m_tableLimit(20) // default
{
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
}
@ -54,16 +44,15 @@ GetTargetPhraseCollection(InputType const& src,WordsRange const& range) const
return GetTargetPhraseCollection(phrase);
}
bool PhraseDictionary::SetParameter(const std::string& key, const std::string& value)
void PhraseDictionary::SetParameter(const std::string& key, const std::string& value)
{
if (key == "path") {
m_filePath = value;
} else if (key == "table-limit") {
m_tableLimit = Scan<size_t>(value);
} else {
return DecodeFeature::SetParameter(key, value);
DecodeFeature::SetParameter(key, value);
}
return true;
}
void PhraseDictionary::SetFeaturesToApply()

View File

@ -91,7 +91,7 @@ public:
return m_featuresToApply;
}
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
protected:
size_t m_tableLimit;

View File

@ -0,0 +1,4 @@
Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini
[ttable-file]
14 0 0 5 <source language text file> <target language text file> <file with alignment info in symal format>

View File

@ -3,84 +3,35 @@
#include "moses/StaticData.h"
#include "moses/TargetPhrase.h"
#include <iomanip>
#include <boost/foreach.hpp>
using namespace std;
namespace Moses
{
PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line)
:PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
PhraseDictionaryDynSuffixArray::
PhraseDictionaryDynSuffixArray(const std::string &line)
: PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
,m_biSA(new BilingualDynSuffixArray())
{
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
}
PhraseDictionaryDynSuffixArray::~PhraseDictionaryDynSuffixArray()
{
delete m_biSA;
}
void PhraseDictionaryDynSuffixArray::Load()
{
SetFeaturesToApply();
const StaticData &staticData = StaticData::Instance();
vector<float> weight = staticData.GetWeights(this);
m_biSA->Load( m_input, m_output, m_source, m_target, m_alignments, weight);
vector<float> weight = StaticData::Instance().GetWeights(this);
m_biSA->Load(m_input, m_output, m_source, m_target, m_alignments, weight);
}
const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCollection(const Phrase& src) const
PhraseDictionaryDynSuffixArray::
~PhraseDictionaryDynSuffixArray()
{
TargetPhraseCollection *ret = new TargetPhraseCollection();
std::vector< std::pair< Scores, TargetPhrase*> > trg;
// extract target phrases and their scores from suffix array
m_biSA->GetTargetPhrasesByLexicalWeight( src, trg);
std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
for(itr = trg.begin(); itr != trg.end(); ++itr) {
Scores scoreVector = itr->first;
TargetPhrase *targetPhrase = itr->second;
//std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),NegateScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
targetPhrase->Evaluate(src);
//cout << *targetPhrase << "\t" << std::setprecision(8) << scoreVector[2] << endl;
ret->Add(targetPhrase);
}
ret->NthElement(m_tableLimit); // sort the phrases for the dcoder
return ret;
delete m_biSA;
}
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
// need to implement --
}
ChartRuleLookupManager *PhraseDictionaryDynSuffixArray::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
throw "Chart decoding not supported by PhraseDictionaryDynSuffixArray";
}
bool PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value)
void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value)
{
if (key == "source") {
m_source = value;
@ -89,9 +40,66 @@ bool PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const
} else if (key == "alignment") {
m_alignments = value;
} else {
return PhraseDictionary::SetParameter(key, value);
PhraseDictionary::SetParameter(key, value);
}
return true;
}
const TargetPhraseCollection*
PhraseDictionaryDynSuffixArray::
GetTargetPhraseCollection(const Phrase& src) const
{
typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics
m_biSA->GatherCands(src,pstats);
TargetPhraseCollection *ret = new TargetPhraseCollection();
BOOST_FOREACH(pstat_entry & e, pstats) {
TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src);
tp->GetScoreBreakdown().Assign(this,e.second);
ret->Add(tp);
}
// return ret;
// TargetPhraseCollection *ret = new TargetPhraseCollection();
// std::vector< std::pair< Scores, TargetPhrase*> > trg;
//
// // extract target phrases and their scores from suffix array
// m_biSA->GetTargetPhrasesByLexicalWeight(src, trg);
//
// std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
// for(itr = trg.begin(); itr != trg.end(); ++itr) {
// Scores scoreVector = itr->first;
// TargetPhrase *targetPhrase = itr->second;
// std::transform(scoreVector.begin(),scoreVector.end(),
// scoreVector.begin(),FloorScore);
// targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
// targetPhrase->Evaluate();
// ret->Add(targetPhrase);
// }
ret->NthElement(m_tableLimit); // sort the phrases for the decoder
return ret;
}
void
PhraseDictionaryDynSuffixArray::
insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void
PhraseDictionaryDynSuffixArray::
deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
// need to implement --
}
ChartRuleLookupManager*
PhraseDictionaryDynSuffixArray::
CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
CHECK(false);
return 0;
}
}// end namepsace

View File

@ -17,21 +17,19 @@ class PhraseDictionaryDynSuffixArray: public PhraseDictionary
public:
PhraseDictionaryDynSuffixArray(const std::string &line);
~PhraseDictionaryDynSuffixArray();
bool InitDictionary();
void Load();
// functions below required by base class
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
void insertSnt(string&, string&, string&);
void deleteSnt(unsigned, unsigned);
ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&);
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
private:
BilingualDynSuffixArray *m_biSA;
std::string m_source, m_target, m_alignments;
std::vector<float> m_weight;
};
} // end namespace

View File

@ -39,6 +39,11 @@ using namespace std;
namespace Moses
{
PhraseDictionaryMemory::PhraseDictionaryMemory(const std::string &line)
: RuleTableTrie("PhraseDictionaryMemory", line)
{
ReadParameters();
}
TargetPhraseCollection &PhraseDictionaryMemory::GetOrCreateTargetPhraseCollection(
const Phrase &source

View File

@ -43,10 +43,7 @@ protected:
}
public:
PhraseDictionaryMemory(const std::string &line)
: RuleTableTrie("PhraseDictionaryMemory", line) {
CHECK(m_args.size() == 0);
}
PhraseDictionaryMemory(const std::string &line);
const PhraseDictionaryNodeMemory &GetRootNode() const {
return m_collection;

View File

@ -28,17 +28,7 @@ namespace Moses
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
:PhraseDictionary("PhraseDictionaryMultiModel", line)
{
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
if (m_mode != "interpolate") {
ostringstream msg;
@ -56,23 +46,12 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &description, const std::string &line)
:PhraseDictionary(description, line)
{
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
if (description == "PhraseDictionaryMultiModelCounts") {
CHECK(m_pdStr.size() == m_multimodelweights.size() || m_pdStr.size()*4 == m_multimodelweights.size());
}
}
bool PhraseDictionaryMultiModel::SetParameter(const std::string& key, const std::string& value)
void PhraseDictionaryMultiModel::SetParameter(const std::string& key, const std::string& value)
{
if (key == "mode") {
m_mode = value;
@ -82,9 +61,8 @@ bool PhraseDictionaryMultiModel::SetParameter(const std::string& key, const std:
} else if (key == "lambda") {
m_multimodelweights = Tokenize<float>(value, ",");
} else {
return PhraseDictionary::SetParameter(key, value);
PhraseDictionary::SetParameter(key, value);
}
return true;
}
PhraseDictionaryMultiModel::~PhraseDictionaryMultiModel()

View File

@ -81,7 +81,7 @@ public:
/* Don't do anything source specific here as this object is shared between threads.*/
}
ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&);
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
const std::vector<float>* GetTemporaryMultiModelWeightsVector() const;
void SetTemporaryMultiModelWeightsVector(std::vector<float> weights);

View File

@ -68,17 +68,7 @@ PhraseDictionaryMultiModelCounts::PhraseDictionaryMultiModelCounts(const std::st
//m_mode = "interpolate";
//m_combineFunction = LinearInterpolationFromCounts;
cerr << "m_args=" << m_args.size() << endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = SetParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
CHECK(m_args.size() == 0);
ReadParameters();
CHECK(m_targetTable.size() == m_pdStr.size());
@ -94,7 +84,7 @@ PhraseDictionaryMultiModelCounts::PhraseDictionaryMultiModelCounts(const std::st
}
bool PhraseDictionaryMultiModelCounts::SetParameter(const std::string& key, const std::string& value)
void PhraseDictionaryMultiModelCounts::SetParameter(const std::string& key, const std::string& value)
{
if (key == "mode") {
m_mode = value;
@ -107,10 +97,8 @@ bool PhraseDictionaryMultiModelCounts::SetParameter(const std::string& key, cons
} else if (key == "target-table") {
m_targetTable = Tokenize(value, ",");
} else {
return PhraseDictionaryMultiModel::SetParameter(key, value);
PhraseDictionaryMultiModel::SetParameter(key, value);
}
return true;
}
PhraseDictionaryMultiModelCounts::~PhraseDictionaryMultiModelCounts()

View File

@ -103,7 +103,7 @@ public:
/* Don't do anything source specific here as this object is shared between threads.*/
}
bool SetParameter(const std::string& key, const std::string& value);
void SetParameter(const std::string& key, const std::string& value);
private:
std::vector<PhraseDictionary*> m_inverse_pd;

View File

@ -29,7 +29,7 @@ PhraseDictionaryTreeAdaptor::
PhraseDictionaryTreeAdaptor(const std::string &line)
: PhraseDictionary("PhraseDictionaryBinary", line)
{
CHECK(m_args.size() == 0);
ReadParameters();
}
PhraseDictionaryTreeAdaptor::~PhraseDictionaryTreeAdaptor()

View File

@ -3,11 +3,17 @@
#ifndef moses_PhraseDictionaryTreeAdaptor_h
#define moses_PhraseDictionaryTreeAdaptor_h
#include <vector>
#include "util/check.hh"
#include "moses/TypeDef.h"
#include "moses/TargetPhraseCollection.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "util/check.hh"
#include <vector>
#ifdef WITH_THREADS
#include <boost/thread/tss.hpp>
#else
#include <boost/scoped_ptr.hpp>
#endif
namespace Moses
{
@ -24,7 +30,11 @@ class PhraseDictionaryTreeAdaptor : public PhraseDictionary
{
typedef PhraseDictionary MyBase;
#ifdef WITH_THREADS
boost::thread_specific_ptr<PDTAimp> m_implementation;
#else
boost::scoped_ptr<PDTAimp> m_implementation;
#endif
friend class PDTAimp;
PhraseDictionaryTreeAdaptor();

View File

@ -27,7 +27,8 @@ PhraseDictionaryALSuffixArray::PhraseDictionaryALSuffixArray(const std::string &
if (staticData.ThreadCount() > 1) {
throw runtime_error("Suffix array implementation is not threadsafe");
}
CHECK(m_args.size() == 0);
ReadParameters();
}
void PhraseDictionaryALSuffixArray::Load()

View File

@ -30,6 +30,12 @@ using namespace std;
namespace Moses
{
PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
: MyBase("PhraseDictionaryOnDisk", line)
{
ReadParameters();
}
PhraseDictionaryOnDisk::~PhraseDictionaryOnDisk()
{
}

View File

@ -30,6 +30,12 @@
#include "OnDiskPt/PhraseNode.h"
#include "util/check.hh"
#ifdef WITH_THREADS
#include <boost/thread/tss.hpp>
#else
#include <boost/scoped_ptr.hpp>
#endif
namespace Moses
{
class TargetPhraseCollection;
@ -43,16 +49,17 @@ class PhraseDictionaryOnDisk : public PhraseDictionary
friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryOnDisk&);
protected:
#ifdef WITH_THREADS
boost::thread_specific_ptr<OnDiskPt::OnDiskWrapper> m_implementation;
#else
boost::scoped_ptr<OnDiskPt::OnDiskWrapper> m_implementation;
#endif
OnDiskPt::OnDiskWrapper &GetImplementation();
const OnDiskPt::OnDiskWrapper &GetImplementation() const;
public:
PhraseDictionaryOnDisk(const std::string &line)
: MyBase("PhraseDictionaryOnDisk", line) {
CHECK(m_args.size() == 0);
}
PhraseDictionaryOnDisk(const std::string &line);
~PhraseDictionaryOnDisk();
void Load();

View File

@ -48,12 +48,6 @@ public:
void Load();
// Required by PhraseDictionary.
virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const {
CHECK(false);
return NULL;
}
private:
friend class RuleTableLoader;

View File

@ -0,0 +1,72 @@
#include "moses/TranslationModel/WordCoocTable.h"
using namespace std;
namespace Moses
{
/// Default constructor: pre-allocates room for one million word ids on
/// each side to reduce reallocation while counting.
WordCoocTable::
WordCoocTable()
{
  size_t const initial_capacity = 1000000;
  m_cooc.reserve(initial_capacity);
  m_marg1.reserve(initial_capacity);
  m_marg2.reserve(initial_capacity);
}
// Construct with known vocabulary sizes: the co-occurrence rows and both
// marginal vectors are allocated up front, marginals zero-initialized.
WordCoocTable::
WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2)
: m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0)
{}
/// Record one co-occurrence of word /a/ (side 1) with word /b/ (side 2).
/// Tables grow on demand, so ids beyond the current size are always legal.
void
WordCoocTable::
Count(size_t const a, size_t const b)
{
  if (a >= m_marg1.size()) {
    m_cooc.resize(a + 1);
    m_marg1.resize(a + 1, 0);
  }
  if (b >= m_marg2.size())
    m_marg2.resize(b + 1, 0);
  m_marg1[a] += 1;
  m_marg2[b] += 1;
  m_cooc[a][b] += 1;
}
/// Joint count of the pair (a,b); 0 if either id is out of range or the
/// two words were never counted together.
uint32_t
WordCoocTable::
GetJoint(size_t const a, size_t const b) const
{
  if (a >= m_marg1.size() || b >= m_marg2.size())
    return 0;
  my_map_t const &row = m_cooc.at(a);
  my_map_t::const_iterator hit = row.find(b);
  return hit == row.end() ? 0 : hit->second;
}
/// Marginal count of word /x/ on side 1 (0 if the id was never seen).
uint32_t
WordCoocTable::
GetMarg1(size_t const x) const
{
  if (x < m_marg1.size())
    return m_marg1[x];
  return 0;
}
/// Marginal count of word /x/ on side 2 (0 if the id was never seen).
uint32_t
WordCoocTable::
GetMarg2(size_t const x) const
{
  if (x < m_marg2.size())
    return m_marg2[x];
  return 0;
}
/// Forward conditional probability p(b|a) = joint(a,b) / marg1(a).
/// Returns 0 when /a/ has no counts instead of dividing by zero
/// (the unguarded division produced NaN for unseen words).
float
WordCoocTable::
pfwd(size_t const a, size_t const b) const
{
  uint32_t const marg = GetMarg1(a);
  // GetJoint(a,b) is necessarily 0 whenever marg is 0, so 0 is the
  // only consistent value for the degenerate case
  return marg ? float(GetJoint(a,b)) / marg : 0;
}
/// Backward conditional probability p(a|b) = joint(a,b) / marg2(b).
/// Returns 0 when /b/ has no counts instead of dividing by zero
/// (the unguarded division produced NaN for unseen words).
float
WordCoocTable::
pbwd(size_t const a, size_t const b) const
{
  uint32_t const marg = GetMarg2(b);
  return marg ? float(GetJoint(a,b)) / marg : 0;
}
}

View File

@ -0,0 +1,72 @@
#ifndef moses_WordCoocTable_h
#define moses_WordCoocTable_h
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include <map>
namespace Moses
{
using namespace std;
#ifndef bitvector
typedef boost::dynamic_bitset<uint64_t> bitvector;
#endif
/**
* Stores word cooccurrence counts
* @todo ask Uli Germann
*/
class WordCoocTable
{
// sparse row: target word id -> joint count
typedef map<wordID_t,uint32_t> my_map_t;
// m_cooc[a][b] = joint count of pair (a,b); kept the same length as m_marg1
vector<my_map_t> m_cooc;
// marginal counts per word id, side 1 and side 2
vector<uint32_t> m_marg1;
vector<uint32_t> m_marg2;
public:
WordCoocTable();
WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2);
// all accessors return 0 for ids that are out of range / unseen
uint32_t GetJoint(size_t const a, size_t const b) const;
uint32_t GetMarg1(size_t const x) const;
uint32_t GetMarg2(size_t const x) const;
// conditional probabilities: joint/marg1 (forward), joint/marg2 (backward)
float pfwd(size_t const a, size_t const b) const;
float pbwd(size_t const a, size_t const b) const;
// count a single word pair, growing the tables on demand
void
Count(size_t const a, size_t const b);
// count all pairs of one aligned sentence pair; aln is a flat list of
// alternating (source index, target index) pairs, unaligned words are
// counted against NULL1/NULL2
template<typename idvec, typename alnvec>
void
Count(idvec const& s1, idvec const& s2, alnvec const& aln,
wordID_t const NULL1, wordID_t const NULL2);
};
template<typename idvec, typename alnvec>
void
WordCoocTable::
Count(idvec const& s1, idvec const& s2, alnvec const& aln,
wordID_t const NULL1, wordID_t const NULL2)
{
boost::dynamic_bitset<uint64_t> check1(s1.size()), check2(s2.size());
check1.set();
check2.set();
for (size_t i = 0; i < aln.size(); i += 2) {
Count(s1[aln[i]], s2[aln[i+1]]);
check1.reset(aln[i]);
check2.reset(aln[i+1]);
}
for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i))
Count(s1[i], NULL2);
for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i))
Count(NULL1, s2[i]);
}
}
#endif

View File

@ -39,17 +39,6 @@ using namespace std;
namespace Moses
{
InputLatticeNode::InputLatticeNode(const Phrase &phrase, const WordsRange &range)
:m_phrase(phrase)
,m_range(range)
{
}
void InputLatticeNode::AddNext(const InputLatticeNode &next)
{
m_next.push_back(&next);
}
/** helper for pruning */
bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b)
{
@ -245,6 +234,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s
// add to dictionary
Word &targetWord = targetPhrase.AddWord();
targetWord.SetIsOOV(true);
for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
FactorType factorType = static_cast<FactorType>(currFactor);
@ -373,7 +363,6 @@ void TranslationOptionCollection::CreateTranslationOptions()
// in the phraseDictionary (which is the- possibly filtered-- phrase
// table loaded on initialization), generate TranslationOption objects
// for all phrases
const StaticData &staticData = StaticData::Instance();
// there may be multiple decoding graphs (factorizations of decoding)
const vector <DecodeGraph*> &decodeGraphList = StaticData::Instance().GetDecodeGraphs();
@ -384,13 +373,10 @@ void TranslationOptionCollection::CreateTranslationOptions()
// loop over all decoding graphs, each generates translation options
for (size_t graphInd = 0 ; graphInd < decodeGraphList.size() ; graphInd++) {
if (staticData.IsDecodingGraphIgnored( graphInd )) {
std::cerr << "ignoring decoding path " << graphInd << std::endl;
continue;
}
if (decodeGraphList.size() > 1) {
VERBOSE(3,"Creating translation options from decoding graph " << graphInd << endl);
}
const DecodeGraph &decodeGraph = *decodeGraphList[graphInd];
// generate phrases that start at startPos ...
for (size_t startPos = 0 ; startPos < size; startPos++) {
@ -401,12 +387,10 @@ void TranslationOptionCollection::CreateTranslationOptions()
// ... and that end at endPos
for (size_t endPos = startPos ; endPos < startPos + maxSize ; endPos++) {
if (graphInd > 0 && // only skip subsequent graphs
decodeGraphBackoff[graphInd] != 0 && // limited use of backoff specified
(endPos-startPos+1 > decodeGraphBackoff[graphInd] || // size exceeds backoff limit or ...
m_collection[startPos][endPos-startPos].size() > 0)) { // already covered
VERBOSE(3,"No backoff to graph " << graphInd << " for span [" << startPos << ";" << endPos << "]");
VERBOSE(3,", length limit: " << decodeGraphBackoff[graphInd]);
VERBOSE(3,", found so far: " << m_collection[startPos][endPos-startPos].size() << endl);
decodeGraphBackoff[graphInd] != 0 && // use of backoff specified
(endPos-startPos+1 >= decodeGraphBackoff[graphInd] || // size exceeds backoff limit or ...
m_collection[startPos][endPos-startPos].size() > 0)) { // no phrases found so far
VERBOSE(3,"No backoff to graph " << graphInd << " for span [" << startPos << ";" << endPos << "]" << endl);
// do not create more options
continue;
}
@ -472,6 +456,117 @@ void TranslationOptionCollection::Sort()
}
/** create translation options that exactly cover a specific input span.
* Called by CreateTranslationOptions() and ProcessUnknownWord()
* \param decodeGraph list of decoding steps
* \param startPos first position of the span in the input sentence
* \param endPos last position of the span in the input sentence
* \param adhereTableLimit whether phrase & generation table limits are adhered to
* \param graphInd index of the decoding graph currently being processed
*/
void TranslationOptionCollection::CreateTranslationOptionsForRange(
const DecodeGraph &decodeGraph
, size_t startPos
, size_t endPos
, bool adhereTableLimit
, size_t graphInd)
{
if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
Phrase *sourcePhrase = NULL; // can't initialise with substring, in case it's confusion network
// consult persistent (cross-sentence) cache for stored translation options
bool skipTransOptCreation = false
, useCache = StaticData::Instance().GetUseTransOptCache();
if (useCache) {
const WordsRange wordsRange(startPos, endPos);
sourcePhrase = new Phrase(m_source.GetSubString(wordsRange));
const TranslationOptionList *transOptList = StaticData::Instance().FindTransOptListInCache(decodeGraph, *sourcePhrase);
// is phrase in cache?
if (transOptList != NULL) {
skipTransOptCreation = true;
// clone the cached options, re-anchored to this span
TranslationOptionList::const_iterator iterTransOpt;
for (iterTransOpt = transOptList->begin() ; iterTransOpt != transOptList->end() ; ++iterTransOpt) {
TranslationOption *transOpt = new TranslationOption(**iterTransOpt, wordsRange);
Add(transOpt);
}
}
} // useCache
if (!skipTransOptCreation) {
// partial trans opt stored in here
PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
size_t totalEarlyPruned = 0;
// initial translation step
list <const DecodeStep* >::const_iterator iterStep = decodeGraph.begin();
const DecodeStep &decodeStep = **iterStep;
static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslation
(m_source, *oldPtoc
, startPos, endPos, adhereTableLimit );
// do rest of decode steps
// NOTE(review): indexStep is incremented but never read below — confirm
// whether it can be removed or was meant to be passed to Process()
int indexStep = 0;
for (++iterStep ; iterStep != decodeGraph.end() ; ++iterStep) {
const DecodeStep &decodeStep = **iterStep;
PartialTranslOptColl* newPtoc = new PartialTranslOptColl;
// go thru each intermediate trans opt just created
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
// NOTE(review): sourcePhrase is only allocated when useCache is true;
// if this branch runs with useCache == false, *sourcePhrase below
// dereferences NULL — confirm Process() never uses it in that case
decodeStep.Process(inputPartialTranslOpt
, decodeStep
, *newPtoc
, this
, adhereTableLimit
, *sourcePhrase);
}
// last but 1 partial trans not required anymore
totalEarlyPruned += newPtoc->GetPrunedCount();
delete oldPtoc;
oldPtoc = newPtoc;
indexStep++;
} // for (++iterStep
// add to fully formed translation option list
PartialTranslOptColl &lastPartialTranslOptColl = *oldPtoc;
const vector<TranslationOption*>& partTransOptList = lastPartialTranslOptColl.GetList();
vector<TranslationOption*>::const_iterator iterColl;
for (iterColl = partTransOptList.begin() ; iterColl != partTransOptList.end() ; ++iterColl) {
TranslationOption *transOpt = *iterColl;
Add(transOpt);
}
// storing translation options in persistent cache (kept across sentences)
if (useCache) {
if (partTransOptList.size() > 0) {
TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
StaticData::Instance().AddTransOptListToCache(decodeGraph, *sourcePhrase, transOptList);
}
}
// hand ownership of the surviving options to this collection, then
// discard the (now empty) working list
lastPartialTranslOptColl.DetachAll();
totalEarlyPruned += oldPtoc->GetPrunedCount();
delete oldPtoc;
// TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
} // if (!skipTransOptCreation)
if (useCache)
delete sourcePhrase;
} // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))
if (graphInd == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough && HasXmlOptionsOverlappingRange(startPos,endPos)) {
CreateXmlOptionsForRange(startPos, endPos);
}
}
/** Check if this range overlaps with any XML options. This doesn't need to be an exact match, only an overlap.
* by default, we don't support XML options. subclasses need to override this function.

View File

@ -43,25 +43,6 @@ class FactorMask;
class Word;
class DecodeGraph;
/** Each node contains
1. substring used to searching the phrase table
2. the source range it covers
3. a list of InputLatticeNode that it is a prefix of
This is for both sentence input, and confusion network/lattices
*/
class InputLatticeNode
{
protected:
Phrase m_phrase;
WordsRange m_range;
std::vector<const InputLatticeNode*> m_next;
public:
InputLatticeNode(const Phrase &phrase, const WordsRange &range);
void AddNext(const InputLatticeNode &next);
};
/** Contains all phrase translations applicable to current input type (a sentence or confusion network).
* A key insight into efficient decoding is that various input
* conditions (trelliss, factored input, normal text, xml markup)
@ -133,21 +114,12 @@ public:
//! Create all possible translations from the phrase tables
virtual void CreateTranslationOptions();
//! Create translation options that exactly cover a specific input span.
/** create translation options that exactly cover a specific input span.
* Called by CreateTranslationOptions() and ProcessUnknownWord()
* \param decodeGraph list of decoding steps
* \param factorCollection input sentence with all factors
* \param startPos first position in input sentence
* \param lastPos last position in input sentence
* \param adhereTableLimit whether phrase & generation table limits are adhered to
*/
virtual void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
, size_t graphInd) = 0;
, size_t graphInd);
//!Check if this range has XML options
virtual bool HasXmlOptionsOverlappingRange(size_t startPosition, size_t endPosition) const;

View File

@ -1,15 +1,9 @@
// $Id$
#include <cassert>
#include <iostream>
#include "TranslationOptionCollectionConfusionNet.h"
#include "ConfusionNet.h"
#include "DecodeStep.h"
#include "FactorCollection.h"
#include "DecodeStepTranslation.h"
#include "DecodeStepGeneration.h"
#include "moses/FF/InputFeature.h"
using namespace std;
namespace Moses
{
@ -18,104 +12,7 @@ namespace Moses
TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet(
const ConfusionNet &input
, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
const StaticData &staticData = StaticData::Instance();
const InputFeature *inputFeature = staticData.GetInputFeature();
CHECK(inputFeature);
size_t size = input.GetSize();
// create matrix
for (size_t startPos = 0; startPos < size; ++startPos) {
std::vector<std::vector<SourcePath> > vec;
m_collection.push_back( vec );
size_t maxSize = size - startPos;
size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
maxSize = std::min(maxSize, maxSizePhrase);
for (size_t endPos = 0 ; endPos < maxSize ; ++endPos) {
std::vector<SourcePath> vec;
m_collection[startPos].push_back( vec );
}
// cut up confusion network into substrings
// start with 1-word phrases
std::vector<SourcePath> &subphrases = GetPhrases(startPos, startPos);
assert(subphrases.size() == 0);
const ConfusionNet::Column &col = input.GetColumn(startPos);
ConfusionNet::Column::const_iterator iter;
for (iter = col.begin(); iter != col.end(); ++iter) {
subphrases.push_back(SourcePath());
SourcePath &sourcePath = subphrases.back();
const std::pair<Word,std::vector<float> > &inputNode = *iter;
//cerr << "word=" << inputNode.first << " scores=" << inputNode.second.size() << endl;
sourcePath.first.AddWord(inputNode.first);
sourcePath.second.PlusEquals(inputFeature, inputNode.second);
} // for (iter = col.begin(); iter != col.end(); ++iter) {
} // for (size_t startPos = 0; startPos < size; ++startPos) {
// create subphrases by appending words to previous subphrases
for (size_t startPos = 0; startPos < size; ++startPos) {
size_t maxSize = size - startPos;
size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
maxSize = std::min(maxSize, maxSizePhrase);
size_t end = startPos + maxSize - 1;
for (size_t endPos = startPos + 1; endPos < end; ++endPos) {
std::vector<SourcePath> &newSubphrases = GetPhrases(startPos, endPos);
const std::vector<SourcePath> &prevSubphrases = GetPhrases(startPos, endPos - 1);
const ConfusionNet::Column &col = input.GetColumn(endPos);
CreateSubPhrases(newSubphrases, prevSubphrases, col, *inputFeature);
}
}
/*
for (size_t startPos = 0; startPos < size; ++startPos) {
for (size_t endPos = startPos; endPos < size; ++endPos) {
cerr << "RANGE=" << startPos << "-" << endPos << endl;
const std::vector<SourcePath> &subphrases = GetPhrases(startPos, endPos);
std::vector<SourcePath>::const_iterator iterSourcePath;
for (iterSourcePath = subphrases.begin(); iterSourcePath != subphrases.end(); ++iterSourcePath) {
const SourcePath &sourcePath = *iterSourcePath;
cerr << sourcePath.first << " " <<sourcePath.second << endl;
}
}
}
*/
}
void TranslationOptionCollectionConfusionNet::CreateSubPhrases(std::vector<SourcePath> &newSubphrases
, const std::vector<SourcePath> &prevSubphrases
, const ConfusionNet::Column &col
, const InputFeature &inputFeature)
{
std::vector<SourcePath>::const_iterator iterSourcePath;
for (iterSourcePath = prevSubphrases.begin(); iterSourcePath != prevSubphrases.end(); ++iterSourcePath) {
const SourcePath &sourcePath = *iterSourcePath;
const Phrase &prevSubPhrase = sourcePath.first;
const ScoreComponentCollection &prevScore = sourcePath.second;
ConfusionNet::Column::const_iterator iterCol;
for (iterCol = col.begin(); iterCol != col.end(); ++iterCol) {
const std::pair<Word,std::vector<float> > &node = *iterCol;
Phrase subphrase(prevSubPhrase);
subphrase.AddWord(node.first);
ScoreComponentCollection score(prevScore);
score.PlusEquals(&inputFeature, node.second);
SourcePath newSourcePath(subphrase, score);
newSubphrases.push_back(newSourcePath);
}
}
}
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold) {}
/* forcibly create translation option for a particular source word.
* call the base class' ProcessOneUnknownWord() for each possible word in the confusion network
@ -133,122 +30,6 @@ void TranslationOptionCollectionConfusionNet::ProcessUnknownWord(size_t sourcePo
}
const std::vector<TranslationOptionCollectionConfusionNet::SourcePath> &TranslationOptionCollectionConfusionNet::GetPhrases(size_t startPos, size_t endPos) const
{
size_t offset = endPos - startPos;
CHECK(offset < m_collection[startPos].size());
return m_collection[startPos][offset];
}
std::vector<TranslationOptionCollectionConfusionNet::SourcePath> &TranslationOptionCollectionConfusionNet::GetPhrases(size_t startPos, size_t endPos)
{
size_t offset = endPos - startPos;
CHECK(offset < m_collection[startPos].size());
return m_collection[startPos][offset];
}
void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRange(
const DecodeGraph &decodeGraph
, size_t startPos
, size_t endPos
, bool adhereTableLimit
, size_t graphInd)
{
if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
Phrase *sourcePhrase = NULL; // can't initialise with substring, in case it's confusion network
// consult persistent (cross-sentence) cache for stored translation options
bool skipTransOptCreation = false
, useCache = StaticData::Instance().GetUseTransOptCache();
if (useCache) {
const WordsRange wordsRange(startPos, endPos);
sourcePhrase = new Phrase(m_source.GetSubString(wordsRange));
const TranslationOptionList *transOptList = StaticData::Instance().FindTransOptListInCache(decodeGraph, *sourcePhrase);
// is phrase in cache?
if (transOptList != NULL) {
skipTransOptCreation = true;
TranslationOptionList::const_iterator iterTransOpt;
for (iterTransOpt = transOptList->begin() ; iterTransOpt != transOptList->end() ; ++iterTransOpt) {
TranslationOption *transOpt = new TranslationOption(**iterTransOpt, wordsRange);
Add(transOpt);
}
}
} // useCache
if (!skipTransOptCreation) {
// partial trans opt stored in here
PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
size_t totalEarlyPruned = 0;
// initial translation step
list <const DecodeStep* >::const_iterator iterStep = decodeGraph.begin();
const DecodeStep &decodeStep = **iterStep;
static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslation
(m_source, *oldPtoc
, startPos, endPos, adhereTableLimit );
// do rest of decode steps
int indexStep = 1;
for (++iterStep; iterStep != decodeGraph.end() ; ++iterStep, ++indexStep) {
const DecodeStep &decodeStep = **iterStep;
PartialTranslOptColl* newPtoc = new PartialTranslOptColl;
// go thru each intermediate trans opt just created
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
decodeStep.Process(inputPartialTranslOpt
, decodeStep
, *newPtoc
, this
, adhereTableLimit
, *sourcePhrase);
}
// last but 1 partial trans not required anymore
totalEarlyPruned += newPtoc->GetPrunedCount();
delete oldPtoc;
oldPtoc = newPtoc;
} // for (++iterStep
// add to fully formed translation option list
PartialTranslOptColl &lastPartialTranslOptColl = *oldPtoc;
const vector<TranslationOption*>& partTransOptList = lastPartialTranslOptColl.GetList();
vector<TranslationOption*>::const_iterator iterColl;
for (iterColl = partTransOptList.begin() ; iterColl != partTransOptList.end() ; ++iterColl) {
TranslationOption *transOpt = *iterColl;
Add(transOpt);
}
// storing translation options in persistent cache (kept across sentences)
if (useCache) {
if (partTransOptList.size() > 0) {
TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
StaticData::Instance().AddTransOptListToCache(decodeGraph, *sourcePhrase, transOptList);
}
}
lastPartialTranslOptColl.DetachAll();
totalEarlyPruned += oldPtoc->GetPrunedCount();
delete oldPtoc;
// TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
} // if (!skipTransOptCreation)
if (useCache)
delete sourcePhrase;
} // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))
if (graphInd == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough && HasXmlOptionsOverlappingRange(startPos,endPos)) {
CreateXmlOptionsForRange(startPos, endPos);
}
}
} // namespace

View File

@ -3,11 +3,11 @@
#define moses_TranslationOptionCollectionConfusionNet_h
#include "TranslationOptionCollection.h"
#include "ConfusionNet.h"
namespace Moses
{
class InputFeature;
class ConfusionNet;
/** Holds all translation options, for all spans, of a particular confusion network input
* Inherited from TranslationOptionCollection.
@ -15,28 +15,12 @@ class InputFeature;
class TranslationOptionCollectionConfusionNet : public TranslationOptionCollection
{
public:
typedef std::pair<Phrase, ScoreComponentCollection> SourcePath;
TranslationOptionCollectionConfusionNet(const ConfusionNet &source, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
void ProcessUnknownWord(size_t sourcePos);
const std::vector<SourcePath> &GetPhrases(size_t startPos, size_t endPos) const;
std::vector<SourcePath> &GetPhrases(size_t startPos, size_t endPos);
protected:
std::vector<std::vector<std::vector<SourcePath> > > m_collection;
void CreateSubPhrases(std::vector<SourcePath> &newSubphrases
, const std::vector<SourcePath> &prevSubphrases
, const ConfusionNet::Column &col
, const InputFeature &inputFeature);
void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
, size_t graphInd);
};
}
#endif

View File

@ -24,44 +24,14 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "DecodeStep.h"
#include "FactorCollection.h"
#include "WordsRange.h"
#include "DecodeStepTranslation.h"
#include "DecodeStepGeneration.h"
using namespace std;
namespace Moses
{
/** constructor; just initialize the base class */
TranslationOptionCollectionText::TranslationOptionCollectionText(Sentence const &input
, size_t maxNoTransOptPerCoverage
, float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
size_t size = input.GetSize();
m_collection.resize(size);
for (size_t startPos = 0; startPos < size; ++startPos) {
std::vector<InputLatticeNode> &vec = m_collection[startPos];
for (size_t endPos = startPos; endPos < size; ++endPos) {
Phrase subphrase(input.GetSubString(WordsRange(startPos, endPos)));
WordsRange range(startPos, endPos);
InputLatticeNode node(subphrase, range);
if (range.GetNumWordsCovered() > 1) {
InputLatticeNode prevNode = GetPhrase(startPos, endPos - 1);
node.AddNext(prevNode);
}
vec.push_back(node);
}
}
/*
for (size_t startPos = 0; startPos < size; ++startPos) {
for (size_t endPos = startPos; endPos < size; ++endPos) {
cerr << startPos << "-" << endPos << "=" << GetPhrase(startPos, endPos) << endl;
}
}
*/
}
TranslationOptionCollectionText::TranslationOptionCollectionText(Sentence const &inputSentence, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: TranslationOptionCollection(inputSentence, maxNoTransOptPerCoverage, translationOptionThreshold) {}
/* forcibly create translation option for a particular source word.
* For text, this function is easy, just call the base class' ProcessOneUnknownWord()
@ -96,118 +66,10 @@ void TranslationOptionCollectionText::CreateXmlOptionsForRange(size_t startPosit
for(size_t i=0; i<xmlOptions.size(); i++) {
Add(xmlOptions[i]);
}
};
}
const InputLatticeNode &TranslationOptionCollectionText::GetPhrase(size_t startPos, size_t endPos) const
{
size_t offset = endPos - startPos;
CHECK(offset < m_collection[startPos].size());
return m_collection[startPos][offset];
}
void TranslationOptionCollectionText::CreateTranslationOptionsForRange(
const DecodeGraph &decodeGraph
, size_t startPos
, size_t endPos
, bool adhereTableLimit
, size_t graphInd)
{
if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
Phrase *sourcePhrase = NULL; // can't initialise with substring, in case it's confusion network
// consult persistent (cross-sentence) cache for stored translation options
bool skipTransOptCreation = false
, useCache = StaticData::Instance().GetUseTransOptCache();
if (useCache) {
const WordsRange wordsRange(startPos, endPos);
sourcePhrase = new Phrase(m_source.GetSubString(wordsRange));
const TranslationOptionList *transOptList = StaticData::Instance().FindTransOptListInCache(decodeGraph, *sourcePhrase);
// is phrase in cache?
if (transOptList != NULL) {
skipTransOptCreation = true;
TranslationOptionList::const_iterator iterTransOpt;
for (iterTransOpt = transOptList->begin() ; iterTransOpt != transOptList->end() ; ++iterTransOpt) {
TranslationOption *transOpt = new TranslationOption(**iterTransOpt, wordsRange);
Add(transOpt);
}
}
} // useCache
if (!skipTransOptCreation) {
// partial trans opt stored in here
PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
size_t totalEarlyPruned = 0;
// initial translation step
list <const DecodeStep* >::const_iterator iterStep = decodeGraph.begin();
const DecodeStep &decodeStep = **iterStep;
static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslation
(m_source, *oldPtoc
, startPos, endPos, adhereTableLimit );
// do rest of decode steps
int indexStep = 1;
for (++iterStep; iterStep != decodeGraph.end() ; ++iterStep, ++indexStep) {
const DecodeStep &decodeStep = **iterStep;
PartialTranslOptColl* newPtoc = new PartialTranslOptColl;
// go thru each intermediate trans opt just created
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
decodeStep.Process(inputPartialTranslOpt
, decodeStep
, *newPtoc
, this
, adhereTableLimit
, *sourcePhrase);
}
// last but 1 partial trans not required anymore
totalEarlyPruned += newPtoc->GetPrunedCount();
delete oldPtoc;
oldPtoc = newPtoc;
} // for (++iterStep
// add to fully formed translation option list
PartialTranslOptColl &lastPartialTranslOptColl = *oldPtoc;
const vector<TranslationOption*>& partTransOptList = lastPartialTranslOptColl.GetList();
vector<TranslationOption*>::const_iterator iterColl;
for (iterColl = partTransOptList.begin() ; iterColl != partTransOptList.end() ; ++iterColl) {
TranslationOption *transOpt = *iterColl;
Add(transOpt);
}
// storing translation options in persistent cache (kept across sentences)
if (useCache) {
if (partTransOptList.size() > 0) {
TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
StaticData::Instance().AddTransOptListToCache(decodeGraph, *sourcePhrase, transOptList);
}
}
lastPartialTranslOptColl.DetachAll();
totalEarlyPruned += oldPtoc->GetPrunedCount();
delete oldPtoc;
// TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
} // if (!skipTransOptCreation)
if (useCache)
delete sourcePhrase;
} // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))
if (graphInd == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough && HasXmlOptionsOverlappingRange(startPos,endPos)) {
CreateXmlOptionsForRange(startPos, endPos);
}
}
} // namespace

View File

@ -22,9 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef moses_TranslationOptionCollectionText_h
#define moses_TranslationOptionCollectionText_h
#include <vector>
#include "TranslationOptionCollection.h"
#include "Phrase.h"
namespace Moses
{
@ -37,23 +35,18 @@ class Sentence;
class TranslationOptionCollectionText : public TranslationOptionCollection
{
public:
TranslationOptionCollectionText(Sentence const& input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
void ProcessUnknownWord(size_t sourcePos);
TranslationOptionCollectionText(Sentence const& inputSentence, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);
bool HasXmlOptionsOverlappingRange(size_t startPosition, size_t endPosition) const;
void CreateXmlOptionsForRange(size_t startPosition, size_t endPosition);
const InputLatticeNode &GetPhrase(size_t startPos, size_t endPos) const;
protected:
std::vector<std::vector<InputLatticeNode> > m_collection;
void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
, size_t graphInd);
};
}
#endif

View File

@ -108,27 +108,28 @@ enum DistortionOrientationOptions {
enum PhraseTableImplementation {
Memory = 0
,Binary = 1
,OnDisk = 2
//,GlueRule = 3
//,Joshua = 4
//,MemorySourceLabel = 5
,SCFG = 6
//,BerkeleyDb = 7
,SuffixArray = 8
,Hiero = 9
,ALSuffixArray = 10
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
,Binary = 1
,OnDisk = 2
//,GlueRule = 3
//,Joshua = 4
//,MemorySourceLabel = 5
,SCFG = 6
//,BerkeleyDb = 7
,SuffixArray = 8
,Hiero = 9
,ALSuffixArray = 10
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
,DSuffixArray = 14
};
enum InputTypeEnum {
SentenceInput = 0
,ConfusionNetworkInput = 1
,WordLatticeInput = 2
,TreeInputType = 3
,WordLatticeInput2 = 4
,ConfusionNetworkInput = 1
,WordLatticeInput = 2
,TreeInputType = 3
,WordLatticeInput2 = 4
};
@ -141,7 +142,7 @@ enum XmlInputType {
enum DictionaryFind {
Best = 0
,All = 1
,All = 1
};
enum ParsingAlgorithm {
@ -151,22 +152,22 @@ enum ParsingAlgorithm {
enum SearchAlgorithm {
Normal = 0
,CubePruning = 1
,CubeGrowing = 2
,ChartDecoding= 3
,NormalBatch = 4
,ChartIncremental = 5
,CubePruning = 1
,CubeGrowing = 2
,ChartDecoding= 3
,NormalBatch = 4
,ChartIncremental = 5
};
enum SourceLabelOverlap {
SourceLabelOverlapAdd = 0
,SourceLabelOverlapReplace = 1
,SourceLabelOverlapDiscard = 2
,SourceLabelOverlapReplace = 1
,SourceLabelOverlapDiscard = 2
};
enum WordAlignmentSort {
NoSort = 0
,TargetOrder = 1
,TargetOrder = 1
};
enum FormatType {

View File

@ -52,11 +52,14 @@ protected:
FactorArray m_factorArray; /**< set of factors */
bool m_isNonTerminal;
bool m_isOOV;
public:
/** deep copy */
Word(const Word &copy)
:m_isNonTerminal(copy.m_isNonTerminal) {
:m_isNonTerminal(copy.m_isNonTerminal)
,m_isOOV(copy.m_isOOV)
{
std::memcpy(m_factorArray, copy.m_factorArray, sizeof(FactorArray));
}
@ -64,6 +67,7 @@ public:
explicit Word(bool isNonTerminal = false) {
std::memset(m_factorArray, 0, sizeof(FactorArray));
m_isNonTerminal = isNonTerminal;
m_isOOV = false;
}
~Word() {}
@ -92,6 +96,13 @@ public:
m_isNonTerminal = val;
}
inline bool IsOOV() const {
return m_isOOV;
}
inline void SetIsOOV(bool val) {
m_isOOV = val;
}
/** add the factors from sourceWord into this representation,
* NULL elements in sourceWord will be skipped */
void Merge(const Word &sourceWord);

View File

@ -0,0 +1,51 @@
#ifndef __sampling_h
#define __sampling_h
// Utility functions for proper sub-sampling.
// (c) 2007-2012 Ulrich Germann
namespace Moses
{

/// Return a pseudo-random integer drawn uniformly from [0,N).
/// Dividing rand() by RAND_MAX+1. keeps the scaled value strictly below N.
/// Not thread-safe: uses the global rand() state.
inline
size_t
randInt(size_t N)
{
  return N*(rand()/(RAND_MAX+1.));
}

// Select a random sample of size /s/ without replacement from the range of
// integers [0,N). On return v holds the sample; v.size() == min(s,N).
// Not thread-safe: relies on the global rand() state.
template<typename idx_t>
void
randomSample(std::vector<idx_t>& v, size_t s, size_t N)
{
  // see also Knuth: Art of Computer Programming Vol. 2, p. 142
  s = std::min(s,N);
  v.resize(s);
  // The first branch tries to be more efficient than O(N) when picking a
  // small sample from a large range: draw random values and re-draw on
  // collision. The threshold (s*10 < N) is an ad-hoc, off-the-cuff guess
  // for the break-even point between the linear sweep below and repeated
  // random draws that risk hitting the same number many times.
  if (s*10<N) {
    // std::vector<bool> is a compact bitmap; this replaces the former
    // boost::dynamic_bitset dependency with the standard library.
    std::vector<bool> check(N,false);
    for (size_t i = 0; i < v.size(); i++) {
      size_t x = randInt(N);
      while (check[x]) x = randInt(N);
      check[x] = true;
      v[i] = x;
    }
  } else {
    // Knuth's selection sampling (Algorithm S): one linear sweep; each t
    // is taken with probability (s-m)/(N-t).
    size_t m=0;
    // was 'm <= s': once m == s the sample is complete, so stop instead
    // of scanning (and drawing random numbers for) the rest of the range.
    for (size_t t = 0; m < s && t < N; t++)
      if (s==N || randInt(N-t) < s-m) v[m++] = t;
  }
}

} // namespace Moses
#endif

View File

@ -0,0 +1,85 @@
#ifndef __n_best_list_h
#define __n_best_list_h
#include <algorithm>
#include <cstdint>
#include <vector>
#include "moses/generic/sorting/VectorIndexSorter.h"
// NBest List; (c) 2007-2012 Ulrich Germann
//
// The 'trick' used in this implementation is to maintain a heap of size <= N
// such that the lowest-scoring item is on top of the heap. For each incoming
// item we can then determine easily if it is in the top N.
namespace Moses
{
// NOTE(review): kept for source compatibility with existing includers;
// avoid relying on this in new code.
using namespace std;

/// Bounded n-best list: keeps at most max_size items, evicting the worst
/// (according to CMP, where CMP(a,b) means "a is better than b") when a
/// better item arrives.
template<typename THINGY, typename CMP>
class
NBestList
{
  vector<uint32_t> m_heap;  // indices into m_list; worst item on top
  vector<THINGY> m_list;    // item storage (slots are reused on eviction)
  VectorIndexSorter<THINGY, CMP, uint32_t> m_better; // index comparator
  mutable vector<uint32_t> m_order;  // cached best-to-worst index order
  mutable bool m_changed;            // true if m_order is stale
  size_t m_max_size;                 // capacity bound N
public:
  NBestList(size_t const max_size, CMP const& cmp);
  NBestList(size_t const max_size);
  /// Insert /item/; returns true iff it made it into the top N.
  bool add(THINGY const& item);
  /// i-th best item (0 = best); negative i counts back from the worst end.
  THINGY const& operator[](int i) const;
  size_t size() const {
    return m_heap.size();
  }
};

template<typename THINGY, typename CMP>
NBestList<THINGY,CMP>::
NBestList(size_t const max_size, CMP const& cmp)
  : m_better(m_list, cmp), m_changed(false), m_max_size(max_size)
{
  m_heap.reserve(max_size);
}

template<typename THINGY, typename CMP>
NBestList<THINGY,CMP>::
NBestList(size_t const max_size)
  // was m_better(m_heap): the sorter must rank the items in m_list,
  // not the index heap itself (type mismatch, hidden until instantiation)
  : m_better(m_list), m_changed(false), m_max_size(max_size)
{
  m_heap.reserve(max_size);
}

template<typename THINGY, typename CMP>
bool
NBestList<THINGY,CMP>::
add(THINGY const& item)
{
  // Compare against the stored bound rather than m_heap.capacity():
  // reserve() only guarantees capacity() >= max_size, not equality.
  if (m_heap.size() >= m_max_size) {
    // Full: the new item must beat the current worst (heap top) to get in.
    if (m_better.Compare(item, m_list[m_heap.at(0)])) {
      pop_heap(m_heap.begin(),m_heap.end(),m_better);
      m_list[m_heap.back()] = item; // overwrite the evicted item's slot
    } else return false;
  } else {
    m_list.push_back(item);
    m_heap.push_back(m_heap.size());
  }
  push_heap(m_heap.begin(),m_heap.end(),m_better);
  return m_changed = true;
}

template<typename THINGY, typename CMP>
THINGY const&
NBestList<THINGY,CMP>::
operator[](int i) const
{
  if (m_changed) {
    // Heap-sort a copy of the index heap into best-first order. The
    // comparator must be m_better here: without it, pop_heap would order
    // by the raw index values instead of by item quality.
    m_order.assign(m_heap.begin(),m_heap.end());
    for (size_t k = m_heap.size(); k != 0; --k)
      pop_heap(m_order.begin(), m_order.begin()+k, m_better);
    m_changed = false;
  }
  if (i < 0) i += m_order.size();
  return m_list[m_order.at(i)];
}
}
#endif

View File

@ -0,0 +1,69 @@
#ifndef __vector_index_sorter_h
#define __vector_index_sorter_h
// VectorIndexSorter; (c) 2007-2012 Ulrich Germann
// A VectorIndexSorter is a function object for sorting indices into a vector
// of objects (instead of sorting the vector itself).
//
// typcial use:
// vector<thingy> my_vector;
// VectorIndexSorter<thingy,less<thingy>,int> sorter(my_vector);
// vector<int> order;
// sorter.get_order(order);
namespace Moses
{
template<typename VAL, typename COMP = greater<VAL>, typename IDX_T=size_t>
class
VectorIndexSorter : public binary_function<IDX_T const&, IDX_T const&, bool>
{
vector<VAL> const& m_vecref;
boost::shared_ptr<COMP> m_comp;
public:
COMP const& Compare;
VectorIndexSorter(vector<VAL> const& v, COMP const& comp)
: m_vecref(v), Compare(comp)
{ }
VectorIndexSorter(vector<VAL> const& v)
: m_vecref(v), m_comp(new COMP()), Compare(*m_comp)
{ }
bool operator()(IDX_T const & a, IDX_T const & b) const {
bool fwd = Compare(m_vecref.at(a) ,m_vecref.at(b));
bool bwd = Compare(m_vecref[b], m_vecref[a]);
return (fwd == bwd ? a < b : fwd);
}
boost::shared_ptr<vector<IDX_T> >
GetOrder() const;
void
GetOrder(vector<IDX_T> & order) const;
};
template<typename VAL, typename COMP, typename IDX_T>
boost::shared_ptr<vector<IDX_T> >
VectorIndexSorter<VAL,COMP,IDX_T>::
GetOrder() const
{
boost::shared_ptr<vector<IDX_T> > ret(new vector<IDX_T>(m_vecref.size()));
get_order(*ret);
return ret;
}
template<typename VAL, typename COMP, typename IDX_T>
void
VectorIndexSorter<VAL,COMP,IDX_T>::
GetOrder(vector<IDX_T> & order) const
{
order.resize(m_vecref.size());
for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i;
sort(order.begin(), order.end(), *this);
}
}
#endif

View File

@ -137,7 +137,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
const SplitPoints &point = *p;
if (point.size() > 3) {
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], point[point.size()-1]-1);
= tree.GetNodes( point[0], point[point.size()-1]-1);
string topLabel = topNodes[0]->GetLabel();
for(size_t i=2; i<point.size()-1; i++) {
@ -155,7 +155,7 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
if (point.size() > 3) {
int endPoint = point[point.size()-1]-1;
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], endPoint);
= tree.GetNodes( point[0], endPoint);
string topLabel = topNodes[0]->GetLabel();
for(size_t i=1; i<point.size()-2; i++) {

View File

@ -2552,6 +2552,8 @@ sub define_tuningevaluation_filter {
# get model, and whether suffix array is used. Determines the pt implementation.
my $sa_exec_dir = &get("TRAINING:suffix-array");
my $sa_extractors = &get("GENERAL:sa_extractors");
$sa_extractors = 1 unless $sa_extractors;
my ($ptImpl, $numFF);
if ($hierarchical) {
@ -2564,7 +2566,7 @@ sub define_tuningevaluation_filter {
}
}
else {
$ptImpl = 0; # phrase-based
$ptImpl = 0; # phrase-based
}
# config file specified?
@ -2589,11 +2591,14 @@ sub define_tuningevaluation_filter {
# filter command
if ($sa_exec_dir) {
# suffix array
$cmd .= "$scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $sa_exec_dir $phrase_translation_table $input_filter $filter_dir \n";
$cmd .= "$scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $sa_exec_dir $phrase_translation_table $input_filter $filter_dir $sa_extractors \n";
my $escaped_filter_dir = $filter_dir;
$escaped_filter_dir =~ s/\//\\\\\//g;
$cmd .= "cat $config | sed s/10\\ 0\\ 0\\ 7.*/10\\ 0\\ 0\\ 7\\ $escaped_filter_dir/g > $filter_dir/moses.ini \n";
# kind of a hack -- the correct thing would be to make the generation of the config file ($filter_dir/moses.ini)
# set the PhraseDictionaryALSuffixArray's path to the filtered directory rather than to the suffix array itself
$cmd .= "sed -i 's%path=$phrase_translation_table%path=$filter_dir%' $filter_dir/moses.ini\n";
}
else {
# normal phrase table

51
scripts/generic/ph_numbers.perl Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/perl -w
# Script to recognize and replace numbers in Moses training corpora
# and decoder input
#
# (c) 2013 TAUS
#
# Modes:
#   -c          corpus mode: replace each number with the placeholder symbol
#   -l          decoder input, simple form: <ne translation="NUMBER">SYM</ne>
#   (default)   decoder input, full form: <ne translation="SYM" entity="NUMBER">SYM</ne>
#   -m symbol   placeholder symbol (default '@NUM@')
use strict;
use Getopt::Std;
# Set DEBUG=1 in the environment to trace matching on STDERR.
my $debug = $ENV{DEBUG} || 0;
my %opts;
if(!getopts('s:t:cm:hl',\%opts) || $opts{h}) {
print "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";
exit;
}
# NOTE(review): the locales are parsed but not used anywhere below.
my $sourceLocale = $opts{s} || "";
my $targetLocale = $opts{t} || "";
my $numberSymbol = $opts{m} || '@NUM@';
while(<>) {
# [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?
# while(/\G(.*?)\s*([+-]?\p{Digit}+[+-\p{Digit}\.,eE])/) {
chomp;
my $output = "";
my $remainder = "";
# Scan for number-like tokens: optional sign, digits with an optional
# decimal/thousands separator, then any run of digit-ish characters.
# The '-' is placed LAST in the final character class so it is literal;
# previously '+-e' formed an accidental character range from '+' to 'e'
# that also swallowed letters and punctuation following a number.
while(/\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+eE-]*)/g) {
print STDERR "Between: x$1x\n" if $debug;
print STDERR "Number: x$3x\n" if $debug;
$output .= $1;
if($opts{c}) {
# corpus mode: plain placeholder
$output .= $2.$numberSymbol;
}
else {
if($opts{l}) {
# simple named-entity form: keep the number as the translation
$output .= $2."<ne translation=\"$3\">$numberSymbol</ne>";
}
else {
# full named-entity form: keep the number as the entity attribute
$output .= $2."<ne translation=\"$numberSymbol\" entity=\"$3\">$numberSymbol</ne>";
}
}
# text after the last match ($' = postmatch)
$remainder = $';
}
print STDERR "Remainder: x".$remainder."x\n" if $debug;
print STDERR "\n" if $debug;
$output .= $remainder if $remainder;
$output .= "\n";
print $output;
}

View File

@ -1,88 +0,0 @@
#!/usr/bin/perl -w
# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section
# lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irst-dir"
# settings = ""
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section.
# It should point to the root of the LM toolkit, eg
# irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
# And make sure that $cores is defined, eg $cores = 8
# And make sure the $settings variable is empty. This script doesn't understand some of the sri args like -unk and will complain.
use strict;
use FindBin qw($RealBin);
use Getopt::Long;
# Option defaults.
my $order = 3;          # n-gram order of the LM to build
my $corpusPath;         # training text (-text); may be gzipped
my $lmPath;             # output LM path (-lm); gzipped iff it ends in .gz
my $cores = 2;          # parallel jobs passed to build-lm.sh -k
my $irstPath;           # IRSTLM bin directory (-irst-dir)
my $tempPath = "tmp";   # scratch directory root (-temp-dir)
my $p = 1;
my $s;
my $temp;
GetOptions("order=s" => \$order,
"text=s" => \$corpusPath,
"lm=s" => \$lmPath,
"cores=s" => \$cores,
"irst-dir=s" => \$irstPath,
"temp-dir=s" => \$tempPath,
"p=i" => \$p, # irstlm parameter: delete singletons
"s=s" => \$s, # irstlm parameter: smoothing method
"interpolate!" => \$temp, #ignore
"kndiscount!" => \$temp #ignore
) or exit 1;
#die("ERROR: please set order") unless defined($order);
die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);
# The file extension decides whether the corpus must be zcat'ed first.
my $ext = ($corpusPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";
# Per-process scratch dir ($$ = PID) so concurrent runs don't collide.
$tempPath .= "/irstlm-build-tmp.$$";
`mkdir -p $tempPath`;
# NOTE(review): paths are interpolated into shell commands unquoted;
# spaces or shell metacharacters in any path will break these commands.
my $cmd;
# Step 1: wrap each sentence in start/end tags; store gzipped in scratch.
if ($ext eq "gz")
{
$cmd = "zcat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
else
{
$cmd = "cat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
print STDERR "EXECUTING $cmd\n";
`$cmd`;
# Step 2: build the LM in iARPA format with IRSTLM's build-lm.sh,
# forwarding the singleton-pruning (-p) and smoothing (-s) options.
$cmd = "IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -o $tempPath/iarpa.gz -k $cores";
$cmd .= " -p" if $p;
$cmd .= " -s $s" if defined($s);
print STDERR "EXECUTING $cmd\n";
`$cmd`;
# Step 3: compile the iARPA file to the requested output,
# gzipping if the target path ends in .gz.
$ext = ($lmPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";
if ($ext eq "gz")
{
$cmd = "$irstPath/compile-lm --text $tempPath/iarpa.gz /dev/stdout | gzip -c > $lmPath";
}
else
{
$cmd = "$irstPath/compile-lm --text $tempPath/iarpa.gz $lmPath";
}
print STDERR "EXECUTING $cmd\n";
`$cmd`;
# Step 4: remove the scratch directory.
$cmd = "rm -rf $tempPath";
print STDERR "EXECUTING $cmd\n";
`$cmd`;
print STDERR "FINISH.\n";

View File

@ -0,0 +1,40 @@
#!/usr/bin/perl -w
# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section
# lm-training = "$moses-script-dir/generic/trainlm-lmplz.perl -lmplz $lmplz"
# settings = "-T $working-dir/tmp -S 10G"
# Also, make sure that $lmplz is defined (in the [LM] or [GENERAL] section.
# It should point to the binary file
# lmplz = /home/waziz/workspace/github/moses/bin/lmplz
use strict;
use FindBin qw($RealBin);
use Getopt::Long qw/GetOptionsFromArray/;
# Pass unrecognised options (e.g. -T, -S) through to lmplz itself.
Getopt::Long::Configure("pass_through", "no_ignore_case");

my $order = 3; # order of language model (default trigram)
my $corpus;    # input text data
my $lm;        # generated language model
my $lmplz;     # path to the lmplz binary (was mislabelled "bin directory of IRSTLM")

my @optconfig = (
"-order=s" => \$order,
"-text=s" => \$corpus,
"-lm=s" => \$lm,
"-lmplz=s" => \$lmplz,
);

GetOptionsFromArray(\@ARGV, @optconfig);
die("ERROR: please set text") unless defined($corpus);
die("ERROR: please set lm") unless defined($lm);
die("ERROR: please set lmplz") unless defined($lmplz);

# Everything left on the command line is forwarded verbatim to lmplz.
my $settings = join(' ', @ARGV);
my $cmd = "$lmplz --order $order $settings < $corpus > $lm";
print STDERR "EXECUTING $cmd\n";
`$cmd`;
# Fail loudly if lmplz did not exit cleanly; otherwise a truncated or
# empty LM file could silently make it into the rest of the pipeline.
die("ERROR: lmplz failed with exit code " . ($? >> 8) . "\n") if $? != 0;

Some files were not shown because too many files have changed in this diff Show More