Merge branch 'master' of github.com:moses-smt/mosesdecoder
commit e20fbc0754
Jamroot (4 changes)

@@ -108,6 +108,10 @@ project : default-build
  <link>static
  ;

+#Apparently OS X likes to link against iconv for fgetsUTF8.
+lib iconv ;
+requirements += <os>MACOSX:<library>iconv ;
+
project : requirements
  <threading>multi:<define>WITH_THREADS
  <threading>multi:<library>boost_thread

@@ -1121,6 +1121,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/InputFeature.h</locationURI>
</link>
+<link>
+<name>FF/OSM-Feature</name>
+<type>2</type>
+<locationURI>virtual:/virtual</locationURI>
+</link>
<link>
<name>FF/PhraseBasedFeatureContext.cpp</name>
<type>1</type>
@@ -1166,6 +1171,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhrasePairFeature.h</locationURI>
</link>
+<link>
+<name>FF/PhrasePenalty.cpp</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhrasePenalty.cpp</locationURI>
+</link>
+<link>
+<name>FF/PhrasePenalty.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhrasePenalty.h</locationURI>
+</link>
<link>
<name>FF/SourceWordDeletionFeature.cpp</name>
<type>1</type>
@@ -1556,6 +1571,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
+<link>
+<name>TranslationModel/WordCoocTable.cpp</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.cpp</locationURI>
+</link>
+<link>
+<name>TranslationModel/WordCoocTable.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.h</locationURI>
+</link>
<link>
<name>TranslationModel/fuzzy-match</name>
<type>2</type>
@@ -1581,6 +1606,26 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/bin/lm.log</locationURI>
</link>
+<link>
+<name>FF/OSM-Feature/OpSequenceModel.cpp</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/OpSequenceModel.cpp</locationURI>
+</link>
+<link>
+<name>FF/OSM-Feature/OpSequenceModel.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/OpSequenceModel.h</locationURI>
+</link>
+<link>
+<name>FF/OSM-Feature/osmHyp.cpp</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/osmHyp.cpp</locationURI>
+</link>
+<link>
+<name>FF/OSM-Feature/osmHyp.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/OSM-Feature/osmHyp.h</locationURI>
+</link>
<link>
<name>TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp</name>
<type>1</type>

@@ -33,6 +33,8 @@ int main(int argc, char *argv[]) {
  po::options_description options("Language model building options");
  lm::builder::PipelineConfig pipeline;

+  std::string text, arpa;
+
  options.add_options()
    ("order,o", po::value<std::size_t>(&pipeline.order)
#if BOOST_VERSION >= 104200
@@ -47,18 +49,21 @@ int main(int argc, char *argv[]) {
    ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
    ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
    ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
-    ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
+    ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
+    ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
+    ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout");
  if (argc == 1) {
    std::cerr <<
      "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
      "Please cite:\n"
      "@inproceedings{kenlm,\n"
      "author = {Kenneth Heafield},\n"
      "title = {{KenLM}: Faster and Smaller Language Model Queries},\n"
      "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
      "month = {July}, year={2011},\n"
      "address = {Edinburgh, UK},\n"
      "publisher = {Association for Computational Linguistics},\n"
      "@inproceedings{Heafield-estimate,\n"
      " author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
      " title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
      " year = {2013},\n"
      " month = {8},\n"
      " booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
      " address = {Sofia, Bulgaria},\n"
      " url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
      "}\n\n"
      "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
      "the model (-o) is the only mandatory option. As this is an on-disk program,\n"
@@ -91,9 +96,17 @@ int main(int argc, char *argv[]) {
    initial.adder_out.block_count = 2;
    pipeline.read_backoffs = initial.adder_out;

+    util::scoped_fd in(0), out(1);
+    if (vm.count("text")) {
+      in.reset(util::OpenReadOrThrow(text.c_str()));
+    }
+    if (vm.count("arpa")) {
+      out.reset(util::CreateOrThrow(arpa.c_str()));
+    }
+
    // Read from stdin
    try {
-      lm::builder::Pipeline(pipeline, 0, 1);
+      lm::builder::Pipeline(pipeline, in.release(), out.release());
    } catch (const util::MallocException &e) {
      std::cerr << e.what() << std::endl;
      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
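
The hunk above replaces the hard-coded stdin/stdout descriptors with optionally opened files. A minimal sketch of that ownership pattern, assuming only util::scoped_fd, OpenReadOrThrow, and CreateOrThrow from util/file.hh as used in the diff; the Consume function is a hypothetical stand-in for lm::builder::Pipeline:

#include <unistd.h>
#include "util/file.hh"

// Hypothetical consumer standing in for lm::builder::Pipeline: it takes
// ownership of the raw descriptors handed to it and must close them.
void Consume(int in_fd, int out_fd) {
  // ... read from in_fd, write to out_fd ...
  if (in_fd > 2) close(in_fd);
  if (out_fd > 2) close(out_fd);
}

void Run(const char *text, const char *arpa) {
  // Descriptors default to stdin (0) and stdout (1); scoped_fd closes
  // whatever it still owns when it goes out of scope.
  util::scoped_fd in(0), out(1);
  if (text) in.reset(util::OpenReadOrThrow(text));
  if (arpa) out.reset(util::CreateOrThrow(arpa));
  // release() transfers ownership, so Consume closes the fds, not scoped_fd.
  Consume(in.release(), out.release());
}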

@@ -53,7 +53,7 @@ class NGram {
  Payload &Value() { return *reinterpret_cast<Payload *>(end_); }

  uint64_t &Count() { return Value().count; }
-  const uint64_t Count() const { return Value().count; }
+  uint64_t Count() const { return Value().count; }

  std::size_t Order() const { return end_ - begin_; }

lm/model.cc (21 changes)

@@ -304,5 +304,26 @@ template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::DontBhiks
template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::ArrayBhiksha>, SortedVocabulary>;

} // namespace detail

+base::Model *LoadVirtual(const char *file_name, const Config &config, ModelType model_type) {
+  RecognizeBinary(file_name, model_type);
+  switch (model_type) {
+    case PROBING:
+      return new ProbingModel(file_name, config);
+    case REST_PROBING:
+      return new RestProbingModel(file_name, config);
+    case TRIE:
+      return new TrieModel(file_name, config);
+    case QUANT_TRIE:
+      return new QuantTrieModel(file_name, config);
+    case ARRAY_TRIE:
+      return new ArrayTrieModel(file_name, config);
+    case QUANT_ARRAY_TRIE:
+      return new QuantArrayTrieModel(file_name, config);
+    default:
+      UTIL_THROW(FormatLoadException, "Confused by model type " << model_type);
+  }
+}
+
} // namespace ngram
} // namespace lm

@@ -67,7 +67,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
  FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;

  /* Get the state for a context. Don't use this if you can avoid it. Use
-   * BeginSentenceState or EmptyContextState and extend from those. If
+   * BeginSentenceState or NullContextState and extend from those. If
   * you're only going to use this state to call FullScore once, use
   * FullScoreForgotState.
   * To use this function, make an array of WordIndex containing the context
@@ -153,6 +153,11 @@ LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<Separat
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model;

+/* Autorecognize the file type, load, and return the virtual base class. Don't
+ * use the virtual base class if you can avoid it. Instead, use the above
+ * classes as template arguments to your own virtual feature function.*/
+base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING);
+
} // namespace ngram
} // namespace lm


@@ -54,7 +54,7 @@ template <class Weights> class ActivateUnigram {
  Weights *modify_;
};

-// Find the lower order entry, inserting blanks along the way as necessary.
+// Find the lower order entry, inserting blanks along the way as necessary.
template <class Value> void FindLower(
    const std::vector<uint64_t> &keys,
    typename Value::Weights &unigram,
@@ -64,7 +64,7 @@ template <class Value> void FindLower(
  typename Value::ProbingEntry entry;
  // Backoff will always be 0.0. We'll get the probability and rest in another pass.
  entry.value.backoff = kNoExtensionBackoff;
-  // Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
+  // Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
  for (int lower = keys.size() - 2; ; --lower) {
    if (lower == -1) {
      between.push_back(&unigram);
@@ -77,11 +77,11 @@ template <class Value> void FindLower(
    }
  }

-// Between usually has single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here.
+// Between usually has single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here.
template <class Added, class Build> void AdjustLower(
    const Added &added,
    const Build &build,
-    std::vector<typename Build::Value::Weights *> &between,
+    std::vector<typename Build::Value::Weights *> &between,
    const unsigned int n,
    const std::vector<WordIndex> &vocab_ids,
    typename Build::Value::Weights *unigrams,
@@ -93,14 +93,14 @@ template <class Added, class Build> void AdjustLower(
  }
  typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
  float prob = -fabs(between.back()->prob);
-  // Order of the n-gram on which probabilities are based.
+  // Order of the n-gram on which probabilities are based.
  unsigned char basis = n - between.size();
  assert(basis != 0);
  typename Build::Value::Weights **change = &between.back();
  // Skip the basis.
  --change;
  if (basis == 1) {
-    // Hallucinate a bigram based on a unigram's backoff and a unigram probability.
+    // Hallucinate a bigram based on a unigram's backoff and a unigram probability.
    float &backoff = unigrams[vocab_ids[1]].backoff;
    SetExtension(backoff);
    prob += backoff;
@@ -128,14 +128,14 @@ template <class Added, class Build> void AdjustLower(
  typename std::vector<typename Value::Weights *>::const_iterator i(between.begin());
  build.MarkExtends(**i, added);
  const typename Value::Weights *longer = *i;
-  // Everything has probability but is not marked as extending.
+  // Everything has probability but is not marked as extending.
  for (++i; i != between.end(); ++i) {
    build.MarkExtends(**i, *longer);
    longer = *i;
  }
}

-// Continue marking lower entries even they know that they extend left. This is used for upper/lower bounds.
+// Continue marking lower entries even they know that they extend left. This is used for upper/lower bounds.
template <class Build> void MarkLower(
    const std::vector<uint64_t> &keys,
    const Build &build,
@@ -144,15 +144,15 @@ template <class Build> void MarkLower(
    int start_order,
    const typename Build::Value::Weights &longer) {
  if (start_order == 0) return;
-  typename util::ProbingHashTable<typename Build::Value::ProbingEntry, util::IdentityHash>::MutableIterator iter;
-  // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
+  // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
  for (int even_lower = start_order - 2 /* index in middle */; ; --even_lower) {
    if (even_lower == -1) {
      build.MarkExtends(unigram, longer);
      return;
    }
-    middle[even_lower].UnsafeMutableFind(keys[even_lower], iter);
-    if (!build.MarkExtends(iter->value, longer)) return;
+    if (!build.MarkExtends(
+          middle[even_lower].UnsafeMutableMustFind(keys[even_lower])->value,
+          longer)) return;
  }
}

@@ -168,7 +168,6 @@ template <class Build, class Activate, class Store> void ReadNGrams(
    Store &store,
    PositiveProbWarn &warn) {
  typedef typename Build::Value Value;
-  typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
  assert(n >= 2);
  ReadNGramHeader(f, n);

@@ -186,7 +185,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
    for (unsigned int h = 1; h < n - 1; ++h) {
      keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
    }
-    // Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0.
+    // Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0.
    util::SetSign(entry.value.prob);
    entry.key = keys[n-2];

@@ -203,7 +202,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(

} // namespace
namespace detail {

template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
  std::size_t allocated = Unigram::Size(counts[0]);
  unigram_ = Unigram(start, counts[0], allocated);

@@ -71,7 +71,7 @@ template <class Value> class HashedSearch {
  static const bool kDifferentRest = Value::kDifferentRest;
  static const unsigned int kVersion = 0;

-  // TODO: move probing_multiplier here with next binary file format update.
+  // TODO: move probing_multiplier here with next binary file format update.
  static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}

  static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
@@ -102,14 +102,9 @@ template <class Value> class HashedSearch {
    return ret;
  }

-#pragma GCC diagnostic ignored "-Wuninitialized"
  MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const {
    node = extend_pointer;
-    typename Middle::ConstIterator found;
-    bool got = middle_[extend_length - 2].Find(extend_pointer, found);
-    assert(got);
-    (void)got;
-    return MiddlePointer(found->value);
+    return MiddlePointer(middle_[extend_length - 2].MustFind(extend_pointer)->value);
  }

  MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const {
@@ -126,14 +121,14 @@ template <class Value> class HashedSearch {
  }

  LongestPointer LookupLongest(WordIndex word, const Node &node) const {
-    // Sign bit is always on because longest n-grams do not extend left.
+    // Sign bit is always on because longest n-grams do not extend left.
    typename Longest::ConstIterator found;
    if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer();
    return LongestPointer(found->value.prob);
  }

-  // Generate a node without necessarily checking that it actually exists.
-  // Optionally return false if it's know to not exist.
+  // Generate a node without necessarily checking that it actually exists.
+  // Optionally return false if it's know to not exist.
  bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const {
    assert(begin != end);
    node = static_cast<Node>(*begin);
@@ -144,7 +139,7 @@ template <class Value> class HashedSearch {
  }

 private:
-  // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
+  // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
  void DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn);

  template <class Build> void ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build);
@@ -153,7 +148,7 @@ template <class Value> class HashedSearch {
   public:
    Unigram() {}

-    Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
+    Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
      unigram_(static_cast<typename Value::Weights*>(start))
#ifdef DEBUG
      , count_(count)

@@ -6,6 +6,7 @@
#include "util/string_piece.hh"

#include <string>
+#include <string.h>

namespace lm {
namespace base {
@@ -119,7 +120,9 @@ class Model {

  size_t StateSize() const { return state_size_; }
  const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
+  void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); }
  const void *NullContextMemory() const { return null_context_memory_; }
+  void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }

  // Requires in_state != out_state
  virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
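
Together, the LoadVirtual factory from lm/model.cc above and the state-write helpers added here let a caller score words without knowing the concrete model class. A hedged sketch using only the API shown in this diff; the file name and word index are placeholders, and the virtual destructor on base::Model is assumed:

#include <vector>
#include "lm/model.hh"

float ScoreFirstWord(const char *file, lm::WordIndex word) {
  // LoadVirtual recognizes the binary format and returns the matching
  // GenericModel instantiation behind the base::Model interface.
  lm::base::Model *m = lm::ngram::LoadVirtual(file);
  // At this level a state is an opaque blob of StateSize() bytes.
  std::vector<char> in(m->StateSize()), out(m->StateSize());
  m->BeginSentenceWrite(&in[0]);  // copy the <s> state into our buffer
  float score = m->Score(&in[0], word, &out[0]);
  delete m;  // assumes base::Model has a virtual destructor
  return score;
}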

@@ -65,7 +65,7 @@ int main(int argc, char **argv)
  sourcePhrase.CreateFromString(Input, input, line, "||dummy_string||", NULL);

  TargetPhraseVectorPtr decodedPhraseColl
-    = pdc.GetTargetPhraseCollectionRaw(sourcePhrase);
+    = pdc.GetTargetPhraseCollectionRaw(sourcePhrase);

  if(decodedPhraseColl != NULL) {
    if(reportCounts)
|
||||
|
||||
Phrase* unksrc = new Phrase(1);
|
||||
unksrc->AddWord() = sourceWord;
|
||||
Word &newWord = unksrc->GetWord(0);
|
||||
newWord.SetIsOOV(true);
|
||||
|
||||
m_unksrcs.push_back(unksrc);
|
||||
|
||||
//TranslationOption *transOpt;
|
||||
|

@@ -34,16 +34,6 @@ DecodeFeature::DecodeFeature( const std::string& description
  : StatelessFeatureFunction(description, line)
{
  VERBOSE(2,"DecodeFeature:" << std::endl);
-  size_t ind = 0;
-  while (ind < m_args.size()) {
-    vector<string> &args = m_args[ind];
-    bool consumed = SetParameter(args[0], args[1]);
-    if (consumed) {
-      m_args.erase(m_args.begin() + ind);
-    } else {
-      ++ind;
-    }
-  }
}

DecodeFeature::DecodeFeature( const std::string& description
@@ -67,7 +57,7 @@ DecodeFeature::DecodeFeature(const std::string& description
  VERBOSE(2,"DecodeFeature: input=" << m_inputFactors << " output=" << m_outputFactors << std::endl);
}

-bool DecodeFeature::SetParameter(const std::string& key, const std::string& value)
+void DecodeFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "input-factor") {
    m_input =Tokenize<FactorType>(value, ",");
@@ -76,9 +66,8 @@ bool DecodeFeature::SetParameter(const std::string& key, const std::string& valu
    m_output =Tokenize<FactorType>(value, ",");
    m_outputFactors = FactorMask(m_output);
  } else {
-    return StatelessFeatureFunction::SetParameter(key, value);
+    StatelessFeatureFunction::SetParameter(key, value);
  }
-  return true;
}


@@ -61,7 +61,7 @@ public:
  const std::vector<FactorType>& GetOutput() const;

  bool IsUseable(const FactorMask &mask) const;
-  virtual bool SetParameter(const std::string& key, const std::string& value);
+  void SetParameter(const std::string& key, const std::string& value);

protected:
  std::vector<FactorType> m_input;

@@ -4,6 +4,8 @@
#include "moses/WordsRange.h"
+#include "moses/StaticData.h"

+using namespace std;

namespace Moses
{
struct DistortionState_traditional : public FFState {
@@ -19,6 +21,12 @@ struct DistortionState_traditional : public FFState {
  }
};

+DistortionScoreProducer::DistortionScoreProducer(const std::string &line)
+  : StatefulFeatureFunction("Distortion", 1, line)
+{
+  ReadParameters();
+}
+
const FFState* DistortionScoreProducer::EmptyHypothesisState(const InputType &input) const
{
  // fake previous translated phrase start and end

@@ -18,10 +18,7 @@ class WordsRange;
class DistortionScoreProducer : public StatefulFeatureFunction
{
public:
-  DistortionScoreProducer(const std::string &line)
-    : StatefulFeatureFunction("Distortion", 1, line) {
-    CHECK(m_args.size() == 0);
-  }
+  DistortionScoreProducer(const std::string &line);

  bool IsUseable(const FactorMask &mask) const {
    return true;

@@ -49,17 +49,6 @@ void FeatureFunction::Initialize(const std::string& description, const std::stri
{
  ParseLine(description, line);

-  size_t ind = 0;
-  while (ind < m_args.size()) {
-    vector<string> &args = m_args[ind];
-    bool consumed = SetParameter(args[0], args[1]);
-    if (consumed) {
-      m_args.erase(m_args.begin() + ind);
-    } else {
-      ++ind;
-    }
-  }
-
  if (m_description == "") {
    size_t index = description_counts.count(description);

@@ -91,29 +80,33 @@ void FeatureFunction::ParseLine(const std::string& description, const std::strin
    pair<set<string>::iterator,bool> ret = keys.insert(args[0]);
    UTIL_THROW_IF(!ret.second, util::Exception, "Duplicate key in line " << line);

-    m_args.push_back(args);
+    if (args[0] == "num-features") {
+      m_numScoreComponents = Scan<size_t>(args[1]);
+    } else if (args[0] == "name") {
+      m_description = args[1];
+    } else {
+      m_args.push_back(args);
+    }
  }
}

-bool FeatureFunction::SetParameter(const std::string& key, const std::string& value)
+void FeatureFunction::SetParameter(const std::string& key, const std::string& value)
{
-  if (key == "num-features") {
-    m_numScoreComponents = Scan<size_t>(value);
-  } else if (key == "name") {
-    m_description = value;
-  } else if (key == "tuneable") {
+  if (key == "tuneable") {
    m_tuneable = Scan<bool>(value);
  } else {
-    return false;
+    UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
  }
-
-  return true;
}

-void FeatureFunction::OverrideParameter(const std::string& key, const std::string& value)
+void FeatureFunction::ReadParameters()
{
-  bool ret = SetParameter(key, value);
-  UTIL_THROW_IF(!ret, util::Exception, "Unknown argument" << key);
+  while (!m_args.empty()) {
+    const vector<string> &args = m_args[0];
+    SetParameter(args[0], args[1]);
+
+    m_args.erase(m_args.begin());
+  }
}

}

@@ -106,8 +106,8 @@ public:
    , ScoreComponentCollection &scoreBreakdown) const {
  }

-  virtual bool SetParameter(const std::string& key, const std::string& value);
-  virtual void OverrideParameter(const std::string& key, const std::string& value);
+  virtual void SetParameter(const std::string& key, const std::string& value);
+  virtual void ReadParameters();
};

}
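
The refactoring above replaces each feature's hand-rolled argument loop and bool-returning SetParameter with a base-class ReadParameters() that drains the parsed key=value pairs and throws on unknown keys. A sketch of the pattern a feature now follows; ExampleFeature and its "path" key are invented for illustration:

#include <string>
#include "moses/FF/StatelessFeatureFunction.h"

namespace Moses
{

class ExampleFeature : public StatelessFeatureFunction
{
public:
  ExampleFeature(const std::string &line)
    : StatelessFeatureFunction("ExampleFeature", 0, line) {
    ReadParameters();  // drains m_args, calling SetParameter per key=value
  }

  void SetParameter(const std::string& key, const std::string& value) {
    if (key == "path") {
      m_path = value;  // the key this feature understands
    } else {
      // anything unrecognized bubbles up; FeatureFunction::SetParameter
      // throws util::Exception on an unknown argument
      StatelessFeatureFunction::SetParameter(key, value);
    }
  }

private:
  std::string m_path;
};

} // namespace Moses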

@@ -13,18 +13,7 @@ GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
  : StatelessFeatureFunction("GlobalLexicalModel",1, line)
{
  std::cerr << "Creating global lexical model...\n";

-  size_t ind = 0;
-  while (ind < m_args.size()) {
-    vector<string> &args = m_args[ind];
-    bool consumed = SetParameter(args[0], args[1]);
-    if (consumed) {
-      m_args.erase(m_args.begin() + ind);
-    } else {
-      ++ind;
-    }
-  }
-  CHECK(m_args.size() == 0);
+  ReadParameters();

  // define bias word
  FactorCollection &factorCollection = FactorCollection::Instance();
@@ -34,7 +23,7 @@ GlobalLexicalModel::GlobalLexicalModel(const std::string &line)

}

-bool GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value)
+void GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "file") {
    m_filePath = value;
@@ -43,9 +32,8 @@ bool GlobalLexicalModel::SetParameter(const std::string& key, const std::string&
  } else if (key == "outputFactors") {
    m_outputFactorsVec = Tokenize<FactorType>(value,",");
  } else {
-    return StatelessFeatureFunction::SetParameter(key, value);
+    StatelessFeatureFunction::SetParameter(key, value);
  }
-  return true;
}

GlobalLexicalModel::~GlobalLexicalModel()

@@ -77,7 +77,7 @@ public:
    ScoreComponentCollection* accumulator) const {
    throw std::logic_error("GlobalLexicalModel not supported in chart decoder, yet");
  }
-  bool SetParameter(const std::string& key, const std::string& value);
+  void SetParameter(const std::string& key, const std::string& value);

};


moses/FF/OSM-Feature/OpSequenceModel.cpp (new file, 305 lines)

@@ -0,0 +1,305 @@
#include <fstream>
#include "OpSequenceModel.h"
#include "osmHyp.h"
#include "util/check.hh"
#include "moses/Util.h"

using namespace std;
using namespace lm::ngram;

namespace Moses
{

OpSequenceModel::OpSequenceModel(const std::string &line)
  :StatefulFeatureFunction("OpSequenceModel", 5, line )
{
  ReadParameters();
}

void OpSequenceModel :: readLanguageModel(const char *lmFile)
{
  string unkOp = "_TRANS_SLF_";

  /*
  // Code for SRILM

  vector <int> numbers;
  int nonWordFlag = 0;

  ptrOp = new Api;
  ptrOp -> read_lm(lmFile,lmOrder);
  numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
  unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
  */

  // Code to load KenLM

  OSM = new Model(m_lmPath.c_str());
  State startState = OSM->NullContextState();
  State endState;
  unkOpProb = OSM->Score(startState,OSM->GetVocabulary().Index(unkOp),endState);
}

void OpSequenceModel::Load()
{
  /*
  // load future cost

  //vector <string> input;
  ifstream sr (m_featurePath.c_str());
  char* tmp;

  CHECK(sr.is_open());

  vector<FactorType> factorOrder;
  factorOrder.push_back(0);

  string line;
  while (std::getline(sr, line))
  {
    std::vector<std::string> tokens;
    tokens = TokenizeMultiCharSeparator(line, "|||");
    CHECK(tokens.size() == 3);

    Phrase source, target;
    source.CreateFromString(Input, factorOrder, tokens[0], "|", NULL);
    target.CreateFromString(Output, factorOrder, tokens[1], "|", NULL);

    ParallelPhrase pp(source, target);
    Scores scores = Tokenize<float>(tokens[2], " ");
    m_futureCost[pp] = scores;
    // m_coll[pp] = scores;
  }
  */
  readLanguageModel(m_lmPath.c_str());
}

void OpSequenceModel:: Evaluate(const Phrase &source
                                , const TargetPhrase &targetPhrase
                                , ScoreComponentCollection &scoreBreakdown
                                , ScoreComponentCollection &estimatedFutureScore) const
{
  osmHypothesis obj;
  obj.setState(OSM->NullContextState());
  WordsBitmap myBitmap(source.GetSize());
  vector <string> mySourcePhrase;
  vector <string> myTargetPhrase;
  vector<float> scores(5);
  vector <int> alignments;
  int startIndex = 0;
  int endIndex = source.GetSize();

  const AlignmentInfo &align = targetPhrase.GetAlignTerm();
  AlignmentInfo::const_iterator iter;

  for (iter = align.begin(); iter != align.end(); ++iter)
  {
    alignments.push_back(iter->first);
    alignments.push_back(iter->second);
  }

  for (int i = 0; i < targetPhrase.GetSize(); i++)
  {
    if (targetPhrase.GetWord(i).IsOOV())
      myTargetPhrase.push_back("_TRANS_SLF_");
    else
      myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(0)->GetString().as_string());
  }

  for (int i = 0; i < source.GetSize(); i++)
  {
    mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
  }

  obj.setPhrases(mySourcePhrase , myTargetPhrase);
  obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize());
  obj.computeOSMFeature(startIndex,myBitmap);
  obj.calculateOSMProb(*OSM);
  obj.populateScores(scores);
  estimatedFutureScore.PlusEquals(this, scores);
}

FFState* OpSequenceModel::Evaluate(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
  const WordsBitmap &bitmap = cur_hypo.GetWordsBitmap();
  WordsBitmap myBitmap = bitmap;
  const Manager &manager = cur_hypo.GetManager();
  const InputType &source = manager.GetSource();
  const Sentence &sourceSentence = static_cast<const Sentence&>(source);
  osmHypothesis obj;
  vector <string> mySourcePhrase;
  vector <string> myTargetPhrase;
  vector<float> scores(5);

  //target.GetWord(0)

  //cerr << target <<" --- "<<target.GetSourcePhrase()<< endl;  // English ...

  //cerr << align << endl;   // Alignments ...
  //cerr << cur_hypo.GetCurrSourceWordsRange() << endl;

  //cerr << source <<endl;

  // int a = sourceRange.GetStartPos();
  // cerr << source.GetWord(a);
  //cerr <<a<<endl;

  //const Sentence &sentence = static_cast<const Sentence&>(curr_hypo.GetManager().GetSource());

  const WordsRange & sourceRange = cur_hypo.GetCurrSourceWordsRange();
  int startIndex = sourceRange.GetStartPos();
  int endIndex = sourceRange.GetEndPos();
  const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
  osmState * statePtr;

  vector <int> alignments;

  AlignmentInfo::const_iterator iter;

  for (iter = align.begin(); iter != align.end(); ++iter) {
    //cerr << iter->first << "----" << iter->second << " ";
    alignments.push_back(iter->first);
    alignments.push_back(iter->second);
  }

  //cerr<<bitmap<<endl;
  //cerr<<startIndex<<"  "<<endIndex<<endl;

  for (int i = startIndex; i <= endIndex; i++)
  {
    myBitmap.SetValue(i,0); // resetting coverage of this phrase ...
    mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
    // cerr<<mySourcePhrase[i]<<endl;
  }

  for (int i = 0; i < target.GetSize(); i++)
  {
    if (target.GetWord(i).IsOOV())
      myTargetPhrase.push_back("_TRANS_SLF_");
    else
      myTargetPhrase.push_back(target.GetWord(i).GetFactor(0)->GetString().as_string());
  }

  //cerr<<myBitmap<<endl;

  obj.setState(prev_state);
  obj.constructCepts(alignments,startIndex,endIndex,target.GetSize());
  obj.setPhrases(mySourcePhrase , myTargetPhrase);
  obj.computeOSMFeature(startIndex,myBitmap);
  obj.calculateOSMProb(*OSM);
  obj.populateScores(scores);

  /*
  if (bitmap.GetFirstGapPos() == NOT_FOUND)
  {
    int xx;
    cerr<<bitmap<<endl;
    int a = bitmap.GetFirstGapPos();
    obj.print();
    cin>>xx;
  }
  */

  /*
  vector<float> scores(5);
  scores[0] = 0.343423f;
  scores[1] = 1.343423f;
  scores[2] = 2.343423f;
  scores[3] = 3.343423f;
  scores[4] = 4.343423f;
  */

  accumulator->PlusEquals(this, scores);

  return obj.saveState();

  //return statePtr;
  // return NULL;
}

FFState* OpSequenceModel::EvaluateChart(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* accumulator) const
{
  abort();
}

const FFState* OpSequenceModel::EmptyHypothesisState(const InputType &input) const
{
  cerr << "OpSequenceModel::EmptyHypothesisState()" << endl;

  State startState = OSM->BeginSentenceState();

  return new osmState(startState);
}

std::string OpSequenceModel::GetScoreProducerWeightShortName(unsigned idx) const
{
  return "osm";
}

std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const Phrase &target) const
{
  ParallelPhrase pp(source, target);
  std::map<ParallelPhrase, Scores>::const_iterator iter;
  iter = m_futureCost.find(pp);
  //iter = m_coll.find(pp);
  if (iter == m_futureCost.end()) {
    vector<float> scores(5, 0);
    scores[0] = unkOpProb;
    return scores;
  }
  else {
    const vector<float> &scores = iter->second;
    return scores;
  }
}

void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "feature-path") {
    m_featurePath = value;
  } else if (key == "path") {
    m_lmPath = value;
  } else if (key == "order") {
    lmOrder = Scan<int>(value);
  } else {
    StatefulFeatureFunction::SetParameter(key, value);
  }
}

} // namespace

moses/FF/OSM-Feature/OpSequenceModel.h (new file, 69 lines)

@@ -0,0 +1,69 @@
#pragma once

#include <string>
#include <map>
#include <vector>
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/Manager.h"
#include "moses/FF/OSM-Feature/osmHyp.h"
#include "lm/model.hh"

namespace Moses
{

class OpSequenceModel : public StatefulFeatureFunction
{
public:

  lm::ngram::Model * OSM;

  int lmOrder;
  float unkOpProb;

  OpSequenceModel(const std::string &line);

  void readLanguageModel(const char *);
  void Load();

  FFState* Evaluate(
    const Hypothesis& cur_hypo,
    const FFState* prev_state,
    ScoreComponentCollection* accumulator) const;

  void Evaluate(const Phrase &source
                , const TargetPhrase &targetPhrase
                , ScoreComponentCollection &scoreBreakdown
                , ScoreComponentCollection &estimatedFutureScore) const;

  virtual FFState* EvaluateChart(
    const ChartHypothesis& /* cur_hypo */,
    int /* featureID - used to index the state in the previous hypotheses */,
    ScoreComponentCollection* accumulator) const;

  virtual const FFState* EmptyHypothesisState(const InputType &input) const;

  virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;

  std::vector<float> GetFutureScores(const Phrase &source, const Phrase &target) const;
  void SetParameter(const std::string& key, const std::string& value);

  bool IsUseable(const FactorMask &mask) const
  { return true; }

protected:
  typedef std::pair<Phrase, Phrase> ParallelPhrase;
  typedef std::vector<float> Scores;
  std::map<ParallelPhrase, Scores> m_futureCost;

  std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
  std::set <int> targetNullWords;
  std::string m_featurePath, m_lmPath;

};

} // namespace

moses/FF/OSM-Feature/osmHyp.cpp (new file, 650 lines)

@@ -0,0 +1,650 @@
#include "osmHyp.h"
#include <sstream>

using namespace std;
using namespace lm::ngram;

namespace Moses
{
osmState::osmState(const State & val)
  :j(0)
  ,E(0)
{
  lmState = val;
}

void osmState::saveState(int jVal, int eVal, map <int , string> & gapVal)
{
  gap.clear();
  gap = gapVal;
  j = jVal;
  E = eVal;
}

int osmState::Compare(const FFState& otherBase) const
{
  const osmState &other = static_cast<const osmState&>(otherBase);
  if (j != other.j)
    return (j < other.j) ? -1 : +1;
  if (E != other.E)
    return (E < other.E) ? -1 : +1;
  if (gap != other.gap)
    return (gap < other.gap) ? -1 : +1;

  if (lmState.length < other.lmState.length) return -1;

  if (lmState.length > other.lmState.length) return 1;

  return 0;
}

std::string osmState :: getName() const
{
  return "done";
}

//////////////////////////////////////////////////

osmHypothesis :: osmHypothesis()
{
  opProb = 0;
  gapWidth = 0;
  gapCount = 0;
  openGapCount = 0;
  deletionCount = 0;
  gapCount = 0;
  j = 0;
  E = 0;
  gap.clear();
}

void osmHypothesis :: setState(const FFState* prev_state)
{
  if(prev_state != NULL)
  {
    j = static_cast <const osmState *> (prev_state)->getJ();
    E = static_cast <const osmState *> (prev_state)->getE();
    gap = static_cast <const osmState *> (prev_state)->getGap();
    lmState = static_cast <const osmState *> (prev_state)->getLMState();
  }
}

osmState * osmHypothesis :: saveState()
{
  osmState * statePtr = new osmState(lmState);
  statePtr->saveState(j,E,gap);
  return statePtr;
}

int osmHypothesis :: isTranslationOperation(int x)
{
  if (operations[x].find("_JMP_BCK_") != -1)
    return 0;

  if (operations[x].find("_JMP_FWD_") != -1)
    return 0;

  if (operations[x].find("_CONT_CEPT_") != -1)
    return 0;

  if (operations[x].find("_INS_GAP_") != -1)
    return 0;

  return 1;
}

void osmHypothesis :: removeReorderingOperations()
{
  gapCount = 0;
  deletionCount = 0;
  openGapCount = 0;
  gapWidth = 0;
  //cout<<"I came here"<<endl;

  std::vector <std::string> tupleSequence;

  for (int x = 0; x < operations.size(); x++)
  {
    // cout<<operations[x]<<endl;

    if(isTranslationOperation(x) == 1)
    {
      tupleSequence.push_back(operations[x]);
    }
  }

  operations.clear();
  operations = tupleSequence;
}

void osmHypothesis :: calculateOSMProb(Model & ptrOp)
{
  opProb = 0;
  State currState = lmState;
  State temp;

  for (int i = 0; i<operations.size(); i++)
  {
    temp = currState;
    opProb += ptrOp.Score(temp,ptrOp.GetVocabulary().Index(operations[i]),currState);
  }

  lmState = currState;

  //print();
}

int osmHypothesis :: firstOpenGap(vector <int> & coverageVector)
{
  int firstOG =-1;

  for(int nd = 0; nd < coverageVector.size(); nd++)
  {
    if(coverageVector[nd]==0)
    {
      firstOG = nd;
      return firstOG;
    }
  }

  return firstOG;
}

string osmHypothesis :: intToString(int num)
{
  std::ostringstream stm;
  stm<<num;

  return stm.str();
}

void osmHypothesis :: generateOperations(int & startIndex , int j1 , int contFlag , WordsBitmap & coverageVector , string english , string german , set <int> & targetNullWords , vector <string> & currF)
{
  int gFlag = 0;
  int gp = 0;
  int ans;

  if ( j < j1) // j1 is the index of the source word we are about to generate ...
  {
    //if(coverageVector[j]==0) // if source word at j is not generated yet ...
    if(coverageVector.GetValue(j)==0) // if source word at j is not generated yet ...
    {
      operations.push_back("_INS_GAP_");
      gFlag++;
      gap[j]="Unfilled";
    }
    if (j == E)
    {
      j = j1;
    }
    else
    {
      operations.push_back("_JMP_FWD_");
      j=E;
    }
  }

  if (j1 < j)
  {
    // if(j < E && coverageVector[j]==0)
    if(j < E && coverageVector.GetValue(j)==0)
    {
      operations.push_back("_INS_GAP_");
      gFlag++;
      gap[j]="Unfilled";
    }

    j=closestGap(gap,j1,gp);
    operations.push_back("_JMP_BCK_"+ intToString(gp));

    //cout<<"I am j "<<j<<endl;
    //cout<<"I am j1 "<<j1<<endl;

    if(j==j1)
      gap[j]="Filled";
  }

  if (j < j1)
  {
    operations.push_back("_INS_GAP_");
    gap[j] = "Unfilled";
    gFlag++;
    j=j1;
  }

  if(contFlag == 0) // First words of the multi-word cept ...
  {
    if(english == "_TRANS_SLF_") // Unknown word ...
    {
      operations.push_back("_TRANS_SLF_");
    }
    else
    {
      operations.push_back("_TRANS_" + english + "_TO_" + german);
    }

    //ans = firstOpenGap(coverageVector);
    ans = coverageVector.GetFirstGapPos();

    if (ans != -1)
      gapWidth += j - ans;
  }
  else if (contFlag == 2)
  {
    operations.push_back("_INS_" + german);
    ans = coverageVector.GetFirstGapPos();

    if (ans != -1)
      gapWidth += j - ans;
    deletionCount++;
  }
  else
  {
    operations.push_back("_CONT_CEPT_");
  }

  //coverageVector[j]=1;
  coverageVector.SetValue(j,1);
  j+=1;

  if(E<j)
    E=j;

  if (gFlag > 0)
    gapCount++;

  openGapCount += getOpenGaps();

  //if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end())
  if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end())
  {
    j1 = j;
    german = currF[j1-startIndex];
    english = "_INS_";
    generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF);
  }
}

void osmHypothesis :: print()
{
  for (int i = 0; i< operations.size(); i++)
  {
    cerr<<operations[i]<<" ";
  }

  cerr<<endl<<endl;

  cerr<<"Operation Probability "<<opProb<<endl;
  cerr<<"Gap Count "<<gapCount<<endl;
  cerr<<"Open Gap Count "<<openGapCount<<endl;
  cerr<<"Gap Width "<<gapWidth<<endl;
  cerr<<"Deletion Count "<<deletionCount<<endl;

  cerr<<"_______________"<<endl;
}

int osmHypothesis :: closestGap(map <int,string> gap, int j1, int & gp)
{
  int dist=1172;
  int value=-1;
  int temp=0;
  gp=0;
  int opGap=0;

  map <int,string> :: iterator iter;

  iter=gap.end();

  do
  {
    iter--;
    //cout<<"Trapped "<<iter->first<<endl;

    if(iter->first==j1 && iter->second== "Unfilled")
    {
      opGap++;
      gp = opGap;
      return j1;
    }

    if(iter->second =="Unfilled")
    {
      opGap++;
      temp = iter->first - j1;

      if(temp<0)
        temp=temp * -1;

      if(dist>temp && iter->first < j1)
      {
        dist=temp;
        value=iter->first;
        gp=opGap;
      }
    }
  }
  while(iter!=gap.begin());

  return value;
}

int osmHypothesis :: getOpenGaps()
{
  map <int,string> :: iterator iter;

  int nd = 0;
  for (iter = gap.begin(); iter!=gap.end(); iter++)
  {
    if(iter->second == "Unfilled")
      nd++;
  }

  return nd;
}

void osmHypothesis :: generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes)
{
  operations.push_back("_DEL_" + english);
  currTargetIndex++;

  while(doneTargetIndexes.find(currTargetIndex) != doneTargetIndexes.end())
  {
    currTargetIndex++;
  }

  if (sourceNullWords.find(currTargetIndex) != sourceNullWords.end())
  {
    english = currE[currTargetIndex];
    generateDeleteOperations(english,currTargetIndex,doneTargetIndexes);
  }
}

void osmHypothesis :: computeOSMFeature(int startIndex , WordsBitmap & coverageVector)
{
  set <int> doneTargetIndexes;
  set <int> eSide;
  set <int> fSide;
  set <int> :: iterator iter;
  string english;
  string source;
  int j1;
  int start = 0;
  int targetIndex = 0;
  doneTargetIndexes.clear();

  if (targetNullWords.size() != 0) // Source words to be deleted in the start of this phrase ...
  {
    iter = targetNullWords.begin();

    if (*iter == startIndex)
    {
      j1 = startIndex;
      source = currF[j1-startIndex];
      english = "_INS_";
      generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF);
    }
  }

  if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) // first word has to be deleted ...
  {
    english = currE[targetIndex];
    generateDeleteOperations(english,targetIndex, doneTargetIndexes);
  }

  for (int i = 0; i < ceptsInPhrase.size(); i++)
  {
    source = "";
    english = "";

    fSide = ceptsInPhrase[i].first;
    eSide = ceptsInPhrase[i].second;

    iter = eSide.begin();
    targetIndex = *iter;
    english += currE[*iter];
    iter++;

    for (; iter != eSide.end(); iter++)
    {
      if(*iter == targetIndex+1)
        targetIndex++;
      else
        doneTargetIndexes.insert(*iter);

      english += "^_^";
      english += currE[*iter];
    }

    iter = fSide.begin();
    source += currF[*iter];
    iter++;

    for (; iter != fSide.end(); iter++)
    {
      source += "^_^";
      source += currF[*iter];
    }

    iter = fSide.begin();
    j1 = *iter + startIndex;
    iter++;

    generateOperations(startIndex, j1, 0 , coverageVector , english , source , targetNullWords , currF);

    for (; iter != fSide.end(); iter++)
    {
      j1 = *iter + startIndex;
      generateOperations(startIndex, j1, 1 , coverageVector , english , source , targetNullWords , currF);
    }

    targetIndex++; // Check whether the next target word is unaligned ...

    while(doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end())
    {
      targetIndex++;
    }

    if(sourceNullWords.find(targetIndex) != sourceNullWords.end())
    {
      english = currE[targetIndex];
      generateDeleteOperations(english,targetIndex, doneTargetIndexes);
    }
  }

  //removeReorderingOperations();

  //print();
}

void osmHypothesis :: getMeCepts ( set <int> & eSide , set <int> & fSide , map <int , vector <int> > & tS , map <int , vector <int> > & sT)
{
  set <int> :: iterator iter;

  int sz = eSide.size();
  vector <int> t;

  for (iter = eSide.begin(); iter != eSide.end(); iter++)
  {
    t = tS[*iter];

    for (int i = 0; i < t.size(); i++)
    {
      fSide.insert(t[i]);
    }
  }

  for (iter = fSide.begin(); iter != fSide.end(); iter++)
  {
    t = sT[*iter];

    for (int i = 0 ; i<t.size(); i++)
    {
      eSide.insert(t[i]);
    }
  }

  if (eSide.size () > sz)
  {
    getMeCepts(eSide,fSide,tS,sT);
  }
}

void osmHypothesis :: constructCepts(vector <int> & align , int startIndex , int endIndex, int targetPhraseLength)
{
  std::map <int , vector <int> > sT;
  std::map <int , vector <int> > tS;
  std::set <int> eSide;
  std::set <int> fSide;
  std::set <int> :: iterator iter;
  std :: map <int , vector <int> > :: iterator iter2;
  std :: pair < set <int> , set <int> > cept;
  int src;
  int tgt;

  for (int i = 0; i < align.size(); i+=2)
  {
    src = align[i];
    tgt = align[i+1];
    tS[tgt].push_back(src);
    sT[src].push_back(tgt);
  }

  for (int i = startIndex; i<= endIndex; i++) // What are unaligned source words in this phrase ...
  {
    if (sT.find(i-startIndex) == sT.end())
    {
      targetNullWords.insert(i);
    }
  }

  for (int i = 0; i < targetPhraseLength; i++) // What are unaligned target words in this phrase ...
  {
    if (tS.find(i) == tS.end())
    {
      sourceNullWords.insert(i);
    }
  }

  while (tS.size() != 0 && sT.size() != 0)
  {
    iter2 = tS.begin();

    eSide.clear();
    fSide.clear();
    eSide.insert (iter2->first);

    getMeCepts(eSide, fSide, tS , sT);

    for (iter = eSide.begin(); iter != eSide.end(); iter++)
    {
      iter2 = tS.find(*iter);
      tS.erase(iter2);
    }

    for (iter = fSide.begin(); iter != fSide.end(); iter++)
    {
      iter2 = sT.find(*iter);
      sT.erase(iter2);
    }

    cept = make_pair (fSide , eSide);
    ceptsInPhrase.push_back(cept);
  }

  /*
  cerr<<"Extracted Cepts "<<endl;
  for (int i = 0; i < ceptsInPhrase.size(); i++)
  {
    fSide = ceptsInPhrase[i].first;
    eSide = ceptsInPhrase[i].second;

    for (iter = eSide.begin(); iter != eSide.end(); iter++)
    {
      cerr<<*iter<<" ";
    }
    cerr<<"<---> ";

    for (iter = fSide.begin(); iter != fSide.end(); iter++)
    {
      cerr<<*iter<<" ";
    }

    cerr<<endl;
  }
  cerr<<endl;

  cerr<<"Unaligned Target Words"<<endl;

  for (iter = sourceNullWords.begin(); iter != sourceNullWords.end(); iter++)
    cerr<<*iter<<"<--->"<<endl;

  cerr<<"Unaligned Source Words"<<endl;

  for (iter = targetNullWords.begin(); iter != targetNullWords.end(); iter++)
    cerr<<*iter<<"<--->"<<endl;
  */
}

void osmHypothesis :: populateScores(vector <float> & scores)
{
  scores.clear();
  scores.push_back(opProb);
  scores.push_back(gapWidth);
  scores.push_back(gapCount);
  scores.push_back(openGapCount);
  scores.push_back(deletionCount);
}

} // namespace
89
moses/FF/OSM-Feature/osmHyp.h
Normal file
89
moses/FF/OSM-Feature/osmHyp.h
Normal file
@ -0,0 +1,89 @@
#pragma once

# include "moses/FF/FFState.h"
# include "moses/Manager.h"
#include "lm/model.hh"
# include <set>
# include <map>
# include <string>
# include <vector>

namespace Moses
{

class osmState : public FFState
{
public:
  osmState(const lm::ngram::State & val);
  int Compare(const FFState& other) const;
  void saveState(int jVal, int eVal, std::map <int , std::string> & gapVal);
  int getJ()const {return j;}
  int getE()const {return E;}
  std::map <int , std::string> getGap() const { return gap;}

  lm::ngram::State getLMState() const {return lmState;}

  void print() const;
  std::string getName() const;

protected:
  int j, E;
  std::map <int,std::string> gap;
  lm::ngram::State lmState;
};

class osmHypothesis
{

private:

  std::vector <std::string> operations; // List of operations required to generate this hyp ...
  std::map <int,std::string> gap; // Maintains gap history ...
  int j; // Position after the last source word generated ...
  int E; // Position after the right most source word so far generated ...
  lm::ngram::State lmState; // KenLM's Model State ...

  int gapCount; // Number of gaps inserted ...
  int deletionCount;
  int openGapCount;
  int gapWidth;
  double opProb;

  std::vector <std::string> currE;
  std::vector <std::string> currF;
  std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
  std::set <int> targetNullWords;
  std::set <int> sourceNullWords;

  int closestGap(std::map <int,std::string> gap,int j1, int & gp);
  int firstOpenGap(std::vector <int> & coverageVector);
  std::string intToString(int);
  int getOpenGaps();
  int isTranslationOperation(int j);
  void removeReorderingOperations();

  void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT);

public:

  osmHypothesis();
  ~osmHypothesis(){};
  void generateOperations(int & startIndex, int j1 , int contFlag , WordsBitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF);
  void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes);
  void calculateOSMProb(lm::ngram::Model & ptrOp);
  void computeOSMFeature(int startIndex , WordsBitmap & coverageVector);
  void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength);
  void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2){currF = val1; currE = val2;}
  void setState(const FFState* prev_state);
  osmState * saveState();
  void print();
  void populateScores(std::vector <float> & scores);
  void setState(const lm::ngram::State & val){lmState = val;}

};

} // namespace
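A note on the header above: the Compare() declared on osmState is what lets the decoder recombine hypotheses whose feature states are equal. The committed implementation is not shown in this hunk; the following is only a sketch of what a member-wise comparison over the stored state could look like, assuming std::map's built-in lexicographic ordering and KenLM's State::Compare:

    // Hypothetical sketch, not the code from this commit: any total order is
    // valid as long as "equal" means the two hypotheses can safely be merged.
    int osmState::Compare(const FFState& other) const
    {
      const osmState &o = static_cast<const osmState&>(other);
      if (j != o.j) return (j < o.j) ? -1 : 1;
      if (E != o.E) return (E < o.E) ? -1 : 1;
      if (gap != o.gap) return (gap < o.gap) ? -1 : 1; // std::map compares lexicographically
      return lmState.Compare(o.lmState);               // assumed KenLM state comparison
    }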
@ -19,31 +19,18 @@ PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line)
  : StatefulFeatureFunction("PhraseBoundaryFeature", 0, line)
{
  std::cerr << "Initializing source word deletion feature.." << std::endl;

  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);

  ReadParameters();
}

bool PhraseBoundaryFeature::SetParameter(const std::string& key, const std::string& value)
void PhraseBoundaryFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "source") {
    m_sourceFactors = Tokenize<FactorType>(value, ",");
  } else if (key == "target") {
    m_targetFactors = Tokenize<FactorType>(value, ",");
  } else {
    return StatefulFeatureFunction::SetParameter(key, value);
    StatefulFeatureFunction::SetParameter(key, value);
  }
  return true;
}

const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const

@ -52,7 +52,7 @@ public:
  ScoreComponentCollection* ) const {
    throw std::logic_error("PhraseBoundaryState not supported in chart decoder, yet");
  }
  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

private:
  void AddFeatures(

@ -12,7 +12,7 @@ using namespace std;
PhraseLengthFeature::PhraseLengthFeature(const std::string &line)
  :StatelessFeatureFunction("PhraseLengthFeature", 0, line)
{
  CHECK(m_args.size() == 0);
  ReadParameters();
}

void PhraseLengthFeature::Evaluate(const Phrase &source

@ -17,18 +17,7 @@ PhrasePairFeature::PhrasePairFeature(const std::string &line)
  :StatelessFeatureFunction("PhrasePairFeature", 0, line)
{
  std::cerr << "Initializing PhrasePairFeature.." << std::endl;

  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);
  ReadParameters();

  if (m_simple == 1) std::cerr << "using simple phrase pairs.. ";
  if (m_sourceContext == 1) std::cerr << "using source context.. ";
@ -43,7 +32,7 @@ PhrasePairFeature::PhrasePairFeature(const std::string &line)
  }
}

bool PhrasePairFeature::SetParameter(const std::string& key, const std::string& value)
void PhrasePairFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "input-factor") {
    m_sourceFactorId = Scan<FactorType>(value);
@ -62,10 +51,8 @@ bool PhrasePairFeature::SetParameter(const std::string& key, const std::string&
  } else if (key == "ignore-punctuation") {
    m_filePathSource = value;
  } else {
    return StatelessFeatureFunction::SetParameter(key, value);
    StatelessFeatureFunction::SetParameter(key, value);
  }
  return true;

}

void PhrasePairFeature::Load()

@ -46,7 +46,7 @@ public:
  }

  void Load();
  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

};

22
moses/FF/PhrasePenalty.cpp
Normal file
@ -0,0 +1,22 @@

#include "PhrasePenalty.h"
#include "moses/ScoreComponentCollection.h"

namespace Moses
{
PhrasePenalty::PhrasePenalty(const std::string &line)
  : StatelessFeatureFunction("PhrasePenalty",1, line)
{
  ReadParameters();
}

void PhrasePenalty::Evaluate(const Phrase &source
                             , const TargetPhrase &targetPhrase
                             , ScoreComponentCollection &scoreBreakdown
                             , ScoreComponentCollection &estimatedFutureScore) const
{
  scoreBreakdown.Assign(this, 1.0f);
}

} // namespace

24
moses/FF/PhrasePenalty.h
Normal file
@ -0,0 +1,24 @@
#pragma once

#include "StatelessFeatureFunction.h"

namespace Moses
{

class PhrasePenalty : public StatelessFeatureFunction
{
public:
  PhrasePenalty(const std::string &line);

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  virtual void Evaluate(const Phrase &source
                        , const TargetPhrase &targetPhrase
                        , ScoreComponentCollection &scoreBreakdown
                        , ScoreComponentCollection &estimatedFutureScore) const;
};

} //namespace
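PhrasePenalty above declares a single dense score, and later in this commit StaticData::LoadData learns to construct it from a `feature == "PhrasePenalty"` line and fetch its weight via GetScoreProducerDescription(). As a rough moses.ini sketch of switching the feature on under the [feature]/[weight] syntax this refactoring targets (the exact weight-name suffix is an assumption):

    [feature]
    PhrasePenalty

    [weight]
    PhrasePenalty0= 0.2

One weight is expected because the constructor passes 1 as the number of score components.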
@ -22,30 +22,18 @@ SourceWordDeletionFeature::SourceWordDeletionFeature(const std::string &line)
  m_unrestricted(true)
{
  std::cerr << "Initializing source word deletion feature.." << std::endl;

  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);
  ReadParameters();
}

bool SourceWordDeletionFeature::SetParameter(const std::string& key, const std::string& value)
void SourceWordDeletionFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "factor") {
    m_factorType = Scan<FactorType>(value);
  } else if (key == "path") {
    m_filename = value;
  } else {
    return StatelessFeatureFunction::SetParameter(key, value);
    StatelessFeatureFunction::SetParameter(key, value);
  }
  return true;
}

void SourceWordDeletionFeature::Load()

@ -37,7 +37,7 @@ public:
  const TargetPhrase& targetPhrase,
  ScoreComponentCollection* accumulator,
  const AlignmentInfo &alignmentInfo) const;
  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

};

@ -21,18 +21,7 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line)
  :StatefulFeatureFunction("TargetBigramFeature", 0, line)
{
  std::cerr << "Initializing target bigram feature.." << std::endl;

  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);
  ReadParameters();

  FactorCollection& factorCollection = FactorCollection::Instance();
  const Factor* bosFactor =
@ -41,7 +30,7 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line)

}

bool TargetBigramFeature::SetParameter(const std::string& key, const std::string& value)
void TargetBigramFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "factor") {
    m_factorType = Scan<FactorType>(value);
@ -50,7 +39,6 @@ bool TargetBigramFeature::SetParameter(const std::string& key, const std::string&
  } else {
    StatefulFeatureFunction::SetParameter(key, value);
  }
  return true;
}

void TargetBigramFeature::Load()

@ -47,7 +47,7 @@ public:
  ScoreComponentCollection* ) const {
    abort();
  }
  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

private:
  FactorType m_factorType;

@ -41,22 +41,10 @@ TargetNgramFeature::TargetNgramFeature(const std::string &line)
  :StatefulFeatureFunction("TargetNgramFeature", 0, line)
{
  std::cerr << "Initializing target ngram feature.." << std::endl;

  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);

  ReadParameters();
}

bool TargetNgramFeature::SetParameter(const std::string& key, const std::string& value)
void TargetNgramFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "factor") {
    m_factorType = Scan<FactorType>(value);
@ -65,9 +53,8 @@ bool TargetNgramFeature::SetParameter(const std::string& key, const std::string&
  } else if (key == "lower-ngrams") {
    m_lower_ngrams = Scan<bool>(value);
  } else {
    return StatefulFeatureFunction::SetParameter(key, value);
    StatefulFeatureFunction::SetParameter(key, value);
  }
  return true;
}

bool TargetNgramFeature::Load(const std::string &filePath)

@ -191,7 +191,7 @@ public:

  virtual FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureId,
                                 ScoreComponentCollection* accumulator) const;
  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

private:
  FactorType m_factorType;

@ -20,30 +20,18 @@ TargetWordInsertionFeature::TargetWordInsertionFeature(const std::string &line)
  m_unrestricted(true)
{
  std::cerr << "Initializing target word insertion feature.." << std::endl;
  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);

  ReadParameters();
}

bool TargetWordInsertionFeature::SetParameter(const std::string& key, const std::string& value)
void TargetWordInsertionFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "factor") {
    m_factorType = Scan<FactorType>(value);
  } else if (key == "path") {
    m_filename = value;
  } else {
    return StatelessFeatureFunction::SetParameter(key, value);
    StatelessFeatureFunction::SetParameter(key, value);
  }
  return true;
}

void TargetWordInsertionFeature::Load()

@ -37,7 +37,7 @@ public:
  const TargetPhrase& targetPhrase,
  ScoreComponentCollection* accumulator,
  const AlignmentInfo &alignmentInfo) const;
  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

};

@ -1,7 +1,17 @@
#include "UnknownWordPenaltyProducer.h"
#include <vector>
#include <string>

using namespace std;

namespace Moses
{
UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(const std::string &line)
  : StatelessFeatureFunction("UnknownWordPenalty",1, line)
{
  m_tuneable = false;
  ReadParameters();
}

}

@ -15,11 +15,7 @@ class WordsRange;
class UnknownWordPenaltyProducer : public StatelessFeatureFunction
{
public:
  UnknownWordPenaltyProducer(const std::string &line)
    : StatelessFeatureFunction("UnknownWordPenalty",1, line) {
    m_tuneable = false;
    CHECK(m_args.size() == 0);
  }
  UnknownWordPenaltyProducer(const std::string &line);

  bool IsUseable(const FactorMask &mask) const {
    return true;

@ -2,8 +2,16 @@
#include "moses/TargetPhrase.h"
#include "moses/ScoreComponentCollection.h"

using namespace std;

namespace Moses
{
WordPenaltyProducer::WordPenaltyProducer(const std::string &line)
  : StatelessFeatureFunction("WordPenalty",1, line)
{
  ReadParameters();
}

void WordPenaltyProducer::Evaluate(const Phrase &source
                                   , const TargetPhrase &targetPhrase
                                   , ScoreComponentCollection &scoreBreakdown

@ -12,10 +12,7 @@ class ScoreComponentCollection;
class WordPenaltyProducer : public StatelessFeatureFunction
{
public:
  WordPenaltyProducer(const std::string &line)
    : StatelessFeatureFunction("WordPenalty",1, line) {
    CHECK(m_args.size() == 0);
  }
  WordPenaltyProducer(const std::string &line);

  bool IsUseable(const FactorMask &mask) const {
    return true;

@ -26,18 +26,7 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line)
  ,m_domainTrigger(false)
{
  std::cerr << "Initializing word translation feature.. " << endl;

  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);
  ReadParameters();

  if (m_simple == 1) std::cerr << "using simple word translations.. ";
  if (m_sourceContext == 1) std::cerr << "using source context.. ";
@ -71,7 +60,7 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line)

}

bool WordTranslationFeature::SetParameter(const std::string& key, const std::string& value)
void WordTranslationFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "input-factor") {
    m_factorTypeSource = Scan<FactorType>(value);
@ -94,9 +83,8 @@ bool WordTranslationFeature::SetParameter(const std::string& key, const std::str
  } else if (key == "target-path") {
    m_filePathTarget = value;
  } else {
    return StatelessFeatureFunction::SetParameter(key, value);
    StatelessFeatureFunction::SetParameter(key, value);
  }
  return true;
}

void WordTranslationFeature::Load()

@ -52,7 +52,7 @@ public:

  void EvaluateChart(const ChartBasedFeatureContext& context,
                     ScoreComponentCollection* accumulator) const;
  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);
};

}

@ -38,14 +38,7 @@ namespace Moses
GenerationDictionary::GenerationDictionary(const std::string &line)
  : DecodeFeature("Generation", line)
{
  for (size_t i = 0; i < m_args.size(); ++i) {
    const vector<string> &args = m_args[i];

    if (args[0] == "path") {
      m_filePath = args[1];
    }
  }

  ReadParameters();
}

void GenerationDictionary::Load()
@ -133,5 +126,14 @@ const OutputWordCollection *GenerationDictionary::FindWord(const Word &word) con
  return ret;
}

void GenerationDictionary::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_filePath = value;
  } else {
    DecodeFeature::SetParameter(key, value);
  }
}

}

@ -68,6 +68,7 @@ public:
  * Or NULL if the input word isn't found. The search function used is the WordComparer functor
  */
  const OutputWordCollection *FindWord(const Word &word) const;
  void SetParameter(const std::string& key, const std::string& value);

};

@ -11,7 +11,6 @@ if $(with-dlib) {
}

alias headers : ../util//kenutil : : : $(max-factors) $(dlib) ;

alias ThreadPool : ThreadPool.cpp ;

if [ option.get "with-synlm" : no : yes ] = yes
@ -41,6 +40,7 @@ lib moses :
  TranslationModel/Scope3Parser/*.cpp
  TranslationModel/CYKPlusParser/*.cpp
  FF/*.cpp
  FF/OSM-Feature/*.cpp
  : #exceptions
  ThreadPool.cpp
  SyntacticLanguageModel.cpp

@ -383,9 +383,10 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
  try {
    lm::ngram::ModelType model_type;
    if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {

      switch(model_type) {
      case lm::ngram::PROBING:
        return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
        return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
      case lm::ngram::REST_PROBING:
        return new LanguageModelKen<lm::ngram::RestProbingModel>(description, line, file, factorType, lazy);
      case lm::ngram::TRIE:

@ -38,7 +38,7 @@ public:
  }
  LabelId add(const Key& k) {
    std::pair<typename M::iterator,bool> p
      =m.insert(std::make_pair(k,data.size()));
      =m.insert(std::make_pair(k,data.size()));
    if(p.second) data.push_back(k);
    CHECK(static_cast<size_t>(p.first->second)<data.size());
    return p.first->second;

@ -68,6 +68,7 @@ Manager::~Manager()
{
  delete m_transOptColl;
  delete m_search;
  // this is a comment ...

  StaticData::Instance().CleanUpAfterSentenceProcessing(m_source);
}

@ -275,13 +275,15 @@ bool Parameter::LoadParam(int argc, char* argv[])
  }

  // overwrite parameters with values from switches
  for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++) {
  for(PARAM_STRING::const_iterator iterParam = m_description.begin();
      iterParam != m_description.end(); iterParam++) {
    const string paramName = iterParam->first;
    OverwriteParam("-" + paramName, paramName, argc, argv);
  }

  // ... also shortcuts
  for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++) {
  for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
      iterParam != m_abbreviation.end(); iterParam++) {
    const string paramName = iterParam->first;
    const string paramShortName = iterParam->second;
    OverwriteParam("-" + paramShortName, paramName, argc, argv);
@ -294,7 +296,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
  verbose = Scan<int>(m_setting["verbose"][0]);
  if (verbose >= 1) { // only if verbose
    TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
    for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) {
    for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ;
        iterParam != m_setting.end(); iterParam++) {
      TRACE_ERR( "\t" << iterParam->first << ": ");
      for ( size_t i = 0; i < iterParam->second.size(); i++ )
        TRACE_ERR( iterParam->second[i] << " ");
@ -303,7 +306,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
  }

  // convert old weights args to new format
  if (!isParamSpecified("feature"))
  // WHAT IS GOING ON HERE??? - UG
  if (!isParamSpecified("feature")) // UG
    ConvertWeightArgs();
  CreateWeightsMap();
  WeightOverwrite();
@ -331,11 +335,11 @@ std::vector<float> &Parameter::GetWeights(const std::string &name)
{
  std::vector<float> &ret = m_weights[name];

  cerr << "WEIGHT " << name << "=";
  for (size_t i = 0; i < ret.size(); ++i) {
    cerr << ret[i] << ",";
  }
  cerr << endl;
  // cerr << "WEIGHT " << name << "=";
  // for (size_t i = 0; i < ret.size(); ++i) {
  // cerr << ret[i] << ",";
  // }
  // cerr << endl;
  return ret;
}

@ -357,7 +361,10 @@ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<floa
  newWeights.push_back(line);
}

void Parameter::AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights)
void
Parameter::
AddWeight(const std::string &name, size_t ind,
          const std::vector<float> &weights)
{
  PARAM_VEC &newWeights = m_setting["weight"];

@ -478,6 +485,12 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
    case Compact:
      ptType = "PhraseDictionaryCompact";
      break;
    case SuffixArray:
      ptType = "PhraseDictionarySuffixArray";
      break;
    case DSuffixArray:
      ptType = "PhraseDictionaryDynSuffixArray";
      break;
    default:
      break;
    }
@ -502,6 +515,9 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)

    ++currOldInd;
  }

  // cerr << weights.size() << " PHRASE TABLE WEIGHTS "
  // << __FILE__ << ":" << __LINE__ << endl;
  AddWeight(ptType, ptInd, weights);

  // actual pt
@ -527,7 +543,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
  ptLine << "num-features=" << numScoreComponent << " ";
  ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";

  if (implementation == SuffixArray) {
  if (implementation == SuffixArray || implementation == DSuffixArray) {
    ptLine << "target-path=" << token[5] << " ";
    ptLine << "alignment-path=" << token[6] << " ";
  }

@ -61,6 +61,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/FF/DistortionScoreProducer.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "moses/FF/InputFeature.h"
#include "moses/FF/PhrasePenalty.h"
#include "moses/FF/OSM-Feature/OpSequenceModel.h"

#include "LM/Ken.h"
#ifdef LM_IRST
@ -691,6 +693,14 @@ bool StaticData::LoadData(Parameter *parameter)
      PhraseDictionaryDynSuffixArray* model = new PhraseDictionaryDynSuffixArray(line);
      vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
      SetWeights(model, weights);
    } else if (feature == "OpSequenceModel") {
      OpSequenceModel* model = new OpSequenceModel(line);
      vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
      SetWeights(model, weights);
    } else if (feature == "PhrasePenalty") {
      PhrasePenalty* model = new PhrasePenalty(line);
      vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
      SetWeights(model, weights);
    }

#ifdef HAVE_SYNLM
@ -938,7 +948,7 @@ const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGra
  boost::mutex::scoped_lock lock(m_transOptCacheMutex);
#endif
  std::map<std::pair<std::pair<size_t, std::string>, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter
    = m_transOptCache.find(key);
    = m_transOptCache.find(key);
  if (iter == m_transOptCache.end())
    return NULL;
  iter->second.second = clock(); // update last used time
@ -1166,7 +1176,6 @@ void StaticData::LoadFeatureFunctions()
  }
}

// load phrase table
for (size_t i = 0; i < m_phraseDictionary.size(); ++i) {
  PhraseDictionary *pt = m_phraseDictionary[i];
  pt->Load();

@ -673,7 +673,7 @@ public:
    return false;
  }
  std::map< std::string, std::set< std::string > >::const_iterator lookupIgnoreFF
    = m_weightSettingIgnoreFF.find( m_currentWeightSetting );
    = m_weightSettingIgnoreFF.find( m_currentWeightSetting );
  if (lookupIgnoreFF == m_weightSettingIgnoreFF.end()) {
    return false;
  }
@ -691,7 +691,7 @@ public:
    return false;
  }
  std::map< std::string, std::set< size_t > >::const_iterator lookupIgnoreDP
    = m_weightSettingIgnoreDP.find( m_currentWeightSetting );
    = m_weightSettingIgnoreDP.find( m_currentWeightSetting );
  if (lookupIgnoreDP == m_weightSettingIgnoreDP.end()) {
    return false;
  }

@ -35,11 +35,11 @@ struct CompareTargetPhrase {

void TargetPhraseCollection::NthElement(size_t tableLimit)
{
  vector<TargetPhrase*>::iterator
    iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit) ? m_collection.end() : m_collection.begin() + tableLimit;

  //std::sort(m_collection.begin(), m_collection.end(), CompareTargetPhrase());
  std::nth_element(m_collection.begin(), iterMiddle, m_collection.end(), CompareTargetPhrase());
  vector<TargetPhrase*>::iterator nth;
  nth = (tableLimit && tableLimit <= m_collection.size()
         ? m_collection.begin() + tableLimit
         : m_collection.end());
  std::nth_element(m_collection.begin(), nth, m_collection.end(), CompareTargetPhrase());
}

void TargetPhraseCollection::Prune(bool adhereTableLimit, size_t tableLimit)
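Both the old and new bodies above rely on std::nth_element: it partitions the collection so that the best tableLimit entries (under CompareTargetPhrase) occupy the front at average linear cost, rather than paying for a full O(n log n) sort. A self-contained illustration with made-up scores:

    #include <algorithm>
    #include <functional>
    #include <iostream>
    #include <vector>

    int main() {
      // Hypothetical scores standing in for target-phrase ranks.
      std::vector<float> scores = {0.1f, 0.9f, 0.4f, 0.7f, 0.2f, 0.8f};
      const size_t tableLimit = 3;

      // Afterwards the three largest values occupy scores[0..2] in some
      // order; the rest of the vector is left unsorted.
      std::nth_element(scores.begin(), scores.begin() + tableLimit,
                       scores.end(), std::greater<float>());

      for (size_t i = 0; i < tableLimit; ++i)
        std::cout << scores[i] << " ";  // 0.9 0.8 0.7 in some order
      std::cout << "\n";
      return 0;
    }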
File diff suppressed because it is too large
@ -5,23 +5,29 @@
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/TranslationModel/WordCoocTable.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include "moses/TargetPhraseCollection.h"
#include <map>

using namespace std;
namespace Moses
{
class PhraseDictionaryDynSuffixArray;

/** @todo ask Abbey Levenberg
*/
class SAPhrase
{
public:
  std::vector<wordID_t> words;
  vector<wordID_t> words;

  SAPhrase(size_t phraseSize)
    :words(phraseSize) {
  }
    :words(phraseSize)
  {}

  void SetId(size_t pos, wordID_t id) {
    CHECK(pos < words.size());
@ -43,12 +49,16 @@ public:
    , m_endTarget(endTarget)
    , m_startSource(startSource)
    , m_endSource(endSource)
    , m_sntIndex(sntIndex) {
  }
    , m_sntIndex(sntIndex)
  {}

  size_t GetTargetSize() const {
    return m_endTarget - m_startTarget + 1;
  }

  size_t GetSourceSize() const {
    return m_endSource - m_startSource + 1;
  }
};

/** @todo ask Abbey Levenberg
@ -58,32 +68,43 @@ class SentenceAlignment
public:
  SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
  int m_sntIndex;
  std::vector<wordID_t>* trgSnt;
  std::vector<wordID_t>* srcSnt;
  std::vector<int> numberAligned;
  std::vector< std::vector<int> > alignedList;
  bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
  vector<wordID_t>* trgSnt;
  vector<wordID_t>* srcSnt;
  vector<int> numberAligned;
  vector< vector<int> > alignedList;
  bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret,
               int startSource, int endSource) const;
};

class ScoresComp
{
public:
  ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
  ScoresComp(const vector<float>& weights): m_weights(weights) {}
  bool operator()(const Scores& s1, const Scores& s2) const {
    return s1[0] < s2[0]; // just p(e|f) as approximation
    /*float score1(0), score2(0);
    int idx1(0), idx2(0);
    for (Scores::const_iterator itr = s1.begin();
         itr != s1.end(); ++itr) {
      score1 += log(*itr * m_weights.at(idx1++));
    }
    for (Scores::const_iterator itr = s2.begin();
         itr != s2.end(); ++itr) {
      score2 += log(*itr * m_weights.at(idx2++));
    }
    return score1 < score2;*/
    // float score1(0), score2(0);
    // int idx1(0), idx2(0);
    // for (Scores::const_iterator itr = s1.begin();
    //      itr != s1.end(); ++itr) {
    //   score1 += log(*itr * m_weights.at(idx1++));
    // }
    // for (Scores::const_iterator itr = s2.begin();
    //      itr != s2.end(); ++itr) {
    //   score2 += log(*itr * m_weights.at(idx2++));
    // }
    // return score1 < score2;
  }
private:
  const std::vector<float>& m_weights;
  const vector<float>& m_weights;
};

struct BetterPhrase {
  ScoresComp const& cmp;
  BetterPhrase(ScoresComp const& sc);
  // bool operator()(pair<Scores, TargetPhrase const*> const& a,
  //                 pair<Scores, TargetPhrase const*> const& b) const;
  bool operator()(pair<Scores, SAPhrase const*> const& a,
                  pair<Scores, SAPhrase const*> const& b) const;
};

/** @todo ask Abbey Levenberg
@ -93,66 +114,70 @@ class BilingualDynSuffixArray
public:
  BilingualDynSuffixArray();
  ~BilingualDynSuffixArray();
  bool Load( const std::vector<FactorType>& inputFactors,
             const std::vector<FactorType>& outputTactors,
             std::string source, std::string target, std::string alignments,
             const std::vector<float> &weight);
  bool LoadTM( const std::vector<FactorType>& inputFactors,
               const std::vector<FactorType>& outputTactors,
               std::string source, std::string target, std::string alignments,
               const std::vector<float> &weight);
  void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
  void addSntPair(string& source, string& target, string& alignment);
private:
  DynSuffixArray* m_srcSA;
  DynSuffixArray* m_trgSA;
  std::vector<wordID_t>* m_srcCorpus;
  std::vector<wordID_t>* m_trgCorpus;
  std::vector<FactorType> m_inputFactors;
  std::vector<FactorType> m_outputFactors;
  bool Load( const vector<FactorType>& inputFactors,
             const vector<FactorType>& outputTactors,
             string source, string target, string alignments,
             const vector<float> &weight);
  // bool LoadTM( const vector<FactorType>& inputFactors,
  //              const vector<FactorType>& outputTactors,
  //              string source, string target, string alignments,
  //              const vector<float> &weight);
  void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const;

  std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
  void CleanUp(const InputType& source);
  void addSntPair(string& source, string& target, string& alignment);
  pair<float,float>
  GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const;

  TargetPhrase*
  GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;

private:

  mutable WordCoocTable m_wrd_cooc;
  DynSuffixArray * m_srcSA;
  DynSuffixArray * m_trgSA;
  vector<wordID_t>* m_srcCorpus;
  vector<wordID_t>* m_trgCorpus;
  vector<FactorType> m_inputFactors;
  vector<FactorType> m_outputFactors;

  vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;

  Vocab* m_srcVocab, *m_trgVocab;
  ScoresComp* m_scoreCmp;

  std::vector<SentenceAlignment> m_alignments;
  std::vector<std::vector<short> > m_rawAlignments;
  vector<SentenceAlignment> m_alignments;
  vector<vector<short> > m_rawAlignments;

  mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
  mutable std::set<wordID_t> m_freqWordsCached;
  mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache;
  mutable set<wordID_t> m_freqWordsCached;
  const size_t m_maxPhraseLength, m_maxSampleSize;

  int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
                 std::vector<wordID_t>&, std::vector<wordID_t>&,
  const size_t m_maxPTEntries;
  int LoadCorpus(FactorDirection direction,
                 InputFileStream&, const vector<FactorType>& factors,
                 vector<wordID_t>&, vector<wordID_t>&,
                 Vocab*);
  int LoadAlignments(InputFileStream& aligs);
  int LoadRawAlignments(InputFileStream& aligs);
  int LoadRawAlignments(string& aligs);

  bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
  bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;
  SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
  int SampleSelection(std::vector<unsigned>&, int = 300) const;
  int SampleSelection(vector<unsigned>&, int = 300) const;

  std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
  TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
  vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const;
  SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
  bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
  void CacheWordProbs(wordID_t) const;
  void CacheFreqWords() const;
  void ClearWordInCache(wordID_t);
  std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
  pair<float, float> GetLexicalWeight(const PhrasePair&) const;

  int GetSourceSentenceSize(size_t sentenceId) const;
  int GetTargetSentenceSize(size_t sentenceId) const;

  int GetSourceSentenceSize(size_t sentenceId) const {
    return (sentenceId==m_srcSntBreaks.size()-1) ?
           m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
           m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
  }
  int GetTargetSentenceSize(size_t sentenceId) const {
    return (sentenceId==m_trgSntBreaks.size()-1) ?
           m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
           m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
  }
};
} // end namespace
#endif

@ -234,12 +234,12 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(

  std::vector<float> weightT = staticData.GetWeights(&m_dictionary);
  targetPhraseCollection
    = tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
                                       ,m_outputFactorsVec
                                       ,m_dictionary
                                       ,weightT
                                       ,m_filePath
                                       , m_dbWrapper.GetVocab());
    = tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
                                       ,m_outputFactorsVec
                                       ,m_dictionary
                                       ,weightT
                                       ,m_filePath
                                       , m_dbWrapper.GetVocab());

  delete tpcollBerkeleyDb;
  m_cache[tpCollFilePos] = targetPhraseCollection;

@ -428,7 +428,7 @@ void CompressionTaskReordering::operator()()
  while(scoresNum < m_encodedScores.size()) {
    std::string scores = m_encodedScores[scoresNum];
    std::string compressedScores
      = m_creator.CompressEncodedScores(scores);
      = m_creator.CompressEncodedScores(scores);

    std::string dummy;
    PackedItem packedItem(scoresNum, dummy, compressedScores, 0);

@ -61,7 +61,7 @@ PhraseDecoder::~PhraseDecoder()
inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
{
  boost::unordered_map<std::string, unsigned>::iterator it
    = m_sourceSymbolsMap.find(symbol);
    = m_sourceSymbolsMap.find(symbol);
  if(it != m_sourceSymbolsMap.end())
    return it->second;

@ -200,7 +200,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &

  if(m_coding == PREnc) {
    std::pair<TargetPhraseVectorPtr, size_t> cachedPhraseColl
      = m_decodingCache.Retrieve(sourcePhrase);
      = m_decodingCache.Retrieve(sourcePhrase);

    // Has been cached and is complete or does not need to be completed
    if(cachedPhraseColl.first != NULL && (!topLevel || cachedPhraseColl.second == 0))
@ -255,7 +255,7 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  if(m_coding == REnc) {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
      std::string sourceWord
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }

@ -41,6 +41,17 @@ using namespace std;
namespace Moses
{

PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line)
  :PhraseDictionary("PhraseDictionaryCompact", line)
  ,m_inMemory(true)
  ,m_useAlignmentInfo(true)
  ,m_hash(10, 16)
  ,m_phraseDecoder(0)
  ,m_weight(0)
{
  ReadParameters();
}

void PhraseDictionaryCompact::Load()
{
  const StaticData &staticData = StaticData::Instance();
@ -106,7 +117,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c

  // Retrieve target phrase collection from phrase table
  TargetPhraseVectorPtr decodedPhraseColl
    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);

  if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
    TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));

@ -68,14 +68,7 @@ protected:

  std::vector<float> m_weight;
public:
  PhraseDictionaryCompact(const std::string &line)
    :PhraseDictionary("PhraseDictionaryCompact", line)
    ,m_inMemory(true)
    ,m_useAlignmentInfo(true)
    ,m_hash(10, 16)
    ,m_phraseDecoder(0)
    ,m_weight(0) {
  }
  PhraseDictionaryCompact(const std::string &line);

  ~PhraseDictionaryCompact();

@ -426,7 +426,7 @@ void PhraseTableCreator::AddTargetSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
{
  boost::unordered_map<std::string, unsigned>::iterator it
    = m_sourceSymbolsMap.find(symbol);
    = m_sourceSymbolsMap.find(symbol);

  if(it != m_sourceSymbolsMap.end())
    return it->second;
@ -437,7 +437,7 @@ unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol)
{
  boost::unordered_map<std::string, unsigned>::iterator it
    = m_targetSymbolsMap.find(symbol);
    = m_targetSymbolsMap.find(symbol);

  if(it != m_targetSymbolsMap.end())
    return it->second;
@ -451,7 +451,7 @@ unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol)
  boost::mutex::scoped_lock lock(m_mutex);
#endif
  boost::unordered_map<std::string, unsigned>::iterator it
    = m_targetSymbolsMap.find(symbol);
    = m_targetSymbolsMap.find(symbol);

  if(it != m_targetSymbolsMap.end())
    return it->second;
@ -1200,7 +1200,7 @@ void CompressionTask::operator()()
  while(collectionNum < m_encodedCollections.size()) {
    std::string collection = m_encodedCollections[collectionNum];
    std::string compressedCollection
      = m_creator.CompressEncodedCollection(collection);
      = m_creator.CompressEncodedCollection(collection);

    std::string dummy;
    PackedItem packedItem(collectionNum, dummy, compressedCollection, 0);

@ -143,7 +143,7 @@ public:
    return data;
  else {
    typename std::vector<DataType>::iterator it
      = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
      = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
    if(it != m_bestVec.end())
      return *it;
    else

@ -1,5 +1,6 @@
#include "DynSuffixArray.h"
#include <iostream>
#include <boost/foreach.hpp>

using namespace std;

@ -215,8 +216,37 @@ void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */
  return;
}

ComparePosition::
ComparePosition(vuint_t const& crp, vuint_t const& sfa)
  : m_crp(crp), m_sfa(sfa) { }

bool
ComparePosition::
operator()(unsigned const& i, vector<wordID_t> const& phrase) const
{
  unsigned const* x = &m_crp.at(i);
  unsigned const* e = &m_crp.back();
  size_t k = 0;
  for (; k < phrase.size() && x < e; ++k, ++x)
    if (*x != phrase[k]) return *x < phrase[k];
  return (x == e && k < phrase.size());
}

bool
ComparePosition::
operator()(vector<wordID_t> const& phrase, unsigned const& i) const
{
  unsigned const* x = &m_crp.at(i);
  unsigned const* e = &m_crp.back();
  size_t k = 0;
  for (; k < phrase.size() && x < e; ++k, ++x)
    if (*x != phrase[k]) return phrase[k] < *x;
  return false; // (k == phrase.size() && x < e);
}

bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
{
  // DOES THIS EVEN WORK WHEN A DynSuffixArray has been saved and reloaded????
  pair<vuint_t::iterator,vuint_t::iterator> bounds;
  indices->clear();
  size_t phrasesize = phrase->size();
@ -251,6 +281,16 @@ bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
  return (indices->size() > 0);
}

size_t
DynSuffixArray::
GetCount(vuint_t const& phrase) const
{
  ComparePosition cmp(*m_corpus, *m_SA);
  vuint_t::const_iterator lb = lower_bound(m_SA->begin(), m_SA->end(), phrase, cmp);
  vuint_t::const_iterator ub = upper_bound(m_SA->begin(), m_SA->end(), phrase, cmp);
  return ub-lb;
}

void DynSuffixArray::Save(FILE* fout)
{
  fWriteVector(fout, *m_SA);

@ -11,9 +11,25 @@

namespace Moses
{

using namespace std;
typedef std::vector<unsigned> vuint_t;

/// compare position /i/ in the suffix array /m_sfa/ into corpus /m_crp/
/// against reference phrase /phrase/
// added by Ulrich Germann
class ComparePosition
{
  vuint_t const& m_crp;
  vuint_t const& m_sfa;

public:
  ComparePosition(vuint_t const& crp, vuint_t const& sfa);
  bool operator()(unsigned const& i, vector<wordID_t> const& phrase) const;
  bool operator()(vector<wordID_t> const& phrase, unsigned const& i) const;
};

/** @todo ask Abbey Levenberg
*/
class DynSuffixArray
@ -30,6 +46,8 @@ public:
  void Delete(unsigned, unsigned);
  void Substitute(vuint_t*, unsigned);

  size_t GetCount(vuint_t const& phrase) const;

private:
  vuint_t* m_SA;
  vuint_t* m_ISA;
@ -46,10 +64,10 @@ private:
  void PrintAuxArrays() {
    std::cerr << "SA\tISA\tF\tL\n";
    for(size_t i=0; i < m_SA->size(); ++i)
      std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
      std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t"
                << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
  }
};

} //end namespace

#endif

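The new GetCount above counts phrase occurrences by binary-searching the suffix array with ComparePosition and taking the width of the [lower_bound, upper_bound) range, i.e. O(|phrase| log N) without materialising the positions. The same idea in a self-contained, character-level toy (corpus and query are made up):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      std::string corpus = "banana";
      std::vector<int> sa(corpus.size());
      for (size_t i = 0; i < sa.size(); ++i) sa[i] = (int)i;
      // Sort suffix start positions lexicographically -> a suffix array.
      std::sort(sa.begin(), sa.end(), [&](int a, int b) {
        return corpus.compare(a, std::string::npos, corpus, b, std::string::npos) < 0;
      });

      std::string query = "an";
      // Suffixes beginning with the query form one contiguous block.
      auto lo = [&](int i, const std::string &q) { return corpus.compare(i, q.size(), q) < 0; };
      auto hi = [&](const std::string &q, int i) { return corpus.compare(i, q.size(), q) > 0; };
      auto lb = std::lower_bound(sa.begin(), sa.end(), query, lo);
      auto ub = std::upper_bound(sa.begin(), sa.end(), query, hi);
      std::cout << query << " occurs " << (ub - lb) << " times\n";  // prints 2
      return 0;
    }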
@ -34,16 +34,6 @@ PhraseDictionary::PhraseDictionary(const std::string &description, const std::st
  :DecodeFeature(description, line)
  ,m_tableLimit(20) // default
{
  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
}

@ -54,16 +44,15 @@ GetTargetPhraseCollection(InputType const& src,WordsRange const& range) const
  return GetTargetPhraseCollection(phrase);
}

bool PhraseDictionary::SetParameter(const std::string& key, const std::string& value)
void PhraseDictionary::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_filePath = value;
  } else if (key == "table-limit") {
    m_tableLimit = Scan<size_t>(value);
  } else {
    return DecodeFeature::SetParameter(key, value);
    DecodeFeature::SetParameter(key, value);
  }
  return true;
}

void PhraseDictionary::SetFeaturesToApply()

@ -91,7 +91,7 @@ public:
    return m_featuresToApply;
  }

  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

protected:
  size_t m_tableLimit;

@ -0,0 +1,4 @@
Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini

[ttable-file]
14 0 0 5 <source language text file> <target language text file> <file with alignment info in symal format>
@ -3,84 +3,35 @@
#include "moses/StaticData.h"
#include "moses/TargetPhrase.h"
#include <iomanip>

#include <boost/foreach.hpp>
using namespace std;

namespace Moses
{
PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line)
  :PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
PhraseDictionaryDynSuffixArray::
PhraseDictionaryDynSuffixArray(const std::string &line)
  : PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
  ,m_biSA(new BilingualDynSuffixArray())
{
  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);

  ReadParameters();
}

PhraseDictionaryDynSuffixArray::~PhraseDictionaryDynSuffixArray()
{
  delete m_biSA;
}

void PhraseDictionaryDynSuffixArray::Load()
{
  SetFeaturesToApply();

  const StaticData &staticData = StaticData::Instance();
  vector<float> weight = staticData.GetWeights(this);

  m_biSA->Load( m_input, m_output, m_source, m_target, m_alignments, weight);
  vector<float> weight = StaticData::Instance().GetWeights(this);
  m_biSA->Load(m_input, m_output, m_source, m_target, m_alignments, weight);
}

const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCollection(const Phrase& src) const
PhraseDictionaryDynSuffixArray::
~PhraseDictionaryDynSuffixArray()
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();
  std::vector< std::pair< Scores, TargetPhrase*> > trg;
  // extract target phrases and their scores from suffix array
  m_biSA->GetTargetPhrasesByLexicalWeight( src, trg);

  std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
  for(itr = trg.begin(); itr != trg.end(); ++itr) {
    Scores scoreVector = itr->first;
    TargetPhrase *targetPhrase = itr->second;
    //std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),NegateScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->Evaluate(src);

    //cout << *targetPhrase << "\t" << std::setprecision(8) << scoreVector[2] << endl;
    ret->Add(targetPhrase);
  }
  ret->NthElement(m_tableLimit); // sort the phrases for the decoder
  return ret;
  delete m_biSA;
}

void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
{
  m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
  //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
  // need to implement --
}

ChartRuleLookupManager *PhraseDictionaryDynSuffixArray::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
  throw "Chart decoding not supported by PhraseDictionaryDynSuffixArray";
}

bool PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value)
void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "source") {
    m_source = value;
@ -89,9 +40,66 @@ bool PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const
  } else if (key == "alignment") {
    m_alignments = value;
  } else {
    return PhraseDictionary::SetParameter(key, value);
    PhraseDictionary::SetParameter(key, value);
  }
  return true;
}

const TargetPhraseCollection*
PhraseDictionaryDynSuffixArray::
GetTargetPhraseCollection(const Phrase& src) const
{
  typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
  map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics
  m_biSA->GatherCands(src,pstats);

  TargetPhraseCollection *ret = new TargetPhraseCollection();
  BOOST_FOREACH(pstat_entry & e, pstats) {
    TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src);
    tp->GetScoreBreakdown().Assign(this,e.second);
    ret->Add(tp);
  }
  // return ret;
  // TargetPhraseCollection *ret = new TargetPhraseCollection();
  // std::vector< std::pair< Scores, TargetPhrase*> > trg;
  //
  // // extract target phrases and their scores from suffix array
  // m_biSA->GetTargetPhrasesByLexicalWeight(src, trg);
  //
  // std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
  // for(itr = trg.begin(); itr != trg.end(); ++itr) {
  //   Scores scoreVector = itr->first;
  //   TargetPhrase *targetPhrase = itr->second;
  //   std::transform(scoreVector.begin(),scoreVector.end(),
  //                  scoreVector.begin(),FloorScore);
  //   targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
  //   targetPhrase->Evaluate();
  //   ret->Add(targetPhrase);
  // }
  ret->NthElement(m_tableLimit); // sort the phrases for the decoder
  return ret;
}

void
PhraseDictionaryDynSuffixArray::
insertSnt(string& source, string& target, string& alignment)
{
  m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
  //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}

void
PhraseDictionaryDynSuffixArray::
deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
  // need to implement --
}

ChartRuleLookupManager*
PhraseDictionaryDynSuffixArray::
CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
  CHECK(false);
  return 0;
}

}// end namespace

@ -17,21 +17,19 @@ class PhraseDictionaryDynSuffixArray: public PhraseDictionary
public:
  PhraseDictionaryDynSuffixArray(const std::string &line);
  ~PhraseDictionaryDynSuffixArray();

  bool InitDictionary();
  void Load();

  // functions below required by base class
  const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
  void insertSnt(string&, string&, string&);
  void deleteSnt(unsigned, unsigned);
  ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&);

  bool SetParameter(const std::string& key, const std::string& value);

  void SetParameter(const std::string& key, const std::string& value);
private:
  BilingualDynSuffixArray *m_biSA;
  std::string m_source, m_target, m_alignments;

  std::vector<float> m_weight;
};

} // end namespace

@ -39,6 +39,11 @@ using namespace std;

namespace Moses
{
PhraseDictionaryMemory::PhraseDictionaryMemory(const std::string &line)
  : RuleTableTrie("PhraseDictionaryMemory", line)
{
  ReadParameters();
}

TargetPhraseCollection &PhraseDictionaryMemory::GetOrCreateTargetPhraseCollection(
  const Phrase &source

@ -43,10 +43,7 @@ protected:
  }

public:
  PhraseDictionaryMemory(const std::string &line)
    : RuleTableTrie("PhraseDictionaryMemory", line) {
    CHECK(m_args.size() == 0);
  }
  PhraseDictionaryMemory(const std::string &line);

  const PhraseDictionaryNodeMemory &GetRootNode() const {
    return m_collection;

@ -28,17 +28,7 @@ namespace Moses
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
  :PhraseDictionary("PhraseDictionaryMultiModel", line)
{
  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);
  ReadParameters();

  if (m_mode != "interpolate") {
    ostringstream msg;
@ -56,23 +46,12 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &description, const std::string &line)
  :PhraseDictionary(description, line)
{
  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }

  if (description == "PhraseDictionaryMultiModelCounts") {
    CHECK(m_pdStr.size() == m_multimodelweights.size() || m_pdStr.size()*4 == m_multimodelweights.size());
  }
}

bool PhraseDictionaryMultiModel::SetParameter(const std::string& key, const std::string& value)
void PhraseDictionaryMultiModel::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "mode") {
    m_mode = value;
@ -82,9 +61,8 @@ bool PhraseDictionaryMultiModel::SetParameter(const std::string& key, const std:
  } else if (key == "lambda") {
    m_multimodelweights = Tokenize<float>(value, ",");
  } else {
    return PhraseDictionary::SetParameter(key, value);
    PhraseDictionary::SetParameter(key, value);
  }
  return true;
}

PhraseDictionaryMultiModel::~PhraseDictionaryMultiModel()

@ -81,7 +81,7 @@ public:
  /* Don't do anything source specific here as this object is shared between threads.*/
  }
  ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&);
  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

  const std::vector<float>* GetTemporaryMultiModelWeightsVector() const;
  void SetTemporaryMultiModelWeightsVector(std::vector<float> weights);

@ -68,17 +68,7 @@ PhraseDictionaryMultiModelCounts::PhraseDictionaryMultiModelCounts(const std::st
  //m_mode = "interpolate";
  //m_combineFunction = LinearInterpolationFromCounts;
  cerr << "m_args=" << m_args.size() << endl;
  size_t ind = 0;
  while (ind < m_args.size()) {
    vector<string> &args = m_args[ind];
    bool consumed = SetParameter(args[0], args[1]);
    if (consumed) {
      m_args.erase(m_args.begin() + ind);
    } else {
      ++ind;
    }
  }
  CHECK(m_args.size() == 0);
  ReadParameters();

  CHECK(m_targetTable.size() == m_pdStr.size());

@ -94,7 +84,7 @@ PhraseDictionaryMultiModelCounts::PhraseDictionaryMultiModelCounts(const std::st

}

bool PhraseDictionaryMultiModelCounts::SetParameter(const std::string& key, const std::string& value)
void PhraseDictionaryMultiModelCounts::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "mode") {
    m_mode = value;
@ -107,10 +97,8 @@ bool PhraseDictionaryMultiModelCounts::SetParameter(const std::string& key, cons
  } else if (key == "target-table") {
    m_targetTable = Tokenize(value, ",");
  } else {
    return PhraseDictionaryMultiModel::SetParameter(key, value);
    PhraseDictionaryMultiModel::SetParameter(key, value);
  }

  return true;
}

PhraseDictionaryMultiModelCounts::~PhraseDictionaryMultiModelCounts()

@ -103,7 +103,7 @@ public:
  /* Don't do anything source specific here as this object is shared between threads.*/
  }

  bool SetParameter(const std::string& key, const std::string& value);
  void SetParameter(const std::string& key, const std::string& value);

private:
  std::vector<PhraseDictionary*> m_inverse_pd;

@ -29,7 +29,7 @@ PhraseDictionaryTreeAdaptor::
PhraseDictionaryTreeAdaptor(const std::string &line)
  : PhraseDictionary("PhraseDictionaryBinary", line)
{
  CHECK(m_args.size() == 0);
  ReadParameters();
}

PhraseDictionaryTreeAdaptor::~PhraseDictionaryTreeAdaptor()

@ -3,11 +3,17 @@
#ifndef moses_PhraseDictionaryTreeAdaptor_h
#define moses_PhraseDictionaryTreeAdaptor_h

#include <vector>
#include "util/check.hh"
#include "moses/TypeDef.h"
#include "moses/TargetPhraseCollection.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "util/check.hh"
#include <vector>

#ifdef WITH_THREADS
#include <boost/thread/tss.hpp>
|
||||
#else
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
#endif
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -24,7 +30,11 @@ class PhraseDictionaryTreeAdaptor : public PhraseDictionary
|
||||
{
|
||||
typedef PhraseDictionary MyBase;
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
boost::thread_specific_ptr<PDTAimp> m_implementation;
|
||||
#else
|
||||
boost::scoped_ptr<PDTAimp> m_implementation;
|
||||
#endif
|
||||
|
||||
friend class PDTAimp;
|
||||
PhraseDictionaryTreeAdaptor();
|
||||
|
@ -27,7 +27,8 @@ PhraseDictionaryALSuffixArray::PhraseDictionaryALSuffixArray(const std::string &
|
||||
if (staticData.ThreadCount() > 1) {
|
||||
throw runtime_error("Suffix array implementation is not threadsafe");
|
||||
}
|
||||
CHECK(m_args.size() == 0);
|
||||
|
||||
ReadParameters();
|
||||
}
|
||||
|
||||
void PhraseDictionaryALSuffixArray::Load()
|
||||
|
@ -30,6 +30,12 @@ using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
|
||||
: MyBase("PhraseDictionaryOnDisk", line)
|
||||
{
|
||||
ReadParameters();
|
||||
}
|
||||
|
||||
PhraseDictionaryOnDisk::~PhraseDictionaryOnDisk()
|
||||
{
|
||||
}
|
||||
|
@ -30,6 +30,12 @@
|
||||
#include "OnDiskPt/PhraseNode.h"
|
||||
#include "util/check.hh"
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
#include <boost/thread/tss.hpp>
|
||||
#else
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
#endif
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
class TargetPhraseCollection;
|
||||
@ -43,16 +49,17 @@ class PhraseDictionaryOnDisk : public PhraseDictionary
|
||||
friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryOnDisk&);
|
||||
|
||||
protected:
|
||||
#ifdef WITH_THREADS
|
||||
boost::thread_specific_ptr<OnDiskPt::OnDiskWrapper> m_implementation;
|
||||
#else
|
||||
boost::scoped_ptr<OnDiskPt::OnDiskWrapper> m_implementation;
|
||||
#endif
|
||||
|
||||
OnDiskPt::OnDiskWrapper &GetImplementation();
|
||||
const OnDiskPt::OnDiskWrapper &GetImplementation() const;
|
||||
|
||||
public:
|
||||
PhraseDictionaryOnDisk(const std::string &line)
|
||||
: MyBase("PhraseDictionaryOnDisk", line) {
|
||||
CHECK(m_args.size() == 0);
|
||||
}
|
||||
PhraseDictionaryOnDisk(const std::string &line);
|
||||
~PhraseDictionaryOnDisk();
|
||||
void Load();
|
||||
|
||||
|
@ -48,12 +48,6 @@ public:
|
||||
|
||||
void Load();
|
||||
|
||||
// Required by PhraseDictionary.
|
||||
virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const {
|
||||
CHECK(false);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class RuleTableLoader;
|
||||
|
||||
|
moses/TranslationModel/WordCoocTable.cpp (new file, 72 lines)
@ -0,0 +1,72 @@
#include "moses/TranslationModel/WordCoocTable.h"
using namespace std;
namespace Moses
{

WordCoocTable::
WordCoocTable()
{
m_cooc.reserve(1000000);
m_marg1.reserve(1000000);
m_marg2.reserve(1000000);
}

WordCoocTable::
WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2)
: m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0)
{}

void
WordCoocTable::
Count(size_t const a, size_t const b)
{
while (a >= m_marg1.size()) {
m_cooc.push_back(my_map_t());
m_marg1.push_back(0);
}
while (b >= m_marg2.size())
m_marg2.push_back(0);
++m_marg1[a];
++m_marg2[b];
++m_cooc[a][b];
}

uint32_t
WordCoocTable::
GetJoint(size_t const a, size_t const b) const
{
if (a >= m_marg1.size() || b >= m_marg2.size()) return 0;
my_map_t::const_iterator m = m_cooc.at(a).find(b);
if (m == m_cooc[a].end()) return 0;
return m->second;
}

uint32_t
WordCoocTable::
GetMarg1(size_t const x) const
{
return x >= m_marg1.size() ? 0 : m_marg1[x];
}

uint32_t
WordCoocTable::
GetMarg2(size_t const x) const
{
return x >= m_marg2.size() ? 0 : m_marg2[x];
}

float
WordCoocTable::
pfwd(size_t const a, size_t const b) const
{
return float(GetJoint(a,b))/GetMarg1(a);
}

float
WordCoocTable::
pbwd(size_t const a, size_t const b) const
{
// cerr << "at " << __FILE__ << ":" << __LINE__ << endl;
return float(GetJoint(a,b))/GetMarg2(b);
}
}

moses/TranslationModel/WordCoocTable.h (new file, 72 lines)
@ -0,0 +1,72 @@
#ifndef moses_WordCoocTable_h
#define moses_WordCoocTable_h

#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include <map>

namespace Moses
{

using namespace std;

#ifndef bitvector
typedef boost::dynamic_bitset<uint64_t> bitvector;
#endif

/**
 * Stores word cooccurrence counts
 * @todo ask Uli Germann
 */
class WordCoocTable
{
typedef map<wordID_t,uint32_t> my_map_t;
vector<my_map_t> m_cooc;
vector<uint32_t> m_marg1;
vector<uint32_t> m_marg2;
public:
WordCoocTable();
WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2);
uint32_t GetJoint(size_t const a, size_t const b) const;
uint32_t GetMarg1(size_t const x) const;
uint32_t GetMarg2(size_t const x) const;
float pfwd(size_t const a, size_t const b) const;
float pbwd(size_t const a, size_t const b) const;
void
Count(size_t const a, size_t const b);

template<typename idvec, typename alnvec>
void
Count(idvec const& s1, idvec const& s2, alnvec const& aln,
wordID_t const NULL1, wordID_t const NULL2);

};

template<typename idvec, typename alnvec>
void
WordCoocTable::
Count(idvec const& s1, idvec const& s2, alnvec const& aln,
wordID_t const NULL1, wordID_t const NULL2)
{
boost::dynamic_bitset<uint64_t> check1(s1.size()), check2(s2.size());
check1.set();
check2.set();
for (size_t i = 0; i < aln.size(); i += 2) {
Count(s1[aln[i]], s2[aln[i+1]]);
check1.reset(aln[i]);
check2.reset(aln[i+1]);
}
for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i))
Count(s1[i], NULL2);
for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i))
Count(NULL1, s2[i]);
}

}
#endif
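The new WordCoocTable accumulates joint and marginal counts over aligned word pairs and exposes them as the lexical probabilities pfwd (count(a,b)/marg1(a)) and pbwd (count(a,b)/marg2(b)); the templated Count() walks a flattened (src,tgt) alignment and routes unaligned positions to the NULL IDs. A minimal driver sketch, not part of the commit: the numeric word IDs and NULL IDs below are invented for illustration, and wordID_t is assumed to be the unsigned integer type from DynSAInclude/types.h.

// Sketch only: feed one aligned sentence pair into a WordCoocTable
// and read back the lexical probabilities.
#include "moses/TranslationModel/WordCoocTable.h"
#include <iostream>
#include <vector>

int main() {
  using namespace Moses;
  WordCoocTable cooc;

  // source = [3 7], target = [5 9 2]; alignment pairs flattened as
  // (src,tgt): 0-0 and 1-1; target position 2 stays unaligned.
  std::vector<wordID_t> src, tgt, aln;
  src.push_back(3); src.push_back(7);
  tgt.push_back(5); tgt.push_back(9); tgt.push_back(2);
  aln.push_back(0); aln.push_back(0);
  aln.push_back(1); aln.push_back(1);

  wordID_t const NULL1 = 0, NULL2 = 0; // hypothetical NULL-word IDs
  cooc.Count(src, tgt, aln, NULL1, NULL2);

  // forward p(5|3) and backward p(3|5); both 1.0 with these counts
  std::cout << cooc.pfwd(3, 5) << " " << cooc.pbwd(3, 5) << std::endl;
  return 0;
}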
@ -39,17 +39,6 @@ using namespace std;

namespace Moses
{
InputLatticeNode::InputLatticeNode(const Phrase &phrase, const WordsRange &range)
:m_phrase(phrase)
,m_range(range)
{
}

void InputLatticeNode::AddNext(const InputLatticeNode &next)
{
m_next.push_back(&next);
}

/** helper for pruning */
bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b)
{
@ -245,6 +234,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s
// add to dictionary

Word &targetWord = targetPhrase.AddWord();
targetWord.SetIsOOV(true);

for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
FactorType factorType = static_cast<FactorType>(currFactor);
@ -373,7 +363,6 @@ void TranslationOptionCollection::CreateTranslationOptions()
// in the phraseDictionary (which is the- possibly filtered-- phrase
// table loaded on initialization), generate TranslationOption objects
// for all phrases
const StaticData &staticData = StaticData::Instance();

// there may be multiple decoding graphs (factorizations of decoding)
const vector <DecodeGraph*> &decodeGraphList = StaticData::Instance().GetDecodeGraphs();
@ -384,13 +373,10 @@ void TranslationOptionCollection::CreateTranslationOptions()

// loop over all decoding graphs, each generates translation options
for (size_t graphInd = 0 ; graphInd < decodeGraphList.size() ; graphInd++) {
if (staticData.IsDecodingGraphIgnored( graphInd )) {
std::cerr << "ignoring decoding path " << graphInd << std::endl;
continue;
}
if (decodeGraphList.size() > 1) {
VERBOSE(3,"Creating translation options from decoding graph " << graphInd << endl);
}

const DecodeGraph &decodeGraph = *decodeGraphList[graphInd];
// generate phrases that start at startPos ...
for (size_t startPos = 0 ; startPos < size; startPos++) {
@ -401,12 +387,10 @@ void TranslationOptionCollection::CreateTranslationOptions()
// ... and that end at endPos
for (size_t endPos = startPos ; endPos < startPos + maxSize ; endPos++) {
if (graphInd > 0 && // only skip subsequent graphs
decodeGraphBackoff[graphInd] != 0 && // limited use of backoff specified
(endPos-startPos+1 > decodeGraphBackoff[graphInd] || // size exceeds backoff limit or ...
m_collection[startPos][endPos-startPos].size() > 0)) { // already covered
VERBOSE(3,"No backoff to graph " << graphInd << " for span [" << startPos << ";" << endPos << "]");
VERBOSE(3,", length limit: " << decodeGraphBackoff[graphInd]);
VERBOSE(3,", found so far: " << m_collection[startPos][endPos-startPos].size() << endl);
decodeGraphBackoff[graphInd] != 0 && // use of backoff specified
(endPos-startPos+1 >= decodeGraphBackoff[graphInd] || // size exceeds backoff limit or ...
m_collection[startPos][endPos-startPos].size() > 0)) { // no phrases found so far
VERBOSE(3,"No backoff to graph " << graphInd << " for span [" << startPos << ";" << endPos << "]" << endl);
// do not create more options
continue;
}
@ -472,6 +456,117 @@ void TranslationOptionCollection::Sort()
}


/** create translation options that exactly cover a specific input span.
* Called by CreateTranslationOptions() and ProcessUnknownWord()
* \param decodeGraph list of decoding steps
* \param factorCollection input sentence with all factors
* \param startPos first position in input sentence
* \param lastPos last position in input sentence
* \param adhereTableLimit whether phrase & generation table limits are adhered to
*/
void TranslationOptionCollection::CreateTranslationOptionsForRange(
const DecodeGraph &decodeGraph
, size_t startPos
, size_t endPos
, bool adhereTableLimit
, size_t graphInd)
{
if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
Phrase *sourcePhrase = NULL; // can't initialise with substring, in case it's confusion network

// consult persistent (cross-sentence) cache for stored translation options
bool skipTransOptCreation = false
, useCache = StaticData::Instance().GetUseTransOptCache();
if (useCache) {
const WordsRange wordsRange(startPos, endPos);
sourcePhrase = new Phrase(m_source.GetSubString(wordsRange));

const TranslationOptionList *transOptList = StaticData::Instance().FindTransOptListInCache(decodeGraph, *sourcePhrase);
// is phrase in cache?
if (transOptList != NULL) {
skipTransOptCreation = true;
TranslationOptionList::const_iterator iterTransOpt;
for (iterTransOpt = transOptList->begin() ; iterTransOpt != transOptList->end() ; ++iterTransOpt) {
TranslationOption *transOpt = new TranslationOption(**iterTransOpt, wordsRange);
Add(transOpt);
}
}
} // useCache

if (!skipTransOptCreation) {
// partial trans opt stored in here
PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
size_t totalEarlyPruned = 0;

// initial translation step
list <const DecodeStep* >::const_iterator iterStep = decodeGraph.begin();
const DecodeStep &decodeStep = **iterStep;

static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslation
(m_source, *oldPtoc
, startPos, endPos, adhereTableLimit );

// do rest of decode steps
int indexStep = 0;

for (++iterStep ; iterStep != decodeGraph.end() ; ++iterStep) {

const DecodeStep &decodeStep = **iterStep;
PartialTranslOptColl* newPtoc = new PartialTranslOptColl;

// go thru each intermediate trans opt just created
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;

decodeStep.Process(inputPartialTranslOpt
, decodeStep
, *newPtoc
, this
, adhereTableLimit
, *sourcePhrase);
}

// last but 1 partial trans not required anymore
totalEarlyPruned += newPtoc->GetPrunedCount();
delete oldPtoc;
oldPtoc = newPtoc;

indexStep++;
} // for (++iterStep

// add to fully formed translation option list
PartialTranslOptColl &lastPartialTranslOptColl = *oldPtoc;
const vector<TranslationOption*>& partTransOptList = lastPartialTranslOptColl.GetList();
vector<TranslationOption*>::const_iterator iterColl;
for (iterColl = partTransOptList.begin() ; iterColl != partTransOptList.end() ; ++iterColl) {
TranslationOption *transOpt = *iterColl;
Add(transOpt);
}

// storing translation options in persistent cache (kept across sentences)
if (useCache) {
if (partTransOptList.size() > 0) {
TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
StaticData::Instance().AddTransOptListToCache(decodeGraph, *sourcePhrase, transOptList);
}
}

lastPartialTranslOptColl.DetachAll();
totalEarlyPruned += oldPtoc->GetPrunedCount();
delete oldPtoc;
// TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
} // if (!skipTransOptCreation)

if (useCache)
delete sourcePhrase;
} // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))

if (graphInd == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough && HasXmlOptionsOverlappingRange(startPos,endPos)) {
CreateXmlOptionsForRange(startPos, endPos);
}
}

/** Check if this range overlaps with any XML options. This doesn't need to be an exact match, only an overlap.
* by default, we don't support XML options. subclasses need to override this function.

@ -43,25 +43,6 @@ class FactorMask;
class Word;
class DecodeGraph;

/** Each node contains
1. substring used to search the phrase table
2. the source range it covers
3. a list of InputLatticeNode that it is a prefix of
This is for both sentence input, and confusion network/lattices
*/
class InputLatticeNode
{
protected:
Phrase m_phrase;
WordsRange m_range;
std::vector<const InputLatticeNode*> m_next;

public:
InputLatticeNode(const Phrase &phrase, const WordsRange &range);
void AddNext(const InputLatticeNode &next);

};

/** Contains all phrase translations applicable to current input type (a sentence or confusion network).
* A key insight into efficient decoding is that various input
* conditions (trellises, factored input, normal text, xml markup)
@ -133,21 +114,12 @@ public:

//! Create all possible translations from the phrase tables
virtual void CreateTranslationOptions();

//! Create translation options that exactly cover a specific input span.
/** create translation options that exactly cover a specific input span.
* Called by CreateTranslationOptions() and ProcessUnknownWord()
* \param decodeGraph list of decoding steps
* \param factorCollection input sentence with all factors
* \param startPos first position in input sentence
* \param lastPos last position in input sentence
* \param adhereTableLimit whether phrase & generation table limits are adhered to
*/
virtual void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
, size_t graphInd) = 0;
, size_t graphInd);

//!Check if this range has XML options
virtual bool HasXmlOptionsOverlappingRange(size_t startPosition, size_t endPosition) const;

@ -1,15 +1,9 @@
// $Id$

#include <cassert>
#include <iostream>
#include "TranslationOptionCollectionConfusionNet.h"
#include "ConfusionNet.h"
#include "DecodeStep.h"
#include "FactorCollection.h"
#include "DecodeStepTranslation.h"
#include "DecodeStepGeneration.h"
#include "moses/FF/InputFeature.h"

using namespace std;

namespace Moses
{
@ -18,104 +12,7 @@ namespace Moses
TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet(
const ConfusionNet &input
, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
const StaticData &staticData = StaticData::Instance();
const InputFeature *inputFeature = staticData.GetInputFeature();
CHECK(inputFeature);

size_t size = input.GetSize();

// create matrix
for (size_t startPos = 0; startPos < size; ++startPos) {
std::vector<std::vector<SourcePath> > vec;
m_collection.push_back( vec );
size_t maxSize = size - startPos;
size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
maxSize = std::min(maxSize, maxSizePhrase);

for (size_t endPos = 0 ; endPos < maxSize ; ++endPos) {
std::vector<SourcePath> vec;
m_collection[startPos].push_back( vec );
}

// cut up confusion network into substrings
// start with 1-word phrases
std::vector<SourcePath> &subphrases = GetPhrases(startPos, startPos);
assert(subphrases.size() == 0);

const ConfusionNet::Column &col = input.GetColumn(startPos);
ConfusionNet::Column::const_iterator iter;
for (iter = col.begin(); iter != col.end(); ++iter) {
subphrases.push_back(SourcePath());
SourcePath &sourcePath = subphrases.back();

const std::pair<Word,std::vector<float> > &inputNode = *iter;

//cerr << "word=" << inputNode.first << " scores=" << inputNode.second.size() << endl;
sourcePath.first.AddWord(inputNode.first);
sourcePath.second.PlusEquals(inputFeature, inputNode.second);

} // for (iter = col.begin(); iter != col.end(); ++iter) {
} // for (size_t startPos = 0; startPos < size; ++startPos) {

// create subphrases by appending words to previous subphrases
for (size_t startPos = 0; startPos < size; ++startPos) {
size_t maxSize = size - startPos;
size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
maxSize = std::min(maxSize, maxSizePhrase);
size_t end = startPos + maxSize - 1;

for (size_t endPos = startPos + 1; endPos < end; ++endPos) {
std::vector<SourcePath> &newSubphrases = GetPhrases(startPos, endPos);
const std::vector<SourcePath> &prevSubphrases = GetPhrases(startPos, endPos - 1);
const ConfusionNet::Column &col = input.GetColumn(endPos);
CreateSubPhrases(newSubphrases, prevSubphrases, col, *inputFeature);
}
}

/*
for (size_t startPos = 0; startPos < size; ++startPos) {
for (size_t endPos = startPos; endPos < size; ++endPos) {
cerr << "RANGE=" << startPos << "-" << endPos << endl;

const std::vector<SourcePath> &subphrases = GetPhrases(startPos, endPos);
std::vector<SourcePath>::const_iterator iterSourcePath;
for (iterSourcePath = subphrases.begin(); iterSourcePath != subphrases.end(); ++iterSourcePath) {
const SourcePath &sourcePath = *iterSourcePath;
cerr << sourcePath.first << " " <<sourcePath.second << endl;
}
}
}
*/
}

void TranslationOptionCollectionConfusionNet::CreateSubPhrases(std::vector<SourcePath> &newSubphrases
, const std::vector<SourcePath> &prevSubphrases
, const ConfusionNet::Column &col
, const InputFeature &inputFeature)
{
std::vector<SourcePath>::const_iterator iterSourcePath;
for (iterSourcePath = prevSubphrases.begin(); iterSourcePath != prevSubphrases.end(); ++iterSourcePath) {
const SourcePath &sourcePath = *iterSourcePath;
const Phrase &prevSubPhrase = sourcePath.first;
const ScoreComponentCollection &prevScore = sourcePath.second;

ConfusionNet::Column::const_iterator iterCol;
for (iterCol = col.begin(); iterCol != col.end(); ++iterCol) {
const std::pair<Word,std::vector<float> > &node = *iterCol;
Phrase subphrase(prevSubPhrase);
subphrase.AddWord(node.first);

ScoreComponentCollection score(prevScore);
score.PlusEquals(&inputFeature, node.second);

SourcePath newSourcePath(subphrase, score);
newSubphrases.push_back(newSourcePath);
}
}
}
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold) {}

/* forcibly create translation option for a particular source word.
* call the base class' ProcessOneUnknownWord() for each possible word in the confusion network
@ -133,122 +30,6 @@ void TranslationOptionCollectionConfusionNet::ProcessUnknownWord(size_t sourcePo

}

const std::vector<TranslationOptionCollectionConfusionNet::SourcePath> &TranslationOptionCollectionConfusionNet::GetPhrases(size_t startPos, size_t endPos) const
{
size_t offset = endPos - startPos;
CHECK(offset < m_collection[startPos].size());
return m_collection[startPos][offset];
}

std::vector<TranslationOptionCollectionConfusionNet::SourcePath> &TranslationOptionCollectionConfusionNet::GetPhrases(size_t startPos, size_t endPos)
{
size_t offset = endPos - startPos;
CHECK(offset < m_collection[startPos].size());
return m_collection[startPos][offset];
}

void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRange(
const DecodeGraph &decodeGraph
, size_t startPos
, size_t endPos
, bool adhereTableLimit
, size_t graphInd)
{
if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
Phrase *sourcePhrase = NULL; // can't initialise with substring, in case it's confusion network

// consult persistent (cross-sentence) cache for stored translation options
bool skipTransOptCreation = false
, useCache = StaticData::Instance().GetUseTransOptCache();
if (useCache) {
const WordsRange wordsRange(startPos, endPos);
sourcePhrase = new Phrase(m_source.GetSubString(wordsRange));

const TranslationOptionList *transOptList = StaticData::Instance().FindTransOptListInCache(decodeGraph, *sourcePhrase);
// is phrase in cache?
if (transOptList != NULL) {
skipTransOptCreation = true;
TranslationOptionList::const_iterator iterTransOpt;
for (iterTransOpt = transOptList->begin() ; iterTransOpt != transOptList->end() ; ++iterTransOpt) {
TranslationOption *transOpt = new TranslationOption(**iterTransOpt, wordsRange);
Add(transOpt);
}
}
} // useCache

if (!skipTransOptCreation) {
// partial trans opt stored in here
PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
size_t totalEarlyPruned = 0;

// initial translation step
list <const DecodeStep* >::const_iterator iterStep = decodeGraph.begin();
const DecodeStep &decodeStep = **iterStep;

static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslation
(m_source, *oldPtoc
, startPos, endPos, adhereTableLimit );

// do rest of decode steps
int indexStep = 1;

for (++iterStep; iterStep != decodeGraph.end() ; ++iterStep, ++indexStep) {
const DecodeStep &decodeStep = **iterStep;
PartialTranslOptColl* newPtoc = new PartialTranslOptColl;

// go thru each intermediate trans opt just created
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;

decodeStep.Process(inputPartialTranslOpt
, decodeStep
, *newPtoc
, this
, adhereTableLimit
, *sourcePhrase);
}

// last but 1 partial trans not required anymore
totalEarlyPruned += newPtoc->GetPrunedCount();
delete oldPtoc;
oldPtoc = newPtoc;

} // for (++iterStep

// add to fully formed translation option list
PartialTranslOptColl &lastPartialTranslOptColl = *oldPtoc;
const vector<TranslationOption*>& partTransOptList = lastPartialTranslOptColl.GetList();
vector<TranslationOption*>::const_iterator iterColl;
for (iterColl = partTransOptList.begin() ; iterColl != partTransOptList.end() ; ++iterColl) {
TranslationOption *transOpt = *iterColl;
Add(transOpt);
}

// storing translation options in persistent cache (kept across sentences)
if (useCache) {
if (partTransOptList.size() > 0) {
TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
StaticData::Instance().AddTransOptListToCache(decodeGraph, *sourcePhrase, transOptList);
}
}

lastPartialTranslOptColl.DetachAll();
totalEarlyPruned += oldPtoc->GetPrunedCount();
delete oldPtoc;
// TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
} // if (!skipTransOptCreation)

if (useCache)
delete sourcePhrase;
} // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))

if (graphInd == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough && HasXmlOptionsOverlappingRange(startPos,endPos)) {
CreateXmlOptionsForRange(startPos, endPos);
}
}

} // namespace

@ -3,11 +3,11 @@
#define moses_TranslationOptionCollectionConfusionNet_h

#include "TranslationOptionCollection.h"
#include "ConfusionNet.h"

namespace Moses
{
class InputFeature;

class ConfusionNet;

/** Holds all translation options, for all spans, of a particular confusion network input
* Inherited from TranslationOptionCollection.
@ -15,28 +15,12 @@ class InputFeature;
class TranslationOptionCollectionConfusionNet : public TranslationOptionCollection
{
public:
typedef std::pair<Phrase, ScoreComponentCollection> SourcePath;

TranslationOptionCollectionConfusionNet(const ConfusionNet &source, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);

void ProcessUnknownWord(size_t sourcePos);

const std::vector<SourcePath> &GetPhrases(size_t startPos, size_t endPos) const;
std::vector<SourcePath> &GetPhrases(size_t startPos, size_t endPos);
protected:
std::vector<std::vector<std::vector<SourcePath> > > m_collection;

void CreateSubPhrases(std::vector<SourcePath> &newSubphrases
, const std::vector<SourcePath> &prevSubphrases
, const ConfusionNet::Column &col
, const InputFeature &inputFeature);

void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
, size_t graphInd);
};

}
#endif

@ -24,44 +24,14 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "DecodeStep.h"
#include "FactorCollection.h"
#include "WordsRange.h"
#include "DecodeStepTranslation.h"
#include "DecodeStepGeneration.h"

using namespace std;

namespace Moses
{
/** constructor; just initialize the base class */
TranslationOptionCollectionText::TranslationOptionCollectionText(Sentence const &input
, size_t maxNoTransOptPerCoverage
, float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
size_t size = input.GetSize();
m_collection.resize(size);
for (size_t startPos = 0; startPos < size; ++startPos) {
std::vector<InputLatticeNode> &vec = m_collection[startPos];
for (size_t endPos = startPos; endPos < size; ++endPos) {
Phrase subphrase(input.GetSubString(WordsRange(startPos, endPos)));
WordsRange range(startPos, endPos);
InputLatticeNode node(subphrase, range);

if (range.GetNumWordsCovered() > 1) {
InputLatticeNode prevNode = GetPhrase(startPos, endPos - 1);
node.AddNext(prevNode);
}

vec.push_back(node);
}
}
/*
for (size_t startPos = 0; startPos < size; ++startPos) {
for (size_t endPos = startPos; endPos < size; ++endPos) {
cerr << startPos << "-" << endPos << "=" << GetPhrase(startPos, endPos) << endl;
}
}
*/
}
TranslationOptionCollectionText::TranslationOptionCollectionText(Sentence const &inputSentence, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: TranslationOptionCollection(inputSentence, maxNoTransOptPerCoverage, translationOptionThreshold) {}

/* forcibly create translation option for a particular source word.
* For text, this function is easy, just call the base class' ProcessOneUnknownWord()
@ -96,118 +66,10 @@ void TranslationOptionCollectionText::CreateXmlOptionsForRange(size_t startPosit
for(size_t i=0; i<xmlOptions.size(); i++) {
Add(xmlOptions[i]);
}

};

}

const InputLatticeNode &TranslationOptionCollectionText::GetPhrase(size_t startPos, size_t endPos) const
{
size_t offset = endPos - startPos;
CHECK(offset < m_collection[startPos].size());
return m_collection[startPos][offset];
}

void TranslationOptionCollectionText::CreateTranslationOptionsForRange(
const DecodeGraph &decodeGraph
, size_t startPos
, size_t endPos
, bool adhereTableLimit
, size_t graphInd)
{
if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
Phrase *sourcePhrase = NULL; // can't initialise with substring, in case it's confusion network

// consult persistent (cross-sentence) cache for stored translation options
bool skipTransOptCreation = false
, useCache = StaticData::Instance().GetUseTransOptCache();
if (useCache) {
const WordsRange wordsRange(startPos, endPos);
sourcePhrase = new Phrase(m_source.GetSubString(wordsRange));

const TranslationOptionList *transOptList = StaticData::Instance().FindTransOptListInCache(decodeGraph, *sourcePhrase);
// is phrase in cache?
if (transOptList != NULL) {
skipTransOptCreation = true;
TranslationOptionList::const_iterator iterTransOpt;
for (iterTransOpt = transOptList->begin() ; iterTransOpt != transOptList->end() ; ++iterTransOpt) {
TranslationOption *transOpt = new TranslationOption(**iterTransOpt, wordsRange);
Add(transOpt);
}
}
} // useCache

if (!skipTransOptCreation) {
// partial trans opt stored in here
PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
size_t totalEarlyPruned = 0;

// initial translation step
list <const DecodeStep* >::const_iterator iterStep = decodeGraph.begin();
const DecodeStep &decodeStep = **iterStep;

static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslation
(m_source, *oldPtoc
, startPos, endPos, adhereTableLimit );

// do rest of decode steps
int indexStep = 1;

for (++iterStep; iterStep != decodeGraph.end() ; ++iterStep, ++indexStep) {
const DecodeStep &decodeStep = **iterStep;
PartialTranslOptColl* newPtoc = new PartialTranslOptColl;

// go thru each intermediate trans opt just created
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;

decodeStep.Process(inputPartialTranslOpt
, decodeStep
, *newPtoc
, this
, adhereTableLimit
, *sourcePhrase);
}

// last but 1 partial trans not required anymore
totalEarlyPruned += newPtoc->GetPrunedCount();
delete oldPtoc;
oldPtoc = newPtoc;

} // for (++iterStep

// add to fully formed translation option list
PartialTranslOptColl &lastPartialTranslOptColl = *oldPtoc;
const vector<TranslationOption*>& partTransOptList = lastPartialTranslOptColl.GetList();
vector<TranslationOption*>::const_iterator iterColl;
for (iterColl = partTransOptList.begin() ; iterColl != partTransOptList.end() ; ++iterColl) {
TranslationOption *transOpt = *iterColl;
Add(transOpt);
}

// storing translation options in persistent cache (kept across sentences)
if (useCache) {
if (partTransOptList.size() > 0) {
TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
StaticData::Instance().AddTransOptListToCache(decodeGraph, *sourcePhrase, transOptList);
}
}

lastPartialTranslOptColl.DetachAll();
totalEarlyPruned += oldPtoc->GetPrunedCount();
delete oldPtoc;
// TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
} // if (!skipTransOptCreation)

if (useCache)
delete sourcePhrase;
} // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))

if (graphInd == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough && HasXmlOptionsOverlappingRange(startPos,endPos)) {
CreateXmlOptionsForRange(startPos, endPos);
}
}

} // namespace

@ -22,9 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef moses_TranslationOptionCollectionText_h
#define moses_TranslationOptionCollectionText_h

#include <vector>
#include "TranslationOptionCollection.h"
#include "Phrase.h"

namespace Moses
{
@ -37,23 +35,18 @@ class Sentence;
class TranslationOptionCollectionText : public TranslationOptionCollection
{
public:
TranslationOptionCollectionText(Sentence const& input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);

void ProcessUnknownWord(size_t sourcePos);

TranslationOptionCollectionText(Sentence const& inputSentence, size_t maxNoTransOptPerCoverage, float translationOptionThreshold);

bool HasXmlOptionsOverlappingRange(size_t startPosition, size_t endPosition) const;

void CreateXmlOptionsForRange(size_t startPosition, size_t endPosition);
const InputLatticeNode &GetPhrase(size_t startPos, size_t endPos) const;

protected:
std::vector<std::vector<InputLatticeNode> > m_collection;

void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
, size_t graphInd);
};

}

#endif

@ -108,27 +108,28 @@ enum DistortionOrientationOptions {

enum PhraseTableImplementation {
Memory = 0
,Binary = 1
,OnDisk = 2
//,GlueRule = 3
//,Joshua = 4
//,MemorySourceLabel = 5
,SCFG = 6
//,BerkeleyDb = 7
,SuffixArray = 8
,Hiero = 9
,ALSuffixArray = 10
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
,Binary = 1
,OnDisk = 2
//,GlueRule = 3
//,Joshua = 4
//,MemorySourceLabel = 5
,SCFG = 6
//,BerkeleyDb = 7
,SuffixArray = 8
,Hiero = 9
,ALSuffixArray = 10
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
,DSuffixArray = 14
};

enum InputTypeEnum {
SentenceInput = 0
,ConfusionNetworkInput = 1
,WordLatticeInput = 2
,TreeInputType = 3
,WordLatticeInput2 = 4
,ConfusionNetworkInput = 1
,WordLatticeInput = 2
,TreeInputType = 3
,WordLatticeInput2 = 4

};

@ -141,7 +142,7 @@ enum XmlInputType {

enum DictionaryFind {
Best = 0
,All = 1
,All = 1
};

enum ParsingAlgorithm {
@ -151,22 +152,22 @@ enum ParsingAlgorithm {

enum SearchAlgorithm {
Normal = 0
,CubePruning = 1
,CubeGrowing = 2
,ChartDecoding= 3
,NormalBatch = 4
,ChartIncremental = 5
,CubePruning = 1
,CubeGrowing = 2
,ChartDecoding= 3
,NormalBatch = 4
,ChartIncremental = 5
};

enum SourceLabelOverlap {
SourceLabelOverlapAdd = 0
,SourceLabelOverlapReplace = 1
,SourceLabelOverlapDiscard = 2
,SourceLabelOverlapReplace = 1
,SourceLabelOverlapDiscard = 2
};

enum WordAlignmentSort {
NoSort = 0
,TargetOrder = 1
,TargetOrder = 1
};

enum FormatType {
moses/Word.h (13 lines changed)
@ -52,11 +52,14 @@ protected:

FactorArray m_factorArray; /**< set of factors */
bool m_isNonTerminal;
bool m_isOOV;

public:
/** deep copy */
Word(const Word &copy)
:m_isNonTerminal(copy.m_isNonTerminal) {
:m_isNonTerminal(copy.m_isNonTerminal)
,m_isOOV(copy.m_isOOV)
{
std::memcpy(m_factorArray, copy.m_factorArray, sizeof(FactorArray));
}

@ -64,6 +67,7 @@ public:
explicit Word(bool isNonTerminal = false) {
std::memset(m_factorArray, 0, sizeof(FactorArray));
m_isNonTerminal = isNonTerminal;
m_isOOV = false;
}

~Word() {}
@ -92,6 +96,13 @@ public:
m_isNonTerminal = val;
}

inline bool IsOOV() const {
return m_isOOV;
}
inline void SetIsOOV(bool val) {
m_isOOV = val;
}

/** add the factors from sourceWord into this representation,
* NULL elements in sourceWord will be skipped */
void Merge(const Word &sourceWord);
moses/generic/sampling/Sampling.h (new file, 51 lines)
@ -0,0 +1,51 @@
#ifndef __sampling_h
#define __sampling_h

// Utility functions for proper sub-sampling.
// (c) 2007-2012 Ulrich Germann

namespace Moses
{

inline
size_t
randInt(size_t N)
{
return N*(rand()/(RAND_MAX+1.));
}

// select a random sample of size /s/ without replacement from the range of
// integers [0,N);
template<typename idx_t>
void
randomSample(vector<idx_t>& v, size_t s, size_t N)
{
// see also Knuth: Art of Computer Programming Vol. 2, p. 142

s = min(s,N);
v.resize(s);

// the first option tries to be a bit more efficient than O(N) in picking
// the samples. The threshold is an ad-hoc, off-the-cuff guess. I still
// need to figure out the optimal break-even point between a linear sweep
// and repeatedly picking random numbers with the risk of hitting the same
// number many times.
if (s*10<N) {
boost::dynamic_bitset<uint64_t> check(N,0);
for (size_t i = 0; i < v.size(); i++) {
size_t x = randInt(N);
while (check[x]) x = randInt(N);
check[x]=true;
v[i] = x;
}
} else {
size_t m=0;
for (size_t t = 0; m <= s && t < N; t++)
if (s==N || randInt(N-t) < s-m) v[m++] = t;
}
}

};

#endif
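randomSample() draws s distinct integers uniformly from [0,N): when the sample is sparse (s*10 < N) it rejection-samples against a bitset, otherwise it falls back to Knuth's one-pass selection sweep. A usage sketch, not part of the commit; note the header is not self-contained, so the includes and the std namespace have to be in scope before it is pulled in:

// Sketch only: draw 5 distinct indices from [0,100).
#include <algorithm>
#include <cstdlib>   // rand(), srand()
#include <ctime>
#include <iostream>
#include <stdint.h>  // uint64_t, used by the header's bitset
#include <vector>
#include <boost/dynamic_bitset.hpp>
using namespace std; // Sampling.h uses vector/min unqualified
#include "moses/generic/sampling/Sampling.h"

int main() {
  srand(time(NULL));   // randInt() is built on rand()
  vector<size_t> sample;
  Moses::randomSample(sample, 5, 100); // 5 distinct values in [0,100)
  for (size_t i = 0; i < sample.size(); ++i)
    cout << sample[i] << " ";
  cout << endl;
  return 0;
}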
moses/generic/sorting/NBestList.h (new file, 85 lines)
@ -0,0 +1,85 @@
#ifndef __n_best_list_h
#define __n_best_list_h
#include <algorithm>
#include "moses/generic/sorting/VectorIndexSorter.h"

// NBest List; (c) 2007-2012 Ulrich Germann
//
// The 'trick' used in this implementation is to maintain a heap of size <= N
// such that the lowest-scoring item is on top of the heap. For each incoming
// item we can then determine easily if it is in the top N.

namespace Moses
{
using namespace std;

template<typename THINGY, typename CMP>
class
NBestList
{
vector<uint32_t> m_heap;
vector<THINGY> m_list;
VectorIndexSorter<THINGY, CMP, uint32_t> m_better;
mutable vector<uint32_t> m_order;
mutable bool m_changed;
public:
NBestList(size_t const max_size, CMP const& cmp);
NBestList(size_t const max_size);
bool add(THINGY const& item);
THINGY const& operator[](int i) const;
size_t size() const {
return m_heap.size();
}
};

template<typename THINGY, typename CMP>
NBestList<THINGY,CMP>::
NBestList(size_t const max_size, CMP const& cmp)
: m_better(m_list, cmp), m_changed(false)
{
m_heap.reserve(max_size);
}

template<typename THINGY, typename CMP>
NBestList<THINGY,CMP>::
NBestList(size_t const max_size)
: m_better(m_heap), m_changed(false)
{
m_heap.reserve(max_size);
}

template<typename THINGY, typename CMP>
bool
NBestList<THINGY,CMP>::
add(THINGY const& item)
{
if (m_heap.size() == m_heap.capacity()) {
if (m_better.Compare(item, m_list[m_heap.at(0)])) {
pop_heap(m_heap.begin(),m_heap.end(),m_better);
m_list[m_heap.back()] = item;
} else return false;
} else {
m_list.push_back(item);
m_heap.push_back(m_heap.size());
}
push_heap(m_heap.begin(),m_heap.end(),m_better);
return m_changed = true;
}

template<typename THINGY, typename CMP>
THINGY const&
NBestList<THINGY,CMP>::
operator[](int i) const
{
if (m_changed) {
m_order.assign(m_heap.begin(),m_heap.end());
for (size_t k = m_heap.size(); k != 0; --k)
pop_heap(m_order.begin(), m_order.begin()+k);
m_changed = false;
}
if (i < 0) i += m_order.size();
return m_list[m_order.at(i)];
}

}
#endif
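The heap trick in practice: the worst retained item sits on top of an index heap, so each incoming candidate costs one comparison against the current worst plus a heap update. A usage sketch, not part of the commit, keeping the three largest of five scores; it uses the explicit-comparator constructor, and as with Sampling.h the prerequisites must be in scope before the header is included:

// Sketch only: retain the top 3 of a stream of scores.
#include <functional>
#include <iostream>
#include <stdint.h>
#include <vector>
#include <boost/shared_ptr.hpp>
using namespace std; // headers use vector/greater/binary_function unqualified
#include "moses/generic/sorting/NBestList.h"

int main() {
  greater<float> cmp; // "better" means a larger score
  Moses::NBestList<float, greater<float> > best(3, cmp);

  float const scores[] = { 0.2f, 0.9f, 0.1f, 0.7f, 0.4f };
  for (size_t i = 0; i < 5; ++i)
    best.add(scores[i]); // 0.1 and then 0.2 get evicted

  // The retained set is {0.9, 0.7, 0.4}; operator[] reorders lazily
  // from the internal heap before indexing.
  for (size_t i = 0; i < best.size(); ++i)
    cout << best[i] << " ";
  cout << endl;
  return 0;
}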
moses/generic/sorting/VectorIndexSorter.h (new file, 69 lines)
@ -0,0 +1,69 @@
#ifndef __vector_index_sorter_h
#define __vector_index_sorter_h

// VectorIndexSorter; (c) 2007-2012 Ulrich Germann

// A VectorIndexSorter is a function object for sorting indices into a vector
// of objects (instead of sorting the vector itself).
//
// typical use:
// vector<thingy> my_vector;
// VectorIndexSorter<thingy,less<thingy>,int> sorter(my_vector);
// vector<int> order;
// sorter.get_order(order);

namespace Moses
{
template<typename VAL, typename COMP = greater<VAL>, typename IDX_T=size_t>
class
VectorIndexSorter : public binary_function<IDX_T const&, IDX_T const&, bool>
{
vector<VAL> const& m_vecref;
boost::shared_ptr<COMP> m_comp;
public:

COMP const& Compare;
VectorIndexSorter(vector<VAL> const& v, COMP const& comp)
: m_vecref(v), Compare(comp)
{ }

VectorIndexSorter(vector<VAL> const& v)
: m_vecref(v), m_comp(new COMP()), Compare(*m_comp)
{ }

bool operator()(IDX_T const & a, IDX_T const & b) const {
bool fwd = Compare(m_vecref.at(a) ,m_vecref.at(b));
bool bwd = Compare(m_vecref[b], m_vecref[a]);
return (fwd == bwd ? a < b : fwd);
}

boost::shared_ptr<vector<IDX_T> >
GetOrder() const;

void
GetOrder(vector<IDX_T> & order) const;

};

template<typename VAL, typename COMP, typename IDX_T>
boost::shared_ptr<vector<IDX_T> >
VectorIndexSorter<VAL,COMP,IDX_T>::
GetOrder() const
{
boost::shared_ptr<vector<IDX_T> > ret(new vector<IDX_T>(m_vecref.size()));
get_order(*ret);
return ret;
}

template<typename VAL, typename COMP, typename IDX_T>
void
VectorIndexSorter<VAL,COMP,IDX_T>::
GetOrder(vector<IDX_T> & order) const
{
order.resize(m_vecref.size());
for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i;
sort(order.begin(), order.end(), *this);
}

}
#endif
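VectorIndexSorter sorts an array of indices by the values they point at, with an index tie-break for determinism, leaving the underlying vector untouched; this is what NBestList relies on above. A self-contained variant of the header's own usage comment, not part of the commit (same caveat about prerequisites being in scope before the header):

// Sketch only: rank scores best-first without moving them.
#include <functional>
#include <iostream>
#include <vector>
#include <boost/shared_ptr.hpp>
using namespace std;
#include "moses/generic/sorting/VectorIndexSorter.h"

int main() {
  vector<float> scores;
  scores.push_back(0.3f); scores.push_back(0.9f); scores.push_back(0.1f);

  // default COMP = greater<float>, so the order comes out best-first
  Moses::VectorIndexSorter<float> sorter(scores);
  vector<size_t> order;
  sorter.GetOrder(order); // order = [1, 0, 2]; scores untouched
  for (size_t i = 0; i < order.size(); ++i)
    cout << order[i] << " ";
  cout << endl;
  return 0;
}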
@ -137,7 +137,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
const SplitPoints &point = *p;
if (point.size() > 3) {
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], point[point.size()-1]-1);
= tree.GetNodes( point[0], point[point.size()-1]-1);
string topLabel = topNodes[0]->GetLabel();

for(size_t i=2; i<point.size()-1; i++) {
@ -155,7 +155,7 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
if (point.size() > 3) {
int endPoint = point[point.size()-1]-1;
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], endPoint);
= tree.GetNodes( point[0], endPoint);
string topLabel = topNodes[0]->GetLabel();

for(size_t i=1; i<point.size()-2; i++) {

@ -2552,6 +2552,8 @@ sub define_tuningevaluation_filter {

# get model, and whether suffix array is used. Determines the pt implementation.
my $sa_exec_dir = &get("TRAINING:suffix-array");
my $sa_extractors = &get("GENERAL:sa_extractors");
$sa_extractors = 1 unless $sa_extractors;

my ($ptImpl, $numFF);
if ($hierarchical) {
@ -2564,7 +2566,7 @@ sub define_tuningevaluation_filter {
}
}
else {
$ptImpl = 0; # phrase-based
$ptImpl = 0; # phrase-based
}

# config file specified?
@ -2589,11 +2591,14 @@ sub define_tuningevaluation_filter {
# filter command
if ($sa_exec_dir) {
# suffix array
$cmd .= "$scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $sa_exec_dir $phrase_translation_table $input_filter $filter_dir \n";
$cmd .= "$scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $sa_exec_dir $phrase_translation_table $input_filter $filter_dir $sa_extractors \n";

my $escaped_filter_dir = $filter_dir;
$escaped_filter_dir =~ s/\//\\\\\//g;
$cmd .= "cat $config | sed s/10\\ 0\\ 0\\ 7.*/10\\ 0\\ 0\\ 7\\ $escaped_filter_dir/g > $filter_dir/moses.ini \n";
# kind of a hack -- the correct thing would be to make the generation of the config file ($filter_dir/moses.ini)
# set the PhraseDictionaryALSuffixArray's path to the filtered directory rather than to the suffix array itself
$cmd .= "sed -i 's%path=$phrase_translation_table%path=$filter_dir%' $filter_dir/moses.ini\n";
}
else {
# normal phrase table
scripts/generic/ph_numbers.perl (new executable file, 51 lines)
@ -0,0 +1,51 @@
#!/usr/bin/perl -w

# Script to recognize and replace numbers in Moses training corpora
# and decoder input
#
# (c) 2013 TAUS

use strict;

use Getopt::Std;

my $debug = $ENV{DEBUG} || 0;

my %opts;
if(!getopts('s:t:cm:hl',\%opts) || $opts{h}) {
print "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";
exit;
}
my $sourceLocale = $opts{s} || "";
my $targetLocale = $opts{t} || "";
my $numberSymbol = $opts{m} || '@NUM@';

while(<>) {
# [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?
# while(/\G(.*?)\s*([+-]?\p{Digit}+[+-\p{Digit}\.,eE])/) {
chomp;
my $output = "";
my $remainder = "";
while(/\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+-eE]*)/g) {
print STDERR "Between: x$1x\n" if $debug;
print STDERR "Number: x$3x\n" if $debug;
$output .= $1;
if($opts{c}) {
$output .= $2.$numberSymbol;
}
else {
if($opts{l}) {
$output .= $2."<ne translation=\"$3\">$numberSymbol</ne>";
}
else {
$output .= $2."<ne translation=\"$numberSymbol\" entity=\"$3\">$numberSymbol</ne>";
}
}
$remainder = $';
}
print STDERR "Remainder: x".$remainder."x\n" if $debug;
print STDERR "\n" if $debug;
$output .= $remainder if $remainder;
$output .= "\n";
print $output;
}
@ -1,88 +0,0 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
# Compatible with sri LM-creating script, eg.
|
||||
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
|
||||
# To use it in the EMS, add this to the [LM] section
|
||||
# lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irst-dir"
|
||||
# settings = ""
|
||||
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section.
|
||||
# It should point to the root of the LM toolkit, eg
|
||||
# irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
|
||||
# And make sure that $cores is defined, eg $cores = 8
|
||||
# And make sure the $settings variable is empty. This script doesn't understand some of the sri args like -unk and will complain.
|
||||
|
||||
use strict;
use FindBin qw($RealBin);
use Getopt::Long;

my $order = 3;
my $corpusPath;
my $lmPath;
my $cores = 2;
my $irstPath;
my $tempPath = "tmp";
my $p = 1;
my $s;
my $temp;

GetOptions("order=s" => \$order,
           "text=s" => \$corpusPath,
           "lm=s" => \$lmPath,
           "cores=s" => \$cores,
           "irst-dir=s" => \$irstPath,
           "temp-dir=s" => \$tempPath,
           "p=i" => \$p, # irstlm parameter: delete singletons
           "s=s" => \$s, # irstlm parameter: smoothing method
           "interpolate!" => \$temp, # ignore
           "kndiscount!" => \$temp # ignore
          ) or exit 1;

#die("ERROR: please set order") unless defined($order);
die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);

my $ext = ($corpusPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";

$tempPath .= "/irstlm-build-tmp.$$";
`mkdir -p $tempPath`;

my $cmd;
if ($ext eq "gz")
{
  $cmd = "zcat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
else
{
  $cmd = "cat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
print STDERR "EXECUTING $cmd\n";
`$cmd`;

$cmd = "IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -o $tempPath/iarpa.gz -k $cores";
$cmd .= " -p" if $p;
$cmd .= " -s $s" if defined($s);
print STDERR "EXECUTING $cmd\n";
`$cmd`;

$ext = ($lmPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";

if ($ext eq "gz")
{
  $cmd = "$irstPath/compile-lm --text $tempPath/iarpa.gz /dev/stdout | gzip -c > $lmPath";
}
else
{
  $cmd = "$irstPath/compile-lm --text $tempPath/iarpa.gz $lmPath";
}

print STDERR "EXECUTING $cmd\n";
`$cmd`;

$cmd = "rm -rf $tempPath";
print STDERR "EXECUTING $cmd\n";
`$cmd`;

print STDERR "FINISH.\n";
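For orientation, the deleted wrapper boiled down to this three-stage IRSTLM pipeline, shown here with the script's default trigram order, two cores, and hypothetical paths ($IRST standing in for the irst-dir bin directory):

  zcat corpus.txt.gz | $IRST/add-start-end.sh | gzip -c > tmp/mono.setagged.gz
  IRSTLM=$IRST/.. $IRST/build-lm.sh -t tmp/stat4 -i "gunzip -c tmp/mono.setagged.gz" -n 3 -o tmp/iarpa.gz -k 2 -p
  $IRST/compile-lm --text tmp/iarpa.gz lm.txt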
40
scripts/generic/trainlm-lmplz.perl
Executable file
@ -0,0 +1,40 @@
#!/usr/bin/perl -w

# Compatible with the sri LM-creating script, e.g.
#    ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section:
#    lm-training = "$moses-script-dir/generic/trainlm-lmplz.perl -lmplz $lmplz"
#    settings = "-T $working-dir/tmp -S 10G"
# Also, make sure that $lmplz is defined (in the [LM] or [GENERAL] section).
# It should point to the binary file, e.g.
#    lmplz = /home/waziz/workspace/github/moses/bin/lmplz

use strict;
use FindBin qw($RealBin);
use Getopt::Long qw/GetOptionsFromArray/;
#use Getopt::Long;
# pass_through leaves unrecognized options in @ARGV so they can be handed
# straight to lmplz
Getopt::Long::Configure("pass_through", "no_ignore_case");

my $order = 3; # order of language model (default trigram)
my $corpus; # input text data
my $lm; # generated language model
my $lmplz; # path to the lmplz binary
my $help = 0;

my @optconfig = (
    "-order=s" => \$order,
    "-text=s" => \$corpus,
    "-lm=s" => \$lm,
    "-lmplz=s" => \$lmplz,
);

GetOptionsFromArray(\@ARGV, @optconfig);
die("ERROR: please set text") unless defined($corpus);
die("ERROR: please set lm") unless defined($lm);
die("ERROR: please set lmplz") unless defined($lmplz);

# whatever was not consumed above (e.g. -T and -S from $settings) is passed
# through to lmplz verbatim
my $settings = join(' ', @ARGV);
my $cmd = "$lmplz --order $order $settings < $corpus > $lm";

print STDERR "EXECUTING $cmd\n";
`$cmd`;
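With the EMS settings shown in the header comment, the wrapper collapses to a single lmplz call along these lines (corpus, LM, and temp paths hypothetical):

  /home/waziz/workspace/github/moses/bin/lmplz --order 3 -T /work/tmp -S 10G < corpus.txt > lm.arpa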