diff --git a/Jamroot b/Jamroot index e7987be66..b7d5e485b 100644 --- a/Jamroot +++ b/Jamroot @@ -89,7 +89,7 @@ if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_min requirements += tcmalloc_and_profiler unwind -fno-omit-frame-pointer -fno-omit-frame-pointer ; } else { external-lib tcmalloc_minimal ; - requirements += multi:$(tcmalloc_minimal) ; + requirements += multi:tcmalloc_minimal ; } } else { echo "Tip: install tcmalloc for faster threading. See BUILD-INSTRUCTIONS.txt for more information." ; diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index f551380fd..e32a5baea 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -11,12 +11,12 @@ - - + + @@ -72,13 +72,13 @@ - - + + diff --git a/contrib/other-builds/extract-rules/.project b/contrib/other-builds/extract-rules/.project index 76de5a624..29ffed2a9 100644 --- a/contrib/other-builds/extract-rules/.project +++ b/contrib/other-builds/extract-rules/.project @@ -65,6 +65,11 @@ 1 PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h + + RuleExtractionOptions.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/RuleExtractionOptions.h + SentenceAlignment.cpp 1 diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject index cc46823a0..463e992bd 100644 --- a/contrib/other-builds/mert_lib/.cproject +++ b/contrib/other-builds/mert_lib/.cproject @@ -11,11 +11,11 @@ - + @@ -64,11 +64,11 @@ - + diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index b6cbc127d..86dfbac5b 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -5,13 +5,13 @@ - - + + @@ -70,7 +70,6 @@ - @@ -108,13 +107,13 @@ - - + + diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject index f9eeebf1d..828b71395 100644 --- a/contrib/other-builds/moses-cmd/.cproject +++ b/contrib/other-builds/moses-cmd/.cproject @@ -5,13 +5,13 @@ - - + + @@ -71,7 +71,6 @@ - @@ -109,13 +108,13 @@ - - + + diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index ba645f3e7..862a1deb1 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -11,12 +11,12 @@ - - + + @@ -88,13 +88,13 @@ - - + + diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp index aa32e1fef..5a12be70a 100644 --- a/mert/FeatureStats.cpp +++ b/mert/FeatureStats.cpp @@ -181,10 +181,8 @@ FeatureStats::FeatureStats(const size_t size) FeatureStats::~FeatureStats() { - if (m_array) { - delete [] m_array; - m_array = NULL; - } + delete [] m_array; + m_array = NULL; } void FeatureStats::Copy(const FeatureStats &stats) diff --git a/mert/ScoreStats.cpp b/mert/ScoreStats.cpp index 1c66cdb5f..771880fa1 100644 --- a/mert/ScoreStats.cpp +++ b/mert/ScoreStats.cpp @@ -35,10 +35,8 @@ ScoreStats::ScoreStats(const size_t size) ScoreStats::~ScoreStats() { - if (m_array) { - delete [] m_array; - m_array = NULL; - } + delete [] m_array; + m_array = NULL; } void ScoreStats::Copy(const ScoreStats &stats) @@ -157,4 +155,4 @@ bool operator==(const ScoreStats& s1, const ScoreStats& s2) return true; } -} \ No newline at end of file +} diff --git a/mert/Singleton.h b/mert/Singleton.h index f50925fa4..df1386650 100644 --- a/mert/Singleton.h +++ b/mert/Singleton.h @@ -21,10 +21,8 @@ public: } static void Delete() { - if (m_instance) { - delete m_instance; - m_instance = NULL; - } 
+ delete m_instance; + m_instance = NULL; } private: diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp index 572dad097..111006981 100644 --- a/moses-chart-cmd/IOWrapper.cpp +++ b/moses-chart-cmd/IOWrapper.cpp @@ -50,7 +50,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "moses/FeatureVector.h" #include "moses/FF/StatefulFeatureFunction.h" #include "moses/FF/StatelessFeatureFunction.h" -#include "moses/FF/SyntaxConstraintFeature.h" +#include "moses/FF/TreeStructureFeature.h" #include "util/exception.hh" using namespace std; @@ -395,14 +395,16 @@ void IOWrapper::OutputDetailedTreeFragmentsTranslationReport( UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL, "No output file for tree fragments specified"); - //Tree of full sentence (to stderr) - const vector& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions(); - for( size_t i=0; iGetScoreProducerDescription() == "SyntaxConstraintFeature0") { - const TreeState* tree = dynamic_cast(hypo->GetFFState(i)); - out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n"; - break; + //Tree of full sentence + const StatefulFeatureFunction* treeStructure = StaticData::Instance().GetTreeStructure(); + if (treeStructure != NULL) { + const vector& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions(); + for( size_t i=0; i(hypo->GetFFState(i)); + out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n"; + break; + } } } diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp index dac7e85a0..deccf74e4 100644 --- a/moses/ChartParser.cpp +++ b/moses/ChartParser.cpp @@ -97,7 +97,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo("0-0"); - if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) { + if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.GetTreeStructure() != NULL) { targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]"); } diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 2f7fdc84f..c07f1bb77 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -34,7 +34,7 @@ #include "moses/FF/ExternalFeature.h" #include "moses/FF/ConstrainedDecoding.h" #include "moses/FF/CoveredReferenceFeature.h" -#include "moses/FF/SyntaxConstraintFeature.h" +#include "moses/FF/TreeStructureFeature.h" #include "moses/FF/SoftMatchingFeature.h" #include "moses/FF/HyperParameterAsWeight.h" @@ -174,7 +174,7 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(ConstrainedDecoding); MOSES_FNAME(CoveredReferenceFeature); MOSES_FNAME(ExternalFeature); - MOSES_FNAME(SyntaxConstraintFeature); + MOSES_FNAME(TreeStructureFeature); MOSES_FNAME(SoftMatchingFeature); MOSES_FNAME(HyperParameterAsWeight); diff --git a/moses/FF/LexicalReordering/LexicalReordering.cpp b/moses/FF/LexicalReordering/LexicalReordering.cpp index c73c0324b..6a2a488d9 100644 --- a/moses/FF/LexicalReordering/LexicalReordering.cpp +++ b/moses/FF/LexicalReordering/LexicalReordering.cpp @@ -52,8 +52,7 @@ LexicalReordering::LexicalReordering(const std::string &line) LexicalReordering::~LexicalReordering() { - if(m_table) - delete m_table; + delete m_table; delete m_configuration; } diff --git a/moses/FF/SyntaxConstraintFeature.cpp b/moses/FF/SyntaxConstraintFeature.cpp deleted file mode 100644 index b5cb71158..000000000 --- a/moses/FF/SyntaxConstraintFeature.cpp +++ /dev/null @@ -1,186 
+0,0 @@ -#include "SyntaxConstraintFeature.h" -#include "moses/ScoreComponentCollection.h" -#include "moses/Hypothesis.h" -#include "moses/ChartHypothesis.h" -#include "moses/TargetPhrase.h" -#include -#include - -using namespace std; - -namespace Moses -{ - -InternalTree::InternalTree(const std::string & line, const bool terminal) { - - size_t found = line.find_first_of("[] "); - m_isTerminal = terminal; - - if (found == line.npos) { - m_value = line; - } - - else { - AddSubTree(line, 0); - } -} - -size_t InternalTree::AddSubTree(const std::string & line, size_t pos) { - - std::string value = ""; - char token = 0; - - while (token != ']' && pos != std::string::npos) - { - size_t oldpos = pos; - pos = line.find_first_of("[] ", pos); - if (pos == std::string::npos) break; - token = line[pos]; - value = line.substr(oldpos,pos-oldpos); - - if (token == '[') { - if (m_value.size() > 0) { - TreePointer child(new InternalTree(value, false)); - m_children.push_back(child); - pos = child->AddSubTree(line, pos+1); - } - else { - if (value.size() > 0) { - m_value = value; - } - pos = AddSubTree(line, pos+1); - } - } - else if (token == ' ' || token == ']') { - if (value.size() > 0 && ! m_value.size() > 0) { - m_value = value; - } - else if (value.size() > 0) { - m_isTerminal = false; - TreePointer child(new InternalTree(value, true)); - m_children.push_back(child); - } - if (token == ' ') { - pos++; - } - } - - if (m_children.size() > 0) { - m_isTerminal = false; - } - } - - if (pos == std::string::npos) { - return line.size(); - } - return min(line.size(),pos+1); - -} - -std::string InternalTree::GetString() const { - - std::string ret = " "; - - if (!m_isTerminal) { - ret += "["; - } - - ret += m_value; - for (std::vector::const_iterator it = m_children.begin(); it != m_children.end(); ++it) - { - ret += (*it)->GetString(); - } - - if (!m_isTerminal) { - ret += "]"; - } - return ret; - -} - -void InternalTree::Combine(const std::vector &previous) { - - std::vector::iterator it; - bool found = false; - leafNT next_leafNT(this); - for (std::vector::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) { - found = next_leafNT(it); - if (found) { - *it = *it_prev; - } - else { - std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n"; - } - } -} - -bool InternalTree::FlatSearch(const std::string & label, std::vector::const_iterator & it) const { - for (it = m_children.begin(); it != m_children.end(); ++it) { - if ((*it)->GetLabel() == label) { - return true; - } - } - return false; -} - -bool InternalTree::RecursiveSearch(const std::string & label, std::vector::const_iterator & it) const { - for (it = m_children.begin(); it != m_children.end(); ++it) { - if ((*it)->GetLabel() == label) { - return true; - } - std::vector::const_iterator it2; - if ((*it)->RecursiveSearch(label, it2)) { - it = it2; - return true; - } - } - return false; -} - -bool InternalTree::RecursiveSearch(const std::string & label, std::vector::const_iterator & it, InternalTree const* &parent) const { - for (it = m_children.begin(); it != m_children.end(); ++it) { - if ((*it)->GetLabel() == label) { - parent = this; - return true; - } - std::vector::const_iterator it2; - if ((*it)->RecursiveSearch(label, it2, parent)) { - it = it2; - return true; - } - } - return false; -} - -FFState* SyntaxConstraintFeature::EvaluateChart(const ChartHypothesis& cur_hypo - , int featureID /* used to index the state in the previous hypotheses */ - , ScoreComponentCollection* accumulator) const -{ 
- std::string tree; - bool found = 0; - cur_hypo.GetCurrTargetPhrase().GetProperty("Tree", tree, found); - - TreePointer mytree (new InternalTree(tree)); - - //get subtrees (in target order) - std::vector previous_trees; - for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) { - const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos); - if (word.IsNonTerminal()) { - size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos]; - const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermInd); - const TreeState* prev = dynamic_cast(prevHypo->GetFFState(featureID)); - const TreePointer prev_tree = prev->GetTree(); - previous_trees.push_back(prev_tree); - } - } - - mytree->Combine(previous_trees); - - - return new TreeState(mytree); - -} - -} - diff --git a/moses/FF/TreeStructureFeature.cpp b/moses/FF/TreeStructureFeature.cpp new file mode 100644 index 000000000..aa879fe0e --- /dev/null +++ b/moses/FF/TreeStructureFeature.cpp @@ -0,0 +1,315 @@ +#include "TreeStructureFeature.h" +#include "moses/StaticData.h" +#include "moses/ScoreComponentCollection.h" +#include "moses/Hypothesis.h" +#include "moses/ChartHypothesis.h" +#include "moses/TargetPhrase.h" +#include +#include + +using namespace std; + +namespace Moses +{ + +InternalTree::InternalTree(const std::string & line, const bool terminal): + m_value_nt(0), + m_isTerminal(terminal) + { + + size_t found = line.find_first_of("[] "); + + if (found == line.npos) { + m_value = line; + } + + else { + AddSubTree(line, 0); + } +} + +size_t InternalTree::AddSubTree(const std::string & line, size_t pos) { + + std::string value = ""; + char token = 0; + + while (token != ']' && pos != std::string::npos) + { + size_t oldpos = pos; + pos = line.find_first_of("[] ", pos); + if (pos == std::string::npos) break; + token = line[pos]; + value = line.substr(oldpos,pos-oldpos); + + if (token == '[') { + if (m_value.size() > 0) { + TreePointer child(new InternalTree(value, false)); + m_children.push_back(child); + pos = child->AddSubTree(line, pos+1); + } + else { + if (value.size() > 0) { + m_value = value; + } + pos = AddSubTree(line, pos+1); + } + } + else if (token == ' ' || token == ']') { + if (value.size() > 0 && ! 
m_value.size() > 0) { + m_value = value; + } + else if (value.size() > 0) { + m_isTerminal = false; + TreePointer child(new InternalTree(value, true)); + m_children.push_back(child); + } + if (token == ' ') { + pos++; + } + } + + if (m_children.size() > 0) { + m_isTerminal = false; + } + } + + if (pos == std::string::npos) { + return line.size(); + } + return min(line.size(),pos+1); + +} + +std::string InternalTree::GetString() const { + + std::string ret = " "; + + if (!m_isTerminal) { + ret += "["; + } + + ret += m_value; + for (std::vector::const_iterator it = m_children.begin(); it != m_children.end(); ++it) + { + ret += (*it)->GetString(); + } + + if (!m_isTerminal) { + ret += "]"; + } + return ret; + +} + + +void InternalTree::Combine(const std::vector &previous) { + + std::vector::iterator it; + bool found = false; + leafNT next_leafNT(this); + for (std::vector::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) { + found = next_leafNT(it); + if (found) { + *it = *it_prev; + } + else { + std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n"; + } + } +} + + +bool InternalTree::FlatSearch(const std::string & label, std::vector::const_iterator & it) const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if ((*it)->GetLabel() == label) { + return true; + } + } + return false; +} + +bool InternalTree::RecursiveSearch(const std::string & label, std::vector::const_iterator & it) const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if ((*it)->GetLabel() == label) { + return true; + } + std::vector::const_iterator it2; + if ((*it)->RecursiveSearch(label, it2)) { + it = it2; + return true; + } + } + return false; +} + +bool InternalTree::RecursiveSearch(const std::string & label, std::vector::const_iterator & it, InternalTree const* &parent) const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if ((*it)->GetLabel() == label) { + parent = this; + return true; + } + std::vector::const_iterator it2; + if ((*it)->RecursiveSearch(label, it2, parent)) { + it = it2; + return true; + } + } + return false; +} + + +bool InternalTree::FlatSearch(const NTLabel & label, std::vector::const_iterator & it) const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if ((*it)->GetNTLabel() == label) { + return true; + } + } + return false; +} + +bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector::const_iterator & it) const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if ((*it)->GetNTLabel() == label) { + return true; + } + std::vector::const_iterator it2; + if ((*it)->RecursiveSearch(label, it2)) { + it = it2; + return true; + } + } + return false; +} + +bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector::const_iterator & it, InternalTree const* &parent) const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if ((*it)->GetNTLabel() == label) { + parent = this; + return true; + } + std::vector::const_iterator it2; + if ((*it)->RecursiveSearch(label, it2, parent)) { + it = it2; + return true; + } + } + return false; +} + + +bool InternalTree::FlatSearch(const std::vector & labels, std::vector::const_iterator & it) const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) { + return true; + } + } + return false; +} + +bool InternalTree::RecursiveSearch(const std::vector & labels, std::vector::const_iterator & it) 
const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) { + return true; + } + std::vector::const_iterator it2; + if ((*it)->RecursiveSearch(labels, it2)) { + it = it2; + return true; + } + } + return false; +} + +bool InternalTree::RecursiveSearch(const std::vector & labels, std::vector::const_iterator & it, InternalTree const* &parent) const { + for (it = m_children.begin(); it != m_children.end(); ++it) { + if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) { + parent = this; + return true; + } + std::vector::const_iterator it2; + if ((*it)->RecursiveSearch(labels, it2, parent)) { + it = it2; + return true; + } + } + return false; +} + + +void TreeStructureFeature::Load() { + + // syntactic constraints can be hooked in here. + m_constraints = NULL; + m_labelset = NULL; + + StaticData &staticData = StaticData::InstanceNonConst(); + staticData.SetTreeStructure(this); +} + + +// define NT labels (ints) that are mapped from strings for quicker comparison. +void TreeStructureFeature::AddNTLabels(TreePointer root) const { + std::string label = root->GetLabel(); + + if (root->IsTerminal()) { + return; + } + + std::map::const_iterator it = m_labelset->string_to_label.find(label); + if (it != m_labelset->string_to_label.end()) { + root->SetNTLabel(it->second); + } + + std::vector children = root->GetChildren(); + for (std::vector::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) { + AddNTLabels(*it2); + } +} + +FFState* TreeStructureFeature::EvaluateChart(const ChartHypothesis& cur_hypo + , int featureID /* used to index the state in the previous hypotheses */ + , ScoreComponentCollection* accumulator) const +{ + std::string tree; + bool found = 0; + cur_hypo.GetCurrTargetPhrase().GetProperty("Tree", tree, found); + if (found) { + TreePointer mytree (new InternalTree(tree)); + + if (m_labelset) { + AddNTLabels(mytree); + } + + //get subtrees (in target order) + std::vector previous_trees; + for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) { + const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos); + if (word.IsNonTerminal()) { + size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos]; + const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermInd); + const TreeState* prev = dynamic_cast(prevHypo->GetFFState(featureID)); + const TreePointer prev_tree = prev->GetTree(); + previous_trees.push_back(prev_tree); + } + } + + std::vector sparse_features; + if (m_constraints) { + sparse_features = m_constraints->SyntacticRules(mytree, previous_trees); + } + mytree->Combine(previous_trees); + + //sparse scores + for (std::vector::const_iterator feature=sparse_features.begin(); feature != sparse_features.end(); ++feature) { + accumulator->PlusEquals(this, *feature, 1); + } + return new TreeState(mytree); + } + else { + UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found"); + } + +} + +} + diff --git a/moses/FF/SyntaxConstraintFeature.h b/moses/FF/TreeStructureFeature.h similarity index 58% rename from moses/FF/SyntaxConstraintFeature.h rename to moses/FF/TreeStructureFeature.h index 06f1a1382..1a5b8b5e3 100644 --- a/moses/FF/SyntaxConstraintFeature.h +++ b/moses/FF/TreeStructureFeature.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "StatefulFeatureFunction.h" #include "FFState.h" #include @@ -12,14 +13,25 @@ namespace Moses class InternalTree; 
typedef boost::shared_ptr TreePointer; +typedef int NTLabel; class InternalTree { std::string m_value; +NTLabel m_value_nt; std::vector m_children; bool m_isTerminal; public: InternalTree(const std::string & line, const bool terminal = false); + InternalTree(const InternalTree & tree): + m_value(tree.m_value), + m_isTerminal(tree.m_isTerminal) { + const std::vector & children = tree.m_children; + for (std::vector::const_iterator it = children.begin(); it != children.end(); it++) { + TreePointer child (new InternalTree(**it)); + m_children.push_back(child); + } + } size_t AddSubTree(const std::string & line, size_t start); std::string GetString() const; @@ -27,6 +39,17 @@ public: const std::string & GetLabel() const { return m_value; } + + // optionally identify label by int instead of string; + // allows abstraction if multiple nonterminal strings should map to same label. + const NTLabel & GetNTLabel() const { + return m_value_nt; + } + + void SetNTLabel(NTLabel value) { + m_value_nt = value; + } + size_t GetLength() const { return m_children.size(); } @@ -45,6 +68,8 @@ public: return (!m_isTerminal && m_children.size() == 0); } + // different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents. + // can be used for formulating syntax constraints. // if found, 'it' is iterator to first tree node that matches search string bool FlatSearch(const std::string & label, std::vector::const_iterator & it) const; @@ -53,6 +78,41 @@ public: // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node bool RecursiveSearch(const std::string & label, std::vector::const_iterator & it, InternalTree const* &parent) const; + // use NTLabel for search to reduce number of string comparisons / deal with synonymous labels + // if found, 'it' is iterator to first tree node that matches search string + bool FlatSearch(const NTLabel & label, std::vector::const_iterator & it) const; + bool RecursiveSearch(const NTLabel & label, std::vector::const_iterator & it) const; + + // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node + bool RecursiveSearch(const NTLabel & label, std::vector::const_iterator & it, InternalTree const* &parent) const; + + // pass vector of possible labels to search + // if found, 'it' is iterator to first tree node that matches search string + bool FlatSearch(const std::vector & labels, std::vector::const_iterator & it) const; + bool RecursiveSearch(const std::vector & labels, std::vector::const_iterator & it) const; + + // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node + bool RecursiveSearch(const std::vector & labels, std::vector::const_iterator & it, InternalTree const* &parent) const; + + +}; + +// mapping from string nonterminal label to int representation. +// allows abstraction if multiple nonterminal strings should map to same label. +struct LabelSet +{ +public: + std::map string_to_label; +}; + + +// class to implement language-specific syntactic constraints. +// the method SyntacticRules must return a vector of strings (each identifying a constraint violation), which are then made into sparse features. 
+class SyntaxConstraints +{ +public: + virtual std::vector SyntacticRules(TreePointer root, const std::vector &previous) = 0; + virtual ~SyntaxConstraints() {}; }; @@ -71,18 +131,23 @@ public: int Compare(const FFState& other) const {return 0;}; }; -class SyntaxConstraintFeature : public StatefulFeatureFunction +class TreeStructureFeature : public StatefulFeatureFunction { + SyntaxConstraints* m_constraints; + LabelSet* m_labelset; public: - SyntaxConstraintFeature(const std::string &line) + TreeStructureFeature(const std::string &line) :StatefulFeatureFunction(0, line) { ReadParameters(); } + ~TreeStructureFeature() {delete m_constraints;}; virtual const FFState* EmptyHypothesisState(const InputType &input) const { return new TreeState(TreePointer()); } + void AddNTLabels(TreePointer root) const; + bool IsUseable(const FactorMask &mask) const { return true; } @@ -105,6 +170,7 @@ public: int /* featureID - used to index the state in the previous hypotheses */, ScoreComponentCollection* accumulator) const; + void Load(); }; // Python-like generator that yields next nonterminal leaf on every call diff --git a/moses/LM/DALMWrapper.cpp b/moses/LM/DALMWrapper.cpp index 269cd77d0..33ce4c1f4 100644 --- a/moses/LM/DALMWrapper.cpp +++ b/moses/LM/DALMWrapper.cpp @@ -9,8 +9,8 @@ #include "moses/FactorCollection.h" #include "moses/InputFileStream.h" #include "util/exception.hh" -#include "ChartState.h" -#include "util/exception.hh" +#include "moses/ChartHypothesis.h" +#include "moses/ChartManager.h" using namespace std; @@ -58,6 +58,16 @@ public: delete state; } + void reset(const DALMState &from){ + delete state; + state = new DALM::State(*from.state); + } + + void reset(DALM::State *s){ + delete state; + state = s; + } + virtual int Compare(const FFState& other) const{ const DALMState &o = static_cast(other); if(state->get_count() < o.state->get_count()) return -1; @@ -74,6 +84,83 @@ public: } }; +class DALMChartState : public FFState +{ +private: + const ChartHypothesis &hypo; + DALM::Fragment *prefixFragments; + unsigned short prefixLength; + float prefixScore; + DALMState *rightContext; + bool isLarge; + +public: + DALMChartState( + const ChartHypothesis &hypo, + DALM::Fragment *prefixFragments, + unsigned short prefixLength, + float prefixScore, + DALMState *rightContext, + bool isLarge) + : hypo(hypo), + prefixFragments(prefixFragments), + prefixLength(prefixLength), + prefixScore(prefixScore), + rightContext(rightContext), + isLarge(isLarge) + {} + + virtual ~DALMChartState(){ + delete [] prefixFragments; + delete rightContext; + } + + unsigned short GetPrefixLength() const{ + return prefixLength; + } + + const DALM::Fragment *GetPrefixFragments() const{ + return prefixFragments; + } + + float GetPrefixScore() const{ + return prefixScore; + } + + const DALMState *GetRightContext() const{ + return rightContext; + } + + bool LargeEnough() const{ + return isLarge; + } + + virtual int Compare(const FFState& other) const{ + const DALMChartState &o = static_cast(other); + // prefix + if (hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for " ..." 
+ if (prefixLength != o.prefixLength){ + return (prefixLength < o.prefixLength)?-1:1; + } else { + if(prefixLength > 0){ + DALM::Fragment &f = prefixFragments[prefixLength-1]; + DALM::Fragment &of = o.prefixFragments[prefixLength-1]; + int ret = DALM::compare_fragments(f, of); + if(ret != 0) return ret; + } + } + } + + // suffix + size_t inputSize = hypo.GetManager().GetSource().GetSize(); + if (hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... " + int ret = o.rightContext->Compare(*rightContext); + if (ret != 0) return ret; + } + return 0; + } +}; + LanguageModelDALM::LanguageModelDALM(const std::string &line) :LanguageModel(line) { @@ -96,7 +183,7 @@ void LanguageModelDALM::Load() ///////////////////// // READING INIFILE // ///////////////////// - string inifile= m_filePath + "/dalm.ini"; + string inifile= m_filePath + "/dalm.ini"; string model; // Path to the double-array file. string words; // Path to the vocabulary file. @@ -104,8 +191,8 @@ void LanguageModelDALM::Load() read_ini(inifile.c_str(), model, words, wordstxt); model = m_filePath + "/" + model; - words = m_filePath + "/" + words; - wordstxt = m_filePath + "/" + wordstxt; + words = m_filePath + "/" + words; + wordstxt = m_filePath + "/" + wordstxt; UTIL_THROW_IF(model.empty() || words.empty() || wordstxt.empty(), util::FileOpenException, @@ -150,60 +237,40 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float size_t phraseSize = phrase.GetSize(); if (!phraseSize) return; - DALMState *dalm_state = new DALMState(m_nGramOrder); - size_t currPos = 0; size_t hist_count = 0; + DALMState *dalm_state = new DALMState(m_nGramOrder); + DALM::State *state = dalm_state->get_state(); + + if(phrase.GetWord(0).GetFactor(m_factorType) == m_beginSentenceFactor){ + m_lm->init_state(*state); + currPos++; + hist_count++; + } while (currPos < phraseSize) { const Word &word = phrase.GetWord(currPos); hist_count++; if (word.IsNonTerminal()) { - // do nothing. reset ngram. needed to score target phrases during pt loading in chart decoding - dalm_state->refresh(); + state->refresh(); hist_count = 0; } else { - if (word.GetFactor(m_factorType) == m_beginSentenceFactor) { - // do nothing, don't include prob for unigram - if (currPos != 0) { - UTIL_THROW2("Either your data contains in a position other than the first word or your language model is missing . Did you build your ARPA using IRSTLM and forget to run add-start-end.sh?"); - } - m_lm->init_state(*dalm_state->get_state()); - } else { - LMResult result = GetValue(word, dalm_state->get_state()); - fullScore += result.score; - if (hist_count >= m_nGramOrder) ngramScore += result.score; - if (result.unknown) ++oovCount; - } + DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType)); + float score = m_lm->query(wid, *state); + fullScore += score; + if (hist_count >= m_nGramOrder) ngramScore += score; + if (wid==m_vocab->unk()) ++oovCount; } currPos++; } + + fullScore = TransformLMScore(fullScore); + ngramScore = TransformLMScore(ngramScore); delete dalm_state; } -LMResult LanguageModelDALM::GetValue(DALM::VocabId wid, DALM::State* finalState) const{ - LMResult ret; - - // last word is unk? - ret.unknown = (wid == m_vocab->unk()); - - // calc score. 
- float score = m_lm->query(wid, *finalState); - score = TransformLMScore(score); - ret.score = score; - - return ret; -} - -LMResult LanguageModelDALM::GetValue(const Word &word, DALM::State* finalState) const -{ - DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType)); - - return GetValue(wid, finalState); -} - FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const{ // In this function, we only compute the LM scores of n-grams that overlap a // phrase boundary. Phrase-internal scores are taken directly from the @@ -222,28 +289,28 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps, const std::size_t adjust_end = std::min(end, begin + m_nGramOrder - 1); DALMState *dalm_state = new DALMState(*dalm_ps); + DALM::State *state = dalm_state->get_state(); - std::size_t position = begin; float score = 0.0; - for(; position < adjust_end; position++){ - score += GetValue(hypo.GetWord(position), dalm_state->get_state()).score; + for(std::size_t position=begin; position < adjust_end; position++){ + score += m_lm->query(GetVocabId(hypo.GetWord(position).GetFactor(m_factorType)), *state); } if (hypo.IsSourceCompleted()) { // Score end of sentence. std::vector indices(m_nGramOrder-1); const DALM::VocabId *last = LastIDs(hypo, &indices.front()); - m_lm->set_state(&indices.front(), (last-&indices.front()), *dalm_state->get_state()); + m_lm->set_state(&indices.front(), (last-&indices.front()), *state); - float s = GetValue(wid_end, dalm_state->get_state()).score; - score += s; + score += m_lm->query(wid_end, *state); } else if (adjust_end < end) { // Get state after adding a long phrase. std::vector indices(m_nGramOrder-1); const DALM::VocabId *last = LastIDs(hypo, &indices.front()); - m_lm->set_state(&indices.front(), (last-&indices.front()), *dalm_state->get_state()); + m_lm->set_state(&indices.front(), (last-&indices.front()), *state); } + score = TransformLMScore(score); if (OOVFeatureEnabled()) { std::vector scores(2); scores[0] = score; @@ -257,129 +324,184 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps, } FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const{ - LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, m_nGramOrder); // initialize language model context state DALMState *dalm_state = new DALMState(m_nGramOrder); + DALM::State *state = dalm_state->get_state(); + + size_t contextSize = m_nGramOrder-1; + DALM::Fragment *prefixFragments = new DALM::Fragment[contextSize]; + unsigned short prefixLength = 0; + bool isLarge = false; // initial language model scores float prefixScore = 0.0; // not yet final for initial words (lack context) - float finalizedScore = 0.0; // finalized, has sufficient context + float hypoScore = 0.0; // total hypothesis score. + + const TargetPhrase &targetPhrase = hypo.GetCurrTargetPhrase(); + size_t hypoSize = targetPhrase.GetSize(); // get index map for underlying hypotheses const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = - hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap(); + targetPhrase.GetAlignNonTerm().GetNonTermIndexMap(); + + size_t phrasePos = 0; + + // begginig of sentence. 
+ if(hypoSize > 0){ + const Word &word = targetPhrase.GetWord(0); + if(!word.IsNonTerminal()){ + DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType)); + if(word.GetFactor(m_factorType) == m_beginSentenceFactor){ + m_lm->init_state(*state); + // state is finalized. + isLarge = true; + }else{ + if(isLarge){ + float score = m_lm->query(wid, *state); + hypoScore += score; + }else{ + float score = m_lm->query(wid, *state, prefixFragments[prefixLength]); + + prefixScore += score; + hypoScore += score; + prefixLength++; + if(prefixLength >= contextSize) isLarge = true; + } + } + }else{ + // special case: rule starts with non-terminal -> copy everything + size_t nonTermIndex = nonTermIndexMap[0]; + const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex); + + const DALMChartState* prevState = + static_cast(prevHypo->GetFFState(featureID)); + + // get prefixScore and hypoScore + prefixScore = prevState->GetPrefixScore(); + hypoScore = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]); + + // get language model state + dalm_state->reset(*prevState->GetRightContext()); + state = dalm_state->get_state(); + + prefixLength = prevState->GetPrefixLength(); + const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments(); + std::memcpy(prefixFragments, prevPrefixFragments, sizeof(DALM::Fragment)*prefixLength); + isLarge = prevState->LargeEnough(); + } + phrasePos++; + } // loop over rule - for (size_t phrasePos = 0, wordPos = 0; - phrasePos < hypo.GetCurrTargetPhrase().GetSize(); - phrasePos++) { + for (; phrasePos < hypoSize; phrasePos++) { // consult rule for either word or non-terminal - const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos); + const Word &word = targetPhrase.GetWord(phrasePos); // regular word if (!word.IsNonTerminal()) { - // beginning of sentence symbol ? 
-> just update state - if (word.GetFactor(m_factorType) == m_beginSentenceFactor) { - UTIL_THROW_IF2(phrasePos != 0, - "Sentence start symbol must be at the beginning of sentence"); - m_lm->init_state(*dalm_state->get_state()); - } - // score a regular word added by the rule - else { - updateChartScore( &prefixScore, &finalizedScore, GetValue(word, dalm_state->get_state()).score, ++wordPos ); - } + DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType)); + if (isLarge) { + hypoScore += m_lm->query(wid, *state); + }else{ + float score = m_lm->query(wid, *state, prefixFragments[prefixLength]); + prefixScore += score; + hypoScore += score; + prefixLength++; + if(prefixLength >= contextSize) isLarge = true; + } } // non-terminal, add phrase from underlying hypothesis + // internal non-terminal else { // look up underlying hypothesis size_t nonTermIndex = nonTermIndexMap[phrasePos]; const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex); - const LanguageModelChartState* prevState = - static_cast(prevHypo->GetFFState(featureID)); + const DALMChartState* prevState = + static_cast(prevHypo->GetFFState(featureID)); - size_t subPhraseLength = prevState->GetNumTargetTerminals(); - // special case: rule starts with non-terminal -> copy everything - if (phrasePos == 0) { + size_t prevPrefixLength = prevState->GetPrefixLength(); + const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments(); + DALM::Gap gap(*state); + // score its prefix + for(size_t prefixPos = 0; prefixPos < prevPrefixLength; prefixPos++) { + const DALM::Fragment &f = prevPrefixFragments[prefixPos]; - // get prefixScore and finalizedScore - prefixScore = prevState->GetPrefixScore(); - finalizedScore = prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] - prefixScore; - - // get language model state - delete dalm_state; - dalm_state = new DALMState( *static_cast(prevState->GetRightContext()) ); - wordPos += subPhraseLength; + if (isLarge) { + hypoScore += m_lm->query(f, *state, gap); + } else { + float score = m_lm->query(f, *state, gap, prefixFragments[prefixLength]); + prefixScore += score; + hypoScore += score; + prefixLength++; + if(prefixLength >= contextSize) isLarge = true; + } + gap.succ(); } - // internal non-terminal - else { - // score its prefix - size_t wpos = wordPos; - for(size_t prefixPos = 0; - prefixPos < m_nGramOrder-1 // up to LM order window - && prefixPos < subPhraseLength; // up to length - prefixPos++) { - const Word &word = prevState->GetPrefix().GetWord(prefixPos); - updateChartScore( &prefixScore, &finalizedScore, GetValue(word, dalm_state->get_state()).score, ++wpos ); - } - wordPos += subPhraseLength; - - // check if we are dealing with a large sub-phrase - if (subPhraseLength > m_nGramOrder - 1) { - // add its finalized language model score - finalizedScore += - prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] // full score - - prevState->GetPrefixScore(); // - prefix score - - // copy language model state - delete dalm_state; - dalm_state = new DALMState( *static_cast(prevState->GetRightContext()) ); - } - } + // check if we are dealing with a large sub-phrase + if (prevState->LargeEnough()) { + // add its language model score + hypoScore += UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]); + hypoScore -= prevState->GetPrefixScore(); // remove overwrapped score. 
+ // copy language model state + dalm_state->reset(*prevState->GetRightContext()); + state = dalm_state->get_state(); + } else { + DALM::State *state_new = new DALM::State(*prevState->GetRightContext()->get_state()); + m_lm->set_state(*state_new, *state, gap); + dalm_state->reset(state_new); + state = dalm_state->get_state(); + } } } // assign combined score to score breakdown - out->Assign(this, prefixScore + finalizedScore); + out->Assign(this, TransformLMScore(hypoScore)); - ret->Set(prefixScore, dalm_state); - return ret; + return new DALMChartState(hypo, prefixFragments, prefixLength, prefixScore, dalm_state, isLarge); } bool LanguageModelDALM::IsUseable(const FactorMask &mask) const { - bool ret = mask[m_factorType]; - return ret; + return mask[m_factorType]; } void LanguageModelDALM::CreateVocabMapping(const std::string &wordstxt) { InputFileStream vocabStrm(wordstxt); + std::vector< std::pair > vlist; string line; + std::size_t max_fid = 0; while(getline(vocabStrm, line)) { const Factor *factor = FactorCollection::Instance().AddFactor(line); + std::size_t fid = factor->GetId(); DALM::VocabId wid = m_vocab->lookup(line.c_str()); - VocabMap::value_type entry(factor, wid); - m_vocabMap.insert(entry); + vlist.push_back(std::pair(fid, wid)); + if(max_fid < fid) max_fid = fid; } + for(std::size_t i = 0; i < m_vocabMap.size(); i++){ + m_vocabMap[i] = m_vocab->unk(); + } + + m_vocabMap.resize(max_fid+1, m_vocab->unk()); + std::vector< std::pair >::iterator it = vlist.begin(); + while(it != vlist.end()){ + std::pair &entry = *it; + m_vocabMap[entry.first] = entry.second; + + ++it; + } } DALM::VocabId LanguageModelDALM::GetVocabId(const Factor *factor) const { - VocabMap::left_map::const_iterator iter; - iter = m_vocabMap.left.find(factor); - if (iter != m_vocabMap.left.end()) { - return iter->second; - } - else { - // not in mapping. Must be UNK - return m_vocab->unk(); - } + std::size_t fid = factor->GetId(); + return (m_vocabMap.size() > fid)? m_vocabMap[fid] : m_vocab->unk(); } void LanguageModelDALM::SetParameter(const std::string& key, const std::string& value) @@ -395,13 +517,4 @@ void LanguageModelDALM::SetParameter(const std::string& key, const std::string& } } -void LanguageModelDALM::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const -{ - if (wordPos < m_nGramOrder) { - *prefixScore += score; - } else { - *finalizedScore += score; - } -} - } diff --git a/moses/LM/DALMWrapper.h b/moses/LM/DALMWrapper.h index 82b178544..e17aeb851 100644 --- a/moses/LM/DALMWrapper.h +++ b/moses/LM/DALMWrapper.h @@ -53,17 +53,12 @@ protected: DALM::LM *m_lm; DALM::VocabId wid_start, wid_end; - typedef boost::bimap VocabMap; - mutable VocabMap m_vocabMap; + mutable std::vector m_vocabMap; void CreateVocabMapping(const std::string &wordstxt); DALM::VocabId GetVocabId(const Factor *factor) const; private: - LMResult GetValue(DALM::VocabId wid, DALM::State* finalState) const; - LMResult GetValue(const Word &word, DALM::State* finalState) const; - void updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const; - // Convert last words of hypothesis into vocab ids, returning an end pointer. 
DALM::VocabId *LastIDs(const Hypothesis &hypo, DALM::VocabId *indices) const { DALM::VocabId *index = indices; diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile index 8c155c5d9..4f964ddd8 100644 --- a/moses/LM/Jamfile +++ b/moses/LM/Jamfile @@ -94,9 +94,16 @@ if $(with-nplm) { local with-dalm = [ option.get "with-dalm" ] ; if $(with-dalm) { lib dalm : : $(with-dalm)/lib ; - lib MurmurHash3 : : $(with-dalm)/lib ; - obj DALM.o : DALMWrapper.cpp dalm MurmurHash3 ..//headers : $(with-dalm)/include $(with-dalm)/darts-clone ; - alias dalmALIAS : DALM.o dalm MurmurHash3 : : : LM_DALM ; + + if [ path.exists $(with-dalm)/lib/libMurmurHash3.a ] { + lib MurmurHash3 : : $(with-dalm)/lib ; + alias dalm-libs : dalm MurmurHash3 ; + } else { + alias dalm-libs : dalm ; + } + + obj DALM.o : DALMWrapper.cpp dalm-libs ..//headers : $(with-dalm)/include $(with-dalm)/darts-clone ; + alias dalmALIAS : DALM.o dalm-libs : : : LM_DALM ; dependencies += dalmALIAS ; lmmacros += LM_DALM ; } diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 782144360..9ab1564fc 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -66,6 +66,7 @@ StaticData::StaticData() ,m_lmEnableOOVFeature(false) ,m_isAlwaysCreateDirectTranslationOption(false) ,m_currentWeightSetting("default") + ,m_treeStructure(NULL) { m_xmlBrackets.first="<"; m_xmlBrackets.second=">"; @@ -1184,5 +1185,52 @@ void StaticData::CheckLEGACYPT() } +void StaticData::ResetWeights(const std::string &denseWeights, const std::string &sparseFile) +{ + m_allWeights = ScoreComponentCollection(); + + // dense weights + string name(""); + vector weights; + vector toks = Tokenize(denseWeights); + for (size_t i = 0; i < toks.size(); ++i) { + const string &tok = toks[i]; + + if (tok.substr(tok.size() - 1, 1) == "=") { + // start of new feature + + if (name != "") { + // save previous ff + const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name); + m_allWeights.Assign(&ff, weights); + weights.clear(); + } + + name = tok.substr(0, tok.size() - 1); + } else { + // a weight for curr ff + float weight = Scan(toks[i]); + weights.push_back(weight); + } + } + + const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name); + m_allWeights.Assign(&ff, weights); + + // sparse weights + InputFileStream sparseStrme(sparseFile); + string line; + while (getline(sparseStrme, line)) { + vector toks = Tokenize(line); + UTIL_THROW_IF2(toks.size() != 2, "Incorrect sparse weight format. Should be FFName_spareseName weight"); + + vector names = Tokenize(toks[0], "_"); + UTIL_THROW_IF2(names.size() != 2, "Incorrect sparse weight name. 
Should be FFName_spareseName"); + + const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(names[0]); + m_allWeights.Assign(&ff, names[1], Scan(toks[1])); + } +} + } // namespace diff --git a/moses/StaticData.h b/moses/StaticData.h index def81afae..51db96958 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -221,6 +221,8 @@ protected: std::map > m_soft_matches_map; std::map > m_soft_matches_map_reverse; + const StatefulFeatureFunction* m_treeStructure; + public: bool IsAlwaysCreateDirectTranslationOption() const { @@ -756,6 +758,20 @@ public: bool AdjacentOnly() const { return m_adjacentOnly; } + + + void ResetWeights(const std::string &denseWeights, const std::string &sparseFile); + + + // need global access for output of tree structure + const StatefulFeatureFunction* GetTreeStructure() const { + return m_treeStructure; + } + + void SetTreeStructure(const StatefulFeatureFunction* treeStructure) { + m_treeStructure = treeStructure; + } + }; } diff --git a/moses/TranslationModel/DynSAInclude/onlineRLM.h b/moses/TranslationModel/DynSAInclude/onlineRLM.h index 929602399..1e7a9c2d6 100644 --- a/moses/TranslationModel/DynSAInclude/onlineRLM.h +++ b/moses/TranslationModel/DynSAInclude/onlineRLM.h @@ -43,10 +43,10 @@ public: alpha_[i] = i * log10(0.4); } ~OnlineRLM() { - if(alpha_) delete[] alpha_; + delete[] alpha_; if(bAdapting_) delete vocab_; else vocab_ = NULL; - if(cache_) delete cache_; + delete cache_; delete bPrefix_; delete bHit_; } diff --git a/moses/TranslationModel/PhraseDictionary.cpp b/moses/TranslationModel/PhraseDictionary.cpp index ef91d520f..f42dc5245 100644 --- a/moses/TranslationModel/PhraseDictionary.cpp +++ b/moses/TranslationModel/PhraseDictionary.cpp @@ -58,8 +58,7 @@ const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY( size_t hash = hash_value(src); - std::map >::iterator iter; - + CacheColl::iterator iter; iter = cache.find(hash); if (iter == cache.end()) { @@ -179,7 +178,7 @@ void PhraseDictionary::ReduceCache() const // find cutoff for last used time priority_queue< clock_t > lastUsedTimes; - std::map >::iterator iter; + CacheColl::iterator iter; iter = cache.begin(); while( iter != cache.end() ) { lastUsedTimes.push( iter->second.second ); @@ -193,7 +192,7 @@ void PhraseDictionary::ReduceCache() const iter = cache.begin(); while( iter != cache.end() ) { if (iter->second.second < cutoffLastUsedTime) { - std::map >::iterator iterRemove = iter++; + CacheColl::iterator iterRemove = iter++; delete iterRemove->second.first; cache.erase(iterRemove); } else iter++; diff --git a/moses/TranslationModel/PhraseDictionary.h b/moses/TranslationModel/PhraseDictionary.h index d158d394b..c6639137a 100644 --- a/moses/TranslationModel/PhraseDictionary.h +++ b/moses/TranslationModel/PhraseDictionary.h @@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include +#include #ifdef WITH_THREADS #include @@ -54,7 +55,7 @@ class ChartCellCollectionBase; class ChartRuleLookupManager; class ChartParser; -class CacheColl : public std::map > +class CacheColl : public boost::unordered_map > { // 1st = hash of source phrase/ address of phrase-table node // 2nd = all translations diff --git a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp index b3a1e0296..84ab07532 100644 --- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp +++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp @@ -59,7 +59,7 @@ 
void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input CacheColl &cache = GetCache(); - std::map >::iterator iter; + CacheColl::iterator iter; iter = cache.find(hash); if (iter != cache.end()) { diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp index 45b881765..fc3ffff06 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp @@ -165,7 +165,7 @@ const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollection( CacheColl &cache = GetCache(); size_t hash = (size_t) ptNode->GetFilePos(); - std::map >::iterator iter; + CacheColl::iterator iter; iter = cache.find(hash); diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp index e2814f33c..a975b4126 100644 --- a/phrase-extract/ExtractionPhrasePair.cpp +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -47,8 +47,8 @@ ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource, m_count(count), m_pcfgSum(pcfgSum) { - assert(phraseSource.empty()); - assert(phraseTarget.empty()); + assert(phraseSource->empty()); + assert(phraseTarget->empty()); m_count = count; m_pcfgSum = pcfgSum; diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index b318561df..de0d7f646 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -235,8 +235,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC // SCORES ... string directScores, directSparseScores, indirectScores, indirectSparseScores; - breakdownCoreAndSparse( itemDirect[2], directScores, directSparseScores ); - breakdownCoreAndSparse( itemIndirect[2], indirectScores, indirectSparseScores ); + breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores ); + breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores ); vector directCounts = tokenize(itemDirect[4].c_str()); vector indirectCounts = tokenize(itemIndirect[4].c_str()); @@ -307,7 +307,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC } // alignment - fileConsolidated << " ||| " << itemDirect[3]; + fileConsolidated << " ||| " << itemDirect[2]; // counts, for debugging fileConsolidated << "||| " << countE << " " << countF << " " << countEF; diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp index b2cde6d64..bc8fd7233 100644 --- a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp +++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp @@ -166,8 +166,9 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out) void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g) { Write(rule,false); - m_fwd << " Tree "; + m_fwd << " {{Tree "; g.PrintTree(m_fwd); + m_fwd << "}}"; m_fwd << std::endl; m_inv << std::endl; } diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 65a12d176..df1c08800 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -506,7 +506,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ) { - assert(phrasePair.isValid()); + assert(phrasePair.IsValid()); const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource(); float count = phrasePair.GetCount(); @@ -555,6 +555,51 @@ void 
outputPhrasePair(const ExtractionPhrasePair &phrasePair, phraseTableFile << " ||| "; } + // alignment + if ( hierarchicalFlag ) { + // always output alignment if hiero style + assert(phraseTarget->size() == bestAlignmentT2S->size()+1); + std::vector alignment; + for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) { + if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) { + if ( bestAlignmentT2S->at(j).size() != 1 ) { + std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl; + phraseTableFile.flush(); + assert(bestAlignmentT2S->at(j).size() == 1); + } + size_t sourcePos = *(bestAlignmentT2S->at(j).begin()); + //phraseTableFile << sourcePos << "-" << j << " "; + std::stringstream point; + point << sourcePos << "-" << j; + alignment.push_back(point.str()); + } else { + for ( std::set::iterator setIter = (bestAlignmentT2S->at(j)).begin(); + setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) { + size_t sourcePos = *setIter; + std::stringstream point; + point << sourcePos << "-" << j; + alignment.push_back(point.str()); + } + } + } + // now print all alignments, sorted by source index + sort(alignment.begin(), alignment.end()); + for (size_t i = 0; i < alignment.size(); ++i) { + phraseTableFile << alignment[i] << " "; + } + } else if ( !inverseFlag && wordAlignmentFlag) { + // alignment info in pb model + for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) { + for ( std::set::iterator setIter = (bestAlignmentT2S->at(j)).begin(); + setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) { + size_t sourcePos = *setIter; + phraseTableFile << sourcePos << "-" << j << " "; + } + } + } + + phraseTableFile << " ||| "; + // lexical translation probability if (lexFlag) { double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S ); @@ -596,53 +641,6 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, phraseTableFile << " " << i->first << " " << i->second; } - phraseTableFile << " ||| "; - - // output alignment info - if ( !inverseFlag ) { - if ( hierarchicalFlag ) { - // always output alignment if hiero style - assert(phraseTarget->size() == bestAlignmentT2S->size()+1); - std::vector alignment; - for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) { - if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) { - if ( bestAlignmentT2S->at(j).size() != 1 ) { - std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." 
                  << std::endl;
-          phraseTableFile.flush();
-          assert(bestAlignmentT2S->at(j).size() == 1);
-        }
-        size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
-        //phraseTableFile << sourcePos << "-" << j << " ";
-        std::stringstream point;
-        point << sourcePos << "-" << j;
-        alignment.push_back(point.str());
-      } else {
-        for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
-              setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
-          size_t sourcePos = *setIter;
-          std::stringstream point;
-          point << sourcePos << "-" << j;
-          alignment.push_back(point.str());
-        }
-      }
-    }
-    // now print all alignments, sorted by source index
-    sort(alignment.begin(), alignment.end());
-    for (size_t i = 0; i < alignment.size(); ++i) {
-      phraseTableFile << alignment[i] << " ";
-    }
-  } else if (wordAlignmentFlag) {
-    // alignment info in pb model
-    for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
-      for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
-            setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
-        size_t sourcePos = *setIter;
-        phraseTableFile << sourcePos << "-" << j << " ";
-      }
-    }
-  }
-  }
-
   // counts
   phraseTableFile << " ||| " << totalCount << " " << count;
 
   if (kneserNeyFlag)
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 7a72b1f95..327106d6f 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -236,9 +236,8 @@ tokenize-tuning
 factorize-tuning
     in: tokenized-tuning
    out: factorized-tuning
-   rerun-on-change: TRAINING:output-factors
    default-name: lm/interpolate-tuning.factored
-   pass-unless: factors
+   pass-unless: TRAINING:output-factors
    parallelizable: yes
    error: can't open
    error: incompatible number of words in factor
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 212260226..6e549c008 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -981,6 +981,9 @@ sub define_step {
     elsif ($DO_STEP[$i] eq 'TRAINING:create-config' || $DO_STEP[$i] eq 'TRAINING:create-config-interpolated-lm') {
         &define_training_create_config($i);
     }
+    elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:factorize-tuning') {
+        &define_interpolated_lm_factorize_tuning($i);
+    }
     elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:interpolate') {
         &define_interpolated_lm_interpolate($i);
     }
@@ -1512,6 +1515,21 @@ sub define_lm_factorize {
     &create_step($step_id,$cmd);
 }
 
+sub define_interpolated_lm_factorize_tuning {
+    my ($step_id) = @_;
+    my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
+
+    my ($output,$input) = &get_output_and_input($step_id);
+    my $factor = &check_backoff_and_get_array("TRAINING:output-factors");
+
+    my $dir = &check_and_get("GENERAL:working-dir");
+    my $temp_dir = &check_and_get("INPUT-FACTOR:temp-dir") . ".$VERSION";
+    my $cmd = "mkdir -p $temp_dir\n"
+        . &factorize_one_language("OUTPUT-FACTOR",$input,$output,$factor,$step_id);
+
+    &create_step($step_id,$cmd);
+}
+
 sub define_splitter_train {
     my ($step_id,$set) = @_;
 
@@ -1986,6 +2004,10 @@ sub define_training_extract_phrases {
     if (&get("TRAINING:use-ghkm")) {
       $cmd .= "-ghkm ";
     }
+
+    if (&get("TRAINING:ghkm-tree-fragments")) {
+      $cmd .= "-ghkm-tree-fragments ";
+    }
   }
 
   my $extract_settings = &get("TRAINING:extract-settings");
@@ -2013,6 +2035,12 @@ sub define_training_build_ttable {
 
     $cmd .= "-no-word-alignment " if defined($word_alignment) && $word_alignment eq "no";
     $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
+
+    if (&get("TRAINING:hierarchical-rule-set")) {
+      if (&get("TRAINING:ghkm-tree-fragments")) {
+        $cmd .= "-ghkm-tree-fragments ";
+      }
+    }
 
     &create_step($step_id,$cmd);
 }
@@ -2267,6 +2295,7 @@ sub define_interpolated_lm_interpolate {
      $interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
   my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
   my $group = &get("INTERPOLATED-LM:group");
+  my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
 
   my $cmd = "";
 
@@ -2299,9 +2328,12 @@ sub define_interpolated_lm_interpolate {
     $group_string =~ s/ $//;
     $group_string .= " ";
     while($group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
-      die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
-        if ! defined($POSITION{$1});
-      $numbered_string .= $POSITION{$1}.$2;
+      # die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
+      #   if ! defined($POSITION{$1});
+# detect that elsewhere!
+      if (defined($POSITION{$1})) {
+        $numbered_string .= $POSITION{$1}.$2;
+      }
       $group_string = $3;
     }
     chop($numbered_string);
@@ -2313,7 +2345,12 @@ sub define_interpolated_lm_interpolate {
       $name .= ".$$FACTOR[$factor]" if defined($FACTOR);
       $name .= ".order$order";
     }
-    $cmd .= "$interpolation_script --tuning $tuning --name $name --srilm $srilm_dir --lm $lm_list";
+    my $factored_tuning = $tuning;
+    if (&backoff_and_get("TRAINING:output-factors")) {
+      $factored_tuning = "$tuning.factor$factor";
+      $cmd .= "$scripts/training/reduce-factors.perl --corpus $tuning --reduced $factored_tuning --factor $factor\n";
+    }
+    $cmd .= "$interpolation_script --tuning $factored_tuning --name $name --srilm $srilm_dir --lm $lm_list";
     $cmd .= " --group \"$numbered_string\"" if defined($group);
     $cmd .= "\n";
   }
diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl
index 8a1ba4c76..59a83ec91 100755
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@@ -86,15 +86,23 @@ sub split_xml {
   my $i = 0;
   $MARKUP[0] = "";
   while($line =~ /\S/) {
+    # XML tag
     if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
       $MARKUP[$i] .= $1." ";
       $line = $2;
     }
+    # non-XML text
     elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
       $WORD[$i++] = $1;
       $MARKUP[$i] = "";
       $line = $2;
     }
+    # '<' or '>' occurs in word, but it's not an XML tag
+    elsif ($line =~ /^\s*(\S+)(.*)$/) {
+      $WORD[$i++] = $1;
+      $MARKUP[$i] = "";
+      $line = $2;
+    }
     else {
       die("ERROR: huh? $line\n");
     }
diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl
index a1340f3b6..22f402196 100755
--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@@ -70,15 +70,23 @@ sub split_xml {
   my $i = 0;
   $MARKUP[0] = "";
   while($line =~ /\S/) {
+    # XML tag
     if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
       $MARKUP[$i] .= $1." ";
       $line = $2;
     }
+    # non-XML text
     elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
       $WORD[$i++] = $1;
       $MARKUP[$i] = "";
       $line = $2;
     }
+    # '<' or '>' occurs in word, but it's not an XML tag
+    elsif ($line =~ /^\s*(\S+)(.*)$/) {
+      $WORD[$i++] = $1;
+      $MARKUP[$i] = "";
+      $line = $2;
+    }
     else {
       die("ERROR: huh? $line\n");
     }
diff --git a/scripts/training/flexibility_score.py b/scripts/training/flexibility_score.py
index 2b9e3b694..826574d7b 100755
--- a/scripts/training/flexibility_score.py
+++ b/scripts/training/flexibility_score.py
@@ -124,14 +124,14 @@ class FlexScore:
         line = self.phrase_pairs[src][target]
         flexscore_l = b"{0:.6g}".format(self.flexprob_l[src][target])
         flexscore_r = b"{0:.6g}".format(self.flexprob_r[src][target])
-        line[2] += b' ' + flexscore_l + b' ' + flexscore_r
+        line[3] += b' ' + flexscore_l + b' ' + flexscore_r
 
         if self.hierarchical:
             try:
                 flexscore_d = b"{0:.6g}".format(self.flexprob_d[src][target])
             except KeyError:
                 flexscore_d = b"1"
-            line[2] += b' ' + flexscore_d
+            line[3] += b' ' + flexscore_d
 
         return b' ||| '.join(line) + b'\n'
 
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index ec18d07dd..01585c2fa 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -1087,7 +1087,9 @@ if($___RETURN_BEST_DEV) {
   if(defined $sparse_weights_file) {
       $best_sparse_file = "run$bestit.sparse-weights";
   }
-  create_config($___CONFIG_ORIG, "./moses.ini", get_featlist_from_file("run$bestit.dense"),
+  my $best_featlist = get_featlist_from_file("run$bestit.dense");
+  $best_featlist->{"untuneables"} = $featlist->{"untuneables"};
+  create_config($___CONFIG_ORIG, "./moses.ini", $best_featlist,
     $bestit, $bestbleu, $best_sparse_file);
 }
 else {
diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl
new file mode 100755
index 000000000..fd4906a48
--- /dev/null
+++ b/scripts/training/reduce-factors.perl
@@ -0,0 +1,109 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Getopt::Long "GetOptions";
+use FindBin qw($RealBin);
+
+my $___FACTOR_DELIMITER = "|";
+
+# utilities
+my $ZCAT = "gzip -cd";
+my $BZCAT = "bzcat";
+
+my ($CORPUS,$REDUCED,$FACTOR);
+die("ERROR: wrong syntax when invoking reduce-factors")
+    unless &GetOptions('corpus=s' => \$CORPUS,
+                       'reduced-corpus=s' => \$REDUCED,
+                       'factor=s' => \$FACTOR);
+
+&reduce_factors($CORPUS,$REDUCED,$FACTOR);
+
+# from train-model.perl
+sub reduce_factors {
+  my ($full,$reduced,$factors) = @_;
+
+  my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);
+
+  print "Reducing factors to produce $reduced @ ".`date`;
+  while(-e $reduced.".lock") {
+    sleep(10);
+  }
+  if (-e $reduced) {
+    print STDERR "  $reduced in place, reusing\n";
+    return;
+  }
+  if (-e $reduced.".gz") {
+    print STDERR "  $reduced.gz in place, reusing\n";
+    return;
+  }
+
+  # peek at input, to check if we are asked to produce exactly the
+  # available factors
+  my $inh = open_or_zcat($full);
+  my $firstline = <$inh>;
+  die "Corpus file $full is empty" unless $firstline;
+  close $inh;
+  # pick first word
+  $firstline =~ s/^\s*//;
+  $firstline =~ s/\s.*//;
+  # count factors
+  my $maxfactorindex = $firstline =~ tr/|/|/;
+  if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
+    # create just symlink; preserving compression
+    my $realfull = $full;
+    if (!-e $realfull && -e $realfull.".gz") {
+      $realfull .= ".gz";
+      $reduced =~ s/(\.gz)?$/.gz/;
+    }
+    safesystem("ln -s '$realfull' '$reduced'")
+      or die "Failed to create symlink $realfull -> $reduced";
+    return;
+  }
+
+  # The default is to select the needed factors
+  `touch $reduced.lock`;
+  *IN = open_or_zcat($full);
+  open(OUT,">".$reduced) or die "ERROR: Can't write $reduced";
+  my $nr = 0;
+  while(<IN>) {
+    $nr++;
+    print STDERR "." if $nr % 10000 == 0;
+    print STDERR "($nr)" if $nr % 100000 == 0;
+    chomp; s/ +/ /g; s/^ //; s/ $//;
+    my $first = 1;
+    foreach (split) {
+      my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
+      # \Q causes to disable metacharacters in regex
+      print OUT " " unless $first;
+      $first = 0;
+      my $first_factor = 1;
+      foreach my $outfactor (@INCLUDE) {
+        print OUT "|" unless $first_factor;
+        $first_factor = 0;
+        my $out = $FACTOR[$outfactor];
+        die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
+        print OUT $out;
+      }
+    }
+    print OUT "\n";
+  }
+  print STDERR "\n";
+  close(OUT);
+  close(IN);
+  `rm -f $reduced.lock`;
+}
+
+sub open_or_zcat {
+  my $fn = shift;
+  my $read = $fn;
+  $fn = $fn.".gz" if ! -e $fn && -e $fn.".gz";
+  $fn = $fn.".bz2" if ! -e $fn && -e $fn.".bz2";
+  if ($fn =~ /\.bz2$/) {
+    $read = "$BZCAT $fn|";
+  } elsif ($fn =~ /\.gz$/) {
+    $read = "$ZCAT $fn|";
+  }
+  my $hdl;
+  open($hdl,$read) or die "Can't read $fn ($read)";
+  return $hdl;
+}
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 2cc469cc7..64c3eee02 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -752,7 +752,7 @@ sub reduce_factors {
   $firstline =~ s/^\s*//;
   $firstline =~ s/\s.*//;
   # count factors
-  my $maxfactorindex = $firstline =~ tr/|/|/;
+  my $maxfactorindex = $firstline =~ tr/$___FACTOR_DELIMITER/$___FACTOR_DELIMITER/;
   if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
     # create just symlink; preserving compression
     my $realfull = $full;
@@ -785,7 +785,7 @@ sub reduce_factors {
       $first = 0;
       my $first_factor = 1;
      foreach my $outfactor (@INCLUDE) {
-        print OUT "|" unless $first_factor;
+        print OUT $___FACTOR_DELIMITER unless $first_factor;
        $first_factor = 0;
        my $out = $FACTOR[$outfactor];
        die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
@@ -1785,19 +1785,19 @@ sub get_generation {
   while(<E>) {
     chomp;
     foreach (split) {
-      my @FACTOR = split(/\|/);
+      my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
       my ($source,$target);
       my $first_factor = 1;
       foreach my $factor (split(/,/,$factor_e_source)) {
-        $source .= "|" unless $first_factor;
+        $source .= $___FACTOR_DELIMITER unless $first_factor;
        $first_factor = 0;
        $source .= $FACTOR[$factor];
      }
      $first_factor = 1;
      foreach my $factor (split(/,/,$factor_e)) {
-        $target .= "|" unless $first_factor;
+        $target .= $___FACTOR_DELIMITER unless $first_factor;
        $first_factor = 0;
        $target .= $FACTOR[$factor];
      }