Mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2024-12-27 05:55:02 +03:00)

Commit f3ec79d278: Merge branch 'master' of github.com:moses-smt/mosesdecoder

31  moses/PP/CountsPhraseProperty.cpp  (new file)
@@ -0,0 +1,31 @@
#include "moses/PP/CountsPhraseProperty.h"
#include <sstream>
#include <assert.h>
#include "util/exception.hh"

namespace Moses
{

void CountsPhraseProperty::ProcessValue()
{
  std::istringstream tokenizer(m_value);

  if (! (tokenizer >> m_targetMarginal)) { // first token: countE
    UTIL_THROW2("CountsPhraseProperty: Not able to read target marginal. Flawed property?");
  }
  assert( m_targetMarginal > 0 );

  if (! (tokenizer >> m_sourceMarginal)) { // second token: countF
    UTIL_THROW2("CountsPhraseProperty: Not able to read source marginal. Flawed property?");
  }
  assert( m_sourceMarginal > 0 );

  if (! (tokenizer >> m_jointCount)) { // third token: countEF
    UTIL_THROW2("CountsPhraseProperty: Not able to read joint count. Flawed property?");
  }
  assert( m_jointCount > 0 );

};

} // namespace Moses
54  moses/PP/CountsPhraseProperty.h  (new file)
@@ -0,0 +1,54 @@

#pragma once

#include "moses/PP/PhraseProperty.h"
#include <string>
#include <list>

namespace Moses
{

// A simple phrase property class to access the three phrase count values.
//
// The counts are usually not needed during decoding and are not loaded
// from the phrase table. This is just a workaround that can make them
// available to features which have a use for them.
//
// If you need access to the counts, copy the two marginal counts and the
// joint count into an additional information property with key "Counts",
// e.g. using awk:
//
// $ zcat phrase-table.gz | awk -F' \|\|\| ' '{printf("%s {{Counts %s}}\n",$0,$5);}' | gzip -c > phrase-table.withCountsPP.gz
//
// CountsPhraseProperty reads them from the phrase table and provides
// methods GetSourceMarginal(), GetTargetMarginal(), GetJointCount().


class CountsPhraseProperty : public PhraseProperty
{
public:

  CountsPhraseProperty(const std::string &value) : PhraseProperty(value) {};

  virtual void ProcessValue();

  size_t GetSourceMarginal() const {
    return m_sourceMarginal;
  }

  size_t GetTargetMarginal() const {
    return m_targetMarginal;
  }

  float GetJointCount() const {
    return m_jointCount;
  }

protected:

  float m_sourceMarginal, m_targetMarginal, m_jointCount;

};

} // namespace Moses
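Editorial note (not part of the diff): a minimal usage sketch of the class added above. The value string here is made up; in practice it comes from a {{Counts ...}} annotation in the phrase table, e.g. one produced by the awk command in the header comment, and is parsed by ProcessValue() as shown in the .cpp file.

#include "moses/PP/CountsPhraseProperty.h"
#include <iostream>

// Illustrative only: "1000 2000 2.5" stands for countE, countF and countEF,
// in the order read by CountsPhraseProperty::ProcessValue() above.
void ExampleCountsLookup()
{
  Moses::CountsPhraseProperty prop("1000 2000 2.5");
  prop.ProcessValue();                                  // parses the three tokens
  std::cout << prop.GetTargetMarginal() << " "          // 1000 (countE)
            << prop.GetSourceMarginal() << " "          // 2000 (countF)
            << prop.GetJointCount()     << std::endl;   // 2.5  (countEF)
}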
@@ -4,6 +4,8 @@
#include <iostream>
#include <vector>

#include "moses/PP/CountsPhraseProperty.h"
#include "moses/PP/SourceLabelsPhraseProperty.h"
#include "moses/PP/TreeStructurePhraseProperty.h"

namespace Moses
@@ -50,6 +52,8 @@ PhrasePropertyFactory::PhrasePropertyFactory()
  // Properties with different key than class.
#define MOSES_PNAME2(name, type) Add(name, new DefaultPhrasePropertyCreator< type >());

  MOSES_PNAME2("Counts", CountsPhraseProperty);
  MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty);
  MOSES_PNAME2("Tree", TreeStructurePhraseProperty);

}
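Editorial note (not part of the diff): the keys registered here are what connect {{Counts ...}} and {{SourceLabels ...}} annotations in a phrase table to the parsing code in the new property classes above. Given the MOSES_PNAME2 definition in this hunk, the first registration expands roughly to:

// Expansion of MOSES_PNAME2("Counts", CountsPhraseProperty) per the #define above:
Add("Counts", new DefaultPhrasePropertyCreator< CountsPhraseProperty >());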
125  moses/PP/SourceLabelsPhraseProperty.cpp  (new file)
@@ -0,0 +1,125 @@
#include "moses/PP/SourceLabelsPhraseProperty.h"
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <queue>
#include <assert.h>
#include <limits>
#include "util/exception.hh"

namespace Moses
{

void SourceLabelsPhraseProperty::ProcessValue()
{
  std::istringstream tokenizer(m_value);

  if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property?");
  }
  assert( m_nNTs > 0 );

  if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property?");
  }
  assert( m_totalCount > 0.0 );

  // read source-labelled rule items

  std::priority_queue<float> ruleLabelledCountsPQ;

  while (tokenizer.peek() != EOF) {
    try {

      SourceLabelsPhrasePropertyItem item;
      size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();

      if (m_nNTs == 1) {

        item.m_sourceLabelsRHSCount = m_totalCount;

      } else { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule

        for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
          size_t sourceLabelRHS;
          if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
            UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property?");
          }
          item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
        }

        if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property?");
        }

        if (! (tokenizer >> numberOfLHSsGivenRHS)) {
          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property?");
        }
      }

      for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
        size_t sourceLabelLHS;
        if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property?");
        }
        float ruleSourceLabelledCount;
        if (! (tokenizer >> ruleSourceLabelledCount)) {
          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property?");
        }
        item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
        ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
      }

      m_sourceLabelItems.push_back(item);

    } catch (const std::exception &e) {
      UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
    }
  }

  // keep only top N label vectors
  const size_t N=50;

  if (ruleLabelledCountsPQ.size() > N) {

    float topNRuleLabelledCount = std::numeric_limits<int>::max();
    for (size_t i=0; !ruleLabelledCountsPQ.empty() && i<N; ++i) {
      topNRuleLabelledCount = ruleLabelledCountsPQ.top();
      ruleLabelledCountsPQ.pop();
    }

    size_t nKept=0;
    std::list<SourceLabelsPhrasePropertyItem>::iterator itemIter=m_sourceLabelItems.begin();
    while (itemIter!=m_sourceLabelItems.end()) {
      if (itemIter->m_sourceLabelsRHSCount < topNRuleLabelledCount) {
        itemIter = m_sourceLabelItems.erase(itemIter);
      } else {
        std::list< std::pair<size_t,float> >::iterator itemLHSIter=(itemIter->m_sourceLabelsLHSList).begin();
        while (itemLHSIter!=(itemIter->m_sourceLabelsLHSList).end()) {
          if (itemLHSIter->second < topNRuleLabelledCount) {
            itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter);
          } else {
            if (nKept >= N) {
              itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter,(itemIter->m_sourceLabelsLHSList).end());
            } else {
              ++nKept;
              ++itemLHSIter;
            }
          }
        }
        if ((itemIter->m_sourceLabelsLHSList).empty()) {
          itemIter = m_sourceLabelItems.erase(itemIter);
        } else {
          ++itemIter;
        }
      }
    }
  }
};

} // namespace Moses
71  moses/PP/SourceLabelsPhraseProperty.h  (new file)
@@ -0,0 +1,71 @@

#pragma once

#include "moses/PP/PhraseProperty.h"
#include <string>
#include <list>

namespace Moses
{

// Note that we require label tokens (strings) in the corresponding property values of phrase table entries
// to be replaced beforehand by indices (size_t) of a label vocabulary. (TODO: change that?)

class SourceLabelsPhrasePropertyItem
{
  friend class SourceLabelsPhraseProperty;

public:
  SourceLabelsPhrasePropertyItem() {};

  float GetSourceLabelsRHSCount() const
  {
    return m_sourceLabelsRHSCount;
  };

  const std::list<size_t> &GetSourceLabelsRHS() const
  {
    return m_sourceLabelsRHS;
  };

  const std::list< std::pair<size_t,float> > &GetSourceLabelsLHSList() const
  {
    return m_sourceLabelsLHSList;
  };

private:
  float m_sourceLabelsRHSCount;
  std::list<size_t> m_sourceLabelsRHS; // should be of size nNTs-1 (empty if initial rule, i.e. no right-hand side non-terminals)
  std::list< std::pair<size_t,float> > m_sourceLabelsLHSList; // list of left-hand sides for this right-hand side, with counts
};


class SourceLabelsPhraseProperty : public PhraseProperty
{
public:
  SourceLabelsPhraseProperty(const std::string &value) : PhraseProperty(value) {};

  virtual void ProcessValue();

  size_t GetNumberOfNonTerminals() const {
    return m_nNTs;
  }

  float GetTotalCount() const {
    return m_totalCount;
  }

  const std::list<SourceLabelsPhrasePropertyItem> &GetSourceLabelItems() const {
    return m_sourceLabelItems;
  };

protected:

  size_t m_nNTs;
  float m_totalCount;

  std::list<SourceLabelsPhrasePropertyItem> m_sourceLabelItems;
};

} // namespace Moses
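Editorial note (not part of the diff): a hand-constructed sketch of the value layout that SourceLabelsPhraseProperty::ProcessValue() above expects. All label indices and counts below are invented for illustration; real values use indices into the label vocabulary mentioned in the header note.

#include "moses/PP/SourceLabelsPhraseProperty.h"

// Illustrative token layout for a hierarchical rule with three non-terminals
// (left-hand side plus two RHS non-terminals):
//   "3 12.5 4 7 10 2 4 9 6 1"
//    3     -> m_nNTs: number of non-terminals incl. LHS
//    12.5  -> m_totalCount: overall rule count
//    4 7   -> nNTs-1 RHS source label indices
//    10    -> count of this RHS label sequence
//    2     -> number of LHS labels seen with this RHS
//    4 9   -> first (LHS label index, count) pair
//    6 1   -> second (LHS label index, count) pair
void ExampleSourceLabelsLookup()
{
  Moses::SourceLabelsPhraseProperty prop("3 12.5 4 7 10 2 4 9 6 1");
  prop.ProcessValue();
  // prop.GetNumberOfNonTerminals() == 3, prop.GetTotalCount() == 12.5, and
  // GetSourceLabelItems() holds a single item carrying the RHS labels, the
  // RHS count and the two LHS/count pairs listed above.
}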
@ -321,5 +321,148 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
|
||||
}
|
||||
|
||||
|
||||
std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
|
||||
std::set<std::string>& labelSet,
|
||||
boost::unordered_map<std::string,float>& countsLabelsLHS,
|
||||
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& jointCountsRulesTargetLHSAndLabelsLHS,
|
||||
Vocabulary &vcbT) const
|
||||
{
|
||||
const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
|
||||
|
||||
if ( allPropertyValues == NULL ) {
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string lhs="", rhs="", currentRhs="";
|
||||
float currentRhsCount = 0.0;
|
||||
std::list< std::pair<std::string,float> > lhsGivenCurrentRhsCounts;
|
||||
|
||||
std::ostringstream oss;
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
iter!=allPropertyValues->end(); ++iter) {
|
||||
|
||||
size_t space = (iter->first).find_last_of(' ');
|
||||
if ( space == string::npos ) {
|
||||
lhs = iter->first;
|
||||
rhs.clear();
|
||||
} else {
|
||||
lhs = (iter->first).substr(space+1);
|
||||
rhs = (iter->first).substr(0,space);
|
||||
}
|
||||
|
||||
labelSet.insert(lhs);
|
||||
|
||||
if ( rhs.compare(currentRhs) ) {
|
||||
|
||||
if ( iter!=allPropertyValues->begin() ) {
|
||||
if ( !currentRhs.empty() ) {
|
||||
istringstream tokenizer(currentRhs);
|
||||
std::string rhsLabel;
|
||||
while ( tokenizer.peek() != EOF ) {
|
||||
tokenizer >> rhsLabel;
|
||||
labelSet.insert(rhsLabel);
|
||||
}
|
||||
oss << " " << currentRhs << " " << currentRhsCount;
|
||||
}
|
||||
if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
|
||||
if ( !currentRhs.empty() ) {
|
||||
oss << " " << lhsGivenCurrentRhsCounts.size();
|
||||
}
|
||||
for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
|
||||
iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
|
||||
oss << " " << iter2->first << " " << iter2->second;
|
||||
|
||||
// update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
|
||||
std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
|
||||
ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
|
||||
ruleTargetLhs.erase(ruleTargetLhs.size()-1);
|
||||
|
||||
std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
|
||||
countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
||||
if (!insertedCountsLabelsLHS.second) {
|
||||
(insertedCountsLabelsLHS.first)->second += iter2->second;
|
||||
}
|
||||
|
||||
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
|
||||
jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
|
||||
if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
|
||||
boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
|
||||
jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
||||
jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
|
||||
} else {
|
||||
boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
|
||||
std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
|
||||
jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
||||
if (!insertedJointCounts.second) {
|
||||
(insertedJointCounts.first)->second += iter2->second;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
lhsGivenCurrentRhsCounts.clear();
|
||||
}
|
||||
|
||||
currentRhsCount = 0.0;
|
||||
currentRhs = rhs;
|
||||
}
|
||||
|
||||
currentRhsCount += iter->second;
|
||||
lhsGivenCurrentRhsCounts.push_back( std::pair<std::string,float>(lhs,iter->second) );
|
||||
}
|
||||
|
||||
if ( !currentRhs.empty() ) {
|
||||
istringstream tokenizer(currentRhs);
|
||||
std::string rhsLabel;
|
||||
while ( tokenizer.peek() != EOF ) {
|
||||
tokenizer >> rhsLabel;
|
||||
labelSet.insert(rhsLabel);
|
||||
}
|
||||
oss << " " << currentRhs << " " << currentRhsCount;
|
||||
}
|
||||
if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
|
||||
if ( !currentRhs.empty() ) {
|
||||
oss << " " << lhsGivenCurrentRhsCounts.size();
|
||||
}
|
||||
for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
|
||||
iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
|
||||
oss << " " << iter2->first << " " << iter2->second;
|
||||
|
||||
// update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
|
||||
std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
|
||||
ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
|
||||
ruleTargetLhs.erase(ruleTargetLhs.size()-1);
|
||||
|
||||
std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
|
||||
countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
||||
if (!insertedCountsLabelsLHS.second) {
|
||||
(insertedCountsLabelsLHS.first)->second += iter2->second;
|
||||
}
|
||||
|
||||
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
|
||||
jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
|
||||
if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
|
||||
boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
|
||||
jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
||||
jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
|
||||
} else {
|
||||
boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
|
||||
std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
|
||||
jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
||||
if (!insertedJointCounts.second) {
|
||||
(insertedJointCounts.first)->second += iter2->second;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
std::string allPropertyValuesString(oss.str());
|
||||
return allPropertyValuesString;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@@ -23,6 +23,7 @@
#include <vector>
#include <set>
#include <map>
#include <boost/unordered_map.hpp>

namespace MosesTraining {

@@ -124,6 +125,12 @@ public:

  std::string CollectAllPropertyValues(const std::string &key) const;

  std::string CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
                                                std::set<std::string>& sourceLabelSet,
                                                boost::unordered_map<std::string,float>& sourceLHSCounts,
                                                boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& sourceRHSAndLHSJointCounts,
                                                Vocabulary &vcbT) const;

  void AddProperties( const std::string &str, float count );

  void AddProperty( const std::string &key, const std::string &value, float count )
@@ -90,7 +90,7 @@
                   float count,
                   int sentenceId) const {};

  /** Add the values for this feature function. */
  /** Add the values for this score feature. */
  virtual void add(const ScoreFeatureContext& context,
                   std::vector<float>& denseValues,
                   std::map<std::string,float>& sparseValues) const = 0;
@ -30,6 +30,10 @@
|
||||
#include "ScfgRule.h"
|
||||
#include "ScfgRuleWriter.h"
|
||||
#include "Span.h"
|
||||
#include "SyntaxTree.h"
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "XmlTreeParser.h"
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
@ -63,7 +67,9 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
OutputFileStream fwdExtractStream;
|
||||
OutputFileStream invExtractStream;
|
||||
std::ofstream glueGrammarStream;
|
||||
std::ofstream unknownWordStream;
|
||||
std::ofstream targetUnknownWordStream;
|
||||
std::ofstream sourceUnknownWordStream;
|
||||
std::ofstream sourceLabelSetStream;
|
||||
std::ofstream unknownWordSoftMatchesStream;
|
||||
std::string fwdFileName = options.extractFile;
|
||||
std::string invFileName = options.extractFile + std::string(".inv");
|
||||
@ -76,26 +82,44 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
if (!options.glueGrammarFile.empty()) {
|
||||
OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
|
||||
}
|
||||
if (!options.unknownWordFile.empty()) {
|
||||
OpenOutputFileOrDie(options.unknownWordFile, unknownWordStream);
|
||||
if (!options.targetUnknownWordFile.empty()) {
|
||||
OpenOutputFileOrDie(options.targetUnknownWordFile, targetUnknownWordStream);
|
||||
}
|
||||
if (!options.sourceUnknownWordFile.empty()) {
|
||||
OpenOutputFileOrDie(options.sourceUnknownWordFile, sourceUnknownWordStream);
|
||||
}
|
||||
if (!options.sourceLabelSetFile.empty()) {
|
||||
if (!options.sourceLabels) {
|
||||
Error("SourceLabels should be active if SourceLabelSet is supposed to be written to a file");
|
||||
}
|
||||
OpenOutputFileOrDie(options.sourceLabelSetFile, sourceLabelSetStream); // TODO: global sourceLabelSet cannot be determined during parallelized extraction
|
||||
}
|
||||
if (!options.unknownWordSoftMatchesFile.empty()) {
|
||||
OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
|
||||
}
|
||||
|
||||
// Target label sets for producing glue grammar.
|
||||
std::set<std::string> labelSet;
|
||||
std::map<std::string, int> topLabelSet;
|
||||
std::set<std::string> targetLabelSet;
|
||||
std::map<std::string, int> targetTopLabelSet;
|
||||
|
||||
// Source label sets for producing glue grammar.
|
||||
std::set<std::string> sourceLabelSet;
|
||||
std::map<std::string, int> sourceTopLabelSet;
|
||||
|
||||
// Word count statistics for producing unknown word labels.
|
||||
std::map<std::string, int> wordCount;
|
||||
std::map<std::string, std::string> wordLabel;
|
||||
std::map<std::string, int> targetWordCount;
|
||||
std::map<std::string, std::string> targetWordLabel;
|
||||
|
||||
// Word count statistics for producing unknown word labels: source side.
|
||||
std::map<std::string, int> sourceWordCount;
|
||||
std::map<std::string, std::string> sourceWordLabel;
|
||||
|
||||
std::string targetLine;
|
||||
std::string sourceLine;
|
||||
std::string alignmentLine;
|
||||
Alignment alignment;
|
||||
XmlTreeParser xmlTreeParser(labelSet, topLabelSet);
|
||||
XmlTreeParser xmlTreeParser(targetLabelSet, targetTopLabelSet);
|
||||
// XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
|
||||
ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options);
|
||||
size_t lineNum = options.sentenceOffset;
|
||||
while (true) {
|
||||
@ -118,30 +142,71 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
std::cerr << "skipping line " << lineNum << " with empty target tree\n";
|
||||
continue;
|
||||
}
|
||||
std::auto_ptr<ParseTree> t;
|
||||
std::auto_ptr<ParseTree> targetParseTree;
|
||||
try {
|
||||
t = xmlTreeParser.Parse(targetLine);
|
||||
assert(t.get());
|
||||
targetParseTree = xmlTreeParser.Parse(targetLine);
|
||||
assert(targetParseTree.get());
|
||||
} catch (const Exception &e) {
|
||||
std::ostringstream s;
|
||||
s << "Failed to parse XML tree at line " << lineNum;
|
||||
std::ostringstream oss;
|
||||
oss << "Failed to parse target XML tree at line " << lineNum;
|
||||
if (!e.GetMsg().empty()) {
|
||||
s << ": " << e.GetMsg();
|
||||
oss << ": " << e.GetMsg();
|
||||
}
|
||||
Error(oss.str());
|
||||
}
|
||||
|
||||
|
||||
// Parse source tree and construct a SyntaxTree object.
|
||||
MosesTraining::SyntaxTree sourceSyntaxTree;
|
||||
MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL;
|
||||
|
||||
if (options.sourceLabels) {
|
||||
try {
|
||||
if (!ProcessAndStripXMLTags(sourceLine, sourceSyntaxTree, sourceLabelSet, sourceTopLabelSet, false)) {
|
||||
throw Exception("");
|
||||
}
|
||||
sourceSyntaxTree.ConnectNodes();
|
||||
sourceSyntaxTreeRoot = sourceSyntaxTree.GetTop();
|
||||
assert(sourceSyntaxTreeRoot);
|
||||
} catch (const Exception &e) {
|
||||
std::ostringstream oss;
|
||||
oss << "Failed to parse source XML tree at line " << lineNum;
|
||||
if (!e.GetMsg().empty()) {
|
||||
oss << ": " << e.GetMsg();
|
||||
}
|
||||
Error(oss.str());
|
||||
}
|
||||
Error(s.str());
|
||||
}
|
||||
|
||||
// Read source tokens.
|
||||
std::vector<std::string> sourceTokens(ReadTokens(sourceLine));
|
||||
|
||||
// Construct a source ParseTree object from the SyntaxTree object.
|
||||
std::auto_ptr<ParseTree> sourceParseTree;
|
||||
|
||||
if (options.sourceLabels) {
|
||||
try {
|
||||
sourceParseTree = XmlTreeParser::ConvertTree(*sourceSyntaxTreeRoot, sourceTokens);
|
||||
assert(sourceParseTree.get());
|
||||
} catch (const Exception &e) {
|
||||
std::ostringstream oss;
|
||||
oss << "Failed to parse source XML tree at line " << lineNum;
|
||||
if (!e.GetMsg().empty()) {
|
||||
oss << ": " << e.GetMsg();
|
||||
}
|
||||
Error(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Read word alignments.
|
||||
try {
|
||||
ReadAlignment(alignmentLine, alignment);
|
||||
} catch (const Exception &e) {
|
||||
std::ostringstream s;
|
||||
s << "Failed to read alignment at line " << lineNum << ": ";
|
||||
s << e.GetMsg();
|
||||
Error(s.str());
|
||||
std::ostringstream oss;
|
||||
oss << "Failed to read alignment at line " << lineNum << ": ";
|
||||
oss << e.GetMsg();
|
||||
Error(oss.str());
|
||||
}
|
||||
if (alignment.size() == 0) {
|
||||
std::cerr << "skipping line " << lineNum << " without alignment points\n";
|
||||
@ -149,13 +214,18 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
// Record word counts.
|
||||
if (!options.unknownWordFile.empty()) {
|
||||
CollectWordLabelCounts(*t, options, wordCount, wordLabel);
|
||||
if (!options.targetUnknownWordFile.empty()) {
|
||||
CollectWordLabelCounts(*targetParseTree, options, targetWordCount, targetWordLabel);
|
||||
}
|
||||
|
||||
// Record word counts: source side.
|
||||
if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
|
||||
CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount, sourceWordLabel);
|
||||
}
|
||||
|
||||
// Form an alignment graph from the target tree, source words, and
|
||||
// alignment.
|
||||
AlignmentGraph graph(t.get(), sourceTokens, alignment);
|
||||
AlignmentGraph graph(targetParseTree.get(), sourceTokens, alignment);
|
||||
|
||||
// Extract minimal rules, adding each rule to its root node's rule set.
|
||||
graph.ExtractMinimalRules(options);
|
||||
@ -172,29 +242,54 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
const std::vector<const Subgraph *> &rules = (*p)->GetRules();
|
||||
for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
|
||||
q != rules.end(); ++q) {
|
||||
ScfgRule r(**q);
|
||||
ScfgRule *r = 0;
|
||||
if (options.sourceLabels) {
|
||||
r = new ScfgRule(**q, &sourceSyntaxTree);
|
||||
} else {
|
||||
r = new ScfgRule(**q);
|
||||
}
|
||||
// TODO Can scope pruning be done earlier?
|
||||
if (r.Scope() <= options.maxScope) {
|
||||
if (r->Scope() <= options.maxScope) {
|
||||
if (!options.treeFragments) {
|
||||
writer.Write(r);
|
||||
writer.Write(*r);
|
||||
} else {
|
||||
writer.Write(r,**q);
|
||||
writer.Write(*r,**q);
|
||||
}
|
||||
}
|
||||
delete r;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!options.glueGrammarFile.empty()) {
|
||||
WriteGlueGrammar(labelSet, topLabelSet, glueGrammarStream);
|
||||
std::map<std::string,size_t> sourceLabels;
|
||||
if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
|
||||
|
||||
sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side)
|
||||
sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side)
|
||||
sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar
|
||||
sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar
|
||||
size_t index = 0;
|
||||
for (std::set<std::string>::const_iterator iter=sourceLabelSet.begin();
|
||||
iter!=sourceLabelSet.end(); ++iter, ++index) {
|
||||
sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
|
||||
}
|
||||
WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
|
||||
}
|
||||
|
||||
if (!options.unknownWordFile.empty()) {
|
||||
WriteUnknownWordLabel(wordCount, wordLabel, options, unknownWordStream);
|
||||
if (!options.glueGrammarFile.empty()) {
|
||||
WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream);
|
||||
}
|
||||
|
||||
if (!options.targetUnknownWordFile.empty()) {
|
||||
WriteUnknownWordLabel(targetWordCount, targetWordLabel, options, targetUnknownWordStream);
|
||||
}
|
||||
|
||||
if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
|
||||
WriteUnknownWordLabel(sourceWordCount, sourceWordLabel, options, sourceUnknownWordStream, true);
|
||||
}
|
||||
|
||||
if (!options.unknownWordSoftMatchesFile.empty()) {
|
||||
WriteUnknownWordSoftMatches(labelSet, unknownWordSoftMatchesStream);
|
||||
WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -305,12 +400,20 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
|
||||
"include score based on PCFG scores in target corpus")
|
||||
("TreeFragments",
|
||||
"output parse tree information")
|
||||
("SourceLabels",
|
||||
"output source syntax label information")
|
||||
("SourceLabelSet",
|
||||
po::value(&options.sourceLabelSetFile),
|
||||
"write source syntax label set to named file")
|
||||
("SentenceOffset",
|
||||
po::value(&options.sentenceOffset)->default_value(options.sentenceOffset),
|
||||
"set sentence number offset if processing split corpus")
|
||||
("UnknownWordLabel",
|
||||
po::value(&options.unknownWordFile),
|
||||
po::value(&options.targetUnknownWordFile),
|
||||
"write unknown word labels to named file")
|
||||
("SourceUnknownWordLabel",
|
||||
po::value(&options.sourceUnknownWordFile),
|
||||
"write source syntax unknown word labels to named file")
|
||||
("UnknownWordMinRelFreq",
|
||||
po::value(&options.unknownWordMinRelFreq)->default_value(
|
||||
options.unknownWordMinRelFreq),
|
||||
@ -402,6 +505,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
|
||||
if (vm.count("TreeFragments")) {
|
||||
options.treeFragments = true;
|
||||
}
|
||||
if (vm.count("SourceLabels")) {
|
||||
options.sourceLabels = true;
|
||||
}
|
||||
if (vm.count("UnknownWordUniform")) {
|
||||
options.unknownWordUniform = true;
|
||||
}
|
||||
@ -411,7 +517,10 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
|
||||
|
||||
// Workaround for extract-parallel issue.
|
||||
if (options.sentenceOffset > 0) {
|
||||
options.unknownWordFile.clear();
|
||||
options.targetUnknownWordFile.clear();
|
||||
}
|
||||
if (options.sentenceOffset > 0) {
|
||||
options.sourceUnknownWordFile.clear();
|
||||
options.unknownWordSoftMatchesFile.clear();
|
||||
}
|
||||
}
|
||||
@ -422,7 +531,7 @@ void ExtractGHKM::Error(const std::string &msg) const
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s)
|
||||
std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s) const
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
|
||||
@ -454,9 +563,11 @@ std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s)
|
||||
void ExtractGHKM::WriteGlueGrammar(
|
||||
const std::set<std::string> &labelSet,
|
||||
const std::map<std::string, int> &topLabelSet,
|
||||
const std::map<std::string,size_t> &sourceLabels,
|
||||
const Options &options,
|
||||
std::ostream &out)
|
||||
{
|
||||
// chose a top label that is not already a label
|
||||
// choose a top label that is not already a label
|
||||
std::string topLabel = "QQQQQQ";
|
||||
for(size_t i = 1; i <= topLabel.length(); i++) {
|
||||
if (labelSet.find(topLabel.substr(0,i)) == labelSet.end() ) {
|
||||
@ -465,23 +576,75 @@ void ExtractGHKM::WriteGlueGrammar(
|
||||
}
|
||||
}
|
||||
|
||||
std::string sourceTopLabel = "TOPLABEL";
|
||||
std::string sourceSLabel = "S";
|
||||
std::string sourceSomeLabel = "SOMELABEL";
|
||||
|
||||
// basic rules
|
||||
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| ||| {{Tree [" << topLabel << " <s>]}}" << std::endl;
|
||||
out << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| ||| {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}" << std::endl;
|
||||
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||";
|
||||
if (options.treeFragments) {
|
||||
out << " {{Tree [" << topLabel << " <s>]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 1 1 " << sourceTopLabel << " 1}}";
|
||||
}
|
||||
out << std::endl;
|
||||
|
||||
out << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| |||";
|
||||
if (options.treeFragments) {
|
||||
out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 2 1 " << sourceTopLabel << " 1 1 " << sourceTopLabel << " 1}}";
|
||||
}
|
||||
out << std::endl;
|
||||
|
||||
// top rules
|
||||
for (std::map<std::string, int>::const_iterator i = topLabelSet.begin();
|
||||
i != topLabelSet.end(); ++i) {
|
||||
out << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1 ||| ||| ||| {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}" << std::endl;
|
||||
out << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1 ||| ||| |||";
|
||||
if (options.treeFragments) {
|
||||
out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 2 1 " << sourceSLabel << " 1 1 " << sourceTopLabel << " 1}}";
|
||||
}
|
||||
out << std::endl;
|
||||
}
|
||||
|
||||
// glue rules
|
||||
for(std::set<std::string>::const_iterator i = labelSet.begin();
|
||||
i != labelSet.end(); i++ ) {
|
||||
out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| ||| {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}" << std::endl;
|
||||
out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| |||";
|
||||
if (options.treeFragments) {
|
||||
out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 3 2.718 " << sourceTopLabel << " " << sourceSomeLabel << " 2.718 1 " << sourceTopLabel << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL"
|
||||
}
|
||||
out << std::endl;
|
||||
}
|
||||
|
||||
// glue rule for unknown word...
|
||||
out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| ||| {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}" << std::endl;
|
||||
out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| |||";
|
||||
if (options.treeFragments) {
|
||||
out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 3 1 " << sourceTopLabel << " " << sourceSomeLabel << " 1 1 " << sourceTopLabel << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
|
||||
}
|
||||
out << std::endl;
|
||||
}
|
||||
|
||||
void ExtractGHKM::WriteSourceLabelSet(
|
||||
const std::map<std::string,size_t> &sourceLabels,
|
||||
std::ostream &out)
|
||||
{
|
||||
out << sourceLabels.size() << std::endl;
|
||||
for (std::map<std::string,size_t>::const_iterator iter=sourceLabels.begin();
|
||||
iter!=sourceLabels.end(); ++iter) {
|
||||
out << iter->first << " " << iter->second << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void ExtractGHKM::CollectWordLabelCounts(
|
||||
@ -513,11 +676,26 @@ void ExtractGHKM::CollectWordLabelCounts(
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> ExtractGHKM::ReadTokens(const ParseTree &root) const
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
std::vector<const ParseTree*> leaves;
|
||||
root.GetLeaves(std::back_inserter(leaves));
|
||||
for (std::vector<const ParseTree *>::const_iterator p = leaves.begin();
|
||||
p != leaves.end(); ++p) {
|
||||
const ParseTree &leaf = **p;
|
||||
const std::string &word = leaf.GetLabel();
|
||||
tokens.push_back(word);
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
void ExtractGHKM::WriteUnknownWordLabel(
|
||||
const std::map<std::string, int> &wordCount,
|
||||
const std::map<std::string, std::string> &wordLabel,
|
||||
const Options &options,
|
||||
std::ostream &out)
|
||||
std::ostream &out,
|
||||
bool writeCounts)
|
||||
{
|
||||
if (!options.unknownWordSoftMatchesFile.empty()) {
|
||||
out << "UNK 1" << std::endl;
|
||||
@ -537,12 +715,19 @@ void ExtractGHKM::WriteUnknownWordLabel(
|
||||
++total;
|
||||
}
|
||||
}
|
||||
for (std::map<std::string, int>::const_iterator p = labelCount.begin();
|
||||
p != labelCount.end(); ++p) {
|
||||
double ratio = static_cast<double>(p->second) / static_cast<double>(total);
|
||||
if (ratio >= options.unknownWordMinRelFreq) {
|
||||
float weight = options.unknownWordUniform ? 1.0f : ratio;
|
||||
out << p->first << " " << weight << std::endl;
|
||||
if ( writeCounts ) {
|
||||
for (std::map<std::string, int>::const_iterator p = labelCount.begin();
|
||||
p != labelCount.end(); ++p) {
|
||||
out << p->first << " " << p->second << std::endl;
|
||||
}
|
||||
} else {
|
||||
for (std::map<std::string, int>::const_iterator p = labelCount.begin();
|
||||
p != labelCount.end(); ++p) {
|
||||
double ratio = static_cast<double>(p->second) / static_cast<double>(total);
|
||||
if (ratio >= options.unknownWordMinRelFreq) {
|
||||
float weight = options.unknownWordUniform ? 1.0f : ratio;
|
||||
out << p->first << " " << weight << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -59,13 +59,19 @@ private:
  void WriteUnknownWordLabel(const std::map<std::string, int> &,
                             const std::map<std::string, std::string> &,
                             const Options &,
                             std::ostream &);
                             std::ostream &,
                             bool writeCounts=false);
  void WriteUnknownWordSoftMatches(const std::set<std::string> &,
                                   std::ostream &);
  void WriteGlueGrammar(const std::set<std::string> &,
                        const std::map<std::string, int> &,
                        const std::map<std::string,size_t> &,
                        const Options &,
                        std::ostream &);
  std::vector<std::string> ReadTokens(const std::string &);
  void WriteSourceLabelSet(const std::map<std::string,size_t> &,
                           std::ostream &);
  std::vector<std::string> ReadTokens(const std::string &) const;
  std::vector<std::string> ReadTokens(const ParseTree &root) const;

  void ProcessOptions(int, char *[], Options &) const;

@@ -41,6 +41,7 @@
    , minimal(false)
    , pcfg(false)
    , treeFragments(false)
    , sourceLabels(false)
    , sentenceOffset(0)
    , unpairedExtractFormat(false)
    , unknownWordMinRelFreq(0.03f)
@@ -64,9 +65,12 @@ public:
  bool minimal;
  bool pcfg;
  bool treeFragments;
  bool sourceLabels;
  std::string sourceLabelSetFile;
  int sentenceOffset;
  bool unpairedExtractFormat;
  std::string unknownWordFile;
  std::string targetUnknownWordFile;
  std::string sourceUnknownWordFile;
  std::string unknownWordSoftMatchesFile;
  float unknownWordMinRelFreq;
  bool unknownWordUniform;
@@ -63,7 +63,7 @@ public:
  bool IsLeaf() const;

  template<typename OutputIterator>
  void GetLeaves(OutputIterator);
  void GetLeaves(OutputIterator) const;

private:
  // Disallow copying
@@ -77,7 +77,7 @@ private:
};

template<typename OutputIterator>
void ParseTree::GetLeaves(OutputIterator result)
void ParseTree::GetLeaves(OutputIterator result) const
{
  if (IsLeaf()) {
    *result++ = this;
@ -21,6 +21,7 @@
|
||||
|
||||
#include "Node.h"
|
||||
#include "Subgraph.h"
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
@ -29,11 +30,14 @@ namespace Moses
|
||||
namespace GHKM
|
||||
{
|
||||
|
||||
ScfgRule::ScfgRule(const Subgraph &fragment)
|
||||
ScfgRule::ScfgRule(const Subgraph &fragment,
|
||||
const MosesTraining::SyntaxTree *sourceSyntaxTree)
|
||||
: m_sourceLHS("X", NonTerminal)
|
||||
, m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
|
||||
, m_pcfgScore(fragment.GetPcfgScore())
|
||||
, m_hasSourceLabels(sourceSyntaxTree)
|
||||
{
|
||||
|
||||
// Source RHS
|
||||
|
||||
const std::set<const Node *> &leaves = fragment.GetLeaves();
|
||||
@ -55,6 +59,7 @@ ScfgRule::ScfgRule(const Subgraph &fragment)
|
||||
std::map<const Node *, std::vector<int> > sourceOrder;
|
||||
|
||||
m_sourceRHS.reserve(sourceRHSNodes.size());
|
||||
m_numberOfNonTerminals = 0;
|
||||
int srcIndex = 0;
|
||||
for (std::vector<const Node *>::const_iterator p(sourceRHSNodes.begin());
|
||||
p != sourceRHSNodes.end(); ++p, ++srcIndex) {
|
||||
@ -62,6 +67,11 @@ ScfgRule::ScfgRule(const Subgraph &fragment)
|
||||
if (sinkNode.GetType() == TREE) {
|
||||
m_sourceRHS.push_back(Symbol("X", NonTerminal));
|
||||
sourceOrder[&sinkNode].push_back(srcIndex);
|
||||
++m_numberOfNonTerminals;
|
||||
if (sourceSyntaxTree) {
|
||||
// Source syntax label
|
||||
PushSourceLabel(sourceSyntaxTree,&sinkNode,"XRHS");
|
||||
}
|
||||
} else {
|
||||
assert(sinkNode.GetType() == SOURCE);
|
||||
m_sourceRHS.push_back(Symbol(sinkNode.GetLabel(), Terminal));
|
||||
@ -112,6 +122,76 @@ ScfgRule::ScfgRule(const Subgraph &fragment)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceSyntaxTree) {
|
||||
// Source syntax label for root node (if sourceSyntaxTree available)
|
||||
PushSourceLabel(sourceSyntaxTree,fragment.GetRoot(),"XLHS");
|
||||
// All non-terminal spans (including the LHS) should have obtained a label
|
||||
// (a source-side syntactic constituent label if the span matches, "XLHS" otherwise)
|
||||
assert(m_sourceLabels.size() == m_numberOfNonTerminals+1);
|
||||
}
|
||||
}
|
||||
|
||||
void ScfgRule::PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
|
||||
const Node *node,
|
||||
const std::string &nonMatchingLabel)
|
||||
{
|
||||
ContiguousSpan span = Closure(node->GetSpan());
|
||||
if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span?
|
||||
std::vector<MosesTraining::SyntaxNode*> sourceLabels =
|
||||
sourceSyntaxTree->GetNodes(span.first,span.second);
|
||||
if (!sourceLabels.empty()) {
|
||||
// store the topmost matching label from the source syntax tree
|
||||
m_sourceLabels.push_back(sourceLabels.back()->GetLabel());
|
||||
}
|
||||
} else {
|
||||
// no matching source-side syntactic constituent: store nonMatchingLabel
|
||||
m_sourceLabels.push_back(nonMatchingLabel);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: rather implement the method external to ScfgRule
|
||||
void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::string,float>* > &coocCounts, float count) const
|
||||
{
|
||||
std::map<int, int> sourceToTargetNTMap;
|
||||
std::map<int, int> targetToSourceNTMap;
|
||||
|
||||
for (Alignment::const_iterator p(m_alignment.begin());
|
||||
p != m_alignment.end(); ++p) {
|
||||
if ( m_sourceRHS[p->first].GetType() == NonTerminal ) {
|
||||
assert(m_targetRHS[p->second].GetType() == NonTerminal);
|
||||
sourceToTargetNTMap[p->first] = p->second;
|
||||
}
|
||||
}
|
||||
|
||||
size_t sourceIndex = 0;
|
||||
size_t sourceNonTerminalIndex = 0;
|
||||
for (std::vector<Symbol>::const_iterator p=m_sourceRHS.begin();
|
||||
p != m_sourceRHS.end(); ++p, ++sourceIndex) {
|
||||
if ( p->GetType() == NonTerminal ) {
|
||||
const std::string &sourceLabel = m_sourceLabels[sourceNonTerminalIndex];
|
||||
int targetIndex = sourceToTargetNTMap[sourceIndex];
|
||||
const std::string &targetLabel = m_targetRHS[targetIndex].GetValue();
|
||||
++sourceNonTerminalIndex;
|
||||
|
||||
std::map<std::string,float>* countMap = NULL;
|
||||
std::map< std::string, std::map<std::string,float>* >::iterator iter = coocCounts.find(sourceLabel);
|
||||
if ( iter == coocCounts.end() ) {
|
||||
std::map<std::string,float> *newCountMap = new std::map<std::string,float>();
|
||||
std::pair< std::map< std::string, std::map<std::string,float>* >::iterator, bool > inserted =
|
||||
coocCounts.insert( std::pair< std::string, std::map<std::string,float>* >(sourceLabel, newCountMap) );
|
||||
assert(inserted.second);
|
||||
countMap = (inserted.first)->second;
|
||||
} else {
|
||||
countMap = iter->second;
|
||||
}
|
||||
std::pair< std::map<std::string,float>::iterator, bool > inserted =
|
||||
countMap->insert( std::pair< std::string,float>(targetLabel, count) );
|
||||
if ( !inserted.second ) {
|
||||
(inserted.first)->second += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int ScfgRule::Scope() const
|
||||
|
@ -22,9 +22,13 @@
|
||||
#define EXTRACT_GHKM_SCFG_RULE_H_
|
||||
|
||||
#include "Alignment.h"
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -55,7 +59,8 @@ private:
|
||||
class ScfgRule
|
||||
{
|
||||
public:
|
||||
ScfgRule(const Subgraph &fragment);
|
||||
ScfgRule(const Subgraph &fragment,
|
||||
const MosesTraining::SyntaxTree *sourceSyntaxTree = 0);
|
||||
|
||||
const Symbol &GetSourceLHS() const {
|
||||
return m_sourceLHS;
|
||||
@ -75,18 +80,36 @@ public:
|
||||
float GetPcfgScore() const {
|
||||
return m_pcfgScore;
|
||||
}
|
||||
bool HasSourceLabels() const {
|
||||
return m_hasSourceLabels;
|
||||
}
|
||||
void PrintSourceLabels(std::ostream &out) const {
|
||||
for (std::vector<std::string>::const_iterator it = m_sourceLabels.begin();
|
||||
it != m_sourceLabels.end(); ++it) {
|
||||
out << " " << (*it);
|
||||
}
|
||||
}
|
||||
void UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::string,float>* > &coocCounts,
|
||||
float count) const;
|
||||
|
||||
int Scope() const;
|
||||
|
||||
private:
|
||||
static bool PartitionOrderComp(const Node *, const Node *);
|
||||
|
||||
void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
|
||||
const Node *node,
|
||||
const std::string &nonMatchingLabel);
|
||||
|
||||
Symbol m_sourceLHS;
|
||||
Symbol m_targetLHS;
|
||||
std::vector<Symbol> m_sourceRHS;
|
||||
std::vector<Symbol> m_targetRHS;
|
||||
Alignment m_alignment;
|
||||
float m_pcfgScore;
|
||||
bool m_hasSourceLabels;
|
||||
std::vector<std::string> m_sourceLabels;
|
||||
unsigned m_numberOfNonTerminals;
|
||||
};
|
||||
|
||||
} // namespace GHKM
|
||||
|
@@ -66,6 +66,12 @@ void ScfgRuleWriter::Write(const ScfgRule &rule, bool printEndl)
    m_fwd << " ||| " << std::exp(rule.GetPcfgScore());
  }

  if (m_options.sourceLabels && rule.HasSourceLabels()) {
    m_fwd << " {{SourceLabels";
    rule.PrintSourceLabels(m_fwd);
    m_fwd << "}}";
  }

  if (printEndl) {
    m_fwd << std::endl;
    m_inv << std::endl;
@@ -45,9 +45,11 @@ class XmlTreeParser
public:
  XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
  std::auto_ptr<ParseTree> Parse(const std::string &);

  static std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &,
                                              const std::vector<std::string> &);

private:
  std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &,
                                       const std::vector<std::string> &);

  std::set<std::string> &m_labelSet;
  std::map<std::string, int> &m_topLabelSet;
@ -28,6 +28,7 @@
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
#include "ScoreFeature.h"
|
||||
#include "tables-core.h"
|
||||
@ -46,6 +47,10 @@ bool inverseFlag = false;
|
||||
bool hierarchicalFlag = false;
|
||||
bool pcfgFlag = false;
|
||||
bool treeFragmentsFlag = false;
|
||||
bool sourceSyntaxLabelsFlag = false;
|
||||
bool sourceSyntaxLabelSetFlag = false;
|
||||
bool sourceSyntaxLabelCountsLHSFlag = false;
|
||||
bool targetPreferenceLabelsFlag = false;
|
||||
bool unpairedExtractFormatFlag = false;
|
||||
bool conditionOnTargetLhsFlag = false;
|
||||
bool wordAlignmentFlag = true;
|
||||
@ -61,13 +66,19 @@ bool crossedNonTerm = false;
|
||||
int countOfCounts[COC_MAX+1];
|
||||
int totalDistinct = 0;
|
||||
float minCountHierarchical = 0;
|
||||
std::map<std::string,float> sourceLHSCounts;
|
||||
std::map<std::string, std::map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
|
||||
|
||||
boost::unordered_map<std::string,float> sourceLHSCounts;
|
||||
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
|
||||
std::set<std::string> sourceLabelSet;
|
||||
std::map<std::string,size_t> sourceLabels;
|
||||
std::vector<std::string> sourceLabelsByIndex;
|
||||
|
||||
boost::unordered_map<std::string,float> targetPreferenceLHSCounts;
|
||||
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts;
|
||||
std::set<std::string> targetPreferenceLabelSet;
|
||||
std::map<std::string,size_t> targetPreferenceLabels;
|
||||
std::vector<std::string> targetPreferenceLabelsByIndex;
|
||||
|
||||
Vocabulary vcbT;
|
||||
Vocabulary vcbS;
|
||||
|
||||
@ -81,6 +92,11 @@ void processLine( std::string line,
|
||||
std::string &additionalPropertiesString,
|
||||
float &count, float &pcfgSum );
|
||||
void writeCountOfCounts( const std::string &fileNameCountOfCounts );
|
||||
void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
|
||||
const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
|
||||
const std::string &fileNameLeftHandSideSourceLabelCounts,
|
||||
const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
|
||||
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
|
||||
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
|
||||
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
|
||||
@ -102,15 +118,21 @@ int main(int argc, char* argv[])
|
||||
|
||||
ScoreFeatureManager featureManager;
|
||||
if (argc < 4) {
|
||||
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
|
||||
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelSet] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
|
||||
std::cerr << featureManager.usage() << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
std::string fileNameExtract = argv[1];
|
||||
std::string fileNameLex = argv[2];
|
||||
std::string fileNamePhraseTable = argv[3];
|
||||
std::string fileNameSourceLabelSet;
|
||||
std::string fileNameCountOfCounts;
|
||||
std::string fileNameFunctionWords;
|
||||
std::string fileNameLeftHandSideSourceLabelCounts;
|
||||
std::string fileNameLeftHandSideTargetSourceLabelCounts;
|
||||
std::string fileNameTargetPreferenceLabelSet;
|
||||
std::string fileNameLeftHandSideTargetPreferenceLabelCounts;
|
||||
std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts;
|
||||
std::vector<std::string> featureArgs; // all unknown args passed to feature manager
|
||||
|
||||
for(int i=4; i<argc; i++) {
|
||||
@ -126,6 +148,26 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
|
||||
treeFragmentsFlag = true;
|
||||
std::cerr << "including tree fragment information from syntactic parse\n";
|
||||
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
|
||||
sourceSyntaxLabelsFlag = true;
|
||||
std::cerr << "including source label information" << std::endl;
|
||||
} else if (strcmp(argv[i],"--SourceLabelSet") == 0) {
|
||||
sourceSyntaxLabelSetFlag = true;
|
||||
fileNameSourceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.src";
|
||||
std::cerr << "writing source syntax label set to file " << fileNameSourceLabelSet << std::endl;
|
||||
} else if (strcmp(argv[i],"--SourceLabelCountsLHS") == 0) {
|
||||
sourceSyntaxLabelCountsLHSFlag = true;
|
||||
fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs";
|
||||
fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs";
|
||||
std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
|
||||
} else if (strcmp(argv[i],"--TargetPreferenceLabels") == 0) {
|
||||
targetPreferenceLabelsFlag = true;
|
||||
std::cerr << "including target preference label information" << std::endl;
|
||||
fileNameTargetPreferenceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
|
||||
std::cerr << "writing target preference label set to file " << fileNameTargetPreferenceLabelSet << std::endl;
|
||||
fileNameLeftHandSideTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
|
||||
fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
|
||||
std::cerr << "counting left-hand side target preference labels and writing them to files " << fileNameLeftHandSideTargetPreferenceLabelCounts << " and " << fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts << std::endl;
|
||||
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
|
||||
unpairedExtractFormatFlag = true;
|
||||
std::cerr << "processing unpaired extract format" << std::endl;
|
||||
@ -243,7 +285,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
int i=0;
|
||||
// TODO why read only the 1st line?
|
||||
if ( getline(extractFileP, line)) {
|
||||
if ( getline(extractFileP, line) ) {
|
||||
++i;
|
||||
tmpPhraseSource = new PHRASE();
|
||||
tmpPhraseTarget = new PHRASE();
|
||||
@ -373,6 +415,26 @@ int main(int argc, char* argv[])
|
||||
if (goodTuringFlag || kneserNeyFlag) {
|
||||
writeCountOfCounts( fileNameCountOfCounts );
|
||||
}
|
||||
|
||||
// source syntax labels
|
||||
if (sourceSyntaxLabelsFlag && sourceSyntaxLabelSetFlag && !inverseFlag) {
|
||||
writeLabelSet( sourceLabelSet, fileNameSourceLabelSet );
|
||||
}
|
||||
if (sourceSyntaxLabelsFlag && sourceSyntaxLabelCountsLHSFlag && !inverseFlag) {
|
||||
writeLeftHandSideLabelCounts( sourceLHSCounts,
|
||||
targetLHSAndSourceLHSJointCounts,
|
||||
fileNameLeftHandSideSourceLabelCounts,
|
||||
fileNameLeftHandSideTargetSourceLabelCounts );
|
||||
}
|
||||
|
||||
// target preference labels
|
||||
if (targetPreferenceLabelsFlag && !inverseFlag) {
|
||||
writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet );
|
||||
writeLeftHandSideLabelCounts( targetPreferenceLHSCounts,
|
||||
ruleTargetLHSAndTargetPreferenceLHSJointCounts,
|
||||
fileNameLeftHandSideTargetPreferenceLabelCounts,
|
||||
fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -467,6 +529,70 @@ void writeCountOfCounts( const string &fileNameCountOfCounts )
|
||||
}
|
||||
|
||||
|
||||
void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
|
||||
const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
|
||||
const std::string &fileNameLeftHandSideSourceLabelCounts,
|
||||
const std::string &fileNameLeftHandSideTargetSourceLabelCounts )
|
||||
{
|
||||
// open file
|
||||
Moses::OutputFileStream leftHandSideSourceLabelCounts;
|
||||
bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts.c_str());
|
||||
if (!success) {
|
||||
std::cerr << "ERROR: could not open left-hand side label counts file "
|
||||
<< fileNameLeftHandSideSourceLabelCounts << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// write source left-hand side counts
|
||||
for (boost::unordered_map<std::string,float>::const_iterator iter=sourceLHSCounts.begin();
|
||||
iter!=sourceLHSCounts.end(); ++iter) {
|
||||
leftHandSideSourceLabelCounts << iter->first << " " << iter->second << std::endl;
|
||||
}

leftHandSideSourceLabelCounts.Close();

// open file
Moses::OutputFileStream leftHandSideTargetSourceLabelCounts;
success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts.c_str());
if (!success) {
std::cerr << "ERROR: could not open left-hand side label joint counts file "
<< fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
return;
}

// write left-hand side / rule-target left-hand side joint label counts
for (boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::const_iterator iter=jointCountsLabelLHS.begin();
iter!=jointCountsLabelLHS.end(); ++iter) {
for (boost::unordered_map<std::string,float>::const_iterator iter2=(iter->second)->begin();
iter2!=(iter->second)->end(); ++iter2) {
leftHandSideTargetSourceLabelCounts << iter->first << " "<< iter2->first << " " << iter2->second << std::endl;
}
}

leftHandSideTargetSourceLabelCounts.Close();
}


void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName )
{
// open file
Moses::OutputFileStream out;
bool success = out.Open(fileName.c_str());
if (!success) {
std::cerr << "ERROR: could not open label set file "
<< fileName << std::endl;
return;
}

for (std::set<std::string>::const_iterator iter=labelSet.begin();
iter!=labelSet.end(); ++iter) {
out << *iter << std::endl;
}

out.Close();
}
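
For illustration only — the label names and counts below are invented — the two helpers above write plain text: writeLabelSet() emits one label per line, while writeLeftHandSideLabelCounts() emits space-separated "label count" lines to its first file and "label label count" lines to its second, matching the stream output in the loops above. For the target preference case the three files would look roughly like:

    phrase-table.syntaxLabels.tgtpref:   NP
    phrase-table.tgtpref.lhs:            NP 1428
    phrase-table.tgt-tgtpref.lhs:        S NP 97.5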


void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
{

@ -639,7 +765,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,

if (kneserNeyFlag)
phraseTableFile << " " << distinctCount;

if ((treeFragmentsFlag) &&
if ((treeFragmentsFlag || sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) &&
!inverseFlag) {
phraseTableFile << " |||";
}

@ -654,6 +780,49 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,

}
}

// syntax labels
if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) {
unsigned nNTs = 1;
for(size_t j=0; j<phraseSource->size()-1; ++j) {
if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
++nNTs;
}
// source syntax labels
if (sourceSyntaxLabelsFlag) {
std::string sourceLabelCounts;
sourceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("SourceLabels",
sourceLabelSet,
sourceLHSCounts,
targetLHSAndSourceLHSJointCounts,
vcbT);
if ( !sourceLabelCounts.empty() ) {
phraseTableFile << " {{SourceLabels "
<< nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
<< " "
<< count // rule count
<< sourceLabelCounts
<< "}}";
}
}
// target preference labels
if (targetPreferenceLabelsFlag) {
std::string targetPreferenceLabelCounts;
targetPreferenceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
targetPreferenceLabelSet,
targetPreferenceLHSCounts,
ruleTargetLHSAndTargetPreferenceLHSJointCounts,
vcbT);
if ( !targetPreferenceLabelCounts.empty() ) {
phraseTableFile << " {{TargetPreferences "
<< nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
<< " "
<< count // rule count
<< targetPreferenceLabelCounts
<< "}}";
}
}
}

phraseTableFile << std::endl;
}
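
For illustration only — the numbers are invented and the label-count payload returned by CollectAllLabelsSeparateLHSAndRHS() is abbreviated to "..." — a rule line now ends with one additional "|||"-separated field carrying the properties in the usual {{Key value}} notation:

    ... ||| {{SourceLabels 3 12 ...}} {{TargetPreferences 3 12 ...}}

Here 3 is nNTs (the number of non-terminal symbols in the rule, including the left-hand side) and 12 is the rule count, exactly as written by the code above.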

@ -894,3 +1063,4 @@ void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget,

}
}
}


@ -1,12 +1,22 @@

#pragma once
/*
* score.h
* extract
*
* Created by Hieu Hoang on 28/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/

#include <string>
#include <vector>


@ -27,7 +27,7 @@ public:

std::vector< WORD > vocab;
WORD_ID storeIfNew( const WORD& );
WORD_ID getWordID( const WORD& );
inline WORD &getWord( WORD_ID id ) {
inline WORD &getWord( const WORD_ID id ) {
return vocab[ id ];
}
};

@ -150,8 +150,14 @@ tokenize
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
factorize
mock-parse
in: tokenized-corpus
out: mock-parsed-corpus
default-name: lm/mock-parsed
pass-unless: mock-output-parser-lm
template: $mock-output-parser-lm < IN > OUT
factorize
in: mock-parsed-corpus
out: factorized-corpus
rerun-on-change: TRAINING:output-factors
default-name: lm/factored

@ -234,8 +240,14 @@ tokenize-tuning
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
factorize-tuning
mock-parse-tuning
in: tokenized-tuning
out: mock-parsed-tuning
default-name: lm/interpolate-tuning.mock-parsed
pass-unless: mock-output-parser-lm
template: $mock-output-parser-lm < IN > OUT
factorize-tuning
in: mock-parsed-tuning
out: factorized-tuning
default-name: lm/interpolate-tuning.factored
pass-unless: TRAINING:output-factors
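
For illustration only — the section name and wrapper path are hypothetical — the new mock-parse steps are plain pass-throughs unless the corresponding variable is defined in the experiment configuration, for example:

    [LM:my-corpus]
    mock-output-parser-lm = "$moses-script-dir/training/wrappers/my-mock-parser.perl"

If the variable is left unset, pass-unless hands the tokenized corpus straight on to factorize, as before.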

@ -705,17 +717,32 @@ tokenize-input-devtest
pass-unless: input-tokenizer
ignore-unless: use-mira
template: $input-tokenizer < IN > OUT
parse-input
mock-parse-input
in: tokenized-input
out: mock-parsed-input
default-name: tuning/input.mock-parsed
pass-unless: mock-input-parser-devtesteval
template: $mock-input-parser-devtesteval < IN > OUT
mock-parse-input-devtest
in: tokenized-input-devtest
out: mock-parsed-input-devtest
default-name: tuning/input.devtest.mock-parsed
pass-unless: mock-input-parser-devtesteval
ignore-unless: use-mira
template: $mock-input-parser-devtesteval < IN > OUT
parse-input
in: mock-parsed-input
out: parsed-input
default-name: tuning/input.parsed
pass-unless: input-parser
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parser < IN > OUT
parse-input-devtest
in: tokenized-input-devtest
in: mock-parsed-input-devtest
out: parsed-input-devtest
default-name: tuning/input.devtest.parsed
pass-unless: input-parser
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
ignore-unless: use-mira
template: $input-parser < IN > OUT
parse-relax-input

@ -723,14 +750,16 @@ parse-relax-input
out: parse-relaxed-input
default-name: tuning/input.parse-relaxed
pass-unless: input-parse-relaxer
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
parse-relax-input-devtest
in: parsed-input-devtest
out: parse-relaxed-input-devtest
default-name: tuning/input.devtest.parse-relaxed
pass-unless: input-parse-relaxer
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
ignore-unless: use-mira
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
factorize-input
in: parse-relaxed-input
out: factorized-input

@ -832,8 +861,20 @@ tokenize-reference-devtest
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
lowercase-reference
mock-parse-reference
in: tokenized-reference
out: mock-parsed-reference
default-name: tuning/reference.mock-parsed
pass-unless: mock-output-parser-references
template: $mock-output-parser-references < IN > OUT
mock-parse-reference-devtest
in: tokenized-reference-devtest
out: mock-parsed-reference-devtest
default-name: tuning/reference.devtest.mock-parsed
pass-unless: mock-output-parser-references
template: $mock-output-parser-references < IN > OUT
lowercase-reference
in: mock-parsed-reference
out: truecased-reference
default-name: tuning/reference.lc
pass-unless: output-lowercaser

@ -841,7 +882,7 @@ lowercase-reference
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
lowercase-reference-devtest
in: tokenized-reference-devtest
in: mock-parsed-reference-devtest
out: truecased-reference-devtest
default-name: tuning/reference.devtest.lc
pass-unless: output-lowercaser

@ -850,7 +891,7 @@ lowercase-reference-devtest
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
truecase-reference
in: tokenized-reference TRUECASER:truecase-model
in: mock-parsed-reference TRUECASER:truecase-model
out: truecased-reference
rerun-on-change: output-truecaser
default-name: tuning/reference.tc

@ -858,7 +899,7 @@ truecase-reference
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-truecaser -model IN1.$output-extension < IN > OUT
truecase-reference-devtest
in: tokenized-reference-devtest TRUECASER:truecase-model
in: mock-parsed-reference-devtest TRUECASER:truecase-model
out: truecased-reference-devtest
rerun-on-change: output-truecaser
default-name: tuning/reference.devtest.tc

@ -959,18 +1000,26 @@ tokenize-input
default-name: evaluation/input.tok
pass-unless: input-tokenizer
template: $input-tokenizer < IN > OUT
parse-input
mock-parse-input
in: tokenized-input
out: mock-parsed-input
default-name: evaluation/input.mock-parsed
pass-unless: mock-input-parser-devtesteval
template: $mock-input-parser-devtesteval < IN > OUT
parse-input
in: mock-parsed-input
out: parsed-input
default-name: evaluation/input.parsed
pass-unless: input-parser
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parser < IN > OUT
parse-relax-input
in: parsed-input
out: parse-relaxed-input
default-name: tuning/input.parse-relaxed
pass-unless: input-parse-relaxer
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
factorize-input
in: parse-relaxed-input
out: factorized-input

@ -1093,8 +1142,14 @@ tokenize-reference
pass-unless: output-tokenizer
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
lowercase-reference
mock-parse-reference
in: tokenized-reference
out: mock-parsed-reference
default-name: evaluation/reference.mock-parsed
pass-unless: mock-output-parser-references
template: $mock-output-parser-references < IN > OUT
lowercase-reference
in: mock-parsed-reference
out: reference
default-name: evaluation/reference
pass-unless: output-lowercaser
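
Summarizing the revised step chains for tuning and evaluation (read off the in/out declarations above; this is a sketch, not part of the meta file):

    tokenized-input      -> mock-parsed-input      -> parsed-input -> parse-relaxed-input -> factorized-input
    tokenized-reference  -> mock-parsed-reference  -> truecased-reference (tuning) / reference (evaluation)

Each mock-parse step is a pass-through unless mock-input-parser-devtesteval or mock-output-parser-references is set, and the real parse and parse-relax steps are additionally passed through when skip-parse-input-devtesteval is set.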

@ -2406,24 +2406,16 @@ sub define_training_create_config {
$cmd .= "-transliteration-phrase-table $transliteration_pt ";
}

if($osm){
if ($osm) {
my $osm_settings = &get("TRAINING:operation-sequence-model-settings");

if($osm_settings =~ /factor/){
$cmd .= "-osm-model $osm/ ";
my $find = "--factor";
my $replace = "-osm-setting";
$osm_settings =~ s/$find/$replace/g;
$cmd .= "$osm_settings ";
}
else{
$cmd .= "-osm-model $osm/operationLM.bin ";
}
if ($osm_settings =~ /-factor *(\S+)/){
$cmd .= "-osm-model $osm/ -osm-setting $1 ";
}
else {
$cmd .= "-osm-model $osm/operationLM.bin ";
}
}

# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;