mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
Roll out mixed syntax
This commit is contained in:
parent
be9b3cb1c6
commit
73f1d259a1
194
phrase-extract/extract-mixed-syntax/AlignedSentence.cpp
Normal file
194
phrase-extract/extract-mixed-syntax/AlignedSentence.cpp
Normal file
@ -0,0 +1,194 @@
|
|||||||
|
/*
|
||||||
|
* AlignedSentence.cpp
|
||||||
|
*
|
||||||
|
* Created on: 18 Feb 2014
|
||||||
|
* Author: s0565741
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include "moses/Util.h"
|
||||||
|
#include "AlignedSentence.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Builds a sentence pair from raw text: both sides are tokenized into word
// vectors first, then the alignment string is applied on top of them.
AlignedSentence::AlignedSentence(int lineNum,
                                 const std::string &source,
                                 const std::string &target,
                                 const std::string &alignment)
  : m_lineNum(lineNum)
{
  PopulateWordVec(m_source, source);
  PopulateWordVec(m_target, target);

  // has to run last: alignment points index into m_source and m_target
  PopulateAlignment(alignment);
}
|
||||||
|
|
||||||
|
// Releases the Word pointers held by the source and target phrases.
// NOTE(review): presumably RemoveAllInColl deletes every element - confirm in moses/Util.h
AlignedSentence::~AlignedSentence()
{
  Moses::RemoveAllInColl(m_source);
  Moses::RemoveAllInColl(m_target);
}
|
||||||
|
|
||||||
|
// Tokenizes 'line' and fills 'vec' with one heap-allocated Word per token.
// Each Word is constructed with its position in the sentence and its surface
// string. Any previous content of 'vec' is overwritten by the resize/assign.
void AlignedSentence::PopulateWordVec(Phrase &vec, const std::string &line)
{
  vector<string> tokens;
  Moses::Tokenize(tokens, line);

  const size_t numTokens = tokens.size();
  vec.resize(numTokens);

  for (size_t pos = 0; pos < numTokens; ++pos) {
    vec[pos] = new Word(pos, tokens[pos]);
  }
}
|
||||||
|
|
||||||
|
void AlignedSentence::PopulateAlignment(const std::string &line)
|
||||||
|
{
|
||||||
|
vector<string> alignStr;
|
||||||
|
Moses::Tokenize(alignStr, line);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < alignStr.size(); ++i) {
|
||||||
|
vector<int> alignPair;
|
||||||
|
Moses::Tokenize(alignPair, alignStr[i], "-");
|
||||||
|
assert(alignPair.size() == 2);
|
||||||
|
|
||||||
|
int sourcePos = alignPair[0];
|
||||||
|
int targetPos = alignPair[1];
|
||||||
|
|
||||||
|
if (sourcePos >= m_source.size()) {
|
||||||
|
cerr << "ERROR1:AlignedSentence=" << Debug() << endl;
|
||||||
|
cerr << "m_source=" << m_source.size() << endl;
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
assert(sourcePos < m_source.size());
|
||||||
|
assert(targetPos < m_target.size());
|
||||||
|
Word *sourceWord = m_source[sourcePos];
|
||||||
|
Word *targetWord = m_target[targetPos];
|
||||||
|
|
||||||
|
sourceWord->AddAlignment(targetWord);
|
||||||
|
targetWord->AddAlignment(sourceWord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string AlignedSentence::Debug() const
|
||||||
|
{
|
||||||
|
stringstream out;
|
||||||
|
out << "m_lineNum:";
|
||||||
|
out << m_lineNum;
|
||||||
|
out << endl;
|
||||||
|
|
||||||
|
out << "m_source:";
|
||||||
|
out << m_source.Debug();
|
||||||
|
out << endl;
|
||||||
|
|
||||||
|
out << "m_target:";
|
||||||
|
out << m_target.Debug();
|
||||||
|
out << endl;
|
||||||
|
|
||||||
|
out << "consistent phrases:" << endl;
|
||||||
|
out << m_consistentPhrases.Debug();
|
||||||
|
out << endl;
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> AlignedSentence::GetSourceAlignmentCount() const
|
||||||
|
{
|
||||||
|
vector<int> ret(m_source.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < m_source.size(); ++i) {
|
||||||
|
const Word &word = *m_source[i];
|
||||||
|
ret[i] = word.GetAlignmentIndex().size();
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void AlignedSentence::Create(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
CreateConsistentPhrases(params);
|
||||||
|
m_consistentPhrases.AddHieroNonTerms(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
void AlignedSentence::CreateConsistentPhrases(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
int countT = m_target.size();
|
||||||
|
int countS = m_source.size();
|
||||||
|
|
||||||
|
m_consistentPhrases.Initialize(countS);
|
||||||
|
|
||||||
|
// check alignments for target phrase startT...endT
|
||||||
|
for(int lengthT=1;
|
||||||
|
lengthT <= params.maxSpan && lengthT <= countT;
|
||||||
|
lengthT++) {
|
||||||
|
for(int startT=0; startT < countT-(lengthT-1); startT++) {
|
||||||
|
|
||||||
|
// that's nice to have
|
||||||
|
int endT = startT + lengthT - 1;
|
||||||
|
|
||||||
|
// find find aligned source words
|
||||||
|
// first: find minimum and maximum source word
|
||||||
|
int minS = 9999;
|
||||||
|
int maxS = -1;
|
||||||
|
vector< int > usedS = GetSourceAlignmentCount();
|
||||||
|
for(int ti=startT; ti<=endT; ti++) {
|
||||||
|
const Word &word = *m_target[ti];
|
||||||
|
const std::set<int> &alignment = word.GetAlignmentIndex();
|
||||||
|
|
||||||
|
std::set<int>::const_iterator iterAlign;
|
||||||
|
for(iterAlign = alignment.begin(); iterAlign != alignment.end(); ++iterAlign) {
|
||||||
|
int si = *iterAlign;
|
||||||
|
if (si<minS) {
|
||||||
|
minS = si;
|
||||||
|
}
|
||||||
|
if (si>maxS) {
|
||||||
|
maxS = si;
|
||||||
|
}
|
||||||
|
usedS[ si ]--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// unaligned phrases are not allowed
|
||||||
|
if( maxS == -1 )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// source phrase has to be within limits
|
||||||
|
size_t width = maxS - minS + 1;
|
||||||
|
|
||||||
|
if( width < params.minSpan )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if( width > params.maxSpan )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// check if source words are aligned to out of bound target words
|
||||||
|
bool out_of_bounds = false;
|
||||||
|
for(int si=minS; si<=maxS && !out_of_bounds; si++)
|
||||||
|
if (usedS[si]>0) {
|
||||||
|
out_of_bounds = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if out of bound, you gotta go
|
||||||
|
if (out_of_bounds)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// done with all the checks, lets go over all consistent phrase pairs
|
||||||
|
// start point of source phrase may retreat over unaligned
|
||||||
|
for(int startS=minS;
|
||||||
|
(startS>=0 &&
|
||||||
|
startS>maxS - params.maxSpan && // within length limit
|
||||||
|
(startS==minS || m_source[startS]->GetAlignment().size()==0)); // unaligned
|
||||||
|
startS--) {
|
||||||
|
// end point of source phrase may advance over unaligned
|
||||||
|
for(int endS=maxS;
|
||||||
|
(endS<countS && endS<startS + params.maxSpan && // within length limit
|
||||||
|
(endS==maxS || m_source[endS]->GetAlignment().size()==0)); // unaligned
|
||||||
|
endS++) {
|
||||||
|
|
||||||
|
// take note that this is a valid phrase alignment
|
||||||
|
m_consistentPhrases.Add(startS, endS, startT, endT, params);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
51
phrase-extract/extract-mixed-syntax/AlignedSentence.h
Normal file
51
phrase-extract/extract-mixed-syntax/AlignedSentence.h
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*
|
||||||
|
* AlignedSentence.h
|
||||||
|
*
|
||||||
|
* Created on: 18 Feb 2014
|
||||||
|
* Author: s0565741
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <set>
|
||||||
|
#include "ConsistentPhrases.h"
|
||||||
|
#include "Phrase.h"
|
||||||
|
#include "moses/TypeDef.h"
|
||||||
|
|
||||||
|
class Parameter;
|
||||||
|
|
||||||
|
class AlignedSentence {
|
||||||
|
public:
|
||||||
|
AlignedSentence(int lineNum)
|
||||||
|
:m_lineNum(lineNum)
|
||||||
|
{}
|
||||||
|
|
||||||
|
AlignedSentence(int lineNum,
|
||||||
|
const std::string &source,
|
||||||
|
const std::string &target,
|
||||||
|
const std::string &alignment);
|
||||||
|
virtual ~AlignedSentence();
|
||||||
|
virtual void Create(const Parameter ¶ms);
|
||||||
|
|
||||||
|
const Phrase &GetPhrase(Moses::FactorDirection direction) const
|
||||||
|
{ return (direction == Moses::Input) ? m_source : m_target; }
|
||||||
|
|
||||||
|
const ConsistentPhrases &GetConsistentPhrases() const
|
||||||
|
{ return m_consistentPhrases; }
|
||||||
|
|
||||||
|
virtual std::string Debug() const;
|
||||||
|
|
||||||
|
int m_lineNum;
|
||||||
|
protected:
|
||||||
|
Phrase m_source, m_target;
|
||||||
|
ConsistentPhrases m_consistentPhrases;
|
||||||
|
|
||||||
|
void CreateConsistentPhrases(const Parameter ¶ms);
|
||||||
|
void PopulateWordVec(Phrase &vec, const std::string &line);
|
||||||
|
|
||||||
|
// m_source and m_target MUST be populated before calling this
|
||||||
|
void PopulateAlignment(const std::string &line);
|
||||||
|
std::vector<int> GetSourceAlignmentCount() const;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
183
phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp
Normal file
183
phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp
Normal file
@ -0,0 +1,183 @@
|
|||||||
|
/*
|
||||||
|
* AlignedSentenceSyntax.cpp
|
||||||
|
*
|
||||||
|
* Created on: 26 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "AlignedSentenceSyntax.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
#include "pugixml.hpp"
|
||||||
|
#include "moses/Util.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Only stores the raw strings. Parsing is deferred to Create(), which has
// access to the extraction parameters.
AlignedSentenceSyntax::AlignedSentenceSyntax(int lineNum,
    const std::string &source,
    const std::string &target,
    const std::string &alignment)
  : AlignedSentence(lineNum),
    m_sourceStr(source),
    m_targetStr(target),
    m_alignmentStr(alignment)
{
}
|
||||||
|
|
||||||
|
// Nothing to release here; members clean up after themselves.
AlignedSentenceSyntax::~AlignedSentenceSyntax()
{
}
|
||||||
|
|
||||||
|
void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const Parameter ¶ms,
|
||||||
|
string line, Phrase &phrase, SyntaxTree &tree)
|
||||||
|
{
|
||||||
|
// parse source and target string
|
||||||
|
if (isSyntax) {
|
||||||
|
line = "<xml><tree label=\"X\">" + line + "</tree></xml>";
|
||||||
|
XMLParse(phrase, tree, line, params);
|
||||||
|
|
||||||
|
if (mixedSyntaxType != 0) {
|
||||||
|
// mixed syntax. Always add [X] where there isn't 1
|
||||||
|
tree.SetHieroLabel(params.hieroNonTerm);
|
||||||
|
if (mixedSyntaxType == 2) {
|
||||||
|
tree.AddToAll(params.hieroNonTerm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
PopulateWordVec(phrase, line);
|
||||||
|
tree.SetHieroLabel(params.hieroNonTerm);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void AlignedSentenceSyntax::Create(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
Populate(params.sourceSyntax, params.mixedSyntaxType, params, m_sourceStr,
|
||||||
|
m_source, m_sourceTree);
|
||||||
|
Populate(params.targetSyntax, params.mixedSyntaxType, params, m_targetStr,
|
||||||
|
m_target, m_targetTree);
|
||||||
|
|
||||||
|
PopulateAlignment(m_alignmentStr);
|
||||||
|
CreateConsistentPhrases(params);
|
||||||
|
|
||||||
|
// create labels
|
||||||
|
CreateNonTerms();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-escapes characters that the XML parser has unescaped, so that tokens keep
// the entity-escaped form used in Moses training data.
//
// The replacement strings had been flattened by HTML-entity decoding (each
// call replaced a character with itself, and the double-quote line did not
// even compile); they are restored to the entity forms here.
// '&' must be handled first, otherwise the ampersands introduced by the later
// replacements would be escaped a second time.
void Escape(string &text)
{
  text = Moses::Replace(text, "&", "&amp;");
  text = Moses::Replace(text, "|", "&#124;");
  text = Moses::Replace(text, "<", "&lt;");
  text = Moses::Replace(text, ">", "&gt;");
  text = Moses::Replace(text, "'", "&apos;");
  text = Moses::Replace(text, "\"", "&quot;");
  text = Moses::Replace(text, "[", "&#91;");
  text = Moses::Replace(text, "]", "&#93;");
}
|
||||||
|
|
||||||
|
// Recursively walks the children of 'parentNode'. Words found in node values
// are appended to 'output'; every child that carries a non-empty "label"
// attribute is recorded in 'tree' as a labelled span covering the words that
// were appended while processing that child.
void AlignedSentenceSyntax::XMLParse(Phrase &output,
                                     SyntaxTree &tree,
                                     const pugi::xml_node &parentNode,
                                     const Parameter &params)
{
  for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) {
    string nodeName = childNode.name();

    // span label
    string label;
    // first word position this child can contribute
    int startPos = output.size();

    if (!nodeName.empty()) {
      pugi::xml_attribute attribute = childNode.attribute("label");
      label = attribute.as_string();

      // recursively call this function. For proper recursive trees
      XMLParse(output, tree, childNode, params);
    }

    // fill phrase vector
    string text = childNode.value();
    Escape(text);

    std::vector<string> toks;
    Moses::Tokenize(toks, text);

    for (size_t i = 0; i < toks.size(); ++i) {
      const string &tok = toks[i];
      // the word's position is its index in the output phrase
      Word *word = new Word(output.size(), tok);
      output.push_back(word);
    }

    // is it a labelled span?
    int endPos = output.size() - 1;

    // fill syntax labels
    if (!label.empty()) {
      label = "[" + label + "]";
      tree.Add(startPos, endPos, label, params);
    }
  }
}
|
||||||
|
|
||||||
|
// Parses 'input' as an XML document and hands its <xml> root element to the
// node-based overload, which fills 'output' and 'tree'.
//
// A parse failure used to be ignored silently; it is now reported on stderr.
// Processing still continues with whatever part of the document was loaded, so
// existing behaviour for callers is otherwise unchanged.
void AlignedSentenceSyntax::XMLParse(Phrase &output,
                                     SyntaxTree &tree,
                                     const std::string input,
                                     const Parameter &params)
{
  pugi::xml_document doc;
  pugi::xml_parse_result result = doc.load(input.c_str(),
                                  pugi::parse_default | pugi::parse_comments);

  if (!result) {
    cerr << "WARNING: XML parse error in sentence " << m_lineNum
         << ": " << result.description() << endl;
  }

  pugi::xml_node topNode = doc.child("xml");
  XMLParse(output, tree, topNode, params);
}
|
||||||
|
|
||||||
|
void AlignedSentenceSyntax::CreateNonTerms()
|
||||||
|
{
|
||||||
|
for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) {
|
||||||
|
for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) {
|
||||||
|
ConsistentPhrases::Coll &coll = m_consistentPhrases.GetColl(sourceStart, sourceEnd);
|
||||||
|
const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd);
|
||||||
|
|
||||||
|
ConsistentPhrases::Coll::iterator iter;
|
||||||
|
for (iter = coll.begin(); iter != coll.end(); ++iter) {
|
||||||
|
ConsistentPhrase &cp = **iter;
|
||||||
|
|
||||||
|
int targetStart = cp.corners[2];
|
||||||
|
int targetEnd = cp.corners[3];
|
||||||
|
const SyntaxTree::Labels &targetLabels = m_targetTree.Find(targetStart, targetEnd);
|
||||||
|
|
||||||
|
CreateNonTerms(cp, sourceLabels, targetLabels);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Adds one non-terminal to 'cp' for every (source label, target label)
// combination, i.e. the cross product of the two label collections.
void AlignedSentenceSyntax::CreateNonTerms(ConsistentPhrase &cp,
    const SyntaxTree::Labels &sourceLabels,
    const SyntaxTree::Labels &targetLabels)
{
  typedef SyntaxTree::Labels::const_iterator LabelIter;

  for (LabelIter src = sourceLabels.begin(); src != sourceLabels.end(); ++src) {
    for (LabelIter tgt = targetLabels.begin(); tgt != targetLabels.end(); ++tgt) {
      cp.AddNonTerms(*src, *tgt);
    }
  }
}
|
||||||
|
|
||||||
|
|
46
phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h
Normal file
46
phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.h
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
/*
|
||||||
|
* AlignedSentenceSyntax.h
|
||||||
|
*
|
||||||
|
* Created on: 26 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "AlignedSentence.h"
|
||||||
|
#include "SyntaxTree.h"
|
||||||
|
#include "pugixml.hpp"
|
||||||
|
|
||||||
|
class AlignedSentenceSyntax : public AlignedSentence
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
AlignedSentenceSyntax(int lineNum,
|
||||||
|
const std::string &source,
|
||||||
|
const std::string &target,
|
||||||
|
const std::string &alignment);
|
||||||
|
virtual ~AlignedSentenceSyntax();
|
||||||
|
|
||||||
|
void Create(const Parameter ¶ms);
|
||||||
|
|
||||||
|
//virtual std::string Debug() const;
|
||||||
|
protected:
|
||||||
|
std::string m_sourceStr, m_targetStr, m_alignmentStr;
|
||||||
|
SyntaxTree m_sourceTree, m_targetTree;
|
||||||
|
|
||||||
|
void XMLParse(Phrase &output,
|
||||||
|
SyntaxTree &tree,
|
||||||
|
const std::string input,
|
||||||
|
const Parameter ¶ms);
|
||||||
|
void XMLParse(Phrase &output,
|
||||||
|
SyntaxTree &tree,
|
||||||
|
const pugi::xml_node &parentNode,
|
||||||
|
const Parameter ¶ms);
|
||||||
|
void CreateNonTerms();
|
||||||
|
void CreateNonTerms(ConsistentPhrase &cp,
|
||||||
|
const SyntaxTree::Labels &sourceLabels,
|
||||||
|
const SyntaxTree::Labels &targetLabels);
|
||||||
|
void Populate(bool isSyntax, int mixedSyntaxType, const Parameter ¶ms,
|
||||||
|
std::string line, Phrase &phrase, SyntaxTree &tree);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
66
phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp
Normal file
66
phrase-extract/extract-mixed-syntax/ConsistentPhrase.cpp
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
/*
|
||||||
|
* ConsistentPhrase.cpp
|
||||||
|
*
|
||||||
|
* Created on: 20 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
#include <sstream>
|
||||||
|
#include "ConsistentPhrase.h"
|
||||||
|
#include "Word.h"
|
||||||
|
#include "NonTerm.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Stores the four span boundaries in 'corners' as
// [sourceStart, sourceEnd, targetStart, targetEnd] and creates the hiero
// non-terminal with params.hieroNonTerm as both of its labels.
ConsistentPhrase::ConsistentPhrase(
  int sourceStart, int sourceEnd,
  int targetStart, int targetEnd,
  const Parameter &params)
  :corners(4)
  ,m_hieroNonTerm(*this, params.hieroNonTerm, params.hieroNonTerm)
{
  corners[0] = sourceStart;
  corners[1] = sourceEnd;
  corners[2] = targetStart;
  corners[3] = targetEnd;
}
|
||||||
|
|
||||||
|
// Nothing to release explicitly; all members are value types.
ConsistentPhrase::~ConsistentPhrase()
{
}
|
||||||
|
|
||||||
|
bool ConsistentPhrase::operator<(const ConsistentPhrase &other) const
|
||||||
|
{
|
||||||
|
return corners < other.corners;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ConsistentPhrase::AddNonTerms(const std::string &source,
|
||||||
|
const std::string &target)
|
||||||
|
{
|
||||||
|
m_nonTerms.push_back(NonTerm(*this, source, target));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ConsistentPhrase::TargetOverlap(const ConsistentPhrase &other) const
|
||||||
|
{
|
||||||
|
if ( other.corners[3] < corners[2] || other.corners[2] > corners[3])
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string ConsistentPhrase::Debug() const
|
||||||
|
{
|
||||||
|
stringstream out;
|
||||||
|
out << "[" << corners[0] << "-" << corners[1]
|
||||||
|
<< "][" << corners[2] << "-" << corners[3] << "]";
|
||||||
|
|
||||||
|
out << "NT:";
|
||||||
|
for (size_t i = 0; i < m_nonTerms.size(); ++i) {
|
||||||
|
const NonTerm &nonTerm = m_nonTerms[i];
|
||||||
|
out << nonTerm.GetLabel(Moses::Input) << ":" << nonTerm.GetLabel(Moses::Output);
|
||||||
|
}
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
51
phrase-extract/extract-mixed-syntax/ConsistentPhrase.h
Normal file
51
phrase-extract/extract-mixed-syntax/ConsistentPhrase.h
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*
|
||||||
|
* ConsistentPhrase.h
|
||||||
|
*
|
||||||
|
* Created on: 20 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#include "moses/TypeDef.h"
|
||||||
|
#include "NonTerm.h"
|
||||||
|
|
||||||
|
class ConsistentPhrase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef std::vector<NonTerm> NonTerms;
|
||||||
|
|
||||||
|
std::vector<int> corners;
|
||||||
|
|
||||||
|
ConsistentPhrase(const ConsistentPhrase ©); // do not implement
|
||||||
|
ConsistentPhrase(int sourceStart, int sourceEnd,
|
||||||
|
int targetStart, int targetEnd,
|
||||||
|
const Parameter ¶ms);
|
||||||
|
|
||||||
|
virtual ~ConsistentPhrase();
|
||||||
|
|
||||||
|
int GetWidth(Moses::FactorDirection direction) const
|
||||||
|
{ return (direction == Moses::Input) ? corners[1] - corners[0] + 1 : corners[3] - corners[2] + 1; }
|
||||||
|
|
||||||
|
|
||||||
|
void AddNonTerms(const std::string &source,
|
||||||
|
const std::string &target);
|
||||||
|
const NonTerms &GetNonTerms() const
|
||||||
|
{ return m_nonTerms;}
|
||||||
|
const NonTerm &GetHieroNonTerm() const
|
||||||
|
{ return m_hieroNonTerm;}
|
||||||
|
|
||||||
|
bool TargetOverlap(const ConsistentPhrase &other) const;
|
||||||
|
|
||||||
|
bool operator<(const ConsistentPhrase &other) const;
|
||||||
|
|
||||||
|
std::string Debug() const;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
NonTerms m_nonTerms;
|
||||||
|
NonTerm m_hieroNonTerm;
|
||||||
|
};
|
||||||
|
|
103
phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
Normal file
103
phrase-extract/extract-mixed-syntax/ConsistentPhrases.cpp
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
/*
|
||||||
|
* ConsistentPhrases.cpp
|
||||||
|
*
|
||||||
|
* Created on: 20 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
#include <sstream>
|
||||||
|
#include <cassert>
|
||||||
|
#include "ConsistentPhrases.h"
|
||||||
|
#include "NonTerm.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
#include "moses/Util.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Starts empty; Initialize() must size the table before phrases are added.
ConsistentPhrases::ConsistentPhrases()
{}
|
||||||
|
|
||||||
|
// Releases the ConsistentPhrase pointers stored in every cell of the table.
ConsistentPhrases::~ConsistentPhrases()
{
  for (size_t start = 0; start < m_coll.size(); ++start) {
    std::vector<Coll> &row = m_coll[start];

    for (size_t len = 0; len < row.size(); ++len) {
      Moses::RemoveAllInColl(row[len]);
    }
  }
}
|
||||||
|
|
||||||
|
// Sizes the table for a source sentence of 'size' words. Row s gets
// (size - s) cells, one for each possible span that starts at s.
void ConsistentPhrases::Initialize(size_t size)
{
  m_coll.resize(size);

  for (size_t start = 0; start < size; ++start) {
    m_coll[start].resize(size - start);
  }
}
|
||||||
|
|
||||||
|
void ConsistentPhrases::Add(int sourceStart, int sourceEnd,
|
||||||
|
int targetStart, int targetEnd,
|
||||||
|
const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
Coll &coll = m_coll[sourceStart][sourceEnd - sourceStart];
|
||||||
|
ConsistentPhrase *cp = new ConsistentPhrase(sourceStart, sourceEnd,
|
||||||
|
targetStart, targetEnd,
|
||||||
|
params);
|
||||||
|
|
||||||
|
pair<Coll::iterator, bool> inserted = coll.insert(cp);
|
||||||
|
assert(inserted.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read-only access to the phrases whose source span is exactly
// [sourceStart, sourceEnd]. No range checking is performed.
const ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd) const
{
  return m_coll[sourceStart][sourceEnd - sourceStart];
}
|
||||||
|
|
||||||
|
// Mutable access to the phrases whose source span is exactly
// [sourceStart, sourceEnd]. No range checking is performed.
ConsistentPhrases::Coll &ConsistentPhrases::GetColl(int sourceStart, int sourceEnd)
{
  return m_coll[sourceStart][sourceEnd - sourceStart];
}
|
||||||
|
|
||||||
|
std::string ConsistentPhrases::Debug() const
|
||||||
|
{
|
||||||
|
std::stringstream out;
|
||||||
|
for (int start = 0; start < m_coll.size(); ++start) {
|
||||||
|
const std::vector<Coll> &allSourceStart = m_coll[start];
|
||||||
|
|
||||||
|
for (int size = 0; size < allSourceStart.size(); ++size) {
|
||||||
|
const Coll &coll = allSourceStart[size];
|
||||||
|
|
||||||
|
Coll::const_iterator iter;
|
||||||
|
for (iter = coll.begin(); iter != coll.end(); ++iter) {
|
||||||
|
const ConsistentPhrase &consistentPhrase = **iter;
|
||||||
|
out << consistentPhrase.Debug() << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ConsistentPhrases::AddHieroNonTerms(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
// add [X] labels everywhere
|
||||||
|
for (int i = 0; i < m_coll.size(); ++i) {
|
||||||
|
vector<Coll> &inner = m_coll[i];
|
||||||
|
for (int j = 0; j < inner.size(); ++j) {
|
||||||
|
ConsistentPhrases::Coll &coll = inner[j];
|
||||||
|
ConsistentPhrases::Coll::iterator iter;
|
||||||
|
for (iter = coll.begin(); iter != coll.end(); ++iter) {
|
||||||
|
ConsistentPhrase &cp = **iter;
|
||||||
|
cp.AddNonTerms(params.hieroNonTerm, params.hieroNonTerm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
40
phrase-extract/extract-mixed-syntax/ConsistentPhrases.h
Normal file
40
phrase-extract/extract-mixed-syntax/ConsistentPhrases.h
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
/*
|
||||||
|
* ConsistentPhrases.h
|
||||||
|
*
|
||||||
|
* Created on: 20 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <set>
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#include "ConsistentPhrase.h"
|
||||||
|
|
||||||
|
class Word;
|
||||||
|
class Parameter;
|
||||||
|
|
||||||
|
class ConsistentPhrases {
|
||||||
|
public:
|
||||||
|
typedef std::set<ConsistentPhrase*> Coll;
|
||||||
|
|
||||||
|
ConsistentPhrases();
|
||||||
|
virtual ~ConsistentPhrases();
|
||||||
|
|
||||||
|
void Initialize(size_t size);
|
||||||
|
|
||||||
|
void Add(int sourceStart, int sourceEnd,
|
||||||
|
int targetStart, int targetEnd,
|
||||||
|
const Parameter ¶ms);
|
||||||
|
|
||||||
|
void AddHieroNonTerms(const Parameter ¶ms);
|
||||||
|
|
||||||
|
const Coll &GetColl(int sourceStart, int sourceEnd) const;
|
||||||
|
Coll &GetColl(int sourceStart, int sourceEnd);
|
||||||
|
|
||||||
|
std::string Debug() const;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
std::vector< std::vector<Coll> > m_coll;
|
||||||
|
};
|
||||||
|
|
62
phrase-extract/extract-mixed-syntax/InputFileStream.cpp
Normal file
62
phrase-extract/extract-mixed-syntax/InputFileStream.cpp
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
|
||||||
|
|
||||||
|
/***********************************************************************
|
||||||
|
Moses - factored phrase-based language decoder
|
||||||
|
Copyright (C) 2006 University of Edinburgh
|
||||||
|
|
||||||
|
This library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with this library; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
***********************************************************************/
|
||||||
|
|
||||||
|
#include "InputFileStream.h"
|
||||||
|
#include "gzfilebuf.h"
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
namespace Moses
|
||||||
|
{
|
||||||
|
// Opens 'filePath' for reading. Paths ending in ".gz" are read through a
// gzfilebuf, everything else through a std::filebuf. If a plain file cannot be
// opened, an error is printed and the process exits with status 1.
InputFileStream::InputFileStream(const std::string &filePath)
  : std::istream(NULL)
  , m_streambuf(NULL)
{
  const bool isGzipped = filePath.size() > 3 &&
                         filePath.substr(filePath.size() - 3, 3) == ".gz";

  if (isGzipped) {
    m_streambuf = new gzfilebuf(filePath.c_str());
  } else {
    std::filebuf *fileBuf = new std::filebuf();
    // open() returns NULL on failure and the buffer itself on success
    if (!fileBuf->open(filePath.c_str(), std::ios::in)) {
      cerr << "Can't read " << filePath.c_str() << endl;
      exit(1);
    }
    m_streambuf = fileBuf;
  }

  this->init(m_streambuf);
}
|
||||||
|
|
||||||
|
// Destroys the stream buffer created in the constructor.
InputFileStream::~InputFileStream()
{
  delete m_streambuf;
  m_streambuf = NULL;   // guard against accidental reuse
}
|
||||||
|
|
||||||
|
// Intentionally a no-op: the underlying buffer is released by the destructor.
void InputFileStream::Close()
{}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
48
phrase-extract/extract-mixed-syntax/InputFileStream.h
Normal file
48
phrase-extract/extract-mixed-syntax/InputFileStream.h
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
|
||||||
|
|
||||||
|
/***********************************************************************
|
||||||
|
Moses - factored phrase-based language decoder
|
||||||
|
Copyright (C) 2006 University of Edinburgh
|
||||||
|
|
||||||
|
This library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with this library; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
***********************************************************************/
|
||||||
|
|
||||||
|
#ifndef moses_InputFileStream_h
|
||||||
|
#define moses_InputFileStream_h
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
|
/** Used in place of std::istream, can read zipped files if it ends in .gz
|
||||||
|
*/
|
||||||
|
class InputFileStream : public std::istream
{
public:
  // Opens filePath; a ".gz" suffix selects the gzip-aware buffer.
  InputFileStream(const std::string &filePath);
  ~InputFileStream();

  // Currently does nothing; kept for interface compatibility.
  void Close();

protected:
  // buffer created by the constructor and deleted by the destructor
  std::streambuf *m_streambuf;
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
203
phrase-extract/extract-mixed-syntax/Main.cpp
Normal file
203
phrase-extract/extract-mixed-syntax/Main.cpp
Normal file
@ -0,0 +1,203 @@
|
|||||||
|
#include <iostream>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
|
||||||
|
#include "Main.h"
|
||||||
|
#include "InputFileStream.h"
|
||||||
|
#include "OutputFileStream.h"
|
||||||
|
#include "AlignedSentence.h"
|
||||||
|
#include "AlignedSentenceSyntax.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
#include "Rules.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
bool g_debug = false;
|
||||||
|
|
||||||
|
int main(int argc, char** argv)
|
||||||
|
{
|
||||||
|
cerr << "Starting" << endl;
|
||||||
|
|
||||||
|
Parameter params;
|
||||||
|
|
||||||
|
namespace po = boost::program_options;
|
||||||
|
po::options_description desc("Options");
|
||||||
|
desc.add_options()
|
||||||
|
("help", "Print help messages")
|
||||||
|
("MaxSpan", po::value<int>()->default_value(params.maxSpan), "Max (source) span of a rule. ie. number of words in the source")
|
||||||
|
("MinSpan", po::value<int>()->default_value(params.minSpan), "Min (source) span of a rule.")
|
||||||
|
("GlueGrammar", po::value<string>()->default_value(params.gluePath), "Output glue grammar to here")
|
||||||
|
("SentenceOffset", po::value<long>()->default_value(params.sentenceOffset), "Starting sentence id. Not used")
|
||||||
|
("GZOutput", "Compress extract files")
|
||||||
|
("MaxNonTerm", po::value<int>()->default_value(params.maxNonTerm), "Maximum number of non-terms allowed per rule")
|
||||||
|
("MaxHieroNonTerm", po::value<int>()->default_value(params.maxHieroNonTerm), "Maximum number of Hiero non-term. Usually, --MaxNonTerm is the normal constraint")
|
||||||
|
("MinHoleSource", po::value<int>()->default_value(params.minHoleSource), "Minimum source span for a non-term.")
|
||||||
|
|
||||||
|
("SourceSyntax", "Source sentence is a parse tree")
|
||||||
|
("TargetSyntax", "Target sentence is a parse tree")
|
||||||
|
("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere")
|
||||||
|
("MultiLabel", po::value<int>()->default_value(params.multiLabel), "What to do with multiple labels on the same span. 0(default)=keep them all, 1=keep only top-most, 2=keep only bottom-most")
|
||||||
|
("HieroSourceLHS", "Always use Hiero source LHS? Default = 0")
|
||||||
|
("MaxSpanFreeNonTermSource", po::value<int>()->default_value(params.maxSpanFreeNonTermSource), "Max number of words covered by beginning/end NT. Default = 0 (no limit)")
|
||||||
|
("NoNieceTerminal", "Don't extract rule if 1 of the non-term covers the same word as 1 of the terminals")
|
||||||
|
("MaxScope", po::value<int>()->default_value(params.maxScope), "maximum scope (see Hopkins and Langmead (2010)). Default is HIGH")
|
||||||
|
("MinScope", po::value<int>()->default_value(params.minScope), "min scope.")
|
||||||
|
|
||||||
|
("SpanLength", "Property - span length of RHS each non-term")
|
||||||
|
|
||||||
|
("NonTermContext", "Property - (source) left and right, inside and outside words of each non-term ")
|
||||||
|
("NonTermContextTarget", "Property - (target) left and right, inside and outside words of each non-term")
|
||||||
|
("NonTermContextFactor", po::value<int>()->default_value(params.nonTermContextFactor), "Factor to use for non-term context property.")
|
||||||
|
|
||||||
|
("NumSourceFactors", po::value<int>()->default_value(params.numSourceFactors), "Number of source factors.")
|
||||||
|
("NumTargetFactors", po::value<int>()->default_value(params.numTargetFactors), "Number of target factors.")
|
||||||
|
|
||||||
|
("HieroNonTerm", po::value<string>()->default_value(params.hieroNonTerm), "Hiero non-terminal label, including bracket")
|
||||||
|
("ScopeSpan", po::value<string>()->default_value(params.scopeSpanStr), "Min and max span for rules of each scope. Format is min,max:min,max...")
|
||||||
|
|
||||||
|
("NonTermConsecSource", "Allow consecutive non-terms on the source side");
|
||||||
|
|
||||||
|
|
||||||
|
po::variables_map vm;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
po::store(po::parse_command_line(argc, argv, desc),
|
||||||
|
vm); // can throw
|
||||||
|
|
||||||
|
/** --help option
|
||||||
|
*/
|
||||||
|
if ( vm.count("help") || argc < 5 )
|
||||||
|
{
|
||||||
|
std::cout << argv[0] << " target source alignment [options...]" << std::endl
|
||||||
|
<< desc << std::endl;
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
po::notify(vm); // throws on error, so do after help in case
|
||||||
|
// there are any problems
|
||||||
|
}
|
||||||
|
catch(po::error& e)
|
||||||
|
{
|
||||||
|
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
|
||||||
|
std::cerr << desc << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vm.count("MaxSpan")) params.maxSpan = vm["MaxSpan"].as<int>();
|
||||||
|
if (vm.count("MinSpan")) params.minSpan = vm["MinSpan"].as<int>();
|
||||||
|
if (vm.count("GZOutput")) params.gzOutput = true;
|
||||||
|
if (vm.count("GlueGrammar")) params.gluePath = vm["GlueGrammar"].as<string>();
|
||||||
|
if (vm.count("SentenceOffset")) params.sentenceOffset = vm["SentenceOffset"].as<long>();
|
||||||
|
if (vm.count("MaxNonTerm")) params.maxNonTerm = vm["MaxNonTerm"].as<int>();
|
||||||
|
if (vm.count("MaxHieroNonTerm")) params.maxHieroNonTerm = vm["MaxHieroNonTerm"].as<int>();
|
||||||
|
if (vm.count("MinHoleSource")) params.minHoleSource = vm["MinHoleSource"].as<int>();
|
||||||
|
|
||||||
|
if (vm.count("SourceSyntax")) params.sourceSyntax = true;
|
||||||
|
if (vm.count("TargetSyntax")) params.targetSyntax = true;
|
||||||
|
if (vm.count("MixedSyntaxType")) params.mixedSyntaxType = vm["MixedSyntaxType"].as<int>();
|
||||||
|
if (vm.count("MultiLabel")) params.multiLabel = vm["MultiLabel"].as<int>();
|
||||||
|
if (vm.count("HieroSourceLHS")) params.hieroSourceLHS = true;
|
||||||
|
if (vm.count("MaxSpanFreeNonTermSource")) params.maxSpanFreeNonTermSource = vm["MaxSpanFreeNonTermSource"].as<int>();
|
||||||
|
if (vm.count("NoNieceTerminal")) params.nieceTerminal = false;
|
||||||
|
if (vm.count("MaxScope")) params.maxScope = vm["MaxScope"].as<int>();
|
||||||
|
if (vm.count("MinScope")) params.minScope = vm["MinScope"].as<int>();
|
||||||
|
|
||||||
|
// properties
|
||||||
|
if (vm.count("SpanLength")) params.spanLength = true;
|
||||||
|
if (vm.count("NonTermContext")) params.nonTermContext = true;
|
||||||
|
if (vm.count("NonTermContextTarget")) params.nonTermContextTarget = true;
|
||||||
|
if (vm.count("NonTermContextFactor")) params.nonTermContextFactor = vm["NonTermContextFactor"].as<int>();
|
||||||
|
|
||||||
|
if (vm.count("NumSourceFactors")) params.numSourceFactors = vm["NumSourceFactors"].as<int>();
|
||||||
|
if (vm.count("NumTargetFactors")) params.numTargetFactors = vm["NumTargetFactors"].as<int>();
|
||||||
|
|
||||||
|
if (vm.count("HieroNonTerm")) params.hieroNonTerm = vm["HieroNonTerm"].as<string>();
|
||||||
|
if (vm.count("ScopeSpan")) {
|
||||||
|
params.SetScopeSpan(vm["ScopeSpan"].as<string>());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vm.count("NonTermConsecSource")) params.nonTermConsecSource = true;
|
||||||
|
|
||||||
|
// input files;
|
||||||
|
string pathTarget = argv[1];
|
||||||
|
string pathSource = argv[2];
|
||||||
|
string pathAlignment = argv[3];
|
||||||
|
|
||||||
|
string pathExtract = argv[4];
|
||||||
|
string pathExtractInv = pathExtract + ".inv";
|
||||||
|
if (params.gzOutput) {
|
||||||
|
pathExtract += ".gz";
|
||||||
|
pathExtractInv += ".gz";
|
||||||
|
}
|
||||||
|
|
||||||
|
Moses::InputFileStream strmTarget(pathTarget);
|
||||||
|
Moses::InputFileStream strmSource(pathSource);
|
||||||
|
Moses::InputFileStream strmAlignment(pathAlignment);
|
||||||
|
Moses::OutputFileStream extractFile(pathExtract);
|
||||||
|
Moses::OutputFileStream extractInvFile(pathExtractInv);
|
||||||
|
|
||||||
|
|
||||||
|
// MAIN LOOP
|
||||||
|
int lineNum = 1;
|
||||||
|
string lineTarget, lineSource, lineAlignment;
|
||||||
|
while (getline(strmTarget, lineTarget)) {
|
||||||
|
if (lineNum % 10000 == 0) {
|
||||||
|
cerr << lineNum << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
bool success;
|
||||||
|
success = getline(strmSource, lineSource);
|
||||||
|
if (!success) {
|
||||||
|
throw "Couldn't read source";
|
||||||
|
}
|
||||||
|
success = getline(strmAlignment, lineAlignment);
|
||||||
|
if (!success) {
|
||||||
|
throw "Couldn't read alignment";
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
cerr << "lineTarget=" << lineTarget << endl;
|
||||||
|
cerr << "lineSource=" << lineSource << endl;
|
||||||
|
cerr << "lineAlignment=" << lineAlignment << endl;
|
||||||
|
*/
|
||||||
|
|
||||||
|
AlignedSentence *alignedSentence;
|
||||||
|
|
||||||
|
if (params.sourceSyntax || params.targetSyntax) {
|
||||||
|
alignedSentence = new AlignedSentenceSyntax(lineNum, lineSource, lineTarget, lineAlignment);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
alignedSentence = new AlignedSentence(lineNum, lineSource, lineTarget, lineAlignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
alignedSentence->Create(params);
|
||||||
|
//cerr << alignedSentence->Debug();
|
||||||
|
|
||||||
|
Rules rules(*alignedSentence);
|
||||||
|
rules.Extend(params);
|
||||||
|
rules.Consolidate(params);
|
||||||
|
//cerr << rules.Debug();
|
||||||
|
|
||||||
|
rules.Output(extractFile, true, params);
|
||||||
|
rules.Output(extractInvFile, false, params);
|
||||||
|
|
||||||
|
delete alignedSentence;
|
||||||
|
|
||||||
|
++lineNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!params.gluePath.empty()) {
|
||||||
|
Moses::OutputFileStream glueFile(params.gluePath);
|
||||||
|
CreateGlueGrammar(glueFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
cerr << "Finished" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Writes the three fixed glue-grammar rules to glueFile, one per line.
void CreateGlueGrammar(Moses::OutputFileStream &glueFile)
{
  glueFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl;
  glueFile << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl;
  glueFile << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
}
|
12
phrase-extract/extract-mixed-syntax/Main.h
Normal file
12
phrase-extract/extract-mixed-syntax/Main.h
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
/*
|
||||||
|
* Main.h
|
||||||
|
*
|
||||||
|
* Created on: 28 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "OutputFileStream.h"
|
||||||
|
|
||||||
|
void CreateGlueGrammar(Moses::OutputFileStream &glueFile);
|
||||||
|
|
17
phrase-extract/extract-mixed-syntax/Makefile
Normal file
17
phrase-extract/extract-mixed-syntax/Makefile
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
# Stand-alone build of the extract-mixed-syntax binary.
all: extract-mixed-syntax

# 'all' and 'clean' name actions, not files; declaring them phony keeps
# them working even if a file with one of those names exists.
.PHONY: all clean

clean:
	rm -f *.o extract-mixed-syntax

# Compile each .cpp into its .o
.cpp.o:
	g++ -O4 -g -c -I../../../boost/include -I../../../ $<

OBJECTS = AlignedSentence.o ConsistentPhrase.o ConsistentPhrases.o InputFileStream.o \
	Main.o OutputFileStream.o Parameter.o Phrase.o Rule.o Rules.o RuleSymbol.o \
	SyntaxTree.o Word.o NonTerm.o RulePhrase.o AlignedSentenceSyntax.o pugixml.o

# Link the final binary
extract-mixed-syntax: $(OBJECTS)
	g++ $(OBJECTS) -L../../../boost/lib64 -L../../../lib -lz -lboost_iostreams-mt -lboost_program_options-mt -lmoses -o extract-mixed-syntax
|
||||||
|
|
||||||
|
|
65
phrase-extract/extract-mixed-syntax/NonTerm.cpp
Normal file
65
phrase-extract/extract-mixed-syntax/NonTerm.cpp
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
/*
|
||||||
|
* NonTerm.cpp
|
||||||
|
*
|
||||||
|
* Created on: 22 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include "NonTerm.h"
|
||||||
|
#include "Word.h"
|
||||||
|
#include "ConsistentPhrase.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Keeps a pointer to the consistent phrase and copies of the source and
// target labels. The phrase is not copied, so it must outlive this object.
NonTerm::NonTerm(const ConsistentPhrase &consistentPhrase,
                 const std::string &source,
                 const std::string &target)
  : m_consistentPhrase(&consistentPhrase)
  , m_source(source)
  , m_target(target)
{
}
|
||||||
|
|
||||||
|
// Empty: the destructor body releases nothing itself.
NonTerm::~NonTerm()
{
}
|
||||||
|
|
||||||
|
std::string NonTerm::Debug() const
|
||||||
|
{
|
||||||
|
stringstream out;
|
||||||
|
out << m_source << m_target;
|
||||||
|
out << m_consistentPhrase->Debug();
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void NonTerm::Output(std::ostream &out) const
|
||||||
|
{
|
||||||
|
out << m_source << m_target;
|
||||||
|
}
|
||||||
|
|
||||||
|
void NonTerm::Output(std::ostream &out, Moses::FactorDirection direction) const
|
||||||
|
{
|
||||||
|
out << GetLabel(direction);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the source label for Moses::Input, the target label for any
// other direction.
const std::string &NonTerm::GetLabel(Moses::FactorDirection direction) const
{
  if (direction == Moses::Input) {
    return m_source;
  }
  return m_target;
}
|
||||||
|
|
||||||
|
// True if the label on the given side equals the configured Hiero
// non-terminal label (params.hieroNonTerm).
bool NonTerm::IsHiero(Moses::FactorDirection direction, const Parameter &params) const
{
  const std::string &label = NonTerm::GetLabel(direction);
  return label == params.hieroNonTerm;
}
|
||||||
|
|
||||||
|
bool NonTerm::IsHiero(const Parameter ¶ms) const
|
||||||
|
{
|
||||||
|
return IsHiero(Moses::Input, params) && IsHiero(Moses::Output, params);
|
||||||
|
}
|
||||||
|
// Width of the underlying consistent phrase on the given side.
int NonTerm::GetWidth(Moses::FactorDirection direction) const
{
  const ConsistentPhrase &cp = GetConsistentPhrase();
  return cp.GetWidth(direction);
}
|
47
phrase-extract/extract-mixed-syntax/NonTerm.h
Normal file
47
phrase-extract/extract-mixed-syntax/NonTerm.h
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
/*
|
||||||
|
* NonTerm.h
|
||||||
|
*
|
||||||
|
* Created on: 22 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
#include <string>
|
||||||
|
#include "RuleSymbol.h"
|
||||||
|
#include "moses/TypeDef.h"
|
||||||
|
|
||||||
|
class ConsistentPhrase;
|
||||||
|
class Parameter;
|
||||||
|
|
||||||
|
class NonTerm : public RuleSymbol
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
|
||||||
|
NonTerm(const ConsistentPhrase &consistentPhrase,
|
||||||
|
const std::string &source,
|
||||||
|
const std::string &target);
|
||||||
|
virtual ~NonTerm();
|
||||||
|
|
||||||
|
const ConsistentPhrase &GetConsistentPhrase() const
|
||||||
|
{ return *m_consistentPhrase; }
|
||||||
|
|
||||||
|
int GetWidth(Moses::FactorDirection direction) const;
|
||||||
|
|
||||||
|
virtual bool IsNonTerm() const
|
||||||
|
{ return true; }
|
||||||
|
|
||||||
|
std::string GetString() const
|
||||||
|
{ return m_source + m_target; }
|
||||||
|
|
||||||
|
virtual std::string Debug() const;
|
||||||
|
virtual void Output(std::ostream &out) const;
|
||||||
|
void Output(std::ostream &out, Moses::FactorDirection direction) const;
|
||||||
|
|
||||||
|
const std::string &GetLabel(Moses::FactorDirection direction) const;
|
||||||
|
bool IsHiero(Moses::FactorDirection direction, const Parameter ¶ms) const;
|
||||||
|
bool IsHiero(const Parameter ¶ms) const;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
const ConsistentPhrase *m_consistentPhrase;
|
||||||
|
std::string m_source, m_target;
|
||||||
|
};
|
||||||
|
|
79
phrase-extract/extract-mixed-syntax/OutputFileStream.cpp
Normal file
79
phrase-extract/extract-mixed-syntax/OutputFileStream.cpp
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
|
||||||
|
|
||||||
|
/***********************************************************************
|
||||||
|
Moses - factored phrase-based language decoder
|
||||||
|
Copyright (C) 2006 University of Edinburgh
|
||||||
|
|
||||||
|
This library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with this library; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
***********************************************************************/
|
||||||
|
|
||||||
|
#include <boost/iostreams/filter/gzip.hpp>
|
||||||
|
#include "OutputFileStream.h"
|
||||||
|
#include "gzfilebuf.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
namespace Moses
|
||||||
|
{
|
||||||
|
// Creates a stream with no file attached; call Open() before writing.
OutputFileStream::OutputFileStream()
  : boost::iostreams::filtering_ostream()
  , m_outFile(NULL)
{}
|
||||||
|
|
||||||
|
// Creates the stream and opens filePath straight away.
// The result of Open() is not checked here.
OutputFileStream::OutputFileStream(const std::string &filePath)
  : m_outFile(NULL)
{
  this->Open(filePath);
}
|
||||||
|
|
||||||
|
// Closes the file (if any) on destruction.
OutputFileStream::~OutputFileStream()
{
  this->Close();
}
|
||||||
|
|
||||||
|
bool OutputFileStream::Open(const std::string &filePath)
|
||||||
|
{
|
||||||
|
m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
|
||||||
|
if (m_outFile->fail()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
|
||||||
|
this->push(boost::iostreams::gzip_compressor());
|
||||||
|
}
|
||||||
|
this->push(*m_outFile);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void OutputFileStream::Close()
|
||||||
|
{
|
||||||
|
if (m_outFile == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
this->flush();
|
||||||
|
this->pop(); // file
|
||||||
|
|
||||||
|
m_outFile->close();
|
||||||
|
delete m_outFile;
|
||||||
|
m_outFile = NULL;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
50
phrase-extract/extract-mixed-syntax/OutputFileStream.h
Normal file
50
phrase-extract/extract-mixed-syntax/OutputFileStream.h
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
|
||||||
|
|
||||||
|
/***********************************************************************
|
||||||
|
Moses - factored phrase-based language decoder
|
||||||
|
Copyright (C) 2006 University of Edinburgh
|
||||||
|
|
||||||
|
This library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with this library; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
***********************************************************************/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
#include <iostream>
|
||||||
|
#include <boost/iostreams/filtering_stream.hpp>
|
||||||
|
|
||||||
|
namespace Moses
|
||||||
|
{
|
||||||
|
|
||||||
|
/** Used in place of std::istream, can read zipped files if it ends in .gz
|
||||||
|
*/
|
||||||
|
class OutputFileStream : public boost::iostreams::filtering_ostream
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
std::ofstream *m_outFile;
|
||||||
|
public:
|
||||||
|
OutputFileStream();
|
||||||
|
|
||||||
|
OutputFileStream(const std::string &filePath);
|
||||||
|
virtual ~OutputFileStream();
|
||||||
|
|
||||||
|
bool Open(const std::string &filePath);
|
||||||
|
void Close();
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
69
phrase-extract/extract-mixed-syntax/Parameter.cpp
Normal file
69
phrase-extract/extract-mixed-syntax/Parameter.cpp
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
/*
|
||||||
|
* Parameter.cpp
|
||||||
|
*
|
||||||
|
* Created on: 17 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
#include "Parameter.h"
|
||||||
|
#include "moses/Util.h"
|
||||||
|
#include "util/exception.hh"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Sets the built-in default of every option. gluePath, scopeSpanStr and
// scopeSpan are left default-constructed (empty).
Parameter::Parameter()
  : maxSpan(10)
  , minSpan(0)
  , maxNonTerm(2)
  , maxHieroNonTerm(999)
  , maxSymbolsTarget(999)
  , maxSymbolsSource(5)
  , minHoleSource(2)
  , sentenceOffset(0)
  , nonTermConsecSource(false)
  , requireAlignedWord(true)
  , fractionalCounting(true)
  , gzOutput(false)

  , hieroNonTerm("[X]")
  , sourceSyntax(false)
  , targetSyntax(false)

  , mixedSyntaxType(0)
  , multiLabel(0)
  , nonTermConsecSourceMixed(true)
  , hieroSourceLHS(false)
  , maxSpanFreeNonTermSource(0)
  , nieceTerminal(true)
  , maxScope(UNDEFINED)
  , minScope(0)

    // properties
  , spanLength(false)
  , nonTermContext(false)
  , nonTermContextTarget(false)
  , nonTermContextFactor(0)

  , numSourceFactors(1)
  , numTargetFactors(1)
{
}
|
||||||
|
|
||||||
|
// Empty: the destructor body releases nothing itself.
Parameter::~Parameter()
{
}
|
||||||
|
|
||||||
|
void Parameter::SetScopeSpan(const std::string &str)
|
||||||
|
{
|
||||||
|
scopeSpanStr = str;
|
||||||
|
vector<string> toks1;
|
||||||
|
Moses::Tokenize(toks1, str, ":");
|
||||||
|
|
||||||
|
for (size_t i = 0; i < toks1.size(); ++i) {
|
||||||
|
const string &tok1 = toks1[i];
|
||||||
|
|
||||||
|
vector<int> toks2;
|
||||||
|
Moses::Tokenize<int>(toks2, tok1, ",");
|
||||||
|
UTIL_THROW_IF2(toks2.size() != 2, "Format is min,max:min,max... String is " << tok1);
|
||||||
|
|
||||||
|
std::pair<int,int> values(toks2[0], toks2[1]);
|
||||||
|
scopeSpan.push_back(values);
|
||||||
|
}
|
||||||
|
}
|
62
phrase-extract/extract-mixed-syntax/Parameter.h
Normal file
62
phrase-extract/extract-mixed-syntax/Parameter.h
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
/*
|
||||||
|
* Parameter.h
|
||||||
|
*
|
||||||
|
* Created on: 17 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <limits>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#define UNDEFINED std::numeric_limits<int>::max()
|
||||||
|
|
||||||
|
// Plain bag of extraction options. Every member is public; defaults are
// set by the constructor.
class Parameter
{
public:
  Parameter();
  virtual ~Parameter();

  // rule size limits
  int maxSpan;            // max (source) span of a rule
  int minSpan;            // min (source) span of a rule
  int maxNonTerm;         // max number of non-terms per rule
  int maxHieroNonTerm;    // max number of Hiero non-terms
  int maxSymbolsTarget;
  int maxSymbolsSource;   // max number of source symbols in a rule
  int minHoleSource;      // min source span for a non-term

  long sentenceOffset;    // starting sentence id

  bool nonTermConsecSource; // allow consecutive non-terms on the source side
  bool requireAlignedWord;
  bool fractionalCounting;
  bool gzOutput;            // compress extract files

  std::string hieroNonTerm; // Hiero non-terminal label, including bracket
  std::string gluePath;     // where to write the glue grammar; empty = don't

  // is the source / target sentence a parse tree?
  bool sourceSyntax;
  bool targetSyntax;

  int mixedSyntaxType;
  int multiLabel;
  bool nonTermConsecSourceMixed;
  bool hieroSourceLHS;
  int maxSpanFreeNonTermSource;
  bool nieceTerminal;
  int maxScope;
  int minScope;

  // properties
  bool spanLength;
  bool nonTermContext;
  bool nonTermContextTarget;
  int nonTermContextFactor;

  int numSourceFactors;
  int numTargetFactors;

  // scope spans: the raw option string and its parsed (min,max) pairs
  std::string scopeSpanStr;
  std::vector<std::pair<int,int> > scopeSpan;

  // parses "min,max:min,max..." into scopeSpan
  void SetScopeSpan(const std::string &str);

};
|
||||||
|
|
14
phrase-extract/extract-mixed-syntax/Phrase.cpp
Normal file
14
phrase-extract/extract-mixed-syntax/Phrase.cpp
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
#include <sstream>
|
||||||
|
#include "Phrase.h"
|
||||||
|
|
||||||
|
std::string Phrase::Debug() const
|
||||||
|
{
|
||||||
|
std::stringstream out;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size(); ++i) {
|
||||||
|
Word &word = *at(i);
|
||||||
|
out << word.Debug() << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
19
phrase-extract/extract-mixed-syntax/Phrase.h
Normal file
19
phrase-extract/extract-mixed-syntax/Phrase.h
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include "Word.h"
|
||||||
|
|
||||||
|
// a vector of terminals
|
||||||
|
class Phrase : public std::vector<Word*>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
Phrase()
|
||||||
|
{}
|
||||||
|
|
||||||
|
Phrase(size_t size)
|
||||||
|
:std::vector<Word*>(size)
|
||||||
|
{}
|
||||||
|
|
||||||
|
std::string Debug() const;
|
||||||
|
|
||||||
|
};
|
637
phrase-extract/extract-mixed-syntax/Rule.cpp
Normal file
637
phrase-extract/extract-mixed-syntax/Rule.cpp
Normal file
@ -0,0 +1,637 @@
|
|||||||
|
/*
|
||||||
|
* Rule.cpp
|
||||||
|
*
|
||||||
|
* Created on: 20 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include <algorithm>
|
||||||
|
#include "Rule.h"
|
||||||
|
#include "AlignedSentence.h"
|
||||||
|
#include "ConsistentPhrase.h"
|
||||||
|
#include "NonTerm.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Creates a rule with no non-terminals: the source side is every word
// covered by the LHS span.
// NOTE(review): m_count is written by Output() but is not initialised in
// this constructor - confirm it is always set before a rule is output.
Rule::Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence)
  : m_lhs(lhsNonTerm)
  , m_alignedSentence(alignedSentence)
  , m_isValid(true)
  , m_canRecurse(true)
{
  this->CreateSource();
}
|
||||||
|
|
||||||
|
// Creates a rule from an existing one plus one more non-terminal: same LHS
// and sentence, the non-terminals of 'copy' followed by nonTerm. The
// source side is rebuilt from scratch; the validity flags start as true.
Rule::Rule(const Rule &copy, const NonTerm &nonTerm)
  :m_lhs(copy.m_lhs)
  ,m_alignedSentence(copy.m_alignedSentence)
  ,m_isValid(true)
  ,m_canRecurse(true)
  ,m_nonterms(copy.m_nonterms)
{
  m_nonterms.push_back(&nonTerm);
  CreateSource();
}
|
||||||
|
|
||||||
|
// Empty: the destructor body releases nothing itself.
Rule::~Rule()
{
}
|
||||||
|
|
||||||
|
// The consistent phrase of the rule's LHS non-terminal.
const ConsistentPhrase &Rule::GetConsistentPhrase() const
{
  return m_lhs.GetConsistentPhrase();
}
|
||||||
|
|
||||||
|
void Rule::CreateSource()
|
||||||
|
{
|
||||||
|
const NonTerm *cp = NULL;
|
||||||
|
size_t nonTermInd = 0;
|
||||||
|
if (nonTermInd < m_nonterms.size()) {
|
||||||
|
cp = m_nonterms[nonTermInd];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int sourcePos = m_lhs.GetConsistentPhrase().corners[0];
|
||||||
|
sourcePos <= m_lhs.GetConsistentPhrase().corners[1];
|
||||||
|
++sourcePos) {
|
||||||
|
|
||||||
|
const RuleSymbol *ruleSymbol;
|
||||||
|
if (cp && cp->GetConsistentPhrase().corners[0] <= sourcePos && sourcePos <= cp->GetConsistentPhrase().corners[1]) {
|
||||||
|
// replace words with non-term
|
||||||
|
ruleSymbol = cp;
|
||||||
|
sourcePos = cp->GetConsistentPhrase().corners[1];
|
||||||
|
if (m_nonterms.size()) {
|
||||||
|
cp = m_nonterms[nonTermInd];
|
||||||
|
}
|
||||||
|
|
||||||
|
// move to next non-term
|
||||||
|
++nonTermInd;
|
||||||
|
cp = (nonTermInd < m_nonterms.size()) ? m_nonterms[nonTermInd] : NULL;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// terminal
|
||||||
|
ruleSymbol = m_alignedSentence.GetPhrase(Moses::Input)[sourcePos];
|
||||||
|
}
|
||||||
|
|
||||||
|
m_source.Add(ruleSymbol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int Rule::GetNextSourcePosForNonTerm() const
|
||||||
|
{
|
||||||
|
if (m_nonterms.empty()) {
|
||||||
|
// no non-terms so far. Can start next non-term on left corner
|
||||||
|
return m_lhs.GetConsistentPhrase().corners[0];
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// next non-term can start just left of previous
|
||||||
|
const ConsistentPhrase &cp = m_nonterms.back()->GetConsistentPhrase();
|
||||||
|
int nextPos = cp.corners[1] + 1;
|
||||||
|
return nextPos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Rule::Debug() const
|
||||||
|
{
|
||||||
|
stringstream out;
|
||||||
|
|
||||||
|
// source
|
||||||
|
for (size_t i = 0; i < m_source.GetSize(); ++i) {
|
||||||
|
const RuleSymbol &symbol = *m_source[i];
|
||||||
|
out << symbol.Debug() << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
// target
|
||||||
|
out << "||| ";
|
||||||
|
for (size_t i = 0; i < m_target.GetSize(); ++i) {
|
||||||
|
const RuleSymbol &symbol = *m_target[i];
|
||||||
|
out << symbol.Debug() << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
out << "||| ";
|
||||||
|
Alignments::const_iterator iterAlign;
|
||||||
|
for (iterAlign = m_alignments.begin(); iterAlign != m_alignments.end(); ++iterAlign) {
|
||||||
|
const std::pair<int,int> &alignPair = *iterAlign;
|
||||||
|
out << alignPair.first << "-" << alignPair.second << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
// overall range
|
||||||
|
out << "||| LHS=" << m_lhs.Debug();
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rule::Output(std::ostream &out, bool forward, const Parameter ¶ms) const
|
||||||
|
{
|
||||||
|
if (forward) {
|
||||||
|
// source
|
||||||
|
m_source.Output(out);
|
||||||
|
m_lhs.Output(out, Moses::Input);
|
||||||
|
|
||||||
|
out << " ||| ";
|
||||||
|
|
||||||
|
// target
|
||||||
|
m_target.Output(out);
|
||||||
|
m_lhs.Output(out, Moses::Output);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// target
|
||||||
|
m_target.Output(out);
|
||||||
|
m_lhs.Output(out, Moses::Output);
|
||||||
|
|
||||||
|
out << " ||| ";
|
||||||
|
|
||||||
|
// source
|
||||||
|
m_source.Output(out);
|
||||||
|
m_lhs.Output(out, Moses::Input);
|
||||||
|
}
|
||||||
|
|
||||||
|
out << " ||| ";
|
||||||
|
|
||||||
|
// alignment
|
||||||
|
Alignments::const_iterator iterAlign;
|
||||||
|
for (iterAlign = m_alignments.begin(); iterAlign != m_alignments.end(); ++iterAlign) {
|
||||||
|
const std::pair<int,int> &alignPair = *iterAlign;
|
||||||
|
|
||||||
|
if (forward) {
|
||||||
|
out << alignPair.first << "-" << alignPair.second << " ";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
out << alignPair.second << "-" << alignPair.first << " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out << "||| ";
|
||||||
|
|
||||||
|
// count
|
||||||
|
out << m_count;
|
||||||
|
|
||||||
|
out << " ||| ";
|
||||||
|
|
||||||
|
// properties
|
||||||
|
|
||||||
|
// span length
|
||||||
|
if (forward && params.spanLength && m_nonterms.size()) {
|
||||||
|
out << "{{SpanLength ";
|
||||||
|
|
||||||
|
for (size_t i = 0; i < m_nonterms.size(); ++i) {
|
||||||
|
const NonTerm &nonTerm = *m_nonterms[i];
|
||||||
|
const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
|
||||||
|
out << i << "," << cp.GetWidth(Moses::Input) << "," << cp.GetWidth(Moses::Output) << " ";
|
||||||
|
}
|
||||||
|
out << "}} ";
|
||||||
|
}
|
||||||
|
|
||||||
|
// non-term context (source)
|
||||||
|
if (forward && params.nonTermContext && m_nonterms.size()) {
|
||||||
|
out << "{{NonTermContext ";
|
||||||
|
|
||||||
|
int factor = params.nonTermContextFactor;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < m_nonterms.size(); ++i) {
|
||||||
|
const NonTerm &nonTerm = *m_nonterms[i];
|
||||||
|
const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
|
||||||
|
NonTermContext(1, factor, i, cp, out);
|
||||||
|
}
|
||||||
|
out << "}} ";
|
||||||
|
}
|
||||||
|
|
||||||
|
// non-term context (target)
|
||||||
|
if (forward && params.nonTermContextTarget && m_nonterms.size()) {
|
||||||
|
out << "{{NonTermContextTarget ";
|
||||||
|
|
||||||
|
int factor = params.nonTermContextFactor;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < m_nonterms.size(); ++i) {
|
||||||
|
const NonTerm &nonTerm = *m_nonterms[i];
|
||||||
|
const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
|
||||||
|
NonTermContext(2, factor, i, cp, out);
|
||||||
|
}
|
||||||
|
out << "}} ";
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rule::NonTermContextFactor(int factor, const Word &word, std::ostream &out) const
|
||||||
|
{
|
||||||
|
out << word.GetString(factor) << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rule::NonTermContext(int sourceTarget, int factor, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const
|
||||||
|
{
|
||||||
|
int startPos, endPos;
|
||||||
|
const Phrase *phrase;
|
||||||
|
|
||||||
|
if (sourceTarget == 1) {
|
||||||
|
startPos = cp.corners[0];
|
||||||
|
endPos = cp.corners[1];
|
||||||
|
phrase = &m_alignedSentence.GetPhrase(Moses::Input);
|
||||||
|
}
|
||||||
|
else if (sourceTarget == 2) {
|
||||||
|
startPos = cp.corners[2];
|
||||||
|
endPos = cp.corners[3];
|
||||||
|
phrase = &m_alignedSentence.GetPhrase(Moses::Output);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
out << ntInd << " ";
|
||||||
|
|
||||||
|
// left outside
|
||||||
|
if (startPos == 0) {
|
||||||
|
out << "<s> ";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
NonTermContextFactor(factor, *phrase->at(startPos - 1), out);
|
||||||
|
}
|
||||||
|
|
||||||
|
// left inside
|
||||||
|
NonTermContextFactor(factor, *phrase->at(startPos), out);
|
||||||
|
|
||||||
|
// right inside
|
||||||
|
NonTermContextFactor(factor, *phrase->at(endPos), out);
|
||||||
|
|
||||||
|
// right outside
|
||||||
|
if (endPos == phrase->size() - 1) {
|
||||||
|
out << "</s> ";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
NonTermContextFactor(factor, *phrase->at(endPos + 1), out);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rule::Prevalidate(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
const ConsistentPhrase &cp = m_lhs.GetConsistentPhrase();
|
||||||
|
|
||||||
|
// check number of source symbols in rule
|
||||||
|
if (m_source.GetSize() > params.maxSymbolsSource) {
|
||||||
|
m_isValid = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// check that last non-term added isn't too small
|
||||||
|
if (m_nonterms.size()) {
|
||||||
|
const NonTerm &lastNonTerm = *m_nonterms.back();
|
||||||
|
const ConsistentPhrase &cp = lastNonTerm.GetConsistentPhrase();
|
||||||
|
|
||||||
|
int sourceWidth = cp.corners[1] - cp.corners[0] + 1;
|
||||||
|
if (sourceWidth < params.minHoleSource) {
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check number of non-terms
|
||||||
|
int numNonTerms = 0;
|
||||||
|
int numHieroNonTerms = 0;
|
||||||
|
for (size_t i = 0; i < m_source.GetSize(); ++i) {
|
||||||
|
const RuleSymbol *arc = m_source[i];
|
||||||
|
if (arc->IsNonTerm()) {
|
||||||
|
++numNonTerms;
|
||||||
|
const NonTerm &nonTerm = *static_cast<const NonTerm*>(arc);
|
||||||
|
bool isHiero = nonTerm.IsHiero(params);
|
||||||
|
if (isHiero) {
|
||||||
|
++numHieroNonTerms;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numNonTerms >= params.maxNonTerm) {
|
||||||
|
m_canRecurse = false;
|
||||||
|
if (numNonTerms > params.maxNonTerm) {
|
||||||
|
m_isValid = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numHieroNonTerms >= params.maxHieroNonTerm) {
|
||||||
|
m_canRecurse = false;
|
||||||
|
if (numHieroNonTerms > params.maxHieroNonTerm) {
|
||||||
|
m_isValid = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if 2 consecutive non-terms in source
|
||||||
|
if (!params.nonTermConsecSource && m_nonterms.size() >= 2) {
|
||||||
|
const NonTerm &lastNonTerm = *m_nonterms.back();
|
||||||
|
const NonTerm &secondLastNonTerm = *m_nonterms[m_nonterms.size() - 2];
|
||||||
|
if (secondLastNonTerm.GetConsistentPhrase().corners[1] + 1 ==
|
||||||
|
lastNonTerm.GetConsistentPhrase().corners[0]) {
|
||||||
|
if (params.mixedSyntaxType == 0) {
|
||||||
|
// ordinary hiero or syntax model
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Hieu's mixed syntax
|
||||||
|
if (lastNonTerm.IsHiero(Moses::Input, params)
|
||||||
|
&& secondLastNonTerm.IsHiero(Moses::Input, params)) {
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//check to see if it overlaps with any other non-terms
|
||||||
|
if (m_nonterms.size() >= 2) {
|
||||||
|
const NonTerm &lastNonTerm = *m_nonterms.back();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < m_nonterms.size() - 1; ++i) {
|
||||||
|
const NonTerm &otherNonTerm = *m_nonterms[i];
|
||||||
|
bool overlap = lastNonTerm.GetConsistentPhrase().TargetOverlap(otherNonTerm.GetConsistentPhrase());
|
||||||
|
|
||||||
|
if (overlap) {
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check that at least 1 word is aligned
|
||||||
|
if (params.requireAlignedWord) {
|
||||||
|
bool ok = false;
|
||||||
|
for (size_t i = 0; i < m_source.GetSize(); ++i) {
|
||||||
|
const RuleSymbol &symbol = *m_source[i];
|
||||||
|
if (!symbol.IsNonTerm()) {
|
||||||
|
const Word &word = static_cast<const Word&>(symbol);
|
||||||
|
if (word.GetAlignment().size()) {
|
||||||
|
ok = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ok) {
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.maxSpanFreeNonTermSource) {
|
||||||
|
const NonTerm *front = dynamic_cast<const NonTerm*>(m_source[0]);
|
||||||
|
if (front) {
|
||||||
|
int width = front->GetWidth(Moses::Input);
|
||||||
|
if (width > params.maxSpanFreeNonTermSource) {
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const NonTerm *back = dynamic_cast<const NonTerm*>(m_source.Back());
|
||||||
|
if (back) {
|
||||||
|
int width = back->GetWidth(Moses::Input);
|
||||||
|
if (width > params.maxSpanFreeNonTermSource) {
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!params.nieceTerminal) {
|
||||||
|
// collect terminal in a rule
|
||||||
|
std::set<const Word*> terms;
|
||||||
|
for (size_t i = 0; i < m_source.GetSize(); ++i) {
|
||||||
|
const Word *word = dynamic_cast<const Word*>(m_source[i]);
|
||||||
|
if (word) {
|
||||||
|
terms.insert(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// look in non-terms
|
||||||
|
for (size_t i = 0; i < m_source.GetSize(); ++i) {
|
||||||
|
const NonTerm *nonTerm = dynamic_cast<const NonTerm*>(m_source[i]);
|
||||||
|
if (nonTerm) {
|
||||||
|
const ConsistentPhrase &cp = nonTerm->GetConsistentPhrase();
|
||||||
|
bool containTerm = ContainTerm(cp, terms);
|
||||||
|
|
||||||
|
if (containTerm) {
|
||||||
|
//cerr << "ruleSource=" << *ruleSource << " ";
|
||||||
|
//cerr << "ntRange=" << ntRange << endl;
|
||||||
|
|
||||||
|
// non-term contains 1 of the terms in the rule.
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.maxScope != UNDEFINED || params.minScope > 0) {
|
||||||
|
int scope = GetScope(params);
|
||||||
|
if (scope > params.maxScope) {
|
||||||
|
// scope of subsequent rules will be the same or increase
|
||||||
|
// therefore can NOT recurse
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (scope < params.minScope) {
|
||||||
|
// scope of subsequent rules may increase
|
||||||
|
// therefore can recurse
|
||||||
|
m_isValid = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// min/max span per scope
|
||||||
|
if (params.scopeSpan.size()) {
|
||||||
|
int scope = GetScope(params);
|
||||||
|
if (scope >= params.scopeSpan.size()) {
|
||||||
|
// no constraint on it. It's ok
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
const std::pair<int,int> &constraint = params.scopeSpan[scope];
|
||||||
|
int sourceWidth = m_lhs.GetWidth(Moses::Input);
|
||||||
|
if (sourceWidth < constraint.first || sourceWidth > constraint.second) {
|
||||||
|
m_isValid = false;
|
||||||
|
m_canRecurse = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int Rule::GetScope(const Parameter ¶ms) const
|
||||||
|
{
|
||||||
|
size_t scope = 0;
|
||||||
|
bool previousIsAmbiguous = false;
|
||||||
|
|
||||||
|
if (m_source[0]->IsNonTerm()) {
|
||||||
|
scope++;
|
||||||
|
previousIsAmbiguous = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 1; i < m_source.GetSize(); ++i) {
|
||||||
|
const RuleSymbol *symbol = m_source[i];
|
||||||
|
bool isAmbiguous = symbol->IsNonTerm();
|
||||||
|
if (isAmbiguous) {
|
||||||
|
// mixed syntax
|
||||||
|
const NonTerm *nt = static_cast<const NonTerm*>(symbol);
|
||||||
|
isAmbiguous = nt->IsHiero(Moses::Input, params);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isAmbiguous && previousIsAmbiguous) {
|
||||||
|
scope++;
|
||||||
|
}
|
||||||
|
previousIsAmbiguous = isAmbiguous;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (previousIsAmbiguous) {
|
||||||
|
scope++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return scope;
|
||||||
|
|
||||||
|
/*
|
||||||
|
int scope = 0;
|
||||||
|
if (m_source.GetSize() > 1) {
|
||||||
|
const RuleSymbol &front = *m_source.Front();
|
||||||
|
if (front.IsNonTerm()) {
|
||||||
|
++scope;
|
||||||
|
}
|
||||||
|
|
||||||
|
const RuleSymbol &back = *m_source.Back();
|
||||||
|
if (back.IsNonTerm()) {
|
||||||
|
++scope;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return scope;
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true if any element of 'coll' has the same string as 'sought',
// i.e. sought->CompareString(*element) == 0. Pointer identity is not used.
//
// T must provide: int CompareString(const T &other) const.
template<typename T>
bool Contains(const T *sought, const std::set<const T*> &coll)
{
  // 'typename' is required: the iterator type depends on T
  typename std::set<const T*>::const_iterator iter;
  for (iter = coll.begin(); iter != coll.end(); ++iter) {
    const T *candidate = *iter;
    if (sought->CompareString(*candidate) == 0) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
bool Rule::ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const
|
||||||
|
{
|
||||||
|
const Phrase &sourceSentence = m_alignedSentence.GetPhrase(Moses::Input);
|
||||||
|
|
||||||
|
for (int pos = cp.corners[0]; pos <= cp.corners[1]; ++pos) {
|
||||||
|
const Word *soughtWord = sourceSentence[pos];
|
||||||
|
|
||||||
|
// find same word in set
|
||||||
|
if (Contains(soughtWord, terms)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool CompareTargetNonTerms(const NonTerm *a, const NonTerm *b)
|
||||||
|
{
|
||||||
|
// compare just start target pos
|
||||||
|
return a->GetConsistentPhrase().corners[2] < b->GetConsistentPhrase().corners[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rule::CreateTarget(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
if (!m_isValid) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<const NonTerm*> targetNonTerm(m_nonterms);
|
||||||
|
std::sort(targetNonTerm.begin(), targetNonTerm.end(), CompareTargetNonTerms);
|
||||||
|
|
||||||
|
const NonTerm *cp = NULL;
|
||||||
|
size_t nonTermInd = 0;
|
||||||
|
if (nonTermInd < targetNonTerm.size()) {
|
||||||
|
cp = targetNonTerm[nonTermInd];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int targetPos = m_lhs.GetConsistentPhrase().corners[2];
|
||||||
|
targetPos <= m_lhs.GetConsistentPhrase().corners[3];
|
||||||
|
++targetPos) {
|
||||||
|
|
||||||
|
const RuleSymbol *ruleSymbol;
|
||||||
|
if (cp && cp->GetConsistentPhrase().corners[2] <= targetPos && targetPos <= cp->GetConsistentPhrase().corners[3]) {
|
||||||
|
// replace words with non-term
|
||||||
|
ruleSymbol = cp;
|
||||||
|
targetPos = cp->GetConsistentPhrase().corners[3];
|
||||||
|
if (targetNonTerm.size()) {
|
||||||
|
cp = targetNonTerm[nonTermInd];
|
||||||
|
}
|
||||||
|
|
||||||
|
// move to next non-term
|
||||||
|
++nonTermInd;
|
||||||
|
cp = (nonTermInd < targetNonTerm.size()) ? targetNonTerm[nonTermInd] : NULL;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// terminal
|
||||||
|
ruleSymbol = m_alignedSentence.GetPhrase(Moses::Output)[targetPos];
|
||||||
|
}
|
||||||
|
|
||||||
|
m_target.Add(ruleSymbol);
|
||||||
|
}
|
||||||
|
|
||||||
|
CreateAlignments();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void Rule::CreateAlignments()
|
||||||
|
{
|
||||||
|
int sourceStart = GetConsistentPhrase().corners[0];
|
||||||
|
int targetStart = GetConsistentPhrase().corners[2];
|
||||||
|
|
||||||
|
for (size_t sourcePos = 0; sourcePos < m_source.GetSize(); ++sourcePos) {
|
||||||
|
const RuleSymbol *symbol = m_source[sourcePos];
|
||||||
|
if (!symbol->IsNonTerm()) {
|
||||||
|
// terminals
|
||||||
|
const Word &sourceWord = static_cast<const Word&>(*symbol);
|
||||||
|
const std::set<const Word *> &targetWords = sourceWord.GetAlignment();
|
||||||
|
CreateAlignments(sourcePos, targetWords);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// non-terms. same object in both source & target
|
||||||
|
CreateAlignments(sourcePos, symbol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rule::CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords)
|
||||||
|
{
|
||||||
|
std::set<const Word *>::const_iterator iterTarget;
|
||||||
|
for (iterTarget = targetWords.begin(); iterTarget != targetWords.end(); ++iterTarget) {
|
||||||
|
const Word *targetWord = *iterTarget;
|
||||||
|
CreateAlignments(sourcePos, targetWord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rule::CreateAlignments(int sourcePos, const RuleSymbol *targetSought)
|
||||||
|
{
|
||||||
|
// should be in target phrase
|
||||||
|
for (size_t targetPos = 0; targetPos < m_target.GetSize(); ++targetPos) {
|
||||||
|
const RuleSymbol *foundSymbol = m_target[targetPos];
|
||||||
|
if (targetSought == foundSymbol) {
|
||||||
|
pair<int, int> alignPoint(sourcePos, targetPos);
|
||||||
|
m_alignments.insert(alignPoint);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
throw "not found";
|
||||||
|
}
|
||||||
|
|
90
phrase-extract/extract-mixed-syntax/Rule.h
Normal file
90
phrase-extract/extract-mixed-syntax/Rule.h
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
/*
|
||||||
|
* Rule.h
|
||||||
|
*
|
||||||
|
* Created on: 20 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
#include <vector>
|
||||||
|
#include "Phrase.h"
|
||||||
|
#include "RulePhrase.h"
|
||||||
|
#include "moses/TypeDef.h"
|
||||||
|
|
||||||
|
class ConsistentPhrase;
|
||||||
|
class AlignedSentence;
|
||||||
|
class NonTerm;
|
||||||
|
class Parameter;
|
||||||
|
|
||||||
|
|
||||||
|
class Rule {
|
||||||
|
public:
|
||||||
|
typedef std::set<std::pair<int,int> > Alignments;
|
||||||
|
|
||||||
|
Rule(const Rule ©); // do not implement
|
||||||
|
|
||||||
|
// original rule with no non-term
|
||||||
|
Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence);
|
||||||
|
|
||||||
|
// extend a rule, adding 1 new non-term
|
||||||
|
Rule(const Rule ©, const NonTerm &nonTerm);
|
||||||
|
|
||||||
|
virtual ~Rule();
|
||||||
|
|
||||||
|
bool IsValid() const
|
||||||
|
{ return m_isValid; }
|
||||||
|
|
||||||
|
bool CanRecurse() const
|
||||||
|
{ return m_canRecurse; }
|
||||||
|
|
||||||
|
const NonTerm &GetLHS() const
|
||||||
|
{ return m_lhs; }
|
||||||
|
|
||||||
|
const ConsistentPhrase &GetConsistentPhrase() const;
|
||||||
|
|
||||||
|
int GetNextSourcePosForNonTerm() const;
|
||||||
|
|
||||||
|
void SetCount(float count)
|
||||||
|
{ m_count = count; }
|
||||||
|
float GetCount() const
|
||||||
|
{ return m_count; }
|
||||||
|
|
||||||
|
const Alignments &GetAlignments() const
|
||||||
|
{ return m_alignments; }
|
||||||
|
|
||||||
|
std::string Debug() const;
|
||||||
|
void Output(std::ostream &out, bool forward, const Parameter ¶ms) const;
|
||||||
|
|
||||||
|
void Prevalidate(const Parameter ¶ms);
|
||||||
|
void CreateTarget(const Parameter ¶ms);
|
||||||
|
|
||||||
|
const RulePhrase &GetPhrase(Moses::FactorDirection direction) const
|
||||||
|
{ return (direction == Moses::Input) ? m_source : m_target; }
|
||||||
|
|
||||||
|
protected:
|
||||||
|
const NonTerm &m_lhs;
|
||||||
|
const AlignedSentence &m_alignedSentence;
|
||||||
|
RulePhrase m_source, m_target;
|
||||||
|
float m_count;
|
||||||
|
|
||||||
|
Alignments m_alignments;
|
||||||
|
|
||||||
|
// in source order
|
||||||
|
std::vector<const NonTerm*> m_nonterms;
|
||||||
|
|
||||||
|
bool m_isValid, m_canRecurse;
|
||||||
|
|
||||||
|
void CreateSource();
|
||||||
|
void CreateAlignments();
|
||||||
|
void CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords);
|
||||||
|
void CreateAlignments(int sourcePos, const RuleSymbol *targetSought);
|
||||||
|
|
||||||
|
bool ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const;
|
||||||
|
int GetScope(const Parameter ¶ms) const;
|
||||||
|
|
||||||
|
void NonTermContext(int sourceTarget, int factors, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const;
|
||||||
|
// sourceTarget: 1 = source, 2 = target
|
||||||
|
|
||||||
|
void NonTermContextFactor(int factor, const Word &word, std::ostream &out) const;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
50
phrase-extract/extract-mixed-syntax/RulePhrase.cpp
Normal file
50
phrase-extract/extract-mixed-syntax/RulePhrase.cpp
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
/*
|
||||||
|
* RulePhrase.cpp
|
||||||
|
*
|
||||||
|
* Created on: 26 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include "RulePhrase.h"
|
||||||
|
#include "RuleSymbol.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
extern bool g_debug;
|
||||||
|
|
||||||
|
int RulePhrase::Compare(const RulePhrase &other) const
|
||||||
|
{
|
||||||
|
if (GetSize() != other.GetSize()) {
|
||||||
|
return GetSize() < other.GetSize() ? -1 : +1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < m_coll.size(); ++i) {
|
||||||
|
const RuleSymbol &symbol = *m_coll[i];
|
||||||
|
const RuleSymbol &otherSymbol = *other.m_coll[i];
|
||||||
|
int compare = symbol.Compare(otherSymbol);
|
||||||
|
|
||||||
|
if (compare) {
|
||||||
|
return compare;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void RulePhrase::Output(std::ostream &out) const
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < m_coll.size(); ++i) {
|
||||||
|
const RuleSymbol &symbol = *m_coll[i];
|
||||||
|
symbol.Output(out);
|
||||||
|
out << " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string RulePhrase::Debug() const
|
||||||
|
{
|
||||||
|
std::stringstream out;
|
||||||
|
Output(out);
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
49
phrase-extract/extract-mixed-syntax/RulePhrase.h
Normal file
49
phrase-extract/extract-mixed-syntax/RulePhrase.h
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
/*
|
||||||
|
* RulePhrase.h
|
||||||
|
*
|
||||||
|
* Created on: 26 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef RULEPHRASE_H_
|
||||||
|
#define RULEPHRASE_H_
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <cstddef>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
class RuleSymbol;
|
||||||
|
|
||||||
|
// a phrase of terms and non-terms for 1 side of a rule
|
||||||
|
class RulePhrase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef std::vector<const RuleSymbol*> Coll;
|
||||||
|
Coll m_coll;
|
||||||
|
|
||||||
|
size_t GetSize() const
|
||||||
|
{ return m_coll.size(); }
|
||||||
|
|
||||||
|
void Add(const RuleSymbol *symbol)
|
||||||
|
{
|
||||||
|
m_coll.push_back(symbol);
|
||||||
|
}
|
||||||
|
|
||||||
|
const RuleSymbol* operator[](size_t index) const {
|
||||||
|
return m_coll[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
const RuleSymbol* Front() const {
|
||||||
|
return m_coll.front();
|
||||||
|
}
|
||||||
|
const RuleSymbol* Back() const {
|
||||||
|
return m_coll.back();
|
||||||
|
}
|
||||||
|
|
||||||
|
int Compare(const RulePhrase &other) const;
|
||||||
|
|
||||||
|
void Output(std::ostream &out) const;
|
||||||
|
std::string Debug() const;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* RULEPHRASE_H_ */
|
36
phrase-extract/extract-mixed-syntax/RuleSymbol.cpp
Normal file
36
phrase-extract/extract-mixed-syntax/RuleSymbol.cpp
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
/*
|
||||||
|
* RuleSymbol.cpp
|
||||||
|
*
|
||||||
|
* Created on: 21 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "RuleSymbol.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// The base class holds no state, so there is nothing to initialise.
RuleSymbol::RuleSymbol()
{
}
|
||||||
|
|
||||||
|
// Nothing to release in the base class.
RuleSymbol::~RuleSymbol()
{
}
|
||||||
|
|
||||||
|
int RuleSymbol::Compare(const RuleSymbol &other) const
|
||||||
|
{
|
||||||
|
if (IsNonTerm() != other.IsNonTerm()) {
|
||||||
|
return IsNonTerm() ? -1 : +1;
|
||||||
|
}
|
||||||
|
|
||||||
|
string str = GetString();
|
||||||
|
string otherStr = other.GetString();
|
||||||
|
|
||||||
|
if (str == otherStr) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return (str < otherStr) ? -1 : +1;
|
||||||
|
}
|
||||||
|
}
|
31
phrase-extract/extract-mixed-syntax/RuleSymbol.h
Normal file
31
phrase-extract/extract-mixed-syntax/RuleSymbol.h
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
/*
|
||||||
|
* RuleSymbol.h
|
||||||
|
*
|
||||||
|
* Created on: 21 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef RULESYMBOL_H_
|
||||||
|
#define RULESYMBOL_H_
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
// base class - terminal or non-term
|
||||||
|
// Abstract base class for anything that can appear in a rule phrase:
// a terminal or a non-terminal.
class RuleSymbol
{
public:
  RuleSymbol();
  virtual ~RuleSymbol();

  // true for non-terminals, false for terminals
  virtual bool IsNonTerm() const = 0;

  virtual std::string Debug() const = 0;
  virtual void Output(std::ostream &out) const = 0;

  // string used by Compare() to order symbols of the same kind
  virtual std::string GetString() const = 0;

  // three-way comparison: kind first, then GetString()
  int Compare(const RuleSymbol &other) const;
};
|
||||||
|
|
||||||
|
#endif /* RULESYMBOL_H_ */
|
227
phrase-extract/extract-mixed-syntax/Rules.cpp
Normal file
227
phrase-extract/extract-mixed-syntax/Rules.cpp
Normal file
@ -0,0 +1,227 @@
|
|||||||
|
/*
|
||||||
|
* Rules.cpp
|
||||||
|
*
|
||||||
|
* Created on: 20 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include "Rules.h"
|
||||||
|
#include "ConsistentPhrase.h"
|
||||||
|
#include "ConsistentPhrases.h"
|
||||||
|
#include "AlignedSentence.h"
|
||||||
|
#include "Rule.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
#include "moses/Util.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
extern bool g_debug;
|
||||||
|
|
||||||
|
// Binds the rule collection to the sentence pair it will extract from.
// Only a reference is stored.
Rules::Rules(const AlignedSentence &alignedSentence)
  : m_alignedSentence(alignedSentence)
{}
|
||||||
|
|
||||||
|
// Releases the rules held in m_keepRules. m_mergeRules is not passed to
// RemoveAllInColl here.
Rules::~Rules()
{
  Moses::RemoveAllInColl(m_keepRules);
}
|
||||||
|
|
||||||
|
void Rules::CreateRules(const ConsistentPhrase &cp,
|
||||||
|
const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
if (params.hieroSourceLHS) {
|
||||||
|
const NonTerm &nonTerm = cp.GetHieroNonTerm();
|
||||||
|
CreateRule(nonTerm, params);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
const ConsistentPhrase::NonTerms &nonTerms = cp.GetNonTerms();
|
||||||
|
for (size_t i = 0; i < nonTerms.size(); ++i) {
|
||||||
|
const NonTerm &nonTerm = nonTerms[i];
|
||||||
|
CreateRule(nonTerm, params);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rules::CreateRule(const NonTerm &nonTerm,
|
||||||
|
const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
Rule *rule = new Rule(nonTerm, m_alignedSentence);
|
||||||
|
|
||||||
|
rule->Prevalidate(params);
|
||||||
|
rule->CreateTarget(params);
|
||||||
|
|
||||||
|
|
||||||
|
if (rule->CanRecurse()) {
|
||||||
|
Extend(*rule, params);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rule->IsValid()) {
|
||||||
|
m_keepRules.insert(rule);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
delete rule;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rules::Extend(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
|
||||||
|
|
||||||
|
size_t size = m_alignedSentence.GetPhrase(Moses::Input).size();
|
||||||
|
for (size_t sourceStart = 0; sourceStart < size; ++sourceStart) {
|
||||||
|
for (size_t sourceEnd = sourceStart; sourceEnd < size; ++sourceEnd) {
|
||||||
|
const ConsistentPhrases::Coll &cps = allCPS.GetColl(sourceStart, sourceEnd);
|
||||||
|
|
||||||
|
ConsistentPhrases::Coll::const_iterator iter;
|
||||||
|
for (iter = cps.begin(); iter != cps.end(); ++iter) {
|
||||||
|
const ConsistentPhrase &cp = **iter;
|
||||||
|
CreateRules(cp, params);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rules::Extend(const Rule &rule, const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
const ConsistentPhrases &allCPS = m_alignedSentence.GetConsistentPhrases();
|
||||||
|
int sourceMin = rule.GetNextSourcePosForNonTerm();
|
||||||
|
|
||||||
|
int ruleStart = rule.GetConsistentPhrase().corners[0];
|
||||||
|
int ruleEnd = rule.GetConsistentPhrase().corners[1];
|
||||||
|
|
||||||
|
for (int sourceStart = sourceMin; sourceStart <= ruleEnd; ++sourceStart) {
|
||||||
|
for (int sourceEnd = sourceStart; sourceEnd <= ruleEnd; ++sourceEnd) {
|
||||||
|
if (sourceStart == ruleStart && sourceEnd == ruleEnd) {
|
||||||
|
// don't cover whole rule with 1 non-term
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const ConsistentPhrases::Coll &cps = allCPS.GetColl(sourceStart, sourceEnd);
|
||||||
|
Extend(rule, cps, params);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tries to extend 'rule' with each consistent phrase in 'cps' in turn.
void Rules::Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter &params)
{
  ConsistentPhrases::Coll::const_iterator iter;
  for (iter = cps.begin(); iter != cps.end(); ++iter) {
    Extend(rule, **iter, params);
  }
}
|
||||||
|
|
||||||
|
void Rules::Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
const ConsistentPhrase::NonTerms &nonTerms = cp.GetNonTerms();
|
||||||
|
for (size_t i = 0; i < nonTerms.size(); ++i) {
|
||||||
|
const NonTerm &nonTerm = nonTerms[i];
|
||||||
|
|
||||||
|
Rule *newRule = new Rule(rule, nonTerm);
|
||||||
|
newRule->Prevalidate(params);
|
||||||
|
newRule->CreateTarget(params);
|
||||||
|
|
||||||
|
if (newRule->CanRecurse()) {
|
||||||
|
// recursively extend
|
||||||
|
Extend(*newRule, params);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (newRule->IsValid()) {
|
||||||
|
m_keepRules.insert(newRule);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
delete newRule;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Rules::Debug() const
|
||||||
|
{
|
||||||
|
stringstream out;
|
||||||
|
|
||||||
|
std::set<Rule*>::const_iterator iter;
|
||||||
|
out << "m_keepRules:" << endl;
|
||||||
|
for (iter = m_keepRules.begin(); iter != m_keepRules.end(); ++iter) {
|
||||||
|
const Rule &rule = **iter;
|
||||||
|
out << rule.Debug() << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rules::Output(std::ostream &out, bool forward, const Parameter ¶ms) const
|
||||||
|
{
|
||||||
|
std::set<Rule*, CompareRules>::const_iterator iter;
|
||||||
|
for (iter = m_mergeRules.begin(); iter != m_mergeRules.end(); ++iter) {
|
||||||
|
const Rule &rule = **iter;
|
||||||
|
rule.Output(out, forward, params);
|
||||||
|
out << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rules::Consolidate(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
if (params.fractionalCounting) {
|
||||||
|
CalcFractionalCount();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
std::set<Rule*>::iterator iter;
|
||||||
|
for (iter = m_keepRules.begin(); iter != m_keepRules.end(); ++iter) {
|
||||||
|
Rule &rule = **iter;
|
||||||
|
rule.SetCount(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MergeRules(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rules::MergeRules(const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
typedef std::set<Rule*, CompareRules> MergeRules;
|
||||||
|
|
||||||
|
std::set<Rule*>::const_iterator iterOrig;
|
||||||
|
for (iterOrig = m_keepRules.begin(); iterOrig != m_keepRules.end(); ++iterOrig) {
|
||||||
|
Rule *origRule = *iterOrig;
|
||||||
|
|
||||||
|
pair<MergeRules::iterator, bool> inserted = m_mergeRules.insert(origRule);
|
||||||
|
if (!inserted.second) {
|
||||||
|
// already there, just add count
|
||||||
|
Rule &rule = **inserted.first;
|
||||||
|
float newCount = rule.GetCount() + origRule->GetCount();
|
||||||
|
rule.SetCount(newCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Rules::CalcFractionalCount()
|
||||||
|
{
|
||||||
|
typedef std::set<Rule*> RuleColl;
|
||||||
|
typedef std::map<const ConsistentPhrase*, RuleColl> RuleByConsistentPhrase;
|
||||||
|
RuleByConsistentPhrase allRules;
|
||||||
|
|
||||||
|
// sort by source AND target ranges
|
||||||
|
std::set<Rule*>::const_iterator iter;
|
||||||
|
for (iter = m_keepRules.begin(); iter != m_keepRules.end(); ++iter) {
|
||||||
|
Rule *rule = *iter;
|
||||||
|
const ConsistentPhrase &cp = rule->GetConsistentPhrase();
|
||||||
|
RuleColl &ruleColl = allRules[&cp];
|
||||||
|
ruleColl.insert(rule);
|
||||||
|
}
|
||||||
|
|
||||||
|
// fractional count
|
||||||
|
RuleByConsistentPhrase::iterator iterOuter;
|
||||||
|
for (iterOuter = allRules.begin(); iterOuter != allRules.end(); ++iterOuter) {
|
||||||
|
RuleColl &rules = iterOuter->second;
|
||||||
|
|
||||||
|
RuleColl::iterator iterInner;
|
||||||
|
for (iterInner = rules.begin(); iterInner != rules.end(); ++iterInner) {
|
||||||
|
Rule &rule = **iterInner;
|
||||||
|
rule.SetCount(1.0f / (float) rules.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
72
phrase-extract/extract-mixed-syntax/Rules.h
Normal file
72
phrase-extract/extract-mixed-syntax/Rules.h
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
/*
|
||||||
|
* Rules.h
|
||||||
|
*
|
||||||
|
* Created on: 20 Feb 2014
|
||||||
|
* Author: hieu
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <set>
|
||||||
|
#include <iostream>
|
||||||
|
#include "ConsistentPhrases.h"
|
||||||
|
#include "Rule.h"
|
||||||
|
|
||||||
|
extern bool g_debug;
|
||||||
|
|
||||||
|
class AlignedSentence;
|
||||||
|
class Parameter;
|
||||||
|
|
||||||
|
struct CompareRules {
|
||||||
|
bool operator()(const Rule *a, const Rule *b)
|
||||||
|
{
|
||||||
|
int compare;
|
||||||
|
|
||||||
|
compare = a->GetPhrase(Moses::Input).Compare(b->GetPhrase(Moses::Input));
|
||||||
|
if (compare) return compare < 0;
|
||||||
|
|
||||||
|
compare = a->GetPhrase(Moses::Output).Compare(b->GetPhrase(Moses::Output));
|
||||||
|
if (compare) return compare < 0;
|
||||||
|
|
||||||
|
if (a->GetAlignments() != b->GetAlignments()) {
|
||||||
|
return a->GetAlignments() < b->GetAlignments();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (a->GetLHS().GetString() != b->GetLHS().GetString()) {
|
||||||
|
return a->GetLHS().GetString() < b->GetLHS().GetString();
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class Rules {
|
||||||
|
public:
|
||||||
|
Rules(const AlignedSentence &alignedSentence);
|
||||||
|
virtual ~Rules();
|
||||||
|
void Extend(const Parameter ¶ms);
|
||||||
|
void Consolidate(const Parameter ¶ms);
|
||||||
|
|
||||||
|
std::string Debug() const;
|
||||||
|
void Output(std::ostream &out, bool forward, const Parameter ¶ms) const;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
const AlignedSentence &m_alignedSentence;
|
||||||
|
std::set<Rule*> m_keepRules;
|
||||||
|
std::set<Rule*, CompareRules> m_mergeRules;
|
||||||
|
|
||||||
|
void Extend(const Rule &rule, const Parameter ¶ms);
|
||||||
|
void Extend(const Rule &rule, const ConsistentPhrases::Coll &cps, const Parameter ¶ms);
|
||||||
|
void Extend(const Rule &rule, const ConsistentPhrase &cp, const Parameter ¶ms);
|
||||||
|
|
||||||
|
// create original rules
|
||||||
|
void CreateRules(const ConsistentPhrase &cp,
|
||||||
|
const Parameter ¶ms);
|
||||||
|
void CreateRule(const NonTerm &nonTerm,
|
||||||
|
const Parameter ¶ms);
|
||||||
|
|
||||||
|
void MergeRules(const Parameter ¶ms);
|
||||||
|
void CalcFractionalCount();
|
||||||
|
|
||||||
|
};
|
||||||
|
|
47
phrase-extract/extract-mixed-syntax/SyntaxTree.cpp
Normal file
47
phrase-extract/extract-mixed-syntax/SyntaxTree.cpp
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
#include <cassert>
|
||||||
|
#include <iostream>
|
||||||
|
#include "SyntaxTree.h"
|
||||||
|
#include "Parameter.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
void SyntaxTree::Add(int startPos, int endPos, const std::string &label, const Parameter ¶ms)
|
||||||
|
{
|
||||||
|
//cerr << "add " << label << " to " << "[" << startPos << "-" << endPos << "]" << endl;
|
||||||
|
|
||||||
|
Range range(startPos, endPos);
|
||||||
|
Labels &labels = m_coll[range];
|
||||||
|
|
||||||
|
bool add = true;
|
||||||
|
if (labels.size()) {
|
||||||
|
if (params.multiLabel == 1) {
|
||||||
|
// delete the label in collection and add new
|
||||||
|
assert(labels.size() == 1);
|
||||||
|
labels.clear();
|
||||||
|
}
|
||||||
|
else if (params.multiLabel == 2) {
|
||||||
|
// ignore this label
|
||||||
|
add = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (add) {
|
||||||
|
labels.push_back(label);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void SyntaxTree::AddToAll(const std::string &label)
|
||||||
|
{
|
||||||
|
Coll::iterator iter;
|
||||||
|
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
|
||||||
|
Labels &labels = iter->second;
|
||||||
|
labels.push_back(label);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Look up the labels stored for the span [startPos, endPos].
 *
 * @return the labels of that exact span, or the default labels when the
 *         span has no entry.
 */
const SyntaxTree::Labels &SyntaxTree::Find(int startPos, int endPos) const
{
  const Coll::const_iterator found = m_coll.find(Range(startPos, endPos));
  if (found != m_coll.end()) {
    return found->second;
  }
  return m_defaultLabels;
}
|
32
phrase-extract/extract-mixed-syntax/SyntaxTree.h
Normal file
32
phrase-extract/extract-mixed-syntax/SyntaxTree.h
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
class Parameter;
|
||||||
|
|
||||||
|
class SyntaxTree
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef std::pair<int, int> Range;
|
||||||
|
typedef std::vector<std::string> Labels;
|
||||||
|
typedef std::map<Range, Labels> Coll;
|
||||||
|
|
||||||
|
void Add(int startPos, int endPos, const std::string &label, const Parameter ¶ms);
|
||||||
|
void AddToAll(const std::string &label);
|
||||||
|
|
||||||
|
const Labels &Find(int startPos, int endPos) const;
|
||||||
|
|
||||||
|
void SetHieroLabel(const std::string &label) {
|
||||||
|
m_defaultLabels.push_back(label);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected:
|
||||||
|
|
||||||
|
Coll m_coll;
|
||||||
|
Labels m_defaultLabels;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
68
phrase-extract/extract-mixed-syntax/Word.cpp
Normal file
68
phrase-extract/extract-mixed-syntax/Word.cpp
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
/*
|
||||||
|
* Word.cpp
|
||||||
|
*
|
||||||
|
* Created on: 18 Feb 2014
|
||||||
|
* Author: s0565741
|
||||||
|
*/
|
||||||
|
#include <limits>
|
||||||
|
#include "Word.h"
|
||||||
|
#include "moses/Util.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
/**
 * Build a terminal symbol.
 *
 * @param pos position of the word
 * @param str surface string of the word
 */
Word::Word(int pos, const std::string &str)
  : m_pos(pos)
  , m_str(str)
{
}
|
||||||
|
|
||||||
|
// Nothing to release here: the pointers stored in m_alignment are not deleted.
Word::~Word()
{
}
|
||||||
|
|
||||||
|
void Word::AddAlignment(const Word *other)
|
||||||
|
{
|
||||||
|
m_alignment.insert(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::set<int> Word::GetAlignmentIndex() const
|
||||||
|
{
|
||||||
|
std::set<int> ret;
|
||||||
|
|
||||||
|
std::set<const Word *>::const_iterator iter;
|
||||||
|
for (iter = m_alignment.begin(); iter != m_alignment.end(); ++iter) {
|
||||||
|
const Word &otherWord = **iter;
|
||||||
|
int otherPos = otherWord.GetPos();
|
||||||
|
ret.insert(otherPos);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Word::Output(std::ostream &out) const
|
||||||
|
{
|
||||||
|
out << m_str;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Word::Debug() const
|
||||||
|
{
|
||||||
|
return m_str;
|
||||||
|
}
|
||||||
|
|
||||||
|
int Word::CompareString(const Word &other) const
|
||||||
|
{
|
||||||
|
return m_str.compare(other.m_str);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Word::GetString(int factor) const
|
||||||
|
{
|
||||||
|
vector<string> toks;
|
||||||
|
Moses::Tokenize(toks, m_str, "|");
|
||||||
|
|
||||||
|
assert(factor < toks.size());
|
||||||
|
return toks[factor];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
49
phrase-extract/extract-mixed-syntax/Word.h
Normal file
49
phrase-extract/extract-mixed-syntax/Word.h
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
/*
|
||||||
|
* Word.h
|
||||||
|
*
|
||||||
|
* Created on: 18 Feb 2014
|
||||||
|
* Author: s0565741
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <set>
|
||||||
|
#include "RuleSymbol.h"
|
||||||
|
|
||||||
|
// a terminal
|
||||||
|
class Word : public RuleSymbol
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
Word(const Word&); // do not implement
|
||||||
|
Word(int pos, const std::string &str);
|
||||||
|
virtual ~Word();
|
||||||
|
|
||||||
|
virtual bool IsNonTerm() const
|
||||||
|
{ return false; }
|
||||||
|
|
||||||
|
std::string GetString() const
|
||||||
|
{ return m_str; }
|
||||||
|
|
||||||
|
std::string GetString(int factor) const;
|
||||||
|
|
||||||
|
int GetPos() const
|
||||||
|
{ return m_pos; }
|
||||||
|
|
||||||
|
void AddAlignment(const Word *other);
|
||||||
|
|
||||||
|
const std::set<const Word *> &GetAlignment() const
|
||||||
|
{ return m_alignment; }
|
||||||
|
|
||||||
|
std::set<int> GetAlignmentIndex() const;
|
||||||
|
|
||||||
|
void Output(std::ostream &out) const;
|
||||||
|
std::string Debug() const;
|
||||||
|
|
||||||
|
int CompareString(const Word &other) const;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
int m_pos; // original position in sentence, NOT in lattice
|
||||||
|
std::string m_str;
|
||||||
|
std::set<const Word *> m_alignment;
|
||||||
|
};
|
||||||
|
|
81
phrase-extract/extract-mixed-syntax/gzfilebuf.h
Normal file
81
phrase-extract/extract-mixed-syntax/gzfilebuf.h
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
#ifndef moses_gzfile_buf_h
|
||||||
|
#define moses_gzfile_buf_h
|
||||||
|
|
||||||
|
#include <streambuf>
|
||||||
|
#include <zlib.h>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
class gzfilebuf : public std::streambuf {
|
||||||
|
public:
|
||||||
|
gzfilebuf(const char *filename)
|
||||||
|
{ _gzf = gzopen(filename, "rb");
|
||||||
|
setg (_buff+sizeof(int), // beginning of putback area
|
||||||
|
_buff+sizeof(int), // read position
|
||||||
|
_buff+sizeof(int)); // end position
|
||||||
|
}
|
||||||
|
~gzfilebuf() { gzclose(_gzf); }
|
||||||
|
protected:
|
||||||
|
virtual int_type overflow (int_type c) {
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
|
||||||
|
// write multiple characters
|
||||||
|
virtual
|
||||||
|
std::streamsize xsputn (const char* s,
|
||||||
|
std::streamsize num) {
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw;
|
||||||
|
}
|
||||||
|
|
||||||
|
//read one character
|
||||||
|
virtual int_type underflow () {
|
||||||
|
// is read position before end of _buff?
|
||||||
|
if (gptr() < egptr()) {
|
||||||
|
return traits_type::to_int_type(*gptr());
|
||||||
|
}
|
||||||
|
|
||||||
|
/* process size of putback area
|
||||||
|
* - use number of characters read
|
||||||
|
* - but at most four
|
||||||
|
*/
|
||||||
|
unsigned int numPutback = gptr() - eback();
|
||||||
|
if (numPutback > sizeof(int)) {
|
||||||
|
numPutback = sizeof(int);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* copy up to four characters previously read into
|
||||||
|
* the putback _buff (area of first four characters)
|
||||||
|
*/
|
||||||
|
std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
|
||||||
|
numPutback);
|
||||||
|
|
||||||
|
// read new characters
|
||||||
|
int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
|
||||||
|
if (num <= 0) {
|
||||||
|
// ERROR or EOF
|
||||||
|
return EOF;
|
||||||
|
}
|
||||||
|
|
||||||
|
// reset _buff pointers
|
||||||
|
setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
|
||||||
|
_buff+sizeof(int), // read position
|
||||||
|
_buff+sizeof(int)+num); // end of buffer
|
||||||
|
|
||||||
|
// return next character
|
||||||
|
return traits_type::to_int_type(*gptr());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::streamsize xsgetn (char* s,
|
||||||
|
std::streamsize num) {
|
||||||
|
return gzread(_gzf,s,num);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
gzFile _gzf;
|
||||||
|
static const unsigned int _buffsize = 1024;
|
||||||
|
char _buff[_buffsize];
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
10250
phrase-extract/extract-mixed-syntax/pugixml.cpp
Normal file
10250
phrase-extract/extract-mixed-syntax/pugixml.cpp
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user