add --MultiLabel

This commit is contained in:
Hieu Hoang 2014-03-20 12:09:22 +00:00
parent e389b8f51d
commit b556cdc464
8 changed files with 56 additions and 13 deletions

View File

@ -54,6 +54,8 @@ void AlignedSentence::PopulateAlignment(const std::string &line)
int sourcePos = alignPair[0];
int targetPos = alignPair[1];
cerr << "m_source=" << m_source.size() << endl;
assert(sourcePos < m_source.size());
assert(targetPos < m_target.size());
Word *sourceWord = m_source[sourcePos];

View File

@ -32,7 +32,7 @@ void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const P
// parse source and target string
if (isSyntax) {
line = "<xml><tree label=\"X\">" + line + "</tree></xml>";
XMLParse(phrase, tree, line);
XMLParse(phrase, tree, line, params);
if (mixedSyntaxType != 0) {
// mixed syntax. Always add [X] where there isn't 1
@ -72,7 +72,10 @@ void Escape(string &text)
}
void AlignedSentenceSyntax::XMLParse(Phrase &output, SyntaxTree &tree, const pugi::xml_node &parentNode)
void AlignedSentenceSyntax::XMLParse(Phrase &output,
SyntaxTree &tree,
const pugi::xml_node &parentNode,
const Parameter &params)
{
int childNum = 0;
for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling())
@ -88,7 +91,7 @@ void AlignedSentenceSyntax::XMLParse(Phrase &output, SyntaxTree &tree, const pug
label = attribute.as_string();
// recursively call this function. For proper recursive trees
XMLParse(output, tree, childNode);
XMLParse(output, tree, childNode, params);
}
@ -113,7 +116,8 @@ void AlignedSentenceSyntax::XMLParse(Phrase &output, SyntaxTree &tree, const pug
// fill syntax labels
if (!label.empty()) {
label = "[" + label + "]";
tree.Add(startPos, endPos, label);
cerr << "add " << label << " to " << "[" << startPos << "-" << endPos << "]" << endl;
tree.Add(startPos, endPos, label, params);
}
++childNum;
@ -121,14 +125,17 @@ void AlignedSentenceSyntax::XMLParse(Phrase &output, SyntaxTree &tree, const pug
}
// Parses `input` as an XML document and hands its top-level <xml> element to
// the xml_node overload of XMLParse; `output`, `tree` and `params` are passed
// through unchanged.
// NOTE(review): two signatures and two inner XMLParse calls appear back to
// back in this block; presumably the 3-argument forms are the pre-change lines
// and the forms taking `params` replace them - confirm against the committed file.
void AlignedSentenceSyntax::XMLParse(Phrase &output, SyntaxTree &tree, const std::string input)
void AlignedSentenceSyntax::XMLParse(Phrase &output,
SyntaxTree &tree,
const std::string input,
const Parameter &params)
{
pugi::xml_document doc;
// Load with pugixml's default parse options plus parse_comments.
// NOTE(review): `result` is not inspected anywhere below, so a failed parse
// is not reported by this function.
pugi::xml_parse_result result = doc.load(input.c_str(),
pugi::parse_default | pugi::parse_comments);
// Only the child element named "xml" is processed.
pugi::xml_node topNode = doc.child("xml");
XMLParse(output, tree, topNode);
XMLParse(output, tree, topNode, params);
}
void AlignedSentenceSyntax::CreateNonTerms()

View File

@ -26,8 +26,14 @@ protected:
std::string m_sourceStr, m_targetStr, m_alignmentStr;
SyntaxTree m_sourceTree, m_targetTree;
void XMLParse(Phrase &output, SyntaxTree &tree, const std::string input);
void XMLParse(Phrase &output, SyntaxTree &tree, const pugi::xml_node &parentNode);
// Overload taking the XML as text: the definition loads `input` into a
// pugi::xml_document and forwards its "xml" child, together with `output`,
// `tree` and `params`, to the xml_node overload.
void XMLParse(Phrase &output,
SyntaxTree &tree,
const std::string input,
const Parameter &params);
// Overload working on an already parsed node. From the visible parts of the
// definition: it iterates over the children of `parentNode`, calls itself
// recursively on child nodes, and registers each non-empty label as "[label]"
// on `tree` through SyntaxTree::Add, passing `params` along.
void XMLParse(Phrase &output,
SyntaxTree &tree,
const pugi::xml_node &parentNode,
const Parameter &params);
void CreateNonTerms();
void CreateNonTerms(ConsistentPhrase &cp,
const SyntaxTree::Labels &sourceLabels,

View File

@ -33,7 +33,8 @@ int main(int argc, char** argv)
("SourceSyntax", "Source sentence is a parse tree")
("TargetSyntax", "Target sentence is a parse tree")
("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere");
("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere")
("MultiLabel", po::value<int>()->default_value(params.multiLabel), "What to do with multiple labels on the same span. 0(default)=keep them all, 1=keep only top-most, 2=keep only bottom-most");
po::variables_map vm;
try
@ -70,6 +71,7 @@ int main(int argc, char** argv)
if (vm.count("SourceSyntax")) params.sourceSyntax = true;
if (vm.count("TargetSyntax")) params.targetSyntax = true;
if (vm.count("MixedSyntaxType")) params.mixedSyntaxType = vm["MixedSyntaxType"].as<int>();
if (vm.count("MultiLabel")) params.multiLabel = vm["MultiLabel"].as<int>();
// input files;
string pathTarget = argv[1];
@ -91,8 +93,11 @@ int main(int argc, char** argv)
// MAIN LOOP
int lineNum = 1;
string lineTarget, lineSource, lineAlignment;
while (getline(strmTarget, lineTarget)) {
cerr << lineNum << " ";
bool success;
success = getline(strmSource, lineSource);
if (!success) {
@ -130,6 +135,8 @@ int main(int argc, char** argv)
rules.Output(extractInvFile, false);
delete alignedSentence;
++lineNum;
}
if (!params.gluePath.empty()) {

View File

@ -24,6 +24,7 @@ Parameter::Parameter()
,targetSyntax(false)
,mixedSyntaxType(0)
,multiLabel(0)
,nonTermConsecSourceMixed(true)
{}

View File

@ -33,7 +33,7 @@ public:
bool sourceSyntax, targetSyntax;
int mixedSyntaxType;
int mixedSyntaxType, multiLabel;
bool nonTermConsecSourceMixed;
};

View File

@ -1,10 +1,28 @@
#include <cassert>
#include "SyntaxTree.h"
#include "Parameter.h"
// Registers `label` for the span (startPos, endPos). When the span already
// holds at least one label, params.multiLabel selects what happens:
//   1       - the stored label is discarded and `label` takes its place
//   2       - `label` is dropped and the stored label(s) stay
//   other   - `label` is appended next to the existing ones
// NOTE(review): the 3-argument signature and the unconditional push_back right
// after the m_coll lookup look like pre-change lines that the 4-argument
// signature and the guarded push_back at the end supersede - confirm against
// the committed file.
void SyntaxTree::Add(int startPos, int endPos, const std::string &label)
void SyntaxTree::Add(int startPos, int endPos, const std::string &label, const Parameter &params)
{
Range range(startPos, endPos);
// m_coll[range] yields the label list stored under this range
// (NOTE(review): presumably m_coll is the std::map-based Coll, so a missing
// range gets an empty list created here - confirm in SyntaxTree.h).
Labels &labels = m_coll[range];
labels.push_back(label);
bool add = true;
if (labels.size()) {
if (params.multiLabel == 1) {
// delete the label in collection and add new
// The assert encodes the expectation that this mode never lets more than
// one label accumulate on a span.
assert(labels.size() == 1);
labels.clear();
}
else if (params.multiLabel == 2) {
// ignore this label
add = false;
}
}
if (add) {
labels.push_back(label);
}
}
void SyntaxTree::AddToAll(const std::string &label)

View File

@ -4,6 +4,8 @@
#include <map>
#include <string>
class Parameter;
class SyntaxTree
{
public:
@ -11,7 +13,7 @@ public:
typedef std::vector<std::string> Labels;
typedef std::map<Range, Labels> Coll;
void Add(int startPos, int endPos, const std::string &label);
void Add(int startPos, int endPos, const std::string &label, const Parameter &params);
void AddToAll(const std::string &label);
const Labels &Find(int startPos, int endPos) const;