This commit is contained in:
Ulrich Germann 2015-05-24 16:12:13 +01:00
commit e49ffb8efa
17 changed files with 16 additions and 1086 deletions

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -72,13 +72,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -1,132 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.2107801703">
<!-- Debug build configuration of the manual-label executable (managed build, GNU toolchain). -->
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.2107801703" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<!-- Error parsers registered for this configuration, plus the ELF binary parser. -->
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.2107801703" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.2107801703." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.502948364" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.1431969079" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<!-- Managed GNU make build in the Debug directory of the manual-label project; parallel build is switched on with the "optimal" job count. -->
<builder buildPath="${workspace_loc:/manual-label}/Debug" id="cdt.managedbuild.target.gnu.builder.exe.debug.2101075234" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1118840081" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<!-- C++ compiler: no optimization, maximum debug information. -->
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.2037265673" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.400985496" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.1160903812" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<!-- Include paths are given relative to the workspace location: the directory two levels above it, and a "boost" directory inside that one. -->
<!-- NOTE(review): the first value is unquoted while the second is wrapped in quotes; confirm both resolve when the workspace path contains spaces. -->
<option id="gnu.cpp.compiler.option.include.paths.404589863" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="${workspace_loc:}/../.."/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.967940596" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<!-- C compiler: optimization level left at its "none" default, maximum debug information. -->
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.789243964" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.2033266575" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.exe.debug.option.debugging.level.1568929819" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.676866714" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.254144861" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
<!-- C++ linker: searches the boost lib64 directory and links boost_program_options only. -->
<!-- NOTE(review): the project sources also use boost::filesystem (EnOpenNLPChunker.cpp); confirm whether further boost libraries must be listed here. -->
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.319879082" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.132164474" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
<option id="gnu.cpp.link.option.libs.1017214824" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="boost_program_options"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1672776758" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1104732611" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.372096550" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.exe.release.649050588">
<!-- Release build configuration of the manual-label executable (managed build, GNU toolchain). -->
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.649050588" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<!-- Error parsers registered for this configuration, plus the ELF binary parser. -->
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.649050588" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.exe.release.649050588." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1107402972" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
<!-- NOTE(review): this release target platform carries the display name "Debug Platform"; confirm whether that is intended. -->
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.1038954684" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
<!-- Managed GNU make build in the Release directory of the manual-label project; no parallel-build attributes are set here, unlike the Debug builder. -->
<builder buildPath="${workspace_loc:/manual-label}/Release" id="cdt.managedbuild.target.gnu.builder.exe.release.100518450" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.2005888378" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<!-- C++ compiler: highest optimization level, no debug information. -->
<!-- NOTE(review): unlike the Debug configuration, no include paths are configured here; confirm a Release build can find the project and boost headers. -->
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1743303968" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
<option id="gnu.cpp.compiler.exe.release.option.optimization.level.968169340" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.release.option.debugging.level.977676916" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1889240027" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<!-- C compiler: optimization level left at its "most" default, no debug information. -->
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.924128295" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1914416581" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.exe.release.option.debugging.level.826081780" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2048171432" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.940327646" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
<!-- NOTE(review): no library search path or libraries are configured for the C++ linker, whereas the Debug configuration links boost_program_options; confirm a Release build links. -->
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.369758737" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1186766936" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.266174128" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.558116084" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<!-- Project-level build-system entry: declares manual-label as a managed-build GNU executable project. -->
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="manual-label.cdt.managedbuild.target.gnu.exe.1701243340" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<!-- Scanner configuration: autodiscovery is enabled with no selected profile, globally and for each (configuration, compiler tool, input type) combination listed below. -->
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<!-- Release / C++ compiler -->
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.649050588;cdt.managedbuild.config.gnu.exe.release.649050588.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1743303968;cdt.managedbuild.tool.gnu.cpp.compiler.input.1889240027">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<!-- Release / C compiler -->
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.649050588;cdt.managedbuild.config.gnu.exe.release.649050588.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.924128295;cdt.managedbuild.tool.gnu.c.compiler.input.2048171432">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<!-- Debug / C++ compiler -->
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.2107801703;cdt.managedbuild.config.gnu.exe.debug.2107801703.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.2037265673;cdt.managedbuild.tool.gnu.cpp.compiler.input.967940596">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<!-- Debug / C compiler -->
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.2107801703;cdt.managedbuild.config.gnu.exe.debug.2107801703.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.789243964;cdt.managedbuild.tool.gnu.c.compiler.input.676866714">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<!-- Refresh scope: both configurations refresh the whole /manual-label project. -->
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/manual-label"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/manual-label"/>
</configuration>
</storageModule>
</cproject>

View File

@ -1,27 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<!-- Eclipse project descriptor for the manual-label CDT project. -->
<name>manual-label</name>
<comment></comment>
<projects>
</projects>
<!-- Builders attached to the project: the managed makefile generator and the scanner-configuration builder. -->
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<!-- Project natures: C, C++, managed build and scanner configuration. -->
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

View File

@ -1,46 +0,0 @@
#include <list>
#include "DeEn.h"
#include "Main.h"
#include "moses/Util.h"
using namespace std;
extern bool g_debug;
// Returns true when any position in the inclusive window [start, end] of
// `source` matches one of the space-separated alternatives in `str` on the
// given factor (the matching itself is delegated to IsA with offset 0).
bool Contains(const Phrase &source, int start, int end, int factor, const string &str)
{
  int cursor = start;
  while (cursor <= end) {
    if (IsA(source, cursor, 0, factor, str)) {
      return true;
    }
    ++cursor;
  }
  return false;
}
// Marks spans of a German source sentence with "reorder-label" and writes the
// labelled sentence to `out` via OutputWithLabels.
//
// Every span [first, last] is tested against two patterns:
//  1. the word before the span is VAFIN, the word after it is VVINF or VVPP,
//     and the span itself contains none of VAFIN/VVINF/VVPP/VVFIN;
//  2. the span starts the sentence or follows a "$," token, is followed by
//     "zu" and then a VVINF, and contains no "$," itself.
// Factor 1 is matched against the tag names and factor 0 against "zu"
// (presumably POS tag and surface form respectively; confirm against the
// input format).
void LabelDeEn(const Phrase &source, ostream &out)
{
  Ranges ranges;
  const int numWords = static_cast<int>(source.size());

  for (int first = 0; first < numWords; ++first) {
    for (int last = first; last < numWords; ++last) {
      // pattern 1: VAFIN <span> VVINF|VVPP
      const bool verbBracket =
        IsA(source, first, -1, 1, "VAFIN")
        && IsA(source, last, +1, 1, "VVINF VVPP")
        && !Contains(source, first, last, 1, "VAFIN VVINF VVPP VVFIN");

      // pattern 2 is only tried when pattern 1 did not match
      const bool zuInfinitive =
        !verbBracket
        && (first == 0 || IsA(source, first, -1, 1, "$,"))
        && IsA(source, last, +1, 0, "zu")
        && IsA(source, last, +2, 1, "VVINF")
        && !Contains(source, first, last, 1, "$,");

      if (verbBracket || zuInfinitive) {
        ranges.push_back(Range(first, last, "reorder-label"));
      }
    }
  }

  OutputWithLabels(source, ranges, out);
}

View File

@ -1,5 +0,0 @@
#pragma once
#include "Main.h"
// Labels reorder spans in a German source sentence and writes the sentence,
// with its labels, to `out`.
void LabelDeEn(const Phrase &source, std::ostream &out);

View File

@ -1,202 +0,0 @@
/*
* EnOpenNLPChunker.cpp
*
* Created on: 28 Feb 2014
* Author: hieu
*/
#include <cstdlib>
#include <cstdio>
#include <algorithm>
#include <fstream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include "EnOpenNLPChunker.h"
#include "moses/Util.h"
using namespace std;
using namespace boost::algorithm;
// Remembers the root directory of the OpenNLP installation; Process() later
// builds the paths of the opennlp launcher and model files from it.
EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath)
:m_openNLPPath(openNLPPath)
{
// nothing else to initialise
}
// Nothing to release: the only member is the stored path string.
EnOpenNLPChunker::~EnOpenNLPChunker() {
// intentionally empty
}
void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector<string> &filterList)
{
const boost::filesystem::path
inPath = boost::filesystem::unique_path(),
outPath = boost::filesystem::unique_path();
// read all input to a temp file
ofstream inFile(inPath.c_str());
string line;
while (getline(in, line)) {
Unescape(line);
inFile << line << endl;
}
inFile.close();
// execute chunker
string cmd = "cat " + inPath.native() + " | "
+ m_openNLPPath + "/bin/opennlp POSTagger "
+ m_openNLPPath + "/models/en-pos-maxent.bin | "
+ m_openNLPPath + "/bin/opennlp ChunkerME "
+ m_openNLPPath + "/models/en-chunker.bin > "
+ outPath.native();
//g << "Executing:" << cmd << endl;
int ret = system(cmd.c_str());
// read result of chunker and output as Moses xml trees
ifstream outFile(outPath.c_str());
size_t lineNum = 0;
while (getline(outFile, line)) {
//cerr << line << endl;
MosesReformat(line, out, filterList);
out << endl;
++lineNum;
}
outFile.close();
// clean up temporary files
remove(inPath.c_str());
remove(outPath.c_str());
}
void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector<string> &filterList)
{
//cerr << "REFORMATING:" << line << endl;
bool inLabel = false;
vector<string> toks;
Moses::Tokenize(toks, line);
for (size_t i = 0; i < toks.size(); ++i) {
const string &tok = toks[i];
if (tok.substr(0, 1) == "[" && tok.substr(1,1) != "_") {
// start of chunk
string label = tok.substr(1);
if (UseLabel(label, filterList)) {
out << "<tree label=\"" << label << "\">";
inLabel = true;
}
}
else if (ends_with(tok, "]")) {
// end of chunk
if (tok.size() > 1) {
if (tok.substr(1,1) == "_") {
// just a word that happens to be ]
vector<string> factors;
Moses::Tokenize(factors, tok, "_");
assert(factors.size() == 2);
Escape(factors[0]);
out << factors[0] << " ";
}
else {
// a word and end of tree
string word = tok.substr(0, tok.size()-1);
vector<string> factors;
Moses::Tokenize(factors, word, "_");
assert(factors.size() == 2);
Escape(factors[0]);
out << factors[0] << " ";
}
if (inLabel) {
out << "</tree> ";
inLabel = false;
}
}
else {
if (inLabel) {
out << "</tree> ";
inLabel = false;
}
}
}
else {
// lexical item
vector<string> factors;
Moses::Tokenize(factors, tok, "_");
if (factors.size() == 2) {
Escape(factors[0]);
out << factors[0] << " ";
}
else if (factors.size() == 1) {
// word is _
assert(tok.substr(0, 2) == "__");
out << "_ ";
}
else {
throw "Unknown format:" + tok;
}
}
}
}
// Returns a copy of `original` in which every non-overlapping occurrence of
// `before` (scanning left to right) is replaced by `after`.
//
// An empty `before` matches nothing and the input is returned unchanged;
// previously that case never terminated for a non-empty `original`.
std::string
replaceAll( std::string const& original,
            std::string const& before,
            std::string const& after )
{
  if ( before.empty() ) {
    return original;
  }

  std::string retval;
  std::string::size_type current = 0;
  std::string::size_type next = original.find( before, current );
  while ( next != std::string::npos ) {
    retval.append( original, current, next - current );  // text before the match
    retval.append( after );
    current = next + before.size();                       // resume after the match
    next = original.find( before, current );
  }
  retval.append( original, current, std::string::npos ); // unmatched tail
  return retval;
}
// Replaces the characters & | < > ' " [ ] in `line` with their entity forms,
// in place. The ampersand is handled first so that the ampersands introduced
// by the later replacements are not escaped again.
void EnOpenNLPChunker::Escape(string &line)
{
  static const char *const kEscapes[][2] = {
    { "&",  "&amp;"  },
    { "|",  "&#124;" },
    { "<",  "&lt;"   },
    { ">",  "&gt;"   },
    { "'",  "&apos;" },
    { "\"", "&quot;" },
    { "[",  "&#91;"  },
    { "]",  "&#93;"  }
  };
  const size_t numEscapes = sizeof(kEscapes) / sizeof(kEscapes[0]);

  for (size_t idx = 0; idx < numEscapes; ++idx) {
    line = replaceAll(line, kEscapes[idx][0], kEscapes[idx][1]);
  }
}
// Reverses Escape(): turns the entity forms back into the characters
// | < > " ' [ ] &, in place. "&amp;" is decoded last so that an escaped
// ampersand cannot combine with following text into another entity.
void EnOpenNLPChunker::Unescape(string &line)
{
  static const char *const kUnescapes[][2] = {
    { "&#124;", "|"  },
    { "&lt;",   "<"  },
    { "&gt;",   ">"  },
    { "&quot;", "\"" },
    { "&apos;", "'"  },
    { "&#91;",  "["  },
    { "&#93;",  "]"  },
    { "&amp;",  "&"  }
  };
  const size_t numUnescapes = sizeof(kUnescapes) / sizeof(kUnescapes[0]);

  for (size_t idx = 0; idx < numUnescapes; ++idx) {
    line = replaceAll(line, kUnescapes[idx][0], kUnescapes[idx][1]);
  }
}
// Decides whether a chunk label should be emitted: an empty filter list
// accepts every label, otherwise the label must equal one of the entries.
bool EnOpenNLPChunker::UseLabel(const std::string &label, const std::vector<std::string> &filterList) const
{
  if (filterList.empty()) {
    return true;
  }
  return std::find(filterList.begin(), filterList.end(), label) != filterList.end();
}

View File

@ -1,29 +0,0 @@
/*
* EnOpenNLPChunker.h
*
* Created on: 28 Feb 2014
* Author: hieu
*/
#pragma once
#include <vector>
#include <string>
#include <iostream>
// Labels English text with chunks by running the Apache OpenNLP command-line
// POS tagger and chunker, and converts their output into <tree label="...">
// markup.
class EnOpenNLPChunker {
public:
// openNLPPath: root directory of the OpenNLP installation. Process() runs
// bin/opennlp and loads models/en-pos-maxent.bin and models/en-chunker.bin
// from beneath it.
EnOpenNLPChunker(const std::string &openNLPPath);
virtual ~EnOpenNLPChunker();
// Reads sentences (one per line) from `in`, runs them through OpenNLP and
// writes one reformatted line per sentence to `out`. Only chunk labels listed
// in `filterList` are emitted; an empty list keeps all labels.
void Process(std::istream &in, std::ostream &out, const std::vector<std::string> &filterList);
protected:
// Root directory of the OpenNLP installation, fixed at construction.
const std::string m_openNLPPath;
// Replaces & | < > ' " [ ] in `line` with entity forms, in place.
void Escape(std::string &line);
// Inverse of Escape(), in place.
void Unescape(std::string &line);
// Converts one line of chunker output into labelled text on `out`.
void MosesReformat(const std::string &line, std::ostream &out, const std::vector<std::string> &filterList);
// True when `label` may be emitted: the filter list is empty or contains it.
bool UseLabel(const std::string &label, const std::vector<std::string> &filterList) const;
};

View File

@ -1,226 +0,0 @@
#include <iostream>
#include <list>
#include <limits>
#include <algorithm>
#include "EnPhrasalVerb.h"
#include "moses/Util.h"
using namespace std;
namespace
{
// One phrasal-verb family: the space-separated surface forms of the verb and
// the space-separated particles that may complete it later in the sentence.
struct PhrasalVerbRule {
  const char *verbForms;
  const char *particles;
};

// Rules are tried in order and the first whose verb form matches wins.
const PhrasalVerbRule PHRASAL_VERB_RULES[] = {
  { "ask asked asking",            "out" },
  { "back backed backing",         "up" },
  { "blow blown blew",             "up" },
  { "break broke broken",          "down up in" },
  { "bring brought bringing",      "down up in" },
  { "call called calling",         "back up off" },
  { "check checked checking",      "out in" },
  { "cheer cheered cheering",      "up" },
  { "clean cleaned cleaning",      "up" },
  { "cross crossed crossing",      "out" },
  { "cut cutting",                 "down off out" },
  { "do did done",                 "over up" },
  { "drop dropped dropping",       "off" },
  { "figure figured figuring",     "out" },
  { "fill filled filling",         "in out up" },
  { "find found finding",          "out" },
  { "get got getting gotten",      "across over back" },
  { "give given gave giving",      "away back out up" },
  { "hand handed handing",         "down in over" },
  { "hold held holding",           "back up" },
  { "keep kept keeping",           "from up" },
  { "let letting",                 "down in" },
  { "look looked looking",         "over up" },
  { "make made making",            "up" },
  { "mix mixed mixing",            "up" },
  { "pass passed passing",         "out up" },
  { "pay payed paying",            "back" },
  { "pick picked picking",         "out" },
  { "point pointed pointing",      "out" },
  { "put putting",                 "down off out together on" },
  { "send sending",                "back" },
  { "set setting",                 "up" },
  { "sort sorted sorting",         "out" },
  { "switch switched switching",   "off on" },
  { "take took taking",            "apart back off out" },
  { "tear torn tearing",           "up" },
  { "think thought thinking",      "over" },
  // was "thrown threw thrown throwing": the base form "throw" was missing
  { "throw threw thrown throwing", "away" },
  { "turn turned turning",         "down off on" },
  { "try tried trying",            "on out" },
  { "use used using",              "up" },
  { "warm warmed warming",         "up" },
  { "work worked working",         "out" }
};

const size_t NUM_PHRASAL_VERB_RULES =
  sizeof(PHRASAL_VERB_RULES) / sizeof(PHRASAL_VERB_RULES[0]);
}

// Labels the words that sit between a phrasal verb and its separated particle
// with "reorder-label" and writes the labelled sentence to `out`.
//
//  source   - the sentence; factor 0 is matched against the verb forms and
//             particles
//  revision - when 1, a candidate span is dropped if any word inside it
//             carries one of the tags VB VBD VBG VBN VBP VBZ on factor 1
//  out      - receives the sentence via OutputWithLabels
//
// For each word that is a listed verb form, Found() looks ahead for one of the
// verb's particles. A span is only produced when at least one word lies
// between verb and particle.
void EnPhrasalVerb(const Phrase &source, int revision, ostream &out)
{
  const size_t NOT_FOUND = std::numeric_limits<size_t>::max();
  Ranges ranges;

  // find ranges to label
  const int numWords = static_cast<int>(source.size());
  for (int start = 0; start < numWords; ++start) {
    size_t end = NOT_FOUND;
    for (size_t r = 0; r < NUM_PHRASAL_VERB_RULES; ++r) {
      const PhrasalVerbRule &rule = PHRASAL_VERB_RULES[r];
      if (IsA(source, start, 0, 0, rule.verbForms)) {
        end = Found(source, start, 0, rule.particles);
        break;
      }
    }

    // need a particle, and at least one word between it and the verb
    if (end == NOT_FOUND || end <= static_cast<size_t>(start) + 1) {
      continue;
    }

    const int first = start + 1;
    const int last = static_cast<int>(end) - 1;

    // revision 1: skip the span when there's a verb in between
    if (revision == 1 && Exist(source, first, last, 1, "VB VBD VBG VBN VBP VBZ")) {
      continue;
    }

    ranges.push_back(Range(first, last, "reorder-label"));
  }

  OutputWithLabels(source, ranges, out);
}
bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str)
{
vector<string> soughts = Moses::Tokenize(str, " ");
for (size_t i = start; i <= end; ++i) {
const Word &word = source[i];
bool found = Found(word, factor, soughts);
if (found) {
return true;
}
}
return false;
}
size_t Found(const Phrase &source, int pos, int factor, const std::string &str)
{
const size_t MAX_RANGE = 10;
vector<string> soughts = Moses::Tokenize(str, " ");
vector<string> puncts = Moses::Tokenize(". : , ;", " ");
size_t maxEnd = std::min(source.size(), (size_t) pos + MAX_RANGE);
for (size_t i = pos + 1; i < maxEnd; ++i) {
const Word &word = source[i];
bool found;
found = Found(word, factor, puncts);
if (found) {
return std::numeric_limits<size_t>::max();
}
found = Found(word, factor, soughts);
if (found) {
return i;
}
}
return std::numeric_limits<size_t>::max();
}
bool Found(const Word &word, int factor, const vector<string> &soughts)
{
const string &element = word[factor];
for (size_t i = 0; i < soughts.size(); ++i) {
const string &sought = soughts[i];
bool found = (element == sought);
if (found) {
return true;
}
}
return false;
}

View File

@ -1,11 +0,0 @@
#pragma once
#include "Main.h"
// roll your own identification of phrasal verbs
// Labels the words between a phrasal verb and its separated particle and
// writes the labelled sentence to `out`. With revision 1, spans that contain
// a verb tag on factor 1 are not labelled.
void EnPhrasalVerb(const Phrase &source, int revision, std::ostream &out);
// True when a word at positions [start, end] of `source` has one of the
// space-separated values of `str` on the given factor.
bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str);
// Index of the first word after `pos` (within a limited look-ahead, stopping
// at punctuation) whose factor matches one of the space-separated values of
// `str`; std::numeric_limits<size_t>::max() when there is none.
size_t Found(const Phrase &source, int pos, int factor, const std::string &str);
// True when the given factor of `word` equals one of `soughts`.
bool Found(const Word &word, int factor, const std::vector<std::string> &soughts);

View File

@ -1,29 +0,0 @@
#include "LabelByInitialLetter.h"
#include "Main.h"
using namespace std;
void LabelByInitialLetter(const Phrase &source, std::ostream &out)
{
Ranges ranges;
for (int start = 0; start < source.size(); ++start) {
const string &startWord = source[start][0];
string startChar = startWord.substr(0,1);
for (int end = start + 1; end < source.size(); ++end) {
const string &endWord = source[end][0];
string endChar = endWord.substr(0,1);
if (startChar == endChar) {
Range range(start, end, startChar + "-label");
ranges.push_back(range);
}
}
}
OutputWithLabels(source, ranges, out);
}

View File

@ -1,6 +0,0 @@
#pragma once
#include "Main.h"
// Labels every pair of words that share the same first character and writes
// the labelled sentence to `out`.
void LabelByInitialLetter(const Phrase &source, std::ostream &out);

View File

@ -1,195 +0,0 @@
#include <iostream>
#include <cstdlib>
#include <boost/program_options.hpp>
#include "moses/Util.h"
#include "Main.h"
#include "DeEn.h"
#include "EnPhrasalVerb.h"
#include "EnOpenNLPChunker.h"
#include "LabelByInitialLetter.h"
using namespace std;
// Global debug switch; DeEn.cpp refers to it through an extern declaration.
bool g_debug = false;
// Splits one input line into a Phrase; defined after main().
Phrase Tokenize(const string &line);
int main(int argc, char** argv)
{
cerr << "Starting" << endl;
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help", "Print help messages")
("input,i", po::value<string>(), "Input file. Otherwise it will read from standard in")
("output,o", po::value<string>(), "Output file. Otherwise it will print from standard out")
("source-language,s", po::value<string>()->required(), "Source Language")
("target-language,t", po::value<string>()->required(), "Target Language")
("revision,r", po::value<int>()->default_value(0), "Revision")
("filter", po::value<string>(), "Only use labels from this comma-separated list")
("opennlp", po::value<string>()->default_value(""), "Path to Apache OpenNLP toolkit")
;
po::variables_map vm;
try
{
po::store(po::parse_command_line(argc, argv, desc),
vm); // can throw
/** --help option
*/
if ( vm.count("help") )
{
std::cout << "Basic Command Line Parameter App" << std::endl
<< desc << std::endl;
return EXIT_SUCCESS;
}
po::notify(vm); // throws on error, so do after help in case
// there are any problems
}
catch(po::error& e)
{
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return EXIT_FAILURE;
}
istream *inStrm = &cin;
if (vm.count("input")) {
string inStr = vm["input"].as<string>();
cerr << "inStr=" << inStr << endl;
ifstream *inFile = new ifstream(inStr.c_str());
inStrm = inFile;
}
ostream *outStrm = &cout;
if (vm.count("output")) {
string outStr = vm["output"].as<string>();
cerr << "outStr=" << outStr << endl;
ostream *outFile = new ofstream(outStr.c_str());
outStrm = outFile;
}
vector<string> filterList;
if (vm.count("filter")) {
string filter = vm["filter"].as<string>();
Moses::Tokenize(filterList, filter, ",");
}
string sourceLang = vm["source-language"].as<string>();
string targetLang = vm["target-language"].as<string>();
int revision = vm["revision"].as<int>();
cerr << sourceLang << " " << targetLang << " " << revision << endl;
if (sourceLang == "en" && revision == 2) {
if (vm.count("opennlp") == 0) {
throw "Need path to openNLP toolkit";
}
string openNLPPath = vm["opennlp"].as<string>();
EnOpenNLPChunker chunker(openNLPPath);
chunker.Process(*inStrm, *outStrm, filterList);
}
else {
// process line-by-line
string line;
size_t lineNum = 1;
while (getline(*inStrm, line)) {
//cerr << lineNum << ":" << line << endl;
if (lineNum % 1000 == 0) {
cerr << lineNum << " ";
}
Phrase source = Tokenize(line);
if (revision == 600 ) {
LabelByInitialLetter(source, *outStrm);
}
else if (sourceLang == "de" && targetLang == "en") {
LabelDeEn(source, *outStrm);
}
else if (sourceLang == "en") {
if (revision == 0 || revision == 1) {
EnPhrasalVerb(source, revision, *outStrm);
}
else if (revision == 2) {
string openNLPPath = vm["opennlp-path"].as<string>();
EnOpenNLPChunker chunker(openNLPPath);
}
}
++lineNum;
}
}
cerr << "Finished" << endl;
return EXIT_SUCCESS;
}
// Converts one input line into a Phrase: the line is split into tokens with
// Moses::Tokenize (default delimiters), and every token is split again at
// "|" so that each Word holds the token's individual fields.
Phrase Tokenize(const string &line)
{
  const vector<string> tokens = Moses::Tokenize(line);

  Phrase phrase;
  for (vector<string>::const_iterator tok = tokens.begin(); tok != tokens.end(); ++tok) {
    phrase.push_back(Moses::Tokenize(*tok, "|"));
  }
  return phrase;
}
bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str)
{
pos += offset;
if (pos >= source.size() || pos < 0) {
return false;
}
const string &word = source[pos][factor];
vector<string> soughts = Moses::Tokenize(str, " ");
for (int i = 0; i < soughts.size(); ++i) {
string &sought = soughts[i];
bool found = (word == sought);
if (found) {
return true;
}
}
return false;
}
void OutputWithLabels(const Phrase &source, const Ranges ranges, ostream &out)
{
// output sentence, with labels
for (int pos = 0; pos < source.size(); ++pos) {
// output beginning of label
for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
const Range &range = *iter;
if (range.range.first == pos) {
out << "<tree label=\"" + range.label + "\"> ";
}
}
const Word &word = source[pos];
out << word[0] << " ";
for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
const Range &range = *iter;
if (range.range.second == pos) {
out << "</tree> ";
}
}
}
out << endl;
}

View File

@ -1,27 +0,0 @@
#pragma once
#include <iostream>
#include <vector>
#include <string>
#include <list>
// A word is a list of strings; callers index it by field ("factor") number.
typedef std::vector<std::string> Word;
// A phrase is an ordered sequence of words.
typedef std::vector<Word> Phrase;
// A labelled span of word positions.
struct Range {
  // (start, end) positions exactly as passed to the constructor
  std::pair<int,int> range;
  // label attached to the span
  std::string label;

  // Builds a range covering start..end that carries label `l`.
  Range(int start, int end, const std::string &l)
    : range(std::make_pair(start, end)),
      label(l)
  {
  }
};
// A list of labelled ranges.
typedef std::list<Range> Ranges;
// Returns true if field `factor` of the word at position pos + offset equals
// one of the space-separated values in `str`; returns false when that
// position lies outside the phrase or nothing matches.
bool IsA(const Phrase &source, int pos, int offset, int factor, const std::string &str);
// Writes the phrase to `out` on one line, wrapping each range in
// <tree label="..."> ... </tree> tags.
void OutputWithLabels(const Phrase &source, const Ranges ranges, std::ostream &out);

View File

@ -1,14 +0,0 @@
# Builds the manual-label tool with g++ against the Boost installation that
# lives three directories up (../../../boost).

# Default goal: build the executable.
all: manual-label

# all and clean are commands, not files; declaring them phony keeps them
# working even if a file with one of these names appears in the directory.
.PHONY: all clean

# Remove the object files and the executable.
clean:
	rm -f *.o manual-label

# Compile each .cpp into a .o, with the Boost and Moses roots on the include path.
.cpp.o:
	g++ -I../../../boost/include -I../../../ -O3 -g -c $<

OBJECTS = DeEn.o EnOpenNLPChunker.o EnPhrasalVerb.o Main.o LabelByInitialLetter.o

# Link the executable against zlib and Boost.Program_options.
manual-label: $(OBJECTS)
	g++ $(OBJECTS) -L../../../boost/lib64 -lz -lboost_program_options-mt -o manual-label

View File

@ -1,131 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="manual-label" InternalType="Console">
<Plugins>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="manual-label">
<File Name="DeEn.cpp"/>
<File Name="DeEn.h"/>
<File Name="EnOpenNLPChunker.cpp"/>
<File Name="EnOpenNLPChunker.h"/>
<File Name="EnPhrasalVerb.cpp"/>
<File Name="EnPhrasalVerb.h"/>
<File Name="LabelByInitialLetter.cpp"/>
<File Name="LabelByInitialLetter.h"/>
<File Name="Main.cpp"/>
<File Name="Main.h"/>
</VirtualDirectory>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
<IncludePath Value="."/>
</Compiler>
<Linker Options="">
<LibraryPath Value="."/>
</Linker>
<ResourceCompiler Options=""/>
</GlobalSettings>
<Configuration Name="Debug" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Executable" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<!-- Include and library paths are relative to the project directory (the same ../../.. layout the Makefile in this directory uses) so the project builds from any checkout location. -->
<Compiler Options="-g;-O0;-Wall" C_Options="-g;-O0;-Wall" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../boost/include"/>
</Compiler>
<Linker Options="" Required="yes">
<LibraryPath Value="../../../boost/lib64"/>
<Library Value="boost_program_options"/>
<Library Value="boost_filesystem"/>
<Library Value="boost_system"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
<![CDATA[]]>
</Environment>
<Debugger IsRemote="no" RemoteHostName="" RemoteHostPort="" DebuggerPath="" IsExtended="no">
<DebuggerSearchPaths/>
<PostConnectCommands/>
<StartupCommands/>
</Debugger>
<PreBuild/>
<PostBuild/>
<CustomBuild Enabled="no">
<RebuildCommand/>
<CleanCommand/>
<BuildCommand/>
<PreprocessFileCommand/>
<SingleFileCommand/>
<MakefileGenerationCommand/>
<ThirdPartyToolName>None</ThirdPartyToolName>
<WorkingDirectory/>
</CustomBuild>
<AdditionalRules>
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
<SearchPaths/>
</Completion>
</Configuration>
<Configuration Name="Release" CompilerType="GCC" DebuggerType="LLDB Debugger" Type="Executable" BuildCmpWithGlobalSettings="append" BuildLnkWithGlobalSettings="append" BuildResWithGlobalSettings="append">
<!-- Release needs the same Moses/Boost include paths and Boost libraries as Debug; paths are relative to the project directory. -->
<Compiler Options="-O2;-Wall" C_Options="-O2;-Wall" Assembler="" Required="yes" PreCompiledHeader="" PCHInCommandLine="no" PCHFlags="" PCHFlagsPolicy="0">
<IncludePath Value="."/>
<IncludePath Value="../../.."/>
<IncludePath Value="../../../boost/include"/>
<Preprocessor Value="NDEBUG"/>
</Compiler>
<Linker Options="" Required="yes">
<LibraryPath Value="../../../boost/lib64"/>
<Library Value="boost_program_options"/>
<Library Value="boost_filesystem"/>
<Library Value="boost_system"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Release" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
<![CDATA[]]>
</Environment>
<Debugger IsRemote="no" RemoteHostName="" RemoteHostPort="" DebuggerPath="" IsExtended="no">
<DebuggerSearchPaths/>
<PostConnectCommands/>
<StartupCommands/>
</Debugger>
<PreBuild/>
<PostBuild/>
<CustomBuild Enabled="no">
<RebuildCommand/>
<CleanCommand/>
<BuildCommand/>
<PreprocessFileCommand/>
<SingleFileCommand/>
<MakefileGenerationCommand/>
<ThirdPartyToolName>None</ThirdPartyToolName>
<WorkingDirectory/>
</CustomBuild>
<AdditionalRules>
<CustomPostBuild/>
<CustomPreBuild/>
</AdditionalRules>
<Completion EnableCpp11="no">
<ClangCmpFlagsC/>
<ClangCmpFlags/>
<ClangPP/>
<SearchPaths/>
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -11,11 +11,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -79,12 +79,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1911984684" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -220,6 +220,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ConfusionNet.h</locationURI>
</link>
<link>
<name>ContextParameters.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.cpp</locationURI>
</link>
<link>
<name>ContextParameters.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.h</locationURI>
</link>
<link>
<name>DecodeGraph.cpp</name>
<type>1</type>