perl to cpp

This commit is contained in:
Hieu Hoang 2012-10-24 17:53:11 +01:00
parent 7a71270be2
commit ae97ddc9fe
6 changed files with 436 additions and 8 deletions

View File

@ -0,0 +1,122 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?>
<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.569802710">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.569802710" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.569802710" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.569802710." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.422772512" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.1447685366" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/create_xml/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1871678557" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1541545093" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.234046731" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1525226387" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.282147302" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.535500763" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../moses/src/&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/..&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1320109988" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1559640535" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.1867428644" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.exe.debug.option.debugging.level.48702874" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.163776365" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.270894313" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.409137982" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.942524688" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1378600581" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.2103140635" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.exe.release.1826515899">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1826515899" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1826515899" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.exe.release.1826515899." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1314265647" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.403224019" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
<builder buildPath="${workspace_loc:/create_xml/Release}" id="cdt.managedbuild.target.gnu.builder.exe.release.2097980559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1658879977" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1893164577" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
<option id="gnu.cpp.compiler.exe.release.option.optimization.level.365325621" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.release.option.debugging.level.1765282174" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1061975067" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1035306257" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.538092359" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.exe.release.option.debugging.level.2144229318" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1988934768" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.1472283986" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.682000823" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1477911614" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.1061449370" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1724587470" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="create_xml.cdt.managedbuild.target.gnu.exe.2121157639" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1826515899;cdt.managedbuild.config.gnu.exe.release.1826515899.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1893164577;cdt.managedbuild.tool.gnu.cpp.compiler.input.1061975067">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.569802710;cdt.managedbuild.config.gnu.exe.debug.569802710.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1559640535;cdt.managedbuild.tool.gnu.c.compiler.input.163776365">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.569802710;cdt.managedbuild.config.gnu.exe.debug.569802710.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.234046731;cdt.managedbuild.tool.gnu.cpp.compiler.input.1320109988">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1826515899;cdt.managedbuild.config.gnu.exe.release.1826515899.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1035306257;cdt.managedbuild.tool.gnu.c.compiler.input.1988934768">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<!-- The refresh scope must name this project (create_xml), matching the builder paths and project id used elsewhere in this file. -->
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/create_xml"/>
</storageModule>
</cproject>

View File

@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>create_xml</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
<dictionary>
<key>?name?</key>
<value></value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.append_environment</key>
<value>true</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.autoBuildTarget</key>
<value>all</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.buildArguments</key>
<value></value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.buildCommand</key>
<value>make</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.buildLocation</key>
<value>${workspace_loc:/create_xml/Debug}</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
<value>clean</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.contents</key>
<value>org.eclipse.cdt.make.core.activeConfigSettings</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.enableAutoBuild</key>
<value>false</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.enableCleanBuild</key>
<value>true</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.enableFullBuild</key>
<value>true</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.fullBuildTarget</key>
<value>all</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.stopOnError</key>
<value>true</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
<value>true</value>
</dictionary>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

View File

@ -0,0 +1,47 @@
#include <cassert>
#include <vector>
#include "Alignments.h"
#include "Util.h"
using namespace std;
using namespace Moses;
/**
 * Parse an alignment string into the source->target and target->source tables.
 *
 * @param str         alignment points separated by spaces; every point is two
 *                    integers joined by '-', read here as "source-target"
 * @param sourceSize  number of slots allocated in m_alignS2T
 * @param targetSize  number of slots allocated in m_alignT2S
 *
 * m_alignS2T[s] maps every target index aligned to source index s to a
 * counter, and m_alignT2S[t] does the same in the other direction. The counter
 * is 0 the first time a point is seen and is incremented on every repetition.
 */
Alignments::Alignments(const std::string &str, size_t sourceSize, size_t targetSize)
  :m_alignS2T(sourceSize)
  ,m_alignT2S(targetSize)
{
  vector<string> toks = Tokenize(str, " ");
  for (size_t i = 0; i < toks.size(); ++i) {
    const string &tok = toks[i];

    vector<int> point = Tokenize<int>(tok, "-");
    assert(point.size() == 2);

    // Both indices subscript the tables directly, so they must fit the sizes
    // the caller asked for.
    assert(point[0] >= 0 && static_cast<size_t>(point[0]) < m_alignS2T.size());
    assert(point[1] >= 0 && static_cast<size_t>(point[1]) < m_alignT2S.size());

    std::map<int, int>::iterator iter;

    // source -> target
    std::map<int, int> &targets = m_alignS2T[ point[0] ];
    iter = targets.find(point[1]);
    if (iter == targets.end()) {
      targets[ point[1] ] = 0;
    }
    else {
      ++(iter->second);
    }

    // target -> source
    std::map<int, int> &sources = m_alignT2S[ point[1] ];
    iter = sources.find(point[0]);
    // The iterator comes from `sources`, so it has to be tested against
    // sources.end(); comparing it with the end of another map is not valid.
    if (iter == sources.end()) {
      sources[ point[0] ] = 0;
    }
    else {
      ++(iter->second);
    }
  }
}

View File

@ -0,0 +1,20 @@
#pragma once
#include <string>
#include <vector>
#include <map>
/**
 * Word alignment between two token sequences, stored in both directions.
 *
 * Each table has one slot per token position; a slot maps the aligned
 * position on the other side to an integer counter.
 */
class Alignments
{
public:
  /// Fills both tables from the alignment string `str`; the tables are
  /// created with `sourceSize` and `targetSize` slots respectively.
  Alignments(const std::string &str, size_t sourceSize, size_t targetSize);

  /// Indexed by source position; keys are target positions.
  std::vector< std::map<int, int> > m_alignS2T;

  /// Indexed by target position; keys are source positions.
  std::vector< std::map<int, int> > m_alignT2S;
};

View File

@ -1,2 +1,2 @@
g++ -I ../../moses/src/ -I ../../ create_xml.cpp
g++ -I ../../moses/src/ -I ../../ create_xml.cpp Alignments.cpp

View File

@ -5,6 +5,7 @@
#include <vector>
#include <string>
#include "Util.h"
#include "Alignments.h"
using namespace std;
using namespace Moses;
@ -23,7 +24,7 @@ int main(int argc, char **argv)
int setenceId;
float score;
string source, target, alignment, path;
string source, target, align, path;
string *input = NULL;
int count;
@ -62,21 +63,23 @@ int main(int argc, char **argv)
++step;
break;
case 5:
alignment = inLine;
align = inLine;
++step;
break;
case 6:
path = inLine;
path = inLine + "X";
++step;
break;
case 7:
count = Scan<int>(inLine);
++step;
createXML(source, *input, target, align, path);
step = 0;
break;
}
createXML(source, *input, target, alignment, path);
}
delete input;
@ -86,7 +89,160 @@ int main(int argc, char **argv)
}
void createXML(const string &source, const string &input, const string &targets, const string &aligns, const string &path)
{
void createXML(const string &source, const string &input, const string &target, const string &align, const string &path)
{
vector<string> sourceToks = Tokenize(source, " ")
,inputToks = Tokenize(input, " ")
,targetsToks = Tokenize(target, " ");
Alignments alignments(align, sourceToks.size(), targetsToks.size());
map<int, string> frameInput;
map<int, int> alignI2S;
vector< pair<int, int> > nonTerms;
vector<bool> targetBitmap(targetsToks.size(), true);
vector<bool> inputBitmap;
// STEP 1: FIND MISMATCHES
int s = 0, i = 0;
bool currently_matching = false;
int start_s = 0, start_i = 0;
cerr << input << endl << source << endl << target << endl << path << endl;
for ( size_t p = 0 ; p < path.length() ; p++ )
{
string action = path.substr(p, 1);
// beginning of a mismatch
if ( currently_matching && action != "M" && action != "X" )
{
start_i = i;
start_s = s;
currently_matching = 0;
} // if ( currently_matching
// end of a mismatch
else if ( !currently_matching && ( action == "M" || action == "X" ) )
{
// remove use of affected target words
for ( int ss = start_s ; ss < s ; ss++ ) {
const std::map<int, int> &targets = alignments.m_alignS2T[ss];
std::map<int, int>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t tt = iter->first;
targetBitmap[tt] = 0;
}
// also remove enclosed unaligned words?
} //for ( int ss = start_s ; ss < s ; ss++ ) {
// are there input words that need to be inserted ?
cerr << start_i << "<" << i << "?" << endl;
if (start_i < i ) {
// take note of input words to be inserted
string insertion = "";
for (size_t ii = start_i ; ii < i ; ii++ ) {
insertion += inputToks[ii] + " ";
}
// find position for inserted input words
// find first removed target word
int start_t = 1000;
for ( int ss = start_s ; ss < s ; ss++ ) {
const std::map<int, int> &targets = alignments.m_alignS2T[ss];
std::map<int, int>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t tt = iter->first;
if (tt < start_t) {
start_t = tt;
}
}
// end of sentence? add to end
if ( start_t == 1000 && i > inputToks.size() - 1 ) {
start_t = targetsToks.size() - 1;
}
// backtrack to previous words if unaligned
if ( start_t == 1000 ) {
start_t = -1;
for ( int ss = s - 1 ; start_t == -1 && ss >= 0 ; ss-- ) {
const std::map<int, int> &targets = alignments.m_alignS2T[ss];
std::map<int, int>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t tt = iter->first;
if (tt > start_t) {
start_t = tt;
}
}
}
} // if ( start_t == 1000 ) {
frameInput[start_t] += insertion;
pair<int, int> nt(start_t, start_i);
nonTerms.push_back(nt);
}
currently_matching = 1;
} // if (start_i < i ) {
cerr << action << " " << s << " " << i
<< "(" << start_s << " " << start_i << ")"
<< currently_matching;
if ( action != "I" ) {
cerr << " ->";
const std::map<int, int> &targets = alignments.m_alignS2T[s];
std::map<int, int>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t tt = iter->first;
cerr << " " << tt;
}
}
cerr << endl;
if (action != "I")
s++;
if (action != "D") {
i++;
alignI2S[i] = s;
}
if (action == "M") {
inputBitmap.push_back(1);
}
else if (action == "I" || action == "S") {
inputBitmap.push_back(0);
}
} // else if ( !currently_matching
cerr << target << endl;
for (size_t i = 0; i < targetBitmap.size(); ++i)
cerr << targetBitmap[i];
cerr << endl;
map<int, string>::const_iterator iter;
for (iter = frameInput.begin(); iter != frameInput.end(); ++iter) {
cerr << iter->first << ":" <<iter->second << endl;
}
// STEP 2: BUILD RULE AND FRAME
// hierarchical rule
string rule_s = "";
int rule_pos_s = 0;
//my %RULE_ALIGNMENT_S;
} // for ( size_t p = 0
}