mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
perl to cpp
This commit is contained in:
parent
7a71270be2
commit
ae97ddc9fe
122
scripts/fuzzy-match/.cproject
Normal file
122
scripts/fuzzy-match/.cproject
Normal file
@ -0,0 +1,122 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<?fileVersion 4.0.0?>
|
||||
|
||||
<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<storageModule moduleId="org.eclipse.cdt.core.settings">
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.569802710">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.569802710" moduleId="org.eclipse.cdt.core.settings" name="Debug">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.569802710" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
|
||||
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.569802710." name="/" resourcePath="">
|
||||
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.422772512" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
|
||||
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.1447685366" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
|
||||
<builder buildPath="${workspace_loc:/create_xml/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1871678557" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1541545093" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.234046731" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
|
||||
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1525226387" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.282147302" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.include.paths.535500763" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
|
||||
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../moses/src/&quot;"/>
|
||||
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/..&quot;"/>
|
||||
</option>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1320109988" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1559640535" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
|
||||
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.1867428644" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
|
||||
<option id="gnu.c.compiler.exe.debug.option.debugging.level.48702874" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.163776365" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.270894313" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.409137982" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.942524688" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.1378600581" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.2103140635" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.exe.release.1826515899">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1826515899" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1826515899" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
|
||||
<folderInfo id="cdt.managedbuild.config.gnu.exe.release.1826515899." name="/" resourcePath="">
|
||||
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.1314265647" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
|
||||
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.403224019" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
|
||||
<builder buildPath="${workspace_loc:/create_xml/Release}" id="cdt.managedbuild.target.gnu.builder.exe.release.2097980559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1658879977" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1893164577" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
|
||||
<option id="gnu.cpp.compiler.exe.release.option.optimization.level.365325621" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.exe.release.option.debugging.level.1765282174" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1061975067" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.1035306257" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
|
||||
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.538092359" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
|
||||
<option id="gnu.c.compiler.exe.release.option.debugging.level.2144229318" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1988934768" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.1472283986" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.682000823" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1477911614" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.1061449370" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1724587470" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<project id="create_xml.cdt.managedbuild.target.gnu.exe.2121157639" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
|
||||
</storageModule>
|
||||
<storageModule moduleId="scannerConfiguration">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1826515899;cdt.managedbuild.config.gnu.exe.release.1826515899.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1893164577;cdt.managedbuild.tool.gnu.cpp.compiler.input.1061975067">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.569802710;cdt.managedbuild.config.gnu.exe.debug.569802710.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1559640535;cdt.managedbuild.tool.gnu.c.compiler.input.163776365">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.569802710;cdt.managedbuild.config.gnu.exe.debug.569802710.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.234046731;cdt.managedbuild.tool.gnu.cpp.compiler.input.1320109988">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1826515899;cdt.managedbuild.config.gnu.exe.release.1826515899.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1035306257;cdt.managedbuild.tool.gnu.c.compiler.input.1988934768">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
|
||||
</scannerConfigBuildInfo>
|
||||
</storageModule>
|
||||
<storageModule moduleId="refreshScope" versionNumber="1">
|
||||
<resource resourceType="PROJECT" workspacePath="/fuzzy-match"/>
|
||||
</storageModule>
|
||||
</cproject>
|
83
scripts/fuzzy-match/.project
Normal file
83
scripts/fuzzy-match/.project
Normal file
@ -0,0 +1,83 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<projectDescription>
|
||||
<name>create_xml</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
|
||||
<triggers>clean,full,incremental,</triggers>
|
||||
<arguments>
|
||||
<dictionary>
|
||||
<key>?name?</key>
|
||||
<value></value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.append_environment</key>
|
||||
<value>true</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.autoBuildTarget</key>
|
||||
<value>all</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.buildArguments</key>
|
||||
<value></value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.buildCommand</key>
|
||||
<value>make</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.buildLocation</key>
|
||||
<value>${workspace_loc:/create_xml/Debug}</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
|
||||
<value>clean</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.contents</key>
|
||||
<value>org.eclipse.cdt.make.core.activeConfigSettings</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.enableAutoBuild</key>
|
||||
<value>false</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.enableCleanBuild</key>
|
||||
<value>true</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.enableFullBuild</key>
|
||||
<value>true</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.fullBuildTarget</key>
|
||||
<value>all</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.stopOnError</key>
|
||||
<value>true</value>
|
||||
</dictionary>
|
||||
<dictionary>
|
||||
<key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
|
||||
<value>true</value>
|
||||
</dictionary>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
|
||||
<triggers>full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.eclipse.cdt.core.cnature</nature>
|
||||
<nature>org.eclipse.cdt.core.ccnature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
|
||||
</natures>
|
||||
</projectDescription>
|
47
scripts/fuzzy-match/Alignments.cpp
Normal file
47
scripts/fuzzy-match/Alignments.cpp
Normal file
@ -0,0 +1,47 @@
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include "Alignments.h"
|
||||
#include "Util.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace Moses;
|
||||
|
||||
/**
 * Build the source->target and target->source alignment tables from an
 * alignment string.
 *
 * @param str         alignment points separated by single spaces, each point
 *                    written as "<sourceIndex>-<targetIndex>"
 *                    (tokenised with Tokenize() from Util.h).
 * @param sourceSize  number of slots allocated in m_alignS2T.
 * @param targetSize  number of slots allocated in m_alignT2S.
 *
 * For every point (s, t) an entry t is recorded in m_alignS2T[s] and an entry
 * s in m_alignT2S[t].  The mapped value starts at 0 the first time a pair is
 * seen and is incremented each time the same pair is seen again.
 *
 * Malformed points and out-of-range indices are programming/data errors and
 * are caught by assert() (debug builds only).
 */
Alignments::Alignments(const std::string &str, size_t sourceSize, size_t targetSize)
  :m_alignS2T(sourceSize)
  ,m_alignT2S(targetSize)
{
  vector<string> toks = Tokenize(str, " ");
  for (size_t i = 0; i < toks.size(); ++i) {
    string &tok = toks[i];

    vector<int> point = Tokenize<int>(tok, "-");
    assert(point.size() == 2);

    const int sourcePos = point[0];
    const int targetPos = point[1];

    // The positions are used as vector indices below; reject anything that
    // would read or write outside the tables.
    assert(sourcePos >= 0 && static_cast<size_t>(sourcePos) < m_alignS2T.size());
    assert(targetPos >= 0 && static_cast<size_t>(targetPos) < m_alignT2S.size());

    std::map<int, int>::iterator iter;

    // source -> target
    std::map<int, int> &targets = m_alignS2T[sourcePos];
    iter = targets.find(targetPos);
    if (iter == targets.end()) {
      targets[targetPos] = 0;
    } else {
      ++(iter->second);
    }

    // target -> source
    std::map<int, int> &sources = m_alignT2S[targetPos];
    iter = sources.find(sourcePos);
    // Must be compared against sources.end(): the iterator came from
    // 'sources', and comparing it with another container's end() is
    // undefined behaviour.
    if (iter == sources.end()) {
      sources[sourcePos] = 0;
    } else {
      ++(iter->second);
    }
  }
}
|
||||
|
||||
|
20
scripts/fuzzy-match/Alignments.h
Normal file
20
scripts/fuzzy-match/Alignments.h
Normal file
@ -0,0 +1,20 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
/**
 * Word-alignment tables in both directions.
 *
 * Both tables are vectors of maps; the constructor sizes them from its
 * sourceSize / targetSize arguments and fills them from the alignment string.
 */
class Alignments
{
public:
  // Parses the alignment string 'str'; the definition lives in Alignments.cpp.
  Alignments(const std::string &str, size_t sourceSize, size_t targetSize);

  // Indexed by source position; each map is keyed by aligned target position.
  std::vector< std::map<int, int> > m_alignS2T;

  // Indexed by target position; each map is keyed by aligned source position.
  std::vector< std::map<int, int> > m_alignT2S;
};
|
||||
|
||||
|
||||
|
@ -1,2 +1,2 @@
|
||||
g++ -I ../../moses/src/ -I ../../ create_xml.cpp
|
||||
g++ -I ../../moses/src/ -I ../../ create_xml.cpp Alignments.cpp
|
||||
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "Util.h"
|
||||
#include "Alignments.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace Moses;
|
||||
@ -23,7 +24,7 @@ int main(int argc, char **argv)
|
||||
|
||||
int setenceId;
|
||||
float score;
|
||||
string source, target, alignment, path;
|
||||
string source, target, align, path;
|
||||
string *input = NULL;
|
||||
int count;
|
||||
|
||||
@ -62,21 +63,23 @@ int main(int argc, char **argv)
|
||||
++step;
|
||||
break;
|
||||
case 5:
|
||||
alignment = inLine;
|
||||
align = inLine;
|
||||
++step;
|
||||
break;
|
||||
case 6:
|
||||
path = inLine;
|
||||
path = inLine + "X";
|
||||
++step;
|
||||
break;
|
||||
case 7:
|
||||
count = Scan<int>(inLine);
|
||||
++step;
|
||||
createXML(source, *input, target, align, path);
|
||||
|
||||
step = 0;
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
createXML(source, *input, target, alignment, path);
|
||||
|
||||
}
|
||||
|
||||
delete input;
|
||||
@ -86,7 +89,160 @@ int main(int argc, char **argv)
|
||||
|
||||
}
|
||||
|
||||
void createXML(const string &source, const string &input, const string &targets, const string &aligns, const string &path)
|
||||
{
|
||||
|
||||
void createXML(const string &source, const string &input, const string &target, const string &align, const string &path)
|
||||
{
|
||||
vector<string> sourceToks = Tokenize(source, " ")
|
||||
,inputToks = Tokenize(input, " ")
|
||||
,targetsToks = Tokenize(target, " ");
|
||||
Alignments alignments(align, sourceToks.size(), targetsToks.size());
|
||||
map<int, string> frameInput;
|
||||
map<int, int> alignI2S;
|
||||
vector< pair<int, int> > nonTerms;
|
||||
vector<bool> targetBitmap(targetsToks.size(), true);
|
||||
vector<bool> inputBitmap;
|
||||
|
||||
// STEP 1: FIND MISMATCHES
|
||||
int s = 0, i = 0;
|
||||
bool currently_matching = false;
|
||||
int start_s = 0, start_i = 0;
|
||||
|
||||
cerr << input << endl << source << endl << target << endl << path << endl;
|
||||
for ( size_t p = 0 ; p < path.length() ; p++ )
|
||||
{
|
||||
string action = path.substr(p, 1);
|
||||
|
||||
// beginning of a mismatch
|
||||
if ( currently_matching && action != "M" && action != "X" )
|
||||
{
|
||||
start_i = i;
|
||||
start_s = s;
|
||||
currently_matching = 0;
|
||||
} // if ( currently_matching
|
||||
// end of a mismatch
|
||||
else if ( !currently_matching && ( action == "M" || action == "X" ) )
|
||||
{
|
||||
|
||||
// remove use of affected target words
|
||||
for ( int ss = start_s ; ss < s ; ss++ ) {
|
||||
const std::map<int, int> &targets = alignments.m_alignS2T[ss];
|
||||
|
||||
std::map<int, int>::const_iterator iter;
|
||||
for (iter = targets.begin(); iter != targets.end(); ++iter) {
|
||||
size_t tt = iter->first;
|
||||
targetBitmap[tt] = 0;
|
||||
}
|
||||
|
||||
// also remove enclosed unaligned words?
|
||||
} //for ( int ss = start_s ; ss < s ; ss++ ) {
|
||||
|
||||
// are there input words that need to be inserted ?
|
||||
cerr << start_i << "<" << i << "?" << endl;
|
||||
if (start_i < i ) {
|
||||
|
||||
// take note of input words to be inserted
|
||||
string insertion = "";
|
||||
for (size_t ii = start_i ; ii < i ; ii++ ) {
|
||||
insertion += inputToks[ii] + " ";
|
||||
}
|
||||
|
||||
// find position for inserted input words
|
||||
|
||||
// find first removed target word
|
||||
int start_t = 1000;
|
||||
for ( int ss = start_s ; ss < s ; ss++ ) {
|
||||
const std::map<int, int> &targets = alignments.m_alignS2T[ss];
|
||||
|
||||
std::map<int, int>::const_iterator iter;
|
||||
for (iter = targets.begin(); iter != targets.end(); ++iter) {
|
||||
size_t tt = iter->first;
|
||||
if (tt < start_t) {
|
||||
start_t = tt;
|
||||
}
|
||||
}
|
||||
|
||||
// end of sentence? add to end
|
||||
if ( start_t == 1000 && i > inputToks.size() - 1 ) {
|
||||
start_t = targetsToks.size() - 1;
|
||||
}
|
||||
|
||||
// backtrack to previous words if unaligned
|
||||
if ( start_t == 1000 ) {
|
||||
start_t = -1;
|
||||
for ( int ss = s - 1 ; start_t == -1 && ss >= 0 ; ss-- ) {
|
||||
const std::map<int, int> &targets = alignments.m_alignS2T[ss];
|
||||
|
||||
std::map<int, int>::const_iterator iter;
|
||||
for (iter = targets.begin(); iter != targets.end(); ++iter) {
|
||||
size_t tt = iter->first;
|
||||
if (tt > start_t) {
|
||||
start_t = tt;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // if ( start_t == 1000 ) {
|
||||
|
||||
frameInput[start_t] += insertion;
|
||||
pair<int, int> nt(start_t, start_i);
|
||||
nonTerms.push_back(nt);
|
||||
}
|
||||
|
||||
currently_matching = 1;
|
||||
|
||||
} // if (start_i < i ) {
|
||||
|
||||
cerr << action << " " << s << " " << i
|
||||
<< "(" << start_s << " " << start_i << ")"
|
||||
<< currently_matching;
|
||||
|
||||
if ( action != "I" ) {
|
||||
cerr << " ->";
|
||||
|
||||
const std::map<int, int> &targets = alignments.m_alignS2T[s];
|
||||
|
||||
std::map<int, int>::const_iterator iter;
|
||||
for (iter = targets.begin(); iter != targets.end(); ++iter) {
|
||||
size_t tt = iter->first;
|
||||
cerr << " " << tt;
|
||||
}
|
||||
}
|
||||
cerr << endl;
|
||||
|
||||
if (action != "I")
|
||||
s++;
|
||||
if (action != "D") {
|
||||
i++;
|
||||
alignI2S[i] = s;
|
||||
}
|
||||
|
||||
if (action == "M") {
|
||||
inputBitmap.push_back(1);
|
||||
}
|
||||
else if (action == "I" || action == "S") {
|
||||
inputBitmap.push_back(0);
|
||||
}
|
||||
} // else if ( !currently_matching
|
||||
|
||||
cerr << target << endl;
|
||||
for (size_t i = 0; i < targetBitmap.size(); ++i)
|
||||
cerr << targetBitmap[i];
|
||||
cerr << endl;
|
||||
|
||||
map<int, string>::const_iterator iter;
|
||||
for (iter = frameInput.begin(); iter != frameInput.end(); ++iter) {
|
||||
cerr << iter->first << ":" <<iter->second << endl;
|
||||
}
|
||||
|
||||
// STEP 2: BUILD RULE AND FRAME
|
||||
|
||||
// hierarchical rule
|
||||
string rule_s = "";
|
||||
int rule_pos_s = 0;
|
||||
//my %RULE_ALIGNMENT_S;
|
||||
|
||||
|
||||
} // for ( size_t p = 0
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user