merged master into dynamic-models and solved conflicts

This commit is contained in:
Nicola Bertoldi 2014-04-28 19:18:38 +02:00
commit 20381cbf89
339 changed files with 19727 additions and 3681 deletions

3
.gitignore vendored
View File

@ -76,3 +76,6 @@ mert/sentence-bleu
build/
nbproject/
mingw/MosesGUI/MosesGUI.e4p
mingw/MosesGUI/_eric4project/

30
Jamroot
View File

@ -64,6 +64,8 @@
#
# --max-factors maximum number of factors (default 4)
#
# --unlabelled-source ignore source labels (redundant in hiero or string-to-tree system)
# for better performance
#CONTROLLING THE BUILD
#-a to build from scratch
#-j$NCPUS to compile in parallel
@ -89,7 +91,7 @@ if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_min
requirements += <library>tcmalloc_and_profiler <library>unwind <cflags>-fno-omit-frame-pointer <cxxflags>-fno-omit-frame-pointer ;
} else {
external-lib tcmalloc_minimal ;
requirements += <threading>multi:<library>$(tcmalloc_minimal) ;
requirements += <threading>multi:<library>tcmalloc_minimal ;
}
} else {
echo "Tip: install tcmalloc for faster threading. See BUILD-INSTRUCTIONS.txt for more information." ;
@ -108,6 +110,9 @@ if [ option.get "enable-mpi" : : "yes" ] {
requirements += [ option.get "notrace" : <define>TRACE_ENABLE=1 ] ;
requirements += [ option.get "enable-boost-pool" : : <define>USE_BOOST_POOL ] ;
requirements += [ option.get "with-mm" : : <define>PT_UG ] ;
requirements += [ option.get "with-mm" : : <define>MAX_NUM_FACTORS=4 ] ;
requirements += [ option.get "unlabelled-source" : : <define>UNLABELLED_SOURCE ] ;
if [ option.get "with-cmph" ] {
requirements += <define>HAVE_CMPH ;
@ -137,6 +142,23 @@ project : requirements
#Add directories here if you want their incidental targets too (i.e. tests).
build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses-chart-cmd mira scripts regression-testing ;
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
moses/TranslationModel/UG/mm//symal2mam
moses/TranslationModel/UG/mm//custom-pt
moses/TranslationModel/UG/mm//mmlex-build
moses/TranslationModel/UG/mm//mtt-count-words
moses/TranslationModel/UG//try-align
;
}
else
{
alias mm ;
}
alias programs :
lm//programs
moses-chart-cmd//moses_chart
@ -154,12 +176,10 @@ phrase-extract//pcfg-score
biconcor
mira//mira
contrib/server//mosesserver
#moses/mm//mtt-build
#moses/mm//mtt-dump
#moses/mm//symal2mam
#moses/mm//custom-pt
mm
;
install-bin-libs programs ;
install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
install-headers headers-moses : moses//headers-to-install : moses ;

View File

@ -127,14 +127,14 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} else {
switch (stage) {
case 0: {
WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1);
if (w != NULL)
out->AddWord(w);
break;
}
case 1: {
Tokenize(targetPhrase, tok, false, true, onDiskWrapper);
Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0);
break;
}
case 2: {
@ -189,8 +189,9 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper)
, OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget)
{
// retSourceTarget: 0 = don't return anything. 1 = source, 2 = target
bool nonTerm = false;
size_t tokSize = token.size();
@ -218,6 +219,10 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
WordPtr word(new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
if (retSourceTarget == 1) {
out = word;
}
}
wordStr = token.substr(splitPos, tokSize - splitPos);
@ -225,7 +230,10 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
WordPtr word(new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
out = word;
if (retSourceTarget == 2) {
out = word;
}
}
}

View File

@ -27,7 +27,7 @@ typedef std::vector<AlignPair> AlignType;
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper);
, OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget);
OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
, char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
, int numScores

View File

@ -109,7 +109,7 @@ void Word::ConvertToMoses(
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
overwrite.SetFactor(*t, factorColl.AddFactor(*tok));
overwrite.SetFactor(*t, factorColl.AddFactor(*tok, m_isNonTerminal));
}
UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
}

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -72,13 +72,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -0,0 +1,133 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.456080129" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.582801917" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-mixed-syntax/Debug}" id="cdt.managedbuild.builder.gnu.cross.1220166455" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.2055012191" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1768196213" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2007889843" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.855436310" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.506549229" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1497326561" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.606353571" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.740521305" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1946120010" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.1563475751" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.106010037" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.136661991" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.2112208574" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.172930211" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.715007893">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.715007893." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.99436307" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.801178939" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-mixed-syntax/Release}" id="cdt.managedbuild.builder.gnu.cross.1999547547" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1481537766" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1967527847" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.442342681" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1847950300" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.1130138972" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.158429528" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.2020667840" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1372779734" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.371006952" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1770045040" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.707592414" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="extract-mixed-syntax.cdt.managedbuild.target.gnu.cross.exe.1868010260" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038;cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906;cdt.managedbuild.tool.gnu.c.compiler.input.442342681">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915;cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568;cdt.managedbuild.tool.gnu.c.compiler.input.2007889843">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>extract-mixed-syntax</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

View File

@ -0,0 +1,37 @@
/*
* Global.cpp
* extract
*
* Created by Hieu Hoang on 01/02/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "Global.h"
bool g_debug = false;
// Builds the default settings object.
//
// The initialisers are listed in the same order as the members are declared
// in Global.h. C++ always initialises members in declaration order no matter
// how the list is written, so the values are unchanged; matching the order
// only removes the -Wreorder diagnostic and makes the list read the way it
// actually executes.
Global::Global()
  : minHoleSpanSourceDefault(2)
  , maxHoleSpanSourceDefault(7)
  , minHoleSpanSourceSyntax(1)
  , maxHoleSpanSourceSyntax(1000)
  , maxSymbols(5)
  , glueGrammarFlag(false)
  , unknownWordLabelFlag(false)
  , maxNonTerm(3)
  , maxNonTermDefault(2)
  , sourceSyntax(true)
  , targetSyntax(false)
  , mixed(true)
  , maxUnaligned(5)
  , uppermostOnly(true)
  , allowDefaultNonTermEdge(true)
  , gzOutput(false)
  // Entries that were already commented out in the original list:
  //   int minHoleSize(1)
  //   int minSubPhraseSize(1) // minimum size of a remaining lexical phrase
  //   bool zipFiles(false)
{}

View File

@ -0,0 +1,45 @@
#pragma once
/*
* Global.h
* extract
*
* Created by Hieu Hoang on 01/02/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <map>
#include <string>
// Plain holder for the run-time settings. Every data member is public and
// receives its default value in Global::Global() (see Global.cpp).
// Members are initialised in the order they are declared below.
class Global
{
public:
// Bounds on the source-side hole span: "Default" vs. "Syntax" variants.
// Lattice::CreateArcsUsing1Hole picks the Syntax maximum when either tree
// node reports IsSyntax(), otherwise the Default maximum.
int minHoleSpanSourceDefault;
int maxHoleSpanSourceDefault;
int minHoleSpanSourceSyntax;
int maxHoleSpanSourceSyntax;
int maxSymbols;
bool glueGrammarFlag;
bool unknownWordLabelFlag;
int maxNonTerm;
int maxNonTermDefault;
bool sourceSyntax;
bool targetSyntax;
bool mixed;
int maxUnaligned;
bool uppermostOnly;
bool allowDefaultNonTermEdge;
bool gzOutput;
// Sets every member above to its default.
Global();
// Declared only; no definition is visible in Global.cpp.
// NOTE(review): presumably left undefined on purpose to block copying -- confirm.
Global(const Global&);
};
extern bool g_debug;
#define DEBUG_OUTPUT() void DebugOutput() const;

View File

@ -0,0 +1,62 @@
// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "InputFileStream.h"
#include "gzfilebuf.h"
#include <iostream>
using namespace std;
namespace Moses
{
// Opens filePath for reading. A path ending in ".gz" (and longer than the
// suffix itself) is read through a gzfilebuf; anything else goes through a
// std::filebuf. If the plain file cannot be opened, a message is written to
// stderr and the process exits with status 1.
InputFileStream::InputFileStream(const std::string &filePath)
  : std::istream(NULL)
  , m_streambuf(NULL)
{
  const std::string gzSuffix(".gz");
  const bool looksGzipped =
    filePath.size() > gzSuffix.size() &&
    filePath.compare(filePath.size() - gzSuffix.size(), gzSuffix.size(), gzSuffix) == 0;

  if (looksGzipped) {
    m_streambuf = new gzfilebuf(filePath.c_str());
  } else {
    std::filebuf *plainBuf = new std::filebuf();
    // open() hands back the buffer itself on success and NULL on failure
    plainBuf = plainBuf->open(filePath.c_str(), std::ios::in);
    if (plainBuf == NULL) {
      cerr << "Can't read " << filePath.c_str() << endl;
      exit(1);
    }
    m_streambuf = plainBuf;
  }

  // attach whichever buffer was chosen to the istream base
  this->init(m_streambuf);
}
// Releases the stream buffer allocated by the constructor and clears the
// member so it no longer points at freed memory.
InputFileStream::~InputFileStream()
{
  std::streambuf *ownedBuf = m_streambuf;
  m_streambuf = NULL;
  delete ownedBuf;
}
// Currently a no-op: the body is empty. The stream buffer is only released
// by the destructor.
void InputFileStream::Close()
{
}
}

View File

@ -0,0 +1,48 @@
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_InputFileStream_h
#define moses_InputFileStream_h
#include <cstdlib>
#include <fstream>
#include <string>
namespace Moses
{
/** Drop-in replacement for std::istream that reads from a file.
 *  If the path ends in ".gz" the data is read through a gzfilebuf,
 *  otherwise through a std::filebuf (see InputFileStream.cpp).
 */
class InputFileStream : public std::istream
{
protected:
// Buffer created in the constructor and deleted in the destructor.
std::streambuf *m_streambuf;
public:
// Opens filePath; exits the process if a non-.gz file cannot be opened.
InputFileStream(const std::string &filePath);
~InputFileStream();
// Has an empty implementation at present.
void Close();
};
}
#endif

View File

@ -0,0 +1,180 @@
/*
* Lattice.cpp
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include "Lattice.h"
#include "LatticeNode.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "SyntaxTree.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "Rule.h"
#include "RuleCollection.h"
using namespace std;
// Allocates one (initially empty) stack per source position, plus one extra
// slot: sourceSize + 1 stacks in total.
Lattice::Lattice(size_t sourceSize)
  : m_stacks(sourceSize + 1)
{}
// Hands every stack to RemoveAllInColl so the nodes created by CreateArcs
// are cleaned up together with the lattice.
Lattice::~Lattice()
{
  for (size_t stackInd = 0; stackInd < m_stacks.size(); ++stackInd) {
    Stack &nodesAtPos = m_stacks[stackInd];
    RemoveAllInColl(nodesAtPos);
  }
}
void Lattice::CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global)
{
// term
Stack &startStack = GetStack(startPos);
LatticeNode *node = new LatticeNode(startPos, &sentence);
startStack.push_back(node);
// non-term
for (size_t endPos = startPos + 1; endPos <= sentence.source.size(); ++endPos)
{
const TunnelList &tunnels = tunnelColl.GetTunnels(startPos, endPos - 1);
TunnelList::const_iterator iterHole;
for (iterHole = tunnels.begin(); iterHole != tunnels.end(); ++iterHole)
{
const Tunnel &tunnel = *iterHole;
CreateArcsUsing1Hole(tunnel, sentence, global);
}
}
}
void Lattice::CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global)
{
size_t startPos = tunnel.GetRange(0).GetStartPos()
, endPos = tunnel.GetRange(0).GetEndPos();
size_t numSymbols = tunnel.GetRange(0).GetWidth();
assert(numSymbols > 0);
Stack &startStack = GetStack(startPos);
// non-terms. cartesian product of source & target labels
assert(startPos == tunnel.GetRange(0).GetStartPos() && endPos == tunnel.GetRange(0).GetEndPos());
size_t startT = tunnel.GetRange(1).GetStartPos()
,endT = tunnel.GetRange(1).GetEndPos();
const SyntaxNodes &nodesS = sentence.sourceTree.GetNodes(startPos, endPos);
const SyntaxNodes &nodesT = sentence.targetTree.GetNodes(startT, endT );
SyntaxNodes::const_iterator iterS, iterT;
for (iterS = nodesS.begin(); iterS != nodesS.end(); ++iterS)
{
const SyntaxNode *syntaxNodeS = *iterS;
for (iterT = nodesT.begin(); iterT != nodesT.end(); ++iterT)
{
const SyntaxNode *syntaxNodeT = *iterT;
bool isSyntax = syntaxNodeS->IsSyntax() || syntaxNodeT->IsSyntax();
size_t maxSourceNonTermSpan = isSyntax ? global.maxHoleSpanSourceSyntax : global.maxHoleSpanSourceDefault;
if (maxSourceNonTermSpan >= endPos - startPos)
{
LatticeNode *node = new LatticeNode(tunnel, syntaxNodeS, syntaxNodeT);
startStack.push_back(node);
}
}
}
}
// Mutable access to the stack of nodes that begin at startPos.
// Delegates to the const overload (which performs the bounds assert) and
// strips the const again; safe because *this is non-const here.
Stack &Lattice::GetStack(size_t startPos)
{
  const Lattice &self = *this;
  return const_cast<Stack&>(self.GetStack(startPos));
}
// Read-only access to the stack of nodes that begin at startPos.
// startPos must be a valid index into m_stacks (checked by assert only).
const Stack &Lattice::GetStack(size_t startPos) const
{
  assert(m_stacks.size() > startPos);
  const Stack &stackAtPos = m_stacks[startPos];
  return stackAtPos;
}
void Lattice::CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global)
{
const Stack &startStack = GetStack(startPos);
Stack::const_iterator iterStack;
for (iterStack = startStack.begin(); iterStack != startStack.end(); ++iterStack)
{
const LatticeNode *node = *iterStack;
Rule *initRule = new Rule(node);
if (initRule->CanRecurse(global, sentence.GetTunnelCollection()))
{ // may or maynot be valid, but can continue to build on this rule
initRule->CreateRules(m_rules, *this, sentence, global);
}
if (initRule->IsValid(global, sentence.GetTunnelCollection()))
{ // add to rule collection
m_rules.Add(global, initRule, sentence);
}
else
{
delete initRule;
}
}
}
// Returns the non-terminal nodes whose source span is exactly sourceRange.
// Only the stack at sourceRange's start position is searched; the returned
// Stack holds the same node pointers as the lattice (no copies of nodes).
Stack Lattice::GetNonTermNode(const Range &sourceRange) const
{
  Stack matches;
  const Stack &candidates = GetStack(sourceRange.GetStartPos());

  for (size_t ind = 0; ind < candidates.size(); ++ind) {
    LatticeNode *candidate = candidates[ind];
    const Range &candidateSpan = candidate->GetSourceRange();
    // every node in this stack starts at the same source position
    assert(candidateSpan.GetStartPos() == sourceRange.GetStartPos());

    if (candidate->IsTerminal()) {
      continue;
    }
    if (candidateSpan.GetEndPos() != sourceRange.GetEndPos()) {
      continue;
    }
    matches.push_back(candidate);
  }

  return matches;
}
std::ostream& operator<<(std::ostream &out, const Lattice &obj)
{
std::vector<Stack>::const_iterator iter;
for (iter = obj.m_stacks.begin(); iter != obj.m_stacks.end(); ++iter)
{
const Stack &stack = *iter;
Stack::const_iterator iterStack;
for (iterStack = stack.begin(); iterStack != stack.end(); ++iterStack)
{
const LatticeNode &node = **iterStack;
out << node << " ";
}
}
return out;
}

View File

@ -0,0 +1,47 @@
#pragma once
/*
* Lattice.h
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <iostream>
#include <vector>
#include "RuleCollection.h"
class Global;
class LatticeNode;
class Tunnel;
class TunnelCollection;
class SentenceAlignment;
typedef std::vector<LatticeNode*> Stack;
// Holds, for each source position, the stack of LatticeNodes that start
// there, together with the rules built from those nodes.
class Lattice
{
friend std::ostream& operator<<(std::ostream&, const Lattice&);
// One stack per start position; sized sourceSize + 1 by the constructor.
// The destructor passes each stack to RemoveAllInColl.
std::vector<Stack> m_stacks;
// Rules collected by CreateRules().
RuleCollection m_rules;
// Mutable stack lookup. NOTE: the parameter is called endPos here but the
// definition in Lattice.cpp names it startPos and indexes m_stacks with it.
Stack &GetStack(size_t endPos);
// Adds the non-terminal nodes for one tunnel to the stack at its start position.
void CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global);
public:
Lattice(size_t sourceSize);
~Lattice();
// Adds the terminal node and all non-terminal nodes starting at startPos.
void CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global);
// Builds rules seeded from each node in the stack at startPos.
void CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global);
// Read-only stack lookup; startPos must be < number of stacks (asserted).
const Stack &GetStack(size_t startPos) const;
const RuleCollection &GetRules() const
{ return m_rules; }
// Non-terminal nodes whose source span equals sourceRange (returned by value).
Stack GetNonTermNode(const Range &sourceRange) const;
};

View File

@ -0,0 +1,149 @@
/*
* LatticeNode.cpp
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <sstream>
#include "LatticeNode.h"
#include "SyntaxTree.h"
#include "Tunnel.h"
#include "SentenceAlignment.h"
#include "SymbolSequence.h"
size_t LatticeNode::s_count = 0;
using namespace std;
// Terminal node: covers the single source position `pos` of `sentence`.
// No tunnel or syntax-tree nodes are attached. Bumps the global node counter.
LatticeNode::LatticeNode(size_t pos, const SentenceAlignment *sentence)
  : m_tunnel(NULL)
  , m_isTerminal(true)
  , m_sourceTreeNode(NULL)
  , m_targetTreeNode(NULL)
  , m_sentence(sentence)
  , m_sourceRange(pos, pos)
{
  ++s_count;
  //cerr << *this << endl;
}
// Non-terminal node: stands for `tunnel`, labelled with one source-side and
// one target-side syntax node. The source range is taken from the tunnel's
// range 0; no sentence pointer is stored.
LatticeNode::LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode)
  : m_isTerminal(false)
  , m_sourceRange(tunnel.GetRange(0))
  , m_tunnel(&tunnel)
  , m_sourceTreeNode(sourceTreeNode)
  , m_targetTreeNode(targetTreeNode)
  , m_sentence(NULL)
{
  // keep the global tally of constructed nodes up to date
  ++s_count;
}
// True if either the source-side or the target-side label of this
// non-terminal is a syntax label. Must not be called on a terminal node.
bool LatticeNode::IsSyntax() const
{
  assert(!m_isTerminal);
  if (m_sourceTreeNode->IsSyntax()) {
    return true;
  }
  return m_targetTreeNode->IsSyntax();
}
// Number of symbols this node contributes to a rule. Always 1; the
// `direction` argument is currently ignored.
size_t LatticeNode::GetNumSymbols(size_t direction) const
{
return 1;
}
// Three-way comparison (-1 / 0 / +1) between two lattice nodes.
//  - a terminal always sorts before a non-terminal
//  - two terminals are ordered by the start of their source range
//  - two non-terminals are ordered by tunnel range and label, first on the
//    source side and then on the target side; a side is only looked at when
//    THIS node's label on that side is a syntax label
int LatticeNode::Compare(const LatticeNode &otherNode) const
{
  // mixed terminal / non-terminal
  if (m_isTerminal != otherNode.m_isTerminal) {
    return m_isTerminal ? -1 : 1;
  }

  // both terminals: order by source position
  if (m_isTerminal) {
    const size_t thisStart = m_sourceRange.GetStartPos();
    const size_t otherStart = otherNode.m_sourceRange.GetStartPos();
    if (thisStart == otherStart) {
      return 0;
    }
    return (thisStart < otherStart) ? -1 : +1;
  }

  // both non-terminals
  assert(!m_isTerminal);
  assert(!otherNode.m_isTerminal);

  if (m_sourceTreeNode->IsSyntax()) {
    const int spanCmp = m_tunnel->Compare(*otherNode.m_tunnel, 0);
    if (spanCmp != 0) {
      return spanCmp;
    }
    if (m_sourceTreeNode->GetLabel() != otherNode.m_sourceTreeNode->GetLabel()) {
      return (m_sourceTreeNode->GetLabel() < otherNode.m_sourceTreeNode->GetLabel()) ? -1 : +1;
    }
  }

  if (m_targetTreeNode->IsSyntax()) {
    const int spanCmp = m_tunnel->Compare(*otherNode.m_tunnel, 1);
    if (spanCmp != 0) {
      return spanCmp;
    }
    if (m_targetTreeNode->GetLabel() != otherNode.m_targetTreeNode->GetLabel()) {
      return (m_targetTreeNode->GetLabel() < otherNode.m_targetTreeNode->GetLabel()) ? -1 : +1;
    }
  }

  return 0;
}
void LatticeNode::CreateSymbols(size_t direction, SymbolSequence &symbols) const
{
if (m_isTerminal)
{
/*
const std::vector<std::string> &words = (direction == 0 ? m_sentence->source : m_sentence->target);
size_t startPos = m_tunnel.GetStart(direction)
,endPos = m_tunnel.GetEnd(direction);
for (size_t pos = startPos; pos <= endPos; ++pos)
{
Symbol symbol(words[pos], pos);
symbols.Add(symbol);
}
*/
}
else
{ // output both
Symbol symbol(m_sourceTreeNode->GetLabel(), m_targetTreeNode->GetLabel()
, m_tunnel->GetRange(0).GetStartPos(), m_tunnel->GetRange(0).GetEndPos()
, m_tunnel->GetRange(1).GetStartPos(), m_tunnel->GetRange(1).GetEndPos()
, m_sourceTreeNode->IsSyntax(), m_targetTreeNode->IsSyntax());
symbols.Add(symbol);
}
}
std::ostream& operator<<(std::ostream &out, const LatticeNode &obj)
{
if (obj.m_isTerminal)
{
assert(obj.m_sourceRange.GetWidth() == 1);
size_t pos = obj.m_sourceRange.GetStartPos();
const SentenceAlignment &sentence = *obj.m_sentence;
out << obj.m_sourceRange << "=" << sentence.source[pos];
}
else
{
assert(obj.m_tunnel);
out << obj.GetTunnel() << "=" << obj.m_sourceTreeNode->GetLabel() << obj.m_targetTreeNode->GetLabel() << " ";
}
return out;
}

View File

@ -0,0 +1,77 @@
#pragma once
/*
* LatticeNode.h
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <iostream>
#include <cassert>
#include "Range.h"
class Tunnel;
class SyntaxNode;
class SentenceAlignment;
class SymbolSequence;
/**
 * One node of the extraction lattice. A node is either
 *  - a terminal: a single source word (m_sentence set, m_tunnel and the tree
 *    node pointers NULL), or
 *  - a non-terminal: a tunnel plus a source-side and a target-side syntax
 *    node (m_sentence NULL).
 * The node stores pointers only; nothing is deleted by this class in the
 * visible code.
 */
class LatticeNode
{
friend std::ostream& operator<<(std::ostream&, const LatticeNode&);
bool m_isTerminal;
// for terms & non-term
Range m_sourceRange;
// non-terms. source range should be same as m_sourceRange
const Tunnel *m_tunnel;
public:
// incremented by each constructor
static size_t s_count;
// non-terms only: labels on the source and target side (NULL for terms)
const SyntaxNode *m_sourceTreeNode, *m_targetTreeNode;
// terms only: sentence the word is read from (NULL for non-terms)
const SentenceAlignment *m_sentence;
// for terms
LatticeNode(size_t pos, const SentenceAlignment *sentence);
// for non-terms
LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode);
bool IsTerminal() const
{ return m_isTerminal; }
// non-terms only (asserts !m_isTerminal): true if either label is a syntax label
bool IsSyntax() const;
// currently always returns 1
size_t GetNumSymbols(size_t direction) const;
// NOTE(review): no definition of ToString() appears in LatticeNode.cpp - confirm it is defined or unused
std::string ToString() const;
// three-way comparison, returns -1 / 0 / +1
int Compare(const LatticeNode &otherNode) const;
void CreateSymbols(size_t direction, SymbolSequence &symbols) const;
// non-terms only: asserts that a tunnel is attached
const Tunnel &GetTunnel() const
{
assert(m_tunnel);
return *m_tunnel;
}
const Range &GetSourceRange() const
{
return m_sourceRange;
}
// direction 0 = source tree node, anything else = target tree node;
// asserts the chosen pointer is non-NULL, so this is for non-terms only
const SyntaxNode &GetSyntaxNode(size_t direction) const
{
const SyntaxNode *node = direction == 0 ? m_sourceTreeNode : m_targetTreeNode;
assert(node);
return *node;
}
};

View File

@ -0,0 +1,13 @@
# Default target: build the extractor.
all: extract
# Remove object files and the linked binary.
clean:
rm -f *.o extract-mixed-syntax
# Suffix rule: compile every .cpp into a .o with optimisation and debug info.
.cpp.o:
g++ -O6 -g -c $<
# Link all objects into extract-mixed-syntax (needs zlib and boost_iostreams).
# NOTE(review): the target is called "extract" but the recipe writes
# "extract-mixed-syntax", so no file named after the target is produced and
# the link step reruns on every invocation of make.
extract: tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o
g++ tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o -lz -lboost_iostreams-mt -o extract-mixed-syntax

View File

@ -0,0 +1,79 @@
// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <boost/iostreams/filter/gzip.hpp>
#include "OutputFileStream.h"
#include "gzfilebuf.h"
using namespace std;
namespace Moses
{
/** Create a stream that is not attached to any file yet; call Open() before writing. */
OutputFileStream::OutputFileStream()
  :boost::iostreams::filtering_ostream()
  ,m_outFile(NULL)
{
}

/** Create a stream and try to open filePath straight away.
 *  The result of Open() is not reported by this constructor.
 */
OutputFileStream::OutputFileStream(const std::string &filePath)
  : m_outFile(NULL)
{
  Open(filePath);
}

/** Flushes and closes the file, if one is open. */
OutputFileStream::~OutputFileStream()
{
  Close();
}

/** Open filePath for binary writing and attach it to this stream.
 *  If the path ends in ".gz" a gzip compressor is pushed in front of the file.
 *  @return true on success, false if the file could not be opened. On failure
 *          the stream is left detached (m_outFile == NULL).
 *
 *  NOTE(review): Close() pops only the file device; a gzip filter pushed here
 *  presumably stays in the chain, so re-opening the same object may stack
 *  filters - confirm before reusing an instance.
 */
bool OutputFileStream::Open(const std::string &filePath)
{
  m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
  if (m_outFile->fail()) {
    // Nothing has been pushed onto the chain yet. Release the file object so
    // that Close() (also run by the destructor) does not pop an empty chain,
    // and so that the ofstream is not leaked.
    delete m_outFile;
    m_outFile = NULL;
    return false;
  }

  if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
    this->push(boost::iostreams::gzip_compressor());
  }
  this->push(*m_outFile);

  return true;
}

/** Flush pending output, detach the file from the chain and close it.
 *  Does nothing when no file is open.
 */
void OutputFileStream::Close()
{
  if (m_outFile == NULL) {
    return;
  }

  this->flush();
  this->pop(); // file

  m_outFile->close();
  delete m_outFile;
  m_outFile = NULL;
  return;
}
}

View File

@ -0,0 +1,50 @@
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <cstdlib>
#include <fstream>
#include <string>
#include <iostream>
#include <boost/iostreams/filtering_stream.hpp>
namespace Moses
{
/** Used in place of std::ostream for writing to a file. If the file name
* ends in .gz the output is passed through a gzip compressor.
*/
class OutputFileStream : public boost::iostreams::filtering_ostream
{
protected:
// underlying file, allocated by Open() and released by Close(); NULL when no file is open
std::ofstream *m_outFile;
public:
// creates a stream with no file attached
OutputFileStream();
// creates a stream and calls Open(filePath); the result of Open() is not reported
OutputFileStream(const std::string &filePath);
// calls Close()
virtual ~OutputFileStream();
// returns false if the file could not be opened for writing
bool Open(const std::string &filePath);
// flushes and closes the file; no-op when nothing is open
void Close();
};
}

View File

@ -0,0 +1,74 @@
/*
* Range.cpp
* extract
*
* Created by Hieu Hoang on 22/02/2011.
* Copyright 2011 __MyCompanyName__. All rights reserved.
*
*/
#include "Range.h"
using namespace std;
// Set this range to the smallest range covering both a and b.
// A position equal to NOT_FOUND counts as "unset": if one side is unset the
// other side's value is taken as is (which may itself be NOT_FOUND).
void Range::Merge(const Range &a, const Range &b)
{
  const bool aHasStart = a.m_startPos != NOT_FOUND;
  const bool bHasStart = b.m_startPos != NOT_FOUND;
  const bool aHasEnd = a.m_endPos != NOT_FOUND;
  const bool bHasEnd = b.m_endPos != NOT_FOUND;

  // start: leftmost of the two, or whichever one is set
  if (aHasStart && bHasStart) {
    m_startPos = min(a.m_startPos, b.m_startPos);
  } else {
    m_startPos = aHasStart ? a.m_startPos : b.m_startPos;
  }

  // end: rightmost of the two, or whichever one is set
  if (aHasEnd && bHasEnd) {
    m_endPos = max(a.m_endPos, b.m_endPos);
  } else {
    m_endPos = aHasEnd ? a.m_endPos : b.m_endPos;
  }
}
// Three-way comparison: order by start position, then by end position.
// Returns -1, 0 or +1.
int Range::Compare(const Range &other) const
{
  if (m_startPos != other.m_startPos) {
    return (m_startPos < other.m_startPos) ? -1 : +1;
  }
  if (m_endPos != other.m_endPos) {
    return (m_endPos < other.m_endPos) ? -1 : +1;
  }
  return 0;
}
// True if the two closed intervals share at least one position, i.e. the
// other range neither ends before this one starts nor starts after it ends.
bool Range::Overlap(const Range &other) const
{
  const bool otherEndsBefore = other.m_endPos < m_startPos;
  const bool otherStartsAfter = other.m_startPos > m_endPos;
  return !(otherEndsBefore || otherStartsAfter);
}
// Prints the range as "[start-end]".
std::ostream& operator<<(std::ostream &out, const Range &range)
{
  return out << "[" << range.m_startPos << "-" << range.m_endPos << "]";
}

View File

@ -0,0 +1,57 @@
/*
* Range.h
* extract
*
* Created by Hieu Hoang on 22/02/2011.
* Copyright 2011 __MyCompanyName__. All rights reserved.
*
*/
#pragma once
#include <string>
#include <iostream>
#include <limits>
#define NOT_FOUND std::numeric_limits<size_t>::max()
/**
 * Closed interval [start, end] of positions. A position equal to NOT_FOUND
 * marks an unset boundary; a default-constructed range has both unset.
 */
class Range
{
  friend std::ostream& operator<<(std::ostream&, const Range&);

  size_t m_startPos, m_endPos;

public:
  // unset range: both boundaries are NOT_FOUND
  Range()
    : m_startPos(NOT_FOUND), m_endPos(NOT_FOUND)
  {
  }

  // copies both boundaries
  Range(const Range &copy)
    : m_startPos(copy.m_startPos), m_endPos(copy.m_endPos)
  {
  }

  // range covering startPos..endPos, both inclusive
  Range(size_t startPos, size_t endPos)
    : m_startPos(startPos), m_endPos(endPos)
  {
  }

  size_t GetStartPos() const
  {
    return m_startPos;
  }

  size_t GetEndPos() const
  {
    return m_endPos;
  }

  // number of positions covered; both ends are inclusive, hence the +1
  size_t GetWidth() const
  {
    return (m_endPos - m_startPos) + 1;
  }

  void SetStartPos(size_t startPos)
  {
    m_startPos = startPos;
  }

  void SetEndPos(size_t endPos)
  {
    m_endPos = endPos;
  }

  // set this range to the union of a and b
  void Merge(const Range &a, const Range &b);
  // three-way comparison by start, then end: -1 / 0 / +1
  int Compare(const Range &other) const;
  // true if the two ranges share at least one position
  bool Overlap(const Range &other) const;
};

View File

@ -0,0 +1,594 @@
/*
* Rule.cpp
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <algorithm>
#include <sstream>
#include "Rule.h"
#include "Global.h"
#include "LatticeNode.h"
#include "Lattice.h"
#include "SentenceAlignment.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "RuleCollection.h"
using namespace std;
// Copy constructor: the copy refers to the same lattice node (pointer copy)
// and carries the same alignment positions.
RuleElement::RuleElement(const RuleElement &copy)
:m_latticeNode(copy.m_latticeNode)
,m_alignmentPos(copy.m_alignmentPos)
{
}
// Start a new partial rule consisting of a single lattice node.
// No left-hand side is attached yet.
Rule::Rule(const LatticeNode *latticeNode)
  : m_lhs(NULL)
{
  m_coll.push_back(RuleElement(*latticeNode));
}
// Extend a partial rule: copy the elements of prevRule and append one more
// lattice node. No left-hand side is attached yet.
Rule::Rule(const Rule &prevRule, const LatticeNode *latticeNode)
  : m_coll(prevRule.m_coll)
  , m_lhs(NULL)
{
  m_coll.push_back(RuleElement(*latticeNode));
}
// Create a finished rule: copy an existing rule, attach `lhs` as its
// left-hand side and build the source/target symbol sequences.
// `isValid` is an output: it is set by CreateSymbols().
Rule::Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence)
  : m_coll(copy.m_coll)
  , m_lhs(lhs)
  , m_source(copy.m_source)
  , m_target(copy.m_target)
{
  CreateSymbols(global, isValid, sentence);
}
// Nothing to release here: the destructor body is empty, so the lattice
// nodes referenced through m_coll and m_lhs are not deleted by the rule.
Rule::~Rule()
{
}
// helper for sort
struct CompareLatticeNodeTarget
{
bool operator() (const RuleElement *a, const RuleElement *b)
{
const Range &rangeA = a->GetLatticeNode().GetTunnel().GetRange(1)
,&rangeB = b->GetLatticeNode().GetTunnel().GetRange(1);
return rangeA.GetEndPos() < rangeB.GetEndPos();
}
};
// Build m_source and m_target from the rule elements and the left-hand side.
//
// Source side: one symbol per element, in element order (a word for a
// terminal, a labelled non-terminal otherwise).
// Target side: walks the target span of m_lhs left to right, emitting one
// non-terminal symbol wherever a non-terminal's target span starts and one
// word symbol for every other position.
//
// isValid (output) is set to false when a non-terminal's target span sticks
// out of the left-hand side's target span, or when the target side ends up
// with more than global.maxSymbols symbols; otherwise it is set to true.
// Requires m_lhs to be non-NULL (it is dereferenced unconditionally).
void Rule::CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence)
{
// pointers into m_coll; valid because m_coll is not resized in this function
vector<RuleElement*> nonTerms;
// source
for (size_t ind = 0; ind < m_coll.size(); ++ind)
{
RuleElement &element = m_coll[ind];
const LatticeNode &node = element.GetLatticeNode();
if (node.IsTerminal())
{
size_t sourcePos = node.GetSourceRange().GetStartPos();
const string &word = sentence.source[sourcePos];
Symbol symbol(word, sourcePos);
m_source.Add(symbol);
}
else
{ // non-term
const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
const string &targetWord = node.GetSyntaxNode(1).GetLabel();
Symbol symbol(sourceWord, targetWord
, node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
, node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
, node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
m_source.Add(symbol);
// store current pos within phrase
element.m_alignmentPos.first = ind;
// for target symbols
nonTerms.push_back(&element);
}
}
// target
isValid = true;
const Range &lhsTargetRange = m_lhs->GetTunnel().GetRange(1);
// check spans of target non-terms
if (nonTerms.size())
{
// sort non-term rules elements by target range
// (the comparator orders by the END position of the target range)
std::sort(nonTerms.begin(), nonTerms.end(), CompareLatticeNodeTarget());
const Range &first = nonTerms.front()->GetLatticeNode().GetTunnel().GetRange(1);
const Range &last = nonTerms.back()->GetLatticeNode().GetTunnel().GetRange(1);
// reject if the outermost non-terms reach outside the lhs target span
if (first.GetStartPos() < lhsTargetRange.GetStartPos()
|| last.GetEndPos() > lhsTargetRange.GetEndPos())
{
isValid = false;
}
}
if (isValid)
{
// index of the next non-term (in target order) that has not been emitted yet
size_t indNonTerm = 0;
RuleElement *currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
for (size_t targetPos = lhsTargetRange.GetStartPos(); targetPos <= lhsTargetRange.GetEndPos(); ++targetPos)
{
if (currNonTermElement && targetPos == currNonTermElement->GetLatticeNode().GetTunnel().GetRange(1).GetStartPos())
{ // start of a non-term. print out non-terms & skip to the end
const LatticeNode &node = currNonTermElement->GetLatticeNode();
const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
const string &targetWord = node.GetSyntaxNode(1).GetLabel();
Symbol symbol(sourceWord, targetWord
, node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
, node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
, node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
m_target.Add(symbol);
// store current pos within phrase
currNonTermElement->m_alignmentPos.second = m_target.GetSize() - 1;
assert(currNonTermElement->m_alignmentPos.first != NOT_FOUND);
// jump to the last position of the non-term; the loop's ++ moves past it
targetPos = node.GetTunnel().GetRange(1).GetEndPos();
indNonTerm++;
currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
}
else
{ // term
const string &word = sentence.target[targetPos];
Symbol symbol(word, targetPos);
m_target.Add(symbol);
}
}
// every non-term must have been placed on the target side
assert(indNonTerm == nonTerms.size());
if (m_target.GetSize() > global.maxSymbols) {
isValid = false;
//cerr << "m_source=" << m_source.GetSize() << ":" << m_source << endl;
//cerr << "m_target=" << m_target.GetSize() << ":" << m_target << endl;
}
}
}
// True if the rule has strictly more non-syntax ("default") non-terminals
// than terminals.
bool Rule::MoreDefaultNonTermThanTerm() const
{
  size_t terminals = 0;
  size_t defaultNonTerms = 0;

  for (size_t i = 0; i < m_coll.size(); ++i) {
    const LatticeNode &node = m_coll[i].GetLatticeNode();
    if (node.IsTerminal()) {
      ++terminals;
    } else if (!node.IsSyntax()) {
      ++defaultNonTerms;
    }
  }

  return defaultNonTerms > terminals;
}
bool Rule::SourceHasEdgeDefaultNonTerm() const
{
assert(m_coll.size());
const LatticeNode &first = m_coll.front().GetLatticeNode();
const LatticeNode &last = m_coll.back().GetLatticeNode();
// 1st
if (!first.IsTerminal() && !first.IsSyntax())
{
return true;
}
if (!last.IsTerminal() && !last.IsSyntax())
{
return true;
}
return false;
}
// Decide whether this partial rule may be added to the rule collection.
// The checks run in the order listed and stop at the first one that fails.
// (A limit on unaligned words used to be checked here as well; it is
// currently disabled.)
bool Rule::IsValid(const Global &global, const TunnelCollection &tunnelColl) const
{
  const bool rejected =
    // a rule made of a single non-terminal is not allowed
    (m_coll.size() == 1 && !m_coll[0].GetLatticeNode().IsTerminal())
    // must have at least as many terminals as non-syntax non-terminals
    || MoreDefaultNonTermThanTerm()
    // non-syntax non-terminal at either edge, unless explicitly allowed
    || (!global.allowDefaultNonTermEdge && SourceHasEdgeDefaultNonTerm())
    // too many symbols
    || (GetNumSymbols() > global.maxSymbols)
    // two non-syntax non-terminals next to each other
    || AdjacentDefaultNonTerms()
    // the source span must correspond to at least one tunnel
    || !IsHole(tunnelColl)
    // target spans of the non-terminals must not overlap
    || NonTermOverlap();

  return !rejected;
}
bool Rule::NonTermOverlap() const
{
vector<Range> ranges;
CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const RuleElement &element = *iter;
if (!element.GetLatticeNode().IsTerminal())
{
const Range &range = element.GetLatticeNode().GetTunnel().GetRange(1);
ranges.push_back(range);
}
}
vector<Range>::const_iterator outerIter;
for (outerIter = ranges.begin(); outerIter != ranges.end(); ++outerIter)
{
const Range &outer = *outerIter;
vector<Range>::const_iterator innerIter;
for (innerIter = outerIter + 1; innerIter != ranges.end(); ++innerIter)
{
const Range &inner = *innerIter;
if (outer.Overlap(inner))
return true;
}
}
return false;
}
// Source span of the rule: from the start of the first element's source
// range to the end of the last element's. The rule must not be empty.
Range Rule::GetSourceRange() const
{
  assert(m_coll.size());

  const size_t startPos = m_coll.front().GetLatticeNode().GetSourceRange().GetStartPos();
  const size_t endPos = m_coll.back().GetLatticeNode().GetSourceRange().GetEndPos();
  return Range(startPos, endPos);
}
bool Rule::IsHole(const TunnelCollection &tunnelColl) const
{
const Range &spanS = GetSourceRange();
const TunnelList &tunnels = tunnelColl.GetTunnels(spanS.GetStartPos(), spanS.GetEndPos());
bool ret = tunnels.size() > 0;
return ret;
}
bool Rule::CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const
{
if (GetNumSymbols() >= global.maxSymbols)
return false;
if (AdjacentDefaultNonTerms())
return false;
if (MaxNonTerm(global))
return false;
if (NonTermOverlap())
{
return false;
}
const Range spanS = GetSourceRange();
if (tunnelColl.NumUnalignedWord(0, spanS.GetStartPos(), spanS.GetEndPos()) >= global.maxUnaligned)
return false;
// if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
// return false;
return true;
}
// True once the rule holds at least global.maxNonTerm non-terminals, or at
// least global.maxNonTermDefault non-syntax ("default") non-terminals.
bool Rule::MaxNonTerm(const Global &global) const
{
  size_t nonTerms = 0;
  size_t defaultNonTerms = 0;

  for (size_t i = 0; i < m_coll.size(); ++i) {
    const LatticeNode &node = m_coll[i].GetLatticeNode();
    if (node.IsTerminal()) {
      continue;
    }

    ++nonTerms;
    if (!node.IsSyntax()) {
      ++defaultNonTerms;
    }
    if (nonTerms >= global.maxNonTerm || defaultNonTerms >= global.maxNonTermDefault) {
      return true;
    }
  }
  return false;
}
bool Rule::AdjacentDefaultNonTerms() const
{
assert(m_coll.size() > 0);
const LatticeNode *prevNode = &m_coll.front().GetLatticeNode();
CollType::const_iterator iter;
for (iter = m_coll.begin() + 1; iter != m_coll.end(); ++iter)
{
const LatticeNode *node = &(*iter).GetLatticeNode();
if (!prevNode->IsTerminal() && !node->IsTerminal() && !prevNode->IsSyntax() && !node->IsSyntax() )
{
return true;
}
prevNode = node;
}
return false;
}
// Number of elements (terminals and non-terminals) in the rule.
size_t Rule::GetNumSymbols() const
{
  return m_coll.size();
}
// Recursively grow this partial rule.
// For every lattice node in the stack just after the rule's last source
// position, build an extended rule, recurse on it if it may still grow, and
// then either hand it to `rules` (if it is valid) or discard it.
// RuleCollection::Add() deletes the rule it is given, so an extended rule is
// freed on both paths.
void Rule::CreateRules(RuleCollection &rules
                       , const Lattice &lattice
                       , const SentenceAlignment &sentence
                       , const Global &global)
{
  assert(m_coll.size() > 0);

  const size_t nextPos = m_coll.back().GetLatticeNode().GetSourceRange().GetEndPos() + 1;
  const Stack &candidates = lattice.GetStack(nextPos);

  for (size_t i = 0; i < candidates.size(); ++i) {
    Rule *extended = new Rule(*this, candidates[i]);

    // may or may not be valid itself, but longer rules can be built on it
    if (extended->CanRecurse(global, sentence.GetTunnelCollection())) {
      extended->CreateRules(rules, lattice, sentence, global);
    }

    if (!extended->IsValid(global, sentence.GetTunnelCollection())) {
      delete extended;
      continue;
    }
    // add to rule collection
    rules.Add(global, extended, sentence);
  }
}
// Strict ordering on rules, derived from Compare().
bool Rule::operator<(const Rule &compare) const
{
  return Compare(compare) < 0;
}
int Rule::Compare(const Rule &compare) const
{
//cerr << *this << endl << compare << endl;
assert(m_coll.size() > 0);
assert(m_source.GetSize() > 0);
assert(m_target.GetSize() > 0);
int ret = 0;
// compare each fragment
ret = m_source.Compare(compare.m_source);
if (ret != 0)
{
return ret;
}
ret = m_target.Compare(compare.m_target);
if (ret != 0)
{
return ret;
}
// compare lhs
const string &thisSourceLabel = m_lhs->GetSyntaxNode(0).GetLabel();
const string &otherSourceLabel = compare.m_lhs->GetSyntaxNode(0).GetLabel();
if (thisSourceLabel != otherSourceLabel)
{
ret = (thisSourceLabel < otherSourceLabel) ? -1 : +1;
return ret;
}
const string &thisTargetLabel = m_lhs->GetSyntaxNode(1).GetLabel();
const string &otherTargetLabel = compare.m_lhs->GetSyntaxNode(1).GetLabel();
if (thisTargetLabel != otherTargetLabel)
{
ret = (thisTargetLabel < otherTargetLabel) ? -1 : +1;
return ret;
}
assert(ret == 0);
return ret;
}
// Lattice node of the ind-th element of the rule; ind must be in range.
const LatticeNode &Rule::GetLatticeNode(size_t ind) const
{
  assert(ind < m_coll.size());
  const RuleElement &element = m_coll[ind];
  return element.GetLatticeNode();
}
void Rule::DebugOutput() const
{
Output(cerr);
}
void Rule::Output(std::ostream &out) const
{
stringstream strmeS, strmeT;
std::vector<Symbol>::const_iterator iterSymbol;
for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
{
const Symbol &symbol = *iterSymbol;
strmeS << symbol << " ";
}
for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
{
const Symbol &symbol = *iterSymbol;
strmeT << symbol << " ";
}
// lhs
if (m_lhs)
{
strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
}
out << strmeS.str() << " ||| " << strmeT.str() << " ||| ";
// alignment
Rule::CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const RuleElement &element = *iter;
const LatticeNode &node = element.GetLatticeNode();
bool isTerminal = node.IsTerminal();
if (!isTerminal)
{
out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " ";
}
}
out << "||| 1";
}
void Rule::OutputInv(std::ostream &out) const
{
stringstream strmeS, strmeT;
std::vector<Symbol>::const_iterator iterSymbol;
for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
{
const Symbol &symbol = *iterSymbol;
strmeS << symbol << " ";
}
for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
{
const Symbol &symbol = *iterSymbol;
strmeT << symbol << " ";
}
// lhs
if (m_lhs)
{
strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
}
out << strmeT.str() << " ||| " << strmeS.str() << " ||| ";
// alignment
Rule::CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const RuleElement &element = *iter;
const LatticeNode &node = element.GetLatticeNode();
bool isTerminal = node.IsTerminal();
if (!isTerminal)
{
out << element.m_alignmentPos.second << "-" << element.m_alignmentPos.first << " ";
}
}
out << "||| 1";
}

View File

@ -0,0 +1,96 @@
#pragma once
/*
* Rule.h
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <iostream>
#include "LatticeNode.h"
#include "SymbolSequence.h"
#include "Global.h"
class Lattice;
class SentenceAlignment;
class Global;
class RuleCollection;
class SyntaxNode;
class TunnelCollection;
class Range;
/**
 * One position on the right-hand side of a rule: a pointer to the lattice
 * node it stands for (not deleted by this class) plus, for non-terminals,
 * its positions in the source and target symbol sequences.
 */
class RuleElement
{
protected:
const LatticeNode *m_latticeNode;
public:
// first  = index of this element on the source side
// second = index of its symbol on the target side
// Both start as NOT_FOUND and are filled in by Rule::CreateSymbols() for
// non-terminals only.
std::pair<size_t, size_t> m_alignmentPos;
RuleElement(const RuleElement &copy);
RuleElement(const LatticeNode &latticeNode)
:m_latticeNode(&latticeNode)
,m_alignmentPos(NOT_FOUND, NOT_FOUND)
{}
const LatticeNode &GetLatticeNode() const
{ return *m_latticeNode; }
};
/**
 * A translation rule under construction or finished.
 *
 * A partial rule is a sequence of lattice nodes (m_coll) with no left-hand
 * side. A finished rule additionally has a left-hand side node (m_lhs) and
 * the source/target symbol sequences built by CreateSymbols().
 */
class Rule
{
protected:
typedef std::vector<RuleElement> CollType;
// right-hand side elements, in source order
CollType m_coll;
// left-hand side; NULL for partial rules
const LatticeNode *m_lhs;
// symbol sequences, filled in by CreateSymbols()
SymbolSequence m_source, m_target;
// true if the tunnel collection has a tunnel for the rule's source span
bool IsHole(const TunnelCollection &tunnelColl) const;
// true if the target spans of two non-terminals overlap
bool NonTermOverlap() const;
const LatticeNode &GetLatticeNode(size_t ind) const;
// builds m_source / m_target; sets isValid (output)
void CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence);
public:
// init
Rule(const LatticeNode *latticeNode);
// create new rule by appending node to prev rule
Rule(const Rule &prevRule, const LatticeNode *latticeNode);
// create copy with lhs
Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence);
// can continue to add to this rule
bool CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const;
virtual ~Rule();
// can add this to the set of rules
bool IsValid(const Global &global, const TunnelCollection &tunnelColl) const;
// number of right-hand side elements
size_t GetNumSymbols() const;
// two neighbouring non-syntax non-terminals
bool AdjacentDefaultNonTerms() const;
// a non-terminal limit from `global` has been reached
bool MaxNonTerm(const Global &global) const;
// strictly more non-syntax non-terminals than terminals
bool MoreDefaultNonTermThanTerm() const;
// first or last element is a non-syntax non-terminal
bool SourceHasEdgeDefaultNonTerm() const;
// recursively extends this rule and adds the valid results to `rules`
void CreateRules(RuleCollection &rules
, const Lattice &lattice
, const SentenceAlignment &sentence
, const Global &global);
// three-way comparison; requires a left-hand side on both rules
int Compare(const Rule &compare) const;
bool operator<(const Rule &compare) const;
// from the start of the first element to the end of the last
Range GetSourceRange() const;
// NOTE(review): macro, presumably from Global.h, declaring DebugOutput() - confirm
DEBUG_OUTPUT();
// source ||| target ||| alignments ||| 1
void Output(std::ostream &out) const;
// target ||| source ||| inverted alignments ||| 1
void OutputInv(std::ostream &out) const;
};

View File

@ -0,0 +1,102 @@
/*
* RuleCollection.cpp
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "RuleCollection.h"
#include "Rule.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "Lattice.h"
#include "SyntaxTree.h"
using namespace std;
// Releases the stored rules via RemoveAllInColl().
// NOTE(review): presumably RemoveAllInColl deletes every pointer in the
// container and clears it - confirm against its definition.
RuleCollection::~RuleCollection()
{
RemoveAllInColl(m_coll);
}
void RuleCollection::Add(const Global &global, Rule *rule, const SentenceAlignment &sentence)
{
Range spanS = rule->GetSourceRange();
// cartesian product of lhs
Stack nontermNodes = sentence.GetLattice().GetNonTermNode(spanS);
Stack::const_iterator iterStack;
for (iterStack = nontermNodes.begin(); iterStack != nontermNodes.end(); ++iterStack)
{
const LatticeNode &node = **iterStack;
assert(!node.IsTerminal());
bool isValid;
// create rules with LHS
//cerr << "old:" << *rule << endl;
Rule *newRule = new Rule(global, isValid, *rule, &node, sentence);
if (!isValid)
{ // lhs doesn't match non-term spans
delete newRule;
continue;
}
/*
stringstream s;
s << *newRule;
if (s.str().find("Wiederaufnahme der [X] ||| resumption of the [X] ||| ||| 1") == 0)
{
cerr << "READY:" << *newRule << endl;
g_debug = true;
}
else {
g_debug = false;
}
*/
typedef set<const Rule*, CompareRule>::iterator Iterator;
pair<Iterator,bool> ret = m_coll.insert(newRule);
if (ret.second)
{
//cerr << "ACCEPTED:" << *newRule << endl;
//cerr << "";
}
else
{
//cerr << "REJECTED:" << *newRule << endl;
delete newRule;
}
}
delete rule;
}
void RuleCollection::Output(std::ostream &out) const
{
RuleCollection::CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const Rule &rule = **iter;
rule.Output(out);
out << endl;
}
}
void RuleCollection::OutputInv(std::ostream &out) const
{
RuleCollection::CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const Rule &rule = **iter;
rule.OutputInv(out);
out << endl;
}
}

View File

@ -0,0 +1,55 @@
#pragma once
/*
* RuleCollection.h
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <iostream>
#include "Rule.h"
class SentenceAlignment;
// Ordering used by the rule set: compares the pointed-to rules with
// Rule::operator<, so two distinct Rule objects with equal content count as
// duplicates.
// The call operator is const because std::set invokes its comparator through
// a const object; a non-const operator() is rejected by current standard
// library implementations.
struct CompareRule
{
  bool operator() (const Rule *a, const Rule *b) const
  {
    return (*a) < (*b);
  }
};
/**
 * Set of finished rules, ordered and de-duplicated by rule content
 * (see CompareRule).
 * NOTE(review): the destructor releases the stored rules, and copying is not
 * disabled - copying an instance would presumably release them twice; confirm
 * the class is never copied.
 */
class RuleCollection
{
protected:
typedef std::set<const Rule*, CompareRule> CollType;
CollType m_coll;
public:
~RuleCollection();
// creates one finished rule per matching left-hand side and stores the new
// ones; always deletes `rule`
void Add(const Global &global, Rule *rule, const SentenceAlignment &sentence);
// number of distinct rules stored
size_t GetSize() const
{ return m_coll.size(); }
// one rule per line, source-to-target
void Output(std::ostream &out) const;
// one rule per line, target-to-source
void OutputInv(std::ostream &out) const;
};

View File

@ -0,0 +1,331 @@
/*
* SentenceAlignment.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <map>
#include <sstream>
#include "SentenceAlignment.h"
#include "XmlTree.h"
#include "tables-core.h"
#include "TunnelCollection.h"
#include "Lattice.h"
#include "LatticeNode.h"
using namespace std;
extern std::set< std::string > targetLabelCollection, sourceLabelCollection;
extern std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;
// Starts with no tunnel collection and no lattice; both pointers are NULL
// until something else in the class assigns them.
SentenceAlignment::SentenceAlignment()
:m_tunnelCollection(NULL)
,m_lattice(NULL)
{}
// Releases the tunnel collection and the lattice (deleting NULL is a no-op).
SentenceAlignment::~SentenceAlignment()
{
delete m_tunnelCollection;
delete m_lattice;
}
// Fill this object from one sentence pair and its word alignment.
//
// targetString / sourceString: the two sentences; when global.targetSyntax /
//   global.sourceSyntax is set, the XML markup is stripped and recorded in
//   targetTree / sourceTree before tokenizing.
// alignmentString: whitespace-separated "s-t" pairs (source index, target
//   index).
// sentenceID: only used in diagnostics.
//
// Returns 1 on success. Returns 0 (after writing a message to stderr) when
// either sentence is empty, an alignment point cannot be parsed as two
// integers, or an alignment point lies outside the sentences.
int SentenceAlignment::Create( const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global )
{
  // tokenizing English (and potentially extract syntax spans)
  if (global.targetSyntax) {
    string targetStringCPP = string(targetString);
    ProcessAndStripXMLTags( targetStringCPP, targetTree, targetLabelCollection , targetTopLabelCollection );
    target = tokenize( targetStringCPP.c_str() );
  } else {
    target = tokenize( targetString.c_str() );
  }

  // tokenizing source (and potentially extract syntax spans)
  if (global.sourceSyntax) {
    string sourceStringCPP = string(sourceString);
    ProcessAndStripXMLTags( sourceStringCPP, sourceTree, sourceLabelCollection , sourceTopLabelCollection );
    source = tokenize( sourceStringCPP.c_str() );
  } else {
    source = tokenize( sourceString.c_str() );
  }

  // check if sentences are empty
  if (target.size() == 0 || source.size() == 0) {
    cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words in sentence " << sentenceID << endl;
    cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
    return 0;
  }

  // prepare data structures for alignments
  for (size_t i = 0; i < source.size(); ++i) {
    alignedCountS.push_back( 0 );
  }
  for (size_t i = 0; i < target.size(); ++i) {
    vector< int > dummy;
    alignedToT.push_back( dummy );
  }

  //InitTightest(m_s2tTightest, source.size());
  //InitTightest(m_t2sTightest, target.size());

  // reading in alignments
  vector<string> alignmentSequence = tokenize( alignmentString.c_str() );
  for (size_t i = 0; i < alignmentSequence.size(); ++i) {
    int s = -1, t = -1;
    // Both numbers must be parsed. sscanf returns the number of converted
    // items (or EOF), so testing for "non-zero" would accept a token such as
    // "3-" or an empty match and leave t unset.
    if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2) {
      cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
      return 0;
    }

    // negative indices are out of bounds as well
    if (s < 0 || t < 0
        || static_cast<size_t>(t) >= target.size()
        || static_cast<size_t>(s) >= source.size()) {
      cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
      return 0;
    }

    alignedToT[t].push_back( s );
    alignedCountS[s]++;
    //SetAlignment(s, t);
  }

  // add the default non-terminal labels to both trees
  bool mixed = global.mixed;
  sourceTree.AddDefaultNonTerms(global.sourceSyntax, mixed, source.size());
  targetTree.AddDefaultNonTerms(global.targetSyntax, mixed, target.size());

  //CalcTightestSpan(m_s2tTightest);
  //CalcTightestSpan(m_t2sTightest);
  return 1;
}
/*
void SentenceAlignment::InitTightest(Outer &tightest, size_t len)
{
tightest.resize(len);
for (size_t posOuter = 0; posOuter < len; ++posOuter)
{
Inner &inner = tightest[posOuter];
size_t innerSize = len - posOuter;
inner.resize(innerSize);
}
}
void SentenceAlignment::CalcTightestSpan(Outer &tightest)
{
size_t len = tightest.size();
for (size_t startPos = 0; startPos < len; ++startPos)
{
for (size_t endPos = startPos + 1; endPos < len; ++endPos)
{
const Range &prevRange = GetTightest(tightest, startPos, endPos - 1);
const Range &smallRange = GetTightest(tightest, endPos, endPos);
Range &newRange = GetTightest(tightest, startPos, endPos);
newRange.Merge(prevRange, smallRange);
//cerr << "[" << startPos << "-" << endPos << "] --> [" << newRange.GetStartPos() << "-" << newRange.GetEndPos() << "]";
}
}
}
Range &SentenceAlignment::GetTightest(Outer &tightest, size_t startPos, size_t endPos)
{
assert(endPos < tightest.size());
assert(endPos >= startPos);
Inner &inner = tightest[startPos];
size_t ind = endPos - startPos;
Range &ret = inner[ind];
return ret;
}
void SentenceAlignment::SetAlignment(size_t source, size_t target)
{
SetAlignment(m_s2tTightest, source, target);
SetAlignment(m_t2sTightest, target, source);
}
void SentenceAlignment::SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos)
{
Range &range = GetTightest(tightest, thisPos, thisPos);
if (range.GetStartPos() == NOT_FOUND)
{ // not yet set, do them both
assert(range.GetEndPos() == NOT_FOUND);
range.SetStartPos(thatPos);
range.SetEndPos(thatPos);
}
else
{
assert(range.GetEndPos() != NOT_FOUND);
range.SetStartPos( (range.GetStartPos() > thatPos) ? thatPos : range.GetStartPos() );
range.SetEndPos( (range.GetEndPos() < thatPos) ? thatPos : range.GetEndPos() );
}
}
*/
// Enumerates source/target span pairs that are consistent with the word
// alignment and records each one in a freshly allocated TunnelCollection.
//
// A pair (startS..endS, startT..endT) is recorded when:
//  - the target span is at most maxSpan words long,
//  - with target syntax enabled, targetTree has a node on the target span,
//  - at least one word of the target span is aligned,
//  - no source word between the outermost aligned source words is also
//    aligned to a target word outside the target span,
//  - both spans contain fewer than global.maxUnaligned unaligned words.
//
// m_tunnelCollection is overwritten without releasing any previous value.
void SentenceAlignment::FindTunnels(const Global &global )
{
int countT = target.size();
int countS = source.size();
// one limit is used for both sides: the larger of the two configured hole spans
int maxSpan = max(global.maxHoleSpanSourceDefault, global.maxHoleSpanSourceSyntax);
m_tunnelCollection = new TunnelCollection(countS);
// hand the per-word alignment counts to the collection; the target-side
// counts are derived from the size of each alignedToT entry
m_tunnelCollection->alignedCountS = alignedCountS;
m_tunnelCollection->alignedCountT.resize(alignedToT.size());
for (size_t ind = 0; ind < alignedToT.size(); ind++)
{
m_tunnelCollection->alignedCountT[ind] = alignedToT[ind].size();
}
// phrase repository for creating hiero phrases
// check alignments for target phrase startT...endT
for(int lengthT=1;
lengthT <= maxSpan && lengthT <= countT;
lengthT++) {
for(int startT=0; startT < countT-(lengthT-1); startT++) {
// that's nice to have
int endT = startT + lengthT - 1;
// if there is target side syntax, there has to be a node
if (global.targetSyntax && !targetTree.HasNode(startT,endT))
continue;
// find aligned source words
// first: find minimum and maximum source word
// NOTE(review): 9999 acts as a "no source word yet" sentinel; this assumes
// source positions stay below 9999 -- confirm for very long inputs
int minS = 9999;
int maxS = -1;
// usedS starts as the total alignment count of every source word and is
// decremented once per alignment point inside the target span; whatever
// remains afterwards counts alignment points outside the span
vector< int > usedS = alignedCountS;
for(int ti=startT;ti<=endT;ti++) {
for(int i=0;i<alignedToT[ti].size();i++) {
int si = alignedToT[ti][i];
// cerr << "point (" << si << ", " << ti << ")\n";
if (si<minS) { minS = si; }
if (si>maxS) { maxS = si; }
usedS[ si ]--;
}
}
// unaligned phrases are not allowed
if( maxS == -1 )
continue;
// source phrase has to be within limits
if( maxS-minS >= maxSpan )
{
continue;
}
// check if source words are aligned to out of bound target words
bool out_of_bounds = false;
for(int si=minS;si<=maxS && !out_of_bounds;si++)
{
if (usedS[si]>0) {
out_of_bounds = true;
}
}
// if out of bound, you gotta go
if (out_of_bounds)
continue;
// direction 1 selects the target-side counts in NumUnalignedWord
if (m_tunnelCollection->NumUnalignedWord(1, startT, endT) >= global.maxUnaligned)
continue;
// done with all the checks, lets go over all consistent phrase pairs
// start point of source phrase may retreat over unaligned
for(int startS=minS;
(startS>=0 &&
startS>maxS - maxSpan && // within length limit
(startS==minS || alignedCountS[startS]==0)); // unaligned
startS--)
{
// end point of source phrase may advance over unaligned
for(int endS=maxS;
(endS<countS && endS<startS + maxSpan && // within length limit
(endS==maxS || alignedCountS[endS]==0)); // unaligned
endS++)
{
// direction 0 selects the source-side counts in NumUnalignedWord
if (m_tunnelCollection->NumUnalignedWord(0, startS, endS) >= global.maxUnaligned)
continue;
// take note that this is a valid phrase alignment
m_tunnelCollection->Add(startS, endS, startT, endT);
}
}
}
}
//cerr << *tunnelCollection << endl;
}
// Allocates the lattice for this sentence and asks it to create the arcs
// leaving every source position, using the tunnels found by FindTunnels().
void SentenceAlignment::CreateLattice(const Global &global)
{
  const size_t numSourceWords = source.size();
  m_lattice = new Lattice(numSourceWords);

  size_t pos = 0;
  while (pos < numSourceWords)
  {
    m_lattice->CreateArcs(pos, *m_tunnelCollection, *this, global);
    ++pos;
  }
}
// Asks the lattice to create the rules that start at each source position,
// visiting the positions from left to right.
void SentenceAlignment::CreateRules(const Global &global)
{
  const size_t numSourceWords = source.size();

  size_t pos = 0;
  while (pos < numSourceWords)
  {
    m_lattice->CreateRules(pos, *this, global);
    ++pos;
  }
}
// Writes every word of vec to out, each one followed by a single space
// (so the output of a non-empty sentence ends with a trailing space).
void OutputSentenceStr(std::ostream &out, const std::vector<std::string> &vec)
{
  typedef std::vector<std::string>::const_iterator WordIter;
  for (WordIter word = vec.begin(); word != vec.end(); ++word)
  {
    out << *word << " ";
  }
}
// Debug dump of a sentence pair: "<target words> ==> <source words>" on one
// line, followed by the tunnel collection and the lattice when they exist.
std::ostream& operator<<(std::ostream &out, const SentenceAlignment &obj)
{
  OutputSentenceStr(out, obj.target);
  out << " ==> ";
  OutputSentenceStr(out, obj.source);
  out << std::endl;

  // The tunnel collection only exists once FindTunnels() has run; guard it
  // the same way the lattice is guarded so that printing an object early
  // does not dereference a null pointer.
  // NOTE(review): assumes the constructor null-initialises both pointers,
  // as the existing m_lattice check already does -- confirm.
  if (obj.m_tunnelCollection)
    out << *obj.m_tunnelCollection;

  if (obj.m_lattice)
    out << std::endl << *obj.m_lattice;

  return out;
}

View File

@ -0,0 +1,69 @@
#pragma once
/*
* SentenceAlignment.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <cassert>
#include <iostream>
#include "SyntaxTree.h"
#include "Global.h"
#include "Range.h"
class TunnelCollection;
class Lattice;
// One word-aligned sentence pair together with the structures derived from
// it (tunnel collection and lattice).
//
// Expected call order, as far as the member functions show: Create(), then
// FindTunnels() (allocates m_tunnelCollection), then CreateLattice()
// (allocates m_lattice and reads the tunnel collection), then CreateRules().
class SentenceAlignment
{
friend std::ostream& operator<<(std::ostream&, const SentenceAlignment&);
public:
// words of the target and of the source sentence
std::vector<std::string> target;
std::vector<std::string> source;
// alignedCountS[s]: number of alignment points that involve source word s
std::vector<int> alignedCountS;
// alignedToT[t]: source positions aligned to target word t
std::vector< std::vector<int> > alignedToT;
SyntaxTree sourceTree, targetTree;
//typedef std::vector<Range> Inner;
//typedef std::vector<Inner> Outer;
//Outer m_s2tTightest, m_t2sTightest;
SentenceAlignment();
~SentenceAlignment();
// Fills the object from the three input strings.
// Returns 1 on success and 0 when the sentence pair is rejected (empty
// side, unparsable alignment point, or alignment point out of bounds).
int Create(const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global);
// void clear() { delete(alignment); };
void FindTunnels( const Global &global ) ;
void CreateLattice(const Global &global);
void CreateRules(const Global &global);
// Only valid after FindTunnels(); asserts that the collection exists.
const TunnelCollection &GetTunnelCollection() const
{
assert(m_tunnelCollection);
return *m_tunnelCollection;
}
// Only valid after CreateLattice(); asserts that the lattice exists.
const Lattice &GetLattice() const
{
assert(m_lattice);
return *m_lattice;
}
protected:
// both are allocated with new by FindTunnels() / CreateLattice()
// NOTE(review): release presumably happens in the destructor -- confirm
TunnelCollection *m_tunnelCollection;
Lattice *m_lattice;
/*
void CalcTightestSpan(Outer &tightest);
void InitTightest(Outer &tightest, size_t len);
Range &GetTightest(Outer &tightest, size_t startPos, size_t endPos);
void SetAlignment(size_t source, size_t target);
void SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos);
*/
};

View File

@ -0,0 +1,101 @@
/*
* Symbol.cpp
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include "Symbol.h"
using namespace std;
// Builds a terminal symbol: a word `label` located at source position `pos`.
//
// Only m_span[0].first carries information for a terminal; the remaining
// span fields keep the zero values produced by m_span(2).
// The two syntax flags are meaningless for terminals but are set to false
// so that copying a Symbol never reads an uninitialised bool.
// Initialisers are listed in member declaration order.
Symbol::Symbol(const std::string &label, size_t pos)
  :m_label(label)
  ,m_span(2)
  ,m_isTerminal(true)
  ,m_isSourceSyntax(false)
  ,m_isTargetSyntax(false)
{
  m_span[0].first = pos;
}
// Builds a non-terminal symbol.
//   labelS / labelT   source-side and target-side labels
//   startS..endS      source span, stored in m_span[0]
//   startT..endT      target span, stored in m_span[1]
//   isSourceSyntax / isTargetSyntax   whether the respective label is a
//                     syntactic label (as opposed to the default one)
Symbol::Symbol(const std::string &labelS, const std::string &labelT
               , size_t startS, size_t endS
               , size_t startT, size_t endT
               , bool isSourceSyntax, bool isTargetSyntax)
  : m_label(labelS)
  , m_labelT(labelT)
  , m_span(2)
  , m_isTerminal(false)
  , m_isSourceSyntax(isSourceSyntax)
  , m_isTargetSyntax(isTargetSyntax)
{
  m_span[0].first = startS;
  m_span[0].second = endS;

  m_span[1].first = startT;
  m_span[1].second = endT;
}
// Three-way comparison of one side (source or target) of two non-terminals.
// Returns -1, 0 or +1.
//  - If only one of the two is syntactic, the syntactic one orders first.
//  - Two non-syntactic sides always compare equal (span and label ignored).
//  - Two syntactic sides are ordered by span, then by label.
int CompareNonTerm(bool thisIsSyntax, bool otherIsSyntax
                   , const std::pair<size_t, size_t> &thisSpan, const std::pair<size_t, size_t> &otherSpan
                   , std::string thisLabel, std::string otherLabel)
{
  if (thisIsSyntax && !otherIsSyntax)
    return -1;
  if (!thisIsSyntax && otherIsSyntax)
    return +1;

  // from here on both flags agree
  if (!thisIsSyntax)
    return 0;

  if (thisSpan < otherSpan)
    return -1;
  if (otherSpan < thisSpan)
    return +1;

  const int labelOrder = thisLabel.compare(otherLabel);
  if (labelOrder < 0)
    return -1;
  if (labelOrder > 0)
    return +1;
  return 0;
}
int Symbol::Compare(const Symbol &other) const
{
if (m_isTerminal != other.m_isTerminal)
return m_isTerminal ? -1 : +1;
assert(m_isTerminal == other.m_isTerminal);
if (m_isTerminal)
{ // compare labels & pos
if (m_span[0].first != other.m_span[0].first)
return (m_span[0].first < other.m_span[0].first) ? -1 : +1;
if (m_label != other.m_label)
return (m_label < other.m_label) ? -1 : +1;
}
else
{ // non terms
int ret = CompareNonTerm(m_isSourceSyntax, other.m_isSourceSyntax
,m_span[0], other.m_span[0]
,m_label, other.m_label);
if (ret != 0)
return ret;
ret = CompareNonTerm(m_isTargetSyntax, other.m_isTargetSyntax
,m_span[1], other.m_span[1]
,m_label, other.m_label);
if (ret != 0)
return ret;
}
return 0;
}
// Prints a terminal as its label, and a non-terminal as its source label
// immediately followed by its target label (written as one string).
std::ostream& operator<<(std::ostream &out, const Symbol &obj)
{
  if (!obj.m_isTerminal)
  {
    out << obj.m_label + obj.m_labelT;
    return out;
  }

  out << obj.m_label;
  return out;
}

View File

@ -0,0 +1,36 @@
#pragma once
/*
* Symbol.h
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <string>
#include <iostream>
#include <vector>
// One symbol of a rule: either a terminal (a word at a source position) or
// a non-terminal (a labelled pair of source and target spans).
class Symbol
{
friend std::ostream& operator<<(std::ostream &out, const Symbol &obj);
protected:
std::string m_label, m_labelT; // m_labelT only for non-term
// always holds 2 entries: [0] is the source span, [1] the target span;
// a terminal only sets m_span[0].first (its position)
std::vector<std::pair<size_t, size_t> > m_span;
// the two syntax flags are only passed in by the non-terminal constructor
bool m_isTerminal, m_isSourceSyntax, m_isTargetSyntax;
public:
// for terminals
Symbol(const std::string &label, size_t pos);
// for non-terminals
Symbol(const std::string &labelS, const std::string &labelT
, size_t startS, size_t endS
, size_t startT, size_t endT
, bool isSourceSyntax, bool isTargetSyntax);
// three-way comparison: returns -1, 0 or +1
int Compare(const Symbol &other) const;
};

View File

@ -0,0 +1,56 @@
/*
* SymbolSequence.cpp
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include <sstream>
#include "SymbolSequence.h"
using namespace std;
int SymbolSequence::Compare(const SymbolSequence &other) const
{
int ret;
size_t thisSize = GetSize();
size_t otherSize = other.GetSize();
if (thisSize != otherSize)
{
ret = (thisSize < otherSize) ? -1 : +1;
return ret;
}
else
{
assert(thisSize == otherSize);
for (size_t ind = 0; ind < thisSize; ++ind)
{
const Symbol &thisSymbol = GetSymbol(ind);
const Symbol &otherSymbol = other.GetSymbol(ind);
ret = thisSymbol.Compare(otherSymbol);
if (ret != 0)
{
return ret;
}
}
}
assert(ret == 0);
return ret;
}
std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj)
{
SymbolSequence::CollType::const_iterator iterSymbol;
for (iterSymbol = obj.m_coll.begin(); iterSymbol != obj.m_coll.end(); ++iterSymbol)
{
const Symbol &symbol = *iterSymbol;
out << symbol << " ";
}
return out;
}

View File

@ -0,0 +1,42 @@
#pragma once
/*
* SymbolSequence.h
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <iostream>
#include <vector>
#include "Symbol.h"
// An ordered list of Symbol objects, stored by value.
class SymbolSequence
{
friend std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj);
protected:
typedef std::vector<Symbol> CollType;
CollType m_coll;
public:
typedef CollType::iterator iterator;
typedef CollType::const_iterator const_iterator;
// read-only iteration over the symbols, first to last
const_iterator begin() const { return m_coll.begin(); }
const_iterator end() const { return m_coll.end(); }
// appends a copy of symbol to the end of the sequence
void Add(const Symbol &symbol)
{
m_coll.push_back(symbol);
}
size_t GetSize() const
{ return m_coll.size(); }
// unchecked access: ind must be smaller than GetSize()
const Symbol &GetSymbol(size_t ind) const
{ return m_coll[ind]; }
void Clear()
{ m_coll.clear(); }
// three-way comparison: returns -1, 0 or +1
int Compare(const SymbolSequence &other) const;
};

View File

@ -0,0 +1,245 @@
// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <cassert>
#include "SyntaxTree.h"
//#include "extract.h"
#include "Global.h"
//extern const Global g_debug;
extern const Global *g_global;
using namespace std;
// A node counts as syntactic unless its label is exactly "[X]".
bool SyntaxNode::IsSyntax() const
{
  return GetLabel() != "[X]";
}
// Creates an empty tree. m_defaultLHS is the "[X]" node handed out by
// GetNodesForLHS() for spans that carry no node.
// m_top is explicitly nulled so the object never holds an indeterminate
// pointer; m_emptyNode is default-constructed empty and needs no clear().
SyntaxTree::SyntaxTree()
  :m_top(0)
  ,m_defaultLHS(0,0, "[X]")
{
}
// Releases every node ever created by AddNode(); m_nodes holds all of them.
SyntaxTree::~SyntaxTree()
{
  for (SyntaxNodes::iterator node = m_nodes.begin(); node != m_nodes.end(); ++node)
  {
    delete *node;
  }
}
bool HasDuplicates(const SyntaxNodes &nodes)
{
string prevLabel;
SyntaxNodes::const_iterator iter;
for (iter = nodes.begin(); iter != nodes.end(); ++iter)
{
const SyntaxNode &node = **iter;
string label = node.GetLabel();
if (label == prevLabel)
return true;
}
return false;
}
void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
{
SyntaxNode* newNode = new SyntaxNode( startPos, endPos, "[" + label + "]");
m_nodes.push_back( newNode );
SyntaxNodes &nodesChart = m_index[ startPos ][ endPos ];
if (!g_global->uppermostOnly)
{
nodesChart.push_back( newNode );
//assert(!HasDuplicates(m_index[ startPos ][ endPos ]));
}
else
{
if (nodesChart.size() > 0)
{
assert(nodesChart.size() == 1);
//delete nodes[0];
nodesChart.resize(0);
}
assert(nodesChart.size() == 0);
nodesChart.push_back( newNode );
}
}
// For every span of at least two words that carries a node, computes how
// that span splits into child spans that also carry nodes.
//
// Returns one SplitPoints entry per such parent span. Each entry starts with
// the span's start position, followed by the position just past each child
// chosen left to right (the longest child starting at the current position).
//
// The sentence length is taken to be m_index.size(), i.e. the number of
// distinct start positions that have at least one node.
ParentNodes SyntaxTree::Parse() {
ParentNodes parents;
int size = m_index.size();
// looping through all spans of size >= 2
for( int length=2; length<=size; length++ )
{
for( int startPos = 0; startPos <= size-length; startPos++ )
{
if (HasNode( startPos, startPos+length-1 ))
{
// processing one (parent) span
//std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
SplitPoints splitPoints;
splitPoints.push_back( startPos );
//std::cerr << " " << startPos;
// while first is 1 the search below starts at length-1, so the first
// child can never be the parent span itself
int first = 1;
int covered = 0;
// NOTE(review): if no node starts at startPos+covered, a pass through
// the inner loop changes nothing and this loop does not terminate --
// presumably every word is expected to carry a node; confirm
while( covered < length )
{
// find largest covering subspan (child)
// starting at last covered position
for( int midPos=length-first; midPos>covered; midPos-- )
{
if( HasNode( startPos+covered, startPos+midPos-1 ) )
{
// raising covered to midPos makes the loop condition fail on the next
// check, so only the longest child is taken in each pass
covered = midPos;
splitPoints.push_back( startPos+covered );
// std::cerr << " " << ( startPos+covered );
first = 0;
}
}
}
// std::cerr << std::endl;
parents.push_back( splitPoints );
}
}
}
return parents;
}
// True when at least one node is registered for the span startPos..endPos.
bool SyntaxTree::HasNode( int startPos, int endPos ) const
{
  return !GetNodes( startPos, endPos ).empty();
}
// Returns the nodes registered for the span startPos..endPos, or a reference
// to the shared empty list when the span has none. Never inserts into the
// index.
const SyntaxNodes &SyntaxTree::GetNodes( int startPos, int endPos ) const
{
  const SyntaxTreeIndexIterator byStart = m_index.find( startPos );
  if (byStart != m_index.end())
  {
    const SyntaxTreeIndex2 &byEnd = byStart->second;
    const SyntaxTreeIndexIterator2 hit = byEnd.find( endPos );
    if (hit != byEnd.end())
      return hit->second;
  }
  return m_emptyNode;
}
// Renders the tree with operator<< and returns the text as a string.
std::string SyntaxTree::ToString() const
{
  std::stringstream rendered;
  rendered << *this;
  return rendered.str();
}
// Adds an "X" node on every span first..last with
// 0 <= first <= last < phraseSize.
void SyntaxTree::AddDefaultNonTerms(size_t phraseSize)
{
  for (size_t first = 0; first < phraseSize; ++first)
  {
    size_t last = first;
    while (last < phraseSize)
    {
      AddNode(first, last, "X");
      ++last;
    }
  }
}
// Entry point for adding the default "X" non-terminals.
//  - Without syntax on this side, X is added on every span.
//  - With syntax, the two-argument overload is called with
//    addEverywhere = !mixed.
void SyntaxTree::AddDefaultNonTerms(bool isSyntax, bool mixed, size_t phraseSize)
{
  if (!isSyntax)
  { // add X everywhere
    AddDefaultNonTerms(phraseSize);
    return;
  }

  const bool addEverywhere = !mixed;
  AddDefaultNonTerms(addEverywhere, phraseSize);
}
void SyntaxTree::AddDefaultNonTerms(bool addEverywhere, size_t phraseSize)
{
//cerr << "GetNumWords()=" << GetNumWords() << endl;
//assert(phraseSize == GetNumWords() || GetNumWords() == 1); // 1 if syntax sentence doesn't have any xml. TODO fix syntax tree obj
for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
{
for (size_t endPos = startPos; endPos <= phraseSize; ++endPos)
{
const SyntaxNodes &nodes = GetNodes(startPos, endPos);
if (!addEverywhere && nodes.size() > 0)
{ // only add if no label
continue;
}
AddNode(startPos, endPos, "X");
}
}
}
// Returns a copy of the nodes registered for the span, or a one-element list
// holding the default "[X]" node when the span has none.
const SyntaxNodes SyntaxTree::GetNodesForLHS( int startPos, int endPos ) const
{
  const SyntaxNodes &found = GetNodes(startPos, endPos);
  if (!found.empty())
    return found;

  SyntaxNodes fallback;
  fallback.push_back(&m_defaultLHS);
  return fallback;
}
// Prints the span chart, one row per span length starting with length 1.
// Each row is indented by `length` spaces and then shows, for every start
// position, either the first node's label padded with '#' / cut to 7
// characters, or "-------" when the span has no node.
std::ostream& operator<<(std::ostream& os, const SyntaxTree& t)
{
  const size_t numWords = t.m_index.size();
  for (size_t length = 1; length <= numWords; ++length)
  {
    for (size_t indent = 0; indent < length; ++indent)
    {
      os << " ";
    }

    for (size_t start = 0; start + length <= numWords; ++start)
    {
      const size_t end = start + (length - 1);
      if (!t.HasNode( start, end ))
      {
        os << "------- ";
        continue;
      }
      const std::string padded = t.GetNodes( start, end )[0]->GetLabel() + "#######";
      os << padded.substr(0,7) << " ";
    }
    os << std::endl;
  }
  return os;
}

View File

@ -0,0 +1,96 @@
#pragma once
// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <string>
#include <vector>
#include <map>
#include <sstream>
class SyntaxNode;
typedef std::vector<const SyntaxNode*> SyntaxNodes;

// One labelled node of a syntax tree, covering the words m_start..m_end.
class SyntaxNode {
protected:
  int m_start, m_end;
  std::string m_label;
  SyntaxNodes m_children;
  SyntaxNode* m_parent;
public:
  // m_parent is nulled explicitly: it used to be left uninitialised, which
  // made every copy of a node read an indeterminate pointer.
  SyntaxNode( int startPos, int endPos, const std::string &label)
    :m_start(startPos)
    ,m_end(endPos)
    ,m_label(label)
    ,m_parent(0)
  {}
  // first word position of the span
  int GetStart() const
  { return m_start; }
  // last word position of the span
  int GetEnd() const
  { return m_end; }
  // label exactly as passed to the constructor
  const std::string &GetLabel() const
  { return m_label; }
  bool IsSyntax() const;
};
typedef std::vector< int > SplitPoints;
typedef std::vector< SplitPoints > ParentNodes;
// Collection of labelled spans over one sentence, indexed by start and end
// position. The tree owns every node created through AddNode() and deletes
// them in its destructor.
// NOTE(review): copying is not disabled, so a copy would delete the same
// nodes twice -- confirm no caller copies a SyntaxTree.
class SyntaxTree {
protected:
  SyntaxNodes m_nodes;        // every node ever added (owned)
  SyntaxNode* m_top;
  SyntaxNode m_defaultLHS;    // "[X]" node returned for spans without a node

  typedef std::map< int, SyntaxNodes > SyntaxTreeIndex2;
  typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
  typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
  typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
  SyntaxTreeIndex m_index;    // start position -> end position -> nodes
  SyntaxNodes m_emptyNode;    // shared empty result of GetNodes()

  friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);

public:
  SyntaxTree();
  ~SyntaxTree();

  // adds a node labelled "[label]" on the span startPos..endPos
  void AddNode( int startPos, int endPos, std::string label );
  ParentNodes Parse();
  bool HasNode( int startPos, int endPos ) const;
  // nodes on the span, or an empty list
  const SyntaxNodes &GetNodes( int startPos, int endPos ) const;
  const SyntaxNodes &GetAllNodes() const { return m_nodes; }
  // number of distinct start positions present in the index
  size_t GetNumWords() const { return m_index.size(); }
  std::string ToString() const;

  // Parameter names follow the definitions: the three-argument overload
  // receives the `mixed` setting and forwards addEverywhere = !mixed to the
  // two-argument overload. (The names used to be swapped here.)
  void AddDefaultNonTerms(bool isSyntax, bool mixed, size_t phraseSize);
  void AddDefaultNonTerms(bool addEverywhere, size_t phraseSize);
  void AddDefaultNonTerms(size_t phraseSize);

  // like GetNodes(), but falls back to the default "[X]" node
  const SyntaxNodes GetNodesForLHS( int startPos, int endPos ) const;
};
std::ostream& operator<<(std::ostream&, const SyntaxTree&);

View File

@ -0,0 +1,38 @@
/*
* Tunnel.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "Tunnel.h"
// Orders two tunnels by source range first and by target range second,
// returning whatever Range::Compare returns for the deciding range.
int Tunnel::Compare(const Tunnel &other) const
{
  const int sourceOrder = m_sourceRange.Compare(other.m_sourceRange);
  if (sourceOrder == 0)
  {
    return m_targetRange.Compare(other.m_targetRange);
  }
  return sourceOrder;
}
int Tunnel::Compare(const Tunnel &other, size_t direction) const
{
const Range &thisRange = (direction == 0) ? m_sourceRange : m_targetRange;
const Range &otherRange = (direction == 0) ? other.m_sourceRange : other.m_targetRange;
int ret = thisRange.Compare(otherRange);
return ret;
}
// Prints a tunnel as "<source range>==><target range>".
std::ostream& operator<<(std::ostream &out, const Tunnel &tunnel)
{
  out << tunnel.m_sourceRange;
  out << "==>";
  out << tunnel.m_targetRange;
  return out;
}

View File

@ -0,0 +1,49 @@
#pragma once
/*
* Tunnel.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <cassert>
#include <string>
#include <iostream>
#include "Range.h"
// A pair of ranges, one on the source side and one on the target side.
class Tunnel
{
  friend std::ostream& operator<<(std::ostream&, const Tunnel&);

protected:
  Range m_sourceRange, m_targetRange;

public:
  Tunnel()
  {}

  Tunnel(const Tunnel &copy)
    : m_sourceRange(copy.m_sourceRange)
    , m_targetRange(copy.m_targetRange)
  {}

  Tunnel(const Range &sourceRange, const Range &targetRange)
    : m_sourceRange(sourceRange)
    , m_targetRange(targetRange)
  {}

  // direction 0 selects the source range, any other value the target range
  const Range &GetRange(size_t direction) const
  {
    if (direction == 0)
      return m_sourceRange;
    return m_targetRange;
  }

  // compares source ranges first, then target ranges
  int Compare(const Tunnel &other) const;
  // compares only the range selected by direction
  int Compare(const Tunnel &other, size_t direction) const;
};
typedef std::vector<Tunnel> TunnelList;

View File

@ -0,0 +1,70 @@
/*
* TunnelCollection.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "TunnelCollection.h"
#include "Range.h"
using namespace std;
// Counts the words in startPos..endPos (inclusive) whose alignment count is
// zero. direction 0 reads the source-side counts, direction 1 the
// target-side counts; any other direction yields 0.
size_t TunnelCollection::NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const
{
  assert(startPos <= endPos);

  // pick the count table once instead of re-testing direction per word
  const std::vector<int> *counts = 0;
  if (direction == 0)
  {
    assert(endPos < alignedCountS.size());
    counts = &alignedCountS;
  }
  else
  {
    assert(endPos < alignedCountT.size());
    if (direction == 1)
      counts = &alignedCountT;
  }

  size_t numUnaligned = 0;
  if (counts != 0)
  {
    for (size_t pos = startPos; pos <= endPos; ++pos)
    {
      if ((*counts)[pos] == 0)
        ++numUnaligned;
    }
  }
  return numUnaligned;
}
void TunnelCollection::Add(int startS, int endS, int startT, int endT)
{
// m_phraseExist[startS][endS - startS].push_back(Tunnel(startT, endT));
m_coll[startS][endS - startS].push_back(Tunnel(Range(startS, endS), Range(startT, endT)));
}
std::ostream& operator<<(std::ostream &out, const TunnelCollection &TunnelCollection)
{
size_t size = TunnelCollection.GetSize();
for (size_t startPos = 0; startPos < size; ++startPos)
{
for (size_t endPos = startPos; endPos < size; ++endPos)
{
const TunnelList &tunnelList = TunnelCollection.GetTunnels(startPos, endPos);
TunnelList::const_iterator iter;
for (iter = tunnelList.begin(); iter != tunnelList.end(); ++iter)
{
const Tunnel &tunnel = *iter;
out << tunnel << " ";
}
}
}
return out;
}

View File

@ -0,0 +1,61 @@
#pragma once
/*
* TunnelCollection.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include "Tunnel.h"
// Repository of extracted phrase pairs,
// which are potential tunnels in larger phrase pairs.
class TunnelCollection
{
  friend std::ostream& operator<<(std::ostream&, const TunnelCollection&);

protected:
  // m_coll[start][length - 1] lists the tunnels whose source span starts at
  // `start` and covers `length` words
  std::vector< std::vector<TunnelList> > m_coll;

public:
  // per-word alignment counts for the source and the target sentence
  std::vector<int> alignedCountS, alignedCountT;

  // declared only; no definition is visible in this header
  TunnelCollection(const TunnelCollection &);

  // size is the length of the source sentence; creates an empty tunnel list
  // for every source span that fits into the sentence
  TunnelCollection(size_t size)
    : m_coll(size)
  {
    for (size_t pos = 0; pos < size; ++pos)
    {
      m_coll[pos].resize(size - pos);
    }
  }

  void Add(int startS, int endS, int startT, int endT);

  // tunnels stored for the source span startS..endS (unchecked access)
  const TunnelList &GetTunnels(int startS, int endS) const
  {
    return m_coll[startS][endS - startS];
  }

  // length of the source sentence the collection was created for
  const size_t GetSize() const
  {
    return m_coll.size();
  }

  size_t NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const;
};

View File

@ -0,0 +1,344 @@
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <vector>
#include <string>
#include <set>
#include <iostream>
#include <stdlib.h>
#include "SyntaxTree.h"
using namespace std;
// Splits str at any character contained in delimiters and returns the
// non-empty pieces in order. Runs of delimiters, as well as leading and
// trailing delimiters, produce no empty tokens.
inline std::vector<std::string> Tokenize(const std::string& str,
    const std::string& delimiters = " \t")
{
  std::vector<std::string> pieces;

  // begin always points at the first character of the next token
  std::string::size_type begin = str.find_first_not_of(delimiters, 0);
  while (begin != std::string::npos)
  {
    // end is the delimiter that closes the token, or npos for the last one
    const std::string::size_type end = str.find_first_of(delimiters, begin);
    pieces.push_back(str.substr(begin, end - begin));
    begin = str.find_first_not_of(delimiters, end);
  }

  return pieces;
}
// Returns str without the leading and trailing characters that occur in
// dropChars. A string made up only of such characters yields "".
const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type first = str.find_first_not_of(dropChars);
  if (first == std::string::npos)
  {
    return std::string();
  }
  const std::string::size_type last = str.find_last_not_of(dropChars);
  return str.substr(first, last - first + 1);
}
/**
 * Extract the value of attributeName="value" from the text of an XML tag.
 *
 * \param tag text of the tag (or of its attribute list)
 * \param attributeName name of the attribute to look up
 * \return the text between the quotes, "" when the attribute is absent or
 *         its closing quote is missing (the latter is reported on stderr)
 *
 * A quote preceded by a backslash does not end the value; the backslash
 * itself is kept in the result.
 */
string ParseXmlTagAttribute(const string& tag,const string& attributeName){
  /*TODO deal with unescaping \"*/
  const std::string tagOpen = attributeName + "=\"";
  std::string::size_type contentsStart = tag.find(tagOpen);
  if (contentsStart == std::string::npos) return "";
  contentsStart += tagOpen.size();

  // Search for the closing quote from the first character of the value.
  // Starting one character later (as before) skipped the closing quote of an
  // empty value (attr="") and ran on into the next attribute.
  std::string::size_type contentsEnd = tag.find_first_of('"',contentsStart);
  if (contentsEnd == std::string::npos) {
    cerr << "Malformed XML attribute: "<< tag;
    return "";
  }

  // step over escaped quotes (\") until an unescaped one is found or the
  // tag text ends
  std::string::size_type possibleEnd;
  while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart,contentsEnd-contentsStart);
}
/**
 * Remove the enclosing "<" and ">" from an XML tag.
 * Anything that is shorter than two characters, or that does not both start
 * with "<" and end with ">", is returned unchanged.
 *
 * \param str xml token to be stripped
 */
string TrimXml(const string& str)
{
  const std::string::size_type len = str.size();
  const bool bracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';
  if (!bracketed)
  {
    return str;
  }
  return str.substr(1, len - 2);
}
/**
 * Check if the token is an XML tag, i.e. starts with "<".
 * An empty token is not a tag.
 *
 * \param tag token to be checked
 */
bool isXmlTag(const string& tag)
{
  if (tag.empty())
  {
    return false;
  }
  return tag[0] == '<';
}
/**
 * Split up the input character string into tokens made up of
 * either XML tags or text.
 * example: this <b> is a </b> test .
 * => (this ), (<b>), ( is a ), (</b>), ( test .)
 *
 * If a "<" has no matching ">", an error is written to stderr and only the
 * tokens collected before that point are returned.
 *
 * \param str input string
 */
inline vector<string> TokenizeXml(const string& str)
{
  std::vector<std::string> pieces;      // tokens to be returned
  std::string::size_type cursor = 0;    // first character not yet consumed

  while (cursor != str.size())
  {
    // next opening "<" at or after the cursor
    const std::string::size_type open = str.find_first_of("<", cursor);
    if (open == std::string::npos)
    {
      // no more tags: the remainder is one text token
      pieces.push_back(str.substr(cursor));
      break;
    }

    // the tag has to be closed by a ">"
    const std::string::size_type close = str.find_first_of(">", open);
    if (close == std::string::npos)
    {
      cerr << "ERROR: malformed XML: " << str << endl;
      return pieces;
    }

    // text in front of the tag, if there is any
    if (open != cursor)
    {
      pieces.push_back(str.substr(cursor, open - cursor));
    }

    // the tag itself, brackets included
    pieces.push_back(str.substr(open, close - open + 1));
    cursor = close + 1;
  }

  return pieces;
}
/**
* Process a sentence with xml annotation
* Xml tags may specifiy additional/replacing translation options
* and reordering constraints
*
* \param line in: sentence, out: sentence without the xml
* \param res vector with translation options specified by xml
* \param reorderingConstraint reordering constraint zones specified by xml
* \param walls reordering constraint walls specified by xml
*/
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
is so we can link things up afterwards. We can't create TranslationOptions as we
parse because we don't have the completed source parsed until after this function
removes all the markup from it (CreateFromString in Sentence::Read).
*/
/**
 * Strip inline XML markup from one sentence and record the annotated spans.
 *
 * The line is split into markup tokens and text tokens. Text is accumulated
 * into a markup-free sentence; every completed tag (closing tag or unary tag)
 * becomes one node in `tree` covering the words between its opening and
 * closing position, unless a `span="i-j"` / `span="i"` attribute overrides
 * that position.
 *
 * @param line                in: sentence with markup; out: sentence without
 *                            markup (only overwritten when true is returned)
 * @param tree                receives one node per completed tag
 * @param labelCollection     receives the `label` attribute of every completed tag
 * @param topLabelCollection  occurrence count of the labels of the nodes that
 *                            cover the whole sentence
 * @return false if the markup is malformed (a message is written to cerr);
 *         in that case `tree` and `labelCollection` may already have been
 *         partially updated. true otherwise.
 */
bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) {
  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // opened tags are kept until they are closed;
  // each entry is a triple (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack of currently open tags
  string cleanLine;             // text without xml, handed back through `line`
  size_t wordPos = 0;           // position in sentence (in terms of number of words)

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if (!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos];  // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
      continue;
    }

    // *** get essential information about tag ***

    // strip extra boundary spaces and "<" and ">"
    string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
    // cerr << "XML TAG IS: " << tag << std::endl;

    if (tag.size() == 0) {
      cerr << "ERROR: empty tag name: " << line << endl;
      return false;
    }

    // check if unary (e.g., "<wall/>")
    bool isUnary = ( tag[tag.size() - 1] == '/' );

    // check if opening tag (e.g. "<a>", not "</a>")
    bool isClosed = ( tag[0] == '/' );
    bool isOpen = !isClosed;

    if (isClosed && isUnary) {
      cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
      return false;
    }

    if (isClosed)
      tag = tag.substr(1); // remove "/" at the beginning
    if (isUnary)
      tag = tag.substr(0,tag.size()-1); // remove "/" at the end

    // find the tag name and contents
    string::size_type endOfName = tag.find_first_of(' ');
    string tagName = tag;
    string tagContent = "";
    if (endOfName != string::npos) {
      tagName = tag.substr(0,endOfName);
      tagContent = tag.substr(endOfName+1);
    }

    // *** process new tag ***

    if (isOpen || isUnary) {
      // put the tag on the tag stack
      OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
      tagStack.push_back( openedTag );
      // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
    }

    // *** process completed tag ***

    if (isClosed || isUnary) {
      // pop last opened tag from stack;
      if (tagStack.size() == 0) {
        cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
        return false;
      }
      OpenedTag openedTag = tagStack.back();
      tagStack.pop_back();

      // tag names have to match
      if (openedTag.first != tagName) {
        cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
        return false;
      }

      // assemble remaining information about tag;
      // the attributes come from the *opening* tag, not from the closing one
      size_t startPos = openedTag.second.first;
      const string openedContent = openedTag.second.second;
      size_t endPos = wordPos;

      // span attribute overwrites position
      string span = ParseXmlTagAttribute(openedContent,"span");
      if (! span.empty()) {
        vector<string> ij = Tokenize(span, "-");
        if (ij.size() != 1 && ij.size() != 2) {
          cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
          return false;
        }
        startPos = atoi(ij[0].c_str());
        if (ij.size() == 1) endPos = startPos + 1;
        else endPos = atoi(ij[1].c_str()) + 1;
      }

      // cerr << "XML TAG " << tagName << " (" << openedContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;

      if (startPos >= endPos) {
        cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
        return false;
      }

      string label = ParseXmlTagAttribute(openedContent,"label");
      labelCollection.insert( label );

      // cerr << "XML TAG NAME IS: '" << tagName << "', LABEL '" << label << "', SPAN " << startPos << "-" << (endPos-1) << endl;

      tree.AddNode( startPos, endPos-1, label );
    }
  }

  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    cerr << "ERROR: some opened tags were never closed: " << line << endl;
    return false;
  }

  // collect top labels.
  // wordPos is unsigned: with an empty sentence "wordPos-1" would wrap around
  // to a huge index, so there is nothing to look up in that case.
  if (wordPos > 0) {
    const SyntaxNodes &topNodes = tree.GetNodes( 0, wordPos-1 );
    for( SyntaxNodes::const_iterator node = topNodes.begin(); node != topNodes.end(); ++node ) {
      const SyntaxNode *n = *node;
      const string &label = n->GetLabel();
      // operator[] value-initialises a missing counter to 0 before the increment
      ++topLabelCollection[ label ];
    }
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}

View File

@ -0,0 +1,35 @@
#pragma once
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <string>
#include <vector>
#include <set>
#include <map>
#include "SyntaxTree.h"
// Returns, as a string, what the attribute `attributeName` holds inside the
// attribute text `tag`. ProcessAndStripXMLTags treats an empty result as
// "attribute not given".
// NOTE(review): exact quoting rules are in the definition, not visible here.
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
// Presumably removes the surrounding "<" and ">" of a markup token (that is how
// ProcessAndStripXMLTags uses it) -- confirm against the definition.
std::string TrimXml(const std::string& str);
// True if the token is markup rather than plain text; used to split the
// tokens produced by TokenizeXml into tags and words.
bool isXmlTag(const std::string& tag);
// Splits a line into a sequence of markup tokens and text tokens,
// e.g. (this), (<b>), (is a), (</b>), (test .)
// NOTE(review): declared `inline` but no definition is visible in this header;
// an inline function must be defined in every translation unit that calls it.
inline std::vector<std::string> TokenizeXml(const std::string& str);
// Strips the markup from `line`, adds one node per completed tag to `tree`,
// collects the labels seen, and counts the labels of the sentence-spanning
// nodes. Returns false on malformed markup.
bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection );

View File

@ -0,0 +1,310 @@
// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cstdio>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <cstring>
#include <sstream>
#include <iostream>
#include "extract.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "Lattice.h"
#ifdef WIN32
// Include Visual Leak Detector
#include <vld.h>
#endif
using namespace std;
void writeGlueGrammar(const string &, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
int main(int argc, char* argv[])
{
cerr << "Extract v2.0, written by Philipp Koehn\n"
<< "rule extraction from an aligned parallel corpus\n";
//time_t starttime = time(NULL);
Global *global = new Global();
g_global = global;
int sentenceOffset = 0;
if (argc < 5) {
cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
<< " [ --Hierarchical | --Orientation"
<< " | --GlueGrammar FILE | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
<< " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
<< " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
<< " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
<< " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"
<< " | --MaxSymbols[" << global->maxSymbols<< "]"
<< " | --MaxNonTerm[" << global->maxNonTerm << "]"
<< " | --SourceSyntax | --TargetSyntax"
<< " | --UppermostOnly[" << g_global->uppermostOnly << "]"
<< endl;
exit(1);
}
char* &fileNameT = argv[1];
char* &fileNameS = argv[2];
char* &fileNameA = argv[3];
string fileNameGlueGrammar;
string fileNameUnknownWordLabel;
string fileNameExtract = string(argv[4]);
int optionInd = 5;
for(int i=optionInd;i<argc;i++)
{
if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
global->minHoleSpanSourceDefault = atoi(argv[++i]);
if (global->minHoleSpanSourceDefault < 1) {
cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
global->maxHoleSpanSourceDefault = atoi(argv[++i]);
if (global->maxHoleSpanSourceDefault < 1) {
cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
global->minHoleSpanSourceSyntax = atoi(argv[++i]);
if (global->minHoleSpanSourceSyntax < 1) {
cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--UppermostOnly") == 0) {
global->uppermostOnly = atoi(argv[++i]);
}
else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
if (global->maxHoleSpanSourceSyntax < 1) {
cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
exit(1);
}
}
// maximum number of words in hierarchical phrase
else if (strcmp(argv[i],"--maxSymbols") == 0) {
global->maxSymbols = atoi(argv[++i]);
if (global->maxSymbols < 1) {
cerr << "extract error: --maxSymbols should be at least 1" << endl;
exit(1);
}
}
// maximum number of non-terminals
else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
global->maxNonTerm = atoi(argv[++i]);
if (global->maxNonTerm < 1) {
cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
exit(1);
}
}
// allow consecutive non-terminals (X Y | X Y)
else if (strcmp(argv[i],"--TargetSyntax") == 0) {
global->targetSyntax = true;
}
else if (strcmp(argv[i],"--SourceSyntax") == 0) {
global->sourceSyntax = true;
}
// do not create many part00xx files!
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
// now default
}
else if (strcmp(argv[i],"--GlueGrammar") == 0) {
global->glueGrammarFlag = true;
if (++i >= argc)
{
cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
exit(0);
}
fileNameGlueGrammar = string(argv[i]);
cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
}
else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
global->unknownWordLabelFlag = true;
if (++i >= argc)
{
cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
exit(0);
}
fileNameUnknownWordLabel = string(argv[i]);
cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
}
// TODO: this should be a useful option
//else if (strcmp(argv[i],"--ZipFiles") == 0) {
// zipFiles = true;
//}
// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
else if (strcmp(argv[i],"--Mixed") == 0) {
global->mixed = true;
}
else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
global->allowDefaultNonTermEdge = atoi(argv[++i]);
}
else if (strcmp(argv[i], "--GZOutput") == 0) {
global->gzOutput = true;
}
else if (strcmp(argv[i],"--MaxSpan") == 0) {
// ignore
++i;
}
else if (strcmp(argv[i],"--SentenceOffset") == 0) {
if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
exit(1);
}
sentenceOffset = atoi(argv[++i]);
}
else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
}
}
// open input files
Moses::InputFileStream tFile(fileNameT);
Moses::InputFileStream sFile(fileNameS);
Moses::InputFileStream aFile(fileNameA);
// open output files
string fileNameExtractInv = fileNameExtract + ".inv";
if (global->gzOutput) {
fileNameExtract += ".gz";
fileNameExtractInv += ".gz";
}
Moses::OutputFileStream extractFile;
Moses::OutputFileStream extractFileInv;
extractFile.Open(fileNameExtract.c_str());
extractFileInv.Open(fileNameExtractInv.c_str());
// loop through all sentence pairs
int i = sentenceOffset;
while(true) {
i++;
if (i % 1000 == 0) {
cerr << i << " " << flush;
}
string targetString;
string sourceString;
string alignmentString;
bool ok = getline(tFile, targetString);
if (!ok)
break;
getline(sFile, sourceString);
getline(aFile, alignmentString);
//cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;
//time_t currTime = time(NULL);
//cerr << "A " << (currTime - starttime) << endl;
SentenceAlignment sentencePair;
if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global ))
{
//cerr << sentence.sourceTree << endl;
//cerr << sentence.targetTree << endl;
sentencePair.FindTunnels(*g_global);
//cerr << "C " << (time(NULL) - starttime) << endl;
//cerr << sentencePair << endl;
sentencePair.CreateLattice(*g_global);
//cerr << "D " << (time(NULL) - starttime) << endl;
//cerr << sentencePair << endl;
sentencePair.CreateRules(*g_global);
//cerr << "E " << (time(NULL) - starttime) << endl;
//cerr << sentence.lattice->GetRules().GetSize() << endl;
sentencePair.GetLattice().GetRules().Output(extractFile);
sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
}
}
tFile.Close();
sFile.Close();
aFile.Close();
extractFile.Close();
extractFileInv.Close();
if (global->glueGrammarFlag) {
writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
}
delete global;
}
void writeGlueGrammar( const string & fileName, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
{
ofstream grammarFile;
grammarFile.open(fileName.c_str());
if (!options.targetSyntax) {
grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
<< "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
} else {
// chose a top label that is not already a label
string topLabel = "QQQQQQ";
for( unsigned int i=1; i<=topLabel.length(); i++) {
if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
topLabel = topLabel.substr(0,i);
break;
}
}
// basic rules
grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << endl
<< "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << endl;
// top rules
for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
i != targetTopLabelCollection.end(); i++ ) {
grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << endl;
}
// glue rules
for( set<string>::const_iterator i = targetLabelCollection.begin();
i != targetLabelCollection.end(); i++ ) {
grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << endl;
}
grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << endl; // glue rule for unknown word...
}
grammarFile.close();
}

View File

@ -0,0 +1,34 @@
#pragma once
#include <vector>
#include <list>
#include <map>
#include <set>
#include <string>
#include <fstream>
#include <algorithm>
#include "SyntaxTree.h"
#include "XmlTree.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "SentenceAlignment.h"
#include "Global.h"
// Splits a NUL-terminated C string into words separated by spaces and/or tabs
// (definition in tables-core.cpp).
std::vector<std::string> tokenize( const char [] );
// SAFE_GETLINE(stream, buffer, bufferSize, delimiter)
// Reads one delimiter-terminated line from `stream` into the char buffer.
// - A failbit that is set without badbit/eofbit is cleared again.
// - If exactly bufferSize-1 characters were extracted the line is treated as
//   too long: a message is printed and the program exits with status 1.
// The macro expands to a plain { ... } block and uses cerr, endl and exit
// unqualified, so those names must be visible where it is used.
// NOTE(review): the message points at phrase-extract/extract.cpp, but
// LINE_MAX_LENGTH is defined directly below in this header.
// (No comments inside the macro body: a "//" comment on a continued line
// would swallow the following line.)
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
_IS.getline(_LINE, _SIZE, _DELIM); \
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
if (_IS.gcount() == _SIZE-1) { \
cerr << "Line too long! Buffer overflow. Delete lines >=" \
<< _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
<< endl; \
exit(1); \
} \
}
// Buffer size (in chars) intended for use with SAFE_GETLINE.
#define LINE_MAX_LENGTH 1000000
// NOTE(review): the three lines below are *definitions* of namespace-scope
// variables with external linkage inside a header. Including this header from
// more than one translation unit gives multiple-definition link errors; the
// usual remedy is `extern` declarations here plus one definition in a .cpp.

// Program-wide read-only view of the options; main() points it at the Global
// object it allocates.
const Global *g_global;
// Label sets / top-label counts for each side; the target-side ones are what
// main() hands to writeGlueGrammar().
// NOTE(review): where these get filled is not visible in this header.
std::set< std::string > targetLabelCollection, sourceLabelCollection;
std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;

View File

@ -0,0 +1,81 @@
#ifndef moses_gzfile_buf_h
#define moses_gzfile_buf_h
#include <streambuf>
#include <zlib.h>
#include <cstring>
class gzfilebuf : public std::streambuf {
public:
gzfilebuf(const char *filename)
{ _gzf = gzopen(filename, "rb");
setg (_buff+sizeof(int), // beginning of putback area
_buff+sizeof(int), // read position
_buff+sizeof(int)); // end position
}
~gzfilebuf() { gzclose(_gzf); }
protected:
virtual int_type overflow (int_type c) {
throw;
}
// write multiple characters
virtual
std::streamsize xsputn (const char* s,
std::streamsize num) {
throw;
}
virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw;
}
//read one character
virtual int_type underflow () {
// is read position before end of _buff?
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
}
/* process size of putback area
* - use number of characters read
* - but at most four
*/
unsigned int numPutback = gptr() - eback();
if (numPutback > sizeof(int)) {
numPutback = sizeof(int);
}
/* copy up to four characters previously read into
* the putback _buff (area of first four characters)
*/
std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
numPutback);
// read new characters
int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
if (num <= 0) {
// ERROR or EOF
return EOF;
}
// reset _buff pointers
setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
_buff+sizeof(int), // read position
_buff+sizeof(int)+num); // end of buffer
// return next character
return traits_type::to_int_type(*gptr());
}
std::streamsize xsgetn (char* s,
std::streamsize num) {
return gzread(_gzf,s,num);
}
private:
gzFile _gzf;
static const unsigned int _buffsize = 1024;
char _buff[_buffsize];
};
#endif

View File

@ -0,0 +1,110 @@
// $Id: tables-core.cpp 3131 2010-04-13 16:29:55Z pjwilliams $
//#include "beammain.h"
//#include "SafeGetLine.h"
#include "tables-core.h"
#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"
// as in beamdecoder/tables.cpp
// Split a NUL-terminated string into words. Words are maximal runs of
// characters that are neither a space nor a tab; empty words are never
// produced.
std::vector<std::string> tokenize( const char* input ) {
  std::vector<std::string> words;
  const char *cursor = input;
  while (*cursor != '\0') {
    // skip the separators in front of the next word
    while (*cursor == ' ' || *cursor == '\t') {
      ++cursor;
    }
    if (*cursor == '\0') {
      break;
    }
    // consume the word itself
    const char *wordBegin = cursor;
    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t') {
      ++cursor;
    }
    words.push_back( std::string( wordBegin, cursor - wordBegin ) );
  }
  return words;
}
// Return the id of `word`, registering it first if it has not been seen.
// New ids are handed out consecutively: the id is the word's index in `vocab`.
WORD_ID Vocabulary::storeIfNew( const WORD& word ) {
  const std::map<WORD, WORD_ID>::iterator known = lookup.find( word );
  if( known == lookup.end() ) {
    const WORD_ID freshId = vocab.size();
    vocab.push_back( word );
    lookup[ word ] = freshId;
    return freshId;
  }
  return known->second;
}
// Look up the id of `word` without registering it.
// Unknown words yield 0 -- note that 0 is also the id storeIfNew() gives to
// the first word stored, so the two cases cannot be told apart by the result.
WORD_ID Vocabulary::getWordID( const WORD& word ) {
  const std::map<WORD, WORD_ID>::iterator entry = lookup.find( word );
  if( entry != lookup.end() ) {
    return entry->second;
  }
  return 0;
}
// Return the id of `phrase`, registering it first if it has not been seen.
// New ids are handed out consecutively: the id is the phrase's index in
// `phraseTable`.
PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase ) {
  const std::map< PHRASE, PHRASE_ID >::iterator known = lookup.find( phrase );
  if( known == lookup.end() ) {
    const PHRASE_ID freshId = phraseTable.size();
    phraseTable.push_back( phrase );
    lookup[ phrase ] = freshId;
    return freshId;
  }
  return known->second;
}
// Look up the id of `phrase` without registering it.
// Unknown phrases yield 0 -- note that 0 is also the id storeIfNew() gives to
// the first phrase stored, so the two cases cannot be told apart by the result.
PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase ) {
  const std::map< PHRASE, PHRASE_ID >::iterator entry = lookup.find( phrase );
  if( entry != lookup.end() ) {
    return entry->second;
  }
  return 0;
}
// Forget every stored phrase: empties both the phrase list and the
// phrase -> id index.
void PhraseTable::clear() {
  phraseTable.clear();
  lookup.clear();
}
// Fill the table with default scores: every distortion d in -10 .. 9
// (the upper bound 10 itself is not included) gets the score -|d|.
void DTable::init() {
  int offset = -10;
  while (offset < 10) {
    dtable[offset] = -abs( offset );
    ++offset;
  }
}
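/*
NOTE: DTable::load() is still declared in tables-core.h, but the definition
below is commented out; unless it is defined elsewhere, calling it fails at
link time.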
void DTable::load( const string& fileName ) {
ifstream inFile;
inFile.open(fileName.c_str());
istream *inFileP = &inFile;
char line[TABLE_LINE_MAX_LENGTH];
int i=0;
while(true) {
i++;
SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
vector<string> token = tokenize( line );
if (token.size() < 2) {
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
continue;
}
int d = atoi( token[0].c_str() );
double prob = log( atof( token[1].c_str() ) );
dtable[ d ] = prob;
}
}
*/
// Score for the given distortion. Distortions without a table entry get the
// fallback log(0.00001); the table itself is left untouched in that case.
double DTable::get( int distortion ) {
  const std::map< int, double >::iterator hit = dtable.find( distortion );
  if (hit != dtable.end()) {
    return hit->second;
  }
  return log( 0.00001 );
}

View File

@ -0,0 +1,72 @@
#pragma once
// $Id: tables-core.h 2416 2009-07-30 11:07:38Z hieuhoang1972 $
#include <iostream>
#include <fstream>
#include <assert.h>
#include <stdlib.h>
#include <string>
#include <queue>
#include <map>
#include <cmath>
using namespace std;
#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"
vector<string> tokenize( const char[] );
//! Deletes every object a collection of pointers refers to, then empties the
//! collection. The element type itself must be a pointer (e.g. a vector, list
//! or set of pointers), because each element is passed to `delete`.
template<class COLL>
void RemoveAllInColl(COLL &coll)
{
  typename COLL::const_iterator cursor = coll.begin();
  const typename COLL::const_iterator stop = coll.end();
  while (cursor != stop) {
    delete *cursor;
    ++cursor;
  }
  coll.clear();
}
// A word is stored as its surface string; ids are indices into Vocabulary::vocab.
typedef std::string WORD;
typedef unsigned int WORD_ID;

// Two-way mapping between words and consecutive numeric ids.
class Vocabulary {
public:
  std::map<WORD, WORD_ID> lookup; // word -> id
  std::vector< WORD > vocab;      // id -> word

  // id of the word, adding it to the vocabulary if necessary
  WORD_ID storeIfNew( const WORD& );
  // id of the word, or 0 if it is not in the vocabulary
  WORD_ID getWordID( const WORD& );
  // Word stored under `id` (no range check). Hands out a mutable reference
  // even though the method is const, hence the cast.
  inline WORD &getWord( WORD_ID id ) const {
    return const_cast<WORD&>( vocab[ id ] );
  }
};
// A phrase is a sequence of word ids; phrase ids are indices into
// PhraseTable::phraseTable.
typedef vector< WORD_ID > PHRASE;
typedef unsigned int PHRASE_ID;
// Two-way mapping between phrases and consecutive numeric ids.
class PhraseTable {
public:
// phrase -> id
map< PHRASE, PHRASE_ID > lookup;
// id -> phrase
vector< PHRASE > phraseTable;
// id of the phrase, adding it to the table if necessary
PHRASE_ID storeIfNew( const PHRASE& );
// id of the phrase, or 0 if it is not in the table
PHRASE_ID getPhraseID( const PHRASE& );
// empties both containers
void clear();
// phrase stored under `id`; no range check is performed
inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; }
};
// List of (phrase id, value) pairs; same type as the mapped type of
// TTable::ttable below.
typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC;
// Plain data holder keyed by phrase id.
// NOTE(review): presumably source phrase -> (target phrase, probability);
// no code visible here reads or writes these maps -- confirm with the users.
class TTable {
public:
// phrase id -> list of (phrase id, single value)
map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable;
// phrase id -> list of (phrase id, several values)
map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti;
};
// Table of scores indexed by distortion (a signed integer).
class DTable {
public:
// distortion -> score
map< int, double > dtable;
// fills in the default scores (see tables-core.cpp)
void init();
// NOTE(review): the definition of load() in tables-core.cpp is commented out,
// so this declaration has no matching definition there.
void load( const string& );
// score for a distortion; distortions without an entry get a fixed fallback
double get( int );
};

View File

@ -0,0 +1,126 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.499747849" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.798364121" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-ordering}/Debug" id="cdt.managedbuild.builder.gnu.cross.1976289814" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1324749613" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" useByScannerDiscovery="false" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1750299246" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.719498215" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.251118848" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.99297656" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.106920816" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1844372739" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1178164658" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1434184833" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
<listOptionValue builtIn="false" value="boost_system-mt"/>
<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.974811544" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.904916320" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1005231499" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1318928675" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.604255673" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.818331963">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.818331963." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1489025499" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1052477856" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-ordering}/Release" id="cdt.managedbuild.builder.gnu.cross.33925527" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1884790737" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" useByScannerDiscovery="false" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.197048136" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.106898878" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1920378037" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.37950410" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1197641703" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1356351201" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2053623412" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1988048517" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1494470963" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1553727957" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="extract-ordering.cdt.managedbuild.target.gnu.cross.exe.1840421491" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417;cdt.managedbuild.tool.gnu.c.compiler.input.106898878">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446;cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964;cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827;cdt.managedbuild.tool.gnu.c.compiler.input.719498215">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -0,0 +1,74 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>extract-ordering</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
<linkedResources>
<link>
<name>InputFileStream.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.cpp</locationURI>
</link>
<link>
<name>InputFileStream.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.h</locationURI>
</link>
<link>
<name>OutputFileStream.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp</locationURI>
</link>
<link>
<name>OutputFileStream.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
<name>SentenceAlignment.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.cpp</locationURI>
</link>
<link>
<name>SentenceAlignment.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h</locationURI>
</link>
<link>
<name>extract-ordering-main.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ordering-main.cpp</locationURI>
</link>
<link>
<name>tables-core.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.cpp</locationURI>
</link>
<link>
<name>tables-core.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.h</locationURI>
</link>
</linkedResources>
</projectDescription>

View File

@ -65,6 +65,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
<name>RuleExtractionOptions.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/RuleExtractionOptions.h</locationURI>
</link>
<link>
<name>SentenceAlignment.cpp</name>
<type>1</type>

View File

@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.1133345948" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -23,7 +23,7 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<option id="gnu.cpp.compiler.option.include.paths.966722418" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
@ -36,11 +36,13 @@
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1048685119" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1295498016" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.338150127" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/mert_lib/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
</option>
<option id="gnu.cpp.link.option.libs.585257079" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="mert_lib"/>
<listOptionValue builtIn="false" value="boost_system-mt"/>
<listOptionValue builtIn="false" value="util"/>
<listOptionValue builtIn="false" value="z"/>
</option>
@ -62,12 +64,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1385955159" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -131,4 +133,5 @@
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

View File

@ -82,10 +82,515 @@
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
<linkedResources>
<link>
<name>bin</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>extractor.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/extractor.cpp</locationURI>
</link>
<link>
<name>bin/gcc-4.8</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BleuDocScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BleuDocScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BleuScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BleuScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BleuScorerTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/BleuScorerTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/CderScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/CderScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Data.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Data.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DataTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/DataTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureArray.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureArray.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureData.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureData.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureDataIterator.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureDataIterator.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureDataTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureDataTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureStats.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FeatureStats.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FileStream.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/FileStream.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/GzFileBuf.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/GzFileBuf.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypPackEnumerator.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/HypPackEnumerator.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/InterpolatedScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/InterpolatedScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MeteorScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MeteorScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MiraFeatureVector.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MiraFeatureVector.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MiraWeightVector.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/MiraWeightVector.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/NgramTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/NgramTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Optimizer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Optimizer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/OptimizerFactory.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/OptimizerFactory.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/OptimizerFactoryTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/OptimizerFactoryTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PerScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PerScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Permutation.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Permutation.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PermutationScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PermutationScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Point.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Point.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PointTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PointTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PreProcessFilter.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/PreProcessFilter.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ReferenceTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ReferenceTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreArray.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreArray.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreData.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreData.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreDataIterator.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreDataIterator.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreStats.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScoreStats.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Scorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Scorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScorerFactory.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ScorerFactory.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SemposOverlapping.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SemposOverlapping.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SemposScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SemposScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SentenceLevelScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SentenceLevelScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SingletonTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/SingletonTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/StatisticsBasedScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/StatisticsBasedScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TerScorer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TerScorer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ThreadPool.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ThreadPool.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Timer.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Timer.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TimerTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TimerTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Util.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Util.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/UtilTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/UtilTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Vocabulary.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/Vocabulary.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/VocabularyTest.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/VocabularyTest.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/bleu_scorer_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/bleu_scorer_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/bleu_scorer_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/bleu_scorer_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/data_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/data_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/data_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/data_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/evaluator</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/evaluator</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/evaluator.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/evaluator.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/extractor</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/extractor</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/extractor.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/extractor.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/feature_data_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/feature_data_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/feature_data_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/feature_data_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/kbmira</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/kbmira</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/kbmira.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/kbmira.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/libmert_lib.a</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/libmert_lib.a</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mert</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mert</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mert.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/mert.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ngram_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ngram_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ngram_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ngram_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/optimizer_factory_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/optimizer_factory_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/optimizer_factory_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/optimizer_factory_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/point_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/point_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/point_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/point_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/pro</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/pro</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/pro.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/pro.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/reference_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/reference_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/reference_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/reference_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/sentence-bleu</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/sentence-bleu</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/sentence-bleu.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/sentence-bleu.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/singleton_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/singleton_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/singleton_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/singleton_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/timer_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/timer_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/timer_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/timer_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/util_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/util_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/util_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/util_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/vocabulary_test</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/vocabulary_test</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/vocabulary_test.passed</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/vocabulary_test.passed</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/alignmentStruct.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/alignmentStruct.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/hashMap.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/hashMap.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/hashMapInfos.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/hashMapInfos.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/hashMapStringInfos.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/hashMapStringInfos.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/infosHasher.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/infosHasher.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/stringHasher.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/stringHasher.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/stringInfosHasher.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/stringInfosHasher.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/terAlignment.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/terAlignment.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/terShift.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/terShift.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/tercalc.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/tercalc.o</locationURI>
</link>
<link>
<name>bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/tools.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/mert/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/TER/tools.o</locationURI>
</link>
</linkedResources>
</projectDescription>

View File

@ -0,0 +1,124 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.1899954923" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1645930772" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/manual-label/Debug}" id="cdt.managedbuild.builder.gnu.cross.1703642277" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1888648788" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1838052643" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.798368516" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.153015988" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.418888584" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.406065865" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1741441821" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1626431978" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1886912770" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="boost_program_options-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.1541583695" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1367999206" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.31522559" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.826957235" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.350181339" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.97427761" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.564169339" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/manual-label/Release}" id="cdt.managedbuild.builder.gnu.cross.663164336" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.2135645103" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.764935013" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1841809129" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1877584345" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.935490779" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.355530813" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.940299092" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.17718999" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1527322008" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.480337803" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1788533940" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="manual-label.cdt.managedbuild.target.gnu.cross.exe.2117548180" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943;cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607;cdt.managedbuild.tool.gnu.c.compiler.input.798368516">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437;cdt.managedbuild.tool.gnu.c.compiler.input.1841809129">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503;cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>manual-label</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

View File

@ -0,0 +1,86 @@
#include <list>
#include "DeEn.h"
#include "moses/Util.h"
using namespace std;
extern bool g_debug;
bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str)
{
pos += offset;
if (pos >= source.size() || pos < 0) {
return false;
}
const string &word = source[pos][factor];
vector<string> soughts = Moses::Tokenize(str, " ");
for (int i = 0; i < soughts.size(); ++i) {
string &sought = soughts[i];
bool found = (word == sought);
if (found) {
return true;
}
}
return false;
}
// Reports whether any word in the inclusive position range [start, end] has
// the given factor equal to one of the space-separated values in `str`.
// Each position is checked with IsA() using a zero offset; an empty range
// (start > end) yields false.
bool Contains(const Phrase &source, int start, int end, int factor, const string &str)
{
  int cursor = start;
  while (cursor <= end) {
    if (IsA(source, cursor, 0, factor, str)) {
      return true;
    }
    ++cursor;
  }
  return false;
}
// Writes `source` to `out` as a single line of space-separated factor-0
// strings, wrapping selected spans in
//   <tree label="reorder-label"> ... </tree>
// markup, and terminates the line with endl.
//
// A span [start, end] (inclusive word positions) is labelled when either:
//   1. the word before it has factor 1 "VAFIN", the word after it has
//      factor 1 "VVINF" or "VVPP", and no word inside it has factor 1
//      "VAFIN", "VVINF", "VVPP" or "VVFIN"; or
//   2. it starts the sentence or follows a word whose factor 1 is "$,", it is
//      followed by a word whose factor 0 is "zu" and then a word whose
//      factor 1 is "VVINF", and no word inside it has factor 1 "$,".
// NOTE(review): the factor-1 values look like STTS part-of-speech tags and
// factor 0 like the surface form; confirm against the expected input format.
//
// Spans may overlap or share end points; one opening tag is written for every
// span starting at a position and one closing tag for every span ending there.
void LabelDeEn(const Phrase &source, ostream &out)
{
  typedef pair<int,int> Range;
  typedef list<Range> Ranges;

  // Positions are handled as int because IsA()/Contains() take int positions;
  // converting the size once avoids signed/unsigned comparisons in the loops.
  const int length = static_cast<int>(source.size());

  Ranges ranges;

  // find ranges to label
  for (int start = 0; start < length; ++start) {
    for (int end = start; end < length; ++end) {
      if (IsA(source, start, -1, 1, "VAFIN")
          && IsA(source, end, +1, 1, "VVINF VVPP")
          && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) {
        ranges.push_back(Range(start, end));
      }
      else if ((start == 0 || IsA(source, start, -1, 1, "$,"))
               && IsA(source, end, +1, 0, "zu")
               && IsA(source, end, +2, 1, "VVINF")
               && !Contains(source, start, end, 1, "$,")) {
        ranges.push_back(Range(start, end));
      }
    }
  }

  // output sentence, with labels
  for (int pos = 0; pos < length; ++pos) {
    // opening tags for every span that begins at this word
    for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
      if (iter->first == pos) {
        out << "<tree label=\"reorder-label\"> ";
      }
    }

    // A word can be empty (a token with no factor content); reading word[0]
    // would then be undefined behaviour, so only the separator is written.
    const Word &word = source[pos];
    if (!word.empty()) {
      out << word[0];
    }
    out << " ";

    // closing tags for every span that ends at this word
    for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
      if (iter->second == pos) {
        out << "</tree> ";
      }
    }
  }
  out << endl;
}

View File

@ -0,0 +1,10 @@
#pragma once
#include <iostream>
#include <vector>
#include <string>
// One token of the input, held as its list of factor strings.
// manual-label.cpp builds it by splitting a token on '|'.
typedef std::vector<std::string> Word;
// A whole input line: the sequence of its Words, in order.
typedef std::vector<Word> Phrase;
// Writes `source` to `out` as one line of space-separated factor-0 strings,
// wrapping selected spans in <tree label="reorder-label"> ... </tree> markup.
// The span selection rules are implemented in DeEn.cpp.
void LabelDeEn(const Phrase &source, std::ostream &out);

View File

@ -0,0 +1,13 @@
# Builds the manual-label tool from DeEn.cpp and manual-label.cpp.

# `all` and `clean` name actions, not files. Declaring them phony keeps them
# working even if a file called `all` or `clean` appears in this directory.
.PHONY: all clean

all: manual-label

clean:
	rm -f *.o manual-label

.cpp.o:
	g++ -I../../../ -O6 -g -c $<

# Both translation units include DeEn.h, so rebuild them when it changes.
DeEn.o manual-label.o: DeEn.h

manual-label: DeEn.o manual-label.o
	g++ DeEn.o manual-label.o -lz -lboost_program_options-mt -o manual-label

View File

@ -0,0 +1,88 @@
#include <iostream>
#include <cstdlib>
#include <boost/program_options.hpp>
#include "moses/Util.h"
#include "DeEn.h"
using namespace std;
// Global debug switch; DeEn.cpp refers to it through an extern declaration.
// NOTE(review): nothing in this file sets or reads it after initialisation.
bool g_debug = false;
// Splits one input line into a Phrase; defined after main().
Phrase Tokenize(const string &line);
int main(int argc, char** argv)
{
cerr << "Starting" << endl;
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help", "Print help messages")
("add", "additional options")
("source-language,s", po::value<string>()->required(), "Source Language")
("target-language,t", po::value<string>()->required(), "Target Language");
po::variables_map vm;
try
{
po::store(po::parse_command_line(argc, argv, desc),
vm); // can throw
/** --help option
*/
if ( vm.count("help") )
{
std::cout << "Basic Command Line Parameter App" << std::endl
<< desc << std::endl;
return EXIT_SUCCESS;
}
po::notify(vm); // throws on error, so do after help in case
// there are any problems
}
catch(po::error& e)
{
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return EXIT_FAILURE;
}
string sourceLang = vm["source-language"].as<string>();
string targetLang = vm["target-language"].as<string>();
cerr << sourceLang << " " << targetLang << endl;
string line;
size_t lineNum = 1;
while (getline(cin, line)) {
//cerr << lineNum << ":" << line << endl;
if (lineNum % 1000 == 0) {
cerr << lineNum << " ";
}
Phrase source = Tokenize(line);
LabelDeEn(source, cout);
++lineNum;
}
cerr << "Finished" << endl;
return EXIT_SUCCESS;
}
// Converts one input line into a Phrase: the line is first split into tokens
// with Moses::Tokenize's default delimiters, then every token is split on '|'
// into the factor strings that make up a Word.
Phrase Tokenize(const string &line)
{
  const vector<string> tokens = Moses::Tokenize(line);

  Phrase phrase;
  for (vector<string>::const_iterator tok = tokens.begin(); tok != tokens.end(); ++tok) {
    phrase.push_back(Moses::Tokenize(*tok, "|"));
  }
  return phrase;
}

View File

@ -11,11 +11,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -46,7 +46,9 @@
</tool>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.933309045" name="PreProcessFilter.h" rcbsApplicability="disable" resourcePath="mert/PreProcessFilter.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.195400614" name="MeteorScorer.cpp" rcbsApplicability="disable" resourcePath="MeteorScorer.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.307282660">
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.307282660" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
</fileInfo>
<sourceEntries>
<entry excluding="mert/PreProcessFilter.h|mert/PreProcessFilter.cpp|mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
@ -64,11 +66,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -127,4 +129,5 @@
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

File diff suppressed because it is too large Load Diff

View File

@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1385309092" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -50,7 +50,6 @@
<listOptionValue builtIn="false" value="lattice"/>
<listOptionValue builtIn="false" value="misc"/>
<listOptionValue builtIn="false" value="dalm"/>
<listOptionValue builtIn="false" value="MurmurHash3"/>
<listOptionValue builtIn="false" value="search"/>
<listOptionValue builtIn="false" value="RandLM"/>
<listOptionValue builtIn="false" value="OnDiskPt"/>
@ -103,12 +102,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.2038764866" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -173,4 +172,5 @@
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

View File

@ -5,13 +5,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.162355801" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -70,7 +70,6 @@
<listOptionValue builtIn="false" value="irstlm"/>
<listOptionValue builtIn="false" value="dstruct"/>
<listOptionValue builtIn="false" value="dalm"/>
<listOptionValue builtIn="false" value="MurmurHash3"/>
<listOptionValue builtIn="false" value="flm"/>
<listOptionValue builtIn="false" value="oolm"/>
<listOptionValue builtIn="false" value="lattice"/>
@ -108,13 +107,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.516628324" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -5,13 +5,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.461114338" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -71,7 +71,6 @@
<listOptionValue builtIn="false" value="lattice"/>
<listOptionValue builtIn="false" value="misc"/>
<listOptionValue builtIn="false" value="dalm"/>
<listOptionValue builtIn="false" value="MurmurHash3"/>
<listOptionValue builtIn="false" value="search"/>
<listOptionValue builtIn="false" value="RandLM"/>
<listOptionValue builtIn="false" value="OnDiskPt"/>
@ -109,13 +108,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.2121690436" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -88,13 +88,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.401150096" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -166,6 +166,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ChartHypothesisCollection.h</locationURI>
</link>
<link>
<name>ChartKBestExtractor.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ChartKBestExtractor.cpp</locationURI>
</link>
<link>
<name>ChartKBestExtractor.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ChartKBestExtractor.h</locationURI>
</link>
<link>
<name>ChartManager.cpp</name>
<type>1</type>
@ -1066,6 +1076,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/ControlRecombination.h</locationURI>
</link>
<link>
<name>FF/CountNonTerms.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/CountNonTerms.cpp</locationURI>
</link>
<link>
<name>FF/CountNonTerms.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/CountNonTerms.h</locationURI>
</link>
<link>
<name>FF/CoveredReferenceFeature.cpp</name>
<type>1</type>
@ -1156,6 +1176,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/GlobalLexicalModelUnlimited.h</locationURI>
</link>
<link>
<name>FF/HyperParameterAsWeight.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/HyperParameterAsWeight.cpp</locationURI>
</link>
<link>
<name>FF/HyperParameterAsWeight.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/HyperParameterAsWeight.h</locationURI>
</link>
<link>
<name>FF/InputFeature.cpp</name>
<type>1</type>
@ -1231,6 +1261,36 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhrasePenalty.h</locationURI>
</link>
<link>
<name>FF/ReferenceComparison.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/ReferenceComparison.cpp</locationURI>
</link>
<link>
<name>FF/ReferenceComparison.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/ReferenceComparison.h</locationURI>
</link>
<link>
<name>FF/RuleAmbiguity.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/RuleAmbiguity.cpp</locationURI>
</link>
<link>
<name>FF/RuleAmbiguity.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/RuleAmbiguity.h</locationURI>
</link>
<link>
<name>FF/SetSourcePhrase.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SetSourcePhrase.cpp</locationURI>
</link>
<link>
<name>FF/SetSourcePhrase.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SetSourcePhrase.h</locationURI>
</link>
<link>
<name>FF/SkeletonStatefulFF.cpp</name>
<type>1</type>
@ -1251,6 +1311,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SkeletonStatelessFF.h</locationURI>
</link>
<link>
<name>FF/SoftMatchingFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftMatchingFeature.cpp</locationURI>
</link>
<link>
<name>FF/SoftMatchingFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftMatchingFeature.h</locationURI>
</link>
<link>
<name>FF/SourceWordDeletionFeature.cpp</name>
<type>1</type>
@ -1311,6 +1381,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetWordInsertionFeature.h</locationURI>
</link>
<link>
<name>FF/TreeStructureFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TreeStructureFeature.cpp</locationURI>
</link>
<link>
<name>FF/TreeStructureFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TreeStructureFeature.h</locationURI>
</link>
<link>
<name>FF/UnknownWordPenaltyProducer.cpp</name>
<type>1</type>
@ -1836,6 +1916,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h</locationURI>
</link>
<link>
<name>TranslationModel/CYKPlusParser/CompletedRuleCollection.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.cpp</locationURI>
</link>
<link>
<name>TranslationModel/CYKPlusParser/CompletedRuleCollection.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h</locationURI>
</link>
<link>
<name>TranslationModel/CYKPlusParser/DotChart.h</name>
<type>1</type>

View File

@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.634831890" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -20,14 +20,14 @@
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.2040884960" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/score/Debug}" id="cdt.managedbuild.builder.gnu.cross.1709170788" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.786339685" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1516054114" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1061705384" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1516054114" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" useByScannerDiscovery="false" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1061705384" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2108019237" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1013232238" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1874109813" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.2032778777" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1713606194" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<option id="gnu.cpp.compiler.option.optimization.level.1874109813" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.2032778777" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1713606194" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
</option>
@ -37,9 +37,13 @@
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1563503789" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.paths.1704292838" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
</option>
<option id="gnu.cpp.link.option.libs.936233947" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="util"/>
<listOptionValue builtIn="false" value="moses"/>
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
<listOptionValue builtIn="false" value="boost_system-mt"/>
<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
@ -63,12 +67,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1994357180" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -78,13 +82,13 @@
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1353054437" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/score/Release}" id="cdt.managedbuild.builder.gnu.cross.1851758128" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.323743241" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.534423111" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.518786530" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.534423111" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" useByScannerDiscovery="false" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.518786530" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.392640311" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.307472312" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.407718562" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.1687450255" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.optimization.level.407718562" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.1687450255" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.593478428" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.165176764" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>

View File

@ -25,6 +25,26 @@
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
<linkedResources>
<link>
<name>DomainFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/DomainFeature.cpp</locationURI>
</link>
<link>
<name>DomainFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/DomainFeature.h</locationURI>
</link>
<link>
<name>ExtractionPhrasePair.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/ExtractionPhrasePair.cpp</locationURI>
</link>
<link>
<name>ExtractionPhrasePair.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/ExtractionPhrasePair.h</locationURI>
</link>
<link>
<name>InputFileStream.cpp</name>
<type>1</type>
@ -55,11 +75,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
<name>PhraseAlignment.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/PhraseAlignment.cpp</locationURI>
</link>
<link>
<name>ScoreFeature.cpp</name>
<type>1</type>
@ -70,16 +85,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/ScoreFeature.h</locationURI>
</link>
<link>
<name>domain.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/domain.cpp</locationURI>
</link>
<link>
<name>domain.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/domain.h</locationURI>
</link>
<link>
<name>exception.cc</name>
<type>1</type>

View File

@ -12,12 +12,13 @@ Building the RPM SPEC file
The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information:
- The Git repository from which an installer will be built,
- The branch in the Git repository to build, and
- The branch in the Git repository to build,
- The location of Boost on the build machine, and
- The version of the installed Moses distribution.
For example, to build the RELEASE-1.0 branch in the mosesdecode repository (git://github.com/moses-smt/mosesdecoder.git):
For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0 -t /usr
This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.

View File

@ -1,11 +1,15 @@
#!/bin/bash
BRANCH="master"
BOOST="/usr"
declare -i NO_RPM_BUILD=0
declare -i RELEASE=1
declare -r RPM_VERSION_TAG="___RPM_VERSION__"
declare -r RPM_RELEASE_TAG="___RPM_RELEASE__"
declare -r BOOST_TAG="___BOOST_LOCATION__"
function usage() {
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version] -l [RPM release: default ${RELEASE}] -t [Boost install: default ${BOOST}]"
exit 1
}
@ -13,12 +17,14 @@ if [ $# -lt 4 ]; then
usage
fi
while getopts r:b:v:nh OPTION
while getopts r:b:t:v:l:nh OPTION
do
case "$OPTION" in
r) REPO="${OPTARG}";;
b) BRANCH="${OPTARG}";;
t) BOOST="${OPTARG}";;
v) VERSION="${OPTARG}";;
l) RELEASE="${OPTARG}";;
n) NO_RPM_BUILD=1;;
[h\?]) usage;;
esac
@ -53,7 +59,8 @@ if [ ${NO_RPM_BUILD} -eq 0 ]; then
if [ ! -d ${HOME}/rpmbuild/SPECS ]; then
mkdir -p ${HOME}/rpmbuild/SPECS
fi
eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
ESC_BOOST=`echo ${BOOST} | gawk '{gsub(/\//, "\\\\/"); print}'`
eval sed -e \"s/${RPM_VERSION_TAG}/${VERSION}/\" -e \"s/${RPM_RELEASE_TAG}/${RELEASE}/\" -e \"s/${BOOST_TAG}/${ESC_BOOST}/\" ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then
mkdir -p ${HOME}/rpmbuild/SOURCES
fi

View File

@ -1,21 +1,26 @@
Name: moses
%define name moses
%define version ___RPM_VERSION__
%define release ___RPM_RELEASE__
Name: %{name}
Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
Version: ___RPM_VERSION__
Release: 1
URL: http://www.statmt.org/moses/
Version: %{version}
Release: %{release}
URL: http://www.statmt.org/%{name}-%{version}/
Source0: %{name}-%{version}.tar.gz
License: LGPL
Group: Development/Tools
Vendor: Capita Translation and Interpreting
Packager: Ian Johnson <ian.johnson@capita-ti.com>
Requires: boost >= 1.48, python >= 2.6, perl >= 5
BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
Requires: python >= 2.6, perl >= 5
Prefix: /opt
BuildRoot: %{_builddir}/%{name}-%{version}-%{release}
%description
Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
%prep
%setup -q
mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
mkdir -p $RPM_BUILD_ROOT/opt/%{name}-%{version}/giza++-v1.0.7
wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz
wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
@ -27,39 +32,51 @@ tar -zxf giza-pp-v1.0.7.tgz
cd irstlm-5.70.04
bash regenerate-makefiles.sh --force
./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04
./configure --prefix $RPM_BUILD_ROOT/opt/%{name}-%{version}/irstlm-5.70.04
make
make install
cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/%{name}-%{version}/giza++-v1.0.7
%build
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
./bjam --with-boost=___BOOST_LOCATION__ --with-irstlm=$RPM_BUILD_ROOT/opt/%{name}-%{version}/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/%{name}-%{version}/giza++-v1.0.7 -j2
%install
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
mkdir -p $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R bin $RPM_BUILD_ROOT/opt/%{name}-%{version}
cp -R scripts/OSM $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/Transliteration $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/ems $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/generic $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/other $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/share $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
cp -R scripts/training $RPM_BUILD_ROOT/opt/%{name}-%{version}/scripts
%clean
%files
%defattr(-,root,root)
/opt/moses/bin/*
/opt/moses/scripts/analysis/*
/opt/moses/scripts/ems/*
/opt/moses/scripts/generic/*
/opt/moses/scripts/other/*
/opt/moses/scripts/recaser/*
/opt/moses/scripts/regression-testing/*
/opt/moses/scripts/share/*
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*
/opt/moses/irstlm-5.70.04/*
/opt/moses/giza++-v1.0.7/*
/opt/%{name}-%{version}/bin/*
/opt/%{name}-%{version}/scripts/OSM/*
/opt/%{name}-%{version}/scripts/Transliteration/*
/opt/%{name}-%{version}/scripts/analysis/*
/opt/%{name}-%{version}/scripts/ems/*
/opt/%{name}-%{version}/scripts/generic/*
/opt/%{name}-%{version}/scripts/other/*
/opt/%{name}-%{version}/scripts/recaser/*
/opt/%{name}-%{version}/scripts/share/*
/opt/%{name}-%{version}/scripts/tokenizer/*
/opt/%{name}-%{version}/scripts/training/*
/opt/%{name}-%{version}/irstlm-5.70.04/*
/opt/%{name}-%{version}/giza++-v1.0.7/*
%pre
# $1 is the number of instances of this package that will be installed once
# the transaction finishes: 1 = first install, 2 = upgrade.
if [ "$1" = "1" ]; then
# First install: nothing to clean up. The explicit no-op is required because
# an empty "then" branch is a shell syntax error and would abort the scriptlet.
:
elif [ "$1" = "2" ]; then
# Upgrade: drop the unversioned symlink left by the previous version.
# -f keeps a missing link from making the scriptlet exit non-zero.
rm -f $RPM_INSTALL_PREFIX/%{name} 2>/dev/null
fi
%post
ln -s $RPM_INSTALL_PREFIX/%{name}-%{version} $RPM_INSTALL_PREFIX/%{name}
%postun
# Remove anything left behind in the versioned install directory.
rm -Rf $RPM_INSTALL_PREFIX/%{name}-%{version} 2>/dev/null
# $1 is the number of instances remaining after this transaction; it is 0 only
# on a full uninstall. During an upgrade this scriptlet runs after the new
# package's post-install scriptlet, so removing the unversioned symlink then
# would delete the link the new version has just created.
if [ "$1" = "0" ]; then
rm -f $RPM_INSTALL_PREFIX/%{name} 2>/dev/null
fi

View File

@ -35,7 +35,7 @@ if $(build-moses-server) = true
xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --libs" ] ;
xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --cflags" ] ;
exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
exe mosesserver : mosesserver.cpp ../../moses//moses ../../moses-cmd/IOWrapper.cpp ../../OnDiskPt//OnDiskPt : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
} else {
alias mosesserver ;
}

View File

@ -12,6 +12,7 @@
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#include "moses/TreeInput.h"
#include "moses/LM/ORLM.h"
#include "moses-cmd/IOWrapper.h"
#ifdef WITH_THREADS
#include <boost/thread.hpp>
@ -22,6 +23,7 @@
#include <xmlrpc-c/server_abyss.hpp>
using namespace Moses;
using namespace MosesCmd;
using namespace std;
typedef std::map<std::string, xmlrpc_c::value> params_t;
@ -215,6 +217,8 @@ public:
cerr << "Input: " << source << endl;
si = params.find("align");
bool addAlignInfo = (si != params.end());
si = params.find("word-align");
bool addWordAlignInfo = (si != params.end());
si = params.find("sg");
bool addGraphInfo = (si != params.end());
si = params.find("topt");
@ -278,6 +282,20 @@ public:
if (addAlignInfo) {
retData.insert(pair<string, xmlrpc_c::value>("align", xmlrpc_c::value_array(alignInfo)));
}
if (addWordAlignInfo) {
stringstream wordAlignment;
OutputAlignment(wordAlignment, hypo);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
int pos = alignmentPair.find('-');
map<string, xmlrpc_c::value> wordAlignInfo;
wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
}
retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
}
if(addGraphInfo) {
insertGraphInfo(manager,retData);
@ -415,9 +433,25 @@ public:
}
nBestXMLItem["hyp"] = xmlrpc_c::value_string(out.str());
if (addAlignmentInfo)
if (addAlignmentInfo) {
nBestXMLItem["align"] = xmlrpc_c::value_array(alignInfo);
if ((int)edges.size() > 0) {
stringstream wordAlignment;
OutputAlignment(wordAlignment, edges[0]);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
int pos = alignmentPair.find('-');
map<string, xmlrpc_c::value> wordAlignInfo;
wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
}
nBestXMLItem["word-align"] = xmlrpc_c::value_array(alignments);
}
}
// weighted score
nBestXMLItem["totalScore"] = xmlrpc_c::value_double(path.GetTotalScore());
nBestXml.push_back(xmlrpc_c::value_struct(nBestXMLItem));
@ -512,7 +546,7 @@ int main(int argc, char** argv)
xmlrpc_limit_set(XMLRPC_XML_SIZE_LIMIT_ID, 512*1024*1024);
xmlrpc_c::registry myRegistry;
xmlrpc_c::methodPtr const translator(new Translator);
xmlrpc_c::methodPtr const updater(new Updater);
xmlrpc_c::methodPtr const optimizer(new Optimizer);
@ -521,11 +555,20 @@ int main(int argc, char** argv)
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
xmlrpc_c::serverAbyss myAbyssServer(
myRegistry,
port, // TCP port on which to listen
logfile
);
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
xmlrpc_c::serverAbyss myAbyssServer(
myRegistry,
port, // TCP port on which to listen
logfile
xmlrpc_c::serverAbyss::constrOpt()
.registryPtr(&myRegistry)
.portNumber(port) // TCP port on which to listen
.logFileName(logfile)
.allowOrigin("*")
);
*/
cerr << "Listening on port " << port << endl;
if (isSerial) {

View File

@ -1,5 +1,5 @@
ad ||| af ||| 0.3 0.3 0.3 0.3 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 1000 1000
bd ||| bf ||| 0.3 0.3 0.3 0.3 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 10 10
ad ||| af ||| 0.3 0.3 0.3 0.3 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 1000 1000 ||| sparse_feature 1
bd ||| bf ||| 0.3 0.3 0.3 0.3 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 10 10 |||
der gipfel ||| sommet ||| 0.3 0.3 0.3 0.3 0.00327135 0.00872768 0.0366795 0.611403 2.718 ||| 1-0 ||| 5808 518
der pass ||| le col ||| 0.3 0.3 0.3 0.3 0.0173565 0.0284616 0.288889 0.121619 2.718 ||| 0-0 1-1 ||| 749 45
pass ||| col ||| 0.3 0.3 0.3 0.3 0.1952 0.143937 0.628866 0.681301 2.718 ||| 0-0 ||| 1875 582

View File

@ -1,4 +1,4 @@
ad ||| af ||| 0.6 0.6 0.6 0.6 0.1 0.1 0.1 0.1 2.718 ||| 0-0 ||| 1000 1000
ad ||| af ||| 0.6 0.6 0.6 0.6 0.1 0.1 0.1 0.1 2.718 ||| 0-0 ||| 1000 1000 ||| sparse_feature 2
bd ||| bf ||| 0.6 0.6 0.6 0.6 0.1 0.1 0.1 0.1 2.718 ||| 0-0 ||| 10 10
der pass ||| le passeport ||| 0.6 0.6 0.6 0.6 0.16 0.03063 0.4 0.0748551 2.718 ||| 0-0 1-1 ||| 25 10
pass ||| passeport ||| 0.6 0.6 0.6 0.6 0.28022 0.192612 0.607143 0.675926 2.718 ||| 0-0 ||| 182 84

View File

@ -1,8 +1,8 @@
ad ||| af ||| 0.3 0.3 0.3 0.3 2.718 ||| 0-0 ||| 1000 1000
bd ||| bf ||| 0.3 0.3 0.3 0.3 2.718 ||| 0-0 ||| 10 10
der gipfel ||| sommet ||| 0.00163568 0.00436384 0.0183397 0.305702 2.718 ||| 1-0 ||| 5808 518
der pass ||| le col ||| 0.00867825 0.0142308 0.144445 0.0608095 2.718 ||| 0-0 1-1 ||| 749 45
pass ||| col ||| 0.0976 0.0719685 0.314433 0.340651 2.718 ||| 0-0 ||| 1875 582
pass ||| passeport retrouvé ||| 0.25 0.125 0.000859105 1.9065e-07 2.718 ||| 0-0 ||| 2 582
pass ||| passeport ||| 0.273444 0.221306 0.307008 0.343654 2.718 ||| 0-0 ||| 182 84
sitzung ||| séance ||| 0.528624 0.417705 0.434797 0.492241 2.718 ||| 0-0 ||| 4251 6455
ad ||| af ||| 0.3 0.3 0.3 0.3 ||| 0-0 ||| 1000 1000
bd ||| bf ||| 0.3 0.3 0.3 0.3 ||| 0-0 ||| 10 10
der gipfel ||| sommet ||| 0.00163568 0.00436384 0.0183397 0.305702 ||| 1-0 ||| 5808 518
der pass ||| le col ||| 0.00867825 0.0142308 0.144445 0.0608095 ||| 0-0 1-1 ||| 749 45
pass ||| col ||| 0.0976 0.0719685 0.314433 0.340651 ||| 0-0 ||| 1875 582
pass ||| passeport retrouvé ||| 0.25 0.125 0.000859105 1.9065e-07 ||| 0-0 ||| 2 582
pass ||| passeport ||| 0.273444 0.221306 0.307008 0.343654 ||| 0-0 ||| 15 582
sitzung ||| séance ||| 0.528624 0.417705 0.434797 0.492241 ||| 0-0 ||| 22 17

View File

@ -1,9 +1,9 @@
ad ||| af ||| 0.3 0.3 0.3 0.3 0.11579 0.35574 0.472359 0.469238 2.718 ||| 0-0 ||| 25332.4712297 1074.23173673
bd ||| bf ||| 0.3 0.3 0.3 0.3 0.11579 0.35574 0.472359 0.469238 2.718 ||| 0-0 ||| 253.324712297 10.7423173673
der gipfel ||| sommet ||| 0.3 0.3 0.3 0.3 0.00327135 0.00686984 0.0366795 0.617135 2.718 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.3 0.3 0.3 0.3 0.0173565 0.023534 0.284201 0.0972183 2.718 ||| 0-0 1-1 ||| 749.0 45.7423173673
der pass ||| le passeport ||| 6e-10 6e-10 6e-10 6e-10 0.16 0.0329324 0.0064913 0.00303408 2.718 ||| 0-0 1-1 ||| 608.311780741 45.7423173673
pass ||| col ||| 0.3 0.3 0.3 0.3 0.1952 0.142393 0.6222 0.671744 2.718 ||| 0-0 ||| 1875.0 588.235465885
pass ||| passeport retrouvé ||| 0.3 0.3 0.3 0.3 0.5 0.199258 0.0017 5.11945e-07 2.718 ||| 0-0 ||| 2.0 588.235465885
pass ||| passeport ||| 0.3 0.3 0.3 0.3 0.280174 0.199258 0.0132359 0.0209644 2.718 ||| 0-0 ||| 4443.5097638 588.235465885
sitzung ||| séance ||| 0.3 0.3 0.3 0.3 0.784412 0.59168 0.511045 0.552002 2.718 ||| 0-0 ||| 103459.335197 496.165860589
ad ||| af ||| 0.3 0.3 0.3 0.3 0.115771 0.35574 0.472359 0.469238 ||| 0-0 ||| 25362.6029089 1074.23173673 ||| sparse_feature 1
bd ||| bf ||| 0.3 0.3 0.3 0.3 0.115771 0.35574 0.472359 0.469238 ||| 0-0 ||| 253.626029089 10.7423173673 |||
der gipfel ||| sommet ||| 0.3 0.3 0.3 0.3 0.00327135 0.00686984 0.0366795 0.617135 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.3 0.3 0.3 0.3 0.0173565 0.023534 0.284201 0.0972183 ||| 0-0 1-1 ||| 749.0 45.7423173673
der pass ||| le passeport ||| 6e-10 6e-10 6e-10 6e-10 0.16 0.0329324 0.0064913 0.00303408 ||| 0-0 1-1 ||| 609.065072723 45.7423173673
pass ||| col ||| 0.3 0.3 0.3 0.3 0.1952 0.142393 0.6222 0.671744 ||| 0-0 ||| 1875.0 588.235465885
pass ||| passeport retrouvé ||| 0.3 0.3 0.3 0.3 0.5 0.199258 0.0017 5.11945e-07 ||| 0-0 ||| 2.0 588.235465885
pass ||| passeport ||| 0.3 0.3 0.3 0.3 0.280174 0.199258 0.0132359 0.0209644 ||| 0-0 ||| 4448.99372942 588.235465885
sitzung ||| séance ||| 0.3 0.3 0.3 0.3 0.784412 0.59168 0.511045 0.552002 ||| 0-0 ||| 103587.424966 496.165860589

View File

@ -1,9 +1,9 @@
ad ||| af ||| 0.14 0.136364 0.18 0.3 2.718 ||| 0-0 ||| 1000 1000
bd ||| bf ||| 0.14 0.136364 0.18 0.3 2.718 ||| 0-0 ||| 10 10
der gipfel ||| sommet ||| 0.000327135 0.000793425 0.0073359 0.305702 2.718 ||| 1-0 ||| 5808 518
der pass ||| le col ||| 0.00173565 0.00258742 0.0577778 0.0608095 2.718 ||| 0-0 1-1 ||| 749 45
der pass ||| le passeport ||| 0.144 0.0278455 0.32 0.0374275 2.718 ||| 0-0 1-1 ||| 25 10
pass ||| col ||| 0.01952 0.0130852 0.125773 0.340651 2.718 ||| 0-0 ||| 1875 582
pass ||| passeport retrouvé ||| 0.05 0.0227273 0.000343642 1.9065e-07 2.718 ||| 0-0 ||| 2 582
pass ||| passeport ||| 0.278865 0.197829 0.487089 0.343654 2.718 ||| 0-0 ||| 182 84
sitzung ||| séance ||| 0.733342 0.56532 0.483911 0.492241 2.718 ||| 0-0 ||| 4251 6455
ad ||| af ||| 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 1000 1000
bd ||| bf ||| 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 10 10
der gipfel ||| sommet ||| 0.000327135 0.000793425 0.0073359 0.305702 ||| 1-0 ||| 5808 518
der pass ||| le col ||| 0.00173565 0.00258742 0.0577778 0.0608095 ||| 0-0 1-1 ||| 749 45
der pass ||| le passeport ||| 0.144 0.0278455 0.32 0.0374275 ||| 0-0 1-1 ||| 25 10
pass ||| col ||| 0.01952 0.0130852 0.125773 0.340651 ||| 0-0 ||| 1875 582
pass ||| passeport retrouvé ||| 0.05 0.0227273 0.000343642 1.9065e-07 ||| 0-0 ||| 2 582
pass ||| passeport ||| 0.278865 0.197829 0.487089 0.343654 ||| 0-0 ||| 15 582
sitzung ||| séance ||| 0.733342 0.56532 0.483911 0.492241 ||| 0-0 ||| 22 17

View File

@ -1,9 +1,9 @@
ad ||| af ||| 0.14 0.136364 0.18 0.3 2.718 ||| 0-0 ||| 10000.0 5000.0
bd ||| bf ||| 0.14 0.136364 0.18 0.3 2.718 ||| 0-0 ||| 100.0 50.0
der gipfel ||| sommet ||| 0.00327135 0.00569336 0.0366795 0.651018 2.718 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.0173565 0.0193836 0.152941 0.0675369 2.718 ||| 0-0 1-1 ||| 749.0 85.0
der pass ||| le passeport ||| 0.16 0.0307772 0.188235 0.0128336 2.718 ||| 0-0 1-1 ||| 225.0 85.0
pass ||| col ||| 0.1952 0.121573 0.398693 0.582296 2.718 ||| 0-0 ||| 1875.0 918.0
pass ||| passeport retrouvé ||| 0.5 0.193033 0.00108932 1.16835e-06 2.718 ||| 0-0 ||| 2.0 918.0
pass ||| passeport ||| 0.280097 0.193033 0.22658 0.11065 2.718 ||| 0-0 ||| 1653.0 918.0
sitzung ||| séance ||| 0.784227 0.597753 0.516546 0.559514 2.718 ||| 0-0 ||| 38281.0 25837.0
ad ||| af ||| 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 10000.0 5000.0
bd ||| bf ||| 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 100.0 50.0
der gipfel ||| sommet ||| 0.00327135 0.00569336 0.0366795 0.651018 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.0173565 0.0193836 0.152941 0.0675369 ||| 0-0 1-1 ||| 749.0 85.0
der pass ||| le passeport ||| 0.16 0.0307772 0.188235 0.0128336 ||| 0-0 1-1 ||| 225.0 85.0
pass ||| col ||| 0.1952 0.121573 0.398693 0.582296 ||| 0-0 ||| 1875.0 918.0
pass ||| passeport retrouvé ||| 0.5 0.193033 0.00108932 1.16835e-06 ||| 0-0 ||| 2.0 918.0
pass ||| passeport ||| 0.280097 0.193033 0.22658 0.11065 ||| 0-0 ||| 1653.0 918.0
sitzung ||| séance ||| 0.784227 0.597753 0.516546 0.559514 ||| 0-0 ||| 38281.0 25837.0

View File

@ -1,8 +1,8 @@
ad ||| af ||| 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 1000.0 1000.0
bd ||| bf ||| 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 10.0 10.0
der gipfel ||| sommet ||| 0.00327135 0.00872769 0.0366795 0.611404 2.718 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.0173565 0.0284616 0.288889 0.121619 2.718 ||| 0-0 1-1 ||| 749.0 45.0
pass ||| col ||| 0.1952 0.143937 0.628866 0.681301 2.718 ||| 0-0 ||| 1875.0 582.0
pass ||| passeport retrouvé ||| 0.5 0.25 0.00171821 3.80847e-07 2.718 ||| 0-0 ||| 2.0 582.0
pass ||| passeport ||| 0.266667 0.25 0.00687285 0.0113821 2.718 ||| 0-0 ||| 15.0 582.0
sitzung ||| séance ||| 0.272727 0.237288 0.352941 0.424242 2.718 ||| 0-0 ||| 22.0 17.0
ad ||| af ||| 0.5 0.5 0.5 0.5 ||| 0-0 ||| 1000.0 1000.0
bd ||| bf ||| 0.5 0.5 0.5 0.5 ||| 0-0 ||| 10.0 10.0
der gipfel ||| sommet ||| 0.00327135 0.00872769 0.0366795 0.611404 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.0173565 0.0284616 0.288889 0.121619 ||| 0-0 1-1 ||| 749.0 45.0
pass ||| col ||| 0.1952 0.143937 0.628866 0.681301 ||| 0-0 ||| 1875.0 582.0
pass ||| passeport retrouvé ||| 0.5 0.25 0.00171821 3.80847e-07 ||| 0-0 ||| 2.0 582.0
pass ||| passeport ||| 0.266667 0.25 0.00687285 0.0113821 ||| 0-0 ||| 15.0 582.0
sitzung ||| séance ||| 0.272727 0.237288 0.352941 0.424242 ||| 0-0 ||| 22.0 17.0

View File

@ -1,9 +1,9 @@
ad ||| af ||| 0.11579 0.35574 0.472359 0.469238 2.718 ||| 0-0 ||| 25332.4712297 1074.23173673
bd ||| bf ||| 0.11579 0.35574 0.472359 0.469238 2.718 ||| 0-0 ||| 253.324712297 10.7423173673
der gipfel ||| sommet ||| 0.00327135 0.00686984 0.0366795 0.617135 2.718 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.0173565 0.023534 0.284201 0.0972183 2.718 ||| 0-0 1-1 ||| 749.0 45.7423173673
der pass ||| le passeport ||| 0.16 0.0329324 0.0064913 0.00303408 2.718 ||| 0-0 1-1 ||| 608.311780741 45.7423173673
pass ||| col ||| 0.1952 0.142393 0.6222 0.671744 2.718 ||| 0-0 ||| 1875.0 588.235465885
pass ||| passeport retrouvé ||| 0.5 0.199258 0.0017 5.11945e-07 2.718 ||| 0-0 ||| 2.0 588.235465885
pass ||| passeport ||| 0.280174 0.199258 0.0132359 0.0209644 2.718 ||| 0-0 ||| 4443.5097638 588.235465885
sitzung ||| séance ||| 0.784412 0.59168 0.511045 0.552002 2.718 ||| 0-0 ||| 103459.335197 496.165860589
ad ||| af ||| 0.115771 0.35574 0.472359 0.469238 ||| 0-0 ||| 25362.6029089 1074.23173673
bd ||| bf ||| 0.115771 0.35574 0.472359 0.469238 ||| 0-0 ||| 253.626029089 10.7423173673
der gipfel ||| sommet ||| 0.00327135 0.00686984 0.0366795 0.617135 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.0173565 0.023534 0.284201 0.0972183 ||| 0-0 1-1 ||| 749.0 45.7423173673
der pass ||| le passeport ||| 0.16 0.0329324 0.0064913 0.00303408 ||| 0-0 1-1 ||| 609.065072723 45.7423173673
pass ||| col ||| 0.1952 0.142393 0.6222 0.671744 ||| 0-0 ||| 1875.0 588.235465885
pass ||| passeport retrouvé ||| 0.5 0.199258 0.0017 5.11945e-07 ||| 0-0 ||| 2.0 588.235465885
pass ||| passeport ||| 0.280174 0.199258 0.0132359 0.0209644 ||| 0-0 ||| 4448.99372942 588.235465885
sitzung ||| séance ||| 0.784412 0.59168 0.511045 0.552002 ||| 0-0 ||| 103587.424966 496.165860589

View File

@ -1,4 +1,4 @@
ad ||| af ||| 0.117462 0.117462 0.117462 0.117462 2.718 ||| 0-0 ||| 1000 1000
bd ||| bf ||| 0.117462 0.117462 0.117462 0.117462 2.718 ||| 0-0 ||| 10 10
pass ||| passeport ||| 0.278834 0.197701 0.387861 0.449295 2.718 ||| 0-0 ||| 182 84
sitzung ||| séance ||| 0.705857 0.545304 0.497336 0.544877 2.718 ||| 0-0 ||| 4251 6455
ad ||| af ||| 0.117462 0.117462 0.117462 0.117462 ||| 0-0 ||| 1000 1000
bd ||| bf ||| 0.117462 0.117462 0.117462 0.117462 ||| 0-0 ||| 10 10
pass ||| passeport ||| 0.278834 0.197701 0.387861 0.449295 ||| 0-0 ||| 15 582
sitzung ||| séance ||| 0.705857 0.545304 0.497336 0.544877 ||| 0-0 ||| 22 17

View File

@ -1 +1 @@
([(1.8744705606119034, 2.0752881273042374, 1.5025010618768841, 1.2370391973008494, 0, 0, 1, 1, 22), (0.35011602922315899, 0.74148657814725749, 0.95272965495298623, 0.83588062023889353, 1, 0, 0, 1, 22)], (1, 22, 20))
([(1.8744705606119034, 2.0752881273042374, 1.5025010618768841, 1.2370391973008494, 0, 0, 1, 1, 22), (0.350116029223159, 0.7414865781472575, 0.9527296549529862, 0.8358806202388935, 1, 0, 0, 1, 22)], (1, 22, 20))

View File

@ -1,9 +1,9 @@
ad ||| af ||| 0.242966 0.398085 0.483231 0.482814 2.718 ||| 0-0 ||| 2797.86490081 1043.7557397
bd ||| bf ||| 0.102213 0.111367 0.174411 0.172867 2.718 ||| 0-0 ||| 1807.86490081 53.7557396976
der gipfel ||| sommet ||| 0.00327135 0.00863717 0.0366795 0.612073 2.718 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.0173565 0.0260469 0.146469 0.113553 2.718 ||| 0-0 1-1 ||| 749.0 88.7557396976
der pass ||| le passeport ||| 0.16 0.0389201 0.197196 0.0101009 2.718 ||| 0-0 1-1 ||| 1797.86490081 88.7557396976
pass ||| col ||| 0.1952 0.131811 0.584893 0.63621 2.718 ||| 0-0 ||| 1875.0 625.755739698
pass ||| passeport retrouvé ||| 0.5 0.196956 0.00159806 1.89355e-06 2.718 ||| 0-0 ||| 2.0 625.755739698
pass ||| passeport ||| 0.280108 0.196956 0.0488465 0.0565932 2.718 ||| 0-0 ||| 1812.86490081 625.755739698
sitzung ||| séance ||| 0.778334 0.545019 0.470846 0.502625 2.718 ||| 0-0 ||| 1819.86490081 60.7557396976
ad ||| af ||| 0.242882 0.39808 0.483231 0.482813 ||| 0-0 ||| 2799.50876845 1043.75589858
bd ||| bf ||| 0.102211 0.111366 0.17441 0.172864 ||| 0-0 ||| 1809.50876845 53.7558985771
der gipfel ||| sommet ||| 0.00327135 0.00863716 0.0366795 0.612073 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.0173565 0.0260468 0.146469 0.113553 ||| 0-0 1-1 ||| 749.0 88.7558985771
der pass ||| le passeport ||| 0.16 0.03892 0.197197 0.0101013 ||| 0-0 1-1 ||| 1799.50876845 88.7558985771
pass ||| col ||| 0.1952 0.13181 0.584893 0.636208 ||| 0-0 ||| 1875.0 625.755898577
pass ||| passeport retrouvé ||| 0.5 0.196956 0.00159806 1.89361e-06 ||| 0-0 ||| 2.0 625.755898577
pass ||| passeport ||| 0.280108 0.196956 0.0488467 0.056595 ||| 0-0 ||| 1814.50876845 625.755898577
sitzung ||| séance ||| 0.77834 0.545022 0.470846 0.502627 ||| 0-0 ||| 1821.50876845 60.7558985771

View File

@ -1,9 +1,9 @@
ad ||| af ||| 0.45 0.45 0.45 0.45 0.14 0.136364 0.18 0.3 2.718 ||| 0-0 ||| 10000.0 5000.0
bd ||| bf ||| 0.45 0.45 0.45 0.45 0.14 0.136364 0.18 0.3 2.718 ||| 0-0 ||| 100.0 50.0
der gipfel ||| sommet ||| 0.15 0.15 0.15 0.15 0.00327135 0.00569336 0.0366795 0.651018 2.718 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.15 0.15 0.15 0.15 0.0173565 0.0193836 0.152941 0.0675369 2.718 ||| 0-0 1-1 ||| 749.0 85.0
der pass ||| le passeport ||| 0.3 0.3 0.3 0.3 0.16 0.0307772 0.188235 0.0128336 2.718 ||| 0-0 1-1 ||| 225.0 85.0
pass ||| col ||| 0.15 0.15 0.15 0.15 0.1952 0.121573 0.398693 0.582296 2.718 ||| 0-0 ||| 1875.0 918.0
pass ||| passeport retrouvé ||| 0.15 0.15 0.15 0.15 0.5 0.193033 0.00108932 1.16835e-06 2.718 ||| 0-0 ||| 2.0 918.0
pass ||| passeport ||| 0.45 0.45 0.45 0.45 0.280097 0.193033 0.22658 0.11065 2.718 ||| 0-0 ||| 1653.0 918.0
sitzung ||| séance ||| 0.45 0.45 0.45 0.45 0.784227 0.597753 0.516546 0.559514 2.718 ||| 0-0 ||| 38281.0 25837.0
ad ||| af ||| 0.45 0.45 0.45 0.45 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 10000.0 5000.0 ||| sparse_feature 1
bd ||| bf ||| 0.45 0.45 0.45 0.45 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 100.0 50.0 |||
der gipfel ||| sommet ||| 0.15 0.15 0.15 0.15 0.00327135 0.00569336 0.0366795 0.651018 ||| 1-0 ||| 5808.0 518.0
der pass ||| le col ||| 0.15 0.15 0.15 0.15 0.0173565 0.0193836 0.152941 0.0675369 ||| 0-0 1-1 ||| 749.0 85.0
der pass ||| le passeport ||| 0.3 0.3 0.3 0.3 0.16 0.0307772 0.188235 0.0128336 ||| 0-0 1-1 ||| 225.0 85.0
pass ||| col ||| 0.15 0.15 0.15 0.15 0.1952 0.121573 0.398693 0.582296 ||| 0-0 ||| 1875.0 918.0
pass ||| passeport retrouvé ||| 0.15 0.15 0.15 0.15 0.5 0.193033 0.00108932 1.16835e-06 ||| 0-0 ||| 2.0 918.0
pass ||| passeport ||| 0.45 0.45 0.45 0.45 0.280097 0.193033 0.22658 0.11065 ||| 0-0 ||| 1653.0 918.0
sitzung ||| séance ||| 0.45 0.45 0.45 0.45 0.784227 0.597753 0.516546 0.559514 ||| 0-0 ||| 38281.0 25837.0

View File

@ -106,7 +106,7 @@ class Moses():
scores = line[2].split()
if len(scores) <self.number_of_features:
sys.stderr.write('Error: model only has {0} features. Expected {1}.\n'.format(len(scores),self.number_of_features))
exit()
exit(1)
scores = scores[:self.number_of_features]
model_probabilities = map(float,scores)
@ -114,7 +114,7 @@ class Moses():
if mode == 'counts' and not priority == 2: #priority 2 is MAP
try:
counts = map(float,line[-1].split())
counts = map(float,line[4].split())
try:
target_count,src_count,joint_count = counts
joint_count_e2f = joint_count
@ -145,7 +145,7 @@ class Moses():
if (store == 'all' or store == 'source') and not (filter_by_src and not src in filter_by_src):
if mode == 'counts' and not priority == 2: #priority 2 is MAP
try:
self.phrase_source[src][i] = float(line[-1].split()[1])
self.phrase_source[src][i] = float(line[4].split()[1])
except:
sys.stderr.write(str(line)+'\n')
sys.stderr.write('ERROR: Counts are missing or misformatted. Maybe your phrase table is from an older Moses version that doesn\'t store counts or word alignment?\n')
@ -156,7 +156,7 @@ class Moses():
if (store == 'all' or store == 'target') and not (filter_by_target and not target in filter_by_target):
if mode == 'counts' and not priority == 2: #priority 2 is MAP
try:
self.phrase_target[target][i] = float(line[-1].split()[0])
self.phrase_target[target][i] = float(line[4].split()[0])
except:
sys.stderr.write(str(line)+'\n')
sys.stderr.write('ERROR: Counts are missing or misformatted. Maybe your phrase table is from an older Moses version that doesn\'t store counts or word alignment?\n')
@ -179,7 +179,7 @@ class Moses():
reordering_probabilities[j][i] = p
except IndexError:
sys.stderr.write('\nIndexError: Did you correctly specify the number of reordering features? (--number_of_features N in command line)\n')
exit()
exit(1)
def traverse_incrementally(self,table,models,load_lines,store_flag,mode='interpolate',inverted=False,lowmem=False,flags=None):
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
@ -210,6 +210,9 @@ class Moses():
for line in model:
line = line.rstrip().split(b' ||| ')
if line[-1].endswith(b' |||'):
line[-1] = line[-1][:-4]
line.append('')
if increment != line[0]:
stack[i] = line
@ -300,20 +303,21 @@ class Moses():
def store_info(self,src,target,line):
"""store alignment info and comment section for re-use in output"""
if len(line) == 5:
self.phrase_pairs[src][target][1] = line[3:5]
if len(line) >= 5:
if not self.phrase_pairs[src][target][1]:
self.phrase_pairs[src][target][1] = line[3:]
# assuming that alignment is empty
elif len(line) == 4:
if self.require_alignment:
sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n')
exit()
exit(1)
self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')]
else:
sys.stderr.write('Error: unexpected phrase table format. Are you using a very old/new version of Moses with different formatting?\n')
exit()
exit(1)
def get_word_alignments(self,src,target,cache=False,mycache={}):
@ -373,7 +377,8 @@ class Moses():
return ''
# information specific to Moses model: alignment info and comment section with target and source counts
alignment,comments = self.phrase_pairs[src][target][1]
additional_entries = self.phrase_pairs[src][target][1]
alignment = additional_entries[0]
if alignment:
extra_space = b' '
else:
@ -384,7 +389,7 @@ class Moses():
i_f2e = flags['i_f2e']
srccount = dot_product(self.phrase_source[src],weights[i_f2e])
targetcount = dot_product(self.phrase_target[target],weights[i_e2f])
comments = b"%s %s" %(targetcount,srccount)
additional_entries[1] = b"%s %s" %(targetcount,srccount)
features = b' '.join([b'%.6g' %(f) for f in features])
@ -397,7 +402,7 @@ class Moses():
phrase_penalty = b' 2.718'
else:
phrase_penalty = b''
line = b"%s ||| %s ||| %s%s %s||| %s%s||| %s\n" %(src,target,features,origin_features,phrase_penalty,alignment,extra_space,comments)
line = b"%s ||| %s ||| %s%s %s||| %s%s||| %s\n" %(src,target,features,origin_features,phrase_penalty,alignment,extra_space,b' ||| '.join(additional_entries[1:]))
return line
@ -473,8 +478,15 @@ class Moses():
for line,line2 in izip(pt_normal,pt_inverse):
line = line.split(b' ||| ')
if line[-1].endswith(b' |||'):
line[-1] = line[-1][:-4]
line.append('')
line2 = line2.split(b' ||| ')
if line2[-1].endswith(b' |||'):
line2[-1] = line2[-1][:-4]
line2.append('')
#scores
mid = int(self.number_of_features/2)
scores1 = line[2].split()
@ -483,11 +495,11 @@ class Moses():
# marginal counts
if mode == 'counts':
src_count = line[-1].split()[1]
src_count = line[4].split()[1]
target_count = line2[-1].split()[0]
line[-1] = b' '.join([target_count,src_count]) + b'\n'
line[4] = b' '.join([target_count,src_count])
pt_out.write(b' ||| '.join(line))
pt_out.write(b' ||| '.join(line)+ b'\n')
pt_normal.close()
pt_inverse.close()
@ -515,7 +527,7 @@ class TigerXML():
if not src or not target:
sys.stderr.write('Error: Source and/or target language not specified. Required for TigerXML extraction.\n')
exit()
exit(1)
alignments = self._get_aligned_ids(src,target)
self._textualize_alignments(src,target,alignments)
@ -685,7 +697,10 @@ class Moses_Alignment():
for line in fileobj:
line = line.split(b' ||| ')
if line[-1].endswith(b' |||'):
line[-1] = line[-1][:-4]
line.append('')
src = line[0]
target = line[1]
@ -1261,7 +1276,7 @@ def handle_file(filename,action,fileobj=None,mode='r'):
sys.stderr.write('For a weighted counts combination, we need statistics that Moses doesn\'t write to disk by default.\n')
sys.stderr.write('Repeat step 4 of Moses training for all models with the option -write-lexical-counts.\n')
exit()
exit(1)
if filename.endswith('.gz'):
fileobj = gzip.open(filename,mode)
@ -1435,7 +1450,7 @@ class Combine_TMs():
if mode not in ['interpolate','loglinear','counts']:
sys.stderr.write('Error: mode must be either "interpolate", "loglinear" or "counts"\n')
sys.exit()
sys.exit(1)
models,number_of_features,weights = self._sanity_checks(models,number_of_features,weights)
@ -1528,6 +1543,9 @@ class Combine_TMs():
sys.stderr.write('...'+str(j))
j += 1
line = line.rstrip().split(b' ||| ')
if line[-1].endswith(b' |||'):
line[-1] = line[-1][:-4]
line.append('')
self.model_interface.load_phrase_features(line,priority,i,store='all',mode=self.mode,filter_by=self.reference_interface.word_pairs,filter_by_src=self.reference_interface.word_source,filter_by_target=self.reference_interface.word_target,flags=self.flags)
sys.stderr.write(' done\n')
@ -1553,6 +1571,9 @@ class Combine_TMs():
sys.stderr.write('...'+str(j))
j += 1
line = line.rstrip().split(b' ||| ')
if line[-1].endswith(b' |||'):
line[-1] = line[-1][:-4]
line.append('')
self.model_interface.load_phrase_features(line,priority,i,mode=self.mode,store='target',flags=self.flags)
sys.stderr.write(' done\n')

View File

@ -288,11 +288,11 @@ rule failure-message ( ok ? ) {
echo "If you need support, attach the full output to your e-mail." ;
} else {
echo "The build failed. If you need support, run:" ;
echo " $(args) --debug-configuration -d2 >build.log" ;
echo "then attach build.log to your e-mail." ;
echo " $(args) --debug-configuration -d2 |gzip >build.log.gz" ;
echo "then attach build.log.gz to your e-mail." ;
echo "You MUST do 3 things before sending to the mailing list:" ;
echo " 1. Subscribe to the mailing list at http://mailman.mit.edu/mailman/listinfo/moses-support" ;
echo " 2. Zip up your build.log file before attaching it to the email" ;
echo " 2. Attach build.log.gz to your e-mail" ;
echo " 3. Say what is the EXACT command you executed when you got the error" ;
}
echo "ERROR" ;

View File

@ -1,4 +1,6 @@
#include "lm/bhiksha.hh"
#include "lm/binary_format.hh"
#include "lm/config.hh"
#include "util/file.hh"
#include "util/exception.hh"
@ -15,11 +17,11 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_
const uint8_t kArrayBhikshaVersion = 0;
// TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) {
uint8_t version;
uint8_t configured_bits;
util::ReadOrThrow(fd, &version, 1);
util::ReadOrThrow(fd, &configured_bits, 1);
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
uint8_t buffer[2];
file.ReadForConfig(buffer, 2, offset);
uint8_t version = buffer[0];
uint8_t configured_bits = buffer[1];
if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
config.pointer_bhiksha_bits = configured_bits;
}
@ -87,9 +89,6 @@ void ArrayBhiksha::FinishedLoading(const Config &config) {
*(head_write++) = config.pointer_bhiksha_bits;
}
void ArrayBhiksha::LoadedBinary() {
}
} // namespace trie
} // namespace ngram
} // namespace lm

View File

@ -24,6 +24,7 @@
namespace lm {
namespace ngram {
struct Config;
class BinaryFormat;
namespace trie {
@ -31,7 +32,7 @@ class DontBhiksha {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {}
static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {}
static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
@ -53,8 +54,6 @@ class DontBhiksha {
void FinishedLoading(const Config &/*config*/) {}
void LoadedBinary() {}
uint8_t InlineBits() const { return next_.bits; }
private:
@ -65,7 +64,7 @@ class ArrayBhiksha {
public:
static const ModelType kModelTypeAdd = kArrayAdd;
static void UpdateConfigFromBinary(int fd, Config &config);
static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
@ -93,8 +92,6 @@ class ArrayBhiksha {
void FinishedLoading(const Config &config);
void LoadedBinary();
uint8_t InlineBits() const { return next_inline_.bits; }
private:

View File

@ -14,6 +14,9 @@
namespace lm {
namespace ngram {
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
@ -58,8 +61,6 @@ struct Sanity {
}
};
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
// Size in bytes of the binary-file header for a model of the given order:
// the Sanity block, the fixed-width parameters and one uint64_t count per
// order, passed through ALIGN8 (presumably rounding up to a multiple of 8).
std::size_t TotalHeaderSize(unsigned char order) {
  const std::size_t unpadded = sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order;
  return ALIGN8(unpadded);
}
@ -81,83 +82,6 @@ void WriteHeader(void *to, const Parameters &params) {
} // namespace
// Allocates the memory backing the vocabulary lookup table and returns a
// pointer to where the vocabulary data begins.
//
// Without config.write_mmap the table lives in anonymous memory of exactly
// memory_size bytes. With config.write_mmap the output file is created and
// the region is enlarged by TotalHeaderSize(order) so the header can sit in
// front of the vocabulary; the returned pointer skips that header.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
  if (!config.write_mmap) {
    util::MapAnonymous(memory_size, backing.vocab);
    return reinterpret_cast<uint8_t*>(backing.vocab.get());
  }

  const std::size_t header_bytes = TotalHeaderSize(order);
  const std::size_t total = header_bytes + memory_size;
  backing.file.reset(util::CreateOrThrow(config.write_mmap));
  if (config.write_method == Config::WRITE_MMAP) {
    // Vocabulary is backed directly by the (zeroed) file.
    backing.vocab.reset(util::MapZeroedWrite(backing.file.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
  } else {
    // Build in anonymous memory; FinishFile copies it to the file afterwards.
    util::ResizeOrThrow(backing.file.get(), 0);
    util::MapAnonymous(total, backing.vocab);
  }

  char *const region = reinterpret_cast<char*>(backing.vocab.get());
  // Stamp the placeholder magic over the header area; the real header is
  // written by FinishFile once the counts are known.
  strncpy(region, kMagicIncomplete, header_bytes);
  return reinterpret_cast<uint8_t*>(region) + header_bytes;
}
// Allocates memory_size bytes for the search structures and returns a pointer
// to them.
//
// When writing a binary file (config.write_mmap set) the file is first grown
// to hold vocab + vocab_pad + search. Depending on config.write_method the
// search area is then either anonymous memory (WRITE_AFTER) or a mapping of
// the file region that follows the padded vocabulary.
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
  const std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;

  if (!config.write_mmap) {
    util::MapAnonymous(memory_size, backing.search);
    return reinterpret_cast<uint8_t*>(backing.search.get());
  }

  // Grow the file to accommodate the search, using zeros.
  try {
    util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
  } catch (util::ErrnoException &e) {
    e << " for file " << config.write_mmap;
    throw e;
  }

  if (config.write_method == Config::WRITE_AFTER) {
    util::MapAnonymous(memory_size, backing.search);
    return reinterpret_cast<uint8_t*>(backing.search.get());
  }

  // mmap it now.
  // We're skipping over the header and vocab for the search space mmap.
  // mmap likes page aligned offsets, so some arithmetic to round the offset down.
  std::size_t page_size = util::SizePage();
  const std::size_t alignment_cruft = adjusted_vocab % page_size;
  const std::size_t mapped_bytes = alignment_cruft + memory_size;
  backing.search.reset(
      util::MapOrThrow(mapped_bytes, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft),
      mapped_bytes,
      util::scoped_memory::MMAP_ALLOCATED);
  return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
}
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
if (!config.write_mmap) return;
switch (config.write_method) {
case Config::WRITE_MMAP:
util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
util::SyncOrThrow(backing.search.get(), backing.search.size());
break;
case Config::WRITE_AFTER:
util::SeekOrThrow(backing.file.get(), 0);
util::WriteOrThrow(backing.file.get(), backing.vocab.get(), backing.vocab.size());
util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad);
util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size());
util::FSyncOrThrow(backing.file.get());
break;
}
// header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params = Parameters();
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;
params.fixed.search_version = search_version;
WriteHeader(backing.vocab.get(), params);
if (config.write_method == Config::WRITE_AFTER) {
util::SeekOrThrow(backing.file.get(), 0);
util::WriteOrThrow(backing.file.get(), backing.vocab.get(), TotalHeaderSize(counts.size()));
}
}
namespace detail {
bool IsBinaryFormat(int fd) {
const uint64_t size = util::SizeFile(fd);
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
@ -209,44 +133,164 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
}
void SeekPastHeader(int fd, const Parameters &params) {
util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
// Sentinel held by header_size_ / vocab_size_ until the corresponding setup
// call has filled them in; member functions assert against it.
const std::size_t kInvalidSize = static_cast<std::size_t>(-1);

// Copies the write/load settings out of config and marks every size/offset as
// "not yet known". vocab_pad_ is given a defined value (0) as well: it is only
// assigned in GrowForSearch but is read in offset arithmetic elsewhere, so it
// must never be left indeterminate.
BinaryFormat::BinaryFormat(const Config &config)
  : write_method_(config.write_method),
    write_mmap_(config.write_mmap),
    load_method_(config.load_method),
    header_size_(kInvalidSize),
    vocab_size_(kInvalidSize),
    vocab_pad_(0),
    vocab_string_offset_(kInvalidOffset) {}
// Prepares this object for loading an existing binary file from fd.
// Reads the header into params, checks it against the expected model type and
// search version (MatchCheck), and records the header size for later offsets.
void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params) {
  // Hand the descriptor to file_ before anything else touches it.
  file_.reset(fd);
  // Ignore write requests; this is already in binary format.
  write_mmap_ = NULL;

  ReadHeader(file_.get(), params);
  MatchCheck(model_type, search_version, params);

  const std::size_t order = params.counts.size();
  header_size_ = TotalHeaderSize(order);
}
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing) {
const uint64_t file_size = util::SizeFile(backing.file.get());
// Reads `amount` bytes into `to` from the file, at a position given relative
// to the end of the header. Requires header_size_ to have been set (asserted).
void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
  assert(header_size_ != kInvalidSize);
  const uint64_t absolute_offset = offset_excluding_header + header_size_;
  util::PReadOrThrow(file_.get(), to, amount, absolute_offset);
}
void *BinaryFormat::LoadBinary(std::size_t size) {
assert(header_size_ != kInvalidSize);
const uint64_t file_size = util::SizeFile(file_.get());
// The header is smaller than a page, so we have to map the whole header as well.
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size);
UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search);
util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_);
if (config.enumerate_vocab && !params.fixed.has_vocabulary)
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
// Seek to vocabulary words
util::SeekOrThrow(backing.file.get(), total_map);
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
vocab_string_offset_ = total_map;
return reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
}
void ComplainAboutARPA(const Config &config, ModelType model_type) {
if (config.write_mmap || !config.messages) return;
if (config.arpa_complain == Config::ALL) {
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
} else if (config.arpa_complain == Config::EXPENSIVE &&
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
// Allocates space for the vocabulary lookup table (memory_size bytes) and
// returns a pointer to where the vocabulary data begins.
//
// Without an output file the table is anonymous memory and there is no
// header (header_size_ = 0). With an output file the region is preceded by
// TotalHeaderSize(order) bytes of header, which are stamped with the
// placeholder magic; the returned pointer skips the header.
void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
  vocab_size_ = memory_size;

  if (!write_mmap_) {
    header_size_ = 0;
    util::MapAnonymous(memory_size, memory_vocab_);
    return reinterpret_cast<uint8_t*>(memory_vocab_.get());
  }

  header_size_ = TotalHeaderSize(order);
  const std::size_t region_bytes = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size));
  file_.reset(util::CreateOrThrow(write_mmap_));

  // Starts out NULL so the pointer is initialized even if write_method_
  // matches neither branch below.
  void *region = NULL;
  if (write_method_ == Config::WRITE_MMAP) {
    mapping_.reset(util::MapZeroedWrite(file_.get(), region_bytes), region_bytes, util::scoped_memory::MMAP_ALLOCATED);
    region = mapping_.get();
  } else if (write_method_ == Config::WRITE_AFTER) {
    util::ResizeOrThrow(file_.get(), 0);
    util::MapAnonymous(region_bytes, memory_vocab_);
    region = memory_vocab_.get();
  }

  strncpy(reinterpret_cast<char*>(region), kMagicIncomplete, header_size_);
  return reinterpret_cast<uint8_t*>(region) + header_size_;
}
// Allocates memory_size bytes for the search structures and returns a pointer
// to them. vocab_base is updated because the vocabulary may move when the
// file is remapped. Must be called after SetupJustVocab (asserted through
// vocab_size_).
void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) {
  assert(vocab_size_ != kInvalidSize);
  vocab_pad_ = vocab_pad;

  // Also known as total size without vocab words; the vocab strings start here.
  const std::size_t size_without_words = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
  vocab_string_offset_ = size_without_words;

  const bool search_in_memory = !write_mmap_ || write_method_ == Config::WRITE_AFTER;
  if (search_in_memory) {
    util::MapAnonymous(memory_size, memory_search_);
    assert(header_size_ == 0 || write_mmap_);
    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
    return reinterpret_cast<uint8_t*>(memory_search_.get());
  }

  assert(write_method_ == Config::WRITE_MMAP);
  // Grow the file to accommodate the search, using zeros.
  // According to man mmap, behavior is undefined when the file is resized
  // underneath a mmap that is not a multiple of the page size. So to be
  // safe, we'll unmap it and map it again.
  mapping_.reset();
  util::ResizeOrThrow(file_.get(), size_without_words);

  void *search_base;
  MapFile(vocab_base, search_base);
  return search_base;
}
// Writes the vocabulary strings in `buffer` to the output file at
// VocabStringReadingOffset() and refreshes vocab_base / search_base, which
// may change when the file is remapped. With no output file nothing is
// written and the in-memory bases are returned.
void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) {
  // Checking Config's include_vocab is the responsibility of the caller.
  assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize);

  if (!write_mmap_) {
    // Unchanged base.
    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get());
    search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
    return;
  }

  const bool file_is_mapped = (write_method_ == Config::WRITE_MMAP);
  // Drop the mapping before writing through the descriptor (presumably
  // because the write extends the file); MapFile re-establishes it below.
  if (file_is_mapped) mapping_.reset();

  util::SeekOrThrow(file_.get(), VocabStringReadingOffset());
  util::WriteOrThrow(file_.get(), buffer.data(), buffer.size());

  if (file_is_mapped) {
    MapFile(vocab_base, search_base);
    return;
  }
  vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
  search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
}
} // namespace detail
// Flushes the model to the output file (no-op when none was requested) and
// then replaces the placeholder header with the real one.
//
// WRITE_MMAP: the data already lives in the file mapping; sync it, write the
// header into the mapping and sync again.
// WRITE_AFTER: vocab and search were built in anonymous memory; write both to
// their file positions, fsync, then write the header at offset 0.
void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) {
  if (!write_mmap_) return;
  switch (write_method_) {
    case Config::WRITE_MMAP:
      util::SyncOrThrow(mapping_.get(), mapping_.size());
      break;
    case Config::WRITE_AFTER:
      util::SeekOrThrow(file_.get(), 0);
      util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size());
      util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_);
      util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size());
      util::FSyncOrThrow(file_.get());
      break;
  }
  // header and vocab share the same mmap.
  Parameters params = Parameters();
  // Zero only the plain fixed-width part (presumably so padding bytes that
  // end up in the file are deterministic). Parameters also holds a
  // std::vector, and memset over a non-trivially-copyable object is
  // undefined behaviour, so the whole struct must not be wiped.
  memset(&params.fixed, 0, sizeof(params.fixed));
  params.counts = counts;
  params.fixed.order = counts.size();
  params.fixed.probing_multiplier = config.probing_multiplier;
  params.fixed.model_type = model_type;
  params.fixed.has_vocabulary = config.include_vocab;
  params.fixed.search_version = search_version;
  switch (write_method_) {
    case Config::WRITE_MMAP:
      WriteHeader(mapping_.get(), params);
      util::SyncOrThrow(mapping_.get(), mapping_.size());
      break;
    case Config::WRITE_AFTER:
      {
        // Build the header in a scratch buffer and write it over the
        // placeholder at the start of the file.
        std::vector<uint8_t> buffer(TotalHeaderSize(counts.size()));
        WriteHeader(&buffer[0], params);
        util::SeekOrThrow(file_.get(), 0);
        util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
      }
      break;
  }
}
// Maps the first vocab_string_offset_ bytes of the file into mapping_ and
// reports where the vocabulary and search areas start inside that mapping:
// the vocabulary follows the header, the search area follows the padded
// vocabulary.
void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) {
  mapping_.reset(
      util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()),
      vocab_string_offset_,
      util::scoped_memory::MMAP_ALLOCATED);

  uint8_t *const file_start = reinterpret_cast<uint8_t*>(mapping_.get());
  uint8_t *const vocab_start = file_start + header_size_;
  vocab_base = vocab_start;
  search_base = vocab_start + vocab_size_ + vocab_pad_;
}
bool RecognizeBinary(const char *file, ModelType &recognized) {
util::scoped_fd fd(util::OpenReadOrThrow(file));
if (!detail::IsBinaryFormat(fd.get())) return false;
if (!IsBinaryFormat(fd.get())) {
return false;
}
Parameters params;
detail::ReadHeader(fd.get(), params);
ReadHeader(fd.get(), params);
recognized = params.fixed.model_type;
return true;
}

View File

@ -17,6 +17,8 @@
namespace lm {
namespace ngram {
extern const char *kModelNames[6];
/*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
@ -42,67 +44,63 @@ struct Parameters {
std::vector<uint64_t> counts;
};
struct Backing {
// File behind memory, if any.
util::scoped_fd file;
// Vocabulary lookup table. Not to be confused with the vocab words themselves.
util::scoped_memory vocab;
// Raw block of memory backing the language model data structures
util::scoped_memory search;
// Manages the file and memory behind a language model. The public interface
// is split into two groups: calls used when reading an existing binary file,
// and calls used when writing a binary file or building in RAM from ARPA.
class BinaryFormat {
public:
// Construct from the configuration (see the members marked "Copied from configuration").
explicit BinaryFormat(const Config &config);
// Reading a binary file:
// Takes ownership of fd
void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);
// Used to read parts of the file to update the config object before figuring out full size.
void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
// Actually load the binary file and return a pointer to the beginning of the search area.
void *LoadBinary(std::size_t size);
// Return vocab_string_offset_. Asserts that it has been set, i.e. that it is
// not the kInvalidOffset sentinel.
uint64_t VocabStringReadingOffset() const {
assert(vocab_string_offset_ != kInvalidOffset);
return vocab_string_offset_;
}
// Writing a binary file or initializing in RAM from ARPA:
// Size for vocabulary.
void *SetupJustVocab(std::size_t memory_size, uint8_t order);
// Warning: can change the vocabulary base pointer.
void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
// Warning: can change vocabulary and search base addresses.
void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
// Write the header at the beginning of the file.
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
private:
// (Re)map file_ into mapping_ and output the vocabulary and search base
// addresses through the two reference parameters.
void MapFile(void *&vocab_base, void *&search_base);
// Copied from configuration.
const Config::WriteMethod write_method_;
const char *write_mmap_;
util::LoadMethod load_method_;
// File behind memory, if any.
util::scoped_fd file_;
// If there is a file involved, a single mapping.
util::scoped_memory mapping_;
// If the data is only in memory, separately allocate each because the trie
// knows vocab's size before it knows search's size (because SRILM might
// have pruned).
util::scoped_memory memory_vocab_, memory_search_;
// Memory ranges. Note that these may not be contiguous and may not all
// exist.
std::size_t header_size_, vocab_size_, vocab_pad_;
// aka end of search.
uint64_t vocab_string_offset_;
// Sentinel (all bits set) that VocabStringReadingOffset() asserts against.
static const uint64_t kInvalidOffset = (uint64_t)-1;
};
// Create just enough of a binary file to write vocabulary to it.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing);
// Write header to binary file. This is done last to prevent incomplete files
// from loading.
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing);
namespace detail {
bool IsBinaryFormat(int fd);
void ReadHeader(int fd, Parameters &params);
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params);
void SeekPastHeader(int fd, const Parameters &params);
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing);
void ComplainAboutARPA(const Config &config, ModelType model_type);
} // namespace detail
// Load a language model from `file` into `to`.
// The file is opened for reading and its descriptor is stored in the model's
// backing. If detail::IsBinaryFormat accepts it, the header is read and
// checked against To::kModelType / To::kVersion and the model is initialized
// from the binary data; otherwise the file is handed to InitializeFromARPA.
// Any util::Exception raised along the way has " File: <file>" appended
// before being rethrown.
template <class To> void LoadLM(const char *file, const Config &config, To &to) {
Backing &backing = to.MutableBacking();
backing.file.reset(util::OpenReadOrThrow(file));
try {
if (detail::IsBinaryFormat(backing.file.get())) {
Parameters params;
detail::ReadHeader(backing.file.get(), params);
detail::MatchCheck(To::kModelType, To::kVersion, params);
// Replace the run-time configured probing_multiplier with the one in the file.
Config new_config(config);
new_config.probing_multiplier = params.fixed.probing_multiplier;
detail::SeekPastHeader(backing.file.get(), params);
// Lets the model type adjust new_config from the file before the size is computed.
To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config);
uint64_t memory_size = To::Size(params.counts, new_config);
uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
to.InitializeFromBinary(start, params, new_config, backing.file.get());
} else {
// Not a binary file: treat it as ARPA, using the caller's original config.
detail::ComplainAboutARPA(config, To::kModelType);
to.InitializeFromARPA(file, config);
}
} catch (util::Exception &e) {
// Add the file name as context, then rethrow the same exception object.
e << " File: " << file;
throw;
}
}
} // namespace ngram
} // namespace lm
#endif // LM_BINARY_FORMAT__

View File

@ -87,7 +87,7 @@ class VocabHandout {
Table table_;
std::size_t double_cutoff_;
util::FakeOFStream word_list_;
};
@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_);
}
private:
const std::size_t size_;
};
@ -106,11 +106,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_);
}
}
private:
const std::size_t size_;
};
@ -131,7 +131,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
class Writer {
public:
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@ -140,7 +140,7 @@ class Writer {
dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) {
// Add special words. AdjustCounts is responsible if order != 1.
// Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK);
AddUnigramWord(kBOS);
}
@ -170,16 +170,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return;
}
// Complete the write.
// Complete the write.
gram_.Count() = 1;
// Prepare the next n-gram.
// Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_);
gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin());
return;
}
// Block end. Need to store the context in a temporary buffer.
// Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear();
block_->SetValidSize(block_size_);
@ -207,7 +207,7 @@ class Writer {
// Hash table combiner implementation.
Dedupe dedupe_;
// Small buffer to hold existing ngrams when shifting across a block boundary.
// Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_;
const std::size_t block_size_;
@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
return VocabHandout::MemUsage(vocab_estimate);
}
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
: from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
@ -240,7 +240,10 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
uint64_t count = 0;
bool delimiters[256];
memset(delimiters, 0, sizeof(delimiters));
delimiters['\0'] = delimiters['\t'] = delimiters['\n'] = delimiters['\r'] = delimiters[' '] = true;
const char kDelimiterSet[] = "\0\t\n\r ";
for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) {
delimiters[static_cast<unsigned char>(*i)] = true;
}
try {
while(true) {
StringPiece line(from_.ReadLine());

View File

@ -33,12 +33,12 @@ class Callback {
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
probs_[order_minus_1 + 1] = pay.complete.prob;
pay.complete.prob = log10(pay.complete.prob);
// TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check valid pointer at tht end
// TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
++backoffs_[order_minus_1];
} else {
// Not a context.
// Not a context.
pay.complete.backoff = 0.0;
}
}
@ -52,7 +52,7 @@ class Callback {
};
} // namespace
Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
: uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
// perform order-wise interpolation

View File

@ -11,11 +11,7 @@ Config::Config() :
enumerate_vocab(NULL),
unknown_missing(COMPLAIN),
sentence_marker_missing(THROW_UP),
#if defined(_WIN32) || defined(_WIN64)
positive_log_probability(SILENT),
#else
positive_log_probability(THROW_UP),
#endif
unknown_missing_logprob(-100.0),
probing_multiplier(1.5),
building_memory(1073741824ULL), // 1 GB

View File

@ -17,14 +17,14 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
typedef VocabularyT Vocabulary;
/* Translate from void* to State */
FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->FullScore(
*reinterpret_cast<const State*>(in_state),
new_word,
*reinterpret_cast<State*>(out_state));
}
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->FullScoreForgotState(
context_rbegin,
context_rend,
@ -37,7 +37,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}
float Score(const void *in_state, const WordIndex new_word, void *out_state) const {
float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->Score(
*reinterpret_cast<const State*>(in_state),
new_word,

View File

@ -14,10 +14,6 @@
#include <string>
#include <vector>
#if !defined __MINGW32__
#include <err.h>
#endif
#include <string.h>
#include <stdint.h>

View File

@ -5,27 +5,18 @@
#include <iostream>
#include <string>
#if !defined __MINGW32__
#include <err.h>
#endif
#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
namespace lm {
class CountOutput : boost::noncopyable {
public:
explicit CountOutput(const char *name) : file_(name, std::ios::out) {}
explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {}
void AddNGram(const StringPiece &line) {
if (!(file_ << line << '\n')) {
#if defined __MINGW32__
std::cerr<<"Writing counts file failed"<<std::endl;
exit(3);
#else
err(3, "Writing counts file failed");
#endif
}
file_ << line << '\n';
}
template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
@ -37,12 +28,12 @@ class CountOutput : boost::noncopyable {
}
private:
std::fstream file_;
util::FakeOFStream file_;
};
class CountBatch {
public:
explicit CountBatch(std::streamsize initial_read)
explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) {
buffer_.reserve(initial_read);
}
@ -75,7 +66,7 @@ class CountBatch {
private:
std::streamsize initial_read_;
// This could have been a std::string but that's less happy with raw writes.
// This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_;
};

View File

@ -6,6 +6,7 @@
#endif
#include "lm/filter/vocab.hh"
#include "lm/filter/wrapper.hh"
#include "util/exception.hh"
#include "util/file_piece.hh"
#include <boost/ptr_container/ptr_vector.hpp>
@ -57,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
struct Config {
Config() :
Config() :
#ifndef NTHREAD
batch_size(25000),
threads(boost::thread::hardware_concurrency()),
@ -157,102 +158,96 @@ template <class Format> void DispatchFilterModes(const Config &config, std::istr
} // namespace lm
int main(int argc, char *argv[]) {
if (argc < 4) {
lm::DisplayHelp(argv[0]);
return 1;
}
// I used to have boost::program_options, but some users didn't want to compile boost.
lm::Config config;
config.mode = lm::MODE_UNSET;
for (int i = 1; i < argc - 2; ++i) {
const char *str = argv[i];
if (!std::strcmp(str, "copy")) {
config.mode = lm::MODE_COPY;
} else if (!std::strcmp(str, "single")) {
config.mode = lm::MODE_SINGLE;
} else if (!std::strcmp(str, "multiple")) {
config.mode = lm::MODE_MULTIPLE;
} else if (!std::strcmp(str, "union")) {
config.mode = lm::MODE_UNION;
} else if (!std::strcmp(str, "phrase")) {
config.phrase = true;
} else if (!std::strcmp(str, "context")) {
config.context = true;
} else if (!std::strcmp(str, "arpa")) {
config.format = lm::FORMAT_ARPA;
} else if (!std::strcmp(str, "raw")) {
config.format = lm::FORMAT_COUNT;
#ifndef NTHREAD
} else if (!std::strncmp(str, "threads:", 8)) {
config.threads = boost::lexical_cast<size_t>(str + 8);
if (!config.threads) {
std::cerr << "Specify at least one thread." << std::endl;
return 1;
}
} else if (!std::strncmp(str, "batch_size:", 11)) {
config.batch_size = boost::lexical_cast<size_t>(str + 11);
if (config.batch_size < 5000) {
std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
if (!config.batch_size) return 1;
}
#endif
} else {
try {
if (argc < 4) {
lm::DisplayHelp(argv[0]);
return 1;
}
}
if (config.mode == lm::MODE_UNSET) {
lm::DisplayHelp(argv[0]);
return 1;
}
if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl;
return 1;
}
bool cmd_is_model = true;
const char *cmd_input = argv[argc - 2];
if (!strncmp(cmd_input, "vocab:", 6)) {
cmd_is_model = false;
cmd_input += 6;
} else if (!strncmp(cmd_input, "model:", 6)) {
cmd_input += 6;
} else if (strchr(cmd_input, ':')) {
#if defined __MINGW32__
std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
exit(1);
#else
errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
#endif // defined
} else {
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
}
std::ifstream cmd_file;
std::istream *vocab;
if (cmd_is_model) {
vocab = &std::cin;
} else {
cmd_file.open(cmd_input, std::ios::in);
if (!cmd_file) {
#if defined __MINGW32__
std::cerr << "Could not open input file " << cmd_input << std::endl;
exit(2);
#else
err(2, "Could not open input file %s", cmd_input);
#endif // defined
// I used to have boost::program_options, but some users didn't want to compile boost.
lm::Config config;
config.mode = lm::MODE_UNSET;
for (int i = 1; i < argc - 2; ++i) {
const char *str = argv[i];
if (!std::strcmp(str, "copy")) {
config.mode = lm::MODE_COPY;
} else if (!std::strcmp(str, "single")) {
config.mode = lm::MODE_SINGLE;
} else if (!std::strcmp(str, "multiple")) {
config.mode = lm::MODE_MULTIPLE;
} else if (!std::strcmp(str, "union")) {
config.mode = lm::MODE_UNION;
} else if (!std::strcmp(str, "phrase")) {
config.phrase = true;
} else if (!std::strcmp(str, "context")) {
config.context = true;
} else if (!std::strcmp(str, "arpa")) {
config.format = lm::FORMAT_ARPA;
} else if (!std::strcmp(str, "raw")) {
config.format = lm::FORMAT_COUNT;
#ifndef NTHREAD
} else if (!std::strncmp(str, "threads:", 8)) {
config.threads = boost::lexical_cast<size_t>(str + 8);
if (!config.threads) {
std::cerr << "Specify at least one thread." << std::endl;
return 1;
}
} else if (!std::strncmp(str, "batch_size:", 11)) {
config.batch_size = boost::lexical_cast<size_t>(str + 11);
if (config.batch_size < 5000) {
std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
if (!config.batch_size) return 1;
}
#endif
} else {
lm::DisplayHelp(argv[0]);
return 1;
}
}
vocab = &cmd_file;
}
util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
if (config.mode == lm::MODE_UNSET) {
lm::DisplayHelp(argv[0]);
return 1;
}
if (config.format == lm::FORMAT_ARPA) {
lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
} else if (config.format == lm::FORMAT_COUNT) {
lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl;
return 1;
}
bool cmd_is_model = true;
const char *cmd_input = argv[argc - 2];
if (!strncmp(cmd_input, "vocab:", 6)) {
cmd_is_model = false;
cmd_input += 6;
} else if (!strncmp(cmd_input, "model:", 6)) {
cmd_input += 6;
} else if (strchr(cmd_input, ':')) {
std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
return 1;
} else {
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
}
std::ifstream cmd_file;
std::istream *vocab;
if (cmd_is_model) {
vocab = &std::cin;
} else {
cmd_file.open(cmd_input, std::ios::in);
UTIL_THROW_IF(!cmd_file, util::ErrnoException, "Failed to open " << cmd_input);
vocab = &cmd_file;
}
util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
if (config.format == lm::FORMAT_ARPA) {
lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
} else if (config.format == lm::FORMAT_COUNT) {
lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
}
return 0;
} catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
return 0;
}

View File

@ -1,5 +1,5 @@
#ifndef LM_FILTER_FORMAT_H__
#define LM_FITLER_FORMAT_H__
#define LM_FILTER_FORMAT_H__
#include "lm/filter/arpa_io.hh"
#include "lm/filter/count_io.hh"

View File

@ -5,10 +5,6 @@
#include <ctype.h>
#if !defined __MINGW32__
#include <err.h>
#endif
namespace lm {
namespace vocab {
@ -34,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
}// namespace
// Read space separated words in enter separated lines. These lines can be
// very long, so don't read an entire line at a time.
// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
in.exceptions(std::istream::badbit);
unsigned int sentence = 0;

View File

@ -34,8 +34,47 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
if (static_cast<std::size_t>(start - static_cast<uint8_t*>(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast<uint8_t*>(base)) << " but Size says they should take " << goal_size);
}
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
LoadLM(file, config, *this);
namespace {
// Print an advisory to config.messages when a model is loaded from ARPA.
// Nothing is printed if config.write_mmap is set or there is no message
// stream. With Config::ALL every load gets the generic hint; with
// Config::EXPENSIVE only the trie-based model types get a warning.
void ComplainAboutARPA(const Config &config, ModelType model_type) {
  if (config.write_mmap) return;
  if (!config.messages) return;

  if (config.arpa_complain == Config::ALL) {
    *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
    return;
  }

  if (config.arpa_complain != Config::EXPENSIVE) return;

  const bool trie_based =
      model_type == TRIE || model_type == QUANT_TRIE ||
      model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE;
  if (trie_based) {
    *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
  }
}
// Sanity-check the per-order n-gram counts of a model.
// Raises FormatLoadException (via UTIL_THROW_IF) when the number of orders
// exceeds KENLM_MAX_ORDER. Where std::size_t is narrower than uint64_t, also
// raises util::OverflowException for any count above the maximum size_t value.
void CheckCounts(const std::vector<uint64_t> &counts) {
UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
// Only relevant when size_t cannot represent every uint64_t value.
if (sizeof(uint64_t) > sizeof(std::size_t)) {
for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) {
// (i - counts.begin() + 1) is the 1-based n-gram order shown in the message.
UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines.");
}
}
}
} // namespace
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &init_config) : backing_(init_config) {
util::scoped_fd fd(util::OpenReadOrThrow(file));
if (IsBinaryFormat(fd.get())) {
Parameters parameters;
int fd_shallow = fd.release();
backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters);
CheckCounts(parameters.counts);
Config new_config(init_config);
new_config.probing_multiplier = parameters.fixed.probing_multiplier;
Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config);
UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config);
vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset());
} else {
ComplainAboutARPA(init_config, kModelType);
InitializeFromARPA(fd.release(), file, init_config);
}
// g++ prints warnings unless these are fully initialized.
State begin_sentence = State();
@ -50,27 +89,9 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
P::Init(begin_sentence, null_context, vocab_, search_.Order());
}
namespace {
void CheckCounts(const std::vector<uint64_t> &counts) {
UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
if (sizeof(uint64_t) > sizeof(std::size_t)) {
for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) {
UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines.");
}
}
}
} // namespace
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
CheckCounts(params.counts);
SetupMemory(start, params.counts, config);
vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
search_.LoadedBinary();
}
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(int fd, const char *file, const Config &config) {
// Backing file is the ARPA.
util::FilePiece f(fd, file, config.ProgressMessages());
try {
std::vector<uint64_t> counts;
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
@ -81,13 +102,17 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
vocab_.SetupMemory(backing_.SetupJustVocab(vocab_size, counts.size()), vocab_size, counts[0], config);
if (config.write_mmap) {
if (config.write_mmap && config.include_vocab) {
WriteWordsWrapper wrap(config.enumerate_vocab);
vocab_.ConfigureEnumerate(&wrap, counts[0]);
search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + Search::Size(counts, config));
void *vocab_rebase, *search_rebase;
backing_.WriteVocabWords(wrap.Buffer(), vocab_rebase, search_rebase);
// Due to writing at the end of file, mmap may have relocated data. So remap.
vocab_.Relocate(vocab_rebase);
search_.SetupMemory(reinterpret_cast<uint8_t*>(search_rebase), counts, config);
} else {
vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]);
search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
@ -99,18 +124,13 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
search_.UnknownUnigram().backoff = 0.0;
search_.UnknownUnigram().prob = config.unknown_missing_logprob;
}
FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_);
backing_.FinishFile(config, kModelType, kVersion, counts);
} catch (util::Exception &e) {
e << " Byte: " << f.Offset();
throw;
}
}
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
Search::UpdateConfigFromBinary(fd, counts, config);
}
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) {

View File

@ -104,10 +104,6 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
}
private:
friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
// Score bigrams and above. Do not include backoff.
@ -116,15 +112,11 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
// Appears after Size in the cc file.
void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);
void InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd);
void InitializeFromARPA(const char *file, const Config &config);
void InitializeFromARPA(int fd, const char *file, const Config &config);
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
Backing &MutableBacking() { return backing_; }
Backing backing_;
BinaryFormat backing_;
VocabularyT vocab_;

View File

@ -360,10 +360,11 @@ BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) {
LoadingTest<QuantArrayTrieModel>();
}
template <class ModelT> void BinaryTest() {
template <class ModelT> void BinaryTest(Config::WriteMethod write_method) {
Config config;
config.write_mmap = "test.binary";
config.messages = NULL;
config.write_method = write_method;
ExpectEnumerateVocab enumerate;
config.enumerate_vocab = &enumerate;
@ -406,6 +407,11 @@ template <class ModelT> void BinaryTest() {
unlink("test_nounk.binary");
}
template <class ModelT> void BinaryTest() {
BinaryTest<ModelT>(Config::WRITE_MMAP);
BinaryTest<ModelT>(Config::WRITE_AFTER);
}
BOOST_AUTO_TEST_CASE(write_and_read_probing) {
BinaryTest<ProbingModel>();
}

View File

@ -38,13 +38,13 @@ const char kSeparatelyQuantizeVersion = 2;
} // namespace
void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
char version;
util::ReadOrThrow(fd, &version, 1);
util::ReadOrThrow(fd, &config.prob_bits, 1);
util::ReadOrThrow(fd, &config.backoff_bits, 1);
void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
unsigned char buffer[3];
file.ReadForConfig(buffer, 3, offset);
char version = buffer[0];
config.prob_bits = buffer[1];
config.backoff_bits = buffer[2];
if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
util::AdvanceOrThrow(fd, -3);
}
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {

Some files were not shown because too many files have changed in this diff Show More