Merge branch 'master' of ssh://github.com/moses-smt/mosesdecoder

This commit is contained in:
phikoehn 2014-02-12 21:01:09 +00:00
commit 049be8b71c
133 changed files with 7456 additions and 1604 deletions

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -72,13 +72,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -0,0 +1,133 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.456080129" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.582801917" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-mixed-syntax/Debug}" id="cdt.managedbuild.builder.gnu.cross.1220166455" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.2055012191" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1768196213" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2007889843" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.855436310" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.506549229" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1497326561" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.606353571" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.740521305" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1946120010" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.1563475751" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.106010037" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.136661991" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.2112208574" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.172930211" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.715007893">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.715007893." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.99436307" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.801178939" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-mixed-syntax/Release}" id="cdt.managedbuild.builder.gnu.cross.1999547547" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1481537766" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1967527847" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.442342681" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1847950300" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.1130138972" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.158429528" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.2020667840" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1372779734" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.371006952" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1770045040" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.707592414" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="extract-mixed-syntax.cdt.managedbuild.target.gnu.cross.exe.1868010260" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038;cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906;cdt.managedbuild.tool.gnu.c.compiler.input.442342681">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915;cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568;cdt.managedbuild.tool.gnu.c.compiler.input.2007889843">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>extract-mixed-syntax</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

View File

@ -0,0 +1,37 @@
/*
* Global.cpp
* extract
*
* Created by Hieu Hoang on 01/02/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "Global.h"
bool g_debug = false;

// Default settings for mixed-syntax rule extraction.
// Initializers are listed in the same order as the members are declared
// in Global.h, so the written order matches the actual initialization
// order (C++ initializes members in declaration order; the previous
// ordering triggered -Wreorder).
Global::Global()
  : minHoleSpanSourceDefault(2)
  , maxHoleSpanSourceDefault(7)
  , minHoleSpanSourceSyntax(1)
  , maxHoleSpanSourceSyntax(1000)
  , maxSymbols(5)
  , glueGrammarFlag(false)
  , unknownWordLabelFlag(false)
  , maxNonTerm(3)
  , maxNonTermDefault(2)
  // int minHoleSize(1)
  // int minSubPhraseSize(1) // minimum size of a remaining lexical phrase
  , sourceSyntax(true)
  , targetSyntax(false)
  , mixed(true)
  , maxUnaligned(5)
  , uppermostOnly(true)
  , allowDefaultNonTermEdge(true)
  , gzOutput(false)
  //bool zipFiles(false)
{}

View File

@ -0,0 +1,45 @@
#pragma once
/*
* Global.h
* extract
*
* Created by Hieu Hoang on 01/02/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <map>
#include <string>
// Global: bag of tunable parameters for mixed-syntax rule extraction.
// Defaults are assigned in Global.cpp; an instance is threaded through
// the extraction code (a copy constructor is declared for that purpose).
class Global
{
public:
int minHoleSpanSourceDefault; // min source width of a non-terminal hole (default, non-syntax case)
int maxHoleSpanSourceDefault; // max source width of a non-terminal hole (default, non-syntax case)
int minHoleSpanSourceSyntax; // min source width of a hole carrying a syntax label
int maxHoleSpanSourceSyntax; // max source width of a hole carrying a syntax label
int maxSymbols; // cap on symbols per rule -- assumed; confirm against extract.cpp usage
bool glueGrammarFlag; // also emit a glue grammar
bool unknownWordLabelFlag; // also emit labels for unknown words
int maxNonTerm; // max non-terminals per rule (syntax case)
int maxNonTermDefault; // max non-terminals per rule (default case)
bool sourceSyntax; // source side carries syntax annotation
bool targetSyntax; // target side carries syntax annotation
bool mixed; // extract mixed syntactic/non-syntactic rules
int maxUnaligned; // cap on unaligned words -- assumed; confirm against caller
bool uppermostOnly; // NOTE(review): presumably "use only the uppermost label per span" -- verify
bool allowDefaultNonTermEdge; // allow default (non-syntax) non-terminals at rule edges
bool gzOutput; // write gzip-compressed output
Global();
Global(const Global&);
};
// Global debug switch; defined in Global.cpp.
extern bool g_debug;
// Expands to the declaration of a const debug-dump member function.
#define DEBUG_OUTPUT() void DebugOutput() const;

View File

@ -0,0 +1,62 @@
// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "InputFileStream.h"
#include "gzfilebuf.h"
#include <iostream>
using namespace std;
namespace Moses
{
// Opens filePath for reading, transparently decompressing when the path
// ends in ".gz". On failure the program prints an error and exits(1).
InputFileStream::InputFileStream(const std::string &filePath)
  : std::istream(NULL)
  , m_streambuf(NULL)
{
  if (filePath.size() > 3 &&
      filePath.substr(filePath.size() - 3, 3) == ".gz") {
    // gzipped input: gzfilebuf decompresses on the fly
    m_streambuf = new gzfilebuf(filePath.c_str());
  } else {
    std::filebuf* fb = new std::filebuf();
    // std::filebuf::open returns NULL on failure. The original code
    // assigned the return value back into fb, which leaked the filebuf
    // on the failure path; keep the owning pointer and test the result.
    if (fb->open(filePath.c_str(), std::ios::in) == NULL) {
      delete fb;
      cerr << "Can't read " << filePath.c_str() << endl;
      exit(1);
    }
    m_streambuf = fb;
  }
  // attach the chosen buffer to this istream
  this->init(m_streambuf);
}
// Releases the owned stream buffer (plain filebuf or gzfilebuf).
InputFileStream::~InputFileStream()
{
  if (m_streambuf != NULL) {
    delete m_streambuf;
    m_streambuf = NULL;
  }
}
// Intentionally a no-op: the destructor owns and frees the buffer.
// NOTE(review): callers expecting Close() to release the file handle
// early should be aware nothing happens here.
void InputFileStream::Close()
{
}
}

View File

@ -0,0 +1,48 @@
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_InputFileStream_h
#define moses_InputFileStream_h
#include <cstdlib>
#include <fstream>
#include <string>
namespace Moses
{
/** Used in place of std::istream, can read zipped files if it ends in .gz
*/
class InputFileStream : public std::istream
{
protected:
std::streambuf *m_streambuf; // owned; std::filebuf or gzfilebuf (.gz input)
public:
// Opens filePath for reading; exits the program if it cannot be read.
InputFileStream(const std::string &filePath);
~InputFileStream();
// Currently a no-op; the destructor releases the buffer.
void Close();
};
}
#endif

View File

@ -0,0 +1,180 @@
/*
* Lattice.cpp
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include "Lattice.h"
#include "LatticeNode.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "SyntaxTree.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "Rule.h"
#include "RuleCollection.h"
using namespace std;
// One stack per source position, plus one extra for the sentence-end
// boundary.
Lattice::Lattice(size_t sourceSize)
  : m_stacks(sourceSize + 1)
{
}
// Frees every LatticeNode owned by the per-position stacks.
Lattice::~Lattice()
{
  for (size_t i = 0; i < m_stacks.size(); ++i) {
    RemoveAllInColl(m_stacks[i]);
  }
}
// Seeds the lattice at startPos: always adds one terminal node covering
// the single source word there, then adds non-terminal nodes for every
// tunnel (consistently-aligned hole) whose source span starts at startPos.
void Lattice::CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global)
{
// term
Stack &startStack = GetStack(startPos);
LatticeNode *node = new LatticeNode(startPos, &sentence);
startStack.push_back(node);
// non-term
for (size_t endPos = startPos + 1; endPos <= sentence.source.size(); ++endPos)
{
// tunnels are indexed by (start position, inclusive end position)
const TunnelList &tunnels = tunnelColl.GetTunnels(startPos, endPos - 1);
TunnelList::const_iterator iterHole;
for (iterHole = tunnels.begin(); iterHole != tunnels.end(); ++iterHole)
{
const Tunnel &tunnel = *iterHole;
CreateArcsUsing1Hole(tunnel, sentence, global);
}
}
}
// For one tunnel, creates a non-terminal LatticeNode for every pair in
// the cartesian product of the syntax labels covering its source span
// and those covering its target span. A node is only created when the
// source span fits the applicable hole-width limit: syntactic pairs get
// the (larger) syntax limit, others the default limit.
void Lattice::CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global)
{
// range 0 = source span of the tunnel
size_t startPos = tunnel.GetRange(0).GetStartPos()
, endPos = tunnel.GetRange(0).GetEndPos();
size_t numSymbols = tunnel.GetRange(0).GetWidth();
assert(numSymbols > 0);
Stack &startStack = GetStack(startPos);
// non-terms. cartesian product of source & target labels
assert(startPos == tunnel.GetRange(0).GetStartPos() && endPos == tunnel.GetRange(0).GetEndPos());
// range 1 = target span of the tunnel
size_t startT = tunnel.GetRange(1).GetStartPos()
,endT = tunnel.GetRange(1).GetEndPos();
const SyntaxNodes &nodesS = sentence.sourceTree.GetNodes(startPos, endPos);
const SyntaxNodes &nodesT = sentence.targetTree.GetNodes(startT, endT );
SyntaxNodes::const_iterator iterS, iterT;
for (iterS = nodesS.begin(); iterS != nodesS.end(); ++iterS)
{
const SyntaxNode *syntaxNodeS = *iterS;
for (iterT = nodesT.begin(); iterT != nodesT.end(); ++iterT)
{
const SyntaxNode *syntaxNodeT = *iterT;
// the pair counts as syntactic if either side has a real syntax label
bool isSyntax = syntaxNodeS->IsSyntax() || syntaxNodeT->IsSyntax();
size_t maxSourceNonTermSpan = isSyntax ? global.maxHoleSpanSourceSyntax : global.maxHoleSpanSourceDefault;
if (maxSourceNonTermSpan >= endPos - startPos)
{
LatticeNode *node = new LatticeNode(tunnel, syntaxNodeS, syntaxNodeT);
startStack.push_back(node);
}
}
}
}
// Mutable access to the node stack anchored at source position startPos.
Stack &Lattice::GetStack(size_t startPos)
{
  assert(startPos < m_stacks.size());
  Stack &stack = m_stacks[startPos];
  return stack;
}
// Read-only access to the node stack anchored at source position startPos.
const Stack &Lattice::GetStack(size_t startPos) const
{
  assert(startPos < m_stacks.size());
  const Stack &stack = m_stacks[startPos];
  return stack;
}
// Seeds a Rule from every lattice node anchored at startPos and expands
// it recursively. Ownership: seed rules accepted by IsValid() are handed
// to m_rules (assumed to take ownership); rejected seeds are deleted
// here. Rules spawned inside Rule::CreateRules are managed by that call.
void Lattice::CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global)
{
const Stack &startStack = GetStack(startPos);
Stack::const_iterator iterStack;
for (iterStack = startStack.begin(); iterStack != startStack.end(); ++iterStack)
{
const LatticeNode *node = *iterStack;
Rule *initRule = new Rule(node);
if (initRule->CanRecurse(global, sentence.GetTunnelCollection()))
{ // may or maynot be valid, but can continue to build on this rule
initRule->CreateRules(m_rules, *this, sentence, global);
}
if (initRule->IsValid(global, sentence.GetTunnelCollection()))
{ // add to rule collection
m_rules.Add(global, initRule, sentence);
}
else
{
delete initRule;
}
}
}
// Collects every non-terminal node whose source span equals sourceRange.
// Returns them by value in a fresh Stack; the nodes remain owned by the
// lattice.
Stack Lattice::GetNonTermNode(const Range &sourceRange) const
{
  Stack matches;
  const Stack &candidates = GetStack(sourceRange.GetStartPos());
  Stack::const_iterator it;
  for (it = candidates.begin(); it != candidates.end(); ++it) {
    LatticeNode *candidate = *it;
    const Range &span = candidate->GetSourceRange();
    // every node in this stack starts at the same source position
    assert(span.GetStartPos() == sourceRange.GetStartPos());
    if (!candidate->IsTerminal() && span.GetEndPos() == sourceRange.GetEndPos()) {
      matches.push_back(candidate);
    }
  }
  return matches;
}
// Debug printing: every node of every stack, space-separated, in
// stack order.
std::ostream& operator<<(std::ostream &out, const Lattice &obj)
{
  for (size_t i = 0; i < obj.m_stacks.size(); ++i) {
    const Stack &stack = obj.m_stacks[i];
    for (size_t j = 0; j < stack.size(); ++j) {
      const LatticeNode &node = *stack[j];
      out << node << " ";
    }
  }
  return out;
}

View File

@ -0,0 +1,47 @@
#pragma once
/*
* Lattice.h
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <iostream>
#include <vector>
#include "RuleCollection.h"
class Global;
class LatticeNode;
class Tunnel;
class TunnelCollection;
class SentenceAlignment;
// A Stack holds the lattice nodes anchored at one source position.
typedef std::vector<LatticeNode*> Stack;

// Extraction lattice over one sentence pair: one stack of nodes per
// source position (terminals and labelled tunnels), plus the rule
// collection built from them. Owns the nodes in m_stacks.
class Lattice
{
friend std::ostream& operator<<(std::ostream&, const Lattice&);
std::vector<Stack> m_stacks; // one per source position (+1 boundary); owns the nodes
RuleCollection m_rules; // rules extracted so far
// NOTE(review): the parameter is used as a start position in Lattice.cpp
// despite being named endPos here.
Stack &GetStack(size_t endPos);
void CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global);
public:
Lattice(size_t sourceSize);
~Lattice();
// Adds a terminal node at startPos plus non-terminal nodes for every tunnel starting there.
void CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global);
// Builds rules seeded from every node anchored at startPos.
void CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global);
const Stack &GetStack(size_t startPos) const;
const RuleCollection &GetRules() const
{ return m_rules; }
// All non-terminal nodes whose source span equals sourceRange (non-owning).
Stack GetNonTermNode(const Range &sourceRange) const;
};

View File

@ -0,0 +1,149 @@
/*
* LatticeNode.cpp
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <sstream>
#include "LatticeNode.h"
#include "SyntaxTree.h"
#include "Tunnel.h"
#include "SentenceAlignment.h"
#include "SymbolSequence.h"
size_t LatticeNode::s_count = 0;
using namespace std;
// for terms
// Terminal node covering the single source word at position pos.
// Initializers are listed in member declaration order (see LatticeNode.h:
// m_isTerminal, m_sourceRange, m_tunnel, then the public pointers) so the
// written order matches the actual initialization order (avoids -Wreorder).
LatticeNode::LatticeNode(size_t pos, const SentenceAlignment *sentence)
  : m_isTerminal(true)
  , m_sourceRange(pos, pos)
  , m_tunnel(NULL)
  , m_sourceTreeNode(NULL)
  , m_targetTreeNode(NULL)
  , m_sentence(sentence)
{
  s_count++;
  //cerr << *this << endl;
}
// for non-terms
// Non-terminal node covering the tunnel's source span, labelled with one
// source and one target syntax node. Initializers follow member
// declaration order (avoids -Wreorder); all values come from parameters,
// so the reordering cannot change any initialized value.
LatticeNode::LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode)
  : m_isTerminal(false)
  , m_sourceRange(tunnel.GetRange(0))
  , m_tunnel(&tunnel)
  , m_sourceTreeNode(sourceTreeNode)
  , m_targetTreeNode(targetTreeNode)
  , m_sentence(NULL)
{
  s_count++;
  //cerr << *this << endl;
}
// True when either side's label is a real syntactic category.
// Only meaningful for non-terminal nodes.
bool LatticeNode::IsSyntax() const
{
  assert(!m_isTerminal);
  return m_sourceTreeNode->IsSyntax() || m_targetTreeNode->IsSyntax();
}
// Every node contributes exactly one symbol, for either direction.
// NOTE(review): the direction parameter is currently unused.
size_t LatticeNode::GetNumSymbols(size_t direction) const
{
return 1;
}
// Three-way comparison (<0, 0, >0) defining a total order over nodes.
// Terminals sort before non-terminals. Terminals compare by source start
// position only. Non-terminals compare by tunnel span and by BOTH labels
// (source first, then target) -- but each side participates only when
// that side's label is syntactic.
int LatticeNode::Compare(const LatticeNode &otherNode) const
{
int ret = 0;
if (m_isTerminal != otherNode.m_isTerminal)
{
ret = m_isTerminal ? -1 : 1;
}
// both term or non-term
else if (m_isTerminal)
{ // term. compare source span
if (m_sourceRange.GetStartPos() == otherNode.m_sourceRange.GetStartPos())
ret = 0;
else
ret = (m_sourceRange.GetStartPos() < otherNode.m_sourceRange.GetStartPos()) ? -1 : +1;
}
else
{ // non-term. compare source span and BOTH label
assert(!m_isTerminal);
assert(!otherNode.m_isTerminal);
if (m_sourceTreeNode->IsSyntax())
{
// source side: tunnel span in direction 0, then source labels
ret = m_tunnel->Compare(*otherNode.m_tunnel, 0);
if (ret == 0 && m_sourceTreeNode->GetLabel() != otherNode.m_sourceTreeNode->GetLabel())
{
ret = (m_sourceTreeNode->GetLabel() < otherNode.m_sourceTreeNode->GetLabel()) ? -1 : +1;
}
}
if (ret == 0 && m_targetTreeNode->IsSyntax())
{
// target side: tunnel span in direction 1, then target labels
ret = m_tunnel->Compare(*otherNode.m_tunnel, 1);
if (ret == 0 && m_targetTreeNode->GetLabel() != otherNode.m_targetTreeNode->GetLabel())
{
ret = (m_targetTreeNode->GetLabel() < otherNode.m_targetTreeNode->GetLabel()) ? -1 : +1;
}
}
}
return ret;
}
// Appends this node's symbol(s) to the sequence. Terminals currently
// contribute nothing -- the word-emitting code below is commented out;
// non-terminals emit one combined source/target label symbol.
// NOTE(review): the direction parameter is unused in the live code path.
void LatticeNode::CreateSymbols(size_t direction, SymbolSequence &symbols) const
{
if (m_isTerminal)
{
/*
const std::vector<std::string> &words = (direction == 0 ? m_sentence->source : m_sentence->target);
size_t startPos = m_tunnel.GetStart(direction)
,endPos = m_tunnel.GetEnd(direction);
for (size_t pos = startPos; pos <= endPos; ++pos)
{
Symbol symbol(words[pos], pos);
symbols.Add(symbol);
}
*/
}
else
{ // output both
Symbol symbol(m_sourceTreeNode->GetLabel(), m_targetTreeNode->GetLabel()
, m_tunnel->GetRange(0).GetStartPos(), m_tunnel->GetRange(0).GetEndPos()
, m_tunnel->GetRange(1).GetStartPos(), m_tunnel->GetRange(1).GetEndPos()
, m_sourceTreeNode->IsSyntax(), m_targetTreeNode->IsSyntax());
symbols.Add(symbol);
}
}
// Debug printing: terminals print as "range=word"; non-terminals as
// "tunnel=sourceLabel targetLabel " (labels concatenated).
std::ostream& operator<<(std::ostream &out, const LatticeNode &obj)
{
if (obj.m_isTerminal)
{
// a terminal always covers exactly one source word
assert(obj.m_sourceRange.GetWidth() == 1);
size_t pos = obj.m_sourceRange.GetStartPos();
const SentenceAlignment &sentence = *obj.m_sentence;
out << obj.m_sourceRange << "=" << sentence.source[pos];
}
else
{
assert(obj.m_tunnel);
out << obj.GetTunnel() << "=" << obj.m_sourceTreeNode->GetLabel() << obj.m_targetTreeNode->GetLabel() << " ";
}
return out;
}

View File

@ -0,0 +1,77 @@
#pragma once
/*
* LatticeNode.h
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <iostream>
#include <cassert>
#include "Range.h"
class Tunnel;
class SyntaxNode;
class SentenceAlignment;
class SymbolSequence;
// One node in the extraction lattice: either a terminal (a single source
// word, with m_sentence set) or a non-terminal (a tunnel plus a source
// and target syntax label, with m_tunnel set). Constructed by
// Lattice::CreateArcs / CreateArcsUsing1Hole.
class LatticeNode
{
friend std::ostream& operator<<(std::ostream&, const LatticeNode&);
bool m_isTerminal; // true = single source word; false = labelled tunnel
// for terms & non-term
Range m_sourceRange;
// non-terms. source range should be same as m_sourceRange
const Tunnel *m_tunnel; // NULL for terminals
public:
static size_t s_count; // total nodes ever constructed (diagnostics)
const SyntaxNode *m_sourceTreeNode, *m_targetTreeNode; // NULL for terminals
const SentenceAlignment *m_sentence; // NULL for non-terminals
// for terms
LatticeNode(size_t pos, const SentenceAlignment *sentence);
// for non-terms
LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode);
bool IsTerminal() const
{ return m_isTerminal; }
// Non-terminals only: true if either side's label is syntactic.
bool IsSyntax() const;
// Always returns 1; the direction argument is unused.
size_t GetNumSymbols(size_t direction) const;
std::string ToString() const;
// Three-way total order: terminals first, then by span and labels.
int Compare(const LatticeNode &otherNode) const;
// Appends this node's symbol(s) for direction (0 = source, 1 = target).
void CreateSymbols(size_t direction, SymbolSequence &symbols) const;
// Non-terminals only (asserts otherwise).
const Tunnel &GetTunnel() const
{
assert(m_tunnel);
return *m_tunnel;
}
const Range &GetSourceRange() const
{
return m_sourceRange;
}
// direction: 0 = source label, 1 = target label (non-terminals only).
const SyntaxNode &GetSyntaxNode(size_t direction) const
{
const SyntaxNode *node = direction == 0 ? m_sourceTreeNode : m_targetTreeNode;
assert(node);
return *node;
}
};

View File

@ -0,0 +1,13 @@
# Builds the extract-mixed-syntax binary.
OBJS = tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o

all: extract

# 'all', 'extract' and 'clean' never produce files of those names, so
# declare them phony. The real file target is extract-mixed-syntax,
# matching the linker's -o output; previously the 'extract' target never
# created a file called 'extract', which forced a relink on every run.
.PHONY: all extract clean

extract: extract-mixed-syntax

clean:
	rm -f *.o extract-mixed-syntax

# -O6 is accepted by g++ (treated as the highest supported level)
.cpp.o:
	g++ -O6 -g -c $<

extract-mixed-syntax: $(OBJS)
	g++ $(OBJS) -lz -lboost_iostreams-mt -o extract-mixed-syntax

View File

@ -0,0 +1,79 @@
// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <boost/iostreams/filter/gzip.hpp>
#include "OutputFileStream.h"
#include "gzfilebuf.h"
using namespace std;
namespace Moses
{
// Construct an unopened stream; Open() must be called before writing.
OutputFileStream::OutputFileStream()
  :boost::iostreams::filtering_ostream()
  ,m_outFile(NULL)
{
}
// Convenience constructor: open filePath immediately.
// NOTE(review): the Open() return value is discarded, so a failed open
// is silent here -- callers should verify the stream state themselves.
OutputFileStream::OutputFileStream(const std::string &filePath)
  : m_outFile(NULL)
{
  Open(filePath);
}
// Flushes and releases the underlying file via Close().
OutputFileStream::~OutputFileStream()
{
  Close();
}
bool OutputFileStream::Open(const std::string &filePath)
{
m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
if (m_outFile->fail()) {
return false;
}
if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
this->push(boost::iostreams::gzip_compressor());
}
this->push(*m_outFile);
return true;
}
// Flush, dismantle the filter chain, then close and free the underlying
// file. Safe to call when the stream was never opened (m_outFile NULL).
void OutputFileStream::Close()
{
  if (m_outFile == NULL) {
    return;
  }
  this->flush();
  // NOTE(review): only one pop() here, but Open() pushes two elements
  // (compressor + file sink) for .gz paths -- confirm gzip output is
  // fully flushed/terminated when closing a compressed stream.
  this->pop(); // file
  m_outFile->close();
  delete m_outFile;
  m_outFile = NULL;
  return;
}
}

View File

@ -0,0 +1,50 @@
// $Id: OutputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <cstdlib>
#include <fstream>
#include <string>
#include <iostream>
#include <boost/iostreams/filtering_stream.hpp>
namespace Moses
{
/** Used in place of std::ostream, writes gzip-compressed output if the
 * file name ends in .gz
 */
class OutputFileStream : public boost::iostreams::filtering_ostream
{
protected:
  // Owned file sink; NULL while the stream is not open.
  std::ofstream *m_outFile;
public:
  // Construct unopened; call Open() before use.
  OutputFileStream();
  // Opens immediately; the Open() result is not reported to the caller.
  OutputFileStream(const std::string &filePath);
  virtual ~OutputFileStream();
  // Returns false if the file cannot be created.
  bool Open(const std::string &filePath);
  // Flushes and releases the file; safe when never opened.
  void Close();
};
}

View File

@ -0,0 +1,74 @@
/*
* Range.cpp
* extract
*
* Created by Hieu Hoang on 22/02/2011.
* Copyright 2011 __MyCompanyName__. All rights reserved.
*
*/
#include "Range.h"
using namespace std;
/**
 * Make this range the union of a and b. A NOT_FOUND endpoint means
 * "unset" and is simply overridden by the other range's endpoint.
 */
void Range::Merge(const Range &a, const Range &b)
{
  // start position: smallest of the values that are actually set
  if (a.m_startPos == NOT_FOUND)
    m_startPos = b.m_startPos;
  else
    m_startPos = (b.m_startPos == NOT_FOUND) ? a.m_startPos : min(a.m_startPos, b.m_startPos);

  // end position: largest of the values that are actually set
  if (a.m_endPos == NOT_FOUND)
    m_endPos = b.m_endPos;
  else
    m_endPos = (b.m_endPos == NOT_FOUND) ? a.m_endPos : max(a.m_endPos, b.m_endPos);
}
/**
 * Lexicographic three-way comparison: start position first, then end.
 * Returns -1, 0 or +1.
 */
int Range::Compare(const Range &other) const
{
  if (m_startPos != other.m_startPos)
    return (m_startPos < other.m_startPos) ? -1 : +1;
  if (m_endPos != other.m_endPos)
    return (m_endPos < other.m_endPos) ? -1 : +1;
  return 0;
}
/**
 * Two closed intervals overlap unless one ends before the other starts.
 */
bool Range::Overlap(const Range &other) const
{
  const bool disjoint = (other.m_endPos < m_startPos) || (other.m_startPos > m_endPos);
  return !disjoint;
}
// Print as "[start-end]".
std::ostream& operator<<(std::ostream &out, const Range &range)
{
  return out << "[" << range.m_startPos << "-" << range.m_endPos << "]";
}

View File

@ -0,0 +1,57 @@
/*
* Range.h
* extract
*
* Created by Hieu Hoang on 22/02/2011.
* Copyright 2011 __MyCompanyName__. All rights reserved.
*
*/
#pragma once
#include <string>
#include <iostream>
#include <limits>
#define NOT_FOUND std::numeric_limits<size_t>::max()
/** Closed interval [m_startPos, m_endPos] over word positions.
 * NOT_FOUND in either field marks that endpoint as unset.
 */
class Range
{
  friend std::ostream& operator<<(std::ostream&, const Range&);
  size_t m_startPos, m_endPos;
public:
  // Default: both endpoints unset.
  Range()
    :m_startPos(NOT_FOUND)
    ,m_endPos(NOT_FOUND)
  {}
  Range(const Range &copy)
    :m_startPos(copy.m_startPos)
    ,m_endPos(copy.m_endPos)
  {}
  Range(size_t startPos, size_t endPos)
    :m_startPos(startPos)
    ,m_endPos(endPos)
  {}
  size_t GetStartPos() const
  { return m_startPos; }
  size_t GetEndPos() const
  { return m_endPos; }
  // Number of positions covered, inclusive. NOTE(review): underflows if
  // either endpoint is still NOT_FOUND -- callers must set both first.
  size_t GetWidth() const
  { return m_endPos - m_startPos + 1; }
  void SetStartPos(size_t startPos)
  { m_startPos = startPos; }
  void SetEndPos(size_t endPos)
  { m_endPos = endPos; }
  // Union of a and b, ignoring unset endpoints.
  void Merge(const Range &a, const Range &b);
  // Three-way lexicographic comparison on (start, end).
  int Compare(const Range &other) const;
  // True if the two closed intervals share at least one position.
  bool Overlap(const Range &other) const;
};

View File

@ -0,0 +1,594 @@
/*
* Rule.cpp
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <algorithm>
#include <sstream>
#include "Rule.h"
#include "Global.h"
#include "LatticeNode.h"
#include "Lattice.h"
#include "SentenceAlignment.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "RuleCollection.h"
using namespace std;
// Copy an element: shares the (non-owned) lattice-node pointer and
// duplicates the stored source/target alignment positions.
RuleElement::RuleElement(const RuleElement &copy)
  :m_latticeNode(copy.m_latticeNode)
  ,m_alignmentPos(copy.m_alignmentPos)
{
}
// Seed a rule with a single lattice node; no LHS yet.
Rule::Rule(const LatticeNode *latticeNode)
  :m_lhs(NULL)
{
  RuleElement element(*latticeNode);
  m_coll.push_back(element);
}
// Extend prevRule by appending one more lattice node; still no LHS.
Rule::Rule(const Rule &prevRule, const LatticeNode *latticeNode)
  :m_coll(prevRule.m_coll)
  ,m_lhs(NULL)
{
  RuleElement element(*latticeNode);
  m_coll.push_back(element);
}
// Finalize a copy of 'copy' with the given LHS node: builds the symbol
// sequences and reports via isValid whether the LHS target span is
// consistent with the rule (see CreateSymbols).
Rule::Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence)
  :m_coll(copy.m_coll)
  ,m_source(copy.m_source)
  ,m_target(copy.m_target)
  ,m_lhs(lhs)
{
  CreateSymbols(global, isValid, sentence);
}
// Nothing to free: lattice nodes are owned by the lattice, not the rule.
Rule::~Rule()
{
}
// Sort helper: orders rule elements by the end position of their
// target-side (direction 1) tunnel span.
struct CompareLatticeNodeTarget
{
  bool operator() (const RuleElement *a, const RuleElement *b)
  {
    const Range &rangeA = a->GetLatticeNode().GetTunnel().GetRange(1)
    ,&rangeB = b->GetLatticeNode().GetTunnel().GetRange(1);
    return rangeA.GetEndPos() < rangeB.GetEndPos();
  }
};
/** Build m_source and m_target symbol sequences from the rule elements.
 * Source side: walk m_coll in order, emitting a word Symbol per terminal
 * and a joint source/target non-terminal Symbol otherwise.
 * Target side: walk the LHS target span left to right; at the start of a
 * non-terminal's target span emit its Symbol and jump to the span's end,
 * otherwise emit the target word.
 * Sets isValid to false when a non-terminal's target span escapes the
 * LHS target span or the target side exceeds global.maxSymbols.
 */
void Rule::CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence)
{
  vector<RuleElement*> nonTerms;
  // source
  for (size_t ind = 0; ind < m_coll.size(); ++ind)
  {
    RuleElement &element = m_coll[ind];
    const LatticeNode &node = element.GetLatticeNode();
    if (node.IsTerminal())
    {
      size_t sourcePos = node.GetSourceRange().GetStartPos();
      const string &word = sentence.source[sourcePos];
      Symbol symbol(word, sourcePos);
      m_source.Add(symbol);
    }
    else
    { // non-term
      const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
      const string &targetWord = node.GetSyntaxNode(1).GetLabel();
      Symbol symbol(sourceWord, targetWord
                    , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
                    , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
                    , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
      m_source.Add(symbol);
      // store current pos within phrase
      element.m_alignmentPos.first = ind;
      // for target symbols
      nonTerms.push_back(&element);
    }
  }
  // target
  isValid = true;
  const Range &lhsTargetRange = m_lhs->GetTunnel().GetRange(1);
  // check spans of target non-terms
  if (nonTerms.size())
  {
    // sort non-term rules elements by target range
    std::sort(nonTerms.begin(), nonTerms.end(), CompareLatticeNodeTarget());
    const Range &first = nonTerms.front()->GetLatticeNode().GetTunnel().GetRange(1);
    const Range &last = nonTerms.back()->GetLatticeNode().GetTunnel().GetRange(1);
    if (first.GetStartPos() < lhsTargetRange.GetStartPos()
        || last.GetEndPos() > lhsTargetRange.GetEndPos())
    {
      isValid = false;
    }
  }
  if (isValid)
  {
    size_t indNonTerm = 0;
    RuleElement *currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
    for (size_t targetPos = lhsTargetRange.GetStartPos(); targetPos <= lhsTargetRange.GetEndPos(); ++targetPos)
    {
      if (currNonTermElement && targetPos == currNonTermElement->GetLatticeNode().GetTunnel().GetRange(1).GetStartPos())
      { // start of a non-term. print out non-terms & skip to the end
        const LatticeNode &node = currNonTermElement->GetLatticeNode();
        const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
        const string &targetWord = node.GetSyntaxNode(1).GetLabel();
        Symbol symbol(sourceWord, targetWord
                      , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
                      , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
                      , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
        m_target.Add(symbol);
        // store current pos within phrase
        currNonTermElement->m_alignmentPos.second = m_target.GetSize() - 1;
        assert(currNonTermElement->m_alignmentPos.first != NOT_FOUND);
        targetPos = node.GetTunnel().GetRange(1).GetEndPos();
        indNonTerm++;
        currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
      }
      else
      { // term
        const string &word = sentence.target[targetPos];
        Symbol symbol(word, targetPos);
        m_target.Add(symbol);
      }
    }
    assert(indNonTerm == nonTerms.size());
    if (m_target.GetSize() > global.maxSymbols) {
      isValid = false;
      //cerr << "m_source=" << m_source.GetSize() << ":" << m_source << endl;
      //cerr << "m_target=" << m_target.GetSize() << ":" << m_target << endl;
    }
  }
}
// True when the rule contains more default (non-syntax) non-terminals
// than terminals.
bool Rule::MoreDefaultNonTermThanTerm() const
{
  size_t terminals = 0, defaultNonTerms = 0;
  for (size_t ind = 0; ind < m_coll.size(); ++ind)
  {
    const LatticeNode &node = m_coll[ind].GetLatticeNode();
    if (node.IsTerminal())
      ++terminals;
    else if (!node.IsSyntax())
      ++defaultNonTerms;
  }
  return defaultNonTerms > terminals;
}
bool Rule::SourceHasEdgeDefaultNonTerm() const
{
assert(m_coll.size());
const LatticeNode &first = m_coll.front().GetLatticeNode();
const LatticeNode &last = m_coll.back().GetLatticeNode();
// 1st
if (!first.IsTerminal() && !first.IsSyntax())
{
return true;
}
if (!last.IsTerminal() && !last.IsSyntax())
{
return true;
}
return false;
}
/** Gate-keeper for adding a completed rule to the collection. Rejects:
 * a lone non-terminal; more default non-terminals than terminals; a
 * default non-terminal on a source edge (unless allowed); too many
 * symbols; adjacent default non-terminals; a source span that is not
 * itself a tunnel; overlapping target non-terminal spans.
 */
bool Rule::IsValid(const Global &global, const TunnelCollection &tunnelColl) const
{
  if (m_coll.size() == 1 && !m_coll[0].GetLatticeNode().IsTerminal()) // can't be only 1 terminal
  {
    return false;
  }
  if (MoreDefaultNonTermThanTerm())
  { // must have at least as many terms as non-syntax non-terms
    return false;
  }
  if (!global.allowDefaultNonTermEdge && SourceHasEdgeDefaultNonTerm())
  {
    return false;
  }
  if (GetNumSymbols() > global.maxSymbols)
  {
    return false;
  }
  if (AdjacentDefaultNonTerms())
  {
    return false;
  }
  if (!IsHole(tunnelColl))
  {
    return false;
  }
  if (NonTermOverlap())
  {
    return false;
  }
  /*
  std::pair<size_t, size_t> spanS = GetSpan(0)
  ,spanT= GetSpan(1);
  if (tunnelColl.NumUnalignedWord(0, spanS.first, spanS.second) >= global.maxUnaligned)
  return false;
  if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
  return false;
  */
  return true;
}
bool Rule::NonTermOverlap() const
{
vector<Range> ranges;
CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const RuleElement &element = *iter;
if (!element.GetLatticeNode().IsTerminal())
{
const Range &range = element.GetLatticeNode().GetTunnel().GetRange(1);
ranges.push_back(range);
}
}
vector<Range>::const_iterator outerIter;
for (outerIter = ranges.begin(); outerIter != ranges.end(); ++outerIter)
{
const Range &outer = *outerIter;
vector<Range>::const_iterator innerIter;
for (innerIter = outerIter + 1; innerIter != ranges.end(); ++innerIter)
{
const Range &inner = *innerIter;
if (outer.Overlap(inner))
return true;
}
}
return false;
}
// Source span covered by this rule: from the first element's start
// position to the last element's end position.
Range Rule::GetSourceRange() const
{
  assert(m_coll.size());
  const size_t start = m_coll.front().GetLatticeNode().GetSourceRange().GetStartPos();
  const size_t end   = m_coll.back().GetLatticeNode().GetSourceRange().GetEndPos();
  return Range(start, end);
}
bool Rule::IsHole(const TunnelCollection &tunnelColl) const
{
const Range &spanS = GetSourceRange();
const TunnelList &tunnels = tunnelColl.GetTunnels(spanS.GetStartPos(), spanS.GetEndPos());
bool ret = tunnels.size() > 0;
return ret;
}
/** Whether this partial rule may be extended with further nodes.
 * Enforces the symbol budget, adjacency and overlap constraints, the
 * non-terminal limits, and the unaligned-word budget over the current
 * source span.
 */
bool Rule::CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const
{
  if (GetNumSymbols() >= global.maxSymbols)
    return false;
  if (AdjacentDefaultNonTerms())
    return false;
  if (MaxNonTerm(global))
    return false;
  if (NonTermOverlap())
  {
    return false;
  }
  const Range spanS = GetSourceRange();
  if (tunnelColl.NumUnalignedWord(0, spanS.GetStartPos(), spanS.GetEndPos()) >= global.maxUnaligned)
    return false;
  //	if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
  //		return false;
  return true;
}
// True once the rule has reached either the overall non-terminal limit
// or the default (non-syntax) non-terminal limit.
bool Rule::MaxNonTerm(const Global &global) const
{
  size_t nonTermCount = 0, defaultNonTermCount = 0;
  CollType::const_iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
  {
    const LatticeNode &node = iter->GetLatticeNode();
    if (node.IsTerminal())
      continue;
    ++nonTermCount;
    if (!node.IsSyntax())
      ++defaultNonTermCount;
    if (nonTermCount >= global.maxNonTerm || defaultNonTermCount >= global.maxNonTermDefault)
      return true;
  }
  return false;
}
bool Rule::AdjacentDefaultNonTerms() const
{
assert(m_coll.size() > 0);
const LatticeNode *prevNode = &m_coll.front().GetLatticeNode();
CollType::const_iterator iter;
for (iter = m_coll.begin() + 1; iter != m_coll.end(); ++iter)
{
const LatticeNode *node = &(*iter).GetLatticeNode();
if (!prevNode->IsTerminal() && !node->IsTerminal() && !prevNode->IsSyntax() && !node->IsSyntax() )
{
return true;
}
prevNode = node;
}
return false;
}
// Number of right-hand-side symbols (terminals plus non-terminals).
size_t Rule::GetNumSymbols() const
{
  return m_coll.size();
}
/** Recursively extend this rule with every lattice node that starts just
 * past the current source span. Extensions that can still grow recurse
 * further; valid extensions are handed to 'rules' (which takes ownership
 * via Add); all other extensions are deleted here.
 */
void Rule::CreateRules(RuleCollection &rules
                       , const Lattice &lattice
                       , const SentenceAlignment &sentence
                       , const Global &global)
{
  assert(m_coll.size() > 0);
  const LatticeNode *latticeNode = &m_coll.back().GetLatticeNode();
  size_t endPos = latticeNode->GetSourceRange().GetEndPos() + 1;
  const Stack &stack = lattice.GetStack(endPos);
  Stack::const_iterator iter;
  for (iter = stack.begin(); iter != stack.end(); ++iter)
  {
    const LatticeNode *newLatticeNode = *iter;
    Rule *newRule = new Rule(*this, newLatticeNode);
    //cerr << *newRule << endl;
    if (newRule->CanRecurse(global, sentence.GetTunnelCollection()))
    { // may or maynot be valid, but can continue to build on this rule
      newRule->CreateRules(rules, lattice, sentence, global);
    }
    if (newRule->IsValid(global, sentence.GetTunnelCollection()))
    { // add to rule collection
      rules.Add(global, newRule, sentence);
    }
    else
    {
      delete newRule;
    }
  }
}
// Strict weak ordering over rules, delegated to the three-way Compare().
bool Rule::operator<(const Rule &compare) const
{
  return Compare(compare) < 0;
}
/** Three-way ordering: source symbols, then target symbols, then the
 * LHS source label, then the LHS target label.
 * Precondition: both rules are finalized (non-empty symbol sequences).
 */
int Rule::Compare(const Rule &compare) const
{
  //cerr << *this << endl << compare << endl;
  assert(m_coll.size() > 0);
  assert(m_source.GetSize() > 0);
  assert(m_target.GetSize() > 0);
  int ret = 0;
  // compare each fragment
  ret = m_source.Compare(compare.m_source);
  if (ret != 0)
  {
    return ret;
  }
  ret = m_target.Compare(compare.m_target);
  if (ret != 0)
  {
    return ret;
  }
  // compare lhs
  const string &thisSourceLabel = m_lhs->GetSyntaxNode(0).GetLabel();
  const string &otherSourceLabel = compare.m_lhs->GetSyntaxNode(0).GetLabel();
  if (thisSourceLabel != otherSourceLabel)
  {
    ret = (thisSourceLabel < otherSourceLabel) ? -1 : +1;
    return ret;
  }
  const string &thisTargetLabel = m_lhs->GetSyntaxNode(1).GetLabel();
  const string &otherTargetLabel = compare.m_lhs->GetSyntaxNode(1).GetLabel();
  if (thisTargetLabel != otherTargetLabel)
  {
    ret = (thisTargetLabel < otherTargetLabel) ? -1 : +1;
    return ret;
  }
  assert(ret == 0);
  return ret;
}
// Bounds-checked access to the ind'th rule element's lattice node.
const LatticeNode &Rule::GetLatticeNode(size_t ind) const
{
  assert(ind < m_coll.size());
  return m_coll[ind].GetLatticeNode();
}
// Print this rule to stderr (debugging aid).
void Rule::DebugOutput() const
{
  Output(cerr);
}
/** Print in extract format: "source ||| target ||| alignment ||| 1".
 * LHS labels are appended to each side; alignment pairs are emitted
 * only for non-terminal elements, as sourceIndex-targetIndex.
 */
void Rule::Output(std::ostream &out) const
{
  stringstream strmeS, strmeT;
  std::vector<Symbol>::const_iterator iterSymbol;
  for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeS << symbol << " ";
  }
  for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeT << symbol << " ";
  }
  // lhs
  if (m_lhs)
  {
    strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
    strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
  }
  out << strmeS.str() << " ||| " << strmeT.str() << " ||| ";
  // alignment
  Rule::CollType::const_iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
  {
    const RuleElement &element = *iter;
    const LatticeNode &node = element.GetLatticeNode();
    bool isTerminal = node.IsTerminal();
    if (!isTerminal)
    {
      out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " ";
    }
  }
  out << "||| 1";
}
/** Same as Output() but with the sides swapped:
 * "target ||| source ||| alignment ||| 1", alignment pairs emitted as
 * targetIndex-sourceIndex.
 */
void Rule::OutputInv(std::ostream &out) const
{
  stringstream strmeS, strmeT;
  std::vector<Symbol>::const_iterator iterSymbol;
  for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeS << symbol << " ";
  }
  for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeT << symbol << " ";
  }
  // lhs
  if (m_lhs)
  {
    strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
    strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
  }
  out << strmeT.str() << " ||| " << strmeS.str() << " ||| ";
  // alignment
  Rule::CollType::const_iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
  {
    const RuleElement &element = *iter;
    const LatticeNode &node = element.GetLatticeNode();
    bool isTerminal = node.IsTerminal();
    if (!isTerminal)
    {
      out << element.m_alignmentPos.second << "-" << element.m_alignmentPos.first << " ";
    }
  }
  out << "||| 1";
}

View File

@ -0,0 +1,96 @@
#pragma once
/*
* Rule.h
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <iostream>
#include "LatticeNode.h"
#include "SymbolSequence.h"
#include "Global.h"
class Lattice;
class SentenceAlignment;
class Global;
class RuleCollection;
class SyntaxNode;
class TunnelCollection;
class Range;
/** One symbol slot of a rule: a non-owning pointer to a lattice node
 * plus the symbol's position within the source/target sides of the
 * rule, used to emit the alignment field.
 */
class RuleElement
{
protected:
  const LatticeNode *m_latticeNode;
public:
  // (source index, target index) of this element within the rule;
  // both NOT_FOUND until filled in for non-terminals by CreateSymbols()
  std::pair<size_t, size_t> m_alignmentPos;
  RuleElement(const RuleElement &copy);
  RuleElement(const LatticeNode &latticeNode)
    :m_latticeNode(&latticeNode)
    ,m_alignmentPos(NOT_FOUND, NOT_FOUND)
  {}
  const LatticeNode &GetLatticeNode() const
  { return *m_latticeNode; }
};
/** A (partial or complete) hierarchical rule: an ordered list of RHS
 * elements plus an optional LHS lattice node, together with the
 * flattened source/target symbol sequences used for output and
 * duplicate detection.
 */
class Rule
{
protected:
  typedef std::vector<RuleElement> CollType;
  // RHS elements, in source order
  CollType m_coll;
  // LHS node; NULL until the rule is finalized with a left-hand side
  const LatticeNode *m_lhs;
  // flattened symbol sequences, built by CreateSymbols()
  SymbolSequence m_source, m_target;
  bool IsHole(const TunnelCollection &tunnelColl) const;
  bool NonTermOverlap() const;
  const LatticeNode &GetLatticeNode(size_t ind) const;
  void CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence);
public:
  // init
  Rule(const LatticeNode *latticeNode);
  // create new rule by appending node to prev rule
  Rule(const Rule &prevRule, const LatticeNode *latticeNode);
  // create copy with lhs
  Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence);
  // can continue to add to this rule
  bool CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const;
  virtual ~Rule();
  // can add this to the set of rules
  bool IsValid(const Global &global, const TunnelCollection &tunnelColl) const;
  size_t GetNumSymbols() const;
  bool AdjacentDefaultNonTerms() const;
  bool MaxNonTerm(const Global &global) const;
  bool MoreDefaultNonTermThanTerm() const;
  bool SourceHasEdgeDefaultNonTerm() const;
  void CreateRules(RuleCollection &rules
                   , const Lattice &lattice
                   , const SentenceAlignment &sentence
                   , const Global &global);
  int Compare(const Rule &compare) const;
  bool operator<(const Rule &compare) const;
  Range GetSourceRange() const;
  // NOTE(review): presumably a macro (declared elsewhere, e.g. Global.h)
  // expanding to the DebugOutput() declaration -- confirm its expansion
  DEBUG_OUTPUT();
  void Output(std::ostream &out) const;
  void OutputInv(std::ostream &out) const;
};

View File

@ -0,0 +1,102 @@
/*
* RuleCollection.cpp
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "RuleCollection.h"
#include "Rule.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "Lattice.h"
#include "SyntaxTree.h"
using namespace std;
// The set owns its Rule pointers; free them all.
RuleCollection::~RuleCollection()
{
  RemoveAllInColl(m_coll);
}
/** Take ownership of 'rule', instantiate it with every candidate LHS
 * non-terminal node covering its source span, and insert the valid,
 * previously unseen instantiations into the set (which then owns them).
 * Duplicates and inconsistent LHS choices are deleted; the 'rule'
 * template itself is always deleted before returning.
 */
void RuleCollection::Add(const Global &global, Rule *rule, const SentenceAlignment &sentence)
{
  Range spanS	= rule->GetSourceRange();
  // cartesian product of lhs
  Stack nontermNodes = sentence.GetLattice().GetNonTermNode(spanS);
  Stack::const_iterator iterStack;
  for (iterStack = nontermNodes.begin(); iterStack != nontermNodes.end(); ++iterStack)
  {
    const LatticeNode &node = **iterStack;
    assert(!node.IsTerminal());
    bool isValid;
    // create rules with LHS
    //cerr << "old:" << *rule << endl;
    Rule *newRule = new Rule(global, isValid, *rule, &node, sentence);
    if (!isValid)
    { // lhs doesn't match non-term spans
      delete newRule;
      continue;
    }
    /*
    stringstream s;
    s << *newRule;
    if (s.str().find("Wiederaufnahme der [X] ||| resumption of the [X] ||| ||| 1") == 0)
    {
    cerr << "READY:" << *newRule << endl;
    g_debug = true;
    }
    else {
    g_debug = false;
    }
    */
    typedef set<const Rule*, CompareRule>::iterator Iterator;
    pair<Iterator,bool> ret = m_coll.insert(newRule);
    if (ret.second)
    {
      //cerr << "ACCEPTED:" << *newRule << endl;
      //cerr << "";
    }
    else
    {
      //cerr << "REJECTED:" << *newRule << endl;
      delete newRule;
    }
  }
  delete rule;
}
void RuleCollection::Output(std::ostream &out) const
{
RuleCollection::CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const Rule &rule = **iter;
rule.Output(out);
out << endl;
}
}
void RuleCollection::OutputInv(std::ostream &out) const
{
RuleCollection::CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const Rule &rule = **iter;
rule.OutputInv(out);
out << endl;
}
}

View File

@ -0,0 +1,55 @@
#pragma once
/*
* RuleCollection.h
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <iostream>
#include "Rule.h"
class SentenceAlignment;
// Ordering functor for the rule set; delegates to Rule::operator<.
// (Per the original note, default non-terminal labels are not compared.)
struct CompareRule
{
  // Fix: const-qualified so the functor remains callable through a
  // const comparator object, as standard-library containers are
  // entitled to require.
  bool operator() (const Rule *a, const Rule *b) const
  {
    return (*a) < (*b);
  }
};
/** Owning set of unique rules extracted from one sentence,
 * deduplicated via CompareRule.
 */
class RuleCollection
{
protected:
  typedef std::set<const Rule*, CompareRule> CollType;
  // owns the Rule pointers; freed in the destructor
  CollType m_coll;
public:
  ~RuleCollection();
  // Takes ownership of 'rule' (always deleted before returning).
  void Add(const Global &global, Rule *rule, const SentenceAlignment &sentence);
  size_t GetSize() const
  { return m_coll.size(); }
  // One rule per line, source ||| target orientation.
  void Output(std::ostream &out) const;
  // One rule per line, target ||| source orientation.
  void OutputInv(std::ostream &out) const;
};

View File

@ -0,0 +1,331 @@
/*
* SentenceAlignment.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <map>
#include <sstream>
#include "SentenceAlignment.h"
#include "XmlTree.h"
#include "tables-core.h"
#include "TunnelCollection.h"
#include "Lattice.h"
#include "LatticeNode.h"
using namespace std;
extern std::set< std::string > targetLabelCollection, sourceLabelCollection;
extern std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;
// Tunnels and lattice are built later by FindTunnels()/CreateLattice().
SentenceAlignment::SentenceAlignment()
  :m_tunnelCollection(NULL)
  ,m_lattice(NULL)
{}
// Owns and frees the derived structures (safe on NULL).
SentenceAlignment::~SentenceAlignment()
{
  delete m_tunnelCollection;
  delete m_lattice;
}
/** Tokenize a sentence pair plus its word alignment, optionally parsing
 * inline XML syntax trees on either side, then add default non-terminals
 * to both trees.
 * Returns 1 on success, 0 on any problem (empty sentence, malformed or
 * out-of-bounds alignment point), matching the int-as-bool convention
 * of the callers.
 */
int SentenceAlignment::Create( const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global )
{
  // tokenizing English (and potentially extract syntax spans)
  if (global.targetSyntax) {
    string targetStringCPP = string(targetString);
    ProcessAndStripXMLTags( targetStringCPP, targetTree, targetLabelCollection , targetTopLabelCollection );
    target = tokenize( targetStringCPP.c_str() );
  }
  else {
    target = tokenize( targetString.c_str() );
  }

  // tokenizing source (and potentially extract syntax spans)
  if (global.sourceSyntax) {
    string sourceStringCPP = string(sourceString);
    ProcessAndStripXMLTags( sourceStringCPP, sourceTree, sourceLabelCollection , sourceTopLabelCollection );
    source = tokenize( sourceStringCPP.c_str() );
  }
  else {
    source = tokenize( sourceString.c_str() );
  }

  // check if sentences are empty
  if (target.size() == 0 || source.size() == 0) {
    // fixed: the original message had "<< end in" accidentally fused
    // into the string literal
    cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words in sentence " << sentenceID << endl;
    cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
    return 0;
  }

  // prepare data structures for alignments (size_t avoids
  // signed/unsigned comparison warnings against .size())
  for (size_t i = 0; i < source.size(); i++) {
    alignedCountS.push_back( 0 );
  }
  for (size_t i = 0; i < target.size(); i++) {
    vector< int > dummy;
    alignedToT.push_back( dummy );
  }

  // reading in "s-t" alignment points
  vector<string> alignmentSequence = tokenize( alignmentString.c_str() );
  for (size_t i = 0; i < alignmentSequence.size(); i++) {
    int s,t;
    // fixed: sscanf returns the number of items converted; the old
    // "! sscanf(...)" test only caught 0 conversions, so a half-parsed
    // point like "5" passed with t left uninitialized
    if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2) {
      cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
      return 0;
    }
    // explicit negative check (previously only caught indirectly via
    // the signed->unsigned conversion in the size comparison)
    if (s < 0 || t < 0 || t >= (int)target.size() || s >= (int)source.size()) {
      cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
      return 0;
    }
    alignedToT[t].push_back( s );
    alignedCountS[s]++;
  }

  // add default (non-syntactic) non-terminals over every span
  bool mixed = global.mixed;
  sourceTree.AddDefaultNonTerms(global.sourceSyntax, mixed, source.size());
  targetTree.AddDefaultNonTerms(global.targetSyntax, mixed, target.size());

  return 1;
}
/*
void SentenceAlignment::InitTightest(Outer &tightest, size_t len)
{
tightest.resize(len);
for (size_t posOuter = 0; posOuter < len; ++posOuter)
{
Inner &inner = tightest[posOuter];
size_t innerSize = len - posOuter;
inner.resize(innerSize);
}
}
void SentenceAlignment::CalcTightestSpan(Outer &tightest)
{
size_t len = tightest.size();
for (size_t startPos = 0; startPos < len; ++startPos)
{
for (size_t endPos = startPos + 1; endPos < len; ++endPos)
{
const Range &prevRange = GetTightest(tightest, startPos, endPos - 1);
const Range &smallRange = GetTightest(tightest, endPos, endPos);
Range &newRange = GetTightest(tightest, startPos, endPos);
newRange.Merge(prevRange, smallRange);
//cerr << "[" << startPos << "-" << endPos << "] --> [" << newRange.GetStartPos() << "-" << newRange.GetEndPos() << "]";
}
}
}
Range &SentenceAlignment::GetTightest(Outer &tightest, size_t startPos, size_t endPos)
{
assert(endPos < tightest.size());
assert(endPos >= startPos);
Inner &inner = tightest[startPos];
size_t ind = endPos - startPos;
Range &ret = inner[ind];
return ret;
}
void SentenceAlignment::SetAlignment(size_t source, size_t target)
{
SetAlignment(m_s2tTightest, source, target);
SetAlignment(m_t2sTightest, target, source);
}
void SentenceAlignment::SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos)
{
Range &range = GetTightest(tightest, thisPos, thisPos);
if (range.GetStartPos() == NOT_FOUND)
{ // not yet set, do them both
assert(range.GetEndPos() == NOT_FOUND);
range.SetStartPos(thatPos);
range.SetEndPos(thatPos);
}
else
{
assert(range.GetEndPos() != NOT_FOUND);
range.SetStartPos( (range.GetStartPos() > thatPos) ? thatPos : range.GetStartPos() );
range.SetEndPos( (range.GetEndPos() < thatPos) ? thatPos : range.GetEndPos() );
}
}
*/
/** Enumerate all "tunnels" (consistent aligned phrase pairs) up to the
 * configured span limits and store them in a fresh TunnelCollection.
 * Classic phrase-extraction consistency: a target span's aligned source
 * words must not be aligned outside the span, and unaligned edge words
 * may extend the source span in both directions.
 */
void SentenceAlignment::FindTunnels(const Global &global )
{
  int countT = target.size();
  int countS = source.size();
  int maxSpan = max(global.maxHoleSpanSourceDefault, global.maxHoleSpanSourceSyntax);
  m_tunnelCollection = new TunnelCollection(countS);
  m_tunnelCollection->alignedCountS = alignedCountS;
  m_tunnelCollection->alignedCountT.resize(alignedToT.size());
  for (size_t ind = 0; ind < alignedToT.size(); ind++)
  {
    m_tunnelCollection->alignedCountT[ind] = alignedToT[ind].size();
  }
  // phrase repository for creating hiero phrases
  // check alignments for target phrase startT...endT
  for(int lengthT=1;
      lengthT <= maxSpan && lengthT <= countT;
      lengthT++) {
    for(int startT=0; startT < countT-(lengthT-1); startT++) {
      // that's nice to have
      int endT = startT + lengthT - 1;
      // if there is target side syntax, there has to be a node
      if (global.targetSyntax && !targetTree.HasNode(startT,endT))
        continue;
      // find find aligned source words
      // first: find minimum and maximum source word
      int minS = 9999;
      int maxS = -1;
      vector< int > usedS = alignedCountS;
      for(int ti=startT;ti<=endT;ti++) {
        for(int i=0;i<alignedToT[ti].size();i++) {
          int si = alignedToT[ti][i];
          // cerr << "point (" << si << ", " << ti << ")\n";
          if (si<minS) { minS = si; }
          if (si>maxS) { maxS = si; }
          usedS[ si ]--;
        }
      }
      // unaligned phrases are not allowed
      if( maxS == -1 )
        continue;
      // source phrase has to be within limits
      if( maxS-minS >= maxSpan )
      {
        continue;
      }
      // check if source words are aligned to out of bound target words
      bool out_of_bounds = false;
      for(int si=minS;si<=maxS && !out_of_bounds;si++)
      {
        if (usedS[si]>0) {
          out_of_bounds = true;
        }
      }
      // if out of bound, you gotta go
      if (out_of_bounds)
        continue;
      if (m_tunnelCollection->NumUnalignedWord(1, startT, endT) >= global.maxUnaligned)
        continue;
      // done with all the checks, lets go over all consistent phrase pairs
      // start point of source phrase may retreat over unaligned
      for(int startS=minS;
          (startS>=0 &&
           startS>maxS - maxSpan && // within length limit
           (startS==minS || alignedCountS[startS]==0)); // unaligned
          startS--)
      {
        // end point of source phrase may advance over unaligned
        for(int endS=maxS;
            (endS<countS && endS<startS + maxSpan && // within length limit
             (endS==maxS || alignedCountS[endS]==0)); // unaligned
            endS++)
        {
          if (m_tunnelCollection->NumUnalignedWord(0, startS, endS) >= global.maxUnaligned)
            continue;
          // take note that this is a valid phrase alignment
          m_tunnelCollection->Add(startS, endS, startT, endT);
        }
      }
    }
  }
  //cerr << *tunnelCollection << endl;
}
// Build the extraction lattice: one stack of arcs per source position.
void SentenceAlignment::CreateLattice(const Global &global)
{
  const size_t numSourceWords = source.size();
  m_lattice = new Lattice(numSourceWords);
  for (size_t pos = 0; pos < numSourceWords; ++pos)
  {
    m_lattice->CreateArcs(pos, *m_tunnelCollection, *this, global);
  }
}
// Extract rules anchored at every source position of the lattice.
void SentenceAlignment::CreateRules(const Global &global)
{
  const size_t numSourceWords = source.size();
  for (size_t pos = 0; pos < numSourceWords; ++pos)
  {
    m_lattice->CreateRules(pos, *this, global);
  }
}
// Stream each token followed by a single space (a trailing space is emitted).
void OutputSentenceStr(std::ostream &out, const std::vector<std::string> &vec)
{
  std::vector<std::string>::const_iterator iter;
  for (iter = vec.begin(); iter != vec.end(); ++iter)
  {
    out << *iter << " ";
  }
}
// Debug dump: "target ==> source", then all tunnels, then the lattice
// (only if it has already been created).
std::ostream& operator<<(std::ostream &out, const SentenceAlignment &obj)
{
  OutputSentenceStr(out, obj.target);
  out << " ==> ";
  OutputSentenceStr(out, obj.source);
  out << endl;
  out << *obj.m_tunnelCollection;
  if (obj.m_lattice)
  {
    out << endl << *obj.m_lattice;
  }
  return out;
}

View File

@ -0,0 +1,69 @@
#pragma once
/*
* SentenceAlignment.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <cassert>
#include <iostream>
#include "SyntaxTree.h"
#include "Global.h"
#include "Range.h"
class TunnelCollection;
class Lattice;
// One aligned sentence pair plus the structures (tunnels, lattice) derived
// from it during mixed-syntax rule extraction.
class SentenceAlignment
{
  friend std::ostream& operator<<(std::ostream&, const SentenceAlignment&);
public:
  // Target and source sentences, one token per element.
  std::vector<std::string> target;
  std::vector<std::string> source;
  // Per-source-word count of aligned target words.
  std::vector<int> alignedCountS;
  // For each target position, the source positions aligned to it.
  std::vector< std::vector<int> > alignedToT;
  // Syntactic annotation for each side (may be default-[X] only).
  SyntaxTree sourceTree, targetTree;
  //typedef std::vector<Range> Inner;
  //typedef std::vector<Inner> Outer;
  //Outer m_s2tTightest, m_t2sTightest;
  SentenceAlignment();
  ~SentenceAlignment();
  // Parse one (target, source, alignment) corpus line triple.
  // NOTE(review): return-value semantics are defined in the .cpp — confirm
  // whether non-zero means success before relying on it.
  int Create(const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global);
  // void clear() { delete(alignment); };
  // Find consistent phrase pairs ("tunnels"); must run before CreateLattice().
  void FindTunnels( const Global &global ) ;
  // Build the arc lattice from the tunnels; must run before CreateRules().
  void CreateLattice(const Global &global);
  // Extract translation rules from the lattice.
  void CreateRules(const Global &global);
  const TunnelCollection &GetTunnelCollection() const
  {
    assert(m_tunnelCollection);
    return *m_tunnelCollection;
  }
  const Lattice &GetLattice() const
  {
    assert(m_lattice);
    return *m_lattice;
  }
protected:
  // Owned; created by FindTunnels()/CreateLattice() respectively.
  TunnelCollection *m_tunnelCollection;
  Lattice *m_lattice;
  /*
  void CalcTightestSpan(Outer &tightest);
  void InitTightest(Outer &tightest, size_t len);
  Range &GetTightest(Outer &tightest, size_t startPos, size_t endPos);
  void SetAlignment(size_t source, size_t target);
  void SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos);
  */
};

View File

@ -0,0 +1,101 @@
/*
* Symbol.cpp
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include "Symbol.h"
using namespace std;
// Terminal symbol: a single word `label` at source position `pos`.
// m_span holds 2 value-initialized (0,0) pairs; only span[0].first is set.
// FIX: initializer list reordered to match member declaration order
// (m_label, m_span, m_isTerminal) to silence -Wreorder, and the syntax
// flags are now explicitly initialized (they were left indeterminate).
Symbol::Symbol(const std::string &label, size_t pos)
  :m_label(label)
  ,m_span(2)
  ,m_isTerminal(true)
  ,m_isSourceSyntax(false)
  ,m_isTargetSyntax(false)
{
  m_span[0].first = pos;
}
// Non-terminal symbol spanning [startS,endS] on the source and
// [startT,endT] on the target, with a label per side.
// FIX: initializer list reordered to match member declaration order
// (m_label, m_labelT, m_span, m_isTerminal, flags) to silence -Wreorder;
// members are initialized in declaration order regardless of list order.
Symbol::Symbol(const std::string &labelS, const std::string &labelT
               , size_t startS, size_t endS
               , size_t startT, size_t endT
               , bool isSourceSyntax, bool isTargetSyntax)
  :m_label(labelS)
  ,m_labelT(labelT)
  ,m_span(2)
  ,m_isTerminal(false)
  ,m_isSourceSyntax(isSourceSyntax)
  ,m_isTargetSyntax(isTargetSyntax)
{
  m_span[0] = std::pair<size_t, size_t>(startS, endS);
  m_span[1] = std::pair<size_t, size_t>(startT, endT);
}
// Three-way comparison of one side (source or target) of two non-terminals.
// Ordering: a syntactic label (e.g. [NP]) sorts before the generic [X];
// among syntactic labels, order by span, then by label string.
// Returns -1 / +1 / 0 (equal).
// FIX: label parameters are now passed by const reference instead of by
// value, avoiding two string copies per call; callers are unaffected.
int CompareNonTerm(bool thisIsSyntax, bool otherIsSyntax
                   , const std::pair<size_t, size_t> &thisSpan, const std::pair<size_t, size_t> &otherSpan
                   , const std::string &thisLabel, const std::string &otherLabel)
{
  if (thisIsSyntax != otherIsSyntax)
  { // 1 is [X] & the other is [NP] on the source: syntax sorts first
    return thisIsSyntax ? -1 : +1;
  }
  if (thisIsSyntax)
  { // both syntactic: compare span, then label
    if (thisSpan != otherSpan)
      return thisSpan < otherSpan ? -1 : +1;
    if (thisLabel != otherLabel)
      return thisLabel < otherLabel ? -1 : +1;
  }
  // both generic, or identical
  return 0;
}
// Three-way comparison of two symbols; defines a total order used when
// symbol sequences are compared element-wise.
// Terminals sort before non-terminals. Terminals order by source position,
// then label; non-terminals order by source side, then target side.
int Symbol::Compare(const Symbol &other) const
{
  if (m_isTerminal != other.m_isTerminal)
    return m_isTerminal ? -1 : +1;
  assert(m_isTerminal == other.m_isTerminal);
  if (m_isTerminal)
  { // compare labels & pos
    if (m_span[0].first != other.m_span[0].first)
      return (m_span[0].first < other.m_span[0].first) ? -1 : +1;
    if (m_label != other.m_label)
      return (m_label < other.m_label) ? -1 : +1;
  }
  else
  { // non terms: source side first, target side as tie-breaker
    int ret = CompareNonTerm(m_isSourceSyntax, other.m_isSourceSyntax
                             ,m_span[0], other.m_span[0]
                             ,m_label, other.m_label);
    if (ret != 0)
      return ret;
    // NOTE(review): the target-side comparison also passes m_label, not
    // m_labelT — looks like it should use the target label; confirm.
    ret = CompareNonTerm(m_isTargetSyntax, other.m_isTargetSyntax
                         ,m_span[1], other.m_span[1]
                         ,m_label, other.m_label);
    if (ret != 0)
      return ret;
  }
  return 0;
}
// Terminals print their word; non-terminals print source and target labels
// concatenated (same characters as the original string concatenation).
std::ostream& operator<<(std::ostream &out, const Symbol &obj)
{
  if (obj.m_isTerminal)
  {
    out << obj.m_label;
  }
  else
  {
    out << obj.m_label << obj.m_labelT;
  }
  return out;
}

View File

@ -0,0 +1,36 @@
#pragma once
/*
* Symbol.h
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <string>
#include <iostream>
#include <vector>
// One element of a rule's symbol sequence: either a terminal word or a
// non-terminal with a label and span on each side.
class Symbol
{
  friend std::ostream& operator<<(std::ostream &out, const Symbol &obj);
protected:
  std::string m_label, m_labelT; // m_labelT only for non-term
  // span[0] = (start,end) on source, span[1] = (start,end) on target;
  // for terminals only span[0].first (the word position) is meaningful.
  std::vector<std::pair<size_t, size_t> > m_span;
  // syntax flags distinguish real labels (e.g. [NP]) from the generic [X]
  bool m_isTerminal, m_isSourceSyntax, m_isTargetSyntax;
public:
  // for terminals
  Symbol(const std::string &label, size_t pos);
  // for non-terminals
  Symbol(const std::string &labelS, const std::string &labelT
         , size_t startS, size_t endS
         , size_t startT, size_t endT
         , bool isSourceSyntax, bool isTargetSyntax);
  // three-way comparison: -1/0/+1 (total order; see Symbol.cpp)
  int Compare(const Symbol &other) const;
};

View File

@ -0,0 +1,56 @@
/*
* SymbolSequence.cpp
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include <sstream>
#include "SymbolSequence.h"
using namespace std;
// Three-way comparison: shorter sequences sort first; sequences of equal
// length are compared element-wise. Returns -1 / +1 / 0 (equal).
// BUG FIX: the original declared `int ret;` without initializing it and
// executed `assert(ret == 0); return ret;` on the all-equal (including
// empty) path — reading an uninitialized variable is undefined behavior.
int SymbolSequence::Compare(const SymbolSequence &other) const
{
  size_t thisSize = GetSize();
  size_t otherSize = other.GetSize();
  if (thisSize != otherSize)
  {
    return (thisSize < otherSize) ? -1 : +1;
  }
  for (size_t ind = 0; ind < thisSize; ++ind)
  {
    const Symbol &thisSymbol = GetSymbol(ind);
    const Symbol &otherSymbol = other.GetSymbol(ind);
    int ret = thisSymbol.Compare(otherSymbol);
    if (ret != 0)
    {
      return ret;
    }
  }
  // same length, all elements equal
  return 0;
}
// Print the symbols space-separated (a trailing space is emitted).
std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj)
{
  SymbolSequence::CollType::const_iterator iterSymbol = obj.m_coll.begin();
  for (; iterSymbol != obj.m_coll.end(); ++iterSymbol)
  {
    out << *iterSymbol << " ";
  }
  return out;
}

View File

@ -0,0 +1,42 @@
#pragma once
/*
* SymbolSequence.h
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <iostream>
#include <vector>
#include "Symbol.h"
// An ordered list of Symbols (one side of an extracted rule), with a
// length-first lexicographic comparison for use as a map/set key.
class SymbolSequence
{
  friend std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj);
protected:
  typedef std::vector<Symbol> CollType;
  CollType m_coll;
public:
  typedef CollType::iterator iterator;
  typedef CollType::const_iterator const_iterator;
  const_iterator begin() const { return m_coll.begin(); }
  const_iterator end() const { return m_coll.end(); }
  // append a symbol (copied) to the end of the sequence
  void Add(const Symbol &symbol)
  {
    m_coll.push_back(symbol);
  }
  size_t GetSize() const
  { return m_coll.size(); }
  // no bounds checking; ind must be < GetSize()
  const Symbol &GetSymbol(size_t ind) const
  { return m_coll[ind]; }
  void Clear()
  { m_coll.clear(); }
  // three-way comparison: -1/0/+1; shorter sequences sort first
  int Compare(const SymbolSequence &other) const;
};

View File

@ -0,0 +1,245 @@
// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <cassert>
#include "SyntaxTree.h"
//#include "extract.h"
#include "Global.h"
//extern const Global g_debug;
extern const Global *g_global;
using namespace std;
// A node carries real syntax unless it has the generic default label "[X]".
bool SyntaxNode::IsSyntax() const
{
  return GetLabel() != "[X]";
}
// Construct an empty tree; the default LHS is the generic [X] over (0,0).
// NOTE(review): m_top is never initialized here (and appears unused in this
// file) — confirm before dereferencing it anywhere.
SyntaxTree::SyntaxTree()
  :m_defaultLHS(0,0, "[X]")
{
  // m_emptyNode is default-constructed empty; this clear() is a no-op kept
  // from the original code.
  m_emptyNode.clear();
}
// Free every node allocated by AddNode(); m_index only holds aliases into
// m_nodes, so deleting here releases everything exactly once.
// FIX: loop index changed from int to size_t to avoid the signed/unsigned
// comparison with m_nodes.size().
SyntaxTree::~SyntaxTree()
{
  for (size_t i = 0; i < m_nodes.size(); i++)
  {
    delete m_nodes[i];
  }
}
bool HasDuplicates(const SyntaxNodes &nodes)
{
string prevLabel;
SyntaxNodes::const_iterator iter;
for (iter = nodes.begin(); iter != nodes.end(); ++iter)
{
const SyntaxNode &node = **iter;
string label = node.GetLabel();
if (label == prevLabel)
return true;
}
return false;
}
// Create a node labeled "[label]" over [startPos,endPos] and register it in
// the span index. With --UppermostOnly, each span keeps only the most
// recently added label.
void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
{
  SyntaxNode* newNode = new SyntaxNode( startPos, endPos, "[" + label + "]");
  m_nodes.push_back( newNode );
  SyntaxNodes &nodesChart = m_index[ startPos ][ endPos ];
  if (g_global->uppermostOnly)
  {
    if (!nodesChart.empty())
    {
      assert(nodesChart.size() == 1);
      nodesChart.resize(0);
    }
    assert(nodesChart.empty());
    nodesChart.push_back( newNode );
  }
  else
  {
    nodesChart.push_back( newNode );
  }
}
// For every labeled span of length >= 2, greedily decompose it into the
// largest labeled child subspans, left to right. Returns, for each parent
// span, the list of split points (the parent's start plus the end of each
// child found).
// NOTE(review): if some position inside a parent span is not covered by any
// labeled child, the while-loop makes no progress and never terminates —
// callers presumably guarantee full coverage (e.g. via AddDefaultNonTerms);
// confirm.
ParentNodes SyntaxTree::Parse() {
  ParentNodes parents;
  int size = m_index.size();
  // looping through all spans of size >= 2
  for( int length=2; length<=size; length++ )
  {
    for( int startPos = 0; startPos <= size-length; startPos++ )
    {
      if (HasNode( startPos, startPos+length-1 ))
      {
        // processing one (parent) span
        SplitPoints splitPoints;
        splitPoints.push_back( startPos );
        // `first` widens the search by one on the first pass so a child may
        // cover the whole parent minus one position
        int first = 1;
        int covered = 0;
        while( covered < length )
        {
          // find largest covering subspan (child)
          // starting at last covered position
          for( int midPos=length-first; midPos>covered; midPos-- )
          {
            if( HasNode( startPos+covered, startPos+midPos-1 ) )
            {
              covered = midPos;
              splitPoints.push_back( startPos+covered );
              first = 0;
            }
          }
        }
        parents.push_back( splitPoints );
      }
    }
  }
  return parents;
}
bool SyntaxTree::HasNode( int startPos, int endPos ) const
{
return GetNodes( startPos, endPos).size() > 0;
}
// All nodes covering exactly [startPos,endPos]; returns a shared empty list
// (m_emptyNode) when the span is not in the index.
const SyntaxNodes &SyntaxTree::GetNodes( int startPos, int endPos ) const
{
  SyntaxTreeIndexIterator outerIter = m_index.find( startPos );
  if (outerIter == m_index.end())
    return m_emptyNode;
  SyntaxTreeIndexIterator2 innerIter = outerIter->second.find( endPos );
  if (innerIter == outerIter->second.end())
    return m_emptyNode;
  return innerIter->second;
}
// Render the tree chart (see operator<<) into a string.
std::string SyntaxTree::ToString() const
{
  std::ostringstream buffer;
  buffer << *this;
  return buffer.str();
}
// Blanket the chart with default [X] non-terminals over every span.
// NOTE(review): the bounds are asymmetric (startPos <= phraseSize but
// endPos < phraseSize), unlike the addEverywhere overload below which uses
// endPos <= phraseSize — confirm this difference is intentional.
void SyntaxTree::AddDefaultNonTerms(size_t phraseSize)
{
  for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
  {
    for (size_t endPos = startPos; endPos < phraseSize; ++endPos)
    {
      AddNode(startPos, endPos, "X");
    }
  }
}
// Dispatch for default-[X] insertion: a non-syntactic side gets [X] over
// every span; a syntactic side adds [X] everywhere only when labels are
// not being mixed.
void SyntaxTree::AddDefaultNonTerms(bool isSyntax, bool mixed, size_t phraseSize)
{
  if (!isSyntax)
  {
    AddDefaultNonTerms(phraseSize);
    return;
  }
  AddDefaultNonTerms(!mixed, phraseSize);
}
void SyntaxTree::AddDefaultNonTerms(bool addEverywhere, size_t phraseSize)
{
//cerr << "GetNumWords()=" << GetNumWords() << endl;
//assert(phraseSize == GetNumWords() || GetNumWords() == 1); // 1 if syntax sentence doesn't have any xml. TODO fix syntax tree obj
for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
{
for (size_t endPos = startPos; endPos <= phraseSize; ++endPos)
{
const SyntaxNodes &nodes = GetNodes(startPos, endPos);
if (!addEverywhere && nodes.size() > 0)
{ // only add if no label
continue;
}
AddNode(startPos, endPos, "X");
}
}
}
// Candidate left-hand sides for a span; falls back to the shared generic
// [X] node when the span carries no label. Returned by value (a copy).
const SyntaxNodes SyntaxTree::GetNodesForLHS( int startPos, int endPos ) const
{
  SyntaxNodes result(GetNodes(startPos, endPos));
  if (result.empty())
  {
    result.push_back(&m_defaultLHS);
  }
  return result;
}
// Pretty-print the span chart: one row per span length, each cell showing
// the first label for that span padded/truncated to 7 characters, or
// "-------" for unlabeled spans.
// FIX: `size` changed from int to size_t; the original compared an int
// against size_t loop counters (signed/unsigned mismatch) and computed
// size-length in mixed types.
std::ostream& operator<<(std::ostream& os, const SyntaxTree& t)
{
  size_t size = t.m_index.size();
  for (size_t length = 1; length <= size; length++)
  {
    // indent each row by its span length
    for (size_t space = 0; space < length; space++)
    {
      os << " ";
    }
    for (size_t start = 0; start <= size - length; start++)
    {
      if (t.HasNode( start, start+(length-1) ))
      {
        // pad with '#' so substr(0,7) always yields 7 characters
        std::string label = t.GetNodes( start, start+(length-1) )[0]->GetLabel() + "#######";
        os << label.substr(0,7) << " ";
      }
      else
      {
        os << "------- ";
      }
    }
    os << std::endl;
  }
  return os;
}

View File

@ -0,0 +1,96 @@
#pragma once
// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <string>
#include <vector>
#include <map>
#include <sstream>
class SyntaxNode;
typedef std::vector<const SyntaxNode*> SyntaxNodes;
// One labeled span [m_start, m_end] in a syntax chart. Labels are stored
// bracketed, e.g. "[NP]" or the generic "[X]" (see SyntaxTree::AddNode).
class SyntaxNode {
protected:
  int m_start, m_end;
  std::string m_label;
  // NOTE(review): m_children and m_parent are never set or read in this
  // file, and m_parent is left uninitialized by the constructor — confirm
  // they are dead before using them.
  SyntaxNodes m_children;
  SyntaxNode* m_parent;
public:
  SyntaxNode( int startPos, int endPos, const std::string &label)
    :m_start(startPos)
    ,m_end(endPos)
    ,m_label(label)
  {}
  int GetStart() const
  { return m_start; }
  int GetEnd() const
  { return m_end; }
  const std::string &GetLabel() const
  { return m_label; }
  // true unless the label is the generic "[X]"
  bool IsSyntax() const;
};
typedef std::vector< int > SplitPoints;
typedef std::vector< SplitPoints > ParentNodes;
// A chart of labeled spans over one sentence. Nodes are owned by m_nodes
// (freed in the destructor); m_index[start][end] holds non-owning aliases
// for span lookup.
class SyntaxTree {
protected:
  SyntaxNodes m_nodes;          // owning list of all nodes ever added
  // NOTE(review): m_top is never initialized or used in SyntaxTree.cpp —
  // confirm before relying on it.
  SyntaxNode* m_top;
  SyntaxNode m_defaultLHS;      // shared fallback [X] node for GetNodesForLHS
  typedef std::map< int, SyntaxNodes > SyntaxTreeIndex2;
  typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
  typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
  typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
  SyntaxTreeIndex m_index;      // start pos -> end pos -> nodes
  SyntaxNodes m_emptyNode;      // shared empty result for GetNodes misses
  friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);
public:
  SyntaxTree();
  ~SyntaxTree();
  // adds a node labeled "[label]"; honors the global uppermostOnly switch
  void AddNode( int startPos, int endPos, std::string label );
  // greedy decomposition of labeled spans into child split points
  ParentNodes Parse();
  bool HasNode( int startPos, int endPos ) const;
  const SyntaxNodes &GetNodes( int startPos, int endPos ) const;
  const SyntaxNodes &GetAllNodes() const { return m_nodes; } ;
  // number of distinct start positions in the index, used as sentence length
  size_t GetNumWords() const { return m_index.size(); }
  std::string ToString() const;
  void AddDefaultNonTerms(bool isSyntax, bool addEverywhere, size_t phraseSize);
  void AddDefaultNonTerms(bool mixed, size_t phraseSize);
  void AddDefaultNonTerms(size_t phraseSize);
  // returns a copy; falls back to the shared default [X] node when unlabeled
  const SyntaxNodes GetNodesForLHS( int startPos, int endPos ) const;
};
std::ostream& operator<<(std::ostream&, const SyntaxTree&);

View File

@ -0,0 +1,38 @@
/*
* Tunnel.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "Tunnel.h"
// Three-way comparison: source range is the primary key, target range the
// tie-breaker.
int Tunnel::Compare(const Tunnel &other) const
{
  int cmp = m_sourceRange.Compare(other.m_sourceRange);
  if (cmp == 0)
  {
    cmp = m_targetRange.Compare(other.m_targetRange);
  }
  return cmp;
}
int Tunnel::Compare(const Tunnel &other, size_t direction) const
{
const Range &thisRange = (direction == 0) ? m_sourceRange : m_targetRange;
const Range &otherRange = (direction == 0) ? other.m_sourceRange : other.m_targetRange;
int ret = thisRange.Compare(otherRange);
return ret;
}
// Debug format: "<sourceRange>==><targetRange>".
std::ostream& operator<<(std::ostream &out, const Tunnel &tunnel)
{
  out << tunnel.m_sourceRange;
  out << "==>";
  out << tunnel.m_targetRange;
  return out;
}

View File

@ -0,0 +1,49 @@
#pragma once
/*
* Tunnel.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <cassert>
#include <string>
#include <iostream>
#include "Range.h"
// for unaligned source terminal
// A consistent phrase pair: a source range aligned to a target range, used
// as a potential gap ("tunnel") inside larger phrase pairs.
class Tunnel
{
  friend std::ostream& operator<<(std::ostream&, const Tunnel&);
protected:
  Range m_sourceRange, m_targetRange;
public:
  Tunnel()
  {}
  Tunnel(const Tunnel &copy)
    :m_sourceRange(copy.m_sourceRange)
    ,m_targetRange(copy.m_targetRange)
  {}
  Tunnel(const Range &sourceRange, const Range &targetRange)
    :m_sourceRange(sourceRange)
    ,m_targetRange(targetRange)
  {}
  // direction 0 = source range, anything else = target range
  const Range &GetRange(size_t direction) const
  { return (direction == 0) ? m_sourceRange : m_targetRange; }
  // three-way comparisons: source first then target, or one side only
  int Compare(const Tunnel &other) const;
  int Compare(const Tunnel &other, size_t direction) const;
};
typedef std::vector<Tunnel> TunnelList;

View File

@ -0,0 +1,70 @@
/*
* TunnelCollection.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "TunnelCollection.h"
#include "Range.h"
using namespace std;
// Count words with zero alignment points in [startPos,endPos] on one side:
// direction 0 = source (alignedCountS), direction 1 = target (alignedCountT).
size_t TunnelCollection::NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const
{
  assert(startPos <= endPos);
  if (direction == 0)
    assert(endPos < alignedCountS.size());
  else
    assert(endPos < alignedCountT.size());
  size_t numUnaligned = 0;
  for (size_t pos = startPos; pos <= endPos; ++pos)
  {
    if (direction == 0)
    {
      if (alignedCountS[pos] == 0)
        ++numUnaligned;
    }
    else if (direction == 1)
    {
      if (alignedCountT[pos] == 0)
        ++numUnaligned;
    }
  }
  return numUnaligned;
}
// Record a consistent phrase pair: m_coll is indexed by source start
// position and source length (endS - startS).
void TunnelCollection::Add(int startS, int endS, int startT, int endT)
{
  // m_phraseExist[startS][endS - startS].push_back(Tunnel(startT, endT));
  m_coll[startS][endS - startS].push_back(Tunnel(Range(startS, endS), Range(startT, endT)));
}
std::ostream& operator<<(std::ostream &out, const TunnelCollection &TunnelCollection)
{
size_t size = TunnelCollection.GetSize();
for (size_t startPos = 0; startPos < size; ++startPos)
{
for (size_t endPos = startPos; endPos < size; ++endPos)
{
const TunnelList &tunnelList = TunnelCollection.GetTunnels(startPos, endPos);
TunnelList::const_iterator iter;
for (iter = tunnelList.begin(); iter != tunnelList.end(); ++iter)
{
const Tunnel &tunnel = *iter;
out << tunnel << " ";
}
}
}
return out;
}

View File

@ -0,0 +1,61 @@
#pragma once
/*
* TunnelCollection.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include "Tunnel.h"
// reposity of extracted phrase pairs
// which are potential tunnels in larger phrase pairs
// Repository of extracted phrase pairs ("tunnels"), indexed by source start
// position and source length, which may become gaps in larger phrase pairs.
class TunnelCollection
{
  friend std::ostream& operator<<(std::ostream&, const TunnelCollection&);
protected:
  std::vector< std::vector<TunnelList> > m_coll;
  // indexed by source pos. and source length
  // maps to list of tunnels where <int, int> are target pos
public:
  // per-word alignment counts, filled in by the owner (SentenceAlignment)
  std::vector<int> alignedCountS, alignedCountT;
  // NOTE(review): copy constructor is declared but no definition is visible
  // here — presumably to forbid copying; confirm.
  TunnelCollection(const TunnelCollection &);
  // size = length of the source sentence; pre-creates empty tunnel lists
  // for every (start, length) combination
  TunnelCollection(size_t size)
    :m_coll(size)
  {
    for (size_t pos = 0; pos < size; ++pos)
    {
      std::vector<TunnelList> &endVec = m_coll[pos];
      endVec.resize(size - pos);
    }
  }
  // record the tunnel [startS,endS] ==> [startT,endT]
  void Add(int startS, int endS, int startT, int endT);
  // all tunnels whose source span is exactly [startS,endS]
  const TunnelList &GetTunnels(int startS, int endS) const
  {
    const TunnelList &sourceHoles = m_coll[startS][endS - startS];
    return sourceHoles;
  }
  // FIX: dropped the meaningless top-level const on the return type
  // (`const size_t` by value); callers are unaffected.
  size_t GetSize() const
  { return m_coll.size(); }
  // count unaligned words on one side (0 = source, 1 = target)
  size_t NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const;
};

View File

@ -0,0 +1,344 @@
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <vector>
#include <string>
#include <set>
#include <iostream>
#include <stdlib.h>
#include "SyntaxTree.h"
using namespace std;
// Split `str` on runs of delimiter characters (default: space and tab);
// empty tokens are never produced.
inline std::vector<std::string> Tokenize(const std::string& str,
                                         const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;
  // start of the first token (npos if none)
  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
  // first delimiter after it
  std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
  while (tokenStart != std::string::npos || tokenEnd != std::string::npos)
  {
    tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
    tokenEnd = str.find_first_of(delimiters, tokenStart);
  }
  return tokens;
}
// Return `str` with leading and trailing characters from `dropChars`
// removed (defaults to whitespace). Returns "" for all-drop or empty input.
// FIX: dropChars is now passed by const reference (the original copied the
// string on every call), and the erase positions are computed from `res`
// consistently instead of mixing `str` and `res`.
const std::string Trim(const std::string& str, const std::string& dropChars = " \t\n\r")
{
  std::string res = str;
  res.erase(res.find_last_not_of(dropChars) + 1);   // strip trailing
  res.erase(0, res.find_first_not_of(dropChars));   // strip leading
  return res;
}
// Extract the value of `attributeName="..."` from an XML tag body, skipping
// escaped quotes (\") inside the value. Returns "" when the attribute is
// absent or the tag is malformed.
// BUG FIX: the closing-quote search previously started at contentsStart+1,
// so an empty attribute value (attr="") skipped its own closing quote and
// was wrongly reported as malformed; the search now starts at contentsStart.
// NOTE(review): the attribute is located with a plain substring search, so
// an attribute name that is a suffix of another (e.g. "label" inside
// "mylabel=") can be matched incorrectly — pre-existing limitation.
std::string ParseXmlTagAttribute(const std::string& tag, const std::string& attributeName)
{
  /*TODO deal with unescaping \"*/
  std::string tagOpen = attributeName + "=\"";
  size_t contentsStart = tag.find(tagOpen);
  if (contentsStart == std::string::npos) return "";
  contentsStart += tagOpen.size();
  size_t contentsEnd = tag.find_first_of('"', contentsStart);
  if (contentsEnd == std::string::npos) {
    std::cerr << "Malformed XML attribute: " << tag;
    return "";
  }
  // advance past quotes that are escaped with a backslash
  size_t possibleEnd;
  while (contentsEnd > contentsStart
         && tag.at(contentsEnd - 1) == '\\'
         && (possibleEnd = tag.find_first_of('"', contentsEnd + 1)) != std::string::npos) {
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart, contentsEnd - contentsStart);
}
/**
* Remove "<" and ">" from XML tag
*
* \param str xml token to be stripped
*/
// Strip the surrounding "<" and ">" from an XML token; anything that is not
// a bracketed tag (including strings shorter than 2 chars) is returned
// unchanged.
std::string TrimXml(const std::string& str)
{
  if (str.size() < 2)
    return str;
  if (str[0] == '<' && str[str.size() - 1] == '>')
    return str.substr(1, str.size() - 2);
  return str;
}
/**
* Check if the token is an XML tag, i.e. starts with "<"
*
* \param tag token to be checked
*/
// An XML tag token starts with '<'. The explicit empty() guard makes the
// empty-string case obvious (const operator[] on an empty string returns
// '\0', so the result is identical).
bool isXmlTag(const std::string& tag)
{
  return !tag.empty() && tag[0] == '<';
}
/**
* Split up the input character string into tokens made up of
* either XML tags or text.
* example: this <b> is a </b> test .
* => (this ), (<b>), ( is a ), (</b>), ( test .)
*
* \param str input string
*/
// Split a line into alternating text and XML-tag tokens.
// example: "this <b> is a </b> test ."
//   => ("this "), ("<b>"), (" is a "), ("</b>"), (" test .")
// On a '<' with no matching '>' the error is reported and the tokens found
// so far are returned.
inline std::vector<std::string> TokenizeXml(const std::string& str)
{
  std::vector<std::string> tokens;
  std::string::size_type cur = 0;   // current scan position
  while (cur != str.size())
  {
    // locate the next tag opening
    std::string::size_type open = str.find_first_of("<", cur);
    if (open == std::string::npos)
    {
      // no more tags: the remainder is one text token
      tokens.push_back(str.substr(cur));
      break;
    }
    // locate the matching close; it must exist
    std::string::size_type close = str.find_first_of(">", open);
    if (close == std::string::npos)
    {
      std::cerr << "ERROR: malformed XML: " << str << std::endl;
      return tokens;
    }
    // text before the tag, if any
    if (open > cur)
      tokens.push_back(str.substr(cur, open - cur));
    // the tag itself, brackets included
    tokens.push_back(str.substr(open, close - open + 1));
    cur = close + 1;
  }
  return tokens;
}
/**
* Process a sentence with xml annotation
* Xml tags may specifiy additional/replacing translation options
* and reordering constraints
*
* \param line in: sentence, out: sentence without the xml
* \param res vector with translation options specified by xml
* \param reorderingConstraint reordering constraint zones specified by xml
* \param walls reordering constraint walls specified by xml
*/
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
is so we can link things up afterwards. We can't create TranslationOptions as we
parse because we don't have the completed source parsed until after this function
removes all the markup from it (CreateFromString in Sentence::Read).
*/
// Strip XML markup from `line` (in place), recording each well-formed tag as
// a labeled span in `tree`, collecting every label seen in `labelCollection`
// and counting whole-sentence labels in `topLabelCollection`.
// Returns false on malformed markup (empty tag name, mismatched or unclosed
// tags, bad span attribute, zero-width span); true otherwise.
bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) {
  //parse XML markup in translation line
  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) { return true; }
  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);
  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags
  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)
  // NOTE(review): isLinked is set but never read in this function.
  bool isLinked = false;
  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
  {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos]))
    {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ')
      {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      // re-tokenizes the whole clean line each time (O(n^2) overall, fine
      // for sentence-length input)
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }
    // process xml tag
    else
    {
      // *** get essential information about tag ***
      // strip extra boundary spaces and "<" and ">"
      string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
      // cerr << "XML TAG IS: " << tag << std::endl;
      if (tag.size() == 0)
      {
        cerr << "ERROR: empty tag name: " << line << endl;
        return false;
      }
      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );
      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;
      if (isClosed && isUnary)
      {
        cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
        return false;
      }
      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end
      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }
      // *** process new tag ***
      if (isOpen || isUnary)
      {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
      }
      // *** process completed tag ***
      if (isClosed || isUnary)
      {
        // pop last opened tag from stack;
        if (tagStack.size() == 0)
        {
          cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();
        // tag names have to match
        if (openedTag.first != tagName)
        {
          cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
          return false;
        }
        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;
        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty())
        {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }
        // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
        // a tag must cover at least one word
        if (startPos >= endPos)
        {
          cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
          return false;
        }
        string label = ParseXmlTagAttribute(tagContent,"label");
        labelCollection.insert( label );
        // report what we have processed so far
        if (0) {
          cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
          cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
          cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
        }
        // record the labeled span (endPos is exclusive, AddNode inclusive)
        tree.AddNode( startPos, endPos-1, label );
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0)
  {
    cerr << "ERROR: some opened tags were never closed: " << line << endl;
    return false;
  }
  // collect top labels
  // NOTE(review): if the line contained only tags, wordPos is 0 and
  // wordPos-1 underflows; GetNodes then simply finds nothing — confirm.
  const SyntaxNodes &topNodes = tree.GetNodes( 0, wordPos-1 );
  for( SyntaxNodes::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ )
  {
    const SyntaxNode *n = *node;
    const string &label = n->GetLabel();
    if (topLabelCollection.find( label ) == topLabelCollection.end())
      topLabelCollection[ label ] = 0;
    topLabelCollection[ label ]++;
  }
  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}

View File

@ -0,0 +1,35 @@
#pragma once
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <string>
#include <vector>
#include <set>
#include <map>
#include "SyntaxTree.h"
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
std::string TrimXml(const std::string& str);
bool isXmlTag(const std::string& tag);
inline std::vector<std::string> TokenizeXml(const std::string& str);
bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection );

View File

@ -0,0 +1,310 @@
// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cstdio>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <cstring>
#include <sstream>
#include <iostream>
#include "extract.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "Lattice.h"
#ifdef WIN32
// Include Visual Leak Detector
#include <vld.h>
#endif
using namespace std;
void writeGlueGrammar(const string &, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
int main(int argc, char* argv[])
{
cerr << "Extract v2.0, written by Philipp Koehn\n"
<< "rule extraction from an aligned parallel corpus\n";
//time_t starttime = time(NULL);
Global *global = new Global();
g_global = global;
int sentenceOffset = 0;
if (argc < 5) {
cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
<< " [ --Hierarchical | --Orientation"
<< " | --GlueGrammar FILE | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
<< " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
<< " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
<< " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
<< " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"
<< " | --MaxSymbols[" << global->maxSymbols<< "]"
<< " | --MaxNonTerm[" << global->maxNonTerm << "]"
<< " | --SourceSyntax | --TargetSyntax"
<< " | --UppermostOnly[" << g_global->uppermostOnly << "]"
<< endl;
exit(1);
}
char* &fileNameT = argv[1];
char* &fileNameS = argv[2];
char* &fileNameA = argv[3];
string fileNameGlueGrammar;
string fileNameUnknownWordLabel;
string fileNameExtract = string(argv[4]);
int optionInd = 5;
for(int i=optionInd;i<argc;i++)
{
if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
global->minHoleSpanSourceDefault = atoi(argv[++i]);
if (global->minHoleSpanSourceDefault < 1) {
cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
global->maxHoleSpanSourceDefault = atoi(argv[++i]);
if (global->maxHoleSpanSourceDefault < 1) {
cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
global->minHoleSpanSourceSyntax = atoi(argv[++i]);
if (global->minHoleSpanSourceSyntax < 1) {
cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--UppermostOnly") == 0) {
global->uppermostOnly = atoi(argv[++i]);
}
else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
if (global->maxHoleSpanSourceSyntax < 1) {
cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
exit(1);
}
}
// maximum number of words in hierarchical phrase
else if (strcmp(argv[i],"--maxSymbols") == 0) {
global->maxSymbols = atoi(argv[++i]);
if (global->maxSymbols < 1) {
cerr << "extract error: --maxSymbols should be at least 1" << endl;
exit(1);
}
}
// maximum number of non-terminals
else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
global->maxNonTerm = atoi(argv[++i]);
if (global->maxNonTerm < 1) {
cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
exit(1);
}
}
// allow consecutive non-terminals (X Y | X Y)
else if (strcmp(argv[i],"--TargetSyntax") == 0) {
global->targetSyntax = true;
}
else if (strcmp(argv[i],"--SourceSyntax") == 0) {
global->sourceSyntax = true;
}
// do not create many part00xx files!
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
// now default
}
else if (strcmp(argv[i],"--GlueGrammar") == 0) {
global->glueGrammarFlag = true;
if (++i >= argc)
{
cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
exit(0);
}
fileNameGlueGrammar = string(argv[i]);
cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
}
else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
global->unknownWordLabelFlag = true;
if (++i >= argc)
{
cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
exit(0);
}
fileNameUnknownWordLabel = string(argv[i]);
cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
}
// TODO: this should be a useful option
//else if (strcmp(argv[i],"--ZipFiles") == 0) {
// zipFiles = true;
//}
// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
else if (strcmp(argv[i],"--Mixed") == 0) {
global->mixed = true;
}
else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
global->allowDefaultNonTermEdge = atoi(argv[++i]);
}
else if (strcmp(argv[i], "--GZOutput") == 0) {
global->gzOutput = true;
}
else if (strcmp(argv[i],"--MaxSpan") == 0) {
// ignore
++i;
}
else if (strcmp(argv[i],"--SentenceOffset") == 0) {
if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
exit(1);
}
sentenceOffset = atoi(argv[++i]);
}
else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
}
}
// open input files
Moses::InputFileStream tFile(fileNameT);
Moses::InputFileStream sFile(fileNameS);
Moses::InputFileStream aFile(fileNameA);
// open output files
string fileNameExtractInv = fileNameExtract + ".inv";
if (global->gzOutput) {
fileNameExtract += ".gz";
fileNameExtractInv += ".gz";
}
Moses::OutputFileStream extractFile;
Moses::OutputFileStream extractFileInv;
extractFile.Open(fileNameExtract.c_str());
extractFileInv.Open(fileNameExtractInv.c_str());
// loop through all sentence pairs
int i = sentenceOffset;
while(true) {
i++;
if (i % 1000 == 0) {
cerr << i << " " << flush;
}
string targetString;
string sourceString;
string alignmentString;
bool ok = getline(tFile, targetString);
if (!ok)
break;
getline(sFile, sourceString);
getline(aFile, alignmentString);
//cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;
//time_t currTime = time(NULL);
//cerr << "A " << (currTime - starttime) << endl;
SentenceAlignment sentencePair;
if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global ))
{
//cerr << sentence.sourceTree << endl;
//cerr << sentence.targetTree << endl;
sentencePair.FindTunnels(*g_global);
//cerr << "C " << (time(NULL) - starttime) << endl;
//cerr << sentencePair << endl;
sentencePair.CreateLattice(*g_global);
//cerr << "D " << (time(NULL) - starttime) << endl;
//cerr << sentencePair << endl;
sentencePair.CreateRules(*g_global);
//cerr << "E " << (time(NULL) - starttime) << endl;
//cerr << sentence.lattice->GetRules().GetSize() << endl;
sentencePair.GetLattice().GetRules().Output(extractFile);
sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
}
}
tFile.Close();
sFile.Close();
aFile.Close();
extractFile.Close();
extractFileInv.Close();
if (global->glueGrammarFlag) {
writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
}
delete global;
}
// Write the glue grammar -- top-level rules that stitch partial
// derivations together -- to `fileName` in Moses rule-table format
// ("src ||| tgt ||| score ||| alignment ...").
//
// Without target syntax a fixed S/X rule set is emitted; with target
// syntax a fresh top label is invented and one top rule per observed
// top label plus one glue rule per observed label is written.
// The 2.718 score is e (cost 1 in the log domain).
// NOTE(review): grammarFile.open() failure is not checked -- on an
// unwritable path this silently writes nothing.
void writeGlueGrammar( const string & fileName, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
{
  ofstream grammarFile;
  grammarFile.open(fileName.c_str());
  if (!options.targetSyntax) {
    // no target syntax: generic S/X glue rules only
    grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
                << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
                << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
  } else {
    // choose a top label that is not already a label: shortest prefix
    // of "QQQQQQ" that does not collide with an observed label
    string topLabel = "QQQQQQ";
    for( unsigned int i=1; i<=topLabel.length(); i++) {
      if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
        topLabel = topLabel.substr(0,i);
        break;
      }
    }
    // basic rules
    grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << endl
                << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << endl;
    // top rules: one per label observed spanning a whole sentence
    for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
         i != targetTopLabelCollection.end(); i++ ) {
      grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << endl;
    }
    // glue rules: one per observed target label
    for( set<string>::const_iterator i = targetLabelCollection.begin();
         i != targetLabelCollection.end(); i++ ) {
      grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << endl;
    }
    grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << endl; // glue rule for unknown word...
  }
  grammarFile.close();
}

View File

@ -0,0 +1,34 @@
#pragma once
#include <vector>
#include <list>
#include <map>
#include <set>
#include <string>
#include <fstream>
#include <algorithm>
#include "SyntaxTree.h"
#include "XmlTree.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "SentenceAlignment.h"
#include "Global.h"
// Whitespace tokenizer (defined in tables-core.cpp).
std::vector<std::string> tokenize( const char [] );

// Read one line of at most _SIZE-1 characters into _LINE, aborting the
// process when the line is too long.
// NOTE(review): uses unqualified `cerr`/`endl`, so any including file
// must have `using namespace std` in scope before expanding this macro.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
_IS.getline(_LINE, _SIZE, _DELIM); \
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
if (_IS.gcount() == _SIZE-1) { \
cerr << "Line too long! Buffer overflow. Delete lines >=" \
<< _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
<< endl; \
exit(1); \
} \
}
#define LINE_MAX_LENGTH 1000000

// NOTE(review): these are *definitions* at header scope, not `extern`
// declarations -- including this header from more than one translation
// unit would produce multiple-definition link errors (ODR violation).
// This only works while extract.h is included from a single .cpp.
const Global *g_global;
std::set< std::string > targetLabelCollection, sourceLabelCollection;
std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;

View File

@ -0,0 +1,81 @@
#ifndef moses_gzfile_buf_h
#define moses_gzfile_buf_h
#include <streambuf>
#include <zlib.h>
#include <cstring>
/**
 * Read-only std::streambuf over a gzip-compressed file, backed by zlib.
 * Sequential reads with a small (sizeof(int)-byte) putback area;
 * writing and seeking are unsupported and abort via a bare `throw;`
 * (std::terminate when no exception is active).
 */
class gzfilebuf : public std::streambuf {
public:
  // Open `filename` for reading.  If gzopen fails, _gzf stays NULL and
  // all subsequent reads report EOF.
  gzfilebuf(const char *filename) {
    _gzf = gzopen(filename, "rb");
    setg(_buff + sizeof(int),   // beginning of putback area
         _buff + sizeof(int),   // read position
         _buff + sizeof(int));  // end position
  }

  // BUG FIX: close only a handle that was actually opened; passing a
  // NULL handle to gzclose is a zlib stream error.
  ~gzfilebuf() {
    if (_gzf != NULL) {
      gzclose(_gzf);
    }
  }

protected:
  // Writing a single character is unsupported.
  virtual int_type overflow(int_type c) {
    throw;
  }

  // Writing a block of characters is unsupported.
  virtual std::streamsize xsputn(const char* s,
                                 std::streamsize num) {
    throw;
  }

  // Seeking is unsupported.
  virtual std::streampos seekpos(std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out) {
    throw;
  }

  // Refill the get area from the compressed stream; return the next
  // character, or EOF on end of data / error / unopened file.
  virtual int_type underflow() {
    // still unread characters in the buffer?
    if (gptr() < egptr()) {
      return traits_type::to_int_type(*gptr());
    }

    // ROBUSTNESS FIX: a failed gzopen means there is nothing to read;
    // do not hand a NULL handle to gzread.
    if (_gzf == NULL) {
      return EOF;
    }

    /* process size of putback area
     * - use number of characters read
     * - but at most sizeof(int)
     */
    unsigned int numPutback = gptr() - eback();
    if (numPutback > sizeof(int)) {
      numPutback = sizeof(int);
    }

    /* copy up to sizeof(int) characters previously read into
     * the putback area (first bytes of _buff)
     */
    std::memmove(_buff + (sizeof(int) - numPutback), gptr() - numPutback,
                 numPutback);

    // read new characters from the compressed stream
    int num = gzread(_gzf, _buff + sizeof(int), _buffsize - sizeof(int));
    if (num <= 0) {
      // ERROR or EOF
      return EOF;
    }

    // reset get-area pointers
    setg(_buff + (sizeof(int) - numPutback), // beginning of putback area
         _buff + sizeof(int),                // read position
         _buff + sizeof(int) + num);         // end of buffer

    // return next character
    return traits_type::to_int_type(*gptr());
  }

  // Bulk read straight from zlib, bypassing the get area.
  // NOTE(review): this ignores buffered/putback characters; mixing
  // sgetn with single-character reads may skip data -- confirm callers
  // use only one access style.
  std::streamsize xsgetn(char* s,
                         std::streamsize num) {
    // ROBUSTNESS FIX: report "nothing read" instead of calling gzread
    // with a NULL handle when the open failed.
    if (_gzf == NULL) {
      return 0;
    }
    return gzread(_gzf, s, num);
  }

private:
  gzFile _gzf;                                // zlib handle; NULL if open failed
  static const unsigned int _buffsize = 1024; // total buffer incl. putback area
  char _buff[_buffsize];
};
#endif

View File

@ -0,0 +1,110 @@
// $Id: tables-core.cpp 3131 2010-04-13 16:29:55Z pjwilliams $
//#include "beammain.h"
//#include "SafeGetLine.h"
#include "tables-core.h"
#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"
// as in beamdecoder/tables.cpp
// Split the NUL-terminated input on runs of spaces/tabs and return the
// resulting tokens in order (as in beamdecoder/tables.cpp).
std::vector<std::string> tokenize( const char* input ) {
  std::vector<std::string> tokens;
  bool inWord = false;
  int wordStart = 0;
  int pos = 0;
  for (; input[pos] != '\0'; pos++) {
    const bool isSpace = (input[pos] == ' ' || input[pos] == '\t');
    if (inWord && isSpace) {
      // a token just ended
      tokens.push_back( std::string( input + wordStart, pos - wordStart ) );
      inWord = false;
    } else if (!inWord && !isSpace) {
      // a token just started
      wordStart = pos;
      inWord = true;
    }
  }
  // flush a token that runs to the end of the string
  if (inWord)
    tokens.push_back( std::string( input + wordStart, pos - wordStart ) );
  return tokens;
}
// Return the id for `word`, assigning the next free id (current vocab
// size) when the word has not been seen before.
WORD_ID Vocabulary::storeIfNew( const WORD& word ) {
  map<WORD, WORD_ID>::iterator found = lookup.find( word );
  if( found == lookup.end() ) {
    const WORD_ID freshId = vocab.size();
    vocab.push_back( word );
    lookup[ word ] = freshId;
    return freshId;
  }
  return found->second;
}
// Return the id for `word`, or 0 when unknown (note: 0 is also the id
// of the first stored word).
WORD_ID Vocabulary::getWordID( const WORD& word ) {
  map<WORD, WORD_ID>::iterator found = lookup.find( word );
  return ( found == lookup.end() ) ? 0 : found->second;
}
// Return the id for `phrase`, interning it with the next free id
// (current table size) when it has not been seen before.
PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase ) {
  map< PHRASE, PHRASE_ID >::iterator found = lookup.find( phrase );
  if( found == lookup.end() ) {
    const PHRASE_ID freshId = phraseTable.size();
    phraseTable.push_back( phrase );
    lookup[ phrase ] = freshId;
    return freshId;
  }
  return found->second;
}
// Return the id for `phrase`, or 0 when unknown (note: 0 is also the
// id of the first stored phrase).
PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase ) {
  map< PHRASE, PHRASE_ID >::iterator found = lookup.find( phrase );
  return ( found == lookup.end() ) ? 0 : found->second;
}
// Drop every interned phrase and the forward lookup index.
void PhraseTable::clear() {
  phraseTable.clear();
  lookup.clear();
}
// Seed the distortion table for distances -10..9 with cost -|d|.
void DTable::init() {
  int d = -10;
  while (d < 10) {
    dtable[ d ] = -abs( d );
    ++d;
  }
}
/*
void DTable::load( const string& fileName ) {
ifstream inFile;
inFile.open(fileName.c_str());
istream *inFileP = &inFile;
char line[TABLE_LINE_MAX_LENGTH];
int i=0;
while(true) {
i++;
SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
vector<string> token = tokenize( line );
if (token.size() < 2) {
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
continue;
}
int d = atoi( token[0].c_str() );
double prob = log( atof( token[1].c_str() ) );
dtable[ d ] = prob;
}
}
*/
double DTable::get( int distortion ) {
if (dtable.find( distortion ) == dtable.end())
return log( 0.00001 );
return dtable[ distortion ];
}

View File

@ -0,0 +1,72 @@
#pragma once
// $Id: tables-core.h 2416 2009-07-30 11:07:38Z hieuhoang1972 $
#include <iostream>
#include <fstream>
#include <assert.h>
#include <stdlib.h>
#include <string>
#include <queue>
#include <map>
#include <cmath>
using namespace std;
#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"
vector<string> tokenize( const char[] );
//! delete and remove every element of a collection object such as map, set, list etc
//! Delete every pointer element of a collection (map, set, list, ...)
//! and then empty the collection itself.
template<class COLL>
void RemoveAllInColl(COLL &coll)
{
  typename COLL::const_iterator it = coll.begin();
  while (it != coll.end()) {
    delete (*it);
    ++it;
  }
  coll.clear();
}
typedef std::string WORD;        // a surface word
typedef unsigned int WORD_ID;    // its interned integer id

// Bidirectional word <-> integer-id mapping (ids are assigned densely
// from 0 in insertion order; see tables-core.cpp).
class Vocabulary {
public:
  std::map<WORD, WORD_ID> lookup;   // word -> id
  std::vector< WORD > vocab;        // id -> word
  // Return the id for a word, assigning a fresh one if unseen.
  WORD_ID storeIfNew( const WORD& );
  // Return the id for a word, or 0 if unknown (0 is also a valid id).
  WORD_ID getWordID( const WORD& );
  // Return the word for an id.  FIX: the C-style cast `(WORD&)` that
  // stripped const is replaced by an explicit, greppable const_cast;
  // the signature (non-const ref from a const method) is kept for
  // compatibility with existing callers.
  inline WORD &getWord( WORD_ID id ) const {
    return const_cast<WORD&>( vocab[ id ] );
  }
};
typedef vector< WORD_ID > PHRASE;   // a phrase as a sequence of word ids
typedef unsigned int PHRASE_ID;     // interned phrase id

// Interns phrases: bidirectional phrase <-> integer-id mapping, with
// ids assigned densely from 0 in insertion order (see tables-core.cpp).
class PhraseTable {
public:
  map< PHRASE, PHRASE_ID > lookup;   // phrase -> id
  vector< PHRASE > phraseTable;      // id -> phrase
  // Return the id for a phrase, assigning a fresh one if unseen.
  PHRASE_ID storeIfNew( const PHRASE& );
  // Return the id for a phrase, or 0 if unknown.  NOTE(review): 0 is
  // also the id of the first stored phrase -- callers cannot tell the
  // two cases apart.
  PHRASE_ID getPhraseID( const PHRASE& );
  // Drop all interned phrases and the lookup index.
  void clear();
  // Return the phrase for an id (unchecked index).
  inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; }
};
typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC;   // (target phrase id, score) list

// Translation table: maps a phrase id to candidate phrase ids with a
// single score (ttable) or a vector of scores (ttableMulti).
// NOTE(review): no member functions are visible in this chunk; the
// exact semantics of the scores are defined by the code that fills it.
class TTable {
public:
  map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable;
  map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti;
};
// Distortion cost table: maps a distortion distance to a log-domain
// cost (see tables-core.cpp for the implementations).
class DTable {
public:
  map< int, double > dtable;   // distortion distance -> log cost
  // Fill dtable for distances in [-10, 10) with cost -|d|.
  void init();
  // Declared here, but the definition in tables-core.cpp is commented
  // out -- calling this will fail to link.
  void load( const string& );
  // Return the stored cost, or log(0.00001) for unseen distances.
  double get( int );
};

View File

@ -0,0 +1,123 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.499747849" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.798364121" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-ordering}/Debug" id="cdt.managedbuild.builder.gnu.cross.1976289814" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1324749613" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1750299246" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.719498215" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.251118848" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.99297656" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1844372739" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1178164658" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1434184833" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
<listOptionValue builtIn="false" value="boost_system-mt"/>
<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.974811544" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.904916320" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1005231499" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1318928675" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.604255673" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.818331963">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.818331963." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1489025499" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1052477856" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-ordering}/Release" id="cdt.managedbuild.builder.gnu.cross.33925527" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1884790737" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.197048136" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.106898878" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1920378037" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.37950410" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1197641703" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1356351201" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2053623412" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1988048517" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1494470963" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1553727957" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="extract-ordering.cdt.managedbuild.target.gnu.cross.exe.1840421491" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417;cdt.managedbuild.tool.gnu.c.compiler.input.106898878">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446;cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964;cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827;cdt.managedbuild.tool.gnu.c.compiler.input.719498215">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -0,0 +1,74 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>extract-ordering</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
<linkedResources>
<link>
<name>InputFileStream.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.cpp</locationURI>
</link>
<link>
<name>InputFileStream.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.h</locationURI>
</link>
<link>
<name>OutputFileStream.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp</locationURI>
</link>
<link>
<name>OutputFileStream.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
<name>SentenceAlignment.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.cpp</locationURI>
</link>
<link>
<name>SentenceAlignment.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h</locationURI>
</link>
<link>
<name>extract-ordering-main.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ordering-main.cpp</locationURI>
</link>
<link>
<name>tables-core.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.cpp</locationURI>
</link>
<link>
<name>tables-core.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.h</locationURI>
</link>
</linkedResources>
</projectDescription>

View File

@ -0,0 +1,124 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.1899954923" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1645930772" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/manual-label/Debug}" id="cdt.managedbuild.builder.gnu.cross.1703642277" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1888648788" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1838052643" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.798368516" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.153015988" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.418888584" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.406065865" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1741441821" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1626431978" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1886912770" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="boost_program_options-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.1541583695" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1367999206" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.31522559" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.826957235" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.350181339" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.97427761" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.564169339" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/manual-label/Release}" id="cdt.managedbuild.builder.gnu.cross.663164336" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.2135645103" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.764935013" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1841809129" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1877584345" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.935490779" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.355530813" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.940299092" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.17718999" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1527322008" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.480337803" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1788533940" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="manual-label.cdt.managedbuild.target.gnu.cross.exe.2117548180" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943;cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607;cdt.managedbuild.tool.gnu.c.compiler.input.798368516">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437;cdt.managedbuild.tool.gnu.c.compiler.input.1841809129">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503;cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>manual-label</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

View File

@ -0,0 +1,86 @@
#include <list>
#include "DeEn.h"
#include "moses/Util.h"
using namespace std;
extern bool g_debug;
bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str)
{
pos += offset;
if (pos >= source.size() || pos < 0) {
return false;
}
const string &word = source[pos][factor];
vector<string> soughts = Moses::Tokenize(str, " ");
for (int i = 0; i < soughts.size(); ++i) {
string &sought = soughts[i];
bool found = (word == sought);
if (found) {
return true;
}
}
return false;
}
// Return true iff any position in the inclusive range [start, end] matches
// str in the given factor (see IsA for the matching rules).
bool Contains(const Phrase &source, int start, int end, int factor, const string &str)
{
  int cursor = start;
  while (cursor <= end) {
    if (IsA(source, cursor, 0, factor, str)) {
      return true;
    }
    ++cursor;
  }
  return false;
}
// Write the sentence to out (surface factor only) with
// <tree label="reorder-label"> ... </tree> spans inserted around regions that
// look like German reordering candidates:
//  (1) a span preceded by a finite auxiliary (VAFIN) and followed by an
//      infinitive/participle (VVINF/VVPP), containing no other verb; or
//  (2) a span starting at the sentence start or after a comma ($,), followed
//      by "zu" + VVINF, containing no comma.
void LabelDeEn(const Phrase &source, ostream &out)
{
  typedef pair<int,int> Range;
  typedef list<Range> Ranges;
  Ranges ranges;

  // int copy of the length so the loop indices (which are also passed to
  // IsA/Contains as ints) compare without a signed/unsigned mismatch.
  const int size = static_cast<int>(source.size());

  // find ranges to label
  for (int start = 0; start < size; ++start) {
    for (int end = start; end < size; ++end) {
      if (IsA(source, start, -1, 1, "VAFIN")
          && IsA(source, end, +1, 1, "VVINF VVPP")
          && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) {
        ranges.push_back(Range(start, end));
      } else if ((start == 0 || IsA(source, start, -1, 1, "$,"))
                 && IsA(source, end, +1, 0, "zu")
                 && IsA(source, end, +2, 1, "VVINF")
                 && !Contains(source, start, end, 1, "$,")) {
        ranges.push_back(Range(start, end));
      }
    }
  }

  // output sentence, with labels
  for (int pos = 0; pos < size; ++pos) {
    // open every range that begins at this position
    for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
      if (iter->first == pos) {
        out << "<tree label=\"reorder-label\"> ";
      }
    }

    const Word &word = source[pos];
    out << word[0] << " ";

    // close every range that ends at this position
    for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
      if (iter->second == pos) {
        out << "</tree> ";
      }
    }
  }
  out << endl;
}

View File

@ -0,0 +1,10 @@
#pragma once
#include <iostream>
#include <vector>
#include <string>
// A word is a vector of factors; usage in DeEn.cpp treats factor 0 as the
// surface form and factor 1 as the POS tag -- TODO confirm factor layout.
typedef std::vector<std::string> Word;
// A sentence is a sequence of factored words.
typedef std::vector<Word> Phrase;
// Write the sentence (surface factor only) to out, wrapping detected
// reordering-candidate spans in <tree label="reorder-label"> markup.
void LabelDeEn(const Phrase &source, std::ostream &out);

View File

@ -0,0 +1,13 @@
# Standalone build for the manual-label tool.
all: manual-label
clean:
	rm -f *.o manual-label
# Suffix rule: compile any .cpp against the Moses source tree root.
# NOTE(review): g++ clamps -O6 to its maximum optimization level (-O3).
.cpp.o:
	g++ -I../../../ -O6 -g -c $<
# Link; needs zlib and Boost.Program_options (multithreaded variant).
manual-label: DeEn.o manual-label.o
	g++ DeEn.o manual-label.o -lz -lboost_program_options-mt -o manual-label

View File

@ -0,0 +1,88 @@
#include <iostream>
#include <cstdlib>
#include <boost/program_options.hpp>
#include "moses/Util.h"
#include "DeEn.h"
using namespace std;
bool g_debug = false;
Phrase Tokenize(const string &line);
int main(int argc, char** argv)
{
cerr << "Starting" << endl;
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help", "Print help messages")
("add", "additional options")
("source-language,s", po::value<string>()->required(), "Source Language")
("target-language,t", po::value<string>()->required(), "Target Language");
po::variables_map vm;
try
{
po::store(po::parse_command_line(argc, argv, desc),
vm); // can throw
/** --help option
*/
if ( vm.count("help") )
{
std::cout << "Basic Command Line Parameter App" << std::endl
<< desc << std::endl;
return EXIT_SUCCESS;
}
po::notify(vm); // throws on error, so do after help in case
// there are any problems
}
catch(po::error& e)
{
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return EXIT_FAILURE;
}
string sourceLang = vm["source-language"].as<string>();
string targetLang = vm["target-language"].as<string>();
cerr << sourceLang << " " << targetLang << endl;
string line;
size_t lineNum = 1;
while (getline(cin, line)) {
//cerr << lineNum << ":" << line << endl;
if (lineNum % 1000 == 0) {
cerr << lineNum << " ";
}
Phrase source = Tokenize(line);
LabelDeEn(source, cout);
++lineNum;
}
cerr << "Finished" << endl;
return EXIT_SUCCESS;
}
// Split a whitespace-separated line into words, then split each word into
// its '|'-separated factors.
Phrase Tokenize(const string &line)
{
  Phrase phrase;
  const vector<string> tokens = Moses::Tokenize(line);
  for (vector<string>::const_iterator it = tokens.begin(); it != tokens.end(); ++it) {
    phrase.push_back(Moses::Tokenize(*it, "|"));
  }
  return phrase;
}

View File

@ -11,11 +11,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -64,11 +64,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -88,13 +88,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.401150096" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -12,12 +12,13 @@ Building the RPM SPEC file
The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information:
- The Git repository from which an installer will be built,
- The branch in the Git repository to build, and
- The branch in the Git repository to build,
- The location of Boost on the build machine, and
- The version of the installed Moses distribution.
For example, to build the RELEASE-1.0 branch in the mosesdecode repository (git://github.com/moses-smt/mosesdecoder.git):
For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0 -t /usr
This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.

View File

@ -1,11 +1,13 @@
#!/bin/bash
BRANCH="master"
BOOST="/usr"
declare -i NO_RPM_BUILD=0
declare -r RPM_VERSION_TAG="___RPM_VERSION__"
declare -r BOOST_TAG="___BOOST_LOCATION__"
function usage() {
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version] -t [Boost install: default ${BOOST}]"
exit 1
}
@ -13,11 +15,12 @@ if [ $# -lt 4 ]; then
usage
fi
while getopts r:b:v:nh OPTION
while getopts r:b:t:v:nh OPTION
do
case "$OPTION" in
r) REPO="${OPTARG}";;
b) BRANCH="${OPTARG}";;
t) BOOST="${OPTARG}";;
v) VERSION="${OPTARG}";;
n) NO_RPM_BUILD=1;;
[h\?]) usage;;
@ -53,7 +56,8 @@ if [ ${NO_RPM_BUILD} -eq 0 ]; then
if [ ! -d ${HOME}/rpmbuild/SPECS ]; then
mkdir -p ${HOME}/rpmbuild/SPECS
fi
eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
ESC_BOOST=`echo ${BOOST} | gawk '{gsub(/\//, "\\\\/"); print}'`
eval sed -e \"s/${RPM_VERSION_TAG}/${VERSION}/\" -e \"s/${BOOST_TAG}/${ESC_BOOST}/\" ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then
mkdir -p ${HOME}/rpmbuild/SOURCES
fi

View File

@ -8,7 +8,7 @@ License: LGPL
Group: Development/Tools
Vendor: Capita Translation and Interpreting
Packager: Ian Johnson <ian.johnson@capita-ti.com>
Requires: boost >= 1.48, python >= 2.6, perl >= 5
Requires: python >= 2.6, perl >= 5
BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
%description
Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
@ -35,16 +35,17 @@ cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
%build
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
./bjam --with-boost=___BOOST_LOCATION__ --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
%install
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
cp -R scripts/OSM $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/Transliteration $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
@ -52,12 +53,13 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
%files
%defattr(-,root,root)
/opt/moses/bin/*
/opt/moses/scripts/OSM/*
/opt/moses/scripts/Transliteration/*
/opt/moses/scripts/analysis/*
/opt/moses/scripts/ems/*
/opt/moses/scripts/generic/*
/opt/moses/scripts/other/*
/opt/moses/scripts/recaser/*
/opt/moses/scripts/regression-testing/*
/opt/moses/scripts/share/*
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*

View File

@ -106,7 +106,7 @@ class Moses():
scores = line[2].split()
if len(scores) <self.number_of_features:
sys.stderr.write('Error: model only has {0} features. Expected {1}.\n'.format(len(scores),self.number_of_features))
exit()
exit(1)
scores = scores[:self.number_of_features]
model_probabilities = map(float,scores)
@ -179,7 +179,7 @@ class Moses():
reordering_probabilities[j][i] = p
except IndexError:
sys.stderr.write('\nIndexError: Did you correctly specify the number of reordering features? (--number_of_features N in command line)\n')
exit()
exit(1)
def traverse_incrementally(self,table,models,load_lines,store_flag,mode='interpolate',inverted=False,lowmem=False,flags=None):
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
@ -307,13 +307,13 @@ class Moses():
elif len(line) == 4:
if self.require_alignment:
sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n')
exit()
exit(1)
self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')]
else:
sys.stderr.write('Error: unexpected phrase table format. Are you using a very old/new version of Moses with different formatting?\n')
exit()
exit(1)
def get_word_alignments(self,src,target,cache=False,mycache={}):
@ -515,7 +515,7 @@ class TigerXML():
if not src or not target:
sys.stderr.write('Error: Source and/or target language not specified. Required for TigerXML extraction.\n')
exit()
exit(1)
alignments = self._get_aligned_ids(src,target)
self._textualize_alignments(src,target,alignments)
@ -1261,7 +1261,7 @@ def handle_file(filename,action,fileobj=None,mode='r'):
sys.stderr.write('For a weighted counts combination, we need statistics that Moses doesn\'t write to disk by default.\n')
sys.stderr.write('Repeat step 4 of Moses training for all models with the option -write-lexical-counts.\n')
exit()
exit(1)
if filename.endswith('.gz'):
fileobj = gzip.open(filename,mode)
@ -1435,7 +1435,7 @@ class Combine_TMs():
if mode not in ['interpolate','loglinear','counts']:
sys.stderr.write('Error: mode must be either "interpolate", "loglinear" or "counts"\n')
sys.exit()
sys.exit(1)
models,number_of_features,weights = self._sanity_checks(models,number_of_features,weights)

View File

@ -1,4 +1,6 @@
#include "lm/bhiksha.hh"
#include "lm/binary_format.hh"
#include "lm/config.hh"
#include "util/file.hh"
#include "util/exception.hh"
@ -15,11 +17,11 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_
const uint8_t kArrayBhikshaVersion = 0;
// TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) {
uint8_t version;
uint8_t configured_bits;
util::ReadOrThrow(fd, &version, 1);
util::ReadOrThrow(fd, &configured_bits, 1);
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
uint8_t buffer[2];
file.ReadForConfig(buffer, 2, offset);
uint8_t version = buffer[0];
uint8_t configured_bits = buffer[1];
if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
config.pointer_bhiksha_bits = configured_bits;
}
@ -87,9 +89,6 @@ void ArrayBhiksha::FinishedLoading(const Config &config) {
*(head_write++) = config.pointer_bhiksha_bits;
}
void ArrayBhiksha::LoadedBinary() {
}
} // namespace trie
} // namespace ngram
} // namespace lm

View File

@ -24,6 +24,7 @@
namespace lm {
namespace ngram {
struct Config;
class BinaryFormat;
namespace trie {
@ -31,7 +32,7 @@ class DontBhiksha {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {}
static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {}
static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
@ -53,8 +54,6 @@ class DontBhiksha {
void FinishedLoading(const Config &/*config*/) {}
void LoadedBinary() {}
uint8_t InlineBits() const { return next_.bits; }
private:
@ -65,7 +64,7 @@ class ArrayBhiksha {
public:
static const ModelType kModelTypeAdd = kArrayAdd;
static void UpdateConfigFromBinary(int fd, Config &config);
static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
@ -93,8 +92,6 @@ class ArrayBhiksha {
void FinishedLoading(const Config &config);
void LoadedBinary();
uint8_t InlineBits() const { return next_inline_.bits; }
private:

View File

@ -14,6 +14,9 @@
namespace lm {
namespace ngram {
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
@ -58,8 +61,6 @@ struct Sanity {
}
};
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
std::size_t TotalHeaderSize(unsigned char order) {
return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
}
@ -81,83 +82,6 @@ void WriteHeader(void *to, const Parameters &params) {
} // namespace
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
std::size_t total = TotalHeaderSize(order) + memory_size;
backing.file.reset(util::CreateOrThrow(config.write_mmap));
if (config.write_method == Config::WRITE_MMAP) {
backing.vocab.reset(util::MapZeroedWrite(backing.file.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
} else {
util::ResizeOrThrow(backing.file.get(), 0);
util::MapAnonymous(total, backing.vocab);
}
strncpy(reinterpret_cast<char*>(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order));
return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order);
} else {
util::MapAnonymous(memory_size, backing.vocab);
return reinterpret_cast<uint8_t*>(backing.vocab.get());
}
}
// Extend the backing storage to make room for the search data structures,
// which are laid out after the (possibly padded) vocabulary table.  Returns
// the address where the search structures should be written.
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
  std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
  if (config.write_mmap) {
    // Grow the file to accommodate the search, using zeros.
    try {
      util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
    } catch (util::ErrnoException &e) {
      // Tag the exception with the file name for a better error message.
      e << " for file " << config.write_mmap;
      throw e;
    }
    if (config.write_method == Config::WRITE_AFTER) {
      // WRITE_AFTER builds in anonymous memory; the file is written later.
      util::MapAnonymous(memory_size, backing.search);
      return reinterpret_cast<uint8_t*>(backing.search.get());
    }
    // mmap it now.
    // We're skipping over the header and vocab for the search space mmap.
    // mmap likes page aligned offsets, so some arithmetic to round the
    // offset down to a page boundary and compensate in the returned pointer.
    std::size_t page_size = util::SizePage();
    std::size_t alignment_cruft = adjusted_vocab % page_size;
    backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
    return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
  } else {
    // Memory-only build.
    util::MapAnonymous(memory_size, backing.search);
    return reinterpret_cast<uint8_t*>(backing.search.get());
  }
}
// Flush the built model to disk (when file-backed) and write the header
// last, so that an interrupted build never leaves a file whose header
// claims it is complete.
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
  if (!config.write_mmap) return;
  switch (config.write_method) {
    case Config::WRITE_MMAP:
      // Data was written through the mappings; just sync them to disk.
      util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
      util::SyncOrThrow(backing.search.get(), backing.search.size());
      break;
    case Config::WRITE_AFTER:
      // Data lives in anonymous memory; dump vocab then search to the file.
      util::SeekOrThrow(backing.file.get(), 0);
      util::WriteOrThrow(backing.file.get(), backing.vocab.get(), backing.vocab.size());
      util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad);
      util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size());
      util::FSyncOrThrow(backing.file.get());
      break;
  }
  // header and vocab share the same mmap.  The header is written here because we know the counts.
  Parameters params = Parameters();
  params.counts = counts;
  params.fixed.order = counts.size();
  params.fixed.probing_multiplier = config.probing_multiplier;
  params.fixed.model_type = model_type;
  params.fixed.has_vocabulary = config.include_vocab;
  params.fixed.search_version = search_version;
  WriteHeader(backing.vocab.get(), params);
  if (config.write_method == Config::WRITE_AFTER) {
    // The header only exists in anonymous memory; push it into the file too.
    util::SeekOrThrow(backing.file.get(), 0);
    util::WriteOrThrow(backing.file.get(), backing.vocab.get(), TotalHeaderSize(counts.size()));
  }
}
namespace detail {
bool IsBinaryFormat(int fd) {
const uint64_t size = util::SizeFile(fd);
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
@ -209,44 +133,164 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
}
void SeekPastHeader(int fd, const Parameters &params) {
util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
// Capture write/load settings from the Config.  Sizes and offsets start as
// sentinel "invalid" values until Initialize/Setup calls fill them in.
BinaryFormat::BinaryFormat(const Config &config)
  : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
    header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}
// Adopt an already-open binary model file (takes ownership of fd): read and
// validate its header, then record where the header ends.
void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params) {
  file_.reset(fd);
  write_mmap_ = NULL; // Ignore write requests; this is already in binary format.
  ReadHeader(fd, params);
  MatchCheck(model_type, search_version, params);
  header_size_ = TotalHeaderSize(params.counts.size());
}
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing) {
const uint64_t file_size = util::SizeFile(backing.file.get());
// pread from the binary file at an offset relative to the end of the
// header.  Used to peek at parts of the file while updating the Config
// before the full size is known.  Requires InitializeBinary to have run.
void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
  assert(header_size_ != kInvalidSize);
  util::PReadOrThrow(file_.get(), to, amount, offset_excluding_header + header_size_);
}
void *BinaryFormat::LoadBinary(std::size_t size) {
assert(header_size_ != kInvalidSize);
const uint64_t file_size = util::SizeFile(file_.get());
// The header is smaller than a page, so we have to map the whole header as well.
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size);
UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search);
util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_);
if (config.enumerate_vocab && !params.fixed.has_vocabulary)
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
// Seek to vocabulary words
util::SeekOrThrow(backing.file.get(), total_map);
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
vocab_string_offset_ = total_map;
return reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
}
void ComplainAboutARPA(const Config &config, ModelType model_type) {
if (config.write_mmap || !config.messages) return;
if (config.arpa_complain == Config::ALL) {
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
} else if (config.arpa_complain == Config::EXPENSIVE &&
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
// Reserve memory for the vocabulary lookup table, optionally backed by the
// output file (with header space in front).  Returns the address where the
// vocab table should be written.
void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
  vocab_size_ = memory_size;
  if (!write_mmap_) {
    // Memory-only build: no header.
    header_size_ = 0;
    util::MapAnonymous(memory_size, memory_vocab_);
    return reinterpret_cast<uint8_t*>(memory_vocab_.get());
  }
  header_size_ = TotalHeaderSize(order);
  std::size_t total = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size));
  file_.reset(util::CreateOrThrow(write_mmap_));
  // some gccs complain about uninitialized variables even though all enum values are covered.
  void *vocab_base = NULL;
  switch (write_method_) {
    case Config::WRITE_MMAP:
      // Map the zeroed file directly; writes go straight to disk.
      mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
      vocab_base = mapping_.get();
      break;
    case Config::WRITE_AFTER:
      // Build in anonymous memory; the file is populated later.
      util::ResizeOrThrow(file_.get(), 0);
      util::MapAnonymous(total, memory_vocab_);
      vocab_base = memory_vocab_.get();
      break;
  }
  // Stamp the incomplete-file magic so a crash mid-build is detectable;
  // strncpy NUL-pads the remainder of the header region.
  strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
  return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
}
// Make room for the search data structures after the (padded) vocabulary
// table.  May remap the file, which is why vocab_base is passed by
// reference and can change.  Returns where the search structures go.
void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) {
  assert(vocab_size_ != kInvalidSize);
  vocab_pad_ = vocab_pad;
  std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
  vocab_string_offset_ = new_size;
  if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
    // Search lives in its own anonymous mapping; vocab stays where it was.
    util::MapAnonymous(memory_size, memory_search_);
    assert(header_size_ == 0 || write_mmap_);
    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
    return reinterpret_cast<uint8_t*>(memory_search_.get());
  }
  assert(write_method_ == Config::WRITE_MMAP);
  // Also known as total size without vocab words.
  // Grow the file to accommodate the search, using zeros.
  // According to man mmap, behavior is undefined when the file is resized
  // underneath a mmap that is not a multiple of the page size.  So to be
  // safe, we'll unmap it and map it again.
  mapping_.reset();
  util::ResizeOrThrow(file_.get(), new_size);
  void *ret;
  MapFile(vocab_base, ret);
  return ret;
}
// Append the vocabulary word strings after the search section.  Remapping
// can move both the vocab table and the search area, so both base pointers
// are passed by reference and updated.
void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) {
  // Checking Config's include_vocab is the responsibility of the caller.
  assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize);
  if (!write_mmap_) {
    // Unchanged base.
    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get());
    search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
    return;
  }
  if (write_method_ == Config::WRITE_MMAP) {
    // Unmap before appending to the file (see GrowForSearch for why
    // resizing under a live mapping is avoided).
    mapping_.reset();
  }
  util::SeekOrThrow(file_.get(), VocabStringReadingOffset());
  util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
  if (write_method_ == Config::WRITE_MMAP) {
    MapFile(vocab_base, search_base);
  } else {
    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
    search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
  }
}
} // namespace detail
// Flush the built model to disk and write the header last, so that an
// interrupted build never leaves a file whose header claims completeness.
// No-op for memory-only builds.
void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) {
  if (!write_mmap_) return;
  // First sync/write the body of the file (vocab table + search structures).
  switch (write_method_) {
    case Config::WRITE_MMAP:
      util::SyncOrThrow(mapping_.get(), mapping_.size());
      break;
    case Config::WRITE_AFTER:
      // Everything was built in anonymous memory; dump it to the file now.
      util::SeekOrThrow(file_.get(), 0);
      util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size());
      util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_);
      util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size());
      util::FSyncOrThrow(file_.get());
      break;
  }
  // header and vocab share the same mmap.
  Parameters params = Parameters();
  // Zero only the fixed-width (POD) part so padding bytes in the written
  // header are deterministic.  A memset over all of Parameters would clobber
  // the internals of the counts vector, which is undefined behavior.
  memset(&params.fixed, 0, sizeof(params.fixed));
  params.counts = counts;
  params.fixed.order = counts.size();
  params.fixed.probing_multiplier = config.probing_multiplier;
  params.fixed.model_type = model_type;
  params.fixed.has_vocabulary = config.include_vocab;
  params.fixed.search_version = search_version;
  switch (write_method_) {
    case Config::WRITE_MMAP:
      // Header goes through the mapping; sync again so it reaches the disk
      // after the body did.
      WriteHeader(mapping_.get(), params);
      util::SyncOrThrow(mapping_.get(), mapping_.size());
      break;
    case Config::WRITE_AFTER:
      {
        // Serialize the header into a temporary buffer, then overwrite the
        // incomplete-magic stamp at the start of the file.
        std::vector<uint8_t> buffer(TotalHeaderSize(counts.size()));
        WriteHeader(&buffer[0], params);
        util::SeekOrThrow(file_.get(), 0);
        util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
      }
      break;
  }
}
// (Re)map the file from the beginning through the end of the search
// section, then hand back the vocab table and search base addresses
// inside the new mapping.
void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) {
  mapping_.reset(util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()), vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED);
  vocab_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
  search_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_ + vocab_size_ + vocab_pad_;
}
bool RecognizeBinary(const char *file, ModelType &recognized) {
util::scoped_fd fd(util::OpenReadOrThrow(file));
if (!detail::IsBinaryFormat(fd.get())) return false;
if (!IsBinaryFormat(fd.get())) {
return false;
}
Parameters params;
detail::ReadHeader(fd.get(), params);
ReadHeader(fd.get(), params);
recognized = params.fixed.model_type;
return true;
}

View File

@ -17,6 +17,8 @@
namespace lm {
namespace ngram {
extern const char *kModelNames[6];
/*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
@ -42,67 +44,63 @@ struct Parameters {
std::vector<uint64_t> counts;
};
struct Backing {
// File behind memory, if any.
util::scoped_fd file;
// Vocabulary lookup table. Not to be confused with the vocab words themselves.
util::scoped_memory vocab;
// Raw block of memory backing the language model data structures
util::scoped_memory search;
// Manages the storage behind a model: a binary file being read, a binary
// file being written, or anonymous memory.  The on-disk layout is:
// header, vocab lookup table, optional padding, search structures, then
// the vocabulary word strings.
class BinaryFormat {
  public:
    explicit BinaryFormat(const Config &config);

    // Reading a binary file:
    // Takes ownership of fd
    void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);
    // Used to read parts of the file to update the config object before figuring out full size.
    void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
    // Actually load the binary file and return a pointer to the beginning of the search area.
    void *LoadBinary(std::size_t size);

    // File offset where the vocabulary word strings begin (end of search).
    uint64_t VocabStringReadingOffset() const {
      assert(vocab_string_offset_ != kInvalidOffset);
      return vocab_string_offset_;
    }

    // Writing a binary file or initializing in RAM from ARPA:
    // Size for vocabulary.
    void *SetupJustVocab(std::size_t memory_size, uint8_t order);
    // Warning: can change the vocabulary base pointer.
    void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
    // Warning: can change vocabulary and search base addresses.
    void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
    // Write the header at the beginning of the file.
    void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);

  private:
    void MapFile(void *&vocab_base, void *&search_base);

    // Copied from configuration.
    const Config::WriteMethod write_method_;
    const char *write_mmap_;
    util::LoadMethod load_method_;

    // File behind memory, if any.
    util::scoped_fd file_;

    // If there is a file involved, a single mapping.
    util::scoped_memory mapping_;

    // If the data is only in memory, separately allocate each because the trie
    // knows vocab's size before it knows search's size (because SRILM might
    // have pruned).
    util::scoped_memory memory_vocab_, memory_search_;

    // Memory ranges.  Note that these may not be contiguous and may not all
    // exist.
    std::size_t header_size_, vocab_size_, vocab_pad_;
    // aka end of search.
    uint64_t vocab_string_offset_;

    static const uint64_t kInvalidOffset = (uint64_t)-1;
};
// Create just enough of a binary file to write vocabulary to it.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing);
// Write header to binary file. This is done last to prevent incomplete files
// from loading.
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing);
namespace detail {
bool IsBinaryFormat(int fd);
void ReadHeader(int fd, Parameters &params);
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params);
void SeekPastHeader(int fd, const Parameters &params);
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing);
void ComplainAboutARPA(const Config &config, ModelType model_type);
} // namespace detail
// Load a model from a file, dispatching between the fast binary path and
// the ARPA text path.  To is the concrete model class being constructed.
template <class To> void LoadLM(const char *file, const Config &config, To &to) {
  Backing &backing = to.MutableBacking();
  backing.file.reset(util::OpenReadOrThrow(file));
  try {
    if (detail::IsBinaryFormat(backing.file.get())) {
      Parameters params;
      detail::ReadHeader(backing.file.get(), params);
      detail::MatchCheck(To::kModelType, To::kVersion, params);
      // Replace the run-time configured probing_multiplier with the one in the file.
      Config new_config(config);
      new_config.probing_multiplier = params.fixed.probing_multiplier;
      detail::SeekPastHeader(backing.file.get(), params);
      To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config);
      uint64_t memory_size = To::Size(params.counts, new_config);
      uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
      to.InitializeFromBinary(start, params, new_config, backing.file.get());
    } else {
      // Not binary: possibly warn that ARPA loading is slow, then parse it.
      detail::ComplainAboutARPA(config, To::kModelType);
      to.InitializeFromARPA(file, config);
    }
  } catch (util::Exception &e) {
    // Tag the exception with the file name for a better error message.
    e << " File: " << file;
    throw;
  }
}
} // namespace ngram
} // namespace lm
#endif // LM_BINARY_FORMAT__

View File

@ -87,7 +87,7 @@ class VocabHandout {
Table table_;
std::size_t double_cutoff_;
util::FakeOFStream word_list_;
};
@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_);
}
private:
const std::size_t size_;
};
@ -106,11 +106,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_);
}
}
private:
const std::size_t size_;
};
@ -131,7 +131,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
class Writer {
public:
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@ -140,7 +140,7 @@ class Writer {
dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) {
// Add special words. AdjustCounts is responsible if order != 1.
// Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK);
AddUnigramWord(kBOS);
}
@ -170,16 +170,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return;
}
// Complete the write.
// Complete the write.
gram_.Count() = 1;
// Prepare the next n-gram.
// Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_);
gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin());
return;
}
// Block end. Need to store the context in a temporary buffer.
// Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear();
block_->SetValidSize(block_size_);
@ -207,7 +207,7 @@ class Writer {
// Hash table combiner implementation.
Dedupe dedupe_;
// Small buffer to hold existing ngrams when shifting across a block boundary.
// Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_;
const std::size_t block_size_;
@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
return VocabHandout::MemUsage(vocab_estimate);
}
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
: from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
@ -240,7 +240,10 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
uint64_t count = 0;
bool delimiters[256];
memset(delimiters, 0, sizeof(delimiters));
delimiters['\0'] = delimiters['\t'] = delimiters['\n'] = delimiters['\r'] = delimiters[' '] = true;
const char kDelimiterSet[] = "\0\t\n\r ";
for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) {
delimiters[static_cast<unsigned char>(*i)] = true;
}
try {
while(true) {
StringPiece line(from_.ReadLine());

View File

@ -33,12 +33,12 @@ class Callback {
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
probs_[order_minus_1 + 1] = pay.complete.prob;
pay.complete.prob = log10(pay.complete.prob);
// TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check valid pointer at tht end
// TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
++backoffs_[order_minus_1];
} else {
// Not a context.
// Not a context.
pay.complete.backoff = 0.0;
}
}
@ -52,7 +52,7 @@ class Callback {
};
} // namespace
Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
: uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
// perform order-wise interpolation

View File

@ -11,11 +11,7 @@ Config::Config() :
enumerate_vocab(NULL),
unknown_missing(COMPLAIN),
sentence_marker_missing(THROW_UP),
#if defined(_WIN32) || defined(_WIN64)
positive_log_probability(SILENT),
#else
positive_log_probability(THROW_UP),
#endif
unknown_missing_logprob(-100.0),
probing_multiplier(1.5),
building_memory(1073741824ULL), // 1 GB

View File

@ -17,14 +17,14 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
typedef VocabularyT Vocabulary;
/* Translate from void* to State */
FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->FullScore(
*reinterpret_cast<const State*>(in_state),
new_word,
*reinterpret_cast<State*>(out_state));
}
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->FullScoreForgotState(
context_rbegin,
context_rend,
@ -37,7 +37,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}
float Score(const void *in_state, const WordIndex new_word, void *out_state) const {
float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->Score(
*reinterpret_cast<const State*>(in_state),
new_word,

View File

@ -14,10 +14,6 @@
#include <string>
#include <vector>
#if !defined __MINGW32__
#include <err.h>
#endif
#include <string.h>
#include <stdint.h>

View File

@ -5,27 +5,18 @@
#include <iostream>
#include <string>
#if !defined __MINGW32__
#include <err.h>
#endif
#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
namespace lm {
class CountOutput : boost::noncopyable {
public:
explicit CountOutput(const char *name) : file_(name, std::ios::out) {}
explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {}
void AddNGram(const StringPiece &line) {
if (!(file_ << line << '\n')) {
#if defined __MINGW32__
std::cerr<<"Writing counts file failed"<<std::endl;
exit(3);
#else
err(3, "Writing counts file failed");
#endif
}
file_ << line << '\n';
}
template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
@ -37,12 +28,12 @@ class CountOutput : boost::noncopyable {
}
private:
std::fstream file_;
util::FakeOFStream file_;
};
class CountBatch {
public:
explicit CountBatch(std::streamsize initial_read)
explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) {
buffer_.reserve(initial_read);
}
@ -75,7 +66,7 @@ class CountBatch {
private:
std::streamsize initial_read_;
// This could have been a std::string but that's less happy with raw writes.
// This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_;
};

View File

@ -6,6 +6,7 @@
#endif
#include "lm/filter/vocab.hh"
#include "lm/filter/wrapper.hh"
#include "util/exception.hh"
#include "util/file_piece.hh"
#include <boost/ptr_container/ptr_vector.hpp>
@ -57,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
struct Config {
Config() :
Config() :
#ifndef NTHREAD
batch_size(25000),
threads(boost::thread::hardware_concurrency()),
@ -157,102 +158,96 @@ template <class Format> void DispatchFilterModes(const Config &config, std::istr
} // namespace lm
int main(int argc, char *argv[]) {
if (argc < 4) {
lm::DisplayHelp(argv[0]);
return 1;
}
// I used to have boost::program_options, but some users didn't want to compile boost.
lm::Config config;
config.mode = lm::MODE_UNSET;
for (int i = 1; i < argc - 2; ++i) {
const char *str = argv[i];
if (!std::strcmp(str, "copy")) {
config.mode = lm::MODE_COPY;
} else if (!std::strcmp(str, "single")) {
config.mode = lm::MODE_SINGLE;
} else if (!std::strcmp(str, "multiple")) {
config.mode = lm::MODE_MULTIPLE;
} else if (!std::strcmp(str, "union")) {
config.mode = lm::MODE_UNION;
} else if (!std::strcmp(str, "phrase")) {
config.phrase = true;
} else if (!std::strcmp(str, "context")) {
config.context = true;
} else if (!std::strcmp(str, "arpa")) {
config.format = lm::FORMAT_ARPA;
} else if (!std::strcmp(str, "raw")) {
config.format = lm::FORMAT_COUNT;
#ifndef NTHREAD
} else if (!std::strncmp(str, "threads:", 8)) {
config.threads = boost::lexical_cast<size_t>(str + 8);
if (!config.threads) {
std::cerr << "Specify at least one thread." << std::endl;
return 1;
}
} else if (!std::strncmp(str, "batch_size:", 11)) {
config.batch_size = boost::lexical_cast<size_t>(str + 11);
if (config.batch_size < 5000) {
std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
if (!config.batch_size) return 1;
}
#endif
} else {
try {
if (argc < 4) {
lm::DisplayHelp(argv[0]);
return 1;
}
}
if (config.mode == lm::MODE_UNSET) {
lm::DisplayHelp(argv[0]);
return 1;
}
if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl;
return 1;
}
bool cmd_is_model = true;
const char *cmd_input = argv[argc - 2];
if (!strncmp(cmd_input, "vocab:", 6)) {
cmd_is_model = false;
cmd_input += 6;
} else if (!strncmp(cmd_input, "model:", 6)) {
cmd_input += 6;
} else if (strchr(cmd_input, ':')) {
#if defined __MINGW32__
std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
exit(1);
#else
errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
#endif // defined
} else {
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
}
std::ifstream cmd_file;
std::istream *vocab;
if (cmd_is_model) {
vocab = &std::cin;
} else {
cmd_file.open(cmd_input, std::ios::in);
if (!cmd_file) {
#if defined __MINGW32__
std::cerr << "Could not open input file " << cmd_input << std::endl;
exit(2);
#else
err(2, "Could not open input file %s", cmd_input);
#endif // defined
// I used to have boost::program_options, but some users didn't want to compile boost.
lm::Config config;
config.mode = lm::MODE_UNSET;
for (int i = 1; i < argc - 2; ++i) {
const char *str = argv[i];
if (!std::strcmp(str, "copy")) {
config.mode = lm::MODE_COPY;
} else if (!std::strcmp(str, "single")) {
config.mode = lm::MODE_SINGLE;
} else if (!std::strcmp(str, "multiple")) {
config.mode = lm::MODE_MULTIPLE;
} else if (!std::strcmp(str, "union")) {
config.mode = lm::MODE_UNION;
} else if (!std::strcmp(str, "phrase")) {
config.phrase = true;
} else if (!std::strcmp(str, "context")) {
config.context = true;
} else if (!std::strcmp(str, "arpa")) {
config.format = lm::FORMAT_ARPA;
} else if (!std::strcmp(str, "raw")) {
config.format = lm::FORMAT_COUNT;
#ifndef NTHREAD
} else if (!std::strncmp(str, "threads:", 8)) {
config.threads = boost::lexical_cast<size_t>(str + 8);
if (!config.threads) {
std::cerr << "Specify at least one thread." << std::endl;
return 1;
}
} else if (!std::strncmp(str, "batch_size:", 11)) {
config.batch_size = boost::lexical_cast<size_t>(str + 11);
if (config.batch_size < 5000) {
std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
if (!config.batch_size) return 1;
}
#endif
} else {
lm::DisplayHelp(argv[0]);
return 1;
}
}
vocab = &cmd_file;
}
util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
if (config.mode == lm::MODE_UNSET) {
lm::DisplayHelp(argv[0]);
return 1;
}
if (config.format == lm::FORMAT_ARPA) {
lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
} else if (config.format == lm::FORMAT_COUNT) {
lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl;
return 1;
}
bool cmd_is_model = true;
const char *cmd_input = argv[argc - 2];
if (!strncmp(cmd_input, "vocab:", 6)) {
cmd_is_model = false;
cmd_input += 6;
} else if (!strncmp(cmd_input, "model:", 6)) {
cmd_input += 6;
} else if (strchr(cmd_input, ':')) {
std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
return 1;
} else {
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
}
std::ifstream cmd_file;
std::istream *vocab;
if (cmd_is_model) {
vocab = &std::cin;
} else {
cmd_file.open(cmd_input, std::ios::in);
UTIL_THROW_IF(!cmd_file, util::ErrnoException, "Failed to open " << cmd_input);
vocab = &cmd_file;
}
util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
if (config.format == lm::FORMAT_ARPA) {
lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
} else if (config.format == lm::FORMAT_COUNT) {
lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
}
return 0;
} catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
return 0;
}

View File

@ -1,5 +1,5 @@
#ifndef LM_FILTER_FORMAT_H__
#define LM_FITLER_FORMAT_H__
#define LM_FILTER_FORMAT_H__
#include "lm/filter/arpa_io.hh"
#include "lm/filter/count_io.hh"

View File

@ -5,10 +5,6 @@
#include <ctype.h>
#if !defined __MINGW32__
#include <err.h>
#endif
namespace lm {
namespace vocab {
@ -34,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
}// namespace
// Read space separated words in enter separated lines. These lines can be
// very long, so don't read an entire line at a time.
// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
in.exceptions(std::istream::badbit);
unsigned int sentence = 0;

View File

@ -34,8 +34,47 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
if (static_cast<std::size_t>(start - static_cast<uint8_t*>(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast<uint8_t*>(base)) << " but Size says they should take " << goal_size);
}
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
LoadLM(file, config, *this);
namespace {
void ComplainAboutARPA(const Config &config, ModelType model_type) {
if (config.write_mmap || !config.messages) return;
if (config.arpa_complain == Config::ALL) {
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
} else if (config.arpa_complain == Config::EXPENSIVE &&
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
}
}
void CheckCounts(const std::vector<uint64_t> &counts) {
UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
if (sizeof(uint64_t) > sizeof(std::size_t)) {
for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) {
UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines.");
}
}
}
} // namespace
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &init_config) : backing_(init_config) {
util::scoped_fd fd(util::OpenReadOrThrow(file));
if (IsBinaryFormat(fd.get())) {
Parameters parameters;
int fd_shallow = fd.release();
backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters);
CheckCounts(parameters.counts);
Config new_config(init_config);
new_config.probing_multiplier = parameters.fixed.probing_multiplier;
Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config);
UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config);
vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset());
} else {
ComplainAboutARPA(init_config, kModelType);
InitializeFromARPA(fd.release(), file, init_config);
}
// g++ prints warnings unless these are fully initialized.
State begin_sentence = State();
@ -50,27 +89,9 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
P::Init(begin_sentence, null_context, vocab_, search_.Order());
}
namespace {
void CheckCounts(const std::vector<uint64_t> &counts) {
UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
if (sizeof(uint64_t) > sizeof(std::size_t)) {
for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) {
UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines.");
}
}
}
} // namespace
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
CheckCounts(params.counts);
SetupMemory(start, params.counts, config);
vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
search_.LoadedBinary();
}
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(int fd, const char *file, const Config &config) {
// Backing file is the ARPA.
util::FilePiece f(fd, file, config.ProgressMessages());
try {
std::vector<uint64_t> counts;
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
@ -81,13 +102,17 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
vocab_.SetupMemory(backing_.SetupJustVocab(vocab_size, counts.size()), vocab_size, counts[0], config);
if (config.write_mmap) {
if (config.write_mmap && config.include_vocab) {
WriteWordsWrapper wrap(config.enumerate_vocab);
vocab_.ConfigureEnumerate(&wrap, counts[0]);
search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + Search::Size(counts, config));
void *vocab_rebase, *search_rebase;
backing_.WriteVocabWords(wrap.Buffer(), vocab_rebase, search_rebase);
// Due to writing at the end of file, mmap may have relocated data. So remap.
vocab_.Relocate(vocab_rebase);
search_.SetupMemory(reinterpret_cast<uint8_t*>(search_rebase), counts, config);
} else {
vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]);
search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
@ -99,18 +124,13 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
search_.UnknownUnigram().backoff = 0.0;
search_.UnknownUnigram().prob = config.unknown_missing_logprob;
}
FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_);
backing_.FinishFile(config, kModelType, kVersion, counts);
} catch (util::Exception &e) {
e << " Byte: " << f.Offset();
throw;
}
}
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
Search::UpdateConfigFromBinary(fd, counts, config);
}
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) {

View File

@ -104,10 +104,6 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
}
private:
friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
// Score bigrams and above. Do not include backoff.
@ -116,15 +112,11 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
// Appears after Size in the cc file.
void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);
void InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd);
void InitializeFromARPA(const char *file, const Config &config);
void InitializeFromARPA(int fd, const char *file, const Config &config);
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
Backing &MutableBacking() { return backing_; }
Backing backing_;
BinaryFormat backing_;
VocabularyT vocab_;

View File

@ -360,10 +360,11 @@ BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) {
LoadingTest<QuantArrayTrieModel>();
}
template <class ModelT> void BinaryTest() {
template <class ModelT> void BinaryTest(Config::WriteMethod write_method) {
Config config;
config.write_mmap = "test.binary";
config.messages = NULL;
config.write_method = write_method;
ExpectEnumerateVocab enumerate;
config.enumerate_vocab = &enumerate;
@ -406,6 +407,11 @@ template <class ModelT> void BinaryTest() {
unlink("test_nounk.binary");
}
template <class ModelT> void BinaryTest() {
BinaryTest<ModelT>(Config::WRITE_MMAP);
BinaryTest<ModelT>(Config::WRITE_AFTER);
}
BOOST_AUTO_TEST_CASE(write_and_read_probing) {
BinaryTest<ProbingModel>();
}

View File

@ -38,13 +38,13 @@ const char kSeparatelyQuantizeVersion = 2;
} // namespace
void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
char version;
util::ReadOrThrow(fd, &version, 1);
util::ReadOrThrow(fd, &config.prob_bits, 1);
util::ReadOrThrow(fd, &config.backoff_bits, 1);
void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
unsigned char buffer[3];
file.ReadForConfig(buffer, 3, offset);
char version = buffer[0];
config.prob_bits = buffer[1];
config.backoff_bits = buffer[2];
if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
util::AdvanceOrThrow(fd, -3);
}
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {

View File

@ -18,12 +18,13 @@ namespace lm {
namespace ngram {
struct Config;
class BinaryFormat;
/* Store values directly and don't quantize. */
class DontQuantize {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &) {}
static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; }
static uint8_t MiddleBits(const Config &/*config*/) { return 63; }
static uint8_t LongestBits(const Config &/*config*/) { return 31; }
@ -136,7 +137,7 @@ class SeparatelyQuantize {
public:
static const ModelType kModelTypeAdd = kQuantAdd;
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
static uint64_t Size(uint8_t order, const Config &config) {
uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float);

View File

@ -204,9 +204,10 @@ template <class Build, class Activate, class Store> void ReadNGrams(
namespace detail {
template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
std::size_t allocated = Unigram::Size(counts[0]);
unigram_ = Unigram(start, counts[0], allocated);
start += allocated;
unigram_ = Unigram(start, counts[0]);
start += Unigram::Size(counts[0]);
std::size_t allocated;
middle_.clear();
for (unsigned int n = 2; n < counts.size(); ++n) {
allocated = Middle::Size(counts[n - 1], config.probing_multiplier);
middle_.push_back(Middle(start, allocated));
@ -218,9 +219,21 @@ template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start,
return start;
}
template <class Value> void HashedSearch<Value>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing) {
// TODO: fix sorted.
SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config);
/*template <class Value> void HashedSearch<Value>::Relocate(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
unigram_ = Unigram(start, counts[0]);
start += Unigram::Size(counts[0]);
for (unsigned int n = 2; n < counts.size(); ++n) {
middle[n-2].Relocate(start);
start += Middle::Size(counts[n - 1], config.probing_multiplier)
}
longest_.Relocate(start);
}*/
template <class Value> void HashedSearch<Value>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing) {
void *vocab_rebase;
void *search_base = backing.GrowForSearch(Size(counts, config), vocab.UnkCountChangePadding(), vocab_rebase);
vocab.Relocate(vocab_rebase);
SetupMemory(reinterpret_cast<uint8_t*>(search_base), counts, config);
PositiveProbWarn warn(config.positive_log_probability);
Read1Grams(f, counts[0], vocab, unigram_.Raw(), warn);
@ -277,14 +290,6 @@ template <class Value> template <class Build> void HashedSearch<Value>::ApplyBui
ReadEnd(f);
}
template <class Value> void HashedSearch<Value>::LoadedBinary() {
unigram_.LoadedBinary();
for (typename std::vector<Middle>::iterator i = middle_.begin(); i != middle_.end(); ++i) {
i->LoadedBinary();
}
longest_.LoadedBinary();
}
template class HashedSearch<BackoffValue>;
template class HashedSearch<RestValue>;

View File

@ -18,7 +18,7 @@ namespace util { class FilePiece; }
namespace lm {
namespace ngram {
struct Backing;
class BinaryFormat;
class ProbingVocabulary;
namespace detail {
@ -72,7 +72,7 @@ template <class Value> class HashedSearch {
static const unsigned int kVersion = 0;
// TODO: move probing_multiplier here with next binary file format update.
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static void UpdateConfigFromBinary(const BinaryFormat &, const std::vector<uint64_t> &, uint64_t, Config &) {}
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
uint64_t ret = Unigram::Size(counts[0]);
@ -84,9 +84,7 @@ template <class Value> class HashedSearch {
uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config);
void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing);
void LoadedBinary();
void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing);
unsigned char Order() const {
return middle_.size() + 2;
@ -148,7 +146,7 @@ template <class Value> class HashedSearch {
public:
Unigram() {}
Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
Unigram(void *start, uint64_t count) :
unigram_(static_cast<typename Value::Weights*>(start))
#ifdef DEBUG
, count_(count)
@ -168,8 +166,6 @@ template <class Value> class HashedSearch {
typename Value::Weights &Unknown() { return unigram_[0]; }
void LoadedBinary() {}
// For building.
typename Value::Weights *Raw() { return unigram_; }

View File

@ -459,7 +459,7 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
} // namespace
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing) {
RecordReader inputs[KENLM_MAX_ORDER - 1];
RecordReader contexts[KENLM_MAX_ORDER - 1];
@ -488,7 +488,10 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), backing), fixed_counts, config);
void *vocab_relocate;
void *search_base = backing.GrowForSearch(TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), vocab.UnkCountChangePadding(), vocab_relocate);
vocab.Relocate(vocab_relocate);
out.SetupMemory(reinterpret_cast<uint8_t*>(search_base), fixed_counts, config);
for (unsigned char i = 2; i <= counts.size(); ++i) {
inputs[i-2].Rewind();
@ -571,15 +574,7 @@ template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::Setup
return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]);
}
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::LoadedBinary() {
unigram_.LoadedBinary();
for (Middle *i = middle_begin_; i != middle_end_; ++i) {
i->LoadedBinary();
}
longest_.LoadedBinary();
}
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) {
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing) {
std::string temporary_prefix;
if (config.temporary_directory_prefix) {
temporary_prefix = config.temporary_directory_prefix;

View File

@ -17,13 +17,13 @@
namespace lm {
namespace ngram {
struct Backing;
class BinaryFormat;
class SortedVocabulary;
namespace trie {
template <class Quant, class Bhiksha> class TrieSearch;
class SortedFiles;
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing);
template <class Quant, class Bhiksha> class TrieSearch {
public:
@ -39,11 +39,11 @@ template <class Quant, class Bhiksha> class TrieSearch {
static const unsigned int kVersion = 1;
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
Quant::UpdateConfigFromBinary(fd, counts, config);
util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0]));
static void UpdateConfigFromBinary(const BinaryFormat &file, const std::vector<uint64_t> &counts, uint64_t offset, Config &config) {
Quant::UpdateConfigFromBinary(file, offset, config);
// Currently the unigram pointers are not compresssed, so there will only be a header for order > 2.
if (counts.size() > 2) Bhiksha::UpdateConfigFromBinary(fd, config);
if (counts.size() > 2)
Bhiksha::UpdateConfigFromBinary(file, offset + Quant::Size(counts.size(), config) + Unigram::Size(counts[0]), config);
}
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
@ -60,9 +60,7 @@ template <class Quant, class Bhiksha> class TrieSearch {
uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config);
void LoadedBinary();
void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing);
void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing);
unsigned char Order() const {
return middle_end_ - middle_begin_ + 2;
@ -103,7 +101,7 @@ template <class Quant, class Bhiksha> class TrieSearch {
}
private:
friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing);
// Middles are managed manually so we can delay construction and they don't have to be copyable.
void FreeMiddles() {

View File

@ -62,8 +62,6 @@ class Unigram {
return unigram_;
}
void LoadedBinary() {}
UnigramPointer Find(WordIndex word, NodeRange &next) const {
UnigramValue *val = unigram_ + word;
next.begin = val->next;
@ -108,8 +106,6 @@ template <class Bhiksha> class BitPackedMiddle : public BitPacked {
void FinishedLoading(uint64_t next_end, const Config &config);
void LoadedBinary() { bhiksha_.LoadedBinary(); }
util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) const;
util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) {
@ -138,14 +134,9 @@ class BitPackedLongest : public BitPacked {
BaseInit(base, max_vocab, quant_bits);
}
void LoadedBinary() {}
util::BitAddress Insert(WordIndex word);
util::BitAddress Find(WordIndex word, const NodeRange &node) const;
private:
uint8_t quant_bits_;
};
} // namespace trie

View File

@ -50,6 +50,10 @@ class PartialViewProxy {
const void *Data() const { return inner_.Data(); }
void *Data() { return inner_.Data(); }
friend void swap(PartialViewProxy first, PartialViewProxy second) {
std::swap_ranges(reinterpret_cast<char*>(first.Data()), reinterpret_cast<char*>(first.Data()) + first.attention_size_, reinterpret_cast<char*>(second.Data()));
}
private:
friend class util::ProxyIterator<PartialViewProxy>;

View File

@ -125,13 +125,13 @@ class Model {
void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }
// Requires in_state != out_state
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
// Requires in_state != out_state
virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
// Prefer to use FullScore. The context words should be provided in reverse order.
virtual FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0;
virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0;
unsigned char Order() const { return order_; }

View File

@ -32,7 +32,8 @@ const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
// Sadly some LMs have <UNK>.
const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count) {
void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint64_t offset) {
util::SeekOrThrow(fd, offset);
// Check that we're at the right place by reading <unk> which is always first.
char check_unk[6];
util::ReadOrThrow(fd, check_unk, 6);
@ -80,11 +81,6 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
buffer_.push_back(0);
}
void WriteWordsWrapper::Write(int fd, uint64_t start) {
util::SeekOrThrow(fd, start);
util::WriteOrThrow(fd, buffer_.data(), buffer_.size());
}
SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {}
uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) {
@ -100,6 +96,12 @@ void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size
saw_unk_ = false;
}
void SortedVocabulary::Relocate(void *new_start) {
std::size_t delta = end_ - begin_;
begin_ = reinterpret_cast<uint64_t*>(new_start) + 1;
end_ = begin_ + delta;
}
void SortedVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries) {
enumerate_ = to;
if (enumerate_) {
@ -147,11 +149,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
bound_ = end_ - begin_ + 1;
}
void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) {
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
SetSpecial(Index("<s>"), Index("</s>"), 0);
bound_ = end_ - begin_ + 1;
if (have_words) ReadWords(fd, to, bound_);
if (have_words) ReadWords(fd, to, bound_, offset);
}
namespace {
@ -179,6 +181,11 @@ void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::siz
saw_unk_ = false;
}
void ProbingVocabulary::Relocate(void *new_start) {
header_ = static_cast<detail::ProbingVocabularyHeader*>(new_start);
lookup_.Relocate(static_cast<uint8_t*>(new_start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)));
}
void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max_entries*/) {
enumerate_ = to;
if (enumerate_) {
@ -206,12 +213,11 @@ void ProbingVocabulary::InternalFinishedLoading() {
SetSpecial(Index("<s>"), Index("</s>"), 0);
}
void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) {
UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
lookup_.LoadedBinary();
bound_ = header_->bound;
SetSpecial(Index("<s>"), Index("</s>"), 0);
if (have_words) ReadWords(fd, to, bound_);
if (have_words) ReadWords(fd, to, bound_, offset);
}
void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {

View File

@ -36,7 +36,7 @@ class WriteWordsWrapper : public EnumerateVocab {
void Add(WordIndex index, const StringPiece &str);
void Write(int fd, uint64_t start);
const std::string &Buffer() const { return buffer_; }
private:
EnumerateVocab *inner_;
@ -71,6 +71,8 @@ class SortedVocabulary : public base::Vocabulary {
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
void Relocate(void *new_start);
void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries);
WordIndex Insert(const StringPiece &str);
@ -83,15 +85,13 @@ class SortedVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
private:
uint64_t *begin_, *end_;
WordIndex bound_;
WordIndex highest_value_;
bool saw_unk_;
EnumerateVocab *enumerate_;
@ -140,6 +140,8 @@ class ProbingVocabulary : public base::Vocabulary {
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
void Relocate(void *new_start);
void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries);
WordIndex Insert(const StringPiece &str);
@ -152,7 +154,7 @@ class ProbingVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
private:
void InternalFinishedLoading();

View File

@ -242,9 +242,9 @@ void FeatureRegistry::PrintFF() const
Map::const_iterator iter;
for (iter = registry_.begin(); iter != registry_.end(); ++iter) {
const string &ffName = iter->first;
std::cerr << ffName << std::endl;
std::cerr << ffName << " ";
}
std::cerr << std::endl;
}
} // namespace Moses

View File

@ -12,6 +12,7 @@ if $(with-dlib) {
alias headers : ../util//kenutil : : : $(max-factors) $(dlib) ;
alias ThreadPool : ThreadPool.cpp ;
alias Util : Util.cpp Timer.cpp ;
if [ option.get "with-synlm" : no : yes ] = yes
{

View File

@ -94,9 +94,16 @@ if $(with-nplm) {
local with-dalm = [ option.get "with-dalm" ] ;
if $(with-dalm) {
lib dalm : : <search>$(with-dalm)/lib ;
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
obj DALM.o : DALMWrapper.cpp dalm MurmurHash3 ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
alias dalmALIAS : DALM.o dalm MurmurHash3 : : : <define>LM_DALM ;
if [ path.exists $(with-dalm)/lib/libMurmurHash3.a ] {
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
alias dalm-libs : dalm MurmurHash3 ;
} else {
alias dalm-libs : dalm ;
}
obj DALM.o : DALMWrapper.cpp dalm-libs ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
alias dalmALIAS : DALM.o dalm-libs : : : <define>LM_DALM ;
dependencies += dalmALIAS ;
lmmacros += LM_DALM ;
}

View File

@ -202,6 +202,7 @@ Parameter::Parameter()
AddParam("placeholder-factor", "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model");
AddParam("no-cache", "Disable all phrase-table caching. Default = false (ie. enable caching)");
AddParam("adjacent-only", "Only allow hypotheses which are adjacent to current derivation. ITG without block moves");
}

View File

@ -250,6 +250,11 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
return true;
}
if (StaticData::Instance().AdjacentOnly() &&
!hypoBitmap.IsAdjacent(range.GetStartPos(), range.GetEndPos())) {
return false;
}
bool leftMostEdge = (hypoFirstGapPos == startPos);
// any length extension is okay if starting at left-most edge
if (leftMostEdge) {

View File

@ -253,6 +253,11 @@ void SearchNormal::ExpandAllHypotheses(const Hypothesis &hypothesis, size_t star
expectedScore += m_transOptColl.GetFutureScore().CalcFutureScore( hypothesis.GetWordsBitmap(), startPos, endPos );
}
if (StaticData::Instance().AdjacentOnly() &&
!hypothesis.GetWordsBitmap().IsAdjacent(startPos, endPos)) {
return;
}
// loop through all translation options
const TranslationOptionList &transOptList = m_transOptColl.GetTranslationOptionList(WordsRange(startPos, endPos));
TranslationOptionList::const_iterator iter;

View File

@ -385,6 +385,8 @@ bool StaticData::LoadData(Parameter *parameter)
SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false);
SetBooleanParameter( &m_adjacentOnly, "adjacent-only", false);
// minimum Bayes risk decoding
SetBooleanParameter( &m_mbr, "minimum-bayes-risk", false );
m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ?

View File

@ -197,6 +197,7 @@ protected:
FactorType m_placeHolderFactor;
bool m_useLegacyPT;
bool m_adjacentOnly;
FeatureRegistry m_registry;
@ -753,6 +754,8 @@ public:
return &m_soft_matches_map_reverse;
}
bool AdjacentOnly() const
{ return m_adjacentOnly; }
};
}

View File

@ -58,8 +58,7 @@ const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY(
size_t hash = hash_value(src);
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);
if (iter == cache.end()) {
@ -179,7 +178,7 @@ void PhraseDictionary::ReduceCache() const
// find cutoff for last used time
priority_queue< clock_t > lastUsedTimes;
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.begin();
while( iter != cache.end() ) {
lastUsedTimes.push( iter->second.second );
@ -193,7 +192,7 @@ void PhraseDictionary::ReduceCache() const
iter = cache.begin();
while( iter != cache.end() ) {
if (iter->second.second < cutoffLastUsedTime) {
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iterRemove = iter++;
CacheColl::iterator iterRemove = iter++;
delete iterRemove->second.first;
cache.erase(iterRemove);
} else iter++;

View File

@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <stdexcept>
#include <vector>
#include <string>
#include <boost/unordered_map.hpp>
#ifdef WITH_THREADS
#include <boost/thread/tss.hpp>
@ -54,7 +55,7 @@ class ChartCellCollectionBase;
class ChartRuleLookupManager;
class ChartParser;
class CacheColl : public std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
class CacheColl : public boost::unordered_map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
{
// 1st = hash of source phrase/ address of phrase-table node
// 2nd = all translations

View File

@ -59,7 +59,7 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
CacheColl &cache = GetCache();
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);
if (iter != cache.end()) {

View File

@ -165,7 +165,7 @@ const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollection(
CacheColl &cache = GetCache();
size_t hash = (size_t) ptNode->GetFilePos();
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);

View File

@ -63,6 +63,22 @@ int WordsBitmap::GetFutureCosts(int lastPos) const
return sum;
}
// Whether the span [startPos, endPos] is adjacent to the words covered so
// far: trivially true while nothing is covered, otherwise the span must
// either start at the last gap position or end at the first gap position.
bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const
{
  if (GetNumWordsCovered() == 0) {
    return true; // empty hypothesis: any expansion counts as adjacent
  }

  const size_t firstGap = GetFirstGapPos();
  const size_t lastGap = GetLastGapPos();

  return (startPos == lastGap) || (endPos == firstGap);
}
}

View File

@ -132,6 +132,8 @@ public:
return NOT_FOUND;
}
bool IsAdjacent(size_t startPos, size_t endPos) const;
//! whether a word has been translated at a particular position
bool GetValue(size_t pos) const {
return m_bitmap[pos];

View File

@ -1,6 +1,5 @@
// $Id$
//#include "beammain.h"
#include "domain.h"
#include "DomainFeature.h"
#include "ExtractionPhrasePair.h"
#include "tables-core.h"
#include "InputFileStream.h"
#include "SafeGetline.h"
@ -26,7 +25,7 @@ void Domain::load( const std::string &domainFileName )
int lineNumber;
if (domainSpecLine.size() != 2 ||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
exit(1);
}
// store
@ -50,29 +49,34 @@ string Domain::getDomainOfSentence( int sentenceId ) const
return "undefined";
}
DomainFeature::DomainFeature(const string& domainFile)
DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
{
//process domain file
m_domain.load(domainFile);
}
void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
float count,
int sentenceId) const
{
std::string value = m_domain.getDomainOfSentence(sentenceId);
phrasePair.AddProperty(m_propertyKey, value, count);
}
void DomainFeature::add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const
{
map< string, float > domainCount;
for(size_t i=0; i<context.phrasePair.size(); i++) {
string d = m_domain.getDomainOfSentence(context.phrasePair[i]->sentenceId );
if (domainCount.find( d ) == domainCount.end()) {
domainCount[d] = context.phrasePair[i]->count;
} else {
domainCount[d] += context.phrasePair[i]->count;
}
}
add(domainCount, context.count, context.maybeLog, denseValues, sparseValues);
const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
assert( domainCount != NULL );
add(*domainCount,
context.phrasePair.GetCount(),
context.maybeLog,
denseValues, sparseValues);
}
void SubsetDomainFeature::add(const map<string,float>& domainCount,float count,
void SubsetDomainFeature::add(const map<string,float>& domainCount,
float count,
const MaybeLog& maybeLog,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const
@ -152,7 +156,6 @@ void IndicatorDomainFeature::add(const map<string,float>& domainCount,float coun
denseValues.push_back(maybeLog(2.718));
}
}
}
void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
@ -166,12 +169,5 @@ void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,floa
}
}
bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
{
return m_domain.getDomainOfSentence(lhs.sentenceId) ==
m_domain.getDomainOfSentence( rhs.sentenceId);
}
}

View File

@ -34,13 +34,17 @@ class DomainFeature : public ScoreFeature
public:
DomainFeature(const std::string& domainFile);
bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
float count,
int sentenceId) const;
void add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
protected:
/** Overriden in subclass */
/** Overridden in subclass */
virtual void add(const std::map<std::string,float>& domainCounts, float count,
const MaybeLog& maybeLog,
std::vector<float>& denseValues,
@ -49,6 +53,8 @@ protected:
Domain m_domain;
const std::string m_propertyKey;
};
class SubsetDomainFeature : public DomainFeature

View File

@ -0,0 +1,327 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <sstream>
#include "ExtractionPhrasePair.h"
#include "SafeGetline.h"
#include "tables-core.h"
#include "score.h"
#include "moses/Util.h"
#include <cstdlib>
using namespace std;
namespace MosesTraining {
extern Vocabulary vcbT;
extern Vocabulary vcbS;
extern bool hierarchicalFlag;
// Construct a phrase pair from its source/target phrases and the first
// observed target-to-source alignment with its count and pcfg score sum.
// The alignment pointer is stored (and later deleted in Clear()).
ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
    const PHRASE *phraseTarget,
    ALIGNMENT *targetToSourceAlignment,
    float count, float pcfgSum ) :
  m_phraseSource(phraseSource),
  m_phraseTarget(phraseTarget),
  m_count(count),
  m_pcfgSum(pcfgSum)
{
  // NOTE(review): these asserts require the phrases to be *empty* at
  // construction time -- presumably they are filled in afterwards by the
  // caller; confirm against the call sites.
  assert(phraseSource->empty());
  assert(phraseTarget->empty());

  // (removed redundant re-assignment of m_count/m_pcfgSum -- both are
  // already set in the initializer list above)

  // Register the initial alignment and remember it as the most recently
  // seen one, so IncrementPrevious() can update it cheaply.
  std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
    m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );

  m_lastTargetToSourceAlignment = insertedAlignment.first;
  m_lastCount = m_count;
  m_lastPcfgSum = m_pcfgSum;

  m_isValid = true;
}
// Destructor: releases the phrases, alignments and property maps via Clear().
ExtractionPhrasePair::~ExtractionPhrasePair( ) {
  Clear();
}
// Record another observation of this phrase pair.
// return value: true if the given alignment was seen for the first time and thus will be stored,
// false if it was present already (the pointer may thus be deleted)
bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
                                float count, float pcfgSum )
{
  m_count += count;
  m_pcfgSum += pcfgSum;

  m_lastCount = count;
  m_lastPcfgSum = pcfgSum;

  // fast path: compare (by content) against the most recently seen alignment
  std::map<ALIGNMENT*,float>::iterator iter = m_lastTargetToSourceAlignment;
  if ( *(iter->first) == *targetToSourceAlignment ) {
    iter->second += count;
    return false;
  } else {
    // NOTE(review): the map is keyed on the ALIGNMENT pointer, so this
    // insert only reports "already exists" for the *same* pointer; equal
    // alignments at different addresses are stored as separate entries --
    // confirm this is intended.
    std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
      m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
    if ( !insertedAlignment.second ) {
      // the alignment already exists: increment count
      insertedAlignment.first->second += count;
      return false;
    }
    m_lastTargetToSourceAlignment = insertedAlignment.first;
  }

  return true;
}
// Add count/pcfgSum to the totals, to the most recently registered
// alignment, and to the last-seen value of every property (i.e. the same
// observation occurred again).
void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
{
  m_count += count;
  m_pcfgSum += pcfgSum;
  // the alignment registered by the last Add()/constructor call
  m_lastTargetToSourceAlignment->second += count;

  // properties: bump the cached last-seen value of every property key
  for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
        iter !=m_properties.end(); ++iter ) {
    LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
    (*lastPropertyValue)->second += count;
  }

  m_lastCount = count;
  m_lastPcfgSum = pcfgSum;
}
// Check for lexical match
// and in case of SCFG rules for equal non-terminal alignment.
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
                                    const PHRASE *otherPhraseTarget,
                                    ALIGNMENT *otherTargetToSourceAlignment ) const
{
  // Short-circuit in the same order as before: target phrase first,
  // then source phrase, then the (SCFG-only) alignment check.
  return (*otherPhraseTarget == *m_phraseTarget)
         && (*otherPhraseSource == *m_phraseSource)
         && MatchesAlignment( otherTargetToSourceAlignment );
}
// Check for lexical match
// and in case of SCFG rules for equal non-terminal alignment,
// reporting which comparison failed first via the boolean indicators.
// (Checked in the order source - target - alignment; indicators that come
// after the first failing check are left untouched.)
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
                                    const PHRASE *otherPhraseTarget,
                                    ALIGNMENT *otherTargetToSourceAlignment,
                                    bool &sourceMatch,
                                    bool &targetMatch,
                                    bool &alignmentMatch ) const
{
  sourceMatch = (*otherPhraseSource == *m_phraseSource);
  if (!sourceMatch) {
    return false;
  }

  targetMatch = (*otherPhraseTarget == *m_phraseTarget);
  if (!targetMatch) {
    return false;
  }

  alignmentMatch = MatchesAlignment(otherTargetToSourceAlignment);
  return alignmentMatch;
}
// Check for equal non-terminal alignment in case of SCFG rules.
// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
{
  // phrase-based extraction: word alignments are not compared
  if (!hierarchicalFlag) return true;

  // all or none of the phrasePair's word alignment matrices match, so just pick one
  const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;

  // the target phrase carries one extra symbol (skipped by the loop below)
  assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
  assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());

  // loop over all symbols but the left hand side of the rule
  for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) {
    if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
      // NOTE(review): begin() is dereferenced before the size()==1 check
      // below -- assumes a non-terminal always has at least one alignment
      // point; confirm.
      size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin());
      size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin());
      // non-terminals must be aligned one-to-one and to the same position
      if (thisTargetToSourceAlignment->at(i).size() != 1 ||
          otherTargetToSourceAlignment->at(i).size() != 1 ||
          thisAlign != otherAlign) {
        return false;
      }
    }
  }

  return true;
}
// Release all owned memory (phrases, alignments, property maps) and reset
// the counts; the object is marked invalid afterwards.
void ExtractionPhrasePair::Clear()
{
  delete m_phraseSource;
  delete m_phraseTarget;

  m_count = 0.0f;
  m_pcfgSum = 0.0f;

  // alignment objects are owned by this phrase pair
  for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
        iter!=m_targetToSourceAlignments.end(); ++iter) {
    delete iter->first;
  }
  m_targetToSourceAlignments.clear();

  // each property owns its value map and its cached last-value iterator
  for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
        iter!=m_properties.end(); ++iter) {
    delete (iter->second).second;
    delete (iter->second).first;
  }
  m_properties.clear();

  m_lastCount = 0.0f;
  m_lastPcfgSum = 0.0f;
  // begin() of the now-empty map equals end(): there is no "last" alignment
  m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();

  m_isValid = false;
}
// Parse a "{{key value}} {{key value}} ..." properties string and register
// each key/value pair with the given count.
void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
{
  if (propertiesString.empty()) {
    return;
  }

  vector<std::string> toks;
  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
  // toks[0] is whatever precedes the first "{{"; start at 1
  for (size_t i = 1; i < toks.size(); ++i) {
    std::string &tok = toks[i];
    if (tok.empty()) {
      continue;
    }
    // strip the trailing "}}": rfind locates the last brace at len-1, and
    // substr(0, endPos-1) keeps everything before the two closing braces.
    // NOTE(review): assumes a closing "}}" is present -- endPos would be
    // npos otherwise; confirm the input format guarantees it.
    size_t endPos = tok.rfind("}");
    tok = tok.substr(0, endPos - 1);
    // split into property key and value at the first space
    vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
    assert(keyValue.size() == 2);
    AddProperty(keyValue[0], keyValue[1], count);
  }
}
// Return the stored alignment with the highest count (ties broken by the
// larger alignment per operator> on ALIGNMENT), or NULL if none exists.
const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
{
  float bestAlignmentCount = -1;

  std::map<ALIGNMENT*,float>::const_iterator bestAlignment = m_targetToSourceAlignments.end();

  for (std::map<ALIGNMENT*,float>::const_iterator iter=m_targetToSourceAlignments.begin();
       iter!=m_targetToSourceAlignments.end(); ++iter) {
    // NOTE(review): if a stored count ever equals the -1 sentinel, the tie
    // branch can dereference bestAlignment while it is still end() --
    // assumes counts are non-negative; confirm.
    if ( (iter->second > bestAlignmentCount) ||
         ( (iter->second == bestAlignmentCount) &&
           (*(iter->first) > *(bestAlignment->first)) ) ) {
      bestAlignmentCount = iter->second;
      bestAlignment = iter;
    }
  }

  if ( bestAlignment == m_targetToSourceAlignments.end()) {
    return NULL;
  }

  return bestAlignment->first;
}
// Return a pointer to the value string with the highest count for the given
// property key (ties broken by the lexicographically larger value), or NULL
// if the key is unknown or has no values.
const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
{
  float bestPropertyCount = -1;

  const PROPERTY_VALUES *allPropertyValues = GetProperty( key );

  if ( allPropertyValues == NULL ) {
    return NULL;
  }

  PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end();

  for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
       iter!=allPropertyValues->end(); ++iter) {
    // NOTE(review): if a count ever equals the -1 sentinel, the tie branch
    // can dereference bestPropertyValue while it is still end() -- assumes
    // counts are non-negative; confirm.
    if ( (iter->second > bestPropertyCount) ||
         ( (iter->second == bestPropertyCount) &&
           (iter->first > bestPropertyValue->first) ) ) {
      bestPropertyCount = iter->second;
      bestPropertyValue = iter;
    }
  }

  if ( bestPropertyValue == allPropertyValues->end()) {
    return NULL;
  }

  return &(bestPropertyValue->first);
}
std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
{
const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
if ( allPropertyValues == NULL ) {
return "";
}
std::ostringstream oss;
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
iter!=allPropertyValues->end(); ++iter) {
if (iter!=allPropertyValues->begin()) {
oss << " ";
}
oss << iter->first;
oss << " ";
oss << iter->second;
}
std::string allPropertyValuesString(oss.str());
return allPropertyValuesString;
}
}

View File

@ -0,0 +1,162 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include "tables-core.h"
#include <vector>
#include <set>
#include <map>
namespace MosesTraining {
typedef std::vector< std::set<size_t> > ALIGNMENT;
// A phrase pair collected during phrase/rule extraction, together with its
// accumulated count, pcfg score sum, the set of observed target-to-source
// alignments, and arbitrary string-valued properties (each value carrying a
// count of its own).
class ExtractionPhrasePair {

protected:

  // per-key property storage: value string -> accumulated count
  typedef std::map<std::string,float> PROPERTY_VALUES;
  // cached iterator pointing at the most recently updated value of a key
  typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;

  bool m_isValid;                // set to false by Clear()

  const PHRASE *m_phraseSource;  // owned; deleted in Clear()
  const PHRASE *m_phraseTarget;  // owned; deleted in Clear()

  float m_count;
  float m_pcfgSum;

  // owned ALIGNMENT objects with their counts (keyed by pointer)
  std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
  // property key -> (owned value/count map, owned last-seen-value iterator)
  std::map<std::string,
      std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;

  // count/pcfgSum of the most recent observation (used by IncrementPrevious)
  float m_lastCount;
  float m_lastPcfgSum;
  std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;

public:

  // Stores the alignment pointer (and the phrases, deleted in Clear()).
  ExtractionPhrasePair( const PHRASE *phraseSource,
                        const PHRASE *phraseTarget,
                        ALIGNMENT *targetToSourceAlignment,
                        float count, float pcfgSum );

  ~ExtractionPhrasePair();

  // Record another observation; returns true iff the alignment pointer was
  // newly stored (on false the caller may delete it).
  bool Add( ALIGNMENT *targetToSourceAlignment,
            float count, float pcfgSum );

  // Re-count the most recent observation with an additional count/pcfgSum.
  void IncrementPrevious( float count, float pcfgSum );

  // Lexical match (and, for SCFG rules, equal non-terminal alignment).
  bool Matches( const PHRASE *otherPhraseSource,
                const PHRASE *otherPhraseTarget,
                ALIGNMENT *otherTargetToSourceAlignment ) const;

  // As above, additionally reporting which comparison failed first.
  bool Matches( const PHRASE *otherPhraseSource,
                const PHRASE *otherPhraseTarget,
                ALIGNMENT *otherTargetToSourceAlignment,
                bool &sourceMatch,
                bool &targetMatch,
                bool &alignmentMatch ) const;

  // Equal non-terminal alignment check (always true for non-SCFG).
  bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;

  // Release all owned memory; the object is invalid afterwards.
  void Clear();

  bool IsValid() const {
    return m_isValid;
  }

  const PHRASE *GetSource() const {
    return m_phraseSource;
  }

  const PHRASE *GetTarget() const {
    return m_phraseTarget;
  }

  float GetCount() const {
    return m_count;
  }

  float GetPcfgScore() const {
    return m_pcfgSum;
  }

  // Number of distinct property keys.
  // (fixed: dropped the meaningless top-level const on the return type)
  size_t GetNumberOfProperties() const {
    return m_properties.size();
  }

  // Value->count map for the given property key, or NULL if unknown.
  const std::map<std::string,float> *GetProperty( const std::string &key ) const {
    std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
    iter = m_properties.find(key);
    if (iter == m_properties.end()) {
      return NULL;
    } else {
      return iter->second.first;
    }
  }

  const ALIGNMENT *FindBestAlignmentTargetToSource() const;

  const std::string *FindBestPropertyValue(const std::string &key) const;

  std::string CollectAllPropertyValues(const std::string &key) const;

  // Parse a "{{key value}}..." string and register each pair with count.
  void AddProperties( const std::string &str, float count );

  // Register a single key/value observation with the given count, keeping
  // the cached last-seen-value iterator up to date for IncrementPrevious().
  void AddProperty( const std::string &key, const std::string &value, float count )
  {
    std::map<std::string,
             std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
    if ( iter == m_properties.end() ) {
      // key not found: insert property key and value
      PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
      std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
      LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
      m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
    } else {
      LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
      if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before
        // property key-value pair exists already: add count
        (*lastPropertyValue)->second += count;
      } else { // need to check whether the property key-value pair has appeared before (insert if not)
        // property key exists, but not in combination with this value:
        // add new value with count
        PROPERTY_VALUES *propertyValues = (iter->second).first;
        std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
        if ( !insertedProperty.second ) { // property value for this key appeared before: add count
          insertedProperty.first->second += count;
        }
        LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
        delete (iter->second).second;
        (iter->second).second = lastPropertyValue;
      }
    }
  }

};
}

View File

@ -1,50 +1,30 @@
#include "InternalStructFeature.h"
#include <map>
using namespace std;
namespace MosesTraining
{
InternalStructFeature::InternalStructFeature()
:m_type(0){
//cout<<"InternalStructFeature: Construct "<<m_type<<"\n";
}
bool InternalStructFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const{
//cout<<"InternalStructFeature: Equals\n";
//don't know what it's used for and what we should compare
//-> if the dense score is the same
//-> if the sparse feature is set
// compare phrases? with the internalStrucutre string?
/** Return true if the two phrase pairs are equal from the point of this feature. Assume
that they already compare true according to PhraseAlignment.equals()
**/
/* if(lhs.ghkmParse==rhs.ghkmParse)
return true;
else
return false;
*/
//return true;
}
void InternalStructFeature::add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const{
for(size_t i=0; i<context.phrasePair.size(); i++) {
add(&context.phrasePair[i]->treeFragment, denseValues, sparseValues);
}
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const {
const std::map<std::string,float> *allTrees = context.phrasePair.GetProperty("Tree"); // or would we rather want to take the most frequent one only?
for ( std::map<std::string,float>::const_iterator iter=allTrees->begin();
iter!=allTrees->end(); ++iter ) {
add(&(iter->first), iter->second, denseValues, sparseValues);
}
}
void InternalStructFeatureDense::add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const{
void InternalStructFeatureDense::add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const {
//cout<<"Dense: "<<*internalStruct<<endl;
size_t start=0;
int countNP=0;
while((start = internalStruct->find("NP", start)) != string::npos) {
countNP++;
while((start = treeFragment->find("NP", start)) != string::npos) {
countNP += count;
start+=2; //length of "NP"
}
//should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
@ -53,21 +33,21 @@ void InternalStructFeatureDense::add(std::string *internalStruct,
}
void InternalStructFeatureSparse::add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const{
//cout<<"Sparse: "<<*internalStruct<<endl;
if(internalStruct->find("VBZ")!=std::string::npos)
sparseValues["NTVBZ"] = 1;
if(internalStruct->find("VBD")!=std::string::npos)
sparseValues["NTVBD"] = 1;
if(internalStruct->find("VBP")!=std::string::npos)
sparseValues["NTVBP"] = 1;
if(internalStruct->find("PP")!=std::string::npos)
sparseValues["NTPP"] = 1;
if(internalStruct->find("SBAR")!=std::string::npos)
sparseValues["NTSBAR"] = 1;
void InternalStructFeatureSparse::add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const {
//cout<<"Sparse: "<<*internalStruct<<endl;
if(treeFragment->find("VBZ")!=std::string::npos)
sparseValues["NTVBZ"] += count;
if(treeFragment->find("VBD")!=std::string::npos)
sparseValues["NTVBD"] += count;
if(treeFragment->find("VBP")!=std::string::npos)
sparseValues["NTVBP"] += count;
if(treeFragment->find("PP")!=std::string::npos)
sparseValues["NTPP"] += count;
if(treeFragment->find("SBAR")!=std::string::npos)
sparseValues["NTSBAR"] += count;
}

View File

@ -21,22 +21,19 @@ namespace MosesTraining
class InternalStructFeature : public ScoreFeature
{
public:
InternalStructFeature();
/** Return true if the two phrase pairs are equal from the point of this feature. Assume
that they already compare true according to PhraseAlignment.equals()
**/
bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
InternalStructFeature() : m_type(0) {};
/** Add the values for this feature function. */
void add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
protected:
/** Overriden in subclass */
virtual void add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const = 0;
/** Overridden in subclass */
virtual void add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const = 0;
int m_type;
};
@ -47,9 +44,10 @@ public:
InternalStructFeatureDense()
:InternalStructFeature(){m_type=1;} //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
protected:
virtual void add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
virtual void add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
};
class InternalStructFeatureSparse : public InternalStructFeature
@ -58,9 +56,10 @@ public:
InternalStructFeatureSparse()
:InternalStructFeature(){m_type=2;}// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
protected:
virtual void add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
virtual void add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
};
}

Some files were not shown because too many files have changed in this diff Show More