mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-19 23:27:46 +03:00
Merge branch 'master' of ssh://github.com/moses-smt/mosesdecoder
This commit is contained in:
commit
049be8b71c
@ -11,12 +11,12 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -72,13 +72,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
133
contrib/other-builds/extract-mixed-syntax/.cproject
Normal file
133
contrib/other-builds/extract-mixed-syntax/.cproject
Normal file
@ -0,0 +1,133 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<storageModule moduleId="org.eclipse.cdt.core.settings">
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" moduleId="org.eclipse.cdt.core.settings" name="Debug">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
|
||||
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982." name="/" resourcePath="">
|
||||
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.456080129" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
|
||||
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.582801917" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
|
||||
<builder buildPath="${workspace_loc:/extract-mixed-syntax/Debug}" id="cdt.managedbuild.builder.gnu.cross.1220166455" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
|
||||
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.2055012191" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
|
||||
<option id="gnu.c.compiler.option.debugging.level.1768196213" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2007889843" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
|
||||
<option id="gnu.cpp.compiler.option.optimization.level.855436310" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.debugging.level.506549229" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.include.paths.1497326561" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc}/../../boost/include""/>
|
||||
</option>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.606353571" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.740521305" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
|
||||
<option id="gnu.cpp.link.option.libs.1946120010" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
|
||||
<listOptionValue builtIn="false" value="z"/>
|
||||
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
|
||||
</option>
|
||||
<option id="gnu.cpp.link.option.paths.1563475751" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/>
|
||||
</option>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.106010037" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.136661991" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.2112208574" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.172930211" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.715007893">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
|
||||
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.715007893." name="/" resourcePath="">
|
||||
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.99436307" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
|
||||
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.801178939" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
|
||||
<builder buildPath="${workspace_loc:/extract-mixed-syntax/Release}" id="cdt.managedbuild.builder.gnu.cross.1999547547" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
|
||||
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1481537766" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
|
||||
<option id="gnu.c.compiler.option.debugging.level.1967527847" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.442342681" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
|
||||
<option id="gnu.cpp.compiler.option.optimization.level.1847950300" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.debugging.level.1130138972" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.158429528" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.2020667840" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1372779734" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.371006952" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1770045040" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.707592414" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<project id="extract-mixed-syntax.cdt.managedbuild.target.gnu.cross.exe.1868010260" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
|
||||
</storageModule>
|
||||
<storageModule moduleId="scannerConfiguration">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038;cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906;cdt.managedbuild.tool.gnu.c.compiler.input.442342681">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915;cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568;cdt.managedbuild.tool.gnu.c.compiler.input.2007889843">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
|
||||
</scannerConfigBuildInfo>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
|
||||
<storageModule moduleId="refreshScope" versionNumber="2">
|
||||
<configuration configurationName="Release">
|
||||
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
|
||||
</configuration>
|
||||
<configuration configurationName="Debug">
|
||||
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
|
||||
</cproject>
|
27
contrib/other-builds/extract-mixed-syntax/.project
Normal file
27
contrib/other-builds/extract-mixed-syntax/.project
Normal file
@ -0,0 +1,27 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<projectDescription>
|
||||
<name>extract-mixed-syntax</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
|
||||
<triggers>clean,full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
|
||||
<triggers>full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.eclipse.cdt.core.cnature</nature>
|
||||
<nature>org.eclipse.cdt.core.ccnature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
|
||||
</natures>
|
||||
</projectDescription>
|
37
contrib/other-builds/extract-mixed-syntax/Global.cpp
Normal file
37
contrib/other-builds/extract-mixed-syntax/Global.cpp
Normal file
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Global.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 01/02/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "Global.h"
|
||||
|
||||
bool g_debug = false;
|
||||
|
||||
Global::Global()
|
||||
: minHoleSpanSourceDefault(2)
|
||||
, maxHoleSpanSourceDefault(7)
|
||||
, minHoleSpanSourceSyntax(1)
|
||||
, maxHoleSpanSourceSyntax(1000)
|
||||
, maxUnaligned(5)
|
||||
|
||||
, maxSymbols(5)
|
||||
, maxNonTerm(3)
|
||||
, maxNonTermDefault(2)
|
||||
|
||||
// int minHoleSize(1)
|
||||
// int minSubPhraseSize(1) // minimum size of a remaining lexical phrase
|
||||
, glueGrammarFlag(false)
|
||||
, unknownWordLabelFlag(false)
|
||||
//bool zipFiles(false)
|
||||
, sourceSyntax(true)
|
||||
, targetSyntax(false)
|
||||
, mixed(true)
|
||||
, uppermostOnly(true)
|
||||
, allowDefaultNonTermEdge(true)
|
||||
, gzOutput(false)
|
||||
|
||||
{}
|
45
contrib/other-builds/extract-mixed-syntax/Global.h
Normal file
45
contrib/other-builds/extract-mixed-syntax/Global.h
Normal file
@ -0,0 +1,45 @@
|
||||
#pragma once
|
||||
/*
|
||||
* Global.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 01/02/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
class Global
|
||||
{
|
||||
public:
|
||||
int minHoleSpanSourceDefault;
|
||||
int maxHoleSpanSourceDefault;
|
||||
int minHoleSpanSourceSyntax;
|
||||
int maxHoleSpanSourceSyntax;
|
||||
|
||||
int maxSymbols;
|
||||
bool glueGrammarFlag;
|
||||
bool unknownWordLabelFlag;
|
||||
int maxNonTerm;
|
||||
int maxNonTermDefault;
|
||||
bool sourceSyntax;
|
||||
bool targetSyntax;
|
||||
bool mixed;
|
||||
int maxUnaligned;
|
||||
bool uppermostOnly;
|
||||
bool allowDefaultNonTermEdge;
|
||||
bool gzOutput;
|
||||
|
||||
Global();
|
||||
|
||||
Global(const Global&);
|
||||
|
||||
};
|
||||
|
||||
extern bool g_debug;
|
||||
|
||||
#define DEBUG_OUTPUT() void DebugOutput() const;
|
||||
|
||||
|
@ -0,0 +1,62 @@
|
||||
// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include "InputFileStream.h"
|
||||
#include "gzfilebuf.h"
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
InputFileStream::InputFileStream(const std::string &filePath)
|
||||
: std::istream(NULL)
|
||||
, m_streambuf(NULL)
|
||||
{
|
||||
if (filePath.size() > 3 &&
|
||||
filePath.substr(filePath.size() - 3, 3) == ".gz")
|
||||
{
|
||||
m_streambuf = new gzfilebuf(filePath.c_str());
|
||||
} else {
|
||||
std::filebuf* fb = new std::filebuf();
|
||||
fb = fb->open(filePath.c_str(), std::ios::in);
|
||||
if (! fb) {
|
||||
cerr << "Can't read " << filePath.c_str() << endl;
|
||||
exit(1);
|
||||
}
|
||||
m_streambuf = fb;
|
||||
}
|
||||
this->init(m_streambuf);
|
||||
}
|
||||
|
||||
InputFileStream::~InputFileStream()
|
||||
{
|
||||
delete m_streambuf;
|
||||
m_streambuf = NULL;
|
||||
}
|
||||
|
||||
void InputFileStream::Close()
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
48
contrib/other-builds/extract-mixed-syntax/InputFileStream.h
Normal file
48
contrib/other-builds/extract-mixed-syntax/InputFileStream.h
Normal file
@ -0,0 +1,48 @@
|
||||
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#ifndef moses_InputFileStream_h
|
||||
#define moses_InputFileStream_h
|
||||
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
/** Used in place of std::istream, can read zipped files if it ends in .gz
|
||||
*/
|
||||
class InputFileStream : public std::istream
|
||||
{
|
||||
protected:
|
||||
std::streambuf *m_streambuf;
|
||||
public:
|
||||
|
||||
InputFileStream(const std::string &filePath);
|
||||
~InputFileStream();
|
||||
|
||||
void Close();
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
180
contrib/other-builds/extract-mixed-syntax/Lattice.cpp
Normal file
180
contrib/other-builds/extract-mixed-syntax/Lattice.cpp
Normal file
@ -0,0 +1,180 @@
|
||||
/*
|
||||
* Lattice.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 18/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include "Lattice.h"
|
||||
#include "LatticeNode.h"
|
||||
#include "Tunnel.h"
|
||||
#include "TunnelCollection.h"
|
||||
#include "SyntaxTree.h"
|
||||
#include "SentenceAlignment.h"
|
||||
#include "tables-core.h"
|
||||
#include "Rule.h"
|
||||
#include "RuleCollection.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
Lattice::Lattice(size_t sourceSize)
|
||||
:m_stacks(sourceSize + 1)
|
||||
{
|
||||
}
|
||||
|
||||
Lattice::~Lattice()
|
||||
{
|
||||
std::vector<Stack>::iterator iterStack;
|
||||
for (iterStack = m_stacks.begin(); iterStack != m_stacks.end(); ++iterStack)
|
||||
{
|
||||
Stack &stack = *iterStack;
|
||||
RemoveAllInColl(stack);
|
||||
}
|
||||
}
|
||||
|
||||
void Lattice::CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global)
|
||||
{
|
||||
// term
|
||||
Stack &startStack = GetStack(startPos);
|
||||
|
||||
LatticeNode *node = new LatticeNode(startPos, &sentence);
|
||||
startStack.push_back(node);
|
||||
|
||||
// non-term
|
||||
for (size_t endPos = startPos + 1; endPos <= sentence.source.size(); ++endPos)
|
||||
{
|
||||
const TunnelList &tunnels = tunnelColl.GetTunnels(startPos, endPos - 1);
|
||||
|
||||
TunnelList::const_iterator iterHole;
|
||||
for (iterHole = tunnels.begin(); iterHole != tunnels.end(); ++iterHole)
|
||||
{
|
||||
const Tunnel &tunnel = *iterHole;
|
||||
CreateArcsUsing1Hole(tunnel, sentence, global);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Lattice::CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global)
|
||||
{
|
||||
size_t startPos = tunnel.GetRange(0).GetStartPos()
|
||||
, endPos = tunnel.GetRange(0).GetEndPos();
|
||||
size_t numSymbols = tunnel.GetRange(0).GetWidth();
|
||||
assert(numSymbols > 0);
|
||||
|
||||
Stack &startStack = GetStack(startPos);
|
||||
|
||||
|
||||
// non-terms. cartesian product of source & target labels
|
||||
assert(startPos == tunnel.GetRange(0).GetStartPos() && endPos == tunnel.GetRange(0).GetEndPos());
|
||||
size_t startT = tunnel.GetRange(1).GetStartPos()
|
||||
,endT = tunnel.GetRange(1).GetEndPos();
|
||||
|
||||
const SyntaxNodes &nodesS = sentence.sourceTree.GetNodes(startPos, endPos);
|
||||
const SyntaxNodes &nodesT = sentence.targetTree.GetNodes(startT, endT );
|
||||
|
||||
SyntaxNodes::const_iterator iterS, iterT;
|
||||
for (iterS = nodesS.begin(); iterS != nodesS.end(); ++iterS)
|
||||
{
|
||||
const SyntaxNode *syntaxNodeS = *iterS;
|
||||
|
||||
for (iterT = nodesT.begin(); iterT != nodesT.end(); ++iterT)
|
||||
{
|
||||
const SyntaxNode *syntaxNodeT = *iterT;
|
||||
|
||||
bool isSyntax = syntaxNodeS->IsSyntax() || syntaxNodeT->IsSyntax();
|
||||
size_t maxSourceNonTermSpan = isSyntax ? global.maxHoleSpanSourceSyntax : global.maxHoleSpanSourceDefault;
|
||||
|
||||
if (maxSourceNonTermSpan >= endPos - startPos)
|
||||
{
|
||||
LatticeNode *node = new LatticeNode(tunnel, syntaxNodeS, syntaxNodeT);
|
||||
startStack.push_back(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Stack &Lattice::GetStack(size_t startPos)
|
||||
{
|
||||
assert(startPos < m_stacks.size());
|
||||
return m_stacks[startPos];
|
||||
}
|
||||
|
||||
const Stack &Lattice::GetStack(size_t startPos) const
|
||||
{
|
||||
assert(startPos < m_stacks.size());
|
||||
return m_stacks[startPos];
|
||||
}
|
||||
|
||||
void Lattice::CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global)
|
||||
{
|
||||
const Stack &startStack = GetStack(startPos);
|
||||
|
||||
Stack::const_iterator iterStack;
|
||||
for (iterStack = startStack.begin(); iterStack != startStack.end(); ++iterStack)
|
||||
{
|
||||
const LatticeNode *node = *iterStack;
|
||||
Rule *initRule = new Rule(node);
|
||||
|
||||
if (initRule->CanRecurse(global, sentence.GetTunnelCollection()))
|
||||
{ // may or maynot be valid, but can continue to build on this rule
|
||||
initRule->CreateRules(m_rules, *this, sentence, global);
|
||||
}
|
||||
|
||||
if (initRule->IsValid(global, sentence.GetTunnelCollection()))
|
||||
{ // add to rule collection
|
||||
m_rules.Add(global, initRule, sentence);
|
||||
}
|
||||
else
|
||||
{
|
||||
delete initRule;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Stack Lattice::GetNonTermNode(const Range &sourceRange) const
|
||||
{
|
||||
Stack ret;
|
||||
size_t sourcePos = sourceRange.GetStartPos();
|
||||
|
||||
const Stack &origStack = GetStack(sourcePos);
|
||||
Stack::const_iterator iter;
|
||||
for (iter = origStack.begin(); iter != origStack.end(); ++iter)
|
||||
{
|
||||
LatticeNode *node = *iter;
|
||||
const Range &nodeRangeS = node->GetSourceRange();
|
||||
|
||||
assert(nodeRangeS.GetStartPos() == sourceRange.GetStartPos());
|
||||
|
||||
if (! node->IsTerminal() && nodeRangeS.GetEndPos() == sourceRange.GetEndPos())
|
||||
{
|
||||
ret.push_back(node);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const Lattice &obj)
|
||||
{
|
||||
std::vector<Stack>::const_iterator iter;
|
||||
for (iter = obj.m_stacks.begin(); iter != obj.m_stacks.end(); ++iter)
|
||||
{
|
||||
const Stack &stack = *iter;
|
||||
|
||||
Stack::const_iterator iterStack;
|
||||
for (iterStack = stack.begin(); iterStack != stack.end(); ++iterStack)
|
||||
{
|
||||
const LatticeNode &node = **iterStack;
|
||||
out << node << " ";
|
||||
}
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
47
contrib/other-builds/extract-mixed-syntax/Lattice.h
Normal file
47
contrib/other-builds/extract-mixed-syntax/Lattice.h
Normal file
@ -0,0 +1,47 @@
|
||||
#pragma once
|
||||
/*
|
||||
* Lattice.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 18/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "RuleCollection.h"
|
||||
|
||||
class Global;
|
||||
class LatticeNode;
|
||||
class Tunnel;
|
||||
class TunnelCollection;
|
||||
class SentenceAlignment;
|
||||
|
||||
typedef std::vector<LatticeNode*> Stack;
|
||||
|
||||
class Lattice
|
||||
{
|
||||
friend std::ostream& operator<<(std::ostream&, const Lattice&);
|
||||
|
||||
std::vector<Stack> m_stacks;
|
||||
RuleCollection m_rules;
|
||||
|
||||
Stack &GetStack(size_t endPos);
|
||||
|
||||
void CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global);
|
||||
|
||||
public:
|
||||
Lattice(size_t sourceSize);
|
||||
~Lattice();
|
||||
|
||||
void CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global);
|
||||
void CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global);
|
||||
|
||||
const Stack &GetStack(size_t startPos) const;
|
||||
const RuleCollection &GetRules() const
|
||||
{ return m_rules; }
|
||||
|
||||
Stack GetNonTermNode(const Range &sourceRange) const;
|
||||
|
||||
};
|
||||
|
149
contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp
Normal file
149
contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp
Normal file
@ -0,0 +1,149 @@
|
||||
/*
|
||||
* LatticeNode.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 18/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <sstream>
|
||||
#include "LatticeNode.h"
|
||||
#include "SyntaxTree.h"
|
||||
#include "Tunnel.h"
|
||||
#include "SentenceAlignment.h"
|
||||
#include "SymbolSequence.h"
|
||||
|
||||
size_t LatticeNode::s_count = 0;
|
||||
|
||||
using namespace std;
|
||||
|
||||
// for terms
|
||||
// for terms
// Terminal node: covers the single source word at position pos.
// Tree/tunnel pointers stay NULL; the word itself is looked up through
// m_sentence when the node is printed.
LatticeNode::LatticeNode(size_t pos, const SentenceAlignment *sentence)
  :m_tunnel(NULL)
  ,m_isTerminal(true)
  ,m_sourceTreeNode(NULL)
  ,m_targetTreeNode(NULL)
  ,m_sentence(sentence)
  ,m_sourceRange(pos, pos) // single-word span [pos, pos]
{
  s_count++;
  //cerr << *this << endl;
}
|
||||
|
||||
// for non-terms
|
||||
// for non-terms
// Non-terminal node: backed by a tunnel (consistently aligned source/target
// span pair) and the syntax-tree nodes supplying its labels. The source
// range is copied from the tunnel's source side (GetRange(0)).
LatticeNode::LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode)
  :m_tunnel(&tunnel)
  ,m_isTerminal(false)
  ,m_sourceTreeNode(sourceTreeNode)
  ,m_targetTreeNode(targetTreeNode)
  ,m_sentence(NULL)
  ,m_sourceRange(tunnel.GetRange(0))
{
  s_count++;
  //cerr << *this << endl;
}
|
||||
|
||||
// True if either side's tree node carries a real syntactic label.
// Only valid for non-terminal nodes (asserted).
bool LatticeNode::IsSyntax() const
{
  assert(!m_isTerminal);
  bool ret = m_sourceTreeNode->IsSyntax() || m_targetTreeNode->IsSyntax();
  return ret;
}
|
||||
|
||||
// Number of output symbols this node contributes. Currently always 1
// regardless of direction (the parameter is unused).
size_t LatticeNode::GetNumSymbols(size_t direction) const
{
  return 1;
}
|
||||
|
||||
// Three-way comparison used to order/deduplicate lattice nodes.
// Ordering: terminals before non-terminals; terminals by source start
// position; non-terminals by tunnel span and then label, checked on each
// side that is syntactic. Returns -1 / 0 / +1.
int LatticeNode::Compare(const LatticeNode &otherNode) const
{
  int ret = 0;
  if (m_isTerminal != otherNode.m_isTerminal)
  {
    // terminals sort before non-terminals
    ret = m_isTerminal ? -1 : 1;
  }

  // both term or non-term
  else if (m_isTerminal)
  { // term. compare source span
    if (m_sourceRange.GetStartPos() == otherNode.m_sourceRange.GetStartPos())
      ret = 0;
    else
      ret = (m_sourceRange.GetStartPos() < otherNode.m_sourceRange.GetStartPos()) ? -1 : +1;
  }
  else
  { // non-term. compare source span and BOTH label
    assert(!m_isTerminal);
    assert(!otherNode.m_isTerminal);

    if (m_sourceTreeNode->IsSyntax())
    {
      // source side: tunnel's source span first, then the source label
      ret = m_tunnel->Compare(*otherNode.m_tunnel, 0);
      if (ret == 0 && m_sourceTreeNode->GetLabel() != otherNode.m_sourceTreeNode->GetLabel())
      {
        ret = (m_sourceTreeNode->GetLabel() < otherNode.m_sourceTreeNode->GetLabel()) ? -1 : +1;
      }
    }

    if (ret == 0 && m_targetTreeNode->IsSyntax())
    {
      // target side: tunnel's target span, then the target label
      ret = m_tunnel->Compare(*otherNode.m_tunnel, 1);
      if (ret == 0 && m_targetTreeNode->GetLabel() != otherNode.m_targetTreeNode->GetLabel())
      {
        ret = (m_targetTreeNode->GetLabel() < otherNode.m_targetTreeNode->GetLabel()) ? -1 : +1;
      }
    }
  }

  return ret;
}
|
||||
|
||||
// Append this node's symbol(s) to `symbols`. Terminals currently contribute
// nothing (that branch is commented out); non-terminals emit a single joint
// symbol carrying both labels and both spans. The direction parameter is
// only consumed by the disabled terminal branch.
void LatticeNode::CreateSymbols(size_t direction, SymbolSequence &symbols) const
{
  if (m_isTerminal)
  {
    /*
    const std::vector<std::string> &words = (direction == 0 ? m_sentence->source : m_sentence->target);
    size_t startPos = m_tunnel.GetStart(direction)
          ,endPos = m_tunnel.GetEnd(direction);

    for (size_t pos = startPos; pos <= endPos; ++pos)
    {
      Symbol symbol(words[pos], pos);
      symbols.Add(symbol);
    }
    */
  }
  else
  { // output both
    // joint non-terminal: source/target labels plus both side's spans and
    // whether each side is a real syntactic label
    Symbol symbol(m_sourceTreeNode->GetLabel(), m_targetTreeNode->GetLabel()
                  , m_tunnel->GetRange(0).GetStartPos(), m_tunnel->GetRange(0).GetEndPos()
                  , m_tunnel->GetRange(1).GetStartPos(), m_tunnel->GetRange(1).GetEndPos()
                  , m_sourceTreeNode->IsSyntax(), m_targetTreeNode->IsSyntax());

    symbols.Add(symbol);
  }

}
|
||||
|
||||
// Debug print: "[range]=word" for terminals, "<tunnel>=labels " for non-terms.
std::ostream& operator<<(std::ostream &out, const LatticeNode &obj)
{
  if (obj.m_isTerminal)
  {
    // terminals always span exactly one source word
    assert(obj.m_sourceRange.GetWidth() == 1);
    size_t pos = obj.m_sourceRange.GetStartPos();

    const SentenceAlignment &sentence = *obj.m_sentence;
    out << obj.m_sourceRange << "=" << sentence.source[pos];
  }
  else
  {
    assert(obj.m_tunnel);
    out << obj.GetTunnel() << "=" << obj.m_sourceTreeNode->GetLabel() << obj.m_targetTreeNode->GetLabel() << " ";
  }

  return out;
}
|
||||
|
||||
|
77
contrib/other-builds/extract-mixed-syntax/LatticeNode.h
Normal file
77
contrib/other-builds/extract-mixed-syntax/LatticeNode.h
Normal file
@ -0,0 +1,77 @@
|
||||
#pragma once
/*
 * LatticeNode.h
 * extract
 *
 * Created by Hieu Hoang on 18/07/2010.
 * Copyright 2010 __MyCompanyName__. All rights reserved.
 *
 */
#include <vector>
#include <iostream>
#include <cassert>
#include "Range.h"

class Tunnel;
class SyntaxNode;
class SentenceAlignment;
class SymbolSequence;

// A node in the extraction lattice: either a single source terminal word,
// or a non-terminal backed by a Tunnel (a consistently aligned span pair)
// and labelled by source/target syntax-tree nodes.
class LatticeNode
{
  friend std::ostream& operator<<(std::ostream&, const LatticeNode&);

  bool m_isTerminal;

  // for terms & non-term
  Range m_sourceRange;

  // non-terms. source range should be same as m_sourceRange
  const Tunnel *m_tunnel;

public:
  // Count of LatticeNode instances ever constructed (debug/statistics).
  static size_t s_count;


  // Label-providing tree nodes; NULL for terminal nodes.
  const SyntaxNode *m_sourceTreeNode, *m_targetTreeNode;
  // Sentence the terminal word is read from; NULL for non-terminals.
  const SentenceAlignment *m_sentence;

  // for terms
  LatticeNode(size_t pos, const SentenceAlignment *sentence);

  // for non-terms
  LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode);

  bool IsTerminal() const
  { return m_isTerminal; }

  // Non-terminals only: true if either side has a real syntactic label.
  bool IsSyntax() const;

  // Currently always returns 1; direction is unused (see LatticeNode.cpp).
  size_t GetNumSymbols(size_t direction) const;

  // NOTE(review): declared but no definition is visible in this file set;
  // also relies on <string> being pulled in transitively — confirm.
  std::string ToString() const;

  // Three-way ordering: terminals first, then by span/labels.
  int Compare(const LatticeNode &otherNode) const;

  // Append this node's symbol(s) to `symbols`.
  void CreateSymbols(size_t direction, SymbolSequence &symbols) const;

  const Tunnel &GetTunnel() const
  {
    assert(m_tunnel); // non-terminals only
    return *m_tunnel;
  }

  const Range &GetSourceRange() const
  {
    return m_sourceRange;
  }
  // direction: 0 = source tree node, 1 = target tree node.
  const SyntaxNode &GetSyntaxNode(size_t direction) const
  {
    const SyntaxNode *node = direction == 0 ? m_sourceTreeNode : m_targetTreeNode;
    assert(node);
    return *node;
  }

};
|
||||
|
13
contrib/other-builds/extract-mixed-syntax/Makefile
Normal file
13
contrib/other-builds/extract-mixed-syntax/Makefile
Normal file
@ -0,0 +1,13 @@
|
||||
# Build the extract-mixed-syntax binary.
#
# BUG FIX: the old 'extract' target produced a file named
# 'extract-mixed-syntax', so make could never find an up-to-date target and
# relinked on every invocation. The real output file is now its own target
# and 'extract' is kept as a phony alias for backward compatibility.

OBJS = tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o

.PHONY: all clean extract

all: extract

extract: extract-mixed-syntax

clean:
	rm -f *.o extract-mixed-syntax

.cpp.o:
	g++ -O6 -g -c $<

extract-mixed-syntax: $(OBJS)
	g++ $(OBJS) -lz -lboost_iostreams-mt -o extract-mixed-syntax
|
||||
|
||||
|
@ -0,0 +1,79 @@
|
||||
// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include <boost/iostreams/filter/gzip.hpp>
|
||||
#include "OutputFileStream.h"
|
||||
#include "gzfilebuf.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
OutputFileStream::OutputFileStream()
|
||||
:boost::iostreams::filtering_ostream()
|
||||
,m_outFile(NULL)
|
||||
{
|
||||
}
|
||||
|
||||
OutputFileStream::OutputFileStream(const std::string &filePath)
|
||||
: m_outFile(NULL)
|
||||
{
|
||||
Open(filePath);
|
||||
}
|
||||
|
||||
OutputFileStream::~OutputFileStream()
|
||||
{
|
||||
Close();
|
||||
}
|
||||
|
||||
bool OutputFileStream::Open(const std::string &filePath)
|
||||
{
|
||||
m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
|
||||
if (m_outFile->fail()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
|
||||
this->push(boost::iostreams::gzip_compressor());
|
||||
}
|
||||
this->push(*m_outFile);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void OutputFileStream::Close()
|
||||
{
|
||||
if (m_outFile == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
this->flush();
|
||||
this->pop(); // file
|
||||
|
||||
m_outFile->close();
|
||||
delete m_outFile;
|
||||
m_outFile = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
50
contrib/other-builds/extract-mixed-syntax/OutputFileStream.h
Normal file
50
contrib/other-builds/extract-mixed-syntax/OutputFileStream.h
Normal file
@ -0,0 +1,50 @@
|
||||
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once

#include <cstdlib>
#include <fstream>
#include <string>
#include <iostream>
#include <boost/iostreams/filtering_stream.hpp>

namespace Moses
{

/** Used in place of std::ostream; transparently writes gzip-compressed
 *  output if the file name ends in .gz
 */
class OutputFileStream : public boost::iostreams::filtering_ostream
{
protected:
  std::ofstream *m_outFile; // owned; created by Open(), released by Close()
public:
  // Empty stream; call Open() before writing.
  OutputFileStream();

  // Opens filePath immediately (see Open()).
  OutputFileStream(const std::string &filePath);
  virtual ~OutputFileStream();

  // Opens the file for binary output, pushing a gzip compressor first when
  // the path ends in ".gz". Returns false if the file cannot be opened.
  bool Open(const std::string &filePath);
  // Flushes, pops and closes the underlying file; safe when nothing is open.
  void Close();
};

}
|
||||
|
74
contrib/other-builds/extract-mixed-syntax/Range.cpp
Normal file
74
contrib/other-builds/extract-mixed-syntax/Range.cpp
Normal file
@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Range.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 22/02/2011.
|
||||
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "Range.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Set this range to the union of a and b: [min(starts), max(ends)],
// where a NOT_FOUND endpoint counts as "missing" and the other range's
// endpoint is taken unconditionally.
void Range::Merge(const Range &a, const Range &b)
{
  if (a.m_startPos == NOT_FOUND)
  { // get the other regardless
    m_startPos = b.m_startPos;
  }
  else if (b.m_startPos == NOT_FOUND)
  {
    m_startPos = a.m_startPos;
  }
  else
  {
    m_startPos = min(a.m_startPos, b.m_startPos);
  }

  if (a.m_endPos == NOT_FOUND)
  { // get the other regardless
    m_endPos = b.m_endPos;
  }
  else if (b.m_endPos == NOT_FOUND)
  { // b has no end; take a's
    m_endPos = a.m_endPos;
  }
  else
  {
    m_endPos = max(a.m_endPos, b.m_endPos);
  }


}
|
||||
|
||||
int Range::Compare(const Range &other) const
|
||||
{
|
||||
if (m_startPos < other.m_startPos)
|
||||
return -1;
|
||||
else if (m_startPos > other.m_startPos)
|
||||
return +1;
|
||||
else if (m_endPos < other.m_endPos)
|
||||
return -1;
|
||||
else if (m_endPos > other.m_endPos)
|
||||
return +1;
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
bool Range::Overlap(const Range &other) const
|
||||
{
|
||||
if ( other.m_endPos < m_startPos || other.m_startPos > m_endPos)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const Range &range)
|
||||
{
|
||||
out << "[" << range.m_startPos << "-" << range.m_endPos << "]";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
57
contrib/other-builds/extract-mixed-syntax/Range.h
Normal file
57
contrib/other-builds/extract-mixed-syntax/Range.h
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Range.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 22/02/2011.
|
||||
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#pragma once
#include <string>
#include <iostream>
#include <limits>

// Sentinel for "position not set". Kept as a macro (not constexpr) because
// other translation units (e.g. Rule.h) use the macro name directly.
#define NOT_FOUND std::numeric_limits<size_t>::max()

// Closed interval [startPos, endPos] of word positions in a sentence.
// A default-constructed Range has both ends set to NOT_FOUND.
class Range
{
  friend std::ostream& operator<<(std::ostream&, const Range&);

  size_t m_startPos, m_endPos;
public:

  // Unset range: both endpoints NOT_FOUND.
  Range()
    :m_startPos(NOT_FOUND)
    ,m_endPos(NOT_FOUND)
  {}

  // NOTE: the hand-written copy constructor was removed — it performed the
  // exact memberwise copy the compiler generates (Rule of Zero), so
  // behaviour is unchanged for all callers.

  Range(size_t startPos, size_t endPos)
    :m_startPos(startPos)
    ,m_endPos(endPos)
  {}

  size_t GetStartPos() const
  { return m_startPos; }
  size_t GetEndPos() const
  { return m_endPos; }
  // Number of positions covered; only meaningful when both ends are set.
  size_t GetWidth() const
  { return m_endPos - m_startPos + 1; }

  void SetStartPos(size_t startPos)
  { m_startPos = startPos; }
  void SetEndPos(size_t endPos)
  { m_endPos = endPos; }

  // Union of a and b, treating NOT_FOUND endpoints as missing (Range.cpp).
  void Merge(const Range &a, const Range &b);

  // Lexicographic (start, end) three-way comparison: -1/0/+1 (Range.cpp).
  int Compare(const Range &other) const;

  // Closed-interval overlap test (Range.cpp).
  bool Overlap(const Range &other) const;


};
|
594
contrib/other-builds/extract-mixed-syntax/Rule.cpp
Normal file
594
contrib/other-builds/extract-mixed-syntax/Rule.cpp
Normal file
@ -0,0 +1,594 @@
|
||||
/*
|
||||
* Rule.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include "Rule.h"
|
||||
#include "Global.h"
|
||||
#include "LatticeNode.h"
|
||||
#include "Lattice.h"
|
||||
#include "SentenceAlignment.h"
|
||||
#include "Tunnel.h"
|
||||
#include "TunnelCollection.h"
|
||||
#include "RuleCollection.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Copy: shares the (non-owned) lattice node pointer and copies the
// alignment position pair.
RuleElement::RuleElement(const RuleElement &copy)
  :m_latticeNode(copy.m_latticeNode)
  ,m_alignmentPos(copy.m_alignmentPos)
{
}
|
||||
|
||||
|
||||
// Start a new (LHS-less) rule consisting of the single given lattice node.
Rule::Rule(const LatticeNode *latticeNode)
  :m_lhs(NULL)
{
  m_coll.push_back(RuleElement(*latticeNode));
}
|
||||
|
||||
// Extend prevRule by appending one more lattice node (LHS still unset).
Rule::Rule(const Rule &prevRule, const LatticeNode *latticeNode)
  :m_coll(prevRule.m_coll)
  ,m_lhs(NULL)
{
  m_coll.push_back(RuleElement(*latticeNode));
}
||||
|
||||
// Finalise a rule: copy the elements/symbols gathered so far, attach the
// LHS node, and build the source & target symbol sequences. isValid is an
// out-parameter set by CreateSymbols (false when the rule violates the
// extraction constraints, e.g. non-terms outside the LHS target span).
Rule::Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence)
  :m_coll(copy.m_coll)
  ,m_source(copy.m_source)
  ,m_target(copy.m_target)
  ,m_lhs(lhs)
{
  CreateSymbols(global, isValid, sentence);
}
||||
|
||||
// Nothing to release: the lattice nodes and m_lhs are raw pointers the rule
// does not own.
Rule::~Rule()
{
}
|
||||
|
||||
// helper for sort: orders rule elements by the end of their target-side
// tunnel span (GetRange(1)). Used to lay out non-terminals in target order.
struct CompareLatticeNodeTarget
{
  bool operator() (const RuleElement *a, const RuleElement *b)
  {
    const Range &rangeA = a->GetLatticeNode().GetTunnel().GetRange(1)
                ,&rangeB = b->GetLatticeNode().GetTunnel().GetRange(1);
    return rangeA.GetEndPos() < rangeB.GetEndPos();
  }
};
|
||||
|
||||
// Build the source and target symbol sequences for this rule.
// Source: walk the elements in order, emitting a word symbol per terminal
// and a joint non-terminal symbol otherwise (recording each non-terminal's
// source index in m_alignmentPos.first).
// Target: walk the LHS target span; whenever a non-terminal's target span
// starts, emit its symbol and skip to the span's end, otherwise emit the
// target word. Sets isValid=false when a non-terminal's target span falls
// outside the LHS target span or the target exceeds global.maxSymbols.
void Rule::CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence)
{
  vector<RuleElement*> nonTerms;

  // source
  for (size_t ind = 0; ind < m_coll.size(); ++ind)
  {
    RuleElement &element = m_coll[ind];
    const LatticeNode &node = element.GetLatticeNode();
    if (node.IsTerminal())
    {
      // terminal: one source word
      size_t sourcePos = node.GetSourceRange().GetStartPos();
      const string &word = sentence.source[sourcePos];
      Symbol symbol(word, sourcePos);
      m_source.Add(symbol);
    }
    else
    { // non-term
      const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
      const string &targetWord = node.GetSyntaxNode(1).GetLabel();
      Symbol symbol(sourceWord, targetWord
                    , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
                    , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
                    , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
      m_source.Add(symbol);

      // store current pos within phrase
      element.m_alignmentPos.first = ind;

      // for target symbols
      nonTerms.push_back(&element);
    }

  }

  // target
  isValid = true;

  const Range &lhsTargetRange = m_lhs->GetTunnel().GetRange(1);

  // check spans of target non-terms
  if (nonTerms.size())
  {
    // sort non-term rules elements by target range
    std::sort(nonTerms.begin(), nonTerms.end(), CompareLatticeNodeTarget());

    const Range &first = nonTerms.front()->GetLatticeNode().GetTunnel().GetRange(1);
    const Range &last = nonTerms.back()->GetLatticeNode().GetTunnel().GetRange(1);

    // every non-terminal's target span must lie inside the LHS target span
    if (first.GetStartPos() < lhsTargetRange.GetStartPos()
        || last.GetEndPos() > lhsTargetRange.GetEndPos())
    {
      isValid = false;
    }
  }

  if (isValid)
  {
    size_t indNonTerm = 0;
    RuleElement *currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
    for (size_t targetPos = lhsTargetRange.GetStartPos(); targetPos <= lhsTargetRange.GetEndPos(); ++targetPos)
    {
      if (currNonTermElement && targetPos == currNonTermElement->GetLatticeNode().GetTunnel().GetRange(1).GetStartPos())
      { // start of a non-term. print out non-terms & skip to the end

        const LatticeNode &node = currNonTermElement->GetLatticeNode();

        const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
        const string &targetWord = node.GetSyntaxNode(1).GetLabel();
        Symbol symbol(sourceWord, targetWord
                      , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
                      , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
                      , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
        m_target.Add(symbol);

        // store current pos within phrase
        currNonTermElement->m_alignmentPos.second = m_target.GetSize() - 1;

        // the source index must already have been recorded above
        assert(currNonTermElement->m_alignmentPos.first != NOT_FOUND);

        // jump past this non-terminal's target span
        targetPos = node.GetTunnel().GetRange(1).GetEndPos();
        indNonTerm++;
        currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
      }
      else
      { // term
        const string &word = sentence.target[targetPos];

        Symbol symbol(word, targetPos);
        m_target.Add(symbol);

      }
    }

    // all non-terminals must have been consumed by the walk
    assert(indNonTerm == nonTerms.size());

    if (m_target.GetSize() > global.maxSymbols) {
      isValid = false;
      //cerr << "m_source=" << m_source.GetSize() << ":" << m_source << endl;
      //cerr << "m_target=" << m_target.GetSize() << ":" << m_target << endl;
    }
  }
}
|
||||
|
||||
bool Rule::MoreDefaultNonTermThanTerm() const
|
||||
{
|
||||
size_t numTerm = 0, numDefaultNonTerm = 0;
|
||||
|
||||
CollType::const_iterator iter;
|
||||
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
|
||||
{
|
||||
const RuleElement &element = *iter;
|
||||
const LatticeNode &node = element.GetLatticeNode();
|
||||
if (node.IsTerminal())
|
||||
{
|
||||
++numTerm;
|
||||
}
|
||||
else if (!node.IsSyntax())
|
||||
{
|
||||
++numDefaultNonTerm;
|
||||
}
|
||||
}
|
||||
|
||||
bool ret = numDefaultNonTerm > numTerm;
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool Rule::SourceHasEdgeDefaultNonTerm() const
|
||||
{
|
||||
assert(m_coll.size());
|
||||
const LatticeNode &first = m_coll.front().GetLatticeNode();
|
||||
const LatticeNode &last = m_coll.back().GetLatticeNode();
|
||||
|
||||
// 1st
|
||||
if (!first.IsTerminal() && !first.IsSyntax())
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if (!last.IsTerminal() && !last.IsSyntax())
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Full validity check applied before a rule is added to the collection.
// A rule is rejected when it: is a single non-terminal; has more default
// (non-syntactic) non-terminals than terminals; starts/ends with a default
// non-terminal while that is disallowed; exceeds the symbol limit; contains
// adjacent default non-terminals; does not span a consistently-aligned hole;
// or has overlapping target spans among its non-terminals.
bool Rule::IsValid(const Global &global, const TunnelCollection &tunnelColl) const
{
  if (m_coll.size() == 1 && !m_coll[0].GetLatticeNode().IsTerminal()) // a lone non-terminal is useless
  {
    return false;
  }

  if (MoreDefaultNonTermThanTerm())
  { // must have at least as many terms as non-syntax non-terms
    return false;
  }

  if (!global.allowDefaultNonTermEdge && SourceHasEdgeDefaultNonTerm())
  {
    return false;
  }

  if (GetNumSymbols() > global.maxSymbols)
  {
    return false;
  }

  if (AdjacentDefaultNonTerms())
  {
    return false;
  }

  if (!IsHole(tunnelColl))
  {
    return false;
  }

  if (NonTermOverlap())
  {
    return false;
  }

  /*
  std::pair<size_t, size_t> spanS = GetSpan(0)
        ,spanT= GetSpan(1);

  if (tunnelColl.NumUnalignedWord(0, spanS.first, spanS.second) >= global.maxUnaligned)
    return false;
  if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
    return false;
  */

  return true;
}
|
||||
|
||||
bool Rule::NonTermOverlap() const
|
||||
{
|
||||
vector<Range> ranges;
|
||||
|
||||
CollType::const_iterator iter;
|
||||
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
|
||||
{
|
||||
const RuleElement &element = *iter;
|
||||
if (!element.GetLatticeNode().IsTerminal())
|
||||
{
|
||||
const Range &range = element.GetLatticeNode().GetTunnel().GetRange(1);
|
||||
ranges.push_back(range);
|
||||
}
|
||||
}
|
||||
|
||||
vector<Range>::const_iterator outerIter;
|
||||
for (outerIter = ranges.begin(); outerIter != ranges.end(); ++outerIter)
|
||||
{
|
||||
const Range &outer = *outerIter;
|
||||
vector<Range>::const_iterator innerIter;
|
||||
for (innerIter = outerIter + 1; innerIter != ranges.end(); ++innerIter)
|
||||
{
|
||||
const Range &inner = *innerIter;
|
||||
if (outer.Overlap(inner))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Source span covered by the whole rule: from the first element's start
// position to the last element's end position.
Range Rule::GetSourceRange() const
{
  assert(m_coll.size());
  size_t startPos = m_coll.front().GetLatticeNode().GetSourceRange().GetStartPos();
  size_t endPos = m_coll.back().GetLatticeNode().GetSourceRange().GetEndPos();
  return Range(startPos, endPos);
}
|
||||
|
||||
|
||||
bool Rule::IsHole(const TunnelCollection &tunnelColl) const
|
||||
{
|
||||
const Range &spanS = GetSourceRange();
|
||||
const TunnelList &tunnels = tunnelColl.GetTunnels(spanS.GetStartPos(), spanS.GetEndPos());
|
||||
|
||||
bool ret = tunnels.size() > 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
// Whether this (possibly partial) rule may be extended with further lattice
// nodes. Cheaper than IsValid: it checks only the constraints that adding
// more symbols can never repair (size limits, adjacency, overlap,
// unaligned-word budget on the source span).
bool Rule::CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const
{
  if (GetNumSymbols() >= global.maxSymbols)
    return false;
  if (AdjacentDefaultNonTerms())
    return false;
  if (MaxNonTerm(global))
    return false;
  if (NonTermOverlap())
  {
    return false;
  }

  const Range spanS = GetSourceRange();

  // too many unaligned source words inside the span
  if (tunnelColl.NumUnalignedWord(0, spanS.GetStartPos(), spanS.GetEndPos()) >= global.maxUnaligned)
    return false;
  // if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
  // return false;


  return true;
}
|
||||
|
||||
bool Rule::MaxNonTerm(const Global &global) const
|
||||
{
|
||||
//cerr << *this << endl;
|
||||
size_t numNonTerm = 0, numNonTermDefault = 0;
|
||||
|
||||
CollType::const_iterator iter;
|
||||
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
|
||||
{
|
||||
const LatticeNode *node = &(*iter).GetLatticeNode();
|
||||
if (!node->IsTerminal() )
|
||||
{
|
||||
numNonTerm++;
|
||||
if (!node->IsSyntax())
|
||||
{
|
||||
numNonTermDefault++;
|
||||
}
|
||||
if (numNonTerm >= global.maxNonTerm || numNonTermDefault >= global.maxNonTermDefault)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool Rule::AdjacentDefaultNonTerms() const
|
||||
{
|
||||
assert(m_coll.size() > 0);
|
||||
|
||||
const LatticeNode *prevNode = &m_coll.front().GetLatticeNode();
|
||||
CollType::const_iterator iter;
|
||||
for (iter = m_coll.begin() + 1; iter != m_coll.end(); ++iter)
|
||||
{
|
||||
const LatticeNode *node = &(*iter).GetLatticeNode();
|
||||
if (!prevNode->IsTerminal() && !node->IsTerminal() && !prevNode->IsSyntax() && !node->IsSyntax() )
|
||||
{
|
||||
return true;
|
||||
}
|
||||
prevNode = node;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Number of symbols (terminals + non-terminals) currently in the rule.
size_t Rule::GetNumSymbols() const
{
  return m_coll.size();
}
|
||||
|
||||
// Recursively extend this rule with every lattice node that starts exactly
// where the rule currently ends. Extensions that can still grow are
// recursed into; extensions that form a valid rule are handed to `rules`
// via Add (which receives the heap pointer — presumably taking ownership;
// invalid ones are deleted here).
void Rule::CreateRules(RuleCollection &rules
                       , const Lattice &lattice
                       , const SentenceAlignment &sentence
                       , const Global &global)
{
  assert(m_coll.size() > 0);
  const LatticeNode *latticeNode = &m_coll.back().GetLatticeNode();
  // the next node must begin one past our current source end
  size_t endPos = latticeNode->GetSourceRange().GetEndPos() + 1;

  const Stack &stack = lattice.GetStack(endPos);

  Stack::const_iterator iter;
  for (iter = stack.begin(); iter != stack.end(); ++iter)
  {
    const LatticeNode *newLatticeNode = *iter;
    Rule *newRule = new Rule(*this, newLatticeNode);
    //cerr << *newRule << endl;

    if (newRule->CanRecurse(global, sentence.GetTunnelCollection()))
    { // may or maynot be valid, but can continue to build on this rule
      newRule->CreateRules(rules, lattice, sentence, global);
    }

    if (newRule->IsValid(global, sentence.GetTunnelCollection()))
    { // add to rule collection
      rules.Add(global, newRule, sentence);
    }
    else
    {
      delete newRule;
    }

  }
}
|
||||
|
||||
bool Rule::operator<(const Rule &compare) const
|
||||
{
|
||||
/*
|
||||
if (g_debug)
|
||||
{
|
||||
cerr << *this << endl << compare;
|
||||
cerr << endl;
|
||||
}
|
||||
*/
|
||||
|
||||
bool ret = Compare(compare) < 0;
|
||||
|
||||
/*
|
||||
if (g_debug)
|
||||
{
|
||||
cerr << *this << endl << compare << endl << ret << endl << endl;
|
||||
}
|
||||
*/
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Three-way ordering over rules: source symbols first, then target symbols,
// then the LHS source label, then the LHS target label. Must only be called
// on fully-built rules (non-empty symbol sequences, asserted).
int Rule::Compare(const Rule &compare) const
{
  //cerr << *this << endl << compare << endl;
  assert(m_coll.size() > 0);
  assert(m_source.GetSize() > 0);
  assert(m_target.GetSize() > 0);

  int ret = 0;

  // compare each fragment
  ret = m_source.Compare(compare.m_source);
  if (ret != 0)
  {
    return ret;
  }

  ret = m_target.Compare(compare.m_target);
  if (ret != 0)
  {
    return ret;
  }

  // compare lhs
  const string &thisSourceLabel = m_lhs->GetSyntaxNode(0).GetLabel();
  const string &otherSourceLabel = compare.m_lhs->GetSyntaxNode(0).GetLabel();
  if (thisSourceLabel != otherSourceLabel)
  {
    ret = (thisSourceLabel < otherSourceLabel) ? -1 : +1;
    return ret;
  }

  const string &thisTargetLabel = m_lhs->GetSyntaxNode(1).GetLabel();
  const string &otherTargetLabel = compare.m_lhs->GetSyntaxNode(1).GetLabel();
  if (thisTargetLabel != otherTargetLabel)
  {
    ret = (thisTargetLabel < otherTargetLabel) ? -1 : +1;
    return ret;
  }

  // all components equal
  assert(ret == 0);
  return ret;
}
|
||||
|
||||
|
||||
// Bounds-checked access to the lattice node of the ind'th rule element.
const LatticeNode &Rule::GetLatticeNode(size_t ind) const
{
  assert(ind < m_coll.size());
  const RuleElement &element = m_coll[ind];
  return element.GetLatticeNode();
}
|
||||
|
||||
// Convenience wrapper: print the rule (Output format) to stderr.
void Rule::DebugOutput() const
{
  Output(cerr);
}
|
||||
|
||||
// Write the rule as: "<source syms> LHS ||| <target syms> LHS ||| <align> ||| 1"
// where <align> lists each non-terminal as sourceIdx-targetIdx; terminals
// contribute no alignment entries.
void Rule::Output(std::ostream &out) const
{

  stringstream strmeS, strmeT;

  // source then target symbol strings, space-separated
  std::vector<Symbol>::const_iterator iterSymbol;
  for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeS << symbol << " ";
  }

  for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeT << symbol << " ";
  }

  // lhs
  if (m_lhs)
  {
    strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
    strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
  }

  out << strmeS.str() << " ||| " << strmeT.str() << " ||| ";

  // alignment
  Rule::CollType::const_iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
  {
    const RuleElement &element = *iter;
    const LatticeNode &node = element.GetLatticeNode();
    bool isTerminal = node.IsTerminal();

    if (!isTerminal)
    {
      out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " ";
    }
  }

  // constant count field
  out << "||| 1";

}
|
||||
|
||||
// Same as Output() but with the sides swapped: target phrase first, source
// phrase second, and each alignment pair printed targetIdx-sourceIdx.
void Rule::OutputInv(std::ostream &out) const
{
  stringstream strmeS, strmeT;

  // source then target symbol strings, space-separated
  std::vector<Symbol>::const_iterator iterSymbol;
  for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeS << symbol << " ";
  }

  for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeT << symbol << " ";
  }

  // lhs
  if (m_lhs)
  {
    strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
    strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
  }

  // target side first (inverted)
  out << strmeT.str() << " ||| " << strmeS.str() << " ||| ";

  // alignment
  Rule::CollType::const_iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
  {
    const RuleElement &element = *iter;
    const LatticeNode &node = element.GetLatticeNode();
    bool isTerminal = node.IsTerminal();

    if (!isTerminal)
    {
      // inverted pair: target index first
      out << element.m_alignmentPos.second << "-" << element.m_alignmentPos.first << " ";
    }
  }

  out << "||| 1";

}
|
||||
|
||||
|
96
contrib/other-builds/extract-mixed-syntax/Rule.h
Normal file
96
contrib/other-builds/extract-mixed-syntax/Rule.h
Normal file
@ -0,0 +1,96 @@
|
||||
#pragma once
|
||||
/*
|
||||
* Rule.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include "LatticeNode.h"
|
||||
#include "SymbolSequence.h"
|
||||
#include "Global.h"
|
||||
|
||||
class Lattice;
|
||||
class SentenceAlignment;
|
||||
class Global;
|
||||
class RuleCollection;
|
||||
class SyntaxNode;
|
||||
class TunnelCollection;
|
||||
class Range;
|
||||
|
||||
class RuleElement
|
||||
{
|
||||
protected:
|
||||
const LatticeNode *m_latticeNode;
|
||||
public:
|
||||
std::pair<size_t, size_t> m_alignmentPos;
|
||||
|
||||
RuleElement(const RuleElement ©);
|
||||
RuleElement(const LatticeNode &latticeNode)
|
||||
:m_latticeNode(&latticeNode)
|
||||
,m_alignmentPos(NOT_FOUND, NOT_FOUND)
|
||||
{}
|
||||
|
||||
const LatticeNode &GetLatticeNode() const
|
||||
{ return *m_latticeNode; }
|
||||
|
||||
};
|
||||
|
||||
class Rule
|
||||
{
|
||||
protected:
|
||||
typedef std::vector<RuleElement> CollType;
|
||||
CollType m_coll;
|
||||
|
||||
const LatticeNode *m_lhs;
|
||||
SymbolSequence m_source, m_target;
|
||||
|
||||
bool IsHole(const TunnelCollection &tunnelColl) const;
|
||||
bool NonTermOverlap() const;
|
||||
|
||||
const LatticeNode &GetLatticeNode(size_t ind) const;
|
||||
void CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence);
|
||||
|
||||
public:
|
||||
// init
|
||||
Rule(const LatticeNode *latticeNode);
|
||||
|
||||
// create new rule by appending node to prev rule
|
||||
Rule(const Rule &prevRule, const LatticeNode *latticeNode);
|
||||
|
||||
// create copy with lhs
|
||||
Rule(const Global &global, bool &isValid, const Rule ©, const LatticeNode *lhs, const SentenceAlignment &sentence);
|
||||
|
||||
// can continue to add to this rule
|
||||
bool CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const;
|
||||
|
||||
virtual ~Rule();
|
||||
|
||||
// can add this to the set of rules
|
||||
bool IsValid(const Global &global, const TunnelCollection &tunnelColl) const;
|
||||
|
||||
size_t GetNumSymbols() const;
|
||||
bool AdjacentDefaultNonTerms() const;
|
||||
bool MaxNonTerm(const Global &global) const;
|
||||
bool MoreDefaultNonTermThanTerm() const;
|
||||
bool SourceHasEdgeDefaultNonTerm() const;
|
||||
|
||||
void CreateRules(RuleCollection &rules
|
||||
, const Lattice &lattice
|
||||
, const SentenceAlignment &sentence
|
||||
, const Global &global);
|
||||
|
||||
int Compare(const Rule &compare) const;
|
||||
bool operator<(const Rule &compare) const;
|
||||
|
||||
Range GetSourceRange() const;
|
||||
|
||||
DEBUG_OUTPUT();
|
||||
|
||||
void Output(std::ostream &out) const;
|
||||
void OutputInv(std::ostream &out) const;
|
||||
|
||||
};
|
102
contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp
Normal file
102
contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp
Normal file
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* RuleCollection.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include "RuleCollection.h"
|
||||
#include "Rule.h"
|
||||
#include "SentenceAlignment.h"
|
||||
#include "tables-core.h"
|
||||
#include "Lattice.h"
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
RuleCollection::~RuleCollection()
|
||||
{
|
||||
RemoveAllInColl(m_coll);
|
||||
}
|
||||
|
||||
void RuleCollection::Add(const Global &global, Rule *rule, const SentenceAlignment &sentence)
|
||||
{
|
||||
Range spanS = rule->GetSourceRange();
|
||||
|
||||
// cartesian product of lhs
|
||||
Stack nontermNodes = sentence.GetLattice().GetNonTermNode(spanS);
|
||||
Stack::const_iterator iterStack;
|
||||
for (iterStack = nontermNodes.begin(); iterStack != nontermNodes.end(); ++iterStack)
|
||||
{
|
||||
const LatticeNode &node = **iterStack;
|
||||
assert(!node.IsTerminal());
|
||||
|
||||
bool isValid;
|
||||
// create rules with LHS
|
||||
//cerr << "old:" << *rule << endl;
|
||||
Rule *newRule = new Rule(global, isValid, *rule, &node, sentence);
|
||||
|
||||
if (!isValid)
|
||||
{ // lhs doesn't match non-term spans
|
||||
delete newRule;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
stringstream s;
|
||||
s << *newRule;
|
||||
if (s.str().find("Wiederaufnahme der [X] ||| resumption of the [X] ||| ||| 1") == 0)
|
||||
{
|
||||
cerr << "READY:" << *newRule << endl;
|
||||
g_debug = true;
|
||||
}
|
||||
else {
|
||||
g_debug = false;
|
||||
}
|
||||
*/
|
||||
|
||||
typedef set<const Rule*, CompareRule>::iterator Iterator;
|
||||
pair<Iterator,bool> ret = m_coll.insert(newRule);
|
||||
|
||||
if (ret.second)
|
||||
{
|
||||
//cerr << "ACCEPTED:" << *newRule << endl;
|
||||
//cerr << "";
|
||||
}
|
||||
else
|
||||
{
|
||||
//cerr << "REJECTED:" << *newRule << endl;
|
||||
delete newRule;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
delete rule;
|
||||
|
||||
}
|
||||
|
||||
void RuleCollection::Output(std::ostream &out) const
|
||||
{
|
||||
RuleCollection::CollType::const_iterator iter;
|
||||
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
|
||||
{
|
||||
const Rule &rule = **iter;
|
||||
rule.Output(out);
|
||||
out << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void RuleCollection::OutputInv(std::ostream &out) const
|
||||
{
|
||||
RuleCollection::CollType::const_iterator iter;
|
||||
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
|
||||
{
|
||||
const Rule &rule = **iter;
|
||||
rule.OutputInv(out);
|
||||
out << endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
55
contrib/other-builds/extract-mixed-syntax/RuleCollection.h
Normal file
55
contrib/other-builds/extract-mixed-syntax/RuleCollection.h
Normal file
@ -0,0 +1,55 @@
|
||||
#pragma once
|
||||
/*
|
||||
* RuleCollection.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <set>
|
||||
#include <iostream>
|
||||
#include "Rule.h"
|
||||
|
||||
class SentenceAlignment;
|
||||
|
||||
// helper for sort. Don't compare default non-terminals
|
||||
struct CompareRule
|
||||
{
|
||||
bool operator() (const Rule *a, const Rule *b)
|
||||
{
|
||||
/*
|
||||
if (g_debug)
|
||||
{
|
||||
std::cerr << std::endl << (*a) << std::endl << (*b) << " ";
|
||||
}
|
||||
*/
|
||||
bool ret = (*a) < (*b);
|
||||
/*
|
||||
if (g_debug)
|
||||
{
|
||||
std::cerr << ret << std::endl;
|
||||
}
|
||||
*/
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class RuleCollection
|
||||
{
|
||||
protected:
|
||||
typedef std::set<const Rule*, CompareRule> CollType;
|
||||
CollType m_coll;
|
||||
|
||||
public:
|
||||
~RuleCollection();
|
||||
void Add(const Global &global, Rule *rule, const SentenceAlignment &sentence);
|
||||
size_t GetSize() const
|
||||
{ return m_coll.size(); }
|
||||
|
||||
void Output(std::ostream &out) const;
|
||||
void OutputInv(std::ostream &out) const;
|
||||
|
||||
};
|
||||
|
331
contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp
Normal file
331
contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp
Normal file
@ -0,0 +1,331 @@
|
||||
/*
|
||||
* SentenceAlignment.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/01/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include "SentenceAlignment.h"
|
||||
#include "XmlTree.h"
|
||||
#include "tables-core.h"
|
||||
#include "TunnelCollection.h"
|
||||
#include "Lattice.h"
|
||||
#include "LatticeNode.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
extern std::set< std::string > targetLabelCollection, sourceLabelCollection;
|
||||
extern std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;
|
||||
|
||||
SentenceAlignment::SentenceAlignment()
|
||||
:m_tunnelCollection(NULL)
|
||||
,m_lattice(NULL)
|
||||
{}
|
||||
|
||||
SentenceAlignment::~SentenceAlignment()
|
||||
{
|
||||
delete m_tunnelCollection;
|
||||
delete m_lattice;
|
||||
}
|
||||
|
||||
int SentenceAlignment::Create( const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global )
|
||||
{
|
||||
|
||||
// tokenizing English (and potentially extract syntax spans)
|
||||
if (global.targetSyntax) {
|
||||
string targetStringCPP = string(targetString);
|
||||
ProcessAndStripXMLTags( targetStringCPP, targetTree, targetLabelCollection , targetTopLabelCollection );
|
||||
target = tokenize( targetStringCPP.c_str() );
|
||||
// cerr << "E: " << targetStringCPP << endl;
|
||||
}
|
||||
else {
|
||||
target = tokenize( targetString.c_str() );
|
||||
}
|
||||
|
||||
// tokenizing source (and potentially extract syntax spans)
|
||||
if (global.sourceSyntax) {
|
||||
string sourceStringCPP = string(sourceString);
|
||||
ProcessAndStripXMLTags( sourceStringCPP, sourceTree, sourceLabelCollection , sourceTopLabelCollection );
|
||||
source = tokenize( sourceStringCPP.c_str() );
|
||||
// cerr << "F: " << sourceStringCPP << endl;
|
||||
}
|
||||
else {
|
||||
source = tokenize( sourceString.c_str() );
|
||||
}
|
||||
|
||||
// check if sentences are empty
|
||||
if (target.size() == 0 || source.size() == 0) {
|
||||
cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl;
|
||||
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// prepare data structures for alignments
|
||||
for(int i=0; i<source.size(); i++) {
|
||||
alignedCountS.push_back( 0 );
|
||||
}
|
||||
for(int i=0; i<target.size(); i++) {
|
||||
vector< int > dummy;
|
||||
alignedToT.push_back( dummy );
|
||||
}
|
||||
|
||||
//InitTightest(m_s2tTightest, source.size());
|
||||
//InitTightest(m_t2sTightest, target.size());
|
||||
|
||||
|
||||
// reading in alignments
|
||||
vector<string> alignmentSequence = tokenize( alignmentString.c_str() );
|
||||
for(int i=0; i<alignmentSequence.size(); i++) {
|
||||
int s,t;
|
||||
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
||||
if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
|
||||
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
|
||||
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
||||
return 0;
|
||||
}
|
||||
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
|
||||
if (t >= target.size() || s >= source.size()) {
|
||||
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
|
||||
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
||||
return 0;
|
||||
}
|
||||
alignedToT[t].push_back( s );
|
||||
alignedCountS[s]++;
|
||||
|
||||
//SetAlignment(s, t);
|
||||
}
|
||||
|
||||
bool mixed = global.mixed;
|
||||
sourceTree.AddDefaultNonTerms(global.sourceSyntax, mixed, source.size());
|
||||
targetTree.AddDefaultNonTerms(global.targetSyntax, mixed, target.size());
|
||||
|
||||
//CalcTightestSpan(m_s2tTightest);
|
||||
//CalcTightestSpan(m_t2sTightest);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
void SentenceAlignment::InitTightest(Outer &tightest, size_t len)
|
||||
{
|
||||
tightest.resize(len);
|
||||
|
||||
for (size_t posOuter = 0; posOuter < len; ++posOuter)
|
||||
{
|
||||
Inner &inner = tightest[posOuter];
|
||||
size_t innerSize = len - posOuter;
|
||||
inner.resize(innerSize);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void SentenceAlignment::CalcTightestSpan(Outer &tightest)
|
||||
{
|
||||
size_t len = tightest.size();
|
||||
|
||||
for (size_t startPos = 0; startPos < len; ++startPos)
|
||||
{
|
||||
for (size_t endPos = startPos + 1; endPos < len; ++endPos)
|
||||
{
|
||||
const Range &prevRange = GetTightest(tightest, startPos, endPos - 1);
|
||||
const Range &smallRange = GetTightest(tightest, endPos, endPos);
|
||||
Range &newRange = GetTightest(tightest, startPos, endPos);
|
||||
|
||||
newRange.Merge(prevRange, smallRange);
|
||||
//cerr << "[" << startPos << "-" << endPos << "] --> [" << newRange.GetStartPos() << "-" << newRange.GetEndPos() << "]";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Range &SentenceAlignment::GetTightest(Outer &tightest, size_t startPos, size_t endPos)
|
||||
{
|
||||
assert(endPos < tightest.size());
|
||||
assert(endPos >= startPos);
|
||||
|
||||
Inner &inner = tightest[startPos];
|
||||
|
||||
size_t ind = endPos - startPos;
|
||||
Range &ret = inner[ind];
|
||||
return ret;
|
||||
}
|
||||
|
||||
void SentenceAlignment::SetAlignment(size_t source, size_t target)
|
||||
{
|
||||
SetAlignment(m_s2tTightest, source, target);
|
||||
SetAlignment(m_t2sTightest, target, source);
|
||||
}
|
||||
|
||||
void SentenceAlignment::SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos)
|
||||
{
|
||||
|
||||
Range &range = GetTightest(tightest, thisPos, thisPos);
|
||||
if (range.GetStartPos() == NOT_FOUND)
|
||||
{ // not yet set, do them both
|
||||
assert(range.GetEndPos() == NOT_FOUND);
|
||||
range.SetStartPos(thatPos);
|
||||
range.SetEndPos(thatPos);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(range.GetEndPos() != NOT_FOUND);
|
||||
range.SetStartPos( (range.GetStartPos() > thatPos) ? thatPos : range.GetStartPos() );
|
||||
range.SetEndPos( (range.GetEndPos() < thatPos) ? thatPos : range.GetEndPos() );
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
void SentenceAlignment::FindTunnels(const Global &global )
|
||||
{
|
||||
int countT = target.size();
|
||||
int countS = source.size();
|
||||
int maxSpan = max(global.maxHoleSpanSourceDefault, global.maxHoleSpanSourceSyntax);
|
||||
|
||||
m_tunnelCollection = new TunnelCollection(countS);
|
||||
|
||||
m_tunnelCollection->alignedCountS = alignedCountS;
|
||||
m_tunnelCollection->alignedCountT.resize(alignedToT.size());
|
||||
for (size_t ind = 0; ind < alignedToT.size(); ind++)
|
||||
{
|
||||
m_tunnelCollection->alignedCountT[ind] = alignedToT[ind].size();
|
||||
}
|
||||
|
||||
// phrase repository for creating hiero phrases
|
||||
|
||||
// check alignments for target phrase startT...endT
|
||||
for(int lengthT=1;
|
||||
lengthT <= maxSpan && lengthT <= countT;
|
||||
lengthT++) {
|
||||
for(int startT=0; startT < countT-(lengthT-1); startT++) {
|
||||
|
||||
// that's nice to have
|
||||
int endT = startT + lengthT - 1;
|
||||
|
||||
// if there is target side syntax, there has to be a node
|
||||
if (global.targetSyntax && !targetTree.HasNode(startT,endT))
|
||||
continue;
|
||||
|
||||
// find find aligned source words
|
||||
// first: find minimum and maximum source word
|
||||
int minS = 9999;
|
||||
int maxS = -1;
|
||||
vector< int > usedS = alignedCountS;
|
||||
for(int ti=startT;ti<=endT;ti++) {
|
||||
for(int i=0;i<alignedToT[ti].size();i++) {
|
||||
int si = alignedToT[ti][i];
|
||||
// cerr << "point (" << si << ", " << ti << ")\n";
|
||||
if (si<minS) { minS = si; }
|
||||
if (si>maxS) { maxS = si; }
|
||||
usedS[ si ]--;
|
||||
}
|
||||
}
|
||||
|
||||
// unaligned phrases are not allowed
|
||||
if( maxS == -1 )
|
||||
continue;
|
||||
|
||||
// source phrase has to be within limits
|
||||
if( maxS-minS >= maxSpan )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// check if source words are aligned to out of bound target words
|
||||
bool out_of_bounds = false;
|
||||
for(int si=minS;si<=maxS && !out_of_bounds;si++)
|
||||
{
|
||||
if (usedS[si]>0) {
|
||||
out_of_bounds = true;
|
||||
}
|
||||
}
|
||||
|
||||
// if out of bound, you gotta go
|
||||
if (out_of_bounds)
|
||||
continue;
|
||||
|
||||
if (m_tunnelCollection->NumUnalignedWord(1, startT, endT) >= global.maxUnaligned)
|
||||
continue;
|
||||
|
||||
// done with all the checks, lets go over all consistent phrase pairs
|
||||
// start point of source phrase may retreat over unaligned
|
||||
for(int startS=minS;
|
||||
(startS>=0 &&
|
||||
startS>maxS - maxSpan && // within length limit
|
||||
(startS==minS || alignedCountS[startS]==0)); // unaligned
|
||||
startS--)
|
||||
{
|
||||
// end point of source phrase may advance over unaligned
|
||||
for(int endS=maxS;
|
||||
(endS<countS && endS<startS + maxSpan && // within length limit
|
||||
(endS==maxS || alignedCountS[endS]==0)); // unaligned
|
||||
endS++)
|
||||
{
|
||||
if (m_tunnelCollection->NumUnalignedWord(0, startS, endS) >= global.maxUnaligned)
|
||||
continue;
|
||||
|
||||
// take note that this is a valid phrase alignment
|
||||
m_tunnelCollection->Add(startS, endS, startT, endT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//cerr << *tunnelCollection << endl;
|
||||
|
||||
}
|
||||
|
||||
void SentenceAlignment::CreateLattice(const Global &global)
|
||||
{
|
||||
size_t countS = source.size();
|
||||
m_lattice = new Lattice(countS);
|
||||
|
||||
for (size_t startPos = 0; startPos < countS; ++startPos)
|
||||
{
|
||||
//cerr << "creating arcs for " << startPos << "=";
|
||||
m_lattice->CreateArcs(startPos, *m_tunnelCollection, *this, global);
|
||||
|
||||
//cerr << LatticeNode::s_count << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void SentenceAlignment::CreateRules(const Global &global)
|
||||
{
|
||||
size_t countS = source.size();
|
||||
|
||||
for (size_t startPos = 0; startPos < countS; ++startPos)
|
||||
{
|
||||
//cerr << "creating rules for " << startPos << "\n";
|
||||
m_lattice->CreateRules(startPos, *this, global);
|
||||
}
|
||||
}
|
||||
|
||||
void OutputSentenceStr(std::ostream &out, const std::vector<std::string> &vec)
|
||||
{
|
||||
for (size_t pos = 0; pos < vec.size(); ++pos)
|
||||
{
|
||||
out << vec[pos] << " ";
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const SentenceAlignment &obj)
|
||||
{
|
||||
OutputSentenceStr(out, obj.target);
|
||||
out << " ==> ";
|
||||
OutputSentenceStr(out, obj.source);
|
||||
out << endl;
|
||||
|
||||
out << *obj.m_tunnelCollection;
|
||||
|
||||
if (obj.m_lattice)
|
||||
out << endl << *obj.m_lattice;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,69 @@
|
||||
#pragma once
|
||||
/*
|
||||
* SentenceAlignment.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/01/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include "SyntaxTree.h"
|
||||
#include "Global.h"
|
||||
#include "Range.h"
|
||||
|
||||
class TunnelCollection;
|
||||
class Lattice;
|
||||
|
||||
class SentenceAlignment
|
||||
{
|
||||
friend std::ostream& operator<<(std::ostream&, const SentenceAlignment&);
|
||||
|
||||
public:
|
||||
std::vector<std::string> target;
|
||||
std::vector<std::string> source;
|
||||
std::vector<int> alignedCountS;
|
||||
std::vector< std::vector<int> > alignedToT;
|
||||
SyntaxTree sourceTree, targetTree;
|
||||
|
||||
//typedef std::vector<Range> Inner;
|
||||
//typedef std::vector<Inner> Outer;
|
||||
|
||||
//Outer m_s2tTightest, m_t2sTightest;
|
||||
|
||||
SentenceAlignment();
|
||||
~SentenceAlignment();
|
||||
int Create(const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global);
|
||||
// void clear() { delete(alignment); };
|
||||
void FindTunnels( const Global &global ) ;
|
||||
|
||||
void CreateLattice(const Global &global);
|
||||
void CreateRules(const Global &global);
|
||||
|
||||
const TunnelCollection &GetTunnelCollection() const
|
||||
{
|
||||
assert(m_tunnelCollection);
|
||||
return *m_tunnelCollection;
|
||||
}
|
||||
|
||||
const Lattice &GetLattice() const
|
||||
{
|
||||
assert(m_lattice);
|
||||
return *m_lattice;
|
||||
}
|
||||
|
||||
protected:
|
||||
TunnelCollection *m_tunnelCollection;
|
||||
Lattice *m_lattice;
|
||||
|
||||
/*
|
||||
void CalcTightestSpan(Outer &tightest);
|
||||
void InitTightest(Outer &tightest, size_t len);
|
||||
Range &GetTightest(Outer &tightest, size_t startPos, size_t endPos);
|
||||
void SetAlignment(size_t source, size_t target);
|
||||
void SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos);
|
||||
*/
|
||||
};
|
||||
|
101
contrib/other-builds/extract-mixed-syntax/Symbol.cpp
Normal file
101
contrib/other-builds/extract-mixed-syntax/Symbol.cpp
Normal file
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Symbol.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 21/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <cassert>
|
||||
#include "Symbol.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
Symbol::Symbol(const std::string &label, size_t pos)
|
||||
:m_label(label)
|
||||
,m_isTerminal(true)
|
||||
,m_span(2)
|
||||
{
|
||||
m_span[0].first = pos;
|
||||
}
|
||||
|
||||
Symbol::Symbol(const std::string &labelS, const std::string &labelT
|
||||
, size_t startS, size_t endS
|
||||
, size_t startT, size_t endT
|
||||
, bool isSourceSyntax, bool isTargetSyntax)
|
||||
:m_label(labelS)
|
||||
,m_labelT(labelT)
|
||||
,m_isTerminal(false)
|
||||
,m_span(2)
|
||||
,m_isSourceSyntax(isSourceSyntax)
|
||||
,m_isTargetSyntax(isTargetSyntax)
|
||||
{
|
||||
m_span[0] = std::pair<size_t, size_t>(startS, endS);
|
||||
m_span[1] = std::pair<size_t, size_t>(startT, endT);
|
||||
}
|
||||
|
||||
int CompareNonTerm(bool thisIsSyntax, bool otherIsSyntax
|
||||
, const std::pair<size_t, size_t> &thisSpan, const std::pair<size_t, size_t> &otherSpan
|
||||
, std::string thisLabel, std::string otherLabel)
|
||||
{
|
||||
if (thisIsSyntax != otherIsSyntax)
|
||||
{ // 1 is [X] & the other is [NP] on the source
|
||||
return thisIsSyntax ? -1 : +1;
|
||||
}
|
||||
|
||||
assert(thisIsSyntax == otherIsSyntax);
|
||||
if (thisIsSyntax)
|
||||
{ // compare span & label
|
||||
if (thisSpan != otherSpan)
|
||||
return thisSpan < otherSpan ? -1 : +1;
|
||||
if (thisLabel != otherLabel)
|
||||
return thisLabel < otherLabel ? -1 : +1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Symbol::Compare(const Symbol &other) const
|
||||
{
|
||||
if (m_isTerminal != other.m_isTerminal)
|
||||
return m_isTerminal ? -1 : +1;
|
||||
|
||||
assert(m_isTerminal == other.m_isTerminal);
|
||||
if (m_isTerminal)
|
||||
{ // compare labels & pos
|
||||
if (m_span[0].first != other.m_span[0].first)
|
||||
return (m_span[0].first < other.m_span[0].first) ? -1 : +1;
|
||||
|
||||
if (m_label != other.m_label)
|
||||
return (m_label < other.m_label) ? -1 : +1;
|
||||
|
||||
}
|
||||
else
|
||||
{ // non terms
|
||||
int ret = CompareNonTerm(m_isSourceSyntax, other.m_isSourceSyntax
|
||||
,m_span[0], other.m_span[0]
|
||||
,m_label, other.m_label);
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
ret = CompareNonTerm(m_isTargetSyntax, other.m_isTargetSyntax
|
||||
,m_span[1], other.m_span[1]
|
||||
,m_label, other.m_label);
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const Symbol &obj)
|
||||
{
|
||||
if (obj.m_isTerminal)
|
||||
out << obj.m_label;
|
||||
else
|
||||
out << obj.m_label + obj.m_labelT;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
36
contrib/other-builds/extract-mixed-syntax/Symbol.h
Normal file
36
contrib/other-builds/extract-mixed-syntax/Symbol.h
Normal file
@ -0,0 +1,36 @@
|
||||
#pragma once
|
||||
|
||||
/*
|
||||
* Symbol.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 21/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
class Symbol
|
||||
{
|
||||
friend std::ostream& operator<<(std::ostream &out, const Symbol &obj);
|
||||
|
||||
protected:
|
||||
std::string m_label, m_labelT; // m_labelT only for non-term
|
||||
std::vector<std::pair<size_t, size_t> > m_span;
|
||||
|
||||
bool m_isTerminal, m_isSourceSyntax, m_isTargetSyntax;
|
||||
public:
|
||||
// for terminals
|
||||
Symbol(const std::string &label, size_t pos);
|
||||
|
||||
// for non-terminals
|
||||
Symbol(const std::string &labelS, const std::string &labelT
|
||||
, size_t startS, size_t endS
|
||||
, size_t startT, size_t endT
|
||||
, bool isSourceSyntax, bool isTargetSyntax);
|
||||
|
||||
int Compare(const Symbol &other) const;
|
||||
|
||||
};
|
56
contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp
Normal file
56
contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp
Normal file
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* SymbolSequence.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 21/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <cassert>
|
||||
#include <sstream>
|
||||
#include "SymbolSequence.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int SymbolSequence::Compare(const SymbolSequence &other) const
|
||||
{
|
||||
int ret;
|
||||
size_t thisSize = GetSize();
|
||||
size_t otherSize = other.GetSize();
|
||||
if (thisSize != otherSize)
|
||||
{
|
||||
ret = (thisSize < otherSize) ? -1 : +1;
|
||||
return ret;
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(thisSize == otherSize);
|
||||
for (size_t ind = 0; ind < thisSize; ++ind)
|
||||
{
|
||||
const Symbol &thisSymbol = GetSymbol(ind);
|
||||
const Symbol &otherSymbol = other.GetSymbol(ind);
|
||||
ret = thisSymbol.Compare(otherSymbol);
|
||||
if (ret != 0)
|
||||
{
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert(ret == 0);
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj)
|
||||
{
|
||||
SymbolSequence::CollType::const_iterator iterSymbol;
|
||||
for (iterSymbol = obj.m_coll.begin(); iterSymbol != obj.m_coll.end(); ++iterSymbol)
|
||||
{
|
||||
const Symbol &symbol = *iterSymbol;
|
||||
out << symbol << " ";
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
42
contrib/other-builds/extract-mixed-syntax/SymbolSequence.h
Normal file
42
contrib/other-builds/extract-mixed-syntax/SymbolSequence.h
Normal file
@ -0,0 +1,42 @@
|
||||
#pragma once
|
||||
/*
|
||||
* SymbolSequence.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 21/07/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "Symbol.h"
|
||||
|
||||
class SymbolSequence
|
||||
{
|
||||
friend std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj);
|
||||
|
||||
protected:
|
||||
typedef std::vector<Symbol> CollType;
|
||||
CollType m_coll;
|
||||
|
||||
public:
|
||||
typedef CollType::iterator iterator;
|
||||
typedef CollType::const_iterator const_iterator;
|
||||
const_iterator begin() const { return m_coll.begin(); }
|
||||
const_iterator end() const { return m_coll.end(); }
|
||||
|
||||
void Add(const Symbol &symbol)
|
||||
{
|
||||
m_coll.push_back(symbol);
|
||||
}
|
||||
size_t GetSize() const
|
||||
{ return m_coll.size(); }
|
||||
const Symbol &GetSymbol(size_t ind) const
|
||||
{ return m_coll[ind]; }
|
||||
|
||||
void Clear()
|
||||
{ m_coll.clear(); }
|
||||
|
||||
int Compare(const SymbolSequence &other) const;
|
||||
|
||||
};
|
245
contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp
Normal file
245
contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp
Normal file
@ -0,0 +1,245 @@
|
||||
// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
|
||||
// vim:tabstop=2
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2009 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include "SyntaxTree.h"
|
||||
//#include "extract.h"
|
||||
#include "Global.h"
|
||||
|
||||
//extern const Global g_debug;
|
||||
extern const Global *g_global;
|
||||
|
||||
using namespace std;
|
||||
|
||||
bool SyntaxNode::IsSyntax() const
|
||||
{
|
||||
bool ret = GetLabel() != "[X]";
|
||||
return ret;
|
||||
}
|
||||
|
||||
SyntaxTree::SyntaxTree()
|
||||
:m_defaultLHS(0,0, "[X]")
|
||||
{
|
||||
m_emptyNode.clear();
|
||||
}
|
||||
|
||||
SyntaxTree::~SyntaxTree()
|
||||
{
|
||||
// loop through all m_nodes, delete them
|
||||
for(int i=0; i<m_nodes.size(); i++)
|
||||
{
|
||||
delete m_nodes[i];
|
||||
}
|
||||
}
|
||||
|
||||
bool HasDuplicates(const SyntaxNodes &nodes)
|
||||
{
|
||||
string prevLabel;
|
||||
SyntaxNodes::const_iterator iter;
|
||||
for (iter = nodes.begin(); iter != nodes.end(); ++iter)
|
||||
{
|
||||
const SyntaxNode &node = **iter;
|
||||
string label = node.GetLabel();
|
||||
if (label == prevLabel)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
|
||||
{
|
||||
SyntaxNode* newNode = new SyntaxNode( startPos, endPos, "[" + label + "]");
|
||||
m_nodes.push_back( newNode );
|
||||
|
||||
SyntaxNodes &nodesChart = m_index[ startPos ][ endPos ];
|
||||
|
||||
if (!g_global->uppermostOnly)
|
||||
{
|
||||
nodesChart.push_back( newNode );
|
||||
//assert(!HasDuplicates(m_index[ startPos ][ endPos ]));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (nodesChart.size() > 0)
|
||||
{
|
||||
assert(nodesChart.size() == 1);
|
||||
//delete nodes[0];
|
||||
nodesChart.resize(0);
|
||||
}
|
||||
assert(nodesChart.size() == 0);
|
||||
nodesChart.push_back( newNode );
|
||||
}
|
||||
}
|
||||
|
||||
// Decompose every labelled span of length >= 2 into its greedy cover by
// labelled child spans. For each parent span the returned SplitPoints vector
// holds the start position followed by the end positions (exclusive) of each
// child chosen greedily from the left, always taking the longest labelled
// child starting at the current position.
// NOTE(review): if at some position no labelled child span exists at all,
// `covered` never advances and the while loop below does not terminate —
// this presumably relies on AddDefaultNonTerms having filled every span.
ParentNodes SyntaxTree::Parse() {
  ParentNodes parents;

  int size = m_index.size();

  // looping through all spans of size >= 2
  for( int length=2; length<=size; length++ )
  {
    for( int startPos = 0; startPos <= size-length; startPos++ )
    {
      if (HasNode( startPos, startPos+length-1 ))
      {
        // processing one (parent) span
        SplitPoints splitPoints;
        splitPoints.push_back( startPos );

        // `first` excludes the full span itself as its own child on the
        // first iteration only.
        int first = 1;
        int covered = 0;
        while( covered < length )
        {
          // find largest covering subspan (child)
          // starting at last covered position
          for( int midPos=length-first; midPos>covered; midPos-- )
          {
            if( HasNode( startPos+covered, startPos+midPos-1 ) )
            {
              covered = midPos;
              splitPoints.push_back( startPos+covered );
              first = 0;
            }
          }
        }
        parents.push_back( splitPoints );
      }
    }
  }
  return parents;
}
|
||||
|
||||
bool SyntaxTree::HasNode( int startPos, int endPos ) const
|
||||
{
|
||||
return GetNodes( startPos, endPos).size() > 0;
|
||||
}
|
||||
|
||||
// Return the nodes covering the inclusive span [startPos, endPos].
// The index is a two-level map: start position, then end position.
// Spans with no nodes yield the shared empty list m_emptyNode.
const SyntaxNodes &SyntaxTree::GetNodes( int startPos, int endPos ) const
{
  SyntaxTreeIndexIterator row = m_index.find( startPos );
  if (row != m_index.end()) {
    SyntaxTreeIndexIterator2 cell = row->second.find( endPos );
    if (cell != row->second.end())
      return cell->second;
  }
  return m_emptyNode;
}
|
||||
|
||||
// for printing out tree
|
||||
std::string SyntaxTree::ToString() const
|
||||
{
|
||||
std::stringstream out;
|
||||
out << *this;
|
||||
return out.str();
|
||||
}
|
||||
|
||||
// Add a default "X" non-terminal over every span of the phrase.
// NOTE(review): the outer loop runs startPos <= phraseSize while the inner
// loop runs endPos < phraseSize, unlike the addEverywhere overload which
// uses <= for both bounds — confirm this asymmetry is intended and not an
// off-by-one.
void SyntaxTree::AddDefaultNonTerms(size_t phraseSize)
{
  for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
  {
    for (size_t endPos = startPos; endPos < phraseSize; ++endPos)
    {
      AddNode(startPos, endPos, "X");
    }
  }
}
|
||||
|
||||
void SyntaxTree::AddDefaultNonTerms(bool isSyntax, bool mixed, size_t phraseSize)
|
||||
{
|
||||
if (isSyntax)
|
||||
{
|
||||
AddDefaultNonTerms(!mixed, phraseSize);
|
||||
}
|
||||
else
|
||||
{ // add X everywhere
|
||||
AddDefaultNonTerms(phraseSize);
|
||||
}
|
||||
}
|
||||
|
||||
void SyntaxTree::AddDefaultNonTerms(bool addEverywhere, size_t phraseSize)
|
||||
{
|
||||
//cerr << "GetNumWords()=" << GetNumWords() << endl;
|
||||
//assert(phraseSize == GetNumWords() || GetNumWords() == 1); // 1 if syntax sentence doesn't have any xml. TODO fix syntax tree obj
|
||||
|
||||
for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
|
||||
{
|
||||
for (size_t endPos = startPos; endPos <= phraseSize; ++endPos)
|
||||
{
|
||||
const SyntaxNodes &nodes = GetNodes(startPos, endPos);
|
||||
if (!addEverywhere && nodes.size() > 0)
|
||||
{ // only add if no label
|
||||
continue;
|
||||
}
|
||||
AddNode(startPos, endPos, "X");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Nodes usable as a rule's left-hand side for [startPos, endPos].
// Falls back to the default LHS node when the span carries no label.
// Returns by value: the fallback element is not stored in the chart.
const SyntaxNodes SyntaxTree::GetNodesForLHS( int startPos, int endPos ) const
{
  SyntaxNodes result(GetNodes(startPos, endPos));
  if (result.empty())
    result.push_back(&m_defaultLHS);
  return result;
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const SyntaxTree& t)
|
||||
{
|
||||
int size = t.m_index.size();
|
||||
for(size_t length=1; length<=size; length++)
|
||||
{
|
||||
for(size_t space=0; space<length; space++)
|
||||
{
|
||||
os << " ";
|
||||
}
|
||||
for(size_t start=0; start<=size-length; start++)
|
||||
{
|
||||
|
||||
if (t.HasNode( start, start+(length-1) ))
|
||||
{
|
||||
std::string label = t.GetNodes( start, start+(length-1) )[0]->GetLabel() + "#######";
|
||||
|
||||
os << label.substr(0,7) << " ";
|
||||
}
|
||||
else
|
||||
{
|
||||
os << "------- ";
|
||||
}
|
||||
}
|
||||
os << std::endl;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
|
96
contrib/other-builds/extract-mixed-syntax/SyntaxTree.h
Normal file
96
contrib/other-builds/extract-mixed-syntax/SyntaxTree.h
Normal file
@ -0,0 +1,96 @@
|
||||
#pragma once
|
||||
|
||||
// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $
|
||||
// vim:tabstop=2
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2009 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
|
||||
class SyntaxNode;
|
||||
|
||||
typedef std::vector<const SyntaxNode*> SyntaxNodes;
|
||||
|
||||
// A single labelled node in a syntax chart, covering the inclusive word
// span [m_start, m_end].
class SyntaxNode {
protected:
  int m_start, m_end;      // inclusive word positions of the span
  std::string m_label;     // label, stored bracketed as "[label]" by SyntaxTree::AddNode
  SyntaxNodes m_children;  // NOTE(review): never populated in the visible code
  SyntaxNode* m_parent;    // NOTE(review): never initialized or assigned here — confirm before use
public:
  // Construct a node for [startPos, endPos] with the given (pre-bracketed) label.
  SyntaxNode( int startPos, int endPos, const std::string &label)
    :m_start(startPos)
    ,m_end(endPos)
    ,m_label(label)
  {}
  // First word position covered (inclusive).
  int GetStart() const
  { return m_start; }
  // Last word position covered (inclusive).
  int GetEnd() const
  { return m_end; }
  // The node's label, including surrounding brackets.
  const std::string &GetLabel() const
  { return m_label; }
  // Defined elsewhere; semantics not visible in this file.
  bool IsSyntax() const;
};
|
||||
|
||||
|
||||
typedef std::vector< int > SplitPoints;
|
||||
typedef std::vector< SplitPoints > ParentNodes;
|
||||
|
||||
// Chart of syntactic labels over spans of one sentence. Nodes are owned by
// m_nodes; m_index provides span-based lookup with non-owning pointers.
class SyntaxTree {
protected:
  SyntaxNodes m_nodes;      // owns every node created via AddNode
  SyntaxNode* m_top;        // NOTE(review): never assigned in the visible code
  SyntaxNode m_defaultLHS;  // fallback node returned by GetNodesForLHS

  // span index: start position -> (end position -> nodes over [start, end])
  typedef std::map< int, SyntaxNodes > SyntaxTreeIndex2;
  typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
  typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
  typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
  SyntaxTreeIndex m_index;
  SyntaxNodes m_emptyNode;  // shared empty list returned for uncovered spans

  friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);

public:
  SyntaxTree();
  ~SyntaxTree();

  // Add a node labelled "[label]" over the inclusive span [startPos, endPos].
  void AddNode( int startPos, int endPos, std::string label );
  // Greedy decomposition of every labelled span (length >= 2) into children.
  ParentNodes Parse();
  // True if any node covers [startPos, endPos].
  bool HasNode( int startPos, int endPos ) const;
  // Nodes covering [startPos, endPos]; shared empty list if none.
  const SyntaxNodes &GetNodes( int startPos, int endPos ) const;
  const SyntaxNodes &GetAllNodes() const { return m_nodes; } ;
  // Number of distinct start positions in the index.
  // NOTE(review): equals the word count only if every position starts a span.
  size_t GetNumWords() const { return m_index.size(); }
  // Pretty-print the chart (delegates to operator<<).
  std::string ToString() const;

  // Dispatch: syntactic input fills unlabelled spans, otherwise X everywhere.
  void AddDefaultNonTerms(bool isSyntax, bool addEverywhere, size_t phraseSize);
  // Fill spans with "X"; skips already-labelled spans unless first arg is true.
  void AddDefaultNonTerms(bool mixed, size_t phraseSize);

  // Add "X" over spans unconditionally (note asymmetric bounds in the .cpp).
  void AddDefaultNonTerms(size_t phraseSize);

  // LHS candidates for a span; falls back to m_defaultLHS when unlabelled.
  const SyntaxNodes GetNodesForLHS( int startPos, int endPos ) const;

};
|
||||
|
||||
std::ostream& operator<<(std::ostream&, const SyntaxTree&);
|
||||
|
38
contrib/other-builds/extract-mixed-syntax/Tunnel.cpp
Normal file
38
contrib/other-builds/extract-mixed-syntax/Tunnel.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Tunnel.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/01/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "Tunnel.h"
|
||||
|
||||
|
||||
int Tunnel::Compare(const Tunnel &other) const
|
||||
{
|
||||
int ret = m_sourceRange.Compare(other.m_sourceRange);
|
||||
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
ret = m_targetRange.Compare(other.m_targetRange);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int Tunnel::Compare(const Tunnel &other, size_t direction) const
|
||||
{
|
||||
const Range &thisRange = (direction == 0) ? m_sourceRange : m_targetRange;
|
||||
const Range &otherRange = (direction == 0) ? other.m_sourceRange : other.m_targetRange;
|
||||
|
||||
int ret = thisRange.Compare(otherRange);
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const Tunnel &tunnel)
|
||||
{
|
||||
out << tunnel.m_sourceRange << "==>" << tunnel.m_targetRange;
|
||||
return out;
|
||||
}
|
49
contrib/other-builds/extract-mixed-syntax/Tunnel.h
Normal file
49
contrib/other-builds/extract-mixed-syntax/Tunnel.h
Normal file
@ -0,0 +1,49 @@
|
||||
#pragma once
|
||||
|
||||
/*
|
||||
* Tunnel.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/01/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include "Range.h"
|
||||
|
||||
// for unaligned source terminal
|
||||
|
||||
// A phrase-pair "tunnel": the aligned source and target word spans of an
// extracted phrase pair, usable as a gap (non-terminal hole) inside a
// larger rule.
class Tunnel
{
  friend std::ostream& operator<<(std::ostream&, const Tunnel&);

protected:

  Range m_sourceRange, m_targetRange;

public:
  // Ranges are left default-initialized (see Range for their defaults).
  Tunnel()
  {}

  Tunnel(const Tunnel &copy)
    :m_sourceRange(copy.m_sourceRange)
    ,m_targetRange(copy.m_targetRange)
  {}

  Tunnel(const Range &sourceRange, const Range &targetRange)
    :m_sourceRange(sourceRange)
    ,m_targetRange(targetRange)
  {}

  // direction 0 = source range, any other value = target range
  const Range &GetRange(size_t direction) const
  { return (direction == 0) ? m_sourceRange : m_targetRange; }

  // Three-way comparison: source range first, target range as tie-break.
  int Compare(const Tunnel &other) const;
  // Three-way comparison on one side only (0 = source, else target).
  int Compare(const Tunnel &other, size_t direction) const;
};
|
||||
|
||||
typedef std::vector<Tunnel> TunnelList;
|
||||
|
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* TunnelCollection.cpp
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/01/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "TunnelCollection.h"
|
||||
#include "Range.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
size_t TunnelCollection::NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const
|
||||
{
|
||||
assert(startPos <= endPos);
|
||||
|
||||
if (direction == 0)
|
||||
assert(endPos < alignedCountS.size());
|
||||
else
|
||||
assert(endPos < alignedCountT.size());
|
||||
|
||||
size_t ret = 0;
|
||||
for (size_t ind = startPos; ind <= endPos; ++ind)
|
||||
{
|
||||
if (direction == 0 && alignedCountS[ind] == 0)
|
||||
{
|
||||
ret++;
|
||||
}
|
||||
else if (direction == 1 && alignedCountT[ind] == 0)
|
||||
{
|
||||
ret++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void TunnelCollection::Add(int startS, int endS, int startT, int endT)
|
||||
{
|
||||
// m_phraseExist[startS][endS - startS].push_back(Tunnel(startT, endT));
|
||||
m_coll[startS][endS - startS].push_back(Tunnel(Range(startS, endS), Range(startT, endT)));
|
||||
}
|
||||
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const TunnelCollection &TunnelCollection)
|
||||
{
|
||||
size_t size = TunnelCollection.GetSize();
|
||||
|
||||
for (size_t startPos = 0; startPos < size; ++startPos)
|
||||
{
|
||||
for (size_t endPos = startPos; endPos < size; ++endPos)
|
||||
{
|
||||
const TunnelList &tunnelList = TunnelCollection.GetTunnels(startPos, endPos);
|
||||
TunnelList::const_iterator iter;
|
||||
for (iter = tunnelList.begin(); iter != tunnelList.end(); ++iter)
|
||||
{
|
||||
const Tunnel &tunnel = *iter;
|
||||
out << tunnel << " ";
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
61
contrib/other-builds/extract-mixed-syntax/TunnelCollection.h
Normal file
61
contrib/other-builds/extract-mixed-syntax/TunnelCollection.h
Normal file
@ -0,0 +1,61 @@
|
||||
#pragma once
|
||||
/*
|
||||
* TunnelCollection.h
|
||||
* extract
|
||||
*
|
||||
* Created by Hieu Hoang on 19/01/2010.
|
||||
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
#include <vector>
|
||||
#include "Tunnel.h"
|
||||
|
||||
// repository of extracted phrase pairs
// which are potential tunnels in larger phrase pairs
class TunnelCollection
{
  friend std::ostream& operator<<(std::ostream&, const TunnelCollection&);

protected:
  std::vector< std::vector<TunnelList> > m_coll;
  // indexed by source pos. and source length
  // maps to list of tunnels where <int, int> are target pos

public:
  // Per-word alignment-point counts for source (S) and target (T) side;
  // filled externally, read by NumUnalignedWord.
  std::vector<int> alignedCountS, alignedCountT;

  // Copy constructor declared but defined elsewhere (not visible here).
  TunnelCollection(const TunnelCollection &);

  // Build an empty triangular chart for a source sentence of `size` words.
  TunnelCollection(size_t size)
    :m_coll(size)
  {
    // size is the length of the source sentence
    for (size_t pos = 0; pos < size; ++pos)
    {
      // create empty tunnel lists
      std::vector<TunnelList> &endVec = m_coll[pos];
      endVec.resize(size - pos);
    }
  }

  // Record a phrase pair spanning [startS,endS] / [startT,endT].
  void Add(int startS, int endS, int startT, int endT);

  //const TunnelList &GetTargetHoles(int startS, int endS) const
  //{
  //	const TunnelList &targetHoles = m_phraseExist[startS][endS - startS];
  //	return targetHoles;
  //}
  // Tunnels whose source side is exactly [startS, endS] (inclusive).
  const TunnelList &GetTunnels(int startS, int endS) const
  {
    const TunnelList &sourceHoles = m_coll[startS][endS - startS];
    return sourceHoles;
  }

  // Source sentence length the collection was built for.
  // NOTE(review): `const` on a by-value return has no effect.
  const size_t GetSize() const
  { return m_coll.size(); }

  // Count words with no alignment points in [startPos, endPos];
  // direction 0 = source side, 1 = target side.
  size_t NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const;


};
|
||||
|
344
contrib/other-builds/extract-mixed-syntax/XmlTree.cpp
Normal file
344
contrib/other-builds/extract-mixed-syntax/XmlTree.cpp
Normal file
@ -0,0 +1,344 @@
|
||||
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
|
||||
// vim:tabstop=2
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
// Split str into tokens separated by any character in delimiters.
// Consecutive delimiters produce no empty tokens; an all-delimiter or
// empty input yields an empty vector.
inline std::vector<std::string> Tokenize(const std::string& str,
                                         const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;
  // Start of the first token (skipping any leading delimiters) and the
  // position of the delimiter that ends it.
  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
  std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);

  while (std::string::npos != tokenEnd || std::string::npos != tokenStart)
  {
    tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
    // Advance past the delimiter run to the next token, if any.
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
    tokenEnd = str.find_first_of(delimiters, tokenStart);
  }

  return tokens;
}
|
||||
|
||||
// Strip leading and trailing dropChars from str and return the result;
// str itself is left unchanged. An input consisting entirely of dropChars
// yields "" (find_last_not_of returns npos, and npos+1 == 0 empties it).
// Fix: dropChars is now passed by const reference — the original took it
// by value, copying the string on every call; also erase now consistently
// operates on `res` rather than mixing `str` and `res`.
const std::string Trim(const std::string& str, const std::string& dropChars = " \t\n\r")
{
  std::string res = str;
  // remove trailing drop characters
  res.erase(res.find_last_not_of(dropChars) + 1);
  // remove leading drop characters
  return res.erase(0, res.find_first_not_of(dropChars));
}
|
||||
|
||||
// Extract the value of attributeName="..." from the body of an XML tag.
// Returns "" when the attribute is absent or the closing quote is missing.
std::string ParseXmlTagAttribute(const std::string& tag, const std::string& attributeName)
{
  /*TODO deal with unescaping \"*/
  const std::string opening = attributeName + "=\"";
  std::size_t valueStart = tag.find(opening);
  if (valueStart == std::string::npos)
    return "";
  valueStart += opening.size();

  std::size_t valueEnd = tag.find_first_of('"', valueStart + 1);
  if (valueEnd == std::string::npos) {
    std::cerr << "Malformed XML attribute: " << tag;
    return "";
  }
  // Step over escaped quotes (\") until the real closing quote is found.
  std::size_t nextQuote;
  while (tag.at(valueEnd - 1) == '\\' &&
         (nextQuote = tag.find_first_of('"', valueEnd + 1)) != std::string::npos) {
    valueEnd = nextQuote;
  }
  return tag.substr(valueStart, valueEnd - valueStart);
}
|
||||
|
||||
/**
 * Remove "<" and ">" from XML tag
 *
 * \param str xml token to be stripped
 */
std::string TrimXml(const std::string& str)
{
  // too short to be xml token -> do nothing
  if (str.size() < 2)
    return str;

  const bool wrapped = str[0] == '<' && str[str.size() - 1] == '>';
  if (!wrapped)
    return str; // not an xml token -> do nothing

  // strip first and last character
  return str.substr(1, str.size() - 2);
}
|
||||
|
||||
/**
 * Check if the token is an XML tag, i.e. starts with "<"
 *
 * \param tag token to be checked
 */
bool isXmlTag(const std::string& tag)
{
  // Fix: guard against empty tokens — indexing tag[0] on an empty
  // std::string is undefined behavior.
  return !tag.empty() && tag[0] == '<';
}
|
||||
|
||||
/**
 * Split up the input character string into tokens made up of
 * either XML tags or text.
 * example: this <b> is a </b> test .
 * => (this ), (<b>), ( is a ), (</b>), ( test .)
 *
 * \param str input string
 */
inline std::vector<std::string> TokenizeXml(const std::string& str)
{
  std::vector<std::string> tokens; // vector of tokens to be returned
  std::string::size_type cur = 0;  // current position in string

  // walk through the string
  while (cur != str.size())
  {
    // find the next opening "<" of an xml tag
    const std::string::size_type open = str.find_first_of('<', cur);
    if (open == std::string::npos)
    {
      // no more tags found: the remainder is one text token
      tokens.push_back(str.substr(cur));
      break;
    }

    // find the end of the xml tag
    const std::string::size_type close = str.find_first_of('>', open);
    // sanity check: there has to be closing ">"
    if (close == std::string::npos)
    {
      std::cerr << "ERROR: malformed XML: " << str << std::endl;
      return tokens;
    }

    // add stuff before xml tag as token, if there is any
    if (open > cur)
      tokens.push_back(str.substr(cur, open - cur));

    // add xml tag as token, including the angle brackets
    tokens.push_back(str.substr(open, close - open + 1));
    cur = close + 1;
  }
  return tokens;
}
|
||||
|
||||
/**
 * Process a sentence with xml annotation
 * Xml tags may specifiy additional/replacing translation options
 * and reordering constraints
 *
 * \param line in: sentence, out: sentence without the xml
 * \param tree out: chart of labelled spans built from the tags
 * \param labelCollection out: every "label" attribute value seen
 * \param topLabelCollection out: histogram of labels covering the whole sentence
 * \return true on success; false (with a message on cerr) on malformed input,
 *         in which case `line` is left unmodified
 */
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
  is so we can link things up afterwards. We can't create TranslationOptions as we
  parse because we don't have the completed source parsed until after this function
  removes all the markup from it (CreateFromString in Sentence::Read).
*/
bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) {
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) { return true; }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as tripled (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)
  bool isLinked = false; // NOTE(review): set here but never used in this function

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
  {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos]))
    {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ')
      {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else
    {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
      // cerr << "XML TAG IS: " << tag << std::endl;

      if (tag.size() == 0)
      {
        cerr << "ERROR: empty tag name: " << line << endl;
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary)
      {
        cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary)
      {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
      }

      // *** process completed tag ***

      if (isClosed || isUnary)
      {
        // pop last opened tag from stack;
        if (tagStack.size() == 0)
        {
          cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName)
        {
          cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second; // shadows the outer tagContent on purpose: this is the OPENING tag's content
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty())
        {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;

        if (startPos >= endPos)
        {
          cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
          return false;
        }

        string label = ParseXmlTagAttribute(tagContent,"label");
        labelCollection.insert( label );

        // report what we have processed so far
        if (0) {
          cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
          cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
          cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
        }
        // the tree stores inclusive end positions, hence endPos-1
        tree.AddNode( startPos, endPos-1, label );
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0)
  {
    cerr << "ERROR: some opened tags were never closed: " << line << endl;
    return false;
  }

  // collect top labels: labels of nodes spanning the whole sentence
  const SyntaxNodes &topNodes = tree.GetNodes( 0, wordPos-1 );
  for( SyntaxNodes::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ )
  {
    const SyntaxNode *n = *node;
    const string &label = n->GetLabel();
    if (topLabelCollection.find( label ) == topLabelCollection.end())
      topLabelCollection[ label ] = 0;
    topLabelCollection[ label ]++;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
|
35
contrib/other-builds/extract-mixed-syntax/XmlTree.h
Normal file
35
contrib/other-builds/extract-mixed-syntax/XmlTree.h
Normal file
@ -0,0 +1,35 @@
|
||||
#pragma once
|
||||
|
||||
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
|
||||
// vim:tabstop=2
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include "SyntaxTree.h"
|
||||
|
||||
// Extract the value of attributeName="..." from an XML tag body, or "" if absent.
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
// Strip a leading "<" and trailing ">" from an XML token; other input unchanged.
std::string TrimXml(const std::string& str);
// True if the token starts with "<".
bool isXmlTag(const std::string& tag);
// Split a sentence into alternating text and XML-tag tokens.
// NOTE(review): declared `inline` here while defined in the .cpp — an inline
// function must be defined in every translation unit that uses it; confirm no
// caller outside XmlTree.cpp relies on this declaration.
inline std::vector<std::string> TokenizeXml(const std::string& str);
// Parse XML markup in `line`, record labelled spans in `tree` and labels in
// the collections, then overwrite `line` with the de-XML-ed sentence.
// Returns false on malformed markup.
bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection );
|
310
contrib/other-builds/extract-mixed-syntax/extract.cpp
Normal file
310
contrib/other-builds/extract-mixed-syntax/extract.cpp
Normal file
@ -0,0 +1,310 @@
|
||||
// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $
|
||||
// vim:tabstop=2
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2009 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include <cstdio>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include "extract.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
#include "Lattice.h"
|
||||
|
||||
#ifdef WIN32
|
||||
// Include Visual Leak Detector
|
||||
#include <vld.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
|
||||
void writeGlueGrammar(const string &, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "Extract v2.0, written by Philipp Koehn\n"
|
||||
<< "rule extraction from an aligned parallel corpus\n";
|
||||
//time_t starttime = time(NULL);
|
||||
|
||||
Global *global = new Global();
|
||||
g_global = global;
|
||||
int sentenceOffset = 0;
|
||||
|
||||
if (argc < 5) {
|
||||
cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
|
||||
<< " [ --Hierarchical | --Orientation"
|
||||
<< " | --GlueGrammar FILE | --UnknownWordLabel FILE"
|
||||
<< " | --OnlyDirect"
|
||||
|
||||
<< " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
|
||||
<< " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
|
||||
<< " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
|
||||
<< " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"
|
||||
|
||||
<< " | --MaxSymbols[" << global->maxSymbols<< "]"
|
||||
<< " | --MaxNonTerm[" << global->maxNonTerm << "]"
|
||||
<< " | --SourceSyntax | --TargetSyntax"
|
||||
<< " | --UppermostOnly[" << g_global->uppermostOnly << "]"
|
||||
<< endl;
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameT = argv[1];
|
||||
char* &fileNameS = argv[2];
|
||||
char* &fileNameA = argv[3];
|
||||
string fileNameGlueGrammar;
|
||||
string fileNameUnknownWordLabel;
|
||||
string fileNameExtract = string(argv[4]);
|
||||
|
||||
int optionInd = 5;
|
||||
|
||||
for(int i=optionInd;i<argc;i++)
|
||||
{
|
||||
if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
|
||||
global->minHoleSpanSourceDefault = atoi(argv[++i]);
|
||||
if (global->minHoleSpanSourceDefault < 1) {
|
||||
cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
|
||||
global->maxHoleSpanSourceDefault = atoi(argv[++i]);
|
||||
if (global->maxHoleSpanSourceDefault < 1) {
|
||||
cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
|
||||
global->minHoleSpanSourceSyntax = atoi(argv[++i]);
|
||||
if (global->minHoleSpanSourceSyntax < 1) {
|
||||
cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else if (strcmp(argv[i],"--UppermostOnly") == 0) {
|
||||
global->uppermostOnly = atoi(argv[++i]);
|
||||
}
|
||||
else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
|
||||
global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
|
||||
if (global->maxHoleSpanSourceSyntax < 1) {
|
||||
cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// maximum number of words in hierarchical phrase
|
||||
else if (strcmp(argv[i],"--maxSymbols") == 0) {
|
||||
global->maxSymbols = atoi(argv[++i]);
|
||||
if (global->maxSymbols < 1) {
|
||||
cerr << "extract error: --maxSymbols should be at least 1" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
// maximum number of non-terminals
|
||||
else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
|
||||
global->maxNonTerm = atoi(argv[++i]);
|
||||
if (global->maxNonTerm < 1) {
|
||||
cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
// allow consecutive non-terminals (X Y | X Y)
|
||||
else if (strcmp(argv[i],"--TargetSyntax") == 0) {
|
||||
global->targetSyntax = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--SourceSyntax") == 0) {
|
||||
global->sourceSyntax = true;
|
||||
}
|
||||
// do not create many part00xx files!
|
||||
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
|
||||
// now default
|
||||
}
|
||||
else if (strcmp(argv[i],"--GlueGrammar") == 0) {
|
||||
global->glueGrammarFlag = true;
|
||||
if (++i >= argc)
|
||||
{
|
||||
cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
|
||||
exit(0);
|
||||
}
|
||||
fileNameGlueGrammar = string(argv[i]);
|
||||
cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
|
||||
}
|
||||
else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
|
||||
global->unknownWordLabelFlag = true;
|
||||
if (++i >= argc)
|
||||
{
|
||||
cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
|
||||
exit(0);
|
||||
}
|
||||
fileNameUnknownWordLabel = string(argv[i]);
|
||||
cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
|
||||
}
|
||||
// TODO: this should be a useful option
|
||||
//else if (strcmp(argv[i],"--ZipFiles") == 0) {
|
||||
// zipFiles = true;
|
||||
//}
|
||||
// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
|
||||
else if (strcmp(argv[i],"--Mixed") == 0) {
|
||||
global->mixed = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
|
||||
global->allowDefaultNonTermEdge = atoi(argv[++i]);
|
||||
}
|
||||
else if (strcmp(argv[i], "--GZOutput") == 0) {
|
||||
global->gzOutput = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--MaxSpan") == 0) {
|
||||
// ignore
|
||||
++i;
|
||||
}
|
||||
else if (strcmp(argv[i],"--SentenceOffset") == 0) {
|
||||
if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
|
||||
cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
|
||||
exit(1);
|
||||
}
|
||||
sentenceOffset = atoi(argv[++i]);
|
||||
}
|
||||
else {
|
||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// open input files
|
||||
Moses::InputFileStream tFile(fileNameT);
|
||||
Moses::InputFileStream sFile(fileNameS);
|
||||
Moses::InputFileStream aFile(fileNameA);
|
||||
|
||||
// open output files
|
||||
string fileNameExtractInv = fileNameExtract + ".inv";
|
||||
if (global->gzOutput) {
|
||||
fileNameExtract += ".gz";
|
||||
fileNameExtractInv += ".gz";
|
||||
}
|
||||
|
||||
Moses::OutputFileStream extractFile;
|
||||
Moses::OutputFileStream extractFileInv;
|
||||
extractFile.Open(fileNameExtract.c_str());
|
||||
extractFileInv.Open(fileNameExtractInv.c_str());
|
||||
|
||||
|
||||
// loop through all sentence pairs
|
||||
int i = sentenceOffset;
|
||||
while(true) {
|
||||
i++;
|
||||
|
||||
if (i % 1000 == 0) {
|
||||
cerr << i << " " << flush;
|
||||
}
|
||||
|
||||
string targetString;
|
||||
string sourceString;
|
||||
string alignmentString;
|
||||
|
||||
bool ok = getline(tFile, targetString);
|
||||
if (!ok)
|
||||
break;
|
||||
getline(sFile, sourceString);
|
||||
getline(aFile, alignmentString);
|
||||
|
||||
//cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;
|
||||
|
||||
//time_t currTime = time(NULL);
|
||||
//cerr << "A " << (currTime - starttime) << endl;
|
||||
|
||||
SentenceAlignment sentencePair;
|
||||
if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global ))
|
||||
{
|
||||
//cerr << sentence.sourceTree << endl;
|
||||
//cerr << sentence.targetTree << endl;
|
||||
|
||||
sentencePair.FindTunnels(*g_global);
|
||||
//cerr << "C " << (time(NULL) - starttime) << endl;
|
||||
//cerr << sentencePair << endl;
|
||||
|
||||
sentencePair.CreateLattice(*g_global);
|
||||
//cerr << "D " << (time(NULL) - starttime) << endl;
|
||||
//cerr << sentencePair << endl;
|
||||
|
||||
sentencePair.CreateRules(*g_global);
|
||||
//cerr << "E " << (time(NULL) - starttime) << endl;
|
||||
|
||||
//cerr << sentence.lattice->GetRules().GetSize() << endl;
|
||||
sentencePair.GetLattice().GetRules().Output(extractFile);
|
||||
sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
|
||||
}
|
||||
}
|
||||
|
||||
tFile.Close();
|
||||
sFile.Close();
|
||||
aFile.Close();
|
||||
|
||||
extractFile.Close();
|
||||
extractFileInv.Close();
|
||||
|
||||
if (global->glueGrammarFlag) {
|
||||
writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
|
||||
}
|
||||
|
||||
delete global;
|
||||
}
|
||||
|
||||
|
||||
void writeGlueGrammar( const string & fileName, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
|
||||
{
|
||||
ofstream grammarFile;
|
||||
grammarFile.open(fileName.c_str());
|
||||
if (!options.targetSyntax) {
|
||||
grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
|
||||
<< "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
|
||||
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
|
||||
} else {
|
||||
// chose a top label that is not already a label
|
||||
string topLabel = "QQQQQQ";
|
||||
for( unsigned int i=1; i<=topLabel.length(); i++) {
|
||||
if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
|
||||
topLabel = topLabel.substr(0,i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// basic rules
|
||||
grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << endl
|
||||
<< "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << endl;
|
||||
|
||||
// top rules
|
||||
for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
|
||||
i != targetTopLabelCollection.end(); i++ ) {
|
||||
grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << endl;
|
||||
}
|
||||
|
||||
// glue rules
|
||||
for( set<string>::const_iterator i = targetLabelCollection.begin();
|
||||
i != targetLabelCollection.end(); i++ ) {
|
||||
grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << endl;
|
||||
}
|
||||
grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << endl; // glue rule for unknown word...
|
||||
}
|
||||
grammarFile.close();
|
||||
}
|
||||
|
34
contrib/other-builds/extract-mixed-syntax/extract.h
Normal file
34
contrib/other-builds/extract-mixed-syntax/extract.h
Normal file
@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include "SyntaxTree.h"
|
||||
#include "XmlTree.h"
|
||||
#include "Tunnel.h"
|
||||
#include "TunnelCollection.h"
|
||||
#include "SentenceAlignment.h"
|
||||
#include "Global.h"
|
||||
|
||||
std::vector<std::string> tokenize( const char [] );
|
||||
|
||||
// Read one line of up to _SIZE-1 chars from stream _IS into buffer _LINE,
// stopping at _DELIM. A short-line fail state is cleared so reading can
// continue, but if the line filled the entire buffer (i.e. was truncated)
// the program aborts with an error message.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
    _IS.getline(_LINE, _SIZE, _DELIM); \
    if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
    if (_IS.gcount() == _SIZE-1) { \
      cerr << "Line too long! Buffer overflow. Delete lines >=" \
        << _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
        << endl; \
      exit(1); \
    } \
  }
// Buffer size used with SAFE_GETLINE.
#define LINE_MAX_LENGTH 1000000

// Non-owning pointer to the shared extraction options; this chunk only
// shows it being read (FindTunnels/CreateLattice/CreateRules calls) --
// presumably assigned during option parsing, TODO confirm.
// NOTE(review): these are variable *definitions* in a header; including
// this header from more than one translation unit would violate the
// one-definition rule.
const Global *g_global;

// Labels observed on the target/source side, and counts of top-level
// labels (the target-side collections are consumed by writeGlueGrammar).
std::set< std::string > targetLabelCollection, sourceLabelCollection;
std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;
|
81
contrib/other-builds/extract-mixed-syntax/gzfilebuf.h
Normal file
81
contrib/other-builds/extract-mixed-syntax/gzfilebuf.h
Normal file
@ -0,0 +1,81 @@
|
||||
#ifndef moses_gzfile_buf_h
|
||||
#define moses_gzfile_buf_h
|
||||
|
||||
#include <streambuf>
|
||||
#include <zlib.h>
|
||||
#include <cstring>
|
||||
|
||||
/** Read-only std::streambuf over a gzip-compressed file (via zlib).
 *
 *  Output and seeking are unsupported: overflow/xsputn/seekpos execute a
 *  bare `throw;` with no active exception, which calls std::terminate.
 *  The first sizeof(int) bytes of _buff are reserved as a putback area.
 */
class gzfilebuf : public std::streambuf {
public:
  gzfilebuf(const char *filename)
  { _gzf = gzopen(filename, "rb");
    // Start with an empty get area (read position == end position) so the
    // first read triggers underflow().
    setg (_buff+sizeof(int), // beginning of putback area
          _buff+sizeof(int), // read position
          _buff+sizeof(int)); // end position
  }
  ~gzfilebuf() { gzclose(_gzf); }
protected:
  // Writing is not supported; `throw;` outside a handler terminates.
  virtual int_type overflow (int_type c) {
    throw;
  }

  // write multiple characters -- not supported (see overflow)
  virtual
  std::streamsize xsputn (const char* s,
                          std::streamsize num) {
    throw;
  }

  // Seeking is not supported either.
  virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw;
  }

  //read one character
  virtual int_type underflow () {
    // is read position before end of _buff?
    if (gptr() < egptr()) {
      return traits_type::to_int_type(*gptr());
    }

    /* process size of putback area
     * - use number of characters read
     * - but at most four (sizeof(int))
     */
    unsigned int numPutback = gptr() - eback();
    if (numPutback > sizeof(int)) {
      numPutback = sizeof(int);
    }

    /* copy up to four characters previously read into
     * the putback _buff (area of first four characters)
     */
    std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
                  numPutback);

    // read new characters from the compressed stream
    int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
    if (num <= 0) {
      // ERROR or EOF
      return EOF;
    }

    // reset _buff pointers
    setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
          _buff+sizeof(int), // read position
          _buff+sizeof(int)+num); // end of buffer

    // return next character
    return traits_type::to_int_type(*gptr());
  }

  // Bulk read: bypasses the buffer and reads straight from the gz stream.
  // (Declared without `virtual`, but std::streambuf::xsgetn is virtual, so
  // this still overrides it. It does not consult the putback area.)
  std::streamsize xsgetn (char* s,
                          std::streamsize num) {
    return gzread(_gzf,s,num);
  }

private:
  gzFile _gzf; // zlib handle: opened in the ctor, closed in the dtor
  static const unsigned int _buffsize = 1024;
  char _buff[_buffsize]; // putback area (sizeof(int) bytes) + read buffer
};
|
||||
|
||||
#endif
|
110
contrib/other-builds/extract-mixed-syntax/tables-core.cpp
Normal file
110
contrib/other-builds/extract-mixed-syntax/tables-core.cpp
Normal file
@ -0,0 +1,110 @@
|
||||
// $Id: tables-core.cpp 3131 2010-04-13 16:29:55Z pjwilliams $
|
||||
//#include "beammain.h"
|
||||
//#include "SafeGetLine.h"
|
||||
#include "tables-core.h"
|
||||
|
||||
#define TABLE_LINE_MAX_LENGTH 1000
|
||||
#define UNKNOWNSTR "UNK"
|
||||
|
||||
// as in beamdecoder/tables.cpp
|
||||
// Split a C string into tokens delimited by runs of spaces and/or tabs.
// Leading/trailing/repeated separators produce no empty tokens.
vector<string> tokenize( const char* input ) {
  vector<string> tokens;
  bool inToken = false;
  int tokenStart = 0;
  int pos = 0;
  for (; input[pos] != '\0'; ++pos) {
    const bool isSep = (input[pos] == ' ' || input[pos] == '\t');
    if (inToken) {
      if (isSep) {
        // separator ends the current token
        tokens.push_back( string( input + tokenStart, pos - tokenStart ) );
        inToken = false;
      }
    } else if (!isSep) {
      // first character of a new token
      tokenStart = pos;
      inToken = true;
    }
  }
  // flush a token that runs up to the end of the string
  if (inToken)
    tokens.push_back( string( input + tokenStart, pos - tokenStart ) );
  return tokens;
}
|
||||
|
||||
// Return the id of `word`, first adding it to the vocabulary if unseen.
// Ids are dense: the n-th distinct word gets id n-1.
WORD_ID Vocabulary::storeIfNew( const WORD& word ) {
  map<WORD, WORD_ID>::iterator found = lookup.find( word );
  if (found != lookup.end())
    return found->second;

  const WORD_ID newId = vocab.size();
  vocab.push_back( word );
  lookup[ word ] = newId;
  return newId;
}
|
||||
|
||||
// Look up the id of `word`; returns 0 when the word is unknown.
// NOTE(review): 0 is also the id of the first stored word, so callers
// cannot distinguish "unknown" from the first vocabulary entry.
WORD_ID Vocabulary::getWordID( const WORD& word ) {
  map<WORD, WORD_ID>::iterator found = lookup.find( word );
  return (found == lookup.end()) ? 0 : found->second;
}
|
||||
|
||||
// Return the id of `phrase`, first storing it if it has not been seen.
// Ids are dense indices into `phraseTable`.
PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase ) {
  map< PHRASE, PHRASE_ID >::iterator found = lookup.find( phrase );
  if (found != lookup.end())
    return found->second;

  const PHRASE_ID newId = phraseTable.size();
  phraseTable.push_back( phrase );
  lookup[ phrase ] = newId;
  return newId;
}
|
||||
|
||||
// Look up the id of `phrase`; returns 0 when unknown.
// NOTE(review): 0 is also the id of the first stored phrase, so callers
// cannot distinguish "unknown" from the first entry.
PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase ) {
  map< PHRASE, PHRASE_ID >::iterator found = lookup.find( phrase );
  return (found == lookup.end()) ? 0 : found->second;
}
|
||||
|
||||
// Forget all stored phrases and their ids.
void PhraseTable::clear() {
  phraseTable.clear();
  lookup.clear();
}
|
||||
|
||||
void DTable::init() {
|
||||
for(int i = -10; i<10; i++)
|
||||
dtable[i] = -abs( i );
|
||||
}
|
||||
|
||||
/*
|
||||
void DTable::load( const string& fileName ) {
|
||||
ifstream inFile;
|
||||
inFile.open(fileName.c_str());
|
||||
istream *inFileP = &inFile;
|
||||
|
||||
char line[TABLE_LINE_MAX_LENGTH];
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (inFileP->eof()) break;
|
||||
|
||||
vector<string> token = tokenize( line );
|
||||
if (token.size() < 2) {
|
||||
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
int d = atoi( token[0].c_str() );
|
||||
double prob = log( atof( token[1].c_str() ) );
|
||||
dtable[ d ] = prob;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
double DTable::get( int distortion ) {
|
||||
if (dtable.find( distortion ) == dtable.end())
|
||||
return log( 0.00001 );
|
||||
return dtable[ distortion ];
|
||||
}
|
||||
|
72
contrib/other-builds/extract-mixed-syntax/tables-core.h
Normal file
72
contrib/other-builds/extract-mixed-syntax/tables-core.h
Normal file
@ -0,0 +1,72 @@
|
||||
#pragma once
|
||||
// $Id: tables-core.h 2416 2009-07-30 11:07:38Z hieuhoang1972 $
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
#include <queue>
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define TABLE_LINE_MAX_LENGTH 1000
|
||||
#define UNKNOWNSTR "UNK"
|
||||
|
||||
vector<string> tokenize( const char[] );
|
||||
|
||||
//! delete and remove every element of a collection object such as map, set, list etc
|
||||
//! Delete every (pointer) element of a collection object such as map, set,
//! list etc, then empty the collection itself.
template<class COLL>
void RemoveAllInColl(COLL &coll)
{
  typename COLL::const_iterator it = coll.begin();
  const typename COLL::const_iterator itEnd = coll.end();
  for (; it != itEnd; ++it)
    delete *it;
  coll.clear();
}
|
||||
|
||||
// A word is stored as a string; word ids are dense indices into `vocab`.
typedef std::string WORD;
typedef unsigned int WORD_ID;

/** Bidirectional word <-> id mapping.
 *  `vocab` maps id -> word; `lookup` maps word -> id.
 */
class Vocabulary {
public:
  std::map<WORD, WORD_ID> lookup; // word -> id
  std::vector< WORD > vocab; // id -> word
  WORD_ID storeIfNew( const WORD& ); // add if unseen; returns the word's id
  WORD_ID getWordID( const WORD& ); // 0 if unknown (0 is also the first id)
  // Returns a mutable reference even though the method is const: the
  // legacy interface is kept for existing callers, but the old C-style
  // cast `(WORD&) vocab[id]` is replaced by an explicit const_cast so the
  // const-stripping is visible and greppable.
  inline WORD &getWord( WORD_ID id ) const {
    return const_cast<WORD&>( vocab[ id ] );
  }
};
|
||||
|
||||
// A phrase is a sequence of vocabulary word ids; phrase ids are dense
// indices into `phraseTable`.
typedef vector< WORD_ID > PHRASE;
typedef unsigned int PHRASE_ID;

/** Bidirectional phrase <-> id mapping, parallel to Vocabulary.
 *  `phraseTable` maps id -> phrase; `lookup` maps phrase -> id.
 */
class PhraseTable {
public:
  map< PHRASE, PHRASE_ID > lookup; // phrase -> id
  vector< PHRASE > phraseTable; // id -> phrase
  PHRASE_ID storeIfNew( const PHRASE& ); // add if unseen; returns the id
  PHRASE_ID getPhraseID( const PHRASE& ); // 0 if unknown (0 is also a valid id)
  void clear(); // drop all phrases and ids
  // Direct access by id; no bounds check.
  inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; }
};
|
||||
|
||||
// (phrase id, probability) pairs, e.g. one translation distribution.
typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC;

/** Translation table: maps a source phrase id to candidate target phrase
 *  ids, with a single score (`ttable`) or a score vector (`ttableMulti`).
 */
class TTable {
public:
  map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable;
  map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti;
};

/** Distortion table: log-score per distortion distance.
 *  init() installs a default linear penalty (dtable[d] = -|d| for d in
 *  [-10, 9]); get() falls back to log(0.00001) for unseen distances.
 */
class DTable {
public:
  map< int, double > dtable; // distortion distance -> log score
  void init(); // install the default linear penalty
  // NOTE(review): declared here, but the implementation in tables-core.cpp
  // is commented out.
  void load( const string& );
  double get( int ); // log score, smoothed for unseen distances
};
|
||||
|
||||
|
123
contrib/other-builds/extract-ordering/.cproject
Normal file
123
contrib/other-builds/extract-ordering/.cproject
Normal file
@ -0,0 +1,123 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<storageModule moduleId="org.eclipse.cdt.core.settings">
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" moduleId="org.eclipse.cdt.core.settings" name="Debug">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
|
||||
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127." name="/" resourcePath="">
|
||||
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.499747849" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
|
||||
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.798364121" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
|
||||
<builder buildPath="${workspace_loc:/extract-ordering}/Debug" id="cdt.managedbuild.builder.gnu.cross.1976289814" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
|
||||
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1324749613" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
|
||||
<option id="gnu.c.compiler.option.debugging.level.1750299246" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.719498215" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
|
||||
<option id="gnu.cpp.compiler.option.optimization.level.251118848" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.debugging.level.99297656" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1844372739" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1178164658" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
|
||||
<option id="gnu.cpp.link.option.libs.1434184833" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
|
||||
<listOptionValue builtIn="false" value="z"/>
|
||||
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
|
||||
<listOptionValue builtIn="false" value="boost_system-mt"/>
|
||||
<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
|
||||
</option>
|
||||
<option id="gnu.cpp.link.option.paths.974811544" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/>
|
||||
</option>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.904916320" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1005231499" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1318928675" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.604255673" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.818331963">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
|
||||
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.818331963." name="/" resourcePath="">
|
||||
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1489025499" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
|
||||
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1052477856" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
|
||||
<builder buildPath="${workspace_loc:/extract-ordering}/Release" id="cdt.managedbuild.builder.gnu.cross.33925527" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
|
||||
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1884790737" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
|
||||
<option id="gnu.c.compiler.option.debugging.level.197048136" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.106898878" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
|
||||
<option id="gnu.cpp.compiler.option.optimization.level.1920378037" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.debugging.level.37950410" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1197641703" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1356351201" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2053623412" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1988048517" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1494470963" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1553727957" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<project id="extract-ordering.cdt.managedbuild.target.gnu.cross.exe.1840421491" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
|
||||
</storageModule>
|
||||
<storageModule moduleId="scannerConfiguration">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417;cdt.managedbuild.tool.gnu.c.compiler.input.106898878">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446;cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964;cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827;cdt.managedbuild.tool.gnu.c.compiler.input.719498215">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
|
||||
</scannerConfigBuildInfo>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
|
||||
</cproject>
|
74
contrib/other-builds/extract-ordering/.project
Normal file
74
contrib/other-builds/extract-ordering/.project
Normal file
@ -0,0 +1,74 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<projectDescription>
|
||||
<name>extract-ordering</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
|
||||
<triggers>clean,full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
|
||||
<triggers>full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.eclipse.cdt.core.cnature</nature>
|
||||
<nature>org.eclipse.cdt.core.ccnature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
|
||||
</natures>
|
||||
<linkedResources>
|
||||
<link>
|
||||
<name>InputFileStream.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>InputFileStream.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>OutputFileStream.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>OutputFileStream.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>SentenceAlignment.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>SentenceAlignment.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>extract-ordering-main.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ordering-main.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>tables-core.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>tables-core.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.h</locationURI>
|
||||
</link>
|
||||
</linkedResources>
|
||||
</projectDescription>
|
124
contrib/other-builds/manual-label/.cproject
Normal file
124
contrib/other-builds/manual-label/.cproject
Normal file
@ -0,0 +1,124 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<storageModule moduleId="org.eclipse.cdt.core.settings">
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" moduleId="org.eclipse.cdt.core.settings" name="Debug">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
|
||||
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639." name="/" resourcePath="">
|
||||
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.1899954923" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
|
||||
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1645930772" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
|
||||
<builder buildPath="${workspace_loc:/manual-label/Debug}" id="cdt.managedbuild.builder.gnu.cross.1703642277" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
|
||||
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1888648788" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
|
||||
<option id="gnu.c.compiler.option.debugging.level.1838052643" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.798368516" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
|
||||
<option id="gnu.cpp.compiler.option.optimization.level.153015988" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.debugging.level.418888584" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.include.paths.406065865" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc}/../..""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc}/../../boost/include""/>
|
||||
</option>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1741441821" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1626431978" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
|
||||
<option id="gnu.cpp.link.option.libs.1886912770" superClass="gnu.cpp.link.option.libs" valueType="libs">
|
||||
<listOptionValue builtIn="false" value="boost_program_options-mt"/>
|
||||
</option>
|
||||
<option id="gnu.cpp.link.option.paths.1541583695" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc}/../../boost/lib64""/>
|
||||
</option>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1367999206" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.31522559" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.826957235" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.350181339" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
|
||||
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815." name="/" resourcePath="">
|
||||
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.97427761" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
|
||||
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.564169339" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
|
||||
<builder buildPath="${workspace_loc:/manual-label/Release}" id="cdt.managedbuild.builder.gnu.cross.663164336" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
|
||||
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.2135645103" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
|
||||
<option id="gnu.c.compiler.option.debugging.level.764935013" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1841809129" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
|
||||
<option id="gnu.cpp.compiler.option.optimization.level.1877584345" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
|
||||
<option id="gnu.cpp.compiler.option.debugging.level.935490779" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.355530813" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.940299092" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.17718999" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1527322008" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.480337803" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1788533940" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<project id="manual-label.cdt.managedbuild.target.gnu.cross.exe.2117548180" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
|
||||
</storageModule>
|
||||
<storageModule moduleId="scannerConfiguration">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943;cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607;cdt.managedbuild.tool.gnu.c.compiler.input.798368516">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437;cdt.managedbuild.tool.gnu.c.compiler.input.1841809129">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503;cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
|
||||
</scannerConfigBuildInfo>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
|
||||
</cproject>
|
27
contrib/other-builds/manual-label/.project
Normal file
27
contrib/other-builds/manual-label/.project
Normal file
@ -0,0 +1,27 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<projectDescription>
|
||||
<name>manual-label</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
|
||||
<triggers>clean,full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
|
||||
<triggers>full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.eclipse.cdt.core.cnature</nature>
|
||||
<nature>org.eclipse.cdt.core.ccnature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
|
||||
</natures>
|
||||
</projectDescription>
|
86
contrib/other-builds/manual-label/DeEn.cpp
Normal file
86
contrib/other-builds/manual-label/DeEn.cpp
Normal file
@ -0,0 +1,86 @@
|
||||
#include <list>
|
||||
#include "DeEn.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
extern bool g_debug;
|
||||
|
||||
bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str)
|
||||
{
|
||||
pos += offset;
|
||||
if (pos >= source.size() || pos < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const string &word = source[pos][factor];
|
||||
vector<string> soughts = Moses::Tokenize(str, " ");
|
||||
for (int i = 0; i < soughts.size(); ++i) {
|
||||
string &sought = soughts[i];
|
||||
bool found = (word == sought);
|
||||
if (found) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Contains(const Phrase &source, int start, int end, int factor, const string &str)
|
||||
{
|
||||
for (int pos = start; pos <= end; ++pos) {
|
||||
bool found = IsA(source, pos, 0, factor, str);
|
||||
if (found) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void LabelDeEn(const Phrase &source, ostream &out)
|
||||
{
|
||||
typedef pair<int,int> Range;
|
||||
typedef list<Range> Ranges;
|
||||
Ranges ranges;
|
||||
|
||||
// find ranges to label
|
||||
for (int start = 0; start < source.size(); ++start) {
|
||||
for (int end = start; end < source.size(); ++end) {
|
||||
if (IsA(source, start, -1, 1, "VAFIN")
|
||||
&& IsA(source, end, +1, 1, "VVINF VVPP")
|
||||
&& !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) {
|
||||
Range range(start, end);
|
||||
ranges.push_back(range);
|
||||
}
|
||||
else if ((start == 0 || IsA(source, start, -1, 1, "$,"))
|
||||
&& IsA(source, end, +1, 0, "zu")
|
||||
&& IsA(source, end, +2, 1, "VVINF")
|
||||
&& !Contains(source, start, end, 1, "$,")) {
|
||||
Range range(start, end);
|
||||
ranges.push_back(range);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// output sentence, with labels
|
||||
for (int pos = 0; pos < source.size(); ++pos) {
|
||||
// output beginning of label
|
||||
for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
|
||||
const Range &range = *iter;
|
||||
if (range.first == pos) {
|
||||
out << "<tree label=\"reorder-label\"> ";
|
||||
}
|
||||
}
|
||||
|
||||
const Word &word = source[pos];
|
||||
out << word[0] << " ";
|
||||
|
||||
for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
|
||||
const Range &range = *iter;
|
||||
if (range.second == pos) {
|
||||
out << "</tree> ";
|
||||
}
|
||||
}
|
||||
}
|
||||
out << endl;
|
||||
|
||||
}
|
10
contrib/other-builds/manual-label/DeEn.h
Normal file
10
contrib/other-builds/manual-label/DeEn.h
Normal file
@ -0,0 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
typedef std::vector<std::string> Word;
|
||||
typedef std::vector<Word> Phrase;
|
||||
|
||||
void LabelDeEn(const Phrase &source, std::ostream &out);
|
13
contrib/other-builds/manual-label/Makefile
Normal file
13
contrib/other-builds/manual-label/Makefile
Normal file
@ -0,0 +1,13 @@
|
||||
all: manual-label
|
||||
|
||||
clean:
|
||||
rm -f *.o manual-label
|
||||
|
||||
.cpp.o:
|
||||
g++ -I../../../ -O6 -g -c $<
|
||||
|
||||
manual-label: DeEn.o manual-label.o
|
||||
|
||||
g++ DeEn.o manual-label.o -lz -lboost_program_options-mt -o manual-label
|
||||
|
||||
|
88
contrib/other-builds/manual-label/manual-label.cpp
Normal file
88
contrib/other-builds/manual-label/manual-label.cpp
Normal file
@ -0,0 +1,88 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <boost/program_options.hpp>
|
||||
#include "moses/Util.h"
|
||||
#include "DeEn.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
bool g_debug = false;
|
||||
|
||||
Phrase Tokenize(const string &line);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
cerr << "Starting" << endl;
|
||||
|
||||
namespace po = boost::program_options;
|
||||
po::options_description desc("Options");
|
||||
desc.add_options()
|
||||
("help", "Print help messages")
|
||||
("add", "additional options")
|
||||
("source-language,s", po::value<string>()->required(), "Source Language")
|
||||
("target-language,t", po::value<string>()->required(), "Target Language");
|
||||
|
||||
po::variables_map vm;
|
||||
try
|
||||
{
|
||||
po::store(po::parse_command_line(argc, argv, desc),
|
||||
vm); // can throw
|
||||
|
||||
/** --help option
|
||||
*/
|
||||
if ( vm.count("help") )
|
||||
{
|
||||
std::cout << "Basic Command Line Parameter App" << std::endl
|
||||
<< desc << std::endl;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
po::notify(vm); // throws on error, so do after help in case
|
||||
// there are any problems
|
||||
}
|
||||
catch(po::error& e)
|
||||
{
|
||||
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
|
||||
std::cerr << desc << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
string sourceLang = vm["source-language"].as<string>();
|
||||
string targetLang = vm["target-language"].as<string>();
|
||||
cerr << sourceLang << " " << targetLang << endl;
|
||||
|
||||
string line;
|
||||
size_t lineNum = 1;
|
||||
|
||||
while (getline(cin, line)) {
|
||||
//cerr << lineNum << ":" << line << endl;
|
||||
if (lineNum % 1000 == 0) {
|
||||
cerr << lineNum << " ";
|
||||
}
|
||||
|
||||
Phrase source = Tokenize(line);
|
||||
|
||||
LabelDeEn(source, cout);
|
||||
|
||||
++lineNum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
cerr << "Finished" << endl;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
Phrase Tokenize(const string &line)
|
||||
{
|
||||
Phrase ret;
|
||||
|
||||
vector<string> toks = Moses::Tokenize(line);
|
||||
for (size_t i = 0; i < toks.size(); ++i) {
|
||||
Word word = Moses::Tokenize(toks[i], "|");
|
||||
ret.push_back(word);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -11,11 +11,11 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -64,11 +64,11 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -11,12 +11,12 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -88,13 +88,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.401150096" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -12,12 +12,13 @@ Building the RPM SPEC file
|
||||
The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information:
|
||||
|
||||
- The Git repository from which an installer will be built,
|
||||
- The branch in the Git repository to build, and
|
||||
- The branch in the Git repository to build,
|
||||
- The location of Boost on the build machine, and
|
||||
- The version of the installed Moses distribution.
|
||||
|
||||
For example, to build the RELEASE-1.0 branch in the mosesdecode repository (git://github.com/moses-smt/mosesdecoder.git):
|
||||
For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
|
||||
|
||||
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELASE-1.0 -v 1.0
|
||||
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELASE-1.0 -v 1.0 -t /usr
|
||||
|
||||
This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.
|
||||
|
||||
|
@ -1,11 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
BRANCH="master"
|
||||
BOOST="/usr"
|
||||
declare -i NO_RPM_BUILD=0
|
||||
declare -r RPM_VERSION_TAG="___RPM_VERSION__"
|
||||
declare -r BOOST_TAG="___BOOST_LOCATION__"
|
||||
|
||||
function usage() {
|
||||
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
|
||||
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version] -t [Boost install: default ${BOOST}]"
|
||||
exit 1
|
||||
}
|
||||
|
||||
@ -13,11 +15,12 @@ if [ $# -lt 4 ]; then
|
||||
usage
|
||||
fi
|
||||
|
||||
while getopts r:b:v:nh OPTION
|
||||
while getopts r:b:t:v:nh OPTION
|
||||
do
|
||||
case "$OPTION" in
|
||||
r) REPO="${OPTARG}";;
|
||||
b) BRANCH="${OPTARG}";;
|
||||
t) BOOST="${OPTARG}";;
|
||||
v) VERSION="${OPTARG}";;
|
||||
n) NO_RPM_BUILD=1;;
|
||||
[h\?]) usage;;
|
||||
@ -53,7 +56,8 @@ if [ ${NO_RPM_BUILD} -eq 0 ]; then
|
||||
if [ ! -d ${HOME}/rpmbuild/SPECS ]; then
|
||||
mkdir -p ${HOME}/rpmbuild/SPECS
|
||||
fi
|
||||
eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
|
||||
ESC_BOOST=`echo ${BOOST} | gawk '{gsub(/\//, "\\\\/"); print}'`
|
||||
eval sed -e \"s/${RPM_VERSION_TAG}/${VERSION}/\" -e \"s/${BOOST_TAG}/${ESC_BOOST}/\" ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
|
||||
if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then
|
||||
mkdir -p ${HOME}/rpmbuild/SOURCES
|
||||
fi
|
||||
|
@ -8,7 +8,7 @@ License: LGPL
|
||||
Group: Development/Tools
|
||||
Vendor: Capita Translation and Interpreting
|
||||
Packager: Ian Johnson <ian.johnson@capita-ti.com>
|
||||
Requires: boost >= 1.48, python >= 2.6, perl >= 5
|
||||
Requires: python >= 2.6, perl >= 5
|
||||
BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
|
||||
%description
|
||||
Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
|
||||
@ -35,16 +35,17 @@ cd ../giza-pp
|
||||
make
|
||||
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
|
||||
%build
|
||||
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
|
||||
./bjam --with-boost=___BOOST_LOCATION__ --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
|
||||
%install
|
||||
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R bin $RPM_BUILD_ROOT/opt/moses
|
||||
cp -R scripts/OSM $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/Transliteration $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
@ -52,12 +53,13 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
|
||||
%files
|
||||
%defattr(-,root,root)
|
||||
/opt/moses/bin/*
|
||||
/opt/moses/scripts/OSM/*
|
||||
/opt/moses/scripts/Transliteration/*
|
||||
/opt/moses/scripts/analysis/*
|
||||
/opt/moses/scripts/ems/*
|
||||
/opt/moses/scripts/generic/*
|
||||
/opt/moses/scripts/other/*
|
||||
/opt/moses/scripts/recaser/*
|
||||
/opt/moses/scripts/regression-testing/*
|
||||
/opt/moses/scripts/share/*
|
||||
/opt/moses/scripts/tokenizer/*
|
||||
/opt/moses/scripts/training/*
|
||||
|
@ -106,7 +106,7 @@ class Moses():
|
||||
scores = line[2].split()
|
||||
if len(scores) <self.number_of_features:
|
||||
sys.stderr.write('Error: model only has {0} features. Expected {1}.\n'.format(len(scores),self.number_of_features))
|
||||
exit()
|
||||
exit(1)
|
||||
|
||||
scores = scores[:self.number_of_features]
|
||||
model_probabilities = map(float,scores)
|
||||
@ -179,7 +179,7 @@ class Moses():
|
||||
reordering_probabilities[j][i] = p
|
||||
except IndexError:
|
||||
sys.stderr.write('\nIndexError: Did you correctly specify the number of reordering features? (--number_of_features N in command line)\n')
|
||||
exit()
|
||||
exit(1)
|
||||
|
||||
def traverse_incrementally(self,table,models,load_lines,store_flag,mode='interpolate',inverted=False,lowmem=False,flags=None):
|
||||
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
|
||||
@ -307,13 +307,13 @@ class Moses():
|
||||
elif len(line) == 4:
|
||||
if self.require_alignment:
|
||||
sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n')
|
||||
exit()
|
||||
exit(1)
|
||||
|
||||
self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')]
|
||||
|
||||
else:
|
||||
sys.stderr.write('Error: unexpected phrase table format. Are you using a very old/new version of Moses with different formatting?\n')
|
||||
exit()
|
||||
exit(1)
|
||||
|
||||
|
||||
def get_word_alignments(self,src,target,cache=False,mycache={}):
|
||||
@ -515,7 +515,7 @@ class TigerXML():
|
||||
|
||||
if not src or not target:
|
||||
sys.stderr.write('Error: Source and/or target language not specified. Required for TigerXML extraction.\n')
|
||||
exit()
|
||||
exit(1)
|
||||
|
||||
alignments = self._get_aligned_ids(src,target)
|
||||
self._textualize_alignments(src,target,alignments)
|
||||
@ -1261,7 +1261,7 @@ def handle_file(filename,action,fileobj=None,mode='r'):
|
||||
sys.stderr.write('For a weighted counts combination, we need statistics that Moses doesn\'t write to disk by default.\n')
|
||||
sys.stderr.write('Repeat step 4 of Moses training for all models with the option -write-lexical-counts.\n')
|
||||
|
||||
exit()
|
||||
exit(1)
|
||||
|
||||
if filename.endswith('.gz'):
|
||||
fileobj = gzip.open(filename,mode)
|
||||
@ -1435,7 +1435,7 @@ class Combine_TMs():
|
||||
|
||||
if mode not in ['interpolate','loglinear','counts']:
|
||||
sys.stderr.write('Error: mode must be either "interpolate", "loglinear" or "counts"\n')
|
||||
sys.exit()
|
||||
sys.exit(1)
|
||||
|
||||
models,number_of_features,weights = self._sanity_checks(models,number_of_features,weights)
|
||||
|
||||
|
@ -1,4 +1,6 @@
|
||||
#include "lm/bhiksha.hh"
|
||||
|
||||
#include "lm/binary_format.hh"
|
||||
#include "lm/config.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/exception.hh"
|
||||
@ -15,11 +17,11 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_
|
||||
const uint8_t kArrayBhikshaVersion = 0;
|
||||
|
||||
// TODO: put this in binary file header instead when I change the binary file format again.
|
||||
void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) {
|
||||
uint8_t version;
|
||||
uint8_t configured_bits;
|
||||
util::ReadOrThrow(fd, &version, 1);
|
||||
util::ReadOrThrow(fd, &configured_bits, 1);
|
||||
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
|
||||
uint8_t buffer[2];
|
||||
file.ReadForConfig(buffer, 2, offset);
|
||||
uint8_t version = buffer[0];
|
||||
uint8_t configured_bits = buffer[1];
|
||||
if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
|
||||
config.pointer_bhiksha_bits = configured_bits;
|
||||
}
|
||||
@ -87,9 +89,6 @@ void ArrayBhiksha::FinishedLoading(const Config &config) {
|
||||
*(head_write++) = config.pointer_bhiksha_bits;
|
||||
}
|
||||
|
||||
void ArrayBhiksha::LoadedBinary() {
|
||||
}
|
||||
|
||||
} // namespace trie
|
||||
} // namespace ngram
|
||||
} // namespace lm
|
||||
|
@ -24,6 +24,7 @@
|
||||
namespace lm {
|
||||
namespace ngram {
|
||||
struct Config;
|
||||
class BinaryFormat;
|
||||
|
||||
namespace trie {
|
||||
|
||||
@ -31,7 +32,7 @@ class DontBhiksha {
|
||||
public:
|
||||
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
|
||||
|
||||
static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {}
|
||||
static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {}
|
||||
|
||||
static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
|
||||
|
||||
@ -53,8 +54,6 @@ class DontBhiksha {
|
||||
|
||||
void FinishedLoading(const Config &/*config*/) {}
|
||||
|
||||
void LoadedBinary() {}
|
||||
|
||||
uint8_t InlineBits() const { return next_.bits; }
|
||||
|
||||
private:
|
||||
@ -65,7 +64,7 @@ class ArrayBhiksha {
|
||||
public:
|
||||
static const ModelType kModelTypeAdd = kArrayAdd;
|
||||
|
||||
static void UpdateConfigFromBinary(int fd, Config &config);
|
||||
static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
|
||||
|
||||
static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
|
||||
|
||||
@ -93,8 +92,6 @@ class ArrayBhiksha {
|
||||
|
||||
void FinishedLoading(const Config &config);
|
||||
|
||||
void LoadedBinary();
|
||||
|
||||
uint8_t InlineBits() const { return next_inline_.bits; }
|
||||
|
||||
private:
|
||||
|
@ -14,6 +14,9 @@
|
||||
|
||||
namespace lm {
|
||||
namespace ngram {
|
||||
|
||||
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
|
||||
|
||||
namespace {
|
||||
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
|
||||
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
|
||||
@ -58,8 +61,6 @@ struct Sanity {
|
||||
}
|
||||
};
|
||||
|
||||
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
|
||||
|
||||
std::size_t TotalHeaderSize(unsigned char order) {
|
||||
return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
|
||||
}
|
||||
@ -81,83 +82,6 @@ void WriteHeader(void *to, const Parameters ¶ms) {
|
||||
|
||||
} // namespace
|
||||
|
||||
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
|
||||
if (config.write_mmap) {
|
||||
std::size_t total = TotalHeaderSize(order) + memory_size;
|
||||
backing.file.reset(util::CreateOrThrow(config.write_mmap));
|
||||
if (config.write_method == Config::WRITE_MMAP) {
|
||||
backing.vocab.reset(util::MapZeroedWrite(backing.file.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
|
||||
} else {
|
||||
util::ResizeOrThrow(backing.file.get(), 0);
|
||||
util::MapAnonymous(total, backing.vocab);
|
||||
}
|
||||
strncpy(reinterpret_cast<char*>(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order));
|
||||
return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order);
|
||||
} else {
|
||||
util::MapAnonymous(memory_size, backing.vocab);
|
||||
return reinterpret_cast<uint8_t*>(backing.vocab.get());
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
|
||||
std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
|
||||
if (config.write_mmap) {
|
||||
// Grow the file to accomodate the search, using zeros.
|
||||
try {
|
||||
util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
|
||||
} catch (util::ErrnoException &e) {
|
||||
e << " for file " << config.write_mmap;
|
||||
throw e;
|
||||
}
|
||||
|
||||
if (config.write_method == Config::WRITE_AFTER) {
|
||||
util::MapAnonymous(memory_size, backing.search);
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get());
|
||||
}
|
||||
// mmap it now.
|
||||
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
|
||||
std::size_t page_size = util::SizePage();
|
||||
std::size_t alignment_cruft = adjusted_vocab % page_size;
|
||||
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
|
||||
} else {
|
||||
util::MapAnonymous(memory_size, backing.search);
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get());
|
||||
}
|
||||
}
|
||||
|
||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
|
||||
if (!config.write_mmap) return;
|
||||
switch (config.write_method) {
|
||||
case Config::WRITE_MMAP:
|
||||
util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
|
||||
util::SyncOrThrow(backing.search.get(), backing.search.size());
|
||||
break;
|
||||
case Config::WRITE_AFTER:
|
||||
util::SeekOrThrow(backing.file.get(), 0);
|
||||
util::WriteOrThrow(backing.file.get(), backing.vocab.get(), backing.vocab.size());
|
||||
util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad);
|
||||
util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size());
|
||||
util::FSyncOrThrow(backing.file.get());
|
||||
break;
|
||||
}
|
||||
// header and vocab share the same mmap. The header is written here because we know the counts.
|
||||
Parameters params = Parameters();
|
||||
params.counts = counts;
|
||||
params.fixed.order = counts.size();
|
||||
params.fixed.probing_multiplier = config.probing_multiplier;
|
||||
params.fixed.model_type = model_type;
|
||||
params.fixed.has_vocabulary = config.include_vocab;
|
||||
params.fixed.search_version = search_version;
|
||||
WriteHeader(backing.vocab.get(), params);
|
||||
if (config.write_method == Config::WRITE_AFTER) {
|
||||
util::SeekOrThrow(backing.file.get(), 0);
|
||||
util::WriteOrThrow(backing.file.get(), backing.vocab.get(), TotalHeaderSize(counts.size()));
|
||||
}
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
|
||||
bool IsBinaryFormat(int fd) {
|
||||
const uint64_t size = util::SizeFile(fd);
|
||||
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
|
||||
@ -209,44 +133,164 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
|
||||
UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
|
||||
}
|
||||
|
||||
void SeekPastHeader(int fd, const Parameters ¶ms) {
|
||||
util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
|
||||
const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
|
||||
|
||||
BinaryFormat::BinaryFormat(const Config &config)
|
||||
: write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
|
||||
header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}
|
||||
|
||||
void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms) {
|
||||
file_.reset(fd);
|
||||
write_mmap_ = NULL; // Ignore write requests; this is already in binary format.
|
||||
ReadHeader(fd, params);
|
||||
MatchCheck(model_type, search_version, params);
|
||||
header_size_ = TotalHeaderSize(params.counts.size());
|
||||
}
|
||||
|
||||
uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing) {
|
||||
const uint64_t file_size = util::SizeFile(backing.file.get());
|
||||
void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
|
||||
assert(header_size_ != kInvalidSize);
|
||||
util::PReadOrThrow(file_.get(), to, amount, offset_excluding_header + header_size_);
|
||||
}
|
||||
|
||||
void *BinaryFormat::LoadBinary(std::size_t size) {
|
||||
assert(header_size_ != kInvalidSize);
|
||||
const uint64_t file_size = util::SizeFile(file_.get());
|
||||
// The header is smaller than a page, so we have to map the whole header as well.
|
||||
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
|
||||
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
|
||||
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
|
||||
uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size);
|
||||
UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
|
||||
|
||||
util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search);
|
||||
util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_);
|
||||
|
||||
if (config.enumerate_vocab && !params.fixed.has_vocabulary)
|
||||
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
|
||||
|
||||
// Seek to vocabulary words
|
||||
util::SeekOrThrow(backing.file.get(), total_map);
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
|
||||
vocab_string_offset_ = total_map;
|
||||
return reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
|
||||
}
|
||||
|
||||
void ComplainAboutARPA(const Config &config, ModelType model_type) {
|
||||
if (config.write_mmap || !config.messages) return;
|
||||
if (config.arpa_complain == Config::ALL) {
|
||||
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
|
||||
} else if (config.arpa_complain == Config::EXPENSIVE &&
|
||||
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
|
||||
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
|
||||
void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
|
||||
vocab_size_ = memory_size;
|
||||
if (!write_mmap_) {
|
||||
header_size_ = 0;
|
||||
util::MapAnonymous(memory_size, memory_vocab_);
|
||||
return reinterpret_cast<uint8_t*>(memory_vocab_.get());
|
||||
}
|
||||
header_size_ = TotalHeaderSize(order);
|
||||
std::size_t total = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size));
|
||||
file_.reset(util::CreateOrThrow(write_mmap_));
|
||||
// some gccs complain about uninitialized variables even though all enum values are covered.
|
||||
void *vocab_base = NULL;
|
||||
switch (write_method_) {
|
||||
case Config::WRITE_MMAP:
|
||||
mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
|
||||
vocab_base = mapping_.get();
|
||||
break;
|
||||
case Config::WRITE_AFTER:
|
||||
util::ResizeOrThrow(file_.get(), 0);
|
||||
util::MapAnonymous(total, memory_vocab_);
|
||||
vocab_base = memory_vocab_.get();
|
||||
break;
|
||||
}
|
||||
strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
|
||||
return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
|
||||
}
|
||||
|
||||
void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) {
|
||||
assert(vocab_size_ != kInvalidSize);
|
||||
vocab_pad_ = vocab_pad;
|
||||
std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
|
||||
vocab_string_offset_ = new_size;
|
||||
if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
|
||||
util::MapAnonymous(memory_size, memory_search_);
|
||||
assert(header_size_ == 0 || write_mmap_);
|
||||
vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
|
||||
return reinterpret_cast<uint8_t*>(memory_search_.get());
|
||||
}
|
||||
|
||||
assert(write_method_ == Config::WRITE_MMAP);
|
||||
// Also known as total size without vocab words.
|
||||
// Grow the file to accomodate the search, using zeros.
|
||||
// According to man mmap, behavior is undefined when the file is resized
|
||||
// underneath a mmap that is not a multiple of the page size. So to be
|
||||
// safe, we'll unmap it and map it again.
|
||||
mapping_.reset();
|
||||
util::ResizeOrThrow(file_.get(), new_size);
|
||||
void *ret;
|
||||
MapFile(vocab_base, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) {
|
||||
// Checking Config's include_vocab is the responsibility of the caller.
|
||||
assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize);
|
||||
if (!write_mmap_) {
|
||||
// Unchanged base.
|
||||
vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get());
|
||||
search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
|
||||
return;
|
||||
}
|
||||
if (write_method_ == Config::WRITE_MMAP) {
|
||||
mapping_.reset();
|
||||
}
|
||||
util::SeekOrThrow(file_.get(), VocabStringReadingOffset());
|
||||
util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
|
||||
if (write_method_ == Config::WRITE_MMAP) {
|
||||
MapFile(vocab_base, search_base);
|
||||
} else {
|
||||
vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
|
||||
search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) {
|
||||
if (!write_mmap_) return;
|
||||
switch (write_method_) {
|
||||
case Config::WRITE_MMAP:
|
||||
util::SyncOrThrow(mapping_.get(), mapping_.size());
|
||||
break;
|
||||
case Config::WRITE_AFTER:
|
||||
util::SeekOrThrow(file_.get(), 0);
|
||||
util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size());
|
||||
util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_);
|
||||
util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size());
|
||||
util::FSyncOrThrow(file_.get());
|
||||
break;
|
||||
}
|
||||
// header and vocab share the same mmap.
|
||||
Parameters params = Parameters();
|
||||
memset(¶ms, 0, sizeof(Parameters));
|
||||
params.counts = counts;
|
||||
params.fixed.order = counts.size();
|
||||
params.fixed.probing_multiplier = config.probing_multiplier;
|
||||
params.fixed.model_type = model_type;
|
||||
params.fixed.has_vocabulary = config.include_vocab;
|
||||
params.fixed.search_version = search_version;
|
||||
switch (write_method_) {
|
||||
case Config::WRITE_MMAP:
|
||||
WriteHeader(mapping_.get(), params);
|
||||
util::SyncOrThrow(mapping_.get(), mapping_.size());
|
||||
break;
|
||||
case Config::WRITE_AFTER:
|
||||
{
|
||||
std::vector<uint8_t> buffer(TotalHeaderSize(counts.size()));
|
||||
WriteHeader(&buffer[0], params);
|
||||
util::SeekOrThrow(file_.get(), 0);
|
||||
util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) {
|
||||
mapping_.reset(util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()), vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED);
|
||||
vocab_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
|
||||
search_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_ + vocab_size_ + vocab_pad_;
|
||||
}
|
||||
|
||||
bool RecognizeBinary(const char *file, ModelType &recognized) {
|
||||
util::scoped_fd fd(util::OpenReadOrThrow(file));
|
||||
if (!detail::IsBinaryFormat(fd.get())) return false;
|
||||
if (!IsBinaryFormat(fd.get())) {
|
||||
return false;
|
||||
}
|
||||
Parameters params;
|
||||
detail::ReadHeader(fd.get(), params);
|
||||
ReadHeader(fd.get(), params);
|
||||
recognized = params.fixed.model_type;
|
||||
return true;
|
||||
}
|
||||
|
@ -17,6 +17,8 @@
|
||||
namespace lm {
|
||||
namespace ngram {
|
||||
|
||||
extern const char *kModelNames[6];
|
||||
|
||||
/*Inspect a file to determine if it is a binary lm. If not, return false.
|
||||
* If so, return true and set recognized to the type. This is the only API in
|
||||
* this header designed for use by decoder authors.
|
||||
@ -42,67 +44,63 @@ struct Parameters {
|
||||
std::vector<uint64_t> counts;
|
||||
};
|
||||
|
||||
struct Backing {
|
||||
// File behind memory, if any.
|
||||
util::scoped_fd file;
|
||||
// Vocabulary lookup table. Not to be confused with the vocab words themselves.
|
||||
util::scoped_memory vocab;
|
||||
// Raw block of memory backing the language model data structures
|
||||
util::scoped_memory search;
|
||||
class BinaryFormat {
|
||||
public:
|
||||
explicit BinaryFormat(const Config &config);
|
||||
|
||||
// Reading a binary file:
|
||||
// Takes ownership of fd
|
||||
void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms);
|
||||
// Used to read parts of the file to update the config object before figuring out full size.
|
||||
void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
|
||||
// Actually load the binary file and return a pointer to the beginning of the search area.
|
||||
void *LoadBinary(std::size_t size);
|
||||
|
||||
uint64_t VocabStringReadingOffset() const {
|
||||
assert(vocab_string_offset_ != kInvalidOffset);
|
||||
return vocab_string_offset_;
|
||||
}
|
||||
|
||||
// Writing a binary file or initializing in RAM from ARPA:
|
||||
// Size for vocabulary.
|
||||
void *SetupJustVocab(std::size_t memory_size, uint8_t order);
|
||||
// Warning: can change the vocaulary base pointer.
|
||||
void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
|
||||
// Warning: can change vocabulary and search base addresses.
|
||||
void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
|
||||
// Write the header at the beginning of the file.
|
||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
|
||||
|
||||
private:
|
||||
void MapFile(void *&vocab_base, void *&search_base);
|
||||
|
||||
// Copied from configuration.
|
||||
const Config::WriteMethod write_method_;
|
||||
const char *write_mmap_;
|
||||
util::LoadMethod load_method_;
|
||||
|
||||
// File behind memory, if any.
|
||||
util::scoped_fd file_;
|
||||
|
||||
// If there is a file involved, a single mapping.
|
||||
util::scoped_memory mapping_;
|
||||
|
||||
// If the data is only in memory, separately allocate each because the trie
|
||||
// knows vocab's size before it knows search's size (because SRILM might
|
||||
// have pruned).
|
||||
util::scoped_memory memory_vocab_, memory_search_;
|
||||
|
||||
// Memory ranges. Note that these may not be contiguous and may not all
|
||||
// exist.
|
||||
std::size_t header_size_, vocab_size_, vocab_pad_;
|
||||
// aka end of search.
|
||||
uint64_t vocab_string_offset_;
|
||||
|
||||
static const uint64_t kInvalidOffset = (uint64_t)-1;
|
||||
};
|
||||
|
||||
// Create just enough of a binary file to write vocabulary to it.
|
||||
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
|
||||
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
|
||||
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing);
|
||||
|
||||
// Write header to binary file. This is done last to prevent incomplete files
|
||||
// from loading.
|
||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing);
|
||||
|
||||
namespace detail {
|
||||
|
||||
bool IsBinaryFormat(int fd);
|
||||
|
||||
void ReadHeader(int fd, Parameters ¶ms);
|
||||
|
||||
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters ¶ms);
|
||||
|
||||
void SeekPastHeader(int fd, const Parameters ¶ms);
|
||||
|
||||
uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing);
|
||||
|
||||
void ComplainAboutARPA(const Config &config, ModelType model_type);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <class To> void LoadLM(const char *file, const Config &config, To &to) {
|
||||
Backing &backing = to.MutableBacking();
|
||||
backing.file.reset(util::OpenReadOrThrow(file));
|
||||
|
||||
try {
|
||||
if (detail::IsBinaryFormat(backing.file.get())) {
|
||||
Parameters params;
|
||||
detail::ReadHeader(backing.file.get(), params);
|
||||
detail::MatchCheck(To::kModelType, To::kVersion, params);
|
||||
// Replace the run-time configured probing_multiplier with the one in the file.
|
||||
Config new_config(config);
|
||||
new_config.probing_multiplier = params.fixed.probing_multiplier;
|
||||
detail::SeekPastHeader(backing.file.get(), params);
|
||||
To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config);
|
||||
uint64_t memory_size = To::Size(params.counts, new_config);
|
||||
uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
|
||||
to.InitializeFromBinary(start, params, new_config, backing.file.get());
|
||||
} else {
|
||||
detail::ComplainAboutARPA(config, To::kModelType);
|
||||
to.InitializeFromARPA(file, config);
|
||||
}
|
||||
} catch (util::Exception &e) {
|
||||
e << " File: " << file;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ngram
|
||||
} // namespace lm
|
||||
#endif // LM_BINARY_FORMAT__
|
||||
|
@ -87,7 +87,7 @@ class VocabHandout {
|
||||
Table table_;
|
||||
|
||||
std::size_t double_cutoff_;
|
||||
|
||||
|
||||
util::FakeOFStream word_list_;
|
||||
};
|
||||
|
||||
@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
|
||||
std::size_t operator()(const WordIndex *start) const {
|
||||
return util::MurmurHashNative(start, size_);
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
const std::size_t size_;
|
||||
};
|
||||
@ -106,11 +106,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
|
||||
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
|
||||
public:
|
||||
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
|
||||
|
||||
|
||||
bool operator()(const WordIndex *first, const WordIndex *second) const {
|
||||
return !memcmp(first, second, size_);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private:
|
||||
const std::size_t size_;
|
||||
};
|
||||
@ -131,7 +131,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
|
||||
|
||||
class Writer {
|
||||
public:
|
||||
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
|
||||
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
|
||||
: block_(position), gram_(block_->Get(), order),
|
||||
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
|
||||
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
|
||||
@ -140,7 +140,7 @@ class Writer {
|
||||
dedupe_.Clear();
|
||||
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
|
||||
if (order == 1) {
|
||||
// Add special words. AdjustCounts is responsible if order != 1.
|
||||
// Add special words. AdjustCounts is responsible if order != 1.
|
||||
AddUnigramWord(kUNK);
|
||||
AddUnigramWord(kBOS);
|
||||
}
|
||||
@ -170,16 +170,16 @@ class Writer {
|
||||
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
|
||||
return;
|
||||
}
|
||||
// Complete the write.
|
||||
// Complete the write.
|
||||
gram_.Count() = 1;
|
||||
// Prepare the next n-gram.
|
||||
// Prepare the next n-gram.
|
||||
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
|
||||
NGram last(gram_);
|
||||
gram_.NextInMemory();
|
||||
std::copy(last.begin() + 1, last.end(), gram_.begin());
|
||||
return;
|
||||
}
|
||||
// Block end. Need to store the context in a temporary buffer.
|
||||
// Block end. Need to store the context in a temporary buffer.
|
||||
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
|
||||
dedupe_.Clear();
|
||||
block_->SetValidSize(block_size_);
|
||||
@ -207,7 +207,7 @@ class Writer {
|
||||
// Hash table combiner implementation.
|
||||
Dedupe dedupe_;
|
||||
|
||||
// Small buffer to hold existing ngrams when shifting across a block boundary.
|
||||
// Small buffer to hold existing ngrams when shifting across a block boundary.
|
||||
boost::scoped_array<WordIndex> buffer_;
|
||||
|
||||
const std::size_t block_size_;
|
||||
@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
|
||||
return VocabHandout::MemUsage(vocab_estimate);
|
||||
}
|
||||
|
||||
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
|
||||
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
|
||||
: from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
|
||||
dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
|
||||
dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
|
||||
@ -240,7 +240,10 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
|
||||
uint64_t count = 0;
|
||||
bool delimiters[256];
|
||||
memset(delimiters, 0, sizeof(delimiters));
|
||||
delimiters['\0'] = delimiters['\t'] = delimiters['\n'] = delimiters['\r'] = delimiters[' '] = true;
|
||||
const char kDelimiterSet[] = "\0\t\n\r ";
|
||||
for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) {
|
||||
delimiters[static_cast<unsigned char>(*i)] = true;
|
||||
}
|
||||
try {
|
||||
while(true) {
|
||||
StringPiece line(from_.ReadLine());
|
||||
|
@ -33,12 +33,12 @@ class Callback {
|
||||
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
|
||||
probs_[order_minus_1 + 1] = pay.complete.prob;
|
||||
pay.complete.prob = log10(pay.complete.prob);
|
||||
// TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
|
||||
if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check valid pointer at tht end
|
||||
// TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
|
||||
if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
|
||||
pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
|
||||
++backoffs_[order_minus_1];
|
||||
} else {
|
||||
// Not a context.
|
||||
// Not a context.
|
||||
pay.complete.backoff = 0.0;
|
||||
}
|
||||
}
|
||||
@ -52,7 +52,7 @@ class Callback {
|
||||
};
|
||||
} // namespace
|
||||
|
||||
Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
|
||||
Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
|
||||
: uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
|
||||
|
||||
// perform order-wise interpolation
|
||||
|
@ -11,11 +11,7 @@ Config::Config() :
|
||||
enumerate_vocab(NULL),
|
||||
unknown_missing(COMPLAIN),
|
||||
sentence_marker_missing(THROW_UP),
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
positive_log_probability(SILENT),
|
||||
#else
|
||||
positive_log_probability(THROW_UP),
|
||||
#endif
|
||||
unknown_missing_logprob(-100.0),
|
||||
probing_multiplier(1.5),
|
||||
building_memory(1073741824ULL), // 1 GB
|
||||
|
@ -17,14 +17,14 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
|
||||
typedef VocabularyT Vocabulary;
|
||||
|
||||
/* Translate from void* to State */
|
||||
FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
|
||||
FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
|
||||
return static_cast<const Child*>(this)->FullScore(
|
||||
*reinterpret_cast<const State*>(in_state),
|
||||
new_word,
|
||||
*reinterpret_cast<State*>(out_state));
|
||||
}
|
||||
|
||||
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
|
||||
FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
|
||||
return static_cast<const Child*>(this)->FullScoreForgotState(
|
||||
context_rbegin,
|
||||
context_rend,
|
||||
@ -37,7 +37,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
|
||||
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
|
||||
}
|
||||
|
||||
float Score(const void *in_state, const WordIndex new_word, void *out_state) const {
|
||||
float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const {
|
||||
return static_cast<const Child*>(this)->Score(
|
||||
*reinterpret_cast<const State*>(in_state),
|
||||
new_word,
|
||||
|
@ -14,10 +14,6 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if !defined __MINGW32__
|
||||
#include <err.h>
|
||||
#endif
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
@ -5,27 +5,18 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#if !defined __MINGW32__
|
||||
#include <err.h>
|
||||
#endif
|
||||
|
||||
#include "util/fake_ofstream.hh"
|
||||
#include "util/file.hh"
|
||||
#include "util/file_piece.hh"
|
||||
|
||||
namespace lm {
|
||||
|
||||
class CountOutput : boost::noncopyable {
|
||||
public:
|
||||
explicit CountOutput(const char *name) : file_(name, std::ios::out) {}
|
||||
explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {}
|
||||
|
||||
void AddNGram(const StringPiece &line) {
|
||||
if (!(file_ << line << '\n')) {
|
||||
#if defined __MINGW32__
|
||||
std::cerr<<"Writing counts file failed"<<std::endl;
|
||||
exit(3);
|
||||
#else
|
||||
err(3, "Writing counts file failed");
|
||||
#endif
|
||||
}
|
||||
file_ << line << '\n';
|
||||
}
|
||||
|
||||
template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
|
||||
@ -37,12 +28,12 @@ class CountOutput : boost::noncopyable {
|
||||
}
|
||||
|
||||
private:
|
||||
std::fstream file_;
|
||||
util::FakeOFStream file_;
|
||||
};
|
||||
|
||||
class CountBatch {
|
||||
public:
|
||||
explicit CountBatch(std::streamsize initial_read)
|
||||
explicit CountBatch(std::streamsize initial_read)
|
||||
: initial_read_(initial_read) {
|
||||
buffer_.reserve(initial_read);
|
||||
}
|
||||
@ -75,7 +66,7 @@ class CountBatch {
|
||||
private:
|
||||
std::streamsize initial_read_;
|
||||
|
||||
// This could have been a std::string but that's less happy with raw writes.
|
||||
// This could have been a std::string but that's less happy with raw writes.
|
||||
std::vector<char> buffer_;
|
||||
};
|
||||
|
||||
|
@ -6,6 +6,7 @@
|
||||
#endif
|
||||
#include "lm/filter/vocab.hh"
|
||||
#include "lm/filter/wrapper.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "util/file_piece.hh"
|
||||
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
@ -57,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
|
||||
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
|
||||
|
||||
struct Config {
|
||||
Config() :
|
||||
Config() :
|
||||
#ifndef NTHREAD
|
||||
batch_size(25000),
|
||||
threads(boost::thread::hardware_concurrency()),
|
||||
@ -157,102 +158,96 @@ template <class Format> void DispatchFilterModes(const Config &config, std::istr
|
||||
} // namespace lm
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc < 4) {
|
||||
lm::DisplayHelp(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// I used to have boost::program_options, but some users didn't want to compile boost.
|
||||
lm::Config config;
|
||||
config.mode = lm::MODE_UNSET;
|
||||
for (int i = 1; i < argc - 2; ++i) {
|
||||
const char *str = argv[i];
|
||||
if (!std::strcmp(str, "copy")) {
|
||||
config.mode = lm::MODE_COPY;
|
||||
} else if (!std::strcmp(str, "single")) {
|
||||
config.mode = lm::MODE_SINGLE;
|
||||
} else if (!std::strcmp(str, "multiple")) {
|
||||
config.mode = lm::MODE_MULTIPLE;
|
||||
} else if (!std::strcmp(str, "union")) {
|
||||
config.mode = lm::MODE_UNION;
|
||||
} else if (!std::strcmp(str, "phrase")) {
|
||||
config.phrase = true;
|
||||
} else if (!std::strcmp(str, "context")) {
|
||||
config.context = true;
|
||||
} else if (!std::strcmp(str, "arpa")) {
|
||||
config.format = lm::FORMAT_ARPA;
|
||||
} else if (!std::strcmp(str, "raw")) {
|
||||
config.format = lm::FORMAT_COUNT;
|
||||
#ifndef NTHREAD
|
||||
} else if (!std::strncmp(str, "threads:", 8)) {
|
||||
config.threads = boost::lexical_cast<size_t>(str + 8);
|
||||
if (!config.threads) {
|
||||
std::cerr << "Specify at least one thread." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
} else if (!std::strncmp(str, "batch_size:", 11)) {
|
||||
config.batch_size = boost::lexical_cast<size_t>(str + 11);
|
||||
if (config.batch_size < 5000) {
|
||||
std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
|
||||
if (!config.batch_size) return 1;
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
try {
|
||||
if (argc < 4) {
|
||||
lm::DisplayHelp(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (config.mode == lm::MODE_UNSET) {
|
||||
lm::DisplayHelp(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
|
||||
std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool cmd_is_model = true;
|
||||
const char *cmd_input = argv[argc - 2];
|
||||
if (!strncmp(cmd_input, "vocab:", 6)) {
|
||||
cmd_is_model = false;
|
||||
cmd_input += 6;
|
||||
} else if (!strncmp(cmd_input, "model:", 6)) {
|
||||
cmd_input += 6;
|
||||
} else if (strchr(cmd_input, ':')) {
|
||||
#if defined __MINGW32__
|
||||
std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
|
||||
exit(1);
|
||||
#else
|
||||
errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
|
||||
#endif // defined
|
||||
} else {
|
||||
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
|
||||
}
|
||||
std::ifstream cmd_file;
|
||||
std::istream *vocab;
|
||||
if (cmd_is_model) {
|
||||
vocab = &std::cin;
|
||||
} else {
|
||||
cmd_file.open(cmd_input, std::ios::in);
|
||||
if (!cmd_file) {
|
||||
#if defined __MINGW32__
|
||||
std::cerr << "Could not open input file " << cmd_input << std::endl;
|
||||
exit(2);
|
||||
#else
|
||||
err(2, "Could not open input file %s", cmd_input);
|
||||
#endif // defined
|
||||
// I used to have boost::program_options, but some users didn't want to compile boost.
|
||||
lm::Config config;
|
||||
config.mode = lm::MODE_UNSET;
|
||||
for (int i = 1; i < argc - 2; ++i) {
|
||||
const char *str = argv[i];
|
||||
if (!std::strcmp(str, "copy")) {
|
||||
config.mode = lm::MODE_COPY;
|
||||
} else if (!std::strcmp(str, "single")) {
|
||||
config.mode = lm::MODE_SINGLE;
|
||||
} else if (!std::strcmp(str, "multiple")) {
|
||||
config.mode = lm::MODE_MULTIPLE;
|
||||
} else if (!std::strcmp(str, "union")) {
|
||||
config.mode = lm::MODE_UNION;
|
||||
} else if (!std::strcmp(str, "phrase")) {
|
||||
config.phrase = true;
|
||||
} else if (!std::strcmp(str, "context")) {
|
||||
config.context = true;
|
||||
} else if (!std::strcmp(str, "arpa")) {
|
||||
config.format = lm::FORMAT_ARPA;
|
||||
} else if (!std::strcmp(str, "raw")) {
|
||||
config.format = lm::FORMAT_COUNT;
|
||||
#ifndef NTHREAD
|
||||
} else if (!std::strncmp(str, "threads:", 8)) {
|
||||
config.threads = boost::lexical_cast<size_t>(str + 8);
|
||||
if (!config.threads) {
|
||||
std::cerr << "Specify at least one thread." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
} else if (!std::strncmp(str, "batch_size:", 11)) {
|
||||
config.batch_size = boost::lexical_cast<size_t>(str + 11);
|
||||
if (config.batch_size < 5000) {
|
||||
std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
|
||||
if (!config.batch_size) return 1;
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
lm::DisplayHelp(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
vocab = &cmd_file;
|
||||
}
|
||||
|
||||
util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
|
||||
if (config.mode == lm::MODE_UNSET) {
|
||||
lm::DisplayHelp(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (config.format == lm::FORMAT_ARPA) {
|
||||
lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
|
||||
} else if (config.format == lm::FORMAT_COUNT) {
|
||||
lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
|
||||
if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
|
||||
std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool cmd_is_model = true;
|
||||
const char *cmd_input = argv[argc - 2];
|
||||
if (!strncmp(cmd_input, "vocab:", 6)) {
|
||||
cmd_is_model = false;
|
||||
cmd_input += 6;
|
||||
} else if (!strncmp(cmd_input, "model:", 6)) {
|
||||
cmd_input += 6;
|
||||
} else if (strchr(cmd_input, ':')) {
|
||||
std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
|
||||
return 1;
|
||||
} else {
|
||||
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
|
||||
}
|
||||
std::ifstream cmd_file;
|
||||
std::istream *vocab;
|
||||
if (cmd_is_model) {
|
||||
vocab = &std::cin;
|
||||
} else {
|
||||
cmd_file.open(cmd_input, std::ios::in);
|
||||
UTIL_THROW_IF(!cmd_file, util::ErrnoException, "Failed to open " << cmd_input);
|
||||
vocab = &cmd_file;
|
||||
}
|
||||
|
||||
util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
|
||||
|
||||
if (config.format == lm::FORMAT_ARPA) {
|
||||
lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
|
||||
} else if (config.format == lm::FORMAT_COUNT) {
|
||||
lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
|
||||
}
|
||||
return 0;
|
||||
} catch (const std::exception &e) {
|
||||
std::cerr << e.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
#ifndef LM_FILTER_FORMAT_H__
|
||||
#define LM_FITLER_FORMAT_H__
|
||||
#define LM_FILTER_FORMAT_H__
|
||||
|
||||
#include "lm/filter/arpa_io.hh"
|
||||
#include "lm/filter/count_io.hh"
|
||||
|
@ -5,10 +5,6 @@
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#if !defined __MINGW32__
|
||||
#include <err.h>
|
||||
#endif
|
||||
|
||||
namespace lm {
|
||||
namespace vocab {
|
||||
|
||||
@ -34,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
|
||||
}// namespace
|
||||
|
||||
// Read space separated words in enter separated lines. These lines can be
|
||||
// very long, so don't read an entire line at a time.
|
||||
// very long, so don't read an entire line at a time.
|
||||
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
|
||||
in.exceptions(std::istream::badbit);
|
||||
unsigned int sentence = 0;
|
||||
|
84
lm/model.cc
84
lm/model.cc
@ -34,8 +34,47 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
if (static_cast<std::size_t>(start - static_cast<uint8_t*>(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast<uint8_t*>(base)) << " but Size says they should take " << goal_size);
|
||||
}
|
||||
|
||||
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
|
||||
LoadLM(file, config, *this);
|
||||
namespace {
|
||||
void ComplainAboutARPA(const Config &config, ModelType model_type) {
|
||||
if (config.write_mmap || !config.messages) return;
|
||||
if (config.arpa_complain == Config::ALL) {
|
||||
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
|
||||
} else if (config.arpa_complain == Config::EXPENSIVE &&
|
||||
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
|
||||
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void CheckCounts(const std::vector<uint64_t> &counts) {
|
||||
UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
|
||||
if (sizeof(uint64_t) > sizeof(std::size_t)) {
|
||||
for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) {
|
||||
UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &init_config) : backing_(init_config) {
|
||||
util::scoped_fd fd(util::OpenReadOrThrow(file));
|
||||
if (IsBinaryFormat(fd.get())) {
|
||||
Parameters parameters;
|
||||
int fd_shallow = fd.release();
|
||||
backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters);
|
||||
CheckCounts(parameters.counts);
|
||||
|
||||
Config new_config(init_config);
|
||||
new_config.probing_multiplier = parameters.fixed.probing_multiplier;
|
||||
Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config);
|
||||
UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
|
||||
|
||||
SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config);
|
||||
vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset());
|
||||
} else {
|
||||
ComplainAboutARPA(init_config, kModelType);
|
||||
InitializeFromARPA(fd.release(), file, init_config);
|
||||
}
|
||||
|
||||
// g++ prints warnings unless these are fully initialized.
|
||||
State begin_sentence = State();
|
||||
@ -50,27 +89,9 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
|
||||
P::Init(begin_sentence, null_context, vocab_, search_.Order());
|
||||
}
|
||||
|
||||
namespace {
|
||||
void CheckCounts(const std::vector<uint64_t> &counts) {
|
||||
UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
|
||||
if (sizeof(uint64_t) > sizeof(std::size_t)) {
|
||||
for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) {
|
||||
UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines.");
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) {
|
||||
CheckCounts(params.counts);
|
||||
SetupMemory(start, params.counts, config);
|
||||
vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
|
||||
search_.LoadedBinary();
|
||||
}
|
||||
|
||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
|
||||
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
|
||||
util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
|
||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(int fd, const char *file, const Config &config) {
|
||||
// Backing file is the ARPA.
|
||||
util::FilePiece f(fd, file, config.ProgressMessages());
|
||||
try {
|
||||
std::vector<uint64_t> counts;
|
||||
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
|
||||
@ -81,13 +102,17 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
|
||||
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
|
||||
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
|
||||
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
|
||||
vocab_.SetupMemory(backing_.SetupJustVocab(vocab_size, counts.size()), vocab_size, counts[0], config);
|
||||
|
||||
if (config.write_mmap) {
|
||||
if (config.write_mmap && config.include_vocab) {
|
||||
WriteWordsWrapper wrap(config.enumerate_vocab);
|
||||
vocab_.ConfigureEnumerate(&wrap, counts[0]);
|
||||
search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
|
||||
wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + Search::Size(counts, config));
|
||||
void *vocab_rebase, *search_rebase;
|
||||
backing_.WriteVocabWords(wrap.Buffer(), vocab_rebase, search_rebase);
|
||||
// Due to writing at the end of file, mmap may have relocated data. So remap.
|
||||
vocab_.Relocate(vocab_rebase);
|
||||
search_.SetupMemory(reinterpret_cast<uint8_t*>(search_rebase), counts, config);
|
||||
} else {
|
||||
vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]);
|
||||
search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
|
||||
@ -99,18 +124,13 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
search_.UnknownUnigram().backoff = 0.0;
|
||||
search_.UnknownUnigram().prob = config.unknown_missing_logprob;
|
||||
}
|
||||
FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_);
|
||||
backing_.FinishFile(config, kModelType, kVersion, counts);
|
||||
} catch (util::Exception &e) {
|
||||
e << " Byte: " << f.Offset();
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
|
||||
util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
|
||||
Search::UpdateConfigFromBinary(fd, counts, config);
|
||||
}
|
||||
|
||||
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
|
||||
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
|
||||
for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) {
|
||||
|
12
lm/model.hh
12
lm/model.hh
@ -104,10 +104,6 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
|
||||
}
|
||||
|
||||
private:
|
||||
friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
|
||||
|
||||
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
|
||||
|
||||
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
|
||||
|
||||
// Score bigrams and above. Do not include backoff.
|
||||
@ -116,15 +112,11 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
|
||||
// Appears after Size in the cc file.
|
||||
void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);
|
||||
|
||||
void InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd);
|
||||
|
||||
void InitializeFromARPA(const char *file, const Config &config);
|
||||
void InitializeFromARPA(int fd, const char *file, const Config &config);
|
||||
|
||||
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
|
||||
|
||||
Backing &MutableBacking() { return backing_; }
|
||||
|
||||
Backing backing_;
|
||||
BinaryFormat backing_;
|
||||
|
||||
VocabularyT vocab_;
|
||||
|
||||
|
@ -360,10 +360,11 @@ BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) {
|
||||
LoadingTest<QuantArrayTrieModel>();
|
||||
}
|
||||
|
||||
template <class ModelT> void BinaryTest() {
|
||||
template <class ModelT> void BinaryTest(Config::WriteMethod write_method) {
|
||||
Config config;
|
||||
config.write_mmap = "test.binary";
|
||||
config.messages = NULL;
|
||||
config.write_method = write_method;
|
||||
ExpectEnumerateVocab enumerate;
|
||||
config.enumerate_vocab = &enumerate;
|
||||
|
||||
@ -406,6 +407,11 @@ template <class ModelT> void BinaryTest() {
|
||||
unlink("test_nounk.binary");
|
||||
}
|
||||
|
||||
template <class ModelT> void BinaryTest() {
|
||||
BinaryTest<ModelT>(Config::WRITE_MMAP);
|
||||
BinaryTest<ModelT>(Config::WRITE_AFTER);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(write_and_read_probing) {
|
||||
BinaryTest<ProbingModel>();
|
||||
}
|
||||
|
@ -38,13 +38,13 @@ const char kSeparatelyQuantizeVersion = 2;
|
||||
|
||||
} // namespace
|
||||
|
||||
void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
|
||||
char version;
|
||||
util::ReadOrThrow(fd, &version, 1);
|
||||
util::ReadOrThrow(fd, &config.prob_bits, 1);
|
||||
util::ReadOrThrow(fd, &config.backoff_bits, 1);
|
||||
void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
|
||||
unsigned char buffer[3];
|
||||
file.ReadForConfig(buffer, 3, offset);
|
||||
char version = buffer[0];
|
||||
config.prob_bits = buffer[1];
|
||||
config.backoff_bits = buffer[2];
|
||||
if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
|
||||
util::AdvanceOrThrow(fd, -3);
|
||||
}
|
||||
|
||||
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
|
||||
|
@ -18,12 +18,13 @@ namespace lm {
|
||||
namespace ngram {
|
||||
|
||||
struct Config;
|
||||
class BinaryFormat;
|
||||
|
||||
/* Store values directly and don't quantize. */
|
||||
class DontQuantize {
|
||||
public:
|
||||
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
|
||||
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
|
||||
static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &) {}
|
||||
static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; }
|
||||
static uint8_t MiddleBits(const Config &/*config*/) { return 63; }
|
||||
static uint8_t LongestBits(const Config &/*config*/) { return 31; }
|
||||
@ -136,7 +137,7 @@ class SeparatelyQuantize {
|
||||
public:
|
||||
static const ModelType kModelTypeAdd = kQuantAdd;
|
||||
|
||||
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
|
||||
static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
|
||||
|
||||
static uint64_t Size(uint8_t order, const Config &config) {
|
||||
uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float);
|
||||
|
@ -204,9 +204,10 @@ template <class Build, class Activate, class Store> void ReadNGrams(
|
||||
namespace detail {
|
||||
|
||||
template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
|
||||
std::size_t allocated = Unigram::Size(counts[0]);
|
||||
unigram_ = Unigram(start, counts[0], allocated);
|
||||
start += allocated;
|
||||
unigram_ = Unigram(start, counts[0]);
|
||||
start += Unigram::Size(counts[0]);
|
||||
std::size_t allocated;
|
||||
middle_.clear();
|
||||
for (unsigned int n = 2; n < counts.size(); ++n) {
|
||||
allocated = Middle::Size(counts[n - 1], config.probing_multiplier);
|
||||
middle_.push_back(Middle(start, allocated));
|
||||
@ -218,9 +219,21 @@ template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start,
|
||||
return start;
|
||||
}
|
||||
|
||||
template <class Value> void HashedSearch<Value>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing) {
|
||||
// TODO: fix sorted.
|
||||
SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config);
|
||||
/*template <class Value> void HashedSearch<Value>::Relocate(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
|
||||
unigram_ = Unigram(start, counts[0]);
|
||||
start += Unigram::Size(counts[0]);
|
||||
for (unsigned int n = 2; n < counts.size(); ++n) {
|
||||
middle[n-2].Relocate(start);
|
||||
start += Middle::Size(counts[n - 1], config.probing_multiplier)
|
||||
}
|
||||
longest_.Relocate(start);
|
||||
}*/
|
||||
|
||||
template <class Value> void HashedSearch<Value>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing) {
|
||||
void *vocab_rebase;
|
||||
void *search_base = backing.GrowForSearch(Size(counts, config), vocab.UnkCountChangePadding(), vocab_rebase);
|
||||
vocab.Relocate(vocab_rebase);
|
||||
SetupMemory(reinterpret_cast<uint8_t*>(search_base), counts, config);
|
||||
|
||||
PositiveProbWarn warn(config.positive_log_probability);
|
||||
Read1Grams(f, counts[0], vocab, unigram_.Raw(), warn);
|
||||
@ -277,14 +290,6 @@ template <class Value> template <class Build> void HashedSearch<Value>::ApplyBui
|
||||
ReadEnd(f);
|
||||
}
|
||||
|
||||
template <class Value> void HashedSearch<Value>::LoadedBinary() {
|
||||
unigram_.LoadedBinary();
|
||||
for (typename std::vector<Middle>::iterator i = middle_.begin(); i != middle_.end(); ++i) {
|
||||
i->LoadedBinary();
|
||||
}
|
||||
longest_.LoadedBinary();
|
||||
}
|
||||
|
||||
template class HashedSearch<BackoffValue>;
|
||||
template class HashedSearch<RestValue>;
|
||||
|
||||
|
@ -18,7 +18,7 @@ namespace util { class FilePiece; }
|
||||
|
||||
namespace lm {
|
||||
namespace ngram {
|
||||
struct Backing;
|
||||
class BinaryFormat;
|
||||
class ProbingVocabulary;
|
||||
namespace detail {
|
||||
|
||||
@ -72,7 +72,7 @@ template <class Value> class HashedSearch {
|
||||
static const unsigned int kVersion = 0;
|
||||
|
||||
// TODO: move probing_multiplier here with next binary file format update.
|
||||
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
|
||||
static void UpdateConfigFromBinary(const BinaryFormat &, const std::vector<uint64_t> &, uint64_t, Config &) {}
|
||||
|
||||
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
|
||||
uint64_t ret = Unigram::Size(counts[0]);
|
||||
@ -84,9 +84,7 @@ template <class Value> class HashedSearch {
|
||||
|
||||
uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config);
|
||||
|
||||
void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing);
|
||||
|
||||
void LoadedBinary();
|
||||
void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing);
|
||||
|
||||
unsigned char Order() const {
|
||||
return middle_.size() + 2;
|
||||
@ -148,7 +146,7 @@ template <class Value> class HashedSearch {
|
||||
public:
|
||||
Unigram() {}
|
||||
|
||||
Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
|
||||
Unigram(void *start, uint64_t count) :
|
||||
unigram_(static_cast<typename Value::Weights*>(start))
|
||||
#ifdef DEBUG
|
||||
, count_(count)
|
||||
@ -168,8 +166,6 @@ template <class Value> class HashedSearch {
|
||||
|
||||
typename Value::Weights &Unknown() { return unigram_[0]; }
|
||||
|
||||
void LoadedBinary() {}
|
||||
|
||||
// For building.
|
||||
typename Value::Weights *Raw() { return unigram_; }
|
||||
|
||||
|
@ -459,7 +459,7 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
|
||||
|
||||
} // namespace
|
||||
|
||||
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
|
||||
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing) {
|
||||
RecordReader inputs[KENLM_MAX_ORDER - 1];
|
||||
RecordReader contexts[KENLM_MAX_ORDER - 1];
|
||||
|
||||
@ -488,7 +488,10 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
||||
|
||||
sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
|
||||
|
||||
out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), backing), fixed_counts, config);
|
||||
void *vocab_relocate;
|
||||
void *search_base = backing.GrowForSearch(TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), vocab.UnkCountChangePadding(), vocab_relocate);
|
||||
vocab.Relocate(vocab_relocate);
|
||||
out.SetupMemory(reinterpret_cast<uint8_t*>(search_base), fixed_counts, config);
|
||||
|
||||
for (unsigned char i = 2; i <= counts.size(); ++i) {
|
||||
inputs[i-2].Rewind();
|
||||
@ -571,15 +574,7 @@ template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::Setup
|
||||
return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]);
|
||||
}
|
||||
|
||||
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::LoadedBinary() {
|
||||
unigram_.LoadedBinary();
|
||||
for (Middle *i = middle_begin_; i != middle_end_; ++i) {
|
||||
i->LoadedBinary();
|
||||
}
|
||||
longest_.LoadedBinary();
|
||||
}
|
||||
|
||||
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) {
|
||||
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing) {
|
||||
std::string temporary_prefix;
|
||||
if (config.temporary_directory_prefix) {
|
||||
temporary_prefix = config.temporary_directory_prefix;
|
||||
|
@ -17,13 +17,13 @@
|
||||
|
||||
namespace lm {
|
||||
namespace ngram {
|
||||
struct Backing;
|
||||
class BinaryFormat;
|
||||
class SortedVocabulary;
|
||||
namespace trie {
|
||||
|
||||
template <class Quant, class Bhiksha> class TrieSearch;
|
||||
class SortedFiles;
|
||||
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
|
||||
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing);
|
||||
|
||||
template <class Quant, class Bhiksha> class TrieSearch {
|
||||
public:
|
||||
@ -39,11 +39,11 @@ template <class Quant, class Bhiksha> class TrieSearch {
|
||||
|
||||
static const unsigned int kVersion = 1;
|
||||
|
||||
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
|
||||
Quant::UpdateConfigFromBinary(fd, counts, config);
|
||||
util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0]));
|
||||
static void UpdateConfigFromBinary(const BinaryFormat &file, const std::vector<uint64_t> &counts, uint64_t offset, Config &config) {
|
||||
Quant::UpdateConfigFromBinary(file, offset, config);
|
||||
// Currently the unigram pointers are not compresssed, so there will only be a header for order > 2.
|
||||
if (counts.size() > 2) Bhiksha::UpdateConfigFromBinary(fd, config);
|
||||
if (counts.size() > 2)
|
||||
Bhiksha::UpdateConfigFromBinary(file, offset + Quant::Size(counts.size(), config) + Unigram::Size(counts[0]), config);
|
||||
}
|
||||
|
||||
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
|
||||
@ -60,9 +60,7 @@ template <class Quant, class Bhiksha> class TrieSearch {
|
||||
|
||||
uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config);
|
||||
|
||||
void LoadedBinary();
|
||||
|
||||
void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing);
|
||||
void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing);
|
||||
|
||||
unsigned char Order() const {
|
||||
return middle_end_ - middle_begin_ + 2;
|
||||
@ -103,7 +101,7 @@ template <class Quant, class Bhiksha> class TrieSearch {
|
||||
}
|
||||
|
||||
private:
|
||||
friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
|
||||
friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing);
|
||||
|
||||
// Middles are managed manually so we can delay construction and they don't have to be copyable.
|
||||
void FreeMiddles() {
|
||||
|
@ -62,8 +62,6 @@ class Unigram {
|
||||
return unigram_;
|
||||
}
|
||||
|
||||
void LoadedBinary() {}
|
||||
|
||||
UnigramPointer Find(WordIndex word, NodeRange &next) const {
|
||||
UnigramValue *val = unigram_ + word;
|
||||
next.begin = val->next;
|
||||
@ -108,8 +106,6 @@ template <class Bhiksha> class BitPackedMiddle : public BitPacked {
|
||||
|
||||
void FinishedLoading(uint64_t next_end, const Config &config);
|
||||
|
||||
void LoadedBinary() { bhiksha_.LoadedBinary(); }
|
||||
|
||||
util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) const;
|
||||
|
||||
util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) {
|
||||
@ -138,14 +134,9 @@ class BitPackedLongest : public BitPacked {
|
||||
BaseInit(base, max_vocab, quant_bits);
|
||||
}
|
||||
|
||||
void LoadedBinary() {}
|
||||
|
||||
util::BitAddress Insert(WordIndex word);
|
||||
|
||||
util::BitAddress Find(WordIndex word, const NodeRange &node) const;
|
||||
|
||||
private:
|
||||
uint8_t quant_bits_;
|
||||
};
|
||||
|
||||
} // namespace trie
|
||||
|
@ -50,6 +50,10 @@ class PartialViewProxy {
|
||||
const void *Data() const { return inner_.Data(); }
|
||||
void *Data() { return inner_.Data(); }
|
||||
|
||||
friend void swap(PartialViewProxy first, PartialViewProxy second) {
|
||||
std::swap_ranges(reinterpret_cast<char*>(first.Data()), reinterpret_cast<char*>(first.Data()) + first.attention_size_, reinterpret_cast<char*>(second.Data()));
|
||||
}
|
||||
|
||||
private:
|
||||
friend class util::ProxyIterator<PartialViewProxy>;
|
||||
|
||||
|
@ -125,13 +125,13 @@ class Model {
|
||||
void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }
|
||||
|
||||
// Requires in_state != out_state
|
||||
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
|
||||
virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
|
||||
|
||||
// Requires in_state != out_state
|
||||
virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
|
||||
virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
|
||||
|
||||
// Prefer to use FullScore. The context words should be provided in reverse order.
|
||||
virtual FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0;
|
||||
virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0;
|
||||
|
||||
unsigned char Order() const { return order_; }
|
||||
|
||||
|
28
lm/vocab.cc
28
lm/vocab.cc
@ -32,7 +32,8 @@ const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
|
||||
// Sadly some LMs have <UNK>.
|
||||
const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
|
||||
|
||||
void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count) {
|
||||
void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint64_t offset) {
|
||||
util::SeekOrThrow(fd, offset);
|
||||
// Check that we're at the right place by reading <unk> which is always first.
|
||||
char check_unk[6];
|
||||
util::ReadOrThrow(fd, check_unk, 6);
|
||||
@ -80,11 +81,6 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
|
||||
buffer_.push_back(0);
|
||||
}
|
||||
|
||||
void WriteWordsWrapper::Write(int fd, uint64_t start) {
|
||||
util::SeekOrThrow(fd, start);
|
||||
util::WriteOrThrow(fd, buffer_.data(), buffer_.size());
|
||||
}
|
||||
|
||||
SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {}
|
||||
|
||||
uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) {
|
||||
@ -100,6 +96,12 @@ void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size
|
||||
saw_unk_ = false;
|
||||
}
|
||||
|
||||
void SortedVocabulary::Relocate(void *new_start) {
|
||||
std::size_t delta = end_ - begin_;
|
||||
begin_ = reinterpret_cast<uint64_t*>(new_start) + 1;
|
||||
end_ = begin_ + delta;
|
||||
}
|
||||
|
||||
void SortedVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries) {
|
||||
enumerate_ = to;
|
||||
if (enumerate_) {
|
||||
@ -147,11 +149,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
|
||||
bound_ = end_ - begin_ + 1;
|
||||
}
|
||||
|
||||
void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
|
||||
void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) {
|
||||
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
|
||||
SetSpecial(Index("<s>"), Index("</s>"), 0);
|
||||
bound_ = end_ - begin_ + 1;
|
||||
if (have_words) ReadWords(fd, to, bound_);
|
||||
if (have_words) ReadWords(fd, to, bound_, offset);
|
||||
}
|
||||
|
||||
namespace {
|
||||
@ -179,6 +181,11 @@ void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::siz
|
||||
saw_unk_ = false;
|
||||
}
|
||||
|
||||
void ProbingVocabulary::Relocate(void *new_start) {
|
||||
header_ = static_cast<detail::ProbingVocabularyHeader*>(new_start);
|
||||
lookup_.Relocate(static_cast<uint8_t*>(new_start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)));
|
||||
}
|
||||
|
||||
void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max_entries*/) {
|
||||
enumerate_ = to;
|
||||
if (enumerate_) {
|
||||
@ -206,12 +213,11 @@ void ProbingVocabulary::InternalFinishedLoading() {
|
||||
SetSpecial(Index("<s>"), Index("</s>"), 0);
|
||||
}
|
||||
|
||||
void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
|
||||
void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) {
|
||||
UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
|
||||
lookup_.LoadedBinary();
|
||||
bound_ = header_->bound;
|
||||
SetSpecial(Index("<s>"), Index("</s>"), 0);
|
||||
if (have_words) ReadWords(fd, to, bound_);
|
||||
if (have_words) ReadWords(fd, to, bound_, offset);
|
||||
}
|
||||
|
||||
void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {
|
||||
|
12
lm/vocab.hh
12
lm/vocab.hh
@ -36,7 +36,7 @@ class WriteWordsWrapper : public EnumerateVocab {
|
||||
|
||||
void Add(WordIndex index, const StringPiece &str);
|
||||
|
||||
void Write(int fd, uint64_t start);
|
||||
const std::string &Buffer() const { return buffer_; }
|
||||
|
||||
private:
|
||||
EnumerateVocab *inner_;
|
||||
@ -71,6 +71,8 @@ class SortedVocabulary : public base::Vocabulary {
|
||||
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
|
||||
void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
|
||||
|
||||
void Relocate(void *new_start);
|
||||
|
||||
void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries);
|
||||
|
||||
WordIndex Insert(const StringPiece &str);
|
||||
@ -83,15 +85,13 @@ class SortedVocabulary : public base::Vocabulary {
|
||||
|
||||
bool SawUnk() const { return saw_unk_; }
|
||||
|
||||
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
|
||||
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
|
||||
|
||||
private:
|
||||
uint64_t *begin_, *end_;
|
||||
|
||||
WordIndex bound_;
|
||||
|
||||
WordIndex highest_value_;
|
||||
|
||||
bool saw_unk_;
|
||||
|
||||
EnumerateVocab *enumerate_;
|
||||
@ -140,6 +140,8 @@ class ProbingVocabulary : public base::Vocabulary {
|
||||
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
|
||||
void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
|
||||
|
||||
void Relocate(void *new_start);
|
||||
|
||||
void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries);
|
||||
|
||||
WordIndex Insert(const StringPiece &str);
|
||||
@ -152,7 +154,7 @@ class ProbingVocabulary : public base::Vocabulary {
|
||||
|
||||
bool SawUnk() const { return saw_unk_; }
|
||||
|
||||
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
|
||||
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
|
||||
|
||||
private:
|
||||
void InternalFinishedLoading();
|
||||
|
@ -242,9 +242,9 @@ void FeatureRegistry::PrintFF() const
|
||||
Map::const_iterator iter;
|
||||
for (iter = registry_.begin(); iter != registry_.end(); ++iter) {
|
||||
const string &ffName = iter->first;
|
||||
std::cerr << ffName << std::endl;
|
||||
std::cerr << ffName << " ";
|
||||
}
|
||||
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -12,6 +12,7 @@ if $(with-dlib) {
|
||||
|
||||
alias headers : ../util//kenutil : : : $(max-factors) $(dlib) ;
|
||||
alias ThreadPool : ThreadPool.cpp ;
|
||||
alias Util : Util.cpp Timer.cpp ;
|
||||
|
||||
if [ option.get "with-synlm" : no : yes ] = yes
|
||||
{
|
||||
|
@ -94,9 +94,16 @@ if $(with-nplm) {
|
||||
local with-dalm = [ option.get "with-dalm" ] ;
|
||||
if $(with-dalm) {
|
||||
lib dalm : : <search>$(with-dalm)/lib ;
|
||||
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
|
||||
obj DALM.o : DALMWrapper.cpp dalm MurmurHash3 ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
|
||||
alias dalmALIAS : DALM.o dalm MurmurHash3 : : : <define>LM_DALM ;
|
||||
|
||||
if [ path.exists $(with-dalm)/lib/libMurmurHash3.a ] {
|
||||
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
|
||||
alias dalm-libs : dalm MurmurHash3 ;
|
||||
} else {
|
||||
alias dalm-libs : dalm ;
|
||||
}
|
||||
|
||||
obj DALM.o : DALMWrapper.cpp dalm-libs ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
|
||||
alias dalmALIAS : DALM.o dalm-libs : : : <define>LM_DALM ;
|
||||
dependencies += dalmALIAS ;
|
||||
lmmacros += LM_DALM ;
|
||||
}
|
||||
|
@ -202,6 +202,7 @@ Parameter::Parameter()
|
||||
AddParam("placeholder-factor", "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model");
|
||||
AddParam("no-cache", "Disable all phrase-table caching. Default = false (ie. enable caching)");
|
||||
|
||||
AddParam("adjacent-only", "Only allow hypotheses which are adjacent to current derivation. ITG without block moves");
|
||||
|
||||
}
|
||||
|
||||
|
@ -250,6 +250,11 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
|
||||
return true;
|
||||
}
|
||||
|
||||
if (StaticData::Instance().AdjacentOnly() &&
|
||||
!hypoBitmap.IsAdjacent(range.GetStartPos(), range.GetEndPos())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool leftMostEdge = (hypoFirstGapPos == startPos);
|
||||
// any length extension is okay if starting at left-most edge
|
||||
if (leftMostEdge) {
|
||||
|
@ -253,6 +253,11 @@ void SearchNormal::ExpandAllHypotheses(const Hypothesis &hypothesis, size_t star
|
||||
expectedScore += m_transOptColl.GetFutureScore().CalcFutureScore( hypothesis.GetWordsBitmap(), startPos, endPos );
|
||||
}
|
||||
|
||||
if (StaticData::Instance().AdjacentOnly() &&
|
||||
!hypothesis.GetWordsBitmap().IsAdjacent(startPos, endPos)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// loop through all translation options
|
||||
const TranslationOptionList &transOptList = m_transOptColl.GetTranslationOptionList(WordsRange(startPos, endPos));
|
||||
TranslationOptionList::const_iterator iter;
|
||||
|
@ -385,6 +385,8 @@ bool StaticData::LoadData(Parameter *parameter)
|
||||
|
||||
SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false);
|
||||
|
||||
SetBooleanParameter( &m_adjacentOnly, "adjacent-only", false);
|
||||
|
||||
// minimum Bayes risk decoding
|
||||
SetBooleanParameter( &m_mbr, "minimum-bayes-risk", false );
|
||||
m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ?
|
||||
|
@ -197,6 +197,7 @@ protected:
|
||||
|
||||
FactorType m_placeHolderFactor;
|
||||
bool m_useLegacyPT;
|
||||
bool m_adjacentOnly;
|
||||
|
||||
FeatureRegistry m_registry;
|
||||
|
||||
@ -753,6 +754,8 @@ public:
|
||||
return &m_soft_matches_map_reverse;
|
||||
}
|
||||
|
||||
bool AdjacentOnly() const
|
||||
{ return m_adjacentOnly; }
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -58,8 +58,7 @@ const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY(
|
||||
|
||||
size_t hash = hash_value(src);
|
||||
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
|
||||
|
||||
CacheColl::iterator iter;
|
||||
iter = cache.find(hash);
|
||||
|
||||
if (iter == cache.end()) {
|
||||
@ -179,7 +178,7 @@ void PhraseDictionary::ReduceCache() const
|
||||
|
||||
// find cutoff for last used time
|
||||
priority_queue< clock_t > lastUsedTimes;
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iter;
|
||||
CacheColl::iterator iter;
|
||||
iter = cache.begin();
|
||||
while( iter != cache.end() ) {
|
||||
lastUsedTimes.push( iter->second.second );
|
||||
@ -193,7 +192,7 @@ void PhraseDictionary::ReduceCache() const
|
||||
iter = cache.begin();
|
||||
while( iter != cache.end() ) {
|
||||
if (iter->second.second < cutoffLastUsedTime) {
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iterRemove = iter++;
|
||||
CacheColl::iterator iterRemove = iter++;
|
||||
delete iterRemove->second.first;
|
||||
cache.erase(iterRemove);
|
||||
} else iter++;
|
||||
|
@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
#include <boost/thread/tss.hpp>
|
||||
@ -54,7 +55,7 @@ class ChartCellCollectionBase;
|
||||
class ChartRuleLookupManager;
|
||||
class ChartParser;
|
||||
|
||||
class CacheColl : public std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
|
||||
class CacheColl : public boost::unordered_map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
|
||||
{
|
||||
// 1st = hash of source phrase/ address of phrase-table node
|
||||
// 2nd = all translations
|
||||
|
@ -59,7 +59,7 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
|
||||
|
||||
CacheColl &cache = GetCache();
|
||||
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
|
||||
CacheColl::iterator iter;
|
||||
iter = cache.find(hash);
|
||||
|
||||
if (iter != cache.end()) {
|
||||
|
@ -165,7 +165,7 @@ const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollection(
|
||||
CacheColl &cache = GetCache();
|
||||
size_t hash = (size_t) ptNode->GetFilePos();
|
||||
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
|
||||
CacheColl::iterator iter;
|
||||
|
||||
iter = cache.find(hash);
|
||||
|
||||
|
@ -63,6 +63,22 @@ int WordsBitmap::GetFutureCosts(int lastPos) const
|
||||
return sum;
|
||||
}
|
||||
|
||||
bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const
|
||||
{
|
||||
if (GetNumWordsCovered() == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t first = GetFirstGapPos();
|
||||
size_t last = GetLastGapPos();
|
||||
|
||||
if (startPos == last || endPos == first) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -132,6 +132,8 @@ public:
|
||||
return NOT_FOUND;
|
||||
}
|
||||
|
||||
bool IsAdjacent(size_t startPos, size_t endPos) const;
|
||||
|
||||
//! whether a word has been translated at a particular position
|
||||
bool GetValue(size_t pos) const {
|
||||
return m_bitmap[pos];
|
||||
|
@ -1,6 +1,5 @@
|
||||
// $Id$
|
||||
//#include "beammain.h"
|
||||
#include "domain.h"
|
||||
#include "DomainFeature.h"
|
||||
#include "ExtractionPhrasePair.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "SafeGetline.h"
|
||||
@ -26,7 +25,7 @@ void Domain::load( const std::string &domainFileName )
|
||||
int lineNumber;
|
||||
if (domainSpecLine.size() != 2 ||
|
||||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
||||
cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
|
||||
std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
|
||||
exit(1);
|
||||
}
|
||||
// store
|
||||
@ -50,29 +49,34 @@ string Domain::getDomainOfSentence( int sentenceId ) const
|
||||
return "undefined";
|
||||
}
|
||||
|
||||
DomainFeature::DomainFeature(const string& domainFile)
|
||||
DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
|
||||
{
|
||||
//process domain file
|
||||
m_domain.load(domainFile);
|
||||
}
|
||||
|
||||
void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const
|
||||
{
|
||||
std::string value = m_domain.getDomainOfSentence(sentenceId);
|
||||
phrasePair.AddProperty(m_propertyKey, value, count);
|
||||
}
|
||||
|
||||
void DomainFeature::add(const ScoreFeatureContext& context,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const
|
||||
{
|
||||
map< string, float > domainCount;
|
||||
for(size_t i=0; i<context.phrasePair.size(); i++) {
|
||||
string d = m_domain.getDomainOfSentence(context.phrasePair[i]->sentenceId );
|
||||
if (domainCount.find( d ) == domainCount.end()) {
|
||||
domainCount[d] = context.phrasePair[i]->count;
|
||||
} else {
|
||||
domainCount[d] += context.phrasePair[i]->count;
|
||||
}
|
||||
}
|
||||
add(domainCount, context.count, context.maybeLog, denseValues, sparseValues);
|
||||
const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
|
||||
assert( domainCount != NULL );
|
||||
add(*domainCount,
|
||||
context.phrasePair.GetCount(),
|
||||
context.maybeLog,
|
||||
denseValues, sparseValues);
|
||||
}
|
||||
|
||||
void SubsetDomainFeature::add(const map<string,float>& domainCount,float count,
|
||||
void SubsetDomainFeature::add(const map<string,float>& domainCount,
|
||||
float count,
|
||||
const MaybeLog& maybeLog,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const
|
||||
@ -152,7 +156,6 @@ void IndicatorDomainFeature::add(const map<string,float>& domainCount,float coun
|
||||
denseValues.push_back(maybeLog(2.718));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
|
||||
@ -166,12 +169,5 @@ void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,floa
|
||||
}
|
||||
}
|
||||
|
||||
bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
|
||||
{
|
||||
return m_domain.getDomainOfSentence(lhs.sentenceId) ==
|
||||
m_domain.getDomainOfSentence( rhs.sentenceId);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -34,13 +34,17 @@ class DomainFeature : public ScoreFeature
|
||||
public:
|
||||
|
||||
DomainFeature(const std::string& domainFile);
|
||||
bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
|
||||
|
||||
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const;
|
||||
|
||||
void add(const ScoreFeatureContext& context,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
|
||||
protected:
|
||||
/** Overriden in subclass */
|
||||
/** Overridden in subclass */
|
||||
virtual void add(const std::map<std::string,float>& domainCounts, float count,
|
||||
const MaybeLog& maybeLog,
|
||||
std::vector<float>& denseValues,
|
||||
@ -49,6 +53,8 @@ protected:
|
||||
|
||||
Domain m_domain;
|
||||
|
||||
const std::string m_propertyKey;
|
||||
|
||||
};
|
||||
|
||||
class SubsetDomainFeature : public DomainFeature
|
327
phrase-extract/ExtractionPhrasePair.cpp
Normal file
327
phrase-extract/ExtractionPhrasePair.cpp
Normal file
@ -0,0 +1,327 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2009 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include <sstream>
|
||||
#include "ExtractionPhrasePair.h"
|
||||
#include "SafeGetline.h"
|
||||
#include "tables-core.h"
|
||||
#include "score.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
namespace MosesTraining {
|
||||
|
||||
|
||||
extern Vocabulary vcbT;
|
||||
extern Vocabulary vcbS;
|
||||
|
||||
extern bool hierarchicalFlag;
|
||||
|
||||
|
||||
ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
|
||||
const PHRASE *phraseTarget,
|
||||
ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum ) :
|
||||
m_phraseSource(phraseSource),
|
||||
m_phraseTarget(phraseTarget),
|
||||
m_count(count),
|
||||
m_pcfgSum(pcfgSum)
|
||||
{
|
||||
assert(phraseSource->empty());
|
||||
assert(phraseTarget->empty());
|
||||
|
||||
m_count = count;
|
||||
m_pcfgSum = pcfgSum;
|
||||
|
||||
std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
|
||||
m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
|
||||
|
||||
m_lastTargetToSourceAlignment = insertedAlignment.first;
|
||||
m_lastCount = m_count;
|
||||
m_lastPcfgSum = m_pcfgSum;
|
||||
|
||||
m_isValid = true;
|
||||
}
|
||||
|
||||
|
||||
ExtractionPhrasePair::~ExtractionPhrasePair( ) {
|
||||
Clear();
|
||||
}
|
||||
|
||||
|
||||
// return value: true if the given alignment was seen for the first time and thus will be stored,
|
||||
// false if it was present already (the pointer may thus be deleted(
|
||||
bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum )
|
||||
{
|
||||
m_count += count;
|
||||
m_pcfgSum += pcfgSum;
|
||||
|
||||
m_lastCount = count;
|
||||
m_lastPcfgSum = pcfgSum;
|
||||
|
||||
std::map<ALIGNMENT*,float>::iterator iter = m_lastTargetToSourceAlignment;
|
||||
if ( *(iter->first) == *targetToSourceAlignment ) {
|
||||
iter->second += count;
|
||||
return false;
|
||||
} else {
|
||||
std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
|
||||
m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
|
||||
if ( !insertedAlignment.second ) {
|
||||
// the alignment already exists: increment count
|
||||
insertedAlignment.first->second += count;
|
||||
return false;
|
||||
}
|
||||
m_lastTargetToSourceAlignment = insertedAlignment.first;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
|
||||
{
|
||||
m_count += count;
|
||||
m_pcfgSum += pcfgSum;
|
||||
m_lastTargetToSourceAlignment->second += count;
|
||||
// properties
|
||||
for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
|
||||
iter !=m_properties.end(); ++iter ) {
|
||||
LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
|
||||
(*lastPropertyValue)->second += count;
|
||||
}
|
||||
|
||||
m_lastCount = count;
|
||||
m_lastPcfgSum = pcfgSum;
|
||||
}
|
||||
|
||||
|
||||
// Check for lexical match
|
||||
// and in case of SCFG rules for equal non-terminal alignment.
|
||||
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
|
||||
const PHRASE *otherPhraseTarget,
|
||||
ALIGNMENT *otherTargetToSourceAlignment ) const
|
||||
{
|
||||
if (*otherPhraseTarget != *m_phraseTarget) {
|
||||
return false;
|
||||
}
|
||||
if (*otherPhraseSource != *m_phraseSource) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return MatchesAlignment( otherTargetToSourceAlignment );
|
||||
}
|
||||
|
||||
// Check for lexical match
|
||||
// and in case of SCFG rules for equal non-terminal alignment.
|
||||
// Set boolean indicators.
|
||||
// (Note that we check in the order: target - source - alignment
|
||||
// and do not touch the subsequent boolean indicators once a previous one has been set to false.)
|
||||
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
|
||||
const PHRASE *otherPhraseTarget,
|
||||
ALIGNMENT *otherTargetToSourceAlignment,
|
||||
bool &sourceMatch,
|
||||
bool &targetMatch,
|
||||
bool &alignmentMatch ) const
|
||||
{
|
||||
if (*otherPhraseSource != *m_phraseSource) {
|
||||
sourceMatch = false;
|
||||
return false;
|
||||
} else {
|
||||
sourceMatch = true;
|
||||
}
|
||||
if (*otherPhraseTarget != *m_phraseTarget) {
|
||||
targetMatch = false;
|
||||
return false;
|
||||
} else {
|
||||
targetMatch = true;
|
||||
}
|
||||
if ( !MatchesAlignment(otherTargetToSourceAlignment) ) {
|
||||
alignmentMatch = false;
|
||||
return false;
|
||||
} else {
|
||||
alignmentMatch = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check for equal non-terminal alignment in case of SCFG rules.
|
||||
// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
|
||||
bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
|
||||
{
|
||||
if (!hierarchicalFlag) return true;
|
||||
|
||||
// all or none of the phrasePair's word alignment matrices match, so just pick one
|
||||
const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;
|
||||
|
||||
assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
|
||||
assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());
|
||||
|
||||
// loop over all symbols but the left hand side of the rule
|
||||
for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) {
|
||||
if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
|
||||
size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin());
|
||||
size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin());
|
||||
|
||||
if (thisTargetToSourceAlignment->at(i).size() != 1 ||
|
||||
otherTargetToSourceAlignment->at(i).size() != 1 ||
|
||||
thisAlign != otherAlign) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void ExtractionPhrasePair::Clear()
|
||||
{
|
||||
delete m_phraseSource;
|
||||
delete m_phraseTarget;
|
||||
|
||||
m_count = 0.0f;
|
||||
m_pcfgSum = 0.0f;
|
||||
|
||||
for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
|
||||
iter!=m_targetToSourceAlignments.end(); ++iter) {
|
||||
delete iter->first;
|
||||
}
|
||||
m_targetToSourceAlignments.clear();
|
||||
|
||||
for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
|
||||
iter!=m_properties.end(); ++iter) {
|
||||
delete (iter->second).second;
|
||||
delete (iter->second).first;
|
||||
}
|
||||
m_properties.clear();
|
||||
|
||||
m_lastCount = 0.0f;
|
||||
m_lastPcfgSum = 0.0f;
|
||||
m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();
|
||||
|
||||
m_isValid = false;
|
||||
}
|
||||
|
||||
|
||||
void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
|
||||
{
|
||||
if (propertiesString.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
vector<std::string> toks;
|
||||
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
|
||||
for (size_t i = 1; i < toks.size(); ++i) {
|
||||
std::string &tok = toks[i];
|
||||
if (tok.empty()) {
|
||||
continue;
|
||||
}
|
||||
size_t endPos = tok.rfind("}");
|
||||
tok = tok.substr(0, endPos - 1);
|
||||
|
||||
vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
||||
assert(keyValue.size() == 2);
|
||||
AddProperty(keyValue[0], keyValue[1], count);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Return the most frequently observed target-to-source alignment of this
// phrase pair, or NULL if none has been recorded.  Ties on the count are
// broken deterministically by preferring the lexicographically larger
// alignment.  The returned pointer is owned by this object.
const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
{
  // Sentinel smaller than any expected count so the first candidate wins.
  float bestAlignmentCount = -1;

  std::map<ALIGNMENT*,float>::const_iterator bestAlignment = m_targetToSourceAlignments.end();

  for (std::map<ALIGNMENT*,float>::const_iterator iter=m_targetToSourceAlignments.begin();
       iter!=m_targetToSourceAlignments.end(); ++iter) {
    // The explicit end() test guards the tie-break branch: without it,
    // an element whose count equals the -1 sentinel would dereference
    // bestAlignment before any candidate was selected (undefined behavior).
    if ( (bestAlignment == m_targetToSourceAlignments.end()) ||
         (iter->second > bestAlignmentCount) ||
         ( (iter->second == bestAlignmentCount) &&
           (*(iter->first) > *(bestAlignment->first)) ) ) {
      bestAlignmentCount = iter->second;
      bestAlignment = iter;
    }
  }

  // Empty map: no alignment was ever added.
  if ( bestAlignment == m_targetToSourceAlignments.end()) {
    return NULL;
  }

  return bestAlignment->first;
}
|
||||
|
||||
|
||||
const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
|
||||
{
|
||||
float bestPropertyCount = -1;
|
||||
|
||||
const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
|
||||
if ( allPropertyValues == NULL ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end();
|
||||
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
iter!=allPropertyValues->end(); ++iter) {
|
||||
if ( (iter->second > bestPropertyCount) ||
|
||||
( (iter->second == bestPropertyCount) &&
|
||||
(iter->first > bestPropertyValue->first) ) ) {
|
||||
bestPropertyCount = iter->second;
|
||||
bestPropertyValue = iter;
|
||||
}
|
||||
}
|
||||
|
||||
if ( bestPropertyValue == allPropertyValues->end()) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return &(bestPropertyValue->first);
|
||||
}
|
||||
|
||||
|
||||
std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
|
||||
{
|
||||
const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
|
||||
|
||||
if ( allPropertyValues == NULL ) {
|
||||
return "";
|
||||
}
|
||||
|
||||
std::ostringstream oss;
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
iter!=allPropertyValues->end(); ++iter) {
|
||||
if (iter!=allPropertyValues->begin()) {
|
||||
oss << " ";
|
||||
}
|
||||
oss << iter->first;
|
||||
oss << " ";
|
||||
oss << iter->second;
|
||||
}
|
||||
|
||||
std::string allPropertyValuesString(oss.str());
|
||||
return allPropertyValuesString;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
162
phrase-extract/ExtractionPhrasePair.h
Normal file
162
phrase-extract/ExtractionPhrasePair.h
Normal file
@ -0,0 +1,162 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2009 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#include "tables-core.h"
|
||||
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <map>
|
||||
|
||||
namespace MosesTraining {
|
||||
|
||||
|
||||
typedef std::vector< std::set<size_t> > ALIGNMENT;
|
||||
|
||||
|
||||
class ExtractionPhrasePair {
|
||||
|
||||
protected:
|
||||
|
||||
typedef std::map<std::string,float> PROPERTY_VALUES;
|
||||
typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;
|
||||
|
||||
|
||||
bool m_isValid;
|
||||
|
||||
const PHRASE *m_phraseSource;
|
||||
const PHRASE *m_phraseTarget;
|
||||
|
||||
float m_count;
|
||||
float m_pcfgSum;
|
||||
|
||||
std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
|
||||
std::map<std::string,
|
||||
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
|
||||
|
||||
float m_lastCount;
|
||||
float m_lastPcfgSum;
|
||||
std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;
|
||||
|
||||
public:
|
||||
|
||||
ExtractionPhrasePair( const PHRASE *phraseSource,
|
||||
const PHRASE *phraseTarget,
|
||||
ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum );
|
||||
|
||||
~ExtractionPhrasePair();
|
||||
|
||||
bool Add( ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum );
|
||||
|
||||
void IncrementPrevious( float count, float pcfgSum );
|
||||
|
||||
bool Matches( const PHRASE *otherPhraseSource,
|
||||
const PHRASE *otherPhraseTarget,
|
||||
ALIGNMENT *otherTargetToSourceAlignment ) const;
|
||||
|
||||
bool Matches( const PHRASE *otherPhraseSource,
|
||||
const PHRASE *otherPhraseTarget,
|
||||
ALIGNMENT *otherTargetToSourceAlignment,
|
||||
bool &sourceMatch,
|
||||
bool &targetMatch,
|
||||
bool &alignmentMatch ) const;
|
||||
|
||||
bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;
|
||||
|
||||
void Clear();
|
||||
|
||||
bool IsValid() const {
|
||||
return m_isValid;
|
||||
}
|
||||
|
||||
|
||||
const PHRASE *GetSource() const {
|
||||
return m_phraseSource;
|
||||
}
|
||||
|
||||
const PHRASE *GetTarget() const {
|
||||
return m_phraseTarget;
|
||||
}
|
||||
|
||||
float GetCount() const {
|
||||
return m_count;
|
||||
}
|
||||
|
||||
float GetPcfgScore() const {
|
||||
return m_pcfgSum;
|
||||
}
|
||||
|
||||
const size_t GetNumberOfProperties() const {
|
||||
return m_properties.size();
|
||||
}
|
||||
|
||||
const std::map<std::string,float> *GetProperty( const std::string &key ) const {
|
||||
std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
|
||||
iter = m_properties.find(key);
|
||||
if (iter == m_properties.end()) {
|
||||
return NULL;
|
||||
} else {
|
||||
return iter->second.first;
|
||||
}
|
||||
}
|
||||
|
||||
const ALIGNMENT *FindBestAlignmentTargetToSource() const;
|
||||
|
||||
const std::string *FindBestPropertyValue(const std::string &key) const;
|
||||
|
||||
std::string CollectAllPropertyValues(const std::string &key) const;
|
||||
|
||||
void AddProperties( const std::string &str, float count );
|
||||
|
||||
void AddProperty( const std::string &key, const std::string &value, float count )
|
||||
{
|
||||
std::map<std::string,
|
||||
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
|
||||
if ( iter == m_properties.end() ) {
|
||||
// key not found: insert property key and value
|
||||
PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
|
||||
std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
|
||||
LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
|
||||
m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
|
||||
} else {
|
||||
LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
|
||||
if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before
|
||||
// property key-value pair exists already: add count
|
||||
(*lastPropertyValue)->second += count;
|
||||
} else { // need to check whether the property key-value pair has appeared before (insert if not)
|
||||
// property key exists, but not in combination with this value:
|
||||
// add new value with count
|
||||
PROPERTY_VALUES *propertyValues = (iter->second).first;
|
||||
std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
|
||||
if ( !insertedProperty.second ) { // property value for this key appeared before: add count
|
||||
insertedProperty.first->second += count;
|
||||
}
|
||||
LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
|
||||
delete (iter->second).second;
|
||||
(iter->second).second = lastPropertyValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -1,50 +1,30 @@
|
||||
#include "InternalStructFeature.h"
|
||||
#include <map>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
// Base-class constructor: initialize the feature type to 0.
// (Subclasses set m_type to their own value in their constructors.)
InternalStructFeature::InternalStructFeature()
  :m_type(0){
  //cout<<"InternalStructFeature: Construct "<<m_type<<"\n";
}
|
||||
|
||||
/** Return true if the two phrase pairs are equal from the point of view of
 *  this feature.  Assumes they already compare true according to
 *  PhraseAlignment.equals().
 *
 *  This feature imposes no additional distinction between phrase pairs, so
 *  it always reports equality.  The original body was entirely commented out
 *  (including the trailing "//return true;"), leaving a non-void function
 *  with no return statement -- undefined behavior; the explicit return
 *  restores the evidently intended result.
 */
bool InternalStructFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const{
  // A structural comparison (e.g. lhs.ghkmParse == rhs.ghkmParse, as in the
  // commented-out draft) could be introduced here if this feature ever needs
  // to discriminate between phrase pairs.
  return true;
}
|
||||
|
||||
void InternalStructFeature::add(const ScoreFeatureContext& context,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const{
|
||||
for(size_t i=0; i<context.phrasePair.size(); i++) {
|
||||
add(&context.phrasePair[i]->treeFragment, denseValues, sparseValues);
|
||||
}
|
||||
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const {
|
||||
const std::map<std::string,float> *allTrees = context.phrasePair.GetProperty("Tree"); // our would we rather want to take the most frequent one only?
|
||||
for ( std::map<std::string,float>::const_iterator iter=allTrees->begin();
|
||||
iter!=allTrees->end(); ++iter ) {
|
||||
add(&(iter->first), iter->second, denseValues, sparseValues);
|
||||
}
|
||||
}
|
||||
|
||||
void InternalStructFeatureDense::add(std::string *internalStruct,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const{
|
||||
void InternalStructFeatureDense::add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const {
|
||||
//cout<<"Dense: "<<*internalStruct<<endl;
|
||||
size_t start=0;
|
||||
int countNP=0;
|
||||
while((start = internalStruct->find("NP", start)) != string::npos) {
|
||||
countNP++;
|
||||
while((start = treeFragment->find("NP", start)) != string::npos) {
|
||||
countNP += count;
|
||||
start+=2; //length of "NP"
|
||||
}
|
||||
//should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
|
||||
@ -53,21 +33,21 @@ void InternalStructFeatureDense::add(std::string *internalStruct,
|
||||
|
||||
}
|
||||
|
||||
void InternalStructFeatureSparse::add(std::string *internalStruct,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const{
|
||||
//cout<<"Sparse: "<<*internalStruct<<endl;
|
||||
if(internalStruct->find("VBZ")!=std::string::npos)
|
||||
sparseValues["NTVBZ"] = 1;
|
||||
if(internalStruct->find("VBD")!=std::string::npos)
|
||||
sparseValues["NTVBD"] = 1;
|
||||
if(internalStruct->find("VBP")!=std::string::npos)
|
||||
sparseValues["NTVBP"] = 1;
|
||||
if(internalStruct->find("PP")!=std::string::npos)
|
||||
sparseValues["NTPP"] = 1;
|
||||
if(internalStruct->find("SBAR")!=std::string::npos)
|
||||
sparseValues["NTSBAR"] = 1;
|
||||
|
||||
// Sparse variant: accumulate indicator-style sparse features for selected
// syntactic labels occurring in the phrase pair's tree fragment.  For every
// listed label present anywhere in the fragment string, the phrase pair's
// count is added to the corresponding sparse feature value.
// Notes:
//  - only presence is tested (find()), so each label contributes at most
//    once per fragment regardless of how often it occurs;
//  - matching is plain substring search, so a label also fires when the
//    character sequence appears inside a longer label;
//  - denseValues is not used by the sparse variant.
void InternalStructFeatureSparse::add(const std::string *treeFragment,
                                      float count,
                                      std::vector<float>& denseValues,
                                      std::map<std::string,float>& sparseValues) const {
  //cout<<"Sparse: "<<*internalStruct<<endl;
  if(treeFragment->find("VBZ")!=std::string::npos)
    sparseValues["NTVBZ"] += count;
  if(treeFragment->find("VBD")!=std::string::npos)
    sparseValues["NTVBD"] += count;
  if(treeFragment->find("VBP")!=std::string::npos)
    sparseValues["NTVBP"] += count;
  if(treeFragment->find("PP")!=std::string::npos)
    sparseValues["NTPP"] += count;
  if(treeFragment->find("SBAR")!=std::string::npos)
    sparseValues["NTSBAR"] += count;
}
|
||||
|
||||
|
||||
|
@ -21,22 +21,19 @@ namespace MosesTraining
|
||||
class InternalStructFeature : public ScoreFeature
|
||||
{
|
||||
public:
|
||||
InternalStructFeature();
|
||||
/** Return true if the two phrase pairs are equal from the point of this feature. Assume
|
||||
that they already compare true according to PhraseAlignment.equals()
|
||||
**/
|
||||
bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
|
||||
InternalStructFeature() : m_type(0) {};
|
||||
/** Add the values for this feature function. */
|
||||
void add(const ScoreFeatureContext& context,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
|
||||
|
||||
protected:
|
||||
/** Overriden in subclass */
|
||||
virtual void add(std::string *internalStruct,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const = 0;
|
||||
/** Overridden in subclass */
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const = 0;
|
||||
int m_type;
|
||||
|
||||
};
|
||||
@ -47,9 +44,10 @@ public:
|
||||
InternalStructFeatureDense()
|
||||
:InternalStructFeature(){m_type=1;} //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
|
||||
protected:
|
||||
virtual void add(std::string *internalStruct,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
};
|
||||
|
||||
class InternalStructFeatureSparse : public InternalStructFeature
|
||||
@ -58,9 +56,10 @@ public:
|
||||
InternalStructFeatureSparse()
|
||||
:InternalStructFeature(){m_type=2;}// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
|
||||
protected:
|
||||
virtual void add(std::string *internalStruct,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user