Merge branch 'master' of ssh://github.com/moses-smt/mosesdecoder

This commit is contained in:
phikoehn 2014-02-12 21:01:09 +00:00
commit 049be8b71c
133 changed files with 7456 additions and 1604 deletions

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -72,13 +72,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -0,0 +1,133 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.456080129" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.582801917" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-mixed-syntax/Debug}" id="cdt.managedbuild.builder.gnu.cross.1220166455" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.2055012191" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1768196213" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2007889843" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.855436310" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.506549229" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1497326561" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.606353571" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.740521305" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1946120010" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.1563475751" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.106010037" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.136661991" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.2112208574" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.172930211" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.715007893">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.715007893" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.715007893." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.99436307" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.801178939" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-mixed-syntax/Release}" id="cdt.managedbuild.builder.gnu.cross.1999547547" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1481537766" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1967527847" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.442342681" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1847950300" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.1130138972" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.158429528" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.2020667840" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1372779734" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.371006952" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1770045040" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.707592414" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="extract-mixed-syntax.cdt.managedbuild.target.gnu.cross.exe.1868010260" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1604862038;cdt.managedbuild.tool.gnu.cpp.compiler.input.870650754">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.715007893;cdt.managedbuild.config.gnu.cross.exe.release.715007893.;cdt.managedbuild.tool.gnu.cross.c.compiler.2138817906;cdt.managedbuild.tool.gnu.c.compiler.input.442342681">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1194558915;cdt.managedbuild.tool.gnu.cpp.compiler.input.2118510064">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1919499982;cdt.managedbuild.config.gnu.cross.exe.debug.1919499982.;cdt.managedbuild.tool.gnu.cross.c.compiler.1245611568;cdt.managedbuild.tool.gnu.c.compiler.input.2007889843">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/extract-mixed-syntax"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cproject>

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>extract-mixed-syntax</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

View File

@ -0,0 +1,37 @@
/*
* Global.cpp
* extract
*
* Created by Hieu Hoang on 01/02/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "Global.h"
bool g_debug = false;

// Default settings for mixed-syntax rule extraction.
// Initializers are listed in the same order as the members are declared
// in Global.h, so the written order matches the actual initialization
// order (C++ initializes members in declaration order; the previous
// ordering triggered -Wreorder).
Global::Global()
  : minHoleSpanSourceDefault(2)
  , maxHoleSpanSourceDefault(7)
  , minHoleSpanSourceSyntax(1)
  , maxHoleSpanSourceSyntax(1000)
  , maxSymbols(5)
  , glueGrammarFlag(false)
  , unknownWordLabelFlag(false)
  , maxNonTerm(3)
  , maxNonTermDefault(2)
  // int minHoleSize(1)
  // int minSubPhraseSize(1) // minimum size of a remaining lexical phrase
  , sourceSyntax(true)
  , targetSyntax(false)
  , mixed(true)
  , maxUnaligned(5)
  , uppermostOnly(true)
  , allowDefaultNonTermEdge(true)
  , gzOutput(false)
  //bool zipFiles(false)
{}

View File

@ -0,0 +1,45 @@
#pragma once
/*
* Global.h
* extract
*
* Created by Hieu Hoang on 01/02/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <map>
#include <string>
// Global: bag of tunable parameters for mixed-syntax rule extraction.
// Defaults are assigned in Global.cpp; an instance is threaded through
// the extraction code (a copy constructor is declared for that purpose).
class Global
{
public:
int minHoleSpanSourceDefault; // min source width of a non-terminal hole (default, non-syntax case)
int maxHoleSpanSourceDefault; // max source width of a non-terminal hole (default, non-syntax case)
int minHoleSpanSourceSyntax; // min source width of a hole carrying a syntax label
int maxHoleSpanSourceSyntax; // max source width of a hole carrying a syntax label
int maxSymbols; // cap on symbols per rule -- assumed; confirm against extract.cpp usage
bool glueGrammarFlag; // also emit a glue grammar
bool unknownWordLabelFlag; // also emit labels for unknown words
int maxNonTerm; // max non-terminals per rule (syntax case)
int maxNonTermDefault; // max non-terminals per rule (default case)
bool sourceSyntax; // source side carries syntax annotation
bool targetSyntax; // target side carries syntax annotation
bool mixed; // extract mixed syntactic/non-syntactic rules
int maxUnaligned; // cap on unaligned words -- assumed; confirm against caller
bool uppermostOnly; // NOTE(review): presumably "use only the uppermost label per span" -- verify
bool allowDefaultNonTermEdge; // allow default (non-syntax) non-terminals at rule edges
bool gzOutput; // write gzip-compressed output
Global();
Global(const Global&);
};
// Global debug switch; defined in Global.cpp.
extern bool g_debug;
// Expands to the declaration of a const debug-dump member function.
#define DEBUG_OUTPUT() void DebugOutput() const;

View File

@ -0,0 +1,62 @@
// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "InputFileStream.h"
#include "gzfilebuf.h"
#include <iostream>
using namespace std;
namespace Moses
{
// Opens filePath for reading, transparently decompressing when the path
// ends in ".gz". On failure the program prints an error and exits(1).
InputFileStream::InputFileStream(const std::string &filePath)
  : std::istream(NULL)
  , m_streambuf(NULL)
{
  if (filePath.size() > 3 &&
      filePath.substr(filePath.size() - 3, 3) == ".gz") {
    // gzipped input: gzfilebuf decompresses on the fly
    m_streambuf = new gzfilebuf(filePath.c_str());
  } else {
    std::filebuf* fb = new std::filebuf();
    // std::filebuf::open returns NULL on failure. The original code
    // assigned the return value back into fb, which leaked the filebuf
    // on the failure path; keep the owning pointer and test the result.
    if (fb->open(filePath.c_str(), std::ios::in) == NULL) {
      delete fb;
      cerr << "Can't read " << filePath.c_str() << endl;
      exit(1);
    }
    m_streambuf = fb;
  }
  // attach the chosen buffer to this istream
  this->init(m_streambuf);
}
// Releases the owned stream buffer (plain filebuf or gzfilebuf).
InputFileStream::~InputFileStream()
{
  if (m_streambuf != NULL) {
    delete m_streambuf;
    m_streambuf = NULL;
  }
}
// Intentionally a no-op: the destructor owns and frees the buffer.
// NOTE(review): callers expecting Close() to release the file handle
// early should be aware nothing happens here.
void InputFileStream::Close()
{
}
}

View File

@ -0,0 +1,48 @@
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_InputFileStream_h
#define moses_InputFileStream_h
#include <cstdlib>
#include <fstream>
#include <string>
namespace Moses
{
/** Used in place of std::istream, can read zipped files if it ends in .gz
*/
class InputFileStream : public std::istream
{
protected:
std::streambuf *m_streambuf; // owned; std::filebuf or gzfilebuf (.gz input)
public:
// Opens filePath for reading; exits the program if it cannot be read.
InputFileStream(const std::string &filePath);
~InputFileStream();
// Currently a no-op; the destructor releases the buffer.
void Close();
};
}
#endif

View File

@ -0,0 +1,180 @@
/*
* Lattice.cpp
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include "Lattice.h"
#include "LatticeNode.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "SyntaxTree.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "Rule.h"
#include "RuleCollection.h"
using namespace std;
// One stack per source position, plus one extra for the sentence-end
// boundary.
Lattice::Lattice(size_t sourceSize)
  : m_stacks(sourceSize + 1)
{
}
// Frees every LatticeNode owned by the per-position stacks.
Lattice::~Lattice()
{
  for (size_t i = 0; i < m_stacks.size(); ++i) {
    RemoveAllInColl(m_stacks[i]);
  }
}
// Seeds the lattice at startPos: always adds one terminal node covering
// the single source word there, then adds non-terminal nodes for every
// tunnel (consistently-aligned hole) whose source span starts at startPos.
void Lattice::CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global)
{
// term
Stack &startStack = GetStack(startPos);
LatticeNode *node = new LatticeNode(startPos, &sentence);
startStack.push_back(node);
// non-term
for (size_t endPos = startPos + 1; endPos <= sentence.source.size(); ++endPos)
{
// tunnels are indexed by (start position, inclusive end position)
const TunnelList &tunnels = tunnelColl.GetTunnels(startPos, endPos - 1);
TunnelList::const_iterator iterHole;
for (iterHole = tunnels.begin(); iterHole != tunnels.end(); ++iterHole)
{
const Tunnel &tunnel = *iterHole;
CreateArcsUsing1Hole(tunnel, sentence, global);
}
}
}
// For one tunnel, creates a non-terminal LatticeNode for every pair in
// the cartesian product of the syntax labels covering its source span
// and those covering its target span. A node is only created when the
// source span fits the applicable hole-width limit: syntactic pairs get
// the (larger) syntax limit, others the default limit.
void Lattice::CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global)
{
// range 0 = source span of the tunnel
size_t startPos = tunnel.GetRange(0).GetStartPos()
, endPos = tunnel.GetRange(0).GetEndPos();
size_t numSymbols = tunnel.GetRange(0).GetWidth();
assert(numSymbols > 0);
Stack &startStack = GetStack(startPos);
// non-terms. cartesian product of source & target labels
assert(startPos == tunnel.GetRange(0).GetStartPos() && endPos == tunnel.GetRange(0).GetEndPos());
// range 1 = target span of the tunnel
size_t startT = tunnel.GetRange(1).GetStartPos()
,endT = tunnel.GetRange(1).GetEndPos();
const SyntaxNodes &nodesS = sentence.sourceTree.GetNodes(startPos, endPos);
const SyntaxNodes &nodesT = sentence.targetTree.GetNodes(startT, endT );
SyntaxNodes::const_iterator iterS, iterT;
for (iterS = nodesS.begin(); iterS != nodesS.end(); ++iterS)
{
const SyntaxNode *syntaxNodeS = *iterS;
for (iterT = nodesT.begin(); iterT != nodesT.end(); ++iterT)
{
const SyntaxNode *syntaxNodeT = *iterT;
// the pair counts as syntactic if either side has a real syntax label
bool isSyntax = syntaxNodeS->IsSyntax() || syntaxNodeT->IsSyntax();
size_t maxSourceNonTermSpan = isSyntax ? global.maxHoleSpanSourceSyntax : global.maxHoleSpanSourceDefault;
if (maxSourceNonTermSpan >= endPos - startPos)
{
LatticeNode *node = new LatticeNode(tunnel, syntaxNodeS, syntaxNodeT);
startStack.push_back(node);
}
}
}
}
// Mutable access to the node stack anchored at source position startPos.
Stack &Lattice::GetStack(size_t startPos)
{
  assert(startPos < m_stacks.size());
  Stack &stack = m_stacks[startPos];
  return stack;
}
// Read-only access to the node stack anchored at source position startPos.
const Stack &Lattice::GetStack(size_t startPos) const
{
  assert(startPos < m_stacks.size());
  const Stack &stack = m_stacks[startPos];
  return stack;
}
// Seeds a Rule from every lattice node anchored at startPos and expands
// it recursively. Ownership: seed rules accepted by IsValid() are handed
// to m_rules (assumed to take ownership); rejected seeds are deleted
// here. Rules spawned inside Rule::CreateRules are managed by that call.
void Lattice::CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global)
{
const Stack &startStack = GetStack(startPos);
Stack::const_iterator iterStack;
for (iterStack = startStack.begin(); iterStack != startStack.end(); ++iterStack)
{
const LatticeNode *node = *iterStack;
Rule *initRule = new Rule(node);
if (initRule->CanRecurse(global, sentence.GetTunnelCollection()))
{ // may or maynot be valid, but can continue to build on this rule
initRule->CreateRules(m_rules, *this, sentence, global);
}
if (initRule->IsValid(global, sentence.GetTunnelCollection()))
{ // add to rule collection
m_rules.Add(global, initRule, sentence);
}
else
{
delete initRule;
}
}
}
// Collects every non-terminal node whose source span equals sourceRange.
// Returns them by value in a fresh Stack; the nodes remain owned by the
// lattice.
Stack Lattice::GetNonTermNode(const Range &sourceRange) const
{
  Stack matches;
  const Stack &candidates = GetStack(sourceRange.GetStartPos());
  Stack::const_iterator it;
  for (it = candidates.begin(); it != candidates.end(); ++it) {
    LatticeNode *candidate = *it;
    const Range &span = candidate->GetSourceRange();
    // every node in this stack starts at the same source position
    assert(span.GetStartPos() == sourceRange.GetStartPos());
    if (!candidate->IsTerminal() && span.GetEndPos() == sourceRange.GetEndPos()) {
      matches.push_back(candidate);
    }
  }
  return matches;
}
// Debug printing: every node of every stack, space-separated, in
// stack order.
std::ostream& operator<<(std::ostream &out, const Lattice &obj)
{
  for (size_t i = 0; i < obj.m_stacks.size(); ++i) {
    const Stack &stack = obj.m_stacks[i];
    for (size_t j = 0; j < stack.size(); ++j) {
      const LatticeNode &node = *stack[j];
      out << node << " ";
    }
  }
  return out;
}

View File

@ -0,0 +1,47 @@
#pragma once
/*
* Lattice.h
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <iostream>
#include <vector>
#include "RuleCollection.h"
class Global;
class LatticeNode;
class Tunnel;
class TunnelCollection;
class SentenceAlignment;
// A Stack holds the lattice nodes anchored at one source position.
typedef std::vector<LatticeNode*> Stack;

// Extraction lattice over one sentence pair: one stack of nodes per
// source position (terminals and labelled tunnels), plus the rule
// collection built from them. Owns the nodes in m_stacks.
class Lattice
{
friend std::ostream& operator<<(std::ostream&, const Lattice&);
std::vector<Stack> m_stacks; // one per source position (+1 boundary); owns the nodes
RuleCollection m_rules; // rules extracted so far
// NOTE(review): the parameter is used as a start position in Lattice.cpp
// despite being named endPos here.
Stack &GetStack(size_t endPos);
void CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global);
public:
Lattice(size_t sourceSize);
~Lattice();
// Adds a terminal node at startPos plus non-terminal nodes for every tunnel starting there.
void CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global);
// Builds rules seeded from every node anchored at startPos.
void CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global);
const Stack &GetStack(size_t startPos) const;
const RuleCollection &GetRules() const
{ return m_rules; }
// All non-terminal nodes whose source span equals sourceRange (non-owning).
Stack GetNonTermNode(const Range &sourceRange) const;
};

View File

@ -0,0 +1,149 @@
/*
* LatticeNode.cpp
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <sstream>
#include "LatticeNode.h"
#include "SyntaxTree.h"
#include "Tunnel.h"
#include "SentenceAlignment.h"
#include "SymbolSequence.h"
size_t LatticeNode::s_count = 0;
using namespace std;
// for terms
// Terminal node covering the single source word at position pos.
// Initializers are listed in member declaration order (see LatticeNode.h:
// m_isTerminal, m_sourceRange, m_tunnel, then the public pointers) so the
// written order matches the actual initialization order (avoids -Wreorder).
LatticeNode::LatticeNode(size_t pos, const SentenceAlignment *sentence)
  : m_isTerminal(true)
  , m_sourceRange(pos, pos)
  , m_tunnel(NULL)
  , m_sourceTreeNode(NULL)
  , m_targetTreeNode(NULL)
  , m_sentence(sentence)
{
  s_count++;
  //cerr << *this << endl;
}
// for non-terms
// Non-terminal node covering the tunnel's source span, labelled with one
// source and one target syntax node. Initializers follow member
// declaration order (avoids -Wreorder); all values come from parameters,
// so the reordering cannot change any initialized value.
LatticeNode::LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode)
  : m_isTerminal(false)
  , m_sourceRange(tunnel.GetRange(0))
  , m_tunnel(&tunnel)
  , m_sourceTreeNode(sourceTreeNode)
  , m_targetTreeNode(targetTreeNode)
  , m_sentence(NULL)
{
  s_count++;
  //cerr << *this << endl;
}
// True when either side's label is a real syntactic category.
// Only meaningful for non-terminal nodes.
bool LatticeNode::IsSyntax() const
{
  assert(!m_isTerminal);
  return m_sourceTreeNode->IsSyntax() || m_targetTreeNode->IsSyntax();
}
// Every node contributes exactly one symbol, for either direction.
// NOTE(review): the direction parameter is currently unused.
size_t LatticeNode::GetNumSymbols(size_t direction) const
{
return 1;
}
// Three-way comparison (<0, 0, >0) defining a total order over nodes.
// Terminals sort before non-terminals. Terminals compare by source start
// position only. Non-terminals compare by tunnel span and by BOTH labels
// (source first, then target) -- but each side participates only when
// that side's label is syntactic.
int LatticeNode::Compare(const LatticeNode &otherNode) const
{
int ret = 0;
if (m_isTerminal != otherNode.m_isTerminal)
{
ret = m_isTerminal ? -1 : 1;
}
// both term or non-term
else if (m_isTerminal)
{ // term. compare source span
if (m_sourceRange.GetStartPos() == otherNode.m_sourceRange.GetStartPos())
ret = 0;
else
ret = (m_sourceRange.GetStartPos() < otherNode.m_sourceRange.GetStartPos()) ? -1 : +1;
}
else
{ // non-term. compare source span and BOTH label
assert(!m_isTerminal);
assert(!otherNode.m_isTerminal);
if (m_sourceTreeNode->IsSyntax())
{
// source side: tunnel span in direction 0, then source labels
ret = m_tunnel->Compare(*otherNode.m_tunnel, 0);
if (ret == 0 && m_sourceTreeNode->GetLabel() != otherNode.m_sourceTreeNode->GetLabel())
{
ret = (m_sourceTreeNode->GetLabel() < otherNode.m_sourceTreeNode->GetLabel()) ? -1 : +1;
}
}
if (ret == 0 && m_targetTreeNode->IsSyntax())
{
// target side: tunnel span in direction 1, then target labels
ret = m_tunnel->Compare(*otherNode.m_tunnel, 1);
if (ret == 0 && m_targetTreeNode->GetLabel() != otherNode.m_targetTreeNode->GetLabel())
{
ret = (m_targetTreeNode->GetLabel() < otherNode.m_targetTreeNode->GetLabel()) ? -1 : +1;
}
}
}
return ret;
}
// Appends this node's symbol(s) to the sequence. Terminals currently
// contribute nothing -- the word-emitting code below is commented out;
// non-terminals emit one combined source/target label symbol.
// NOTE(review): the direction parameter is unused in the live code path.
void LatticeNode::CreateSymbols(size_t direction, SymbolSequence &symbols) const
{
if (m_isTerminal)
{
/*
const std::vector<std::string> &words = (direction == 0 ? m_sentence->source : m_sentence->target);
size_t startPos = m_tunnel.GetStart(direction)
,endPos = m_tunnel.GetEnd(direction);
for (size_t pos = startPos; pos <= endPos; ++pos)
{
Symbol symbol(words[pos], pos);
symbols.Add(symbol);
}
*/
}
else
{ // output both
Symbol symbol(m_sourceTreeNode->GetLabel(), m_targetTreeNode->GetLabel()
, m_tunnel->GetRange(0).GetStartPos(), m_tunnel->GetRange(0).GetEndPos()
, m_tunnel->GetRange(1).GetStartPos(), m_tunnel->GetRange(1).GetEndPos()
, m_sourceTreeNode->IsSyntax(), m_targetTreeNode->IsSyntax());
symbols.Add(symbol);
}
}
// Debug printing: terminals print as "range=word"; non-terminals as
// "tunnel=sourceLabel targetLabel " (labels concatenated).
std::ostream& operator<<(std::ostream &out, const LatticeNode &obj)
{
if (obj.m_isTerminal)
{
// a terminal always covers exactly one source word
assert(obj.m_sourceRange.GetWidth() == 1);
size_t pos = obj.m_sourceRange.GetStartPos();
const SentenceAlignment &sentence = *obj.m_sentence;
out << obj.m_sourceRange << "=" << sentence.source[pos];
}
else
{
assert(obj.m_tunnel);
out << obj.GetTunnel() << "=" << obj.m_sourceTreeNode->GetLabel() << obj.m_targetTreeNode->GetLabel() << " ";
}
return out;
}

View File

@ -0,0 +1,77 @@
#pragma once
/*
* LatticeNode.h
* extract
*
* Created by Hieu Hoang on 18/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <iostream>
#include <cassert>
#include "Range.h"
class Tunnel;
class SyntaxNode;
class SentenceAlignment;
class SymbolSequence;
// One node in the extraction lattice: either a terminal (a single source
// word, with m_sentence set) or a non-terminal (a tunnel plus a source
// and target syntax label, with m_tunnel set). Constructed by
// Lattice::CreateArcs / CreateArcsUsing1Hole.
class LatticeNode
{
friend std::ostream& operator<<(std::ostream&, const LatticeNode&);
bool m_isTerminal; // true = single source word; false = labelled tunnel
// for terms & non-term
Range m_sourceRange;
// non-terms. source range should be same as m_sourceRange
const Tunnel *m_tunnel; // NULL for terminals
public:
static size_t s_count; // total nodes ever constructed (diagnostics)
const SyntaxNode *m_sourceTreeNode, *m_targetTreeNode; // NULL for terminals
const SentenceAlignment *m_sentence; // NULL for non-terminals
// for terms
LatticeNode(size_t pos, const SentenceAlignment *sentence);
// for non-terms
LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode);
bool IsTerminal() const
{ return m_isTerminal; }
// Non-terminals only: true if either side's label is syntactic.
bool IsSyntax() const;
// Always returns 1; the direction argument is unused.
size_t GetNumSymbols(size_t direction) const;
std::string ToString() const;
// Three-way total order: terminals first, then by span and labels.
int Compare(const LatticeNode &otherNode) const;
// Appends this node's symbol(s) for direction (0 = source, 1 = target).
void CreateSymbols(size_t direction, SymbolSequence &symbols) const;
// Non-terminals only (asserts otherwise).
const Tunnel &GetTunnel() const
{
assert(m_tunnel);
return *m_tunnel;
}
const Range &GetSourceRange() const
{
return m_sourceRange;
}
// direction: 0 = source label, 1 = target label (non-terminals only).
const SyntaxNode &GetSyntaxNode(size_t direction) const
{
const SyntaxNode *node = direction == 0 ? m_sourceTreeNode : m_targetTreeNode;
assert(node);
return *node;
}
};

View File

@ -0,0 +1,13 @@
# Builds the extract-mixed-syntax binary.
OBJS = tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o

all: extract

# 'all', 'extract' and 'clean' never produce files of those names, so
# declare them phony. The real file target is extract-mixed-syntax,
# matching the linker's -o output; previously the 'extract' target never
# created a file called 'extract', which forced a relink on every run.
.PHONY: all extract clean

extract: extract-mixed-syntax

clean:
	rm -f *.o extract-mixed-syntax

# -O6 is accepted by g++ (treated as the highest supported level)
.cpp.o:
	g++ -O6 -g -c $<

extract-mixed-syntax: $(OBJS)
	g++ $(OBJS) -lz -lboost_iostreams-mt -o extract-mixed-syntax

View File

@ -0,0 +1,79 @@
// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <boost/iostreams/filter/gzip.hpp>
#include "OutputFileStream.h"
#include "gzfilebuf.h"
using namespace std;
namespace Moses
{
// Construct an unopened stream; Open() must be called before writing.
OutputFileStream::OutputFileStream()
  :boost::iostreams::filtering_ostream()
  ,m_outFile(NULL)
{
}
// Convenience constructor: open filePath immediately.
// NOTE(review): the Open() return value is discarded, so a failed open
// is silent here -- callers should verify the stream state themselves.
OutputFileStream::OutputFileStream(const std::string &filePath)
  : m_outFile(NULL)
{
  Open(filePath);
}
// Flushes and releases the underlying file via Close().
OutputFileStream::~OutputFileStream()
{
  Close();
}
bool OutputFileStream::Open(const std::string &filePath)
{
m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
if (m_outFile->fail()) {
return false;
}
if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
this->push(boost::iostreams::gzip_compressor());
}
this->push(*m_outFile);
return true;
}
// Flush, dismantle the filter chain, then close and free the underlying
// file. Safe to call when the stream was never opened (m_outFile NULL).
void OutputFileStream::Close()
{
  if (m_outFile == NULL) {
    return;
  }
  this->flush();
  // NOTE(review): only one pop() here, but Open() pushes two elements
  // (compressor + file sink) for .gz paths -- confirm gzip output is
  // fully flushed/terminated when closing a compressed stream.
  this->pop(); // file
  m_outFile->close();
  delete m_outFile;
  m_outFile = NULL;
  return;
}
}

View File

@ -0,0 +1,50 @@
// $Id: OutputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <cstdlib>
#include <fstream>
#include <string>
#include <iostream>
#include <boost/iostreams/filtering_stream.hpp>
namespace Moses
{
/** Used in place of std::ostream, writes gzip-compressed output if the
 * file name ends in .gz
 */
class OutputFileStream : public boost::iostreams::filtering_ostream
{
protected:
  // Owned file sink; NULL while the stream is not open.
  std::ofstream *m_outFile;
public:
  // Construct unopened; call Open() before use.
  OutputFileStream();
  // Opens immediately; the Open() result is not reported to the caller.
  OutputFileStream(const std::string &filePath);
  virtual ~OutputFileStream();
  // Returns false if the file cannot be created.
  bool Open(const std::string &filePath);
  // Flushes and releases the file; safe when never opened.
  void Close();
};
}

View File

@ -0,0 +1,74 @@
/*
* Range.cpp
* extract
*
* Created by Hieu Hoang on 22/02/2011.
* Copyright 2011 __MyCompanyName__. All rights reserved.
*
*/
#include "Range.h"
using namespace std;
/**
 * Make this range the union of a and b. A NOT_FOUND endpoint means
 * "unset" and is simply overridden by the other range's endpoint.
 */
void Range::Merge(const Range &a, const Range &b)
{
  // start position: smallest of the values that are actually set
  if (a.m_startPos == NOT_FOUND)
    m_startPos = b.m_startPos;
  else
    m_startPos = (b.m_startPos == NOT_FOUND) ? a.m_startPos : min(a.m_startPos, b.m_startPos);

  // end position: largest of the values that are actually set
  if (a.m_endPos == NOT_FOUND)
    m_endPos = b.m_endPos;
  else
    m_endPos = (b.m_endPos == NOT_FOUND) ? a.m_endPos : max(a.m_endPos, b.m_endPos);
}
/**
 * Lexicographic three-way comparison: start position first, then end.
 * Returns -1, 0 or +1.
 */
int Range::Compare(const Range &other) const
{
  if (m_startPos != other.m_startPos)
    return (m_startPos < other.m_startPos) ? -1 : +1;
  if (m_endPos != other.m_endPos)
    return (m_endPos < other.m_endPos) ? -1 : +1;
  return 0;
}
/**
 * Two closed intervals overlap unless one ends before the other starts.
 */
bool Range::Overlap(const Range &other) const
{
  const bool disjoint = (other.m_endPos < m_startPos) || (other.m_startPos > m_endPos);
  return !disjoint;
}
// Print as "[start-end]".
std::ostream& operator<<(std::ostream &out, const Range &range)
{
  return out << "[" << range.m_startPos << "-" << range.m_endPos << "]";
}

View File

@ -0,0 +1,57 @@
/*
* Range.h
* extract
*
* Created by Hieu Hoang on 22/02/2011.
* Copyright 2011 __MyCompanyName__. All rights reserved.
*
*/
#pragma once
#include <string>
#include <iostream>
#include <limits>
#define NOT_FOUND std::numeric_limits<size_t>::max()
/** Closed interval [m_startPos, m_endPos] over word positions.
 * NOT_FOUND in either field marks that endpoint as unset.
 */
class Range
{
  friend std::ostream& operator<<(std::ostream&, const Range&);
  size_t m_startPos, m_endPos;
public:
  // Default: both endpoints unset.
  Range()
    :m_startPos(NOT_FOUND)
    ,m_endPos(NOT_FOUND)
  {}
  Range(const Range &copy)
    :m_startPos(copy.m_startPos)
    ,m_endPos(copy.m_endPos)
  {}
  Range(size_t startPos, size_t endPos)
    :m_startPos(startPos)
    ,m_endPos(endPos)
  {}
  size_t GetStartPos() const
  { return m_startPos; }
  size_t GetEndPos() const
  { return m_endPos; }
  // Number of positions covered, inclusive. NOTE(review): underflows if
  // either endpoint is still NOT_FOUND -- callers must set both first.
  size_t GetWidth() const
  { return m_endPos - m_startPos + 1; }
  void SetStartPos(size_t startPos)
  { m_startPos = startPos; }
  void SetEndPos(size_t endPos)
  { m_endPos = endPos; }
  // Union of a and b, ignoring unset endpoints.
  void Merge(const Range &a, const Range &b);
  // Three-way lexicographic comparison on (start, end).
  int Compare(const Range &other) const;
  // True if the two closed intervals share at least one position.
  bool Overlap(const Range &other) const;
};

View File

@ -0,0 +1,594 @@
/*
* Rule.cpp
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <algorithm>
#include <sstream>
#include "Rule.h"
#include "Global.h"
#include "LatticeNode.h"
#include "Lattice.h"
#include "SentenceAlignment.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "RuleCollection.h"
using namespace std;
// Copy an element: shares the (non-owned) lattice-node pointer and
// duplicates the stored source/target alignment positions.
RuleElement::RuleElement(const RuleElement &copy)
  :m_latticeNode(copy.m_latticeNode)
  ,m_alignmentPos(copy.m_alignmentPos)
{
}
// Seed a rule with a single lattice node; no LHS yet.
Rule::Rule(const LatticeNode *latticeNode)
  :m_lhs(NULL)
{
  RuleElement element(*latticeNode);
  m_coll.push_back(element);
}
// Extend prevRule by appending one more lattice node; still no LHS.
Rule::Rule(const Rule &prevRule, const LatticeNode *latticeNode)
  :m_coll(prevRule.m_coll)
  ,m_lhs(NULL)
{
  RuleElement element(*latticeNode);
  m_coll.push_back(element);
}
// Finalize a copy of 'copy' with the given LHS node: builds the symbol
// sequences and reports via isValid whether the LHS target span is
// consistent with the rule (see CreateSymbols).
Rule::Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence)
  :m_coll(copy.m_coll)
  ,m_source(copy.m_source)
  ,m_target(copy.m_target)
  ,m_lhs(lhs)
{
  CreateSymbols(global, isValid, sentence);
}
// Nothing to free: lattice nodes are owned by the lattice, not the rule.
Rule::~Rule()
{
}
// Sort helper: orders rule elements by the end position of their
// target-side (direction 1) tunnel span.
struct CompareLatticeNodeTarget
{
  bool operator() (const RuleElement *a, const RuleElement *b)
  {
    const Range &rangeA = a->GetLatticeNode().GetTunnel().GetRange(1)
    ,&rangeB = b->GetLatticeNode().GetTunnel().GetRange(1);
    return rangeA.GetEndPos() < rangeB.GetEndPos();
  }
};
/** Build m_source and m_target symbol sequences from the rule elements.
 * Source side: walk m_coll in order, emitting a word Symbol per terminal
 * and a joint source/target non-terminal Symbol otherwise.
 * Target side: walk the LHS target span left to right; at the start of a
 * non-terminal's target span emit its Symbol and jump to the span's end,
 * otherwise emit the target word.
 * Sets isValid to false when a non-terminal's target span escapes the
 * LHS target span or the target side exceeds global.maxSymbols.
 */
void Rule::CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence)
{
  vector<RuleElement*> nonTerms;
  // source
  for (size_t ind = 0; ind < m_coll.size(); ++ind)
  {
    RuleElement &element = m_coll[ind];
    const LatticeNode &node = element.GetLatticeNode();
    if (node.IsTerminal())
    {
      size_t sourcePos = node.GetSourceRange().GetStartPos();
      const string &word = sentence.source[sourcePos];
      Symbol symbol(word, sourcePos);
      m_source.Add(symbol);
    }
    else
    { // non-term
      const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
      const string &targetWord = node.GetSyntaxNode(1).GetLabel();
      Symbol symbol(sourceWord, targetWord
                    , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
                    , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
                    , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
      m_source.Add(symbol);
      // store current pos within phrase
      element.m_alignmentPos.first = ind;
      // for target symbols
      nonTerms.push_back(&element);
    }
  }
  // target
  isValid = true;
  const Range &lhsTargetRange = m_lhs->GetTunnel().GetRange(1);
  // check spans of target non-terms
  if (nonTerms.size())
  {
    // sort non-term rules elements by target range
    std::sort(nonTerms.begin(), nonTerms.end(), CompareLatticeNodeTarget());
    const Range &first = nonTerms.front()->GetLatticeNode().GetTunnel().GetRange(1);
    const Range &last = nonTerms.back()->GetLatticeNode().GetTunnel().GetRange(1);
    if (first.GetStartPos() < lhsTargetRange.GetStartPos()
        || last.GetEndPos() > lhsTargetRange.GetEndPos())
    {
      isValid = false;
    }
  }
  if (isValid)
  {
    size_t indNonTerm = 0;
    RuleElement *currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
    for (size_t targetPos = lhsTargetRange.GetStartPos(); targetPos <= lhsTargetRange.GetEndPos(); ++targetPos)
    {
      if (currNonTermElement && targetPos == currNonTermElement->GetLatticeNode().GetTunnel().GetRange(1).GetStartPos())
      { // start of a non-term. print out non-terms & skip to the end
        const LatticeNode &node = currNonTermElement->GetLatticeNode();
        const string &sourceWord = node.GetSyntaxNode(0).GetLabel();
        const string &targetWord = node.GetSyntaxNode(1).GetLabel();
        Symbol symbol(sourceWord, targetWord
                      , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos()
                      , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos()
                      , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax());
        m_target.Add(symbol);
        // store current pos within phrase
        currNonTermElement->m_alignmentPos.second = m_target.GetSize() - 1;
        assert(currNonTermElement->m_alignmentPos.first != NOT_FOUND);
        targetPos = node.GetTunnel().GetRange(1).GetEndPos();
        indNonTerm++;
        currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL;
      }
      else
      { // term
        const string &word = sentence.target[targetPos];
        Symbol symbol(word, targetPos);
        m_target.Add(symbol);
      }
    }
    assert(indNonTerm == nonTerms.size());
    if (m_target.GetSize() > global.maxSymbols) {
      isValid = false;
      //cerr << "m_source=" << m_source.GetSize() << ":" << m_source << endl;
      //cerr << "m_target=" << m_target.GetSize() << ":" << m_target << endl;
    }
  }
}
// True when the rule contains more default (non-syntax) non-terminals
// than terminals.
bool Rule::MoreDefaultNonTermThanTerm() const
{
  size_t terminals = 0, defaultNonTerms = 0;
  for (size_t ind = 0; ind < m_coll.size(); ++ind)
  {
    const LatticeNode &node = m_coll[ind].GetLatticeNode();
    if (node.IsTerminal())
      ++terminals;
    else if (!node.IsSyntax())
      ++defaultNonTerms;
  }
  return defaultNonTerms > terminals;
}
bool Rule::SourceHasEdgeDefaultNonTerm() const
{
assert(m_coll.size());
const LatticeNode &first = m_coll.front().GetLatticeNode();
const LatticeNode &last = m_coll.back().GetLatticeNode();
// 1st
if (!first.IsTerminal() && !first.IsSyntax())
{
return true;
}
if (!last.IsTerminal() && !last.IsSyntax())
{
return true;
}
return false;
}
/** Gate-keeper for adding a completed rule to the collection. Rejects:
 * a lone non-terminal; more default non-terminals than terminals; a
 * default non-terminal on a source edge (unless allowed); too many
 * symbols; adjacent default non-terminals; a source span that is not
 * itself a tunnel; overlapping target non-terminal spans.
 */
bool Rule::IsValid(const Global &global, const TunnelCollection &tunnelColl) const
{
  if (m_coll.size() == 1 && !m_coll[0].GetLatticeNode().IsTerminal()) // can't be only 1 terminal
  {
    return false;
  }
  if (MoreDefaultNonTermThanTerm())
  { // must have at least as many terms as non-syntax non-terms
    return false;
  }
  if (!global.allowDefaultNonTermEdge && SourceHasEdgeDefaultNonTerm())
  {
    return false;
  }
  if (GetNumSymbols() > global.maxSymbols)
  {
    return false;
  }
  if (AdjacentDefaultNonTerms())
  {
    return false;
  }
  if (!IsHole(tunnelColl))
  {
    return false;
  }
  if (NonTermOverlap())
  {
    return false;
  }
  /*
  std::pair<size_t, size_t> spanS = GetSpan(0)
  ,spanT= GetSpan(1);
  if (tunnelColl.NumUnalignedWord(0, spanS.first, spanS.second) >= global.maxUnaligned)
  return false;
  if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
  return false;
  */
  return true;
}
bool Rule::NonTermOverlap() const
{
vector<Range> ranges;
CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const RuleElement &element = *iter;
if (!element.GetLatticeNode().IsTerminal())
{
const Range &range = element.GetLatticeNode().GetTunnel().GetRange(1);
ranges.push_back(range);
}
}
vector<Range>::const_iterator outerIter;
for (outerIter = ranges.begin(); outerIter != ranges.end(); ++outerIter)
{
const Range &outer = *outerIter;
vector<Range>::const_iterator innerIter;
for (innerIter = outerIter + 1; innerIter != ranges.end(); ++innerIter)
{
const Range &inner = *innerIter;
if (outer.Overlap(inner))
return true;
}
}
return false;
}
// Source span covered by this rule: from the first element's start
// position to the last element's end position.
Range Rule::GetSourceRange() const
{
  assert(m_coll.size());
  const size_t start = m_coll.front().GetLatticeNode().GetSourceRange().GetStartPos();
  const size_t end   = m_coll.back().GetLatticeNode().GetSourceRange().GetEndPos();
  return Range(start, end);
}
bool Rule::IsHole(const TunnelCollection &tunnelColl) const
{
const Range &spanS = GetSourceRange();
const TunnelList &tunnels = tunnelColl.GetTunnels(spanS.GetStartPos(), spanS.GetEndPos());
bool ret = tunnels.size() > 0;
return ret;
}
/** Whether this partial rule may be extended with further nodes.
 * Enforces the symbol budget, adjacency and overlap constraints, the
 * non-terminal limits, and the unaligned-word budget over the current
 * source span.
 */
bool Rule::CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const
{
  if (GetNumSymbols() >= global.maxSymbols)
    return false;
  if (AdjacentDefaultNonTerms())
    return false;
  if (MaxNonTerm(global))
    return false;
  if (NonTermOverlap())
  {
    return false;
  }
  const Range spanS = GetSourceRange();
  if (tunnelColl.NumUnalignedWord(0, spanS.GetStartPos(), spanS.GetEndPos()) >= global.maxUnaligned)
    return false;
  //	if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned)
  //		return false;
  return true;
}
// True once the rule has reached either the overall non-terminal limit
// or the default (non-syntax) non-terminal limit.
bool Rule::MaxNonTerm(const Global &global) const
{
  size_t nonTermCount = 0, defaultNonTermCount = 0;
  CollType::const_iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
  {
    const LatticeNode &node = iter->GetLatticeNode();
    if (node.IsTerminal())
      continue;
    ++nonTermCount;
    if (!node.IsSyntax())
      ++defaultNonTermCount;
    if (nonTermCount >= global.maxNonTerm || defaultNonTermCount >= global.maxNonTermDefault)
      return true;
  }
  return false;
}
bool Rule::AdjacentDefaultNonTerms() const
{
assert(m_coll.size() > 0);
const LatticeNode *prevNode = &m_coll.front().GetLatticeNode();
CollType::const_iterator iter;
for (iter = m_coll.begin() + 1; iter != m_coll.end(); ++iter)
{
const LatticeNode *node = &(*iter).GetLatticeNode();
if (!prevNode->IsTerminal() && !node->IsTerminal() && !prevNode->IsSyntax() && !node->IsSyntax() )
{
return true;
}
prevNode = node;
}
return false;
}
// Number of right-hand-side symbols (terminals plus non-terminals).
size_t Rule::GetNumSymbols() const
{
  return m_coll.size();
}
/** Recursively extend this rule with every lattice node that starts just
 * past the current source span. Extensions that can still grow recurse
 * further; valid extensions are handed to 'rules' (which takes ownership
 * via Add); all other extensions are deleted here.
 */
void Rule::CreateRules(RuleCollection &rules
                       , const Lattice &lattice
                       , const SentenceAlignment &sentence
                       , const Global &global)
{
  assert(m_coll.size() > 0);
  const LatticeNode *latticeNode = &m_coll.back().GetLatticeNode();
  size_t endPos = latticeNode->GetSourceRange().GetEndPos() + 1;
  const Stack &stack = lattice.GetStack(endPos);
  Stack::const_iterator iter;
  for (iter = stack.begin(); iter != stack.end(); ++iter)
  {
    const LatticeNode *newLatticeNode = *iter;
    Rule *newRule = new Rule(*this, newLatticeNode);
    //cerr << *newRule << endl;
    if (newRule->CanRecurse(global, sentence.GetTunnelCollection()))
    { // may or maynot be valid, but can continue to build on this rule
      newRule->CreateRules(rules, lattice, sentence, global);
    }
    if (newRule->IsValid(global, sentence.GetTunnelCollection()))
    { // add to rule collection
      rules.Add(global, newRule, sentence);
    }
    else
    {
      delete newRule;
    }
  }
}
// Strict weak ordering over rules, delegated to the three-way Compare().
bool Rule::operator<(const Rule &compare) const
{
  return Compare(compare) < 0;
}
/** Three-way ordering: source symbols, then target symbols, then the
 * LHS source label, then the LHS target label.
 * Precondition: both rules are finalized (non-empty symbol sequences).
 */
int Rule::Compare(const Rule &compare) const
{
  //cerr << *this << endl << compare << endl;
  assert(m_coll.size() > 0);
  assert(m_source.GetSize() > 0);
  assert(m_target.GetSize() > 0);
  int ret = 0;
  // compare each fragment
  ret = m_source.Compare(compare.m_source);
  if (ret != 0)
  {
    return ret;
  }
  ret = m_target.Compare(compare.m_target);
  if (ret != 0)
  {
    return ret;
  }
  // compare lhs
  const string &thisSourceLabel = m_lhs->GetSyntaxNode(0).GetLabel();
  const string &otherSourceLabel = compare.m_lhs->GetSyntaxNode(0).GetLabel();
  if (thisSourceLabel != otherSourceLabel)
  {
    ret = (thisSourceLabel < otherSourceLabel) ? -1 : +1;
    return ret;
  }
  const string &thisTargetLabel = m_lhs->GetSyntaxNode(1).GetLabel();
  const string &otherTargetLabel = compare.m_lhs->GetSyntaxNode(1).GetLabel();
  if (thisTargetLabel != otherTargetLabel)
  {
    ret = (thisTargetLabel < otherTargetLabel) ? -1 : +1;
    return ret;
  }
  assert(ret == 0);
  return ret;
}
// Bounds-checked access to the ind'th rule element's lattice node.
const LatticeNode &Rule::GetLatticeNode(size_t ind) const
{
  assert(ind < m_coll.size());
  return m_coll[ind].GetLatticeNode();
}
// Print this rule to stderr (debugging aid).
void Rule::DebugOutput() const
{
  Output(cerr);
}
/** Print in extract format: "source ||| target ||| alignment ||| 1".
 * LHS labels are appended to each side; alignment pairs are emitted
 * only for non-terminal elements, as sourceIndex-targetIndex.
 */
void Rule::Output(std::ostream &out) const
{
  stringstream strmeS, strmeT;
  std::vector<Symbol>::const_iterator iterSymbol;
  for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeS << symbol << " ";
  }
  for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeT << symbol << " ";
  }
  // lhs
  if (m_lhs)
  {
    strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
    strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
  }
  out << strmeS.str() << " ||| " << strmeT.str() << " ||| ";
  // alignment
  Rule::CollType::const_iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
  {
    const RuleElement &element = *iter;
    const LatticeNode &node = element.GetLatticeNode();
    bool isTerminal = node.IsTerminal();
    if (!isTerminal)
    {
      out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " ";
    }
  }
  out << "||| 1";
}
/** Same as Output() but with the sides swapped:
 * "target ||| source ||| alignment ||| 1", alignment pairs emitted as
 * targetIndex-sourceIndex.
 */
void Rule::OutputInv(std::ostream &out) const
{
  stringstream strmeS, strmeT;
  std::vector<Symbol>::const_iterator iterSymbol;
  for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeS << symbol << " ";
  }
  for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol)
  {
    const Symbol &symbol = *iterSymbol;
    strmeT << symbol << " ";
  }
  // lhs
  if (m_lhs)
  {
    strmeS << m_lhs->GetSyntaxNode(0).GetLabel();
    strmeT << m_lhs->GetSyntaxNode(1).GetLabel();
  }
  out << strmeT.str() << " ||| " << strmeS.str() << " ||| ";
  // alignment
  Rule::CollType::const_iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
  {
    const RuleElement &element = *iter;
    const LatticeNode &node = element.GetLatticeNode();
    bool isTerminal = node.IsTerminal();
    if (!isTerminal)
    {
      out << element.m_alignmentPos.second << "-" << element.m_alignmentPos.first << " ";
    }
  }
  out << "||| 1";
}

View File

@ -0,0 +1,96 @@
#pragma once
/*
* Rule.h
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <iostream>
#include "LatticeNode.h"
#include "SymbolSequence.h"
#include "Global.h"
class Lattice;
class SentenceAlignment;
class Global;
class RuleCollection;
class SyntaxNode;
class TunnelCollection;
class Range;
/** One symbol slot of a rule: a non-owning pointer to a lattice node
 * plus the symbol's position within the source/target sides of the
 * rule, used to emit the alignment field.
 */
class RuleElement
{
protected:
  const LatticeNode *m_latticeNode;
public:
  // (source index, target index) of this element within the rule;
  // both NOT_FOUND until filled in for non-terminals by CreateSymbols()
  std::pair<size_t, size_t> m_alignmentPos;
  RuleElement(const RuleElement &copy);
  RuleElement(const LatticeNode &latticeNode)
    :m_latticeNode(&latticeNode)
    ,m_alignmentPos(NOT_FOUND, NOT_FOUND)
  {}
  const LatticeNode &GetLatticeNode() const
  { return *m_latticeNode; }
};
/** A (partial or complete) hierarchical rule: an ordered list of RHS
 * elements plus an optional LHS lattice node, together with the
 * flattened source/target symbol sequences used for output and
 * duplicate detection.
 */
class Rule
{
protected:
  typedef std::vector<RuleElement> CollType;
  // RHS elements, in source order
  CollType m_coll;
  // LHS node; NULL until the rule is finalized with a left-hand side
  const LatticeNode *m_lhs;
  // flattened symbol sequences, built by CreateSymbols()
  SymbolSequence m_source, m_target;
  bool IsHole(const TunnelCollection &tunnelColl) const;
  bool NonTermOverlap() const;
  const LatticeNode &GetLatticeNode(size_t ind) const;
  void CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence);
public:
  // init
  Rule(const LatticeNode *latticeNode);
  // create new rule by appending node to prev rule
  Rule(const Rule &prevRule, const LatticeNode *latticeNode);
  // create copy with lhs
  Rule(const Global &global, bool &isValid, const Rule &copy, const LatticeNode *lhs, const SentenceAlignment &sentence);
  // can continue to add to this rule
  bool CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const;
  virtual ~Rule();
  // can add this to the set of rules
  bool IsValid(const Global &global, const TunnelCollection &tunnelColl) const;
  size_t GetNumSymbols() const;
  bool AdjacentDefaultNonTerms() const;
  bool MaxNonTerm(const Global &global) const;
  bool MoreDefaultNonTermThanTerm() const;
  bool SourceHasEdgeDefaultNonTerm() const;
  void CreateRules(RuleCollection &rules
                   , const Lattice &lattice
                   , const SentenceAlignment &sentence
                   , const Global &global);
  int Compare(const Rule &compare) const;
  bool operator<(const Rule &compare) const;
  Range GetSourceRange() const;
  // NOTE(review): presumably a macro (declared elsewhere, e.g. Global.h)
  // expanding to the DebugOutput() declaration -- confirm its expansion
  DEBUG_OUTPUT();
  void Output(std::ostream &out) const;
  void OutputInv(std::ostream &out) const;
};

View File

@ -0,0 +1,102 @@
/*
* RuleCollection.cpp
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "RuleCollection.h"
#include "Rule.h"
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "Lattice.h"
#include "SyntaxTree.h"
using namespace std;
// The set owns its Rule pointers; free them all.
RuleCollection::~RuleCollection()
{
  RemoveAllInColl(m_coll);
}
/** Take ownership of 'rule', instantiate it with every candidate LHS
 * non-terminal node covering its source span, and insert the valid,
 * previously unseen instantiations into the set (which then owns them).
 * Duplicates and inconsistent LHS choices are deleted; the 'rule'
 * template itself is always deleted before returning.
 */
void RuleCollection::Add(const Global &global, Rule *rule, const SentenceAlignment &sentence)
{
  Range spanS	= rule->GetSourceRange();
  // cartesian product of lhs
  Stack nontermNodes = sentence.GetLattice().GetNonTermNode(spanS);
  Stack::const_iterator iterStack;
  for (iterStack = nontermNodes.begin(); iterStack != nontermNodes.end(); ++iterStack)
  {
    const LatticeNode &node = **iterStack;
    assert(!node.IsTerminal());
    bool isValid;
    // create rules with LHS
    //cerr << "old:" << *rule << endl;
    Rule *newRule = new Rule(global, isValid, *rule, &node, sentence);
    if (!isValid)
    { // lhs doesn't match non-term spans
      delete newRule;
      continue;
    }
    /*
    stringstream s;
    s << *newRule;
    if (s.str().find("Wiederaufnahme der [X] ||| resumption of the [X] ||| ||| 1") == 0)
    {
    cerr << "READY:" << *newRule << endl;
    g_debug = true;
    }
    else {
    g_debug = false;
    }
    */
    typedef set<const Rule*, CompareRule>::iterator Iterator;
    pair<Iterator,bool> ret = m_coll.insert(newRule);
    if (ret.second)
    {
      //cerr << "ACCEPTED:" << *newRule << endl;
      //cerr << "";
    }
    else
    {
      //cerr << "REJECTED:" << *newRule << endl;
      delete newRule;
    }
  }
  delete rule;
}
void RuleCollection::Output(std::ostream &out) const
{
RuleCollection::CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const Rule &rule = **iter;
rule.Output(out);
out << endl;
}
}
void RuleCollection::OutputInv(std::ostream &out) const
{
RuleCollection::CollType::const_iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter)
{
const Rule &rule = **iter;
rule.OutputInv(out);
out << endl;
}
}

View File

@ -0,0 +1,55 @@
#pragma once
/*
* RuleCollection.h
* extract
*
* Created by Hieu Hoang on 19/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <iostream>
#include "Rule.h"
class SentenceAlignment;
// Ordering functor for the rule set; delegates to Rule::operator<.
// (Per the original note, default non-terminal labels are not compared.)
struct CompareRule
{
  // Fix: const-qualified so the functor remains callable through a
  // const comparator object, as standard-library containers are
  // entitled to require.
  bool operator() (const Rule *a, const Rule *b) const
  {
    return (*a) < (*b);
  }
};
/** Owning set of unique rules extracted from one sentence,
 * deduplicated via CompareRule.
 */
class RuleCollection
{
protected:
  typedef std::set<const Rule*, CompareRule> CollType;
  // owns the Rule pointers; freed in the destructor
  CollType m_coll;
public:
  ~RuleCollection();
  // Takes ownership of 'rule' (always deleted before returning).
  void Add(const Global &global, Rule *rule, const SentenceAlignment &sentence);
  size_t GetSize() const
  { return m_coll.size(); }
  // One rule per line, source ||| target orientation.
  void Output(std::ostream &out) const;
  // One rule per line, target ||| source orientation.
  void OutputInv(std::ostream &out) const;
};

View File

@ -0,0 +1,331 @@
/*
* SentenceAlignment.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <set>
#include <map>
#include <sstream>
#include "SentenceAlignment.h"
#include "XmlTree.h"
#include "tables-core.h"
#include "TunnelCollection.h"
#include "Lattice.h"
#include "LatticeNode.h"
using namespace std;
extern std::set< std::string > targetLabelCollection, sourceLabelCollection;
extern std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;
// Tunnels and lattice are built later by FindTunnels()/CreateLattice().
SentenceAlignment::SentenceAlignment()
  :m_tunnelCollection(NULL)
  ,m_lattice(NULL)
{}
// Owns and frees the derived structures (safe on NULL).
SentenceAlignment::~SentenceAlignment()
{
  delete m_tunnelCollection;
  delete m_lattice;
}
/** Tokenize a sentence pair plus its word alignment, optionally parsing
 * inline XML syntax trees on either side, then add default non-terminals
 * to both trees.
 * Returns 1 on success, 0 on any problem (empty sentence, malformed or
 * out-of-bounds alignment point), matching the int-as-bool convention
 * of the callers.
 */
int SentenceAlignment::Create( const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global )
{
  // tokenizing English (and potentially extract syntax spans)
  if (global.targetSyntax) {
    string targetStringCPP = string(targetString);
    ProcessAndStripXMLTags( targetStringCPP, targetTree, targetLabelCollection , targetTopLabelCollection );
    target = tokenize( targetStringCPP.c_str() );
  }
  else {
    target = tokenize( targetString.c_str() );
  }

  // tokenizing source (and potentially extract syntax spans)
  if (global.sourceSyntax) {
    string sourceStringCPP = string(sourceString);
    ProcessAndStripXMLTags( sourceStringCPP, sourceTree, sourceLabelCollection , sourceTopLabelCollection );
    source = tokenize( sourceStringCPP.c_str() );
  }
  else {
    source = tokenize( sourceString.c_str() );
  }

  // check if sentences are empty
  if (target.size() == 0 || source.size() == 0) {
    // fixed: the original message had "<< end in" accidentally fused
    // into the string literal
    cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words in sentence " << sentenceID << endl;
    cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
    return 0;
  }

  // prepare data structures for alignments (size_t avoids
  // signed/unsigned comparison warnings against .size())
  for (size_t i = 0; i < source.size(); i++) {
    alignedCountS.push_back( 0 );
  }
  for (size_t i = 0; i < target.size(); i++) {
    vector< int > dummy;
    alignedToT.push_back( dummy );
  }

  // reading in "s-t" alignment points
  vector<string> alignmentSequence = tokenize( alignmentString.c_str() );
  for (size_t i = 0; i < alignmentSequence.size(); i++) {
    int s,t;
    // fixed: sscanf returns the number of items converted; the old
    // "! sscanf(...)" test only caught 0 conversions, so a half-parsed
    // point like "5" passed with t left uninitialized
    if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2) {
      cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
      return 0;
    }
    // explicit negative check (previously only caught indirectly via
    // the signed->unsigned conversion in the size comparison)
    if (s < 0 || t < 0 || t >= (int)target.size() || s >= (int)source.size()) {
      cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
      return 0;
    }
    alignedToT[t].push_back( s );
    alignedCountS[s]++;
  }

  // add default (non-syntactic) non-terminals over every span
  bool mixed = global.mixed;
  sourceTree.AddDefaultNonTerms(global.sourceSyntax, mixed, source.size());
  targetTree.AddDefaultNonTerms(global.targetSyntax, mixed, target.size());

  return 1;
}
/*
void SentenceAlignment::InitTightest(Outer &tightest, size_t len)
{
tightest.resize(len);
for (size_t posOuter = 0; posOuter < len; ++posOuter)
{
Inner &inner = tightest[posOuter];
size_t innerSize = len - posOuter;
inner.resize(innerSize);
}
}
void SentenceAlignment::CalcTightestSpan(Outer &tightest)
{
size_t len = tightest.size();
for (size_t startPos = 0; startPos < len; ++startPos)
{
for (size_t endPos = startPos + 1; endPos < len; ++endPos)
{
const Range &prevRange = GetTightest(tightest, startPos, endPos - 1);
const Range &smallRange = GetTightest(tightest, endPos, endPos);
Range &newRange = GetTightest(tightest, startPos, endPos);
newRange.Merge(prevRange, smallRange);
//cerr << "[" << startPos << "-" << endPos << "] --> [" << newRange.GetStartPos() << "-" << newRange.GetEndPos() << "]";
}
}
}
Range &SentenceAlignment::GetTightest(Outer &tightest, size_t startPos, size_t endPos)
{
assert(endPos < tightest.size());
assert(endPos >= startPos);
Inner &inner = tightest[startPos];
size_t ind = endPos - startPos;
Range &ret = inner[ind];
return ret;
}
void SentenceAlignment::SetAlignment(size_t source, size_t target)
{
SetAlignment(m_s2tTightest, source, target);
SetAlignment(m_t2sTightest, target, source);
}
void SentenceAlignment::SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos)
{
Range &range = GetTightest(tightest, thisPos, thisPos);
if (range.GetStartPos() == NOT_FOUND)
{ // not yet set, do them both
assert(range.GetEndPos() == NOT_FOUND);
range.SetStartPos(thatPos);
range.SetEndPos(thatPos);
}
else
{
assert(range.GetEndPos() != NOT_FOUND);
range.SetStartPos( (range.GetStartPos() > thatPos) ? thatPos : range.GetStartPos() );
range.SetEndPos( (range.GetEndPos() < thatPos) ? thatPos : range.GetEndPos() );
}
}
*/
/** Enumerate all "tunnels" (consistent aligned phrase pairs) up to the
 * configured span limits and store them in a fresh TunnelCollection.
 * Classic phrase-extraction consistency: a target span's aligned source
 * words must not be aligned outside the span, and unaligned edge words
 * may extend the source span in both directions.
 */
void SentenceAlignment::FindTunnels(const Global &global )
{
  int countT = target.size();
  int countS = source.size();
  int maxSpan = max(global.maxHoleSpanSourceDefault, global.maxHoleSpanSourceSyntax);
  m_tunnelCollection = new TunnelCollection(countS);
  m_tunnelCollection->alignedCountS = alignedCountS;
  m_tunnelCollection->alignedCountT.resize(alignedToT.size());
  for (size_t ind = 0; ind < alignedToT.size(); ind++)
  {
    m_tunnelCollection->alignedCountT[ind] = alignedToT[ind].size();
  }
  // phrase repository for creating hiero phrases
  // check alignments for target phrase startT...endT
  for(int lengthT=1;
      lengthT <= maxSpan && lengthT <= countT;
      lengthT++) {
    for(int startT=0; startT < countT-(lengthT-1); startT++) {
      // that's nice to have
      int endT = startT + lengthT - 1;
      // if there is target side syntax, there has to be a node
      if (global.targetSyntax && !targetTree.HasNode(startT,endT))
        continue;
      // find find aligned source words
      // first: find minimum and maximum source word
      int minS = 9999;
      int maxS = -1;
      vector< int > usedS = alignedCountS;
      for(int ti=startT;ti<=endT;ti++) {
        for(int i=0;i<alignedToT[ti].size();i++) {
          int si = alignedToT[ti][i];
          // cerr << "point (" << si << ", " << ti << ")\n";
          if (si<minS) { minS = si; }
          if (si>maxS) { maxS = si; }
          usedS[ si ]--;
        }
      }
      // unaligned phrases are not allowed
      if( maxS == -1 )
        continue;
      // source phrase has to be within limits
      if( maxS-minS >= maxSpan )
      {
        continue;
      }
      // check if source words are aligned to out of bound target words
      bool out_of_bounds = false;
      for(int si=minS;si<=maxS && !out_of_bounds;si++)
      {
        if (usedS[si]>0) {
          out_of_bounds = true;
        }
      }
      // if out of bound, you gotta go
      if (out_of_bounds)
        continue;
      if (m_tunnelCollection->NumUnalignedWord(1, startT, endT) >= global.maxUnaligned)
        continue;
      // done with all the checks, lets go over all consistent phrase pairs
      // start point of source phrase may retreat over unaligned
      for(int startS=minS;
          (startS>=0 &&
           startS>maxS - maxSpan && // within length limit
           (startS==minS || alignedCountS[startS]==0)); // unaligned
          startS--)
      {
        // end point of source phrase may advance over unaligned
        for(int endS=maxS;
            (endS<countS && endS<startS + maxSpan && // within length limit
             (endS==maxS || alignedCountS[endS]==0)); // unaligned
            endS++)
        {
          if (m_tunnelCollection->NumUnalignedWord(0, startS, endS) >= global.maxUnaligned)
            continue;
          // take note that this is a valid phrase alignment
          m_tunnelCollection->Add(startS, endS, startT, endT);
        }
      }
    }
  }
  //cerr << *tunnelCollection << endl;
}
// Build the extraction lattice: one stack of arcs per source position.
void SentenceAlignment::CreateLattice(const Global &global)
{
  const size_t numSourceWords = source.size();
  m_lattice = new Lattice(numSourceWords);
  for (size_t pos = 0; pos < numSourceWords; ++pos)
  {
    m_lattice->CreateArcs(pos, *m_tunnelCollection, *this, global);
  }
}
// Extract rules anchored at every source position of the lattice.
void SentenceAlignment::CreateRules(const Global &global)
{
  const size_t numSourceWords = source.size();
  for (size_t pos = 0; pos < numSourceWords; ++pos)
  {
    m_lattice->CreateRules(pos, *this, global);
  }
}
// Stream each token followed by a single space (a trailing space is emitted).
void OutputSentenceStr(std::ostream &out, const std::vector<std::string> &vec)
{
  std::vector<std::string>::const_iterator iter;
  for (iter = vec.begin(); iter != vec.end(); ++iter)
  {
    out << *iter << " ";
  }
}
// Debug dump: "target ==> source", then all tunnels, then the lattice
// (only if it has already been created).
std::ostream& operator<<(std::ostream &out, const SentenceAlignment &obj)
{
  OutputSentenceStr(out, obj.target);
  out << " ==> ";
  OutputSentenceStr(out, obj.source);
  out << endl;
  out << *obj.m_tunnelCollection;
  if (obj.m_lattice)
  {
    out << endl << *obj.m_lattice;
  }
  return out;
}

View File

@ -0,0 +1,69 @@
#pragma once
/*
* SentenceAlignment.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <cassert>
#include <iostream>
#include "SyntaxTree.h"
#include "Global.h"
#include "Range.h"
class TunnelCollection;
class Lattice;
// One aligned sentence pair plus the structures (tunnels, lattice) derived
// from it during mixed-syntax rule extraction.
class SentenceAlignment
{
  friend std::ostream& operator<<(std::ostream&, const SentenceAlignment&);
public:
  // Target and source sentences, one token per element.
  std::vector<std::string> target;
  std::vector<std::string> source;
  // Per-source-word count of aligned target words.
  std::vector<int> alignedCountS;
  // For each target position, the source positions aligned to it.
  std::vector< std::vector<int> > alignedToT;
  // Syntactic annotation for each side (may be default-[X] only).
  SyntaxTree sourceTree, targetTree;
  //typedef std::vector<Range> Inner;
  //typedef std::vector<Inner> Outer;
  //Outer m_s2tTightest, m_t2sTightest;
  SentenceAlignment();
  ~SentenceAlignment();
  // Parse one (target, source, alignment) corpus line triple.
  // NOTE(review): return-value semantics are defined in the .cpp — confirm
  // whether non-zero means success before relying on it.
  int Create(const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global);
  // void clear() { delete(alignment); };
  // Find consistent phrase pairs ("tunnels"); must run before CreateLattice().
  void FindTunnels( const Global &global ) ;
  // Build the arc lattice from the tunnels; must run before CreateRules().
  void CreateLattice(const Global &global);
  // Extract translation rules from the lattice.
  void CreateRules(const Global &global);
  const TunnelCollection &GetTunnelCollection() const
  {
    assert(m_tunnelCollection);
    return *m_tunnelCollection;
  }
  const Lattice &GetLattice() const
  {
    assert(m_lattice);
    return *m_lattice;
  }
protected:
  // Owned; created by FindTunnels()/CreateLattice() respectively.
  TunnelCollection *m_tunnelCollection;
  Lattice *m_lattice;
  /*
  void CalcTightestSpan(Outer &tightest);
  void InitTightest(Outer &tightest, size_t len);
  Range &GetTightest(Outer &tightest, size_t startPos, size_t endPos);
  void SetAlignment(size_t source, size_t target);
  void SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos);
  */
};

View File

@ -0,0 +1,101 @@
/*
* Symbol.cpp
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include "Symbol.h"
using namespace std;
// Terminal symbol: a single word `label` at source position `pos`.
// m_span holds 2 value-initialized (0,0) pairs; only span[0].first is set.
// FIX: initializer list reordered to match member declaration order
// (m_label, m_span, m_isTerminal) to silence -Wreorder, and the syntax
// flags are now explicitly initialized (they were left indeterminate).
Symbol::Symbol(const std::string &label, size_t pos)
  :m_label(label)
  ,m_span(2)
  ,m_isTerminal(true)
  ,m_isSourceSyntax(false)
  ,m_isTargetSyntax(false)
{
  m_span[0].first = pos;
}
// Non-terminal symbol spanning [startS,endS] on the source and
// [startT,endT] on the target, with a label per side.
// FIX: initializer list reordered to match member declaration order
// (m_label, m_labelT, m_span, m_isTerminal, flags) to silence -Wreorder;
// members are initialized in declaration order regardless of list order.
Symbol::Symbol(const std::string &labelS, const std::string &labelT
               , size_t startS, size_t endS
               , size_t startT, size_t endT
               , bool isSourceSyntax, bool isTargetSyntax)
  :m_label(labelS)
  ,m_labelT(labelT)
  ,m_span(2)
  ,m_isTerminal(false)
  ,m_isSourceSyntax(isSourceSyntax)
  ,m_isTargetSyntax(isTargetSyntax)
{
  m_span[0] = std::pair<size_t, size_t>(startS, endS);
  m_span[1] = std::pair<size_t, size_t>(startT, endT);
}
// Three-way comparison of one side (source or target) of two non-terminals.
// Ordering: a syntactic label (e.g. [NP]) sorts before the generic [X];
// among syntactic labels, order by span, then by label string.
// Returns -1 / +1 / 0 (equal).
// FIX: label parameters are now passed by const reference instead of by
// value, avoiding two string copies per call; callers are unaffected.
int CompareNonTerm(bool thisIsSyntax, bool otherIsSyntax
                   , const std::pair<size_t, size_t> &thisSpan, const std::pair<size_t, size_t> &otherSpan
                   , const std::string &thisLabel, const std::string &otherLabel)
{
  if (thisIsSyntax != otherIsSyntax)
  { // 1 is [X] & the other is [NP] on the source: syntax sorts first
    return thisIsSyntax ? -1 : +1;
  }
  if (thisIsSyntax)
  { // both syntactic: compare span, then label
    if (thisSpan != otherSpan)
      return thisSpan < otherSpan ? -1 : +1;
    if (thisLabel != otherLabel)
      return thisLabel < otherLabel ? -1 : +1;
  }
  // both generic, or identical
  return 0;
}
// Three-way comparison of two symbols; defines a total order used when
// symbol sequences are compared element-wise.
// Terminals sort before non-terminals. Terminals order by source position,
// then label; non-terminals order by source side, then target side.
int Symbol::Compare(const Symbol &other) const
{
  if (m_isTerminal != other.m_isTerminal)
    return m_isTerminal ? -1 : +1;
  assert(m_isTerminal == other.m_isTerminal);
  if (m_isTerminal)
  { // compare labels & pos
    if (m_span[0].first != other.m_span[0].first)
      return (m_span[0].first < other.m_span[0].first) ? -1 : +1;
    if (m_label != other.m_label)
      return (m_label < other.m_label) ? -1 : +1;
  }
  else
  { // non terms: source side first, target side as tie-breaker
    int ret = CompareNonTerm(m_isSourceSyntax, other.m_isSourceSyntax
                             ,m_span[0], other.m_span[0]
                             ,m_label, other.m_label);
    if (ret != 0)
      return ret;
    // NOTE(review): the target-side comparison also passes m_label, not
    // m_labelT — looks like it should use the target label; confirm.
    ret = CompareNonTerm(m_isTargetSyntax, other.m_isTargetSyntax
                         ,m_span[1], other.m_span[1]
                         ,m_label, other.m_label);
    if (ret != 0)
      return ret;
  }
  return 0;
}
// Terminals print their word; non-terminals print source and target labels
// concatenated (same characters as the original string concatenation).
std::ostream& operator<<(std::ostream &out, const Symbol &obj)
{
  if (obj.m_isTerminal)
  {
    out << obj.m_label;
  }
  else
  {
    out << obj.m_label << obj.m_labelT;
  }
  return out;
}

View File

@ -0,0 +1,36 @@
#pragma once
/*
* Symbol.h
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <string>
#include <iostream>
#include <vector>
// One element of a rule's symbol sequence: either a terminal word or a
// non-terminal with a label and span on each side.
class Symbol
{
  friend std::ostream& operator<<(std::ostream &out, const Symbol &obj);
protected:
  std::string m_label, m_labelT; // m_labelT only for non-term
  // span[0] = (start,end) on source, span[1] = (start,end) on target;
  // for terminals only span[0].first (the word position) is meaningful.
  std::vector<std::pair<size_t, size_t> > m_span;
  // syntax flags distinguish real labels (e.g. [NP]) from the generic [X]
  bool m_isTerminal, m_isSourceSyntax, m_isTargetSyntax;
public:
  // for terminals
  Symbol(const std::string &label, size_t pos);
  // for non-terminals
  Symbol(const std::string &labelS, const std::string &labelT
         , size_t startS, size_t endS
         , size_t startT, size_t endT
         , bool isSourceSyntax, bool isTargetSyntax);
  // three-way comparison: -1/0/+1 (total order; see Symbol.cpp)
  int Compare(const Symbol &other) const;
};

View File

@ -0,0 +1,56 @@
/*
* SymbolSequence.cpp
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <cassert>
#include <sstream>
#include "SymbolSequence.h"
using namespace std;
// Three-way comparison: shorter sequences sort first; sequences of equal
// length are compared element-wise. Returns -1 / +1 / 0 (equal).
// BUG FIX: the original declared `int ret;` without initializing it and
// executed `assert(ret == 0); return ret;` on the all-equal (including
// empty) path — reading an uninitialized variable is undefined behavior.
int SymbolSequence::Compare(const SymbolSequence &other) const
{
  size_t thisSize = GetSize();
  size_t otherSize = other.GetSize();
  if (thisSize != otherSize)
  {
    return (thisSize < otherSize) ? -1 : +1;
  }
  for (size_t ind = 0; ind < thisSize; ++ind)
  {
    const Symbol &thisSymbol = GetSymbol(ind);
    const Symbol &otherSymbol = other.GetSymbol(ind);
    int ret = thisSymbol.Compare(otherSymbol);
    if (ret != 0)
    {
      return ret;
    }
  }
  // same length, all elements equal
  return 0;
}
// Print the symbols space-separated (a trailing space is emitted).
std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj)
{
  SymbolSequence::CollType::const_iterator iterSymbol = obj.m_coll.begin();
  for (; iterSymbol != obj.m_coll.end(); ++iterSymbol)
  {
    out << *iterSymbol << " ";
  }
  return out;
}

View File

@ -0,0 +1,42 @@
#pragma once
/*
* SymbolSequence.h
* extract
*
* Created by Hieu Hoang on 21/07/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <iostream>
#include <vector>
#include "Symbol.h"
// An ordered list of Symbols (one side of an extracted rule), with a
// length-first lexicographic comparison for use as a map/set key.
class SymbolSequence
{
  friend std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj);
protected:
  typedef std::vector<Symbol> CollType;
  CollType m_coll;
public:
  typedef CollType::iterator iterator;
  typedef CollType::const_iterator const_iterator;
  const_iterator begin() const { return m_coll.begin(); }
  const_iterator end() const { return m_coll.end(); }
  // append a symbol (copied) to the end of the sequence
  void Add(const Symbol &symbol)
  {
    m_coll.push_back(symbol);
  }
  size_t GetSize() const
  { return m_coll.size(); }
  // no bounds checking; ind must be < GetSize()
  const Symbol &GetSymbol(size_t ind) const
  { return m_coll[ind]; }
  void Clear()
  { m_coll.clear(); }
  // three-way comparison: -1/0/+1; shorter sequences sort first
  int Compare(const SymbolSequence &other) const;
};

View File

@ -0,0 +1,245 @@
// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <cassert>
#include "SyntaxTree.h"
//#include "extract.h"
#include "Global.h"
//extern const Global g_debug;
extern const Global *g_global;
using namespace std;
// A node carries real syntax unless it has the generic default label "[X]".
bool SyntaxNode::IsSyntax() const
{
  return GetLabel() != "[X]";
}
// Construct an empty tree; the default LHS is the generic [X] over (0,0).
// NOTE(review): m_top is never initialized here (and appears unused in this
// file) — confirm before dereferencing it anywhere.
SyntaxTree::SyntaxTree()
  :m_defaultLHS(0,0, "[X]")
{
  // m_emptyNode is default-constructed empty; this clear() is a no-op kept
  // from the original code.
  m_emptyNode.clear();
}
// Free every node allocated by AddNode(); m_index only holds aliases into
// m_nodes, so deleting here releases everything exactly once.
// FIX: loop index changed from int to size_t to avoid the signed/unsigned
// comparison with m_nodes.size().
SyntaxTree::~SyntaxTree()
{
  for (size_t i = 0; i < m_nodes.size(); i++)
  {
    delete m_nodes[i];
  }
}
bool HasDuplicates(const SyntaxNodes &nodes)
{
string prevLabel;
SyntaxNodes::const_iterator iter;
for (iter = nodes.begin(); iter != nodes.end(); ++iter)
{
const SyntaxNode &node = **iter;
string label = node.GetLabel();
if (label == prevLabel)
return true;
}
return false;
}
// Create a node labeled "[label]" over [startPos,endPos] and register it in
// the span index. With --UppermostOnly, each span keeps only the most
// recently added label.
void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
{
  SyntaxNode* newNode = new SyntaxNode( startPos, endPos, "[" + label + "]");
  m_nodes.push_back( newNode );
  SyntaxNodes &nodesChart = m_index[ startPos ][ endPos ];
  if (g_global->uppermostOnly)
  {
    if (!nodesChart.empty())
    {
      assert(nodesChart.size() == 1);
      nodesChart.resize(0);
    }
    assert(nodesChart.empty());
    nodesChart.push_back( newNode );
  }
  else
  {
    nodesChart.push_back( newNode );
  }
}
// For every labeled span of length >= 2, greedily decompose it into the
// largest labeled child subspans, left to right. Returns, for each parent
// span, the list of split points (the parent's start plus the end of each
// child found).
// NOTE(review): if some position inside a parent span is not covered by any
// labeled child, the while-loop makes no progress and never terminates —
// callers presumably guarantee full coverage (e.g. via AddDefaultNonTerms);
// confirm.
ParentNodes SyntaxTree::Parse() {
  ParentNodes parents;
  int size = m_index.size();
  // looping through all spans of size >= 2
  for( int length=2; length<=size; length++ )
  {
    for( int startPos = 0; startPos <= size-length; startPos++ )
    {
      if (HasNode( startPos, startPos+length-1 ))
      {
        // processing one (parent) span
        SplitPoints splitPoints;
        splitPoints.push_back( startPos );
        // `first` widens the search by one on the first pass so a child may
        // cover the whole parent minus one position
        int first = 1;
        int covered = 0;
        while( covered < length )
        {
          // find largest covering subspan (child)
          // starting at last covered position
          for( int midPos=length-first; midPos>covered; midPos-- )
          {
            if( HasNode( startPos+covered, startPos+midPos-1 ) )
            {
              covered = midPos;
              splitPoints.push_back( startPos+covered );
              first = 0;
            }
          }
        }
        parents.push_back( splitPoints );
      }
    }
  }
  return parents;
}
bool SyntaxTree::HasNode( int startPos, int endPos ) const
{
return GetNodes( startPos, endPos).size() > 0;
}
// All nodes covering exactly [startPos,endPos]; returns a shared empty list
// (m_emptyNode) when the span is not in the index.
const SyntaxNodes &SyntaxTree::GetNodes( int startPos, int endPos ) const
{
  SyntaxTreeIndexIterator outerIter = m_index.find( startPos );
  if (outerIter == m_index.end())
    return m_emptyNode;
  SyntaxTreeIndexIterator2 innerIter = outerIter->second.find( endPos );
  if (innerIter == outerIter->second.end())
    return m_emptyNode;
  return innerIter->second;
}
// Render the tree chart (see operator<<) into a string.
std::string SyntaxTree::ToString() const
{
  std::ostringstream buffer;
  buffer << *this;
  return buffer.str();
}
// Blanket the chart with default [X] non-terminals over every span.
// NOTE(review): the bounds are asymmetric (startPos <= phraseSize but
// endPos < phraseSize), unlike the addEverywhere overload below which uses
// endPos <= phraseSize — confirm this difference is intentional.
void SyntaxTree::AddDefaultNonTerms(size_t phraseSize)
{
  for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
  {
    for (size_t endPos = startPos; endPos < phraseSize; ++endPos)
    {
      AddNode(startPos, endPos, "X");
    }
  }
}
// Dispatch for default-[X] insertion: a non-syntactic side gets [X] over
// every span; a syntactic side adds [X] everywhere only when labels are
// not being mixed.
void SyntaxTree::AddDefaultNonTerms(bool isSyntax, bool mixed, size_t phraseSize)
{
  if (!isSyntax)
  {
    AddDefaultNonTerms(phraseSize);
    return;
  }
  AddDefaultNonTerms(!mixed, phraseSize);
}
void SyntaxTree::AddDefaultNonTerms(bool addEverywhere, size_t phraseSize)
{
//cerr << "GetNumWords()=" << GetNumWords() << endl;
//assert(phraseSize == GetNumWords() || GetNumWords() == 1); // 1 if syntax sentence doesn't have any xml. TODO fix syntax tree obj
for (size_t startPos = 0; startPos <= phraseSize; ++startPos)
{
for (size_t endPos = startPos; endPos <= phraseSize; ++endPos)
{
const SyntaxNodes &nodes = GetNodes(startPos, endPos);
if (!addEverywhere && nodes.size() > 0)
{ // only add if no label
continue;
}
AddNode(startPos, endPos, "X");
}
}
}
// Candidate left-hand sides for a span; falls back to the shared generic
// [X] node when the span carries no label. Returned by value (a copy).
const SyntaxNodes SyntaxTree::GetNodesForLHS( int startPos, int endPos ) const
{
  SyntaxNodes result(GetNodes(startPos, endPos));
  if (result.empty())
  {
    result.push_back(&m_defaultLHS);
  }
  return result;
}
// Pretty-print the span chart: one row per span length, each cell showing
// the first label for that span padded/truncated to 7 characters, or
// "-------" for unlabeled spans.
// FIX: `size` changed from int to size_t; the original compared an int
// against size_t loop counters (signed/unsigned mismatch) and computed
// size-length in mixed types.
std::ostream& operator<<(std::ostream& os, const SyntaxTree& t)
{
  size_t size = t.m_index.size();
  for (size_t length = 1; length <= size; length++)
  {
    // indent each row by its span length
    for (size_t space = 0; space < length; space++)
    {
      os << " ";
    }
    for (size_t start = 0; start <= size - length; start++)
    {
      if (t.HasNode( start, start+(length-1) ))
      {
        // pad with '#' so substr(0,7) always yields 7 characters
        std::string label = t.GetNodes( start, start+(length-1) )[0]->GetLabel() + "#######";
        os << label.substr(0,7) << " ";
      }
      else
      {
        os << "------- ";
      }
    }
    os << std::endl;
  }
  return os;
}

View File

@ -0,0 +1,96 @@
#pragma once
// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <string>
#include <vector>
#include <map>
#include <sstream>
class SyntaxNode;
typedef std::vector<const SyntaxNode*> SyntaxNodes;
// One labeled span [m_start, m_end] in a syntax chart. Labels are stored
// bracketed, e.g. "[NP]" or the generic "[X]" (see SyntaxTree::AddNode).
class SyntaxNode {
protected:
  int m_start, m_end;
  std::string m_label;
  // NOTE(review): m_children and m_parent are never set or read in this
  // file, and m_parent is left uninitialized by the constructor — confirm
  // they are dead before using them.
  SyntaxNodes m_children;
  SyntaxNode* m_parent;
public:
  SyntaxNode( int startPos, int endPos, const std::string &label)
    :m_start(startPos)
    ,m_end(endPos)
    ,m_label(label)
  {}
  int GetStart() const
  { return m_start; }
  int GetEnd() const
  { return m_end; }
  const std::string &GetLabel() const
  { return m_label; }
  // true unless the label is the generic "[X]"
  bool IsSyntax() const;
};
typedef std::vector< int > SplitPoints;
typedef std::vector< SplitPoints > ParentNodes;
// A chart of labeled spans over one sentence. Nodes are owned by m_nodes
// (freed in the destructor); m_index[start][end] holds non-owning aliases
// for span lookup.
class SyntaxTree {
protected:
  SyntaxNodes m_nodes;          // owning list of all nodes ever added
  // NOTE(review): m_top is never initialized or used in SyntaxTree.cpp —
  // confirm before relying on it.
  SyntaxNode* m_top;
  SyntaxNode m_defaultLHS;      // shared fallback [X] node for GetNodesForLHS
  typedef std::map< int, SyntaxNodes > SyntaxTreeIndex2;
  typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
  typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
  typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
  SyntaxTreeIndex m_index;      // start pos -> end pos -> nodes
  SyntaxNodes m_emptyNode;      // shared empty result for GetNodes misses
  friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);
public:
  SyntaxTree();
  ~SyntaxTree();
  // adds a node labeled "[label]"; honors the global uppermostOnly switch
  void AddNode( int startPos, int endPos, std::string label );
  // greedy decomposition of labeled spans into child split points
  ParentNodes Parse();
  bool HasNode( int startPos, int endPos ) const;
  const SyntaxNodes &GetNodes( int startPos, int endPos ) const;
  const SyntaxNodes &GetAllNodes() const { return m_nodes; } ;
  // number of distinct start positions in the index, used as sentence length
  size_t GetNumWords() const { return m_index.size(); }
  std::string ToString() const;
  void AddDefaultNonTerms(bool isSyntax, bool addEverywhere, size_t phraseSize);
  void AddDefaultNonTerms(bool mixed, size_t phraseSize);
  void AddDefaultNonTerms(size_t phraseSize);
  // returns a copy; falls back to the shared default [X] node when unlabeled
  const SyntaxNodes GetNodesForLHS( int startPos, int endPos ) const;
};
std::ostream& operator<<(std::ostream&, const SyntaxTree&);

View File

@ -0,0 +1,38 @@
/*
* Tunnel.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "Tunnel.h"
// Three-way comparison: source range is the primary key, target range the
// tie-breaker.
int Tunnel::Compare(const Tunnel &other) const
{
  int cmp = m_sourceRange.Compare(other.m_sourceRange);
  if (cmp == 0)
  {
    cmp = m_targetRange.Compare(other.m_targetRange);
  }
  return cmp;
}
int Tunnel::Compare(const Tunnel &other, size_t direction) const
{
const Range &thisRange = (direction == 0) ? m_sourceRange : m_targetRange;
const Range &otherRange = (direction == 0) ? other.m_sourceRange : other.m_targetRange;
int ret = thisRange.Compare(otherRange);
return ret;
}
// Debug format: "<sourceRange>==><targetRange>".
std::ostream& operator<<(std::ostream &out, const Tunnel &tunnel)
{
  out << tunnel.m_sourceRange;
  out << "==>";
  out << tunnel.m_targetRange;
  return out;
}

View File

@ -0,0 +1,49 @@
#pragma once
/*
* Tunnel.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include <cassert>
#include <string>
#include <iostream>
#include "Range.h"
// for unaligned source terminal
// A consistent phrase pair: a source range aligned to a target range, used
// as a potential gap ("tunnel") inside larger phrase pairs.
class Tunnel
{
  friend std::ostream& operator<<(std::ostream&, const Tunnel&);
protected:
  Range m_sourceRange, m_targetRange;
public:
  Tunnel()
  {}
  Tunnel(const Tunnel &copy)
    :m_sourceRange(copy.m_sourceRange)
    ,m_targetRange(copy.m_targetRange)
  {}
  Tunnel(const Range &sourceRange, const Range &targetRange)
    :m_sourceRange(sourceRange)
    ,m_targetRange(targetRange)
  {}
  // direction 0 = source range, anything else = target range
  const Range &GetRange(size_t direction) const
  { return (direction == 0) ? m_sourceRange : m_targetRange; }
  // three-way comparisons: source first then target, or one side only
  int Compare(const Tunnel &other) const;
  int Compare(const Tunnel &other, size_t direction) const;
};
typedef std::vector<Tunnel> TunnelList;

View File

@ -0,0 +1,70 @@
/*
* TunnelCollection.cpp
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "TunnelCollection.h"
#include "Range.h"
using namespace std;
// Count words with zero alignment points in [startPos,endPos] on one side:
// direction 0 = source (alignedCountS), direction 1 = target (alignedCountT).
size_t TunnelCollection::NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const
{
  assert(startPos <= endPos);
  if (direction == 0)
    assert(endPos < alignedCountS.size());
  else
    assert(endPos < alignedCountT.size());
  size_t numUnaligned = 0;
  for (size_t pos = startPos; pos <= endPos; ++pos)
  {
    if (direction == 0)
    {
      if (alignedCountS[pos] == 0)
        ++numUnaligned;
    }
    else if (direction == 1)
    {
      if (alignedCountT[pos] == 0)
        ++numUnaligned;
    }
  }
  return numUnaligned;
}
// Record a consistent phrase pair: m_coll is indexed by source start
// position and source length (endS - startS).
void TunnelCollection::Add(int startS, int endS, int startT, int endT)
{
  // m_phraseExist[startS][endS - startS].push_back(Tunnel(startT, endT));
  m_coll[startS][endS - startS].push_back(Tunnel(Range(startS, endS), Range(startT, endT)));
}
std::ostream& operator<<(std::ostream &out, const TunnelCollection &TunnelCollection)
{
size_t size = TunnelCollection.GetSize();
for (size_t startPos = 0; startPos < size; ++startPos)
{
for (size_t endPos = startPos; endPos < size; ++endPos)
{
const TunnelList &tunnelList = TunnelCollection.GetTunnels(startPos, endPos);
TunnelList::const_iterator iter;
for (iter = tunnelList.begin(); iter != tunnelList.end(); ++iter)
{
const Tunnel &tunnel = *iter;
out << tunnel << " ";
}
}
}
return out;
}

View File

@ -0,0 +1,61 @@
#pragma once
/*
* TunnelCollection.h
* extract
*
* Created by Hieu Hoang on 19/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include <vector>
#include "Tunnel.h"
// reposity of extracted phrase pairs
// which are potential tunnels in larger phrase pairs
// Repository of extracted phrase pairs ("tunnels"), indexed by source start
// position and source length, which may become gaps in larger phrase pairs.
class TunnelCollection
{
  friend std::ostream& operator<<(std::ostream&, const TunnelCollection&);
protected:
  std::vector< std::vector<TunnelList> > m_coll;
  // indexed by source pos. and source length
  // maps to list of tunnels where <int, int> are target pos
public:
  // per-word alignment counts, filled in by the owner (SentenceAlignment)
  std::vector<int> alignedCountS, alignedCountT;
  // NOTE(review): copy constructor is declared but no definition is visible
  // here — presumably to forbid copying; confirm.
  TunnelCollection(const TunnelCollection &);
  // size = length of the source sentence; pre-creates empty tunnel lists
  // for every (start, length) combination
  TunnelCollection(size_t size)
    :m_coll(size)
  {
    for (size_t pos = 0; pos < size; ++pos)
    {
      std::vector<TunnelList> &endVec = m_coll[pos];
      endVec.resize(size - pos);
    }
  }
  // record the tunnel [startS,endS] ==> [startT,endT]
  void Add(int startS, int endS, int startT, int endT);
  // all tunnels whose source span is exactly [startS,endS]
  const TunnelList &GetTunnels(int startS, int endS) const
  {
    const TunnelList &sourceHoles = m_coll[startS][endS - startS];
    return sourceHoles;
  }
  // FIX: dropped the meaningless top-level const on the return type
  // (`const size_t` by value); callers are unaffected.
  size_t GetSize() const
  { return m_coll.size(); }
  // count unaligned words on one side (0 = source, 1 = target)
  size_t NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const;
};

View File

@ -0,0 +1,344 @@
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <vector>
#include <string>
#include <set>
#include <iostream>
#include <stdlib.h>
#include "SyntaxTree.h"
using namespace std;
// Split `str` on runs of delimiter characters (default: space and tab);
// empty tokens are never produced.
inline std::vector<std::string> Tokenize(const std::string& str,
                                         const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;
  // start of the first token (npos if none)
  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
  // first delimiter after it
  std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
  while (tokenStart != std::string::npos || tokenEnd != std::string::npos)
  {
    tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
    tokenEnd = str.find_first_of(delimiters, tokenStart);
  }
  return tokens;
}
// Return `str` with leading and trailing characters from `dropChars`
// removed (defaults to whitespace). Returns "" for all-drop or empty input.
// FIX: dropChars is now passed by const reference (the original copied the
// string on every call), and the erase positions are computed from `res`
// consistently instead of mixing `str` and `res`.
const std::string Trim(const std::string& str, const std::string& dropChars = " \t\n\r")
{
  std::string res = str;
  res.erase(res.find_last_not_of(dropChars) + 1);   // strip trailing
  res.erase(0, res.find_first_not_of(dropChars));   // strip leading
  return res;
}
// Extract the value of `attributeName="..."` from an XML tag body, skipping
// escaped quotes (\") inside the value. Returns "" when the attribute is
// absent or the tag is malformed.
// BUG FIX: the closing-quote search previously started at contentsStart+1,
// so an empty attribute value (attr="") skipped its own closing quote and
// was wrongly reported as malformed; the search now starts at contentsStart.
// NOTE(review): the attribute is located with a plain substring search, so
// an attribute name that is a suffix of another (e.g. "label" inside
// "mylabel=") can be matched incorrectly — pre-existing limitation.
std::string ParseXmlTagAttribute(const std::string& tag, const std::string& attributeName)
{
  /*TODO deal with unescaping \"*/
  std::string tagOpen = attributeName + "=\"";
  size_t contentsStart = tag.find(tagOpen);
  if (contentsStart == std::string::npos) return "";
  contentsStart += tagOpen.size();
  size_t contentsEnd = tag.find_first_of('"', contentsStart);
  if (contentsEnd == std::string::npos) {
    std::cerr << "Malformed XML attribute: " << tag;
    return "";
  }
  // advance past quotes that are escaped with a backslash
  size_t possibleEnd;
  while (contentsEnd > contentsStart
         && tag.at(contentsEnd - 1) == '\\'
         && (possibleEnd = tag.find_first_of('"', contentsEnd + 1)) != std::string::npos) {
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart, contentsEnd - contentsStart);
}
/**
* Remove "<" and ">" from XML tag
*
* \param str xml token to be stripped
*/
// Strip the surrounding "<" and ">" from an XML token; anything that is not
// a bracketed tag (including strings shorter than 2 chars) is returned
// unchanged.
std::string TrimXml(const std::string& str)
{
  if (str.size() < 2)
    return str;
  if (str[0] == '<' && str[str.size() - 1] == '>')
    return str.substr(1, str.size() - 2);
  return str;
}
/**
* Check if the token is an XML tag, i.e. starts with "<"
*
* \param tag token to be checked
*/
// An XML tag token starts with '<'. The explicit empty() guard makes the
// empty-string case obvious (const operator[] on an empty string returns
// '\0', so the result is identical).
bool isXmlTag(const std::string& tag)
{
  return !tag.empty() && tag[0] == '<';
}
/**
* Split up the input character string into tokens made up of
* either XML tags or text.
* example: this <b> is a </b> test .
* => (this ), (<b>), ( is a ), (</b>), ( test .)
*
* \param str input string
*/
// Split a line into alternating text and XML-tag tokens.
// example: "this <b> is a </b> test ."
//   => ("this "), ("<b>"), (" is a "), ("</b>"), (" test .")
// On a '<' with no matching '>' the error is reported and the tokens found
// so far are returned.
inline std::vector<std::string> TokenizeXml(const std::string& str)
{
  std::vector<std::string> tokens;
  std::string::size_type cur = 0;   // current scan position
  while (cur != str.size())
  {
    // locate the next tag opening
    std::string::size_type open = str.find_first_of("<", cur);
    if (open == std::string::npos)
    {
      // no more tags: the remainder is one text token
      tokens.push_back(str.substr(cur));
      break;
    }
    // locate the matching close; it must exist
    std::string::size_type close = str.find_first_of(">", open);
    if (close == std::string::npos)
    {
      std::cerr << "ERROR: malformed XML: " << str << std::endl;
      return tokens;
    }
    // text before the tag, if any
    if (open > cur)
      tokens.push_back(str.substr(cur, open - cur));
    // the tag itself, brackets included
    tokens.push_back(str.substr(open, close - open + 1));
    cur = close + 1;
  }
  return tokens;
}
/**
* Process a sentence with xml annotation
* Xml tags may specifiy additional/replacing translation options
* and reordering constraints
*
* \param line in: sentence, out: sentence without the xml
* \param res vector with translation options specified by xml
* \param reorderingConstraint reordering constraint zones specified by xml
* \param walls reordering constraint walls specified by xml
*/
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
is so we can link things up afterwards. We can't create TranslationOptions as we
parse because we don't have the completed source parsed until after this function
removes all the markup from it (CreateFromString in Sentence::Read).
*/
// Strip XML markup from `line` (in place), recording each well-formed tag as
// a labeled span in `tree`, collecting every label seen in `labelCollection`
// and counting whole-sentence labels in `topLabelCollection`.
// Returns false on malformed markup (empty tag name, mismatched or unclosed
// tags, bad span attribute, zero-width span); true otherwise.
bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) {
  //parse XML markup in translation line
  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) { return true; }
  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);
  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags
  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)
  // NOTE(review): isLinked is set but never read in this function.
  bool isLinked = false;
  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
  {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos]))
    {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ')
      {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      // re-tokenizes the whole clean line each time (O(n^2) overall, fine
      // for sentence-length input)
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }
    // process xml tag
    else
    {
      // *** get essential information about tag ***
      // strip extra boundary spaces and "<" and ">"
      string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
      // cerr << "XML TAG IS: " << tag << std::endl;
      if (tag.size() == 0)
      {
        cerr << "ERROR: empty tag name: " << line << endl;
        return false;
      }
      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );
      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;
      if (isClosed && isUnary)
      {
        cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
        return false;
      }
      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end
      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }
      // *** process new tag ***
      if (isOpen || isUnary)
      {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
      }
      // *** process completed tag ***
      if (isClosed || isUnary)
      {
        // pop last opened tag from stack;
        if (tagStack.size() == 0)
        {
          cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();
        // tag names have to match
        if (openedTag.first != tagName)
        {
          cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
          return false;
        }
        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;
        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty())
        {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }
        // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
        // a tag must cover at least one word
        if (startPos >= endPos)
        {
          cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
          return false;
        }
        string label = ParseXmlTagAttribute(tagContent,"label");
        labelCollection.insert( label );
        // report what we have processed so far
        if (0) {
          cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
          cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
          cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
        }
        // record the labeled span (endPos is exclusive, AddNode inclusive)
        tree.AddNode( startPos, endPos-1, label );
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0)
  {
    cerr << "ERROR: some opened tags were never closed: " << line << endl;
    return false;
  }
  // collect top labels
  // NOTE(review): if the line contained only tags, wordPos is 0 and
  // wordPos-1 underflows; GetNodes then simply finds nothing — confirm.
  const SyntaxNodes &topNodes = tree.GetNodes( 0, wordPos-1 );
  for( SyntaxNodes::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ )
  {
    const SyntaxNode *n = *node;
    const string &label = n->GetLabel();
    if (topLabelCollection.find( label ) == topLabelCollection.end())
      topLabelCollection[ label ] = 0;
    topLabelCollection[ label ]++;
  }
  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}

View File

@ -0,0 +1,35 @@
#pragma once
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <string>
#include <vector>
#include <set>
#include <map>
#include "SyntaxTree.h"
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
std::string TrimXml(const std::string& str);
bool isXmlTag(const std::string& tag);
inline std::vector<std::string> TokenizeXml(const std::string& str);
bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection );

View File

@ -0,0 +1,310 @@
// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <cstdio>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <cstring>
#include <sstream>
#include <iostream>
#include "extract.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "Lattice.h"
#ifdef WIN32
// Include Visual Leak Detector
#include <vld.h>
#endif
using namespace std;
void writeGlueGrammar(const string &, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
int main(int argc, char* argv[])
{
cerr << "Extract v2.0, written by Philipp Koehn\n"
<< "rule extraction from an aligned parallel corpus\n";
//time_t starttime = time(NULL);
Global *global = new Global();
g_global = global;
int sentenceOffset = 0;
if (argc < 5) {
cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
<< " [ --Hierarchical | --Orientation"
<< " | --GlueGrammar FILE | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
<< " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
<< " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
<< " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
<< " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"
<< " | --MaxSymbols[" << global->maxSymbols<< "]"
<< " | --MaxNonTerm[" << global->maxNonTerm << "]"
<< " | --SourceSyntax | --TargetSyntax"
<< " | --UppermostOnly[" << g_global->uppermostOnly << "]"
<< endl;
exit(1);
}
char* &fileNameT = argv[1];
char* &fileNameS = argv[2];
char* &fileNameA = argv[3];
string fileNameGlueGrammar;
string fileNameUnknownWordLabel;
string fileNameExtract = string(argv[4]);
int optionInd = 5;
for(int i=optionInd;i<argc;i++)
{
if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
global->minHoleSpanSourceDefault = atoi(argv[++i]);
if (global->minHoleSpanSourceDefault < 1) {
cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
global->maxHoleSpanSourceDefault = atoi(argv[++i]);
if (global->maxHoleSpanSourceDefault < 1) {
cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
global->minHoleSpanSourceSyntax = atoi(argv[++i]);
if (global->minHoleSpanSourceSyntax < 1) {
cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
exit(1);
}
}
else if (strcmp(argv[i],"--UppermostOnly") == 0) {
global->uppermostOnly = atoi(argv[++i]);
}
else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
if (global->maxHoleSpanSourceSyntax < 1) {
cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
exit(1);
}
}
// maximum number of words in hierarchical phrase
else if (strcmp(argv[i],"--maxSymbols") == 0) {
global->maxSymbols = atoi(argv[++i]);
if (global->maxSymbols < 1) {
cerr << "extract error: --maxSymbols should be at least 1" << endl;
exit(1);
}
}
// maximum number of non-terminals
else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
global->maxNonTerm = atoi(argv[++i]);
if (global->maxNonTerm < 1) {
cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
exit(1);
}
}
// allow consecutive non-terminals (X Y | X Y)
else if (strcmp(argv[i],"--TargetSyntax") == 0) {
global->targetSyntax = true;
}
else if (strcmp(argv[i],"--SourceSyntax") == 0) {
global->sourceSyntax = true;
}
// do not create many part00xx files!
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
// now default
}
else if (strcmp(argv[i],"--GlueGrammar") == 0) {
global->glueGrammarFlag = true;
if (++i >= argc)
{
cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
exit(0);
}
fileNameGlueGrammar = string(argv[i]);
cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
}
else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
global->unknownWordLabelFlag = true;
if (++i >= argc)
{
cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
exit(0);
}
fileNameUnknownWordLabel = string(argv[i]);
cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
}
// TODO: this should be a useful option
//else if (strcmp(argv[i],"--ZipFiles") == 0) {
// zipFiles = true;
//}
// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
else if (strcmp(argv[i],"--Mixed") == 0) {
global->mixed = true;
}
else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
global->allowDefaultNonTermEdge = atoi(argv[++i]);
}
else if (strcmp(argv[i], "--GZOutput") == 0) {
global->gzOutput = true;
}
else if (strcmp(argv[i],"--MaxSpan") == 0) {
// ignore
++i;
}
else if (strcmp(argv[i],"--SentenceOffset") == 0) {
if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
exit(1);
}
sentenceOffset = atoi(argv[++i]);
}
else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
}
}
// open input files
Moses::InputFileStream tFile(fileNameT);
Moses::InputFileStream sFile(fileNameS);
Moses::InputFileStream aFile(fileNameA);
// open output files
string fileNameExtractInv = fileNameExtract + ".inv";
if (global->gzOutput) {
fileNameExtract += ".gz";
fileNameExtractInv += ".gz";
}
Moses::OutputFileStream extractFile;
Moses::OutputFileStream extractFileInv;
extractFile.Open(fileNameExtract.c_str());
extractFileInv.Open(fileNameExtractInv.c_str());
// loop through all sentence pairs
int i = sentenceOffset;
while(true) {
i++;
if (i % 1000 == 0) {
cerr << i << " " << flush;
}
string targetString;
string sourceString;
string alignmentString;
bool ok = getline(tFile, targetString);
if (!ok)
break;
getline(sFile, sourceString);
getline(aFile, alignmentString);
//cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;
//time_t currTime = time(NULL);
//cerr << "A " << (currTime - starttime) << endl;
SentenceAlignment sentencePair;
if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global ))
{
//cerr << sentence.sourceTree << endl;
//cerr << sentence.targetTree << endl;
sentencePair.FindTunnels(*g_global);
//cerr << "C " << (time(NULL) - starttime) << endl;
//cerr << sentencePair << endl;
sentencePair.CreateLattice(*g_global);
//cerr << "D " << (time(NULL) - starttime) << endl;
//cerr << sentencePair << endl;
sentencePair.CreateRules(*g_global);
//cerr << "E " << (time(NULL) - starttime) << endl;
//cerr << sentence.lattice->GetRules().GetSize() << endl;
sentencePair.GetLattice().GetRules().Output(extractFile);
sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
}
}
tFile.Close();
sFile.Close();
aFile.Close();
extractFile.Close();
extractFileInv.Close();
if (global->glueGrammarFlag) {
writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
}
delete global;
}
// Write the glue grammar -- top-level rules that stitch partial
// derivations together -- to `fileName` in Moses rule-table format
// ("src ||| tgt ||| score ||| alignment ...").
//
// Without target syntax a fixed S/X rule set is emitted; with target
// syntax a fresh top label is invented and one top rule per observed
// top label plus one glue rule per observed label is written.
// The 2.718 score is e (cost 1 in the log domain).
// NOTE(review): grammarFile.open() failure is not checked -- on an
// unwritable path this silently writes nothing.
void writeGlueGrammar( const string & fileName, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
{
  ofstream grammarFile;
  grammarFile.open(fileName.c_str());
  if (!options.targetSyntax) {
    // no target syntax: generic S/X glue rules only
    grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
                << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
                << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
  } else {
    // choose a top label that is not already a label: shortest prefix
    // of "QQQQQQ" that does not collide with an observed label
    string topLabel = "QQQQQQ";
    for( unsigned int i=1; i<=topLabel.length(); i++) {
      if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
        topLabel = topLabel.substr(0,i);
        break;
      }
    }
    // basic rules
    grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << endl
                << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << endl;
    // top rules: one per label observed spanning a whole sentence
    for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
         i != targetTopLabelCollection.end(); i++ ) {
      grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << endl;
    }
    // glue rules: one per observed target label
    for( set<string>::const_iterator i = targetLabelCollection.begin();
         i != targetLabelCollection.end(); i++ ) {
      grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << endl;
    }
    grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << endl; // glue rule for unknown word...
  }
  grammarFile.close();
}

View File

@ -0,0 +1,34 @@
#pragma once
#include <vector>
#include <list>
#include <map>
#include <set>
#include <string>
#include <fstream>
#include <algorithm>
#include "SyntaxTree.h"
#include "XmlTree.h"
#include "Tunnel.h"
#include "TunnelCollection.h"
#include "SentenceAlignment.h"
#include "Global.h"
// Whitespace tokenizer (defined in tables-core.cpp).
std::vector<std::string> tokenize( const char [] );

// Read one line of at most _SIZE-1 characters into _LINE, aborting the
// process when the line is too long.
// NOTE(review): uses unqualified `cerr`/`endl`, so any including file
// must have `using namespace std` in scope before expanding this macro.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
_IS.getline(_LINE, _SIZE, _DELIM); \
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
if (_IS.gcount() == _SIZE-1) { \
cerr << "Line too long! Buffer overflow. Delete lines >=" \
<< _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
<< endl; \
exit(1); \
} \
}
#define LINE_MAX_LENGTH 1000000

// NOTE(review): these are *definitions* at header scope, not `extern`
// declarations -- including this header from more than one translation
// unit would produce multiple-definition link errors (ODR violation).
// This only works while extract.h is included from a single .cpp.
const Global *g_global;
std::set< std::string > targetLabelCollection, sourceLabelCollection;
std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection;

View File

@ -0,0 +1,81 @@
#ifndef moses_gzfile_buf_h
#define moses_gzfile_buf_h
#include <streambuf>
#include <zlib.h>
#include <cstring>
/**
 * Read-only std::streambuf over a gzip-compressed file, backed by zlib.
 * Sequential reads with a small (sizeof(int)-byte) putback area;
 * writing and seeking are unsupported and abort via a bare `throw;`
 * (std::terminate when no exception is active).
 */
class gzfilebuf : public std::streambuf {
public:
  // Open `filename` for reading.  If gzopen fails, _gzf stays NULL and
  // all subsequent reads report EOF.
  gzfilebuf(const char *filename) {
    _gzf = gzopen(filename, "rb");
    setg(_buff + sizeof(int),   // beginning of putback area
         _buff + sizeof(int),   // read position
         _buff + sizeof(int));  // end position
  }

  // BUG FIX: close only a handle that was actually opened; passing a
  // NULL handle to gzclose is a zlib stream error.
  ~gzfilebuf() {
    if (_gzf != NULL) {
      gzclose(_gzf);
    }
  }

protected:
  // Writing a single character is unsupported.
  virtual int_type overflow(int_type c) {
    throw;
  }

  // Writing a block of characters is unsupported.
  virtual std::streamsize xsputn(const char* s,
                                 std::streamsize num) {
    throw;
  }

  // Seeking is unsupported.
  virtual std::streampos seekpos(std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out) {
    throw;
  }

  // Refill the get area from the compressed stream; return the next
  // character, or EOF on end of data / error / unopened file.
  virtual int_type underflow() {
    // still unread characters in the buffer?
    if (gptr() < egptr()) {
      return traits_type::to_int_type(*gptr());
    }

    // ROBUSTNESS FIX: a failed gzopen means there is nothing to read;
    // do not hand a NULL handle to gzread.
    if (_gzf == NULL) {
      return EOF;
    }

    /* process size of putback area
     * - use number of characters read
     * - but at most sizeof(int)
     */
    unsigned int numPutback = gptr() - eback();
    if (numPutback > sizeof(int)) {
      numPutback = sizeof(int);
    }

    /* copy up to sizeof(int) characters previously read into
     * the putback area (first bytes of _buff)
     */
    std::memmove(_buff + (sizeof(int) - numPutback), gptr() - numPutback,
                 numPutback);

    // read new characters from the compressed stream
    int num = gzread(_gzf, _buff + sizeof(int), _buffsize - sizeof(int));
    if (num <= 0) {
      // ERROR or EOF
      return EOF;
    }

    // reset get-area pointers
    setg(_buff + (sizeof(int) - numPutback), // beginning of putback area
         _buff + sizeof(int),                // read position
         _buff + sizeof(int) + num);         // end of buffer

    // return next character
    return traits_type::to_int_type(*gptr());
  }

  // Bulk read straight from zlib, bypassing the get area.
  // NOTE(review): this ignores buffered/putback characters; mixing
  // sgetn with single-character reads may skip data -- confirm callers
  // use only one access style.
  std::streamsize xsgetn(char* s,
                         std::streamsize num) {
    // ROBUSTNESS FIX: report "nothing read" instead of calling gzread
    // with a NULL handle when the open failed.
    if (_gzf == NULL) {
      return 0;
    }
    return gzread(_gzf, s, num);
  }

private:
  gzFile _gzf;                                // zlib handle; NULL if open failed
  static const unsigned int _buffsize = 1024; // total buffer incl. putback area
  char _buff[_buffsize];
};
#endif

View File

@ -0,0 +1,110 @@
// $Id: tables-core.cpp 3131 2010-04-13 16:29:55Z pjwilliams $
//#include "beammain.h"
//#include "SafeGetLine.h"
#include "tables-core.h"
#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"
// as in beamdecoder/tables.cpp
// Split the NUL-terminated input on runs of spaces/tabs and return the
// resulting tokens in order (as in beamdecoder/tables.cpp).
std::vector<std::string> tokenize( const char* input ) {
  std::vector<std::string> tokens;
  bool inWord = false;
  int wordStart = 0;
  int pos = 0;
  for (; input[pos] != '\0'; pos++) {
    const bool isSpace = (input[pos] == ' ' || input[pos] == '\t');
    if (inWord && isSpace) {
      // a token just ended
      tokens.push_back( std::string( input + wordStart, pos - wordStart ) );
      inWord = false;
    } else if (!inWord && !isSpace) {
      // a token just started
      wordStart = pos;
      inWord = true;
    }
  }
  // flush a token that runs to the end of the string
  if (inWord)
    tokens.push_back( std::string( input + wordStart, pos - wordStart ) );
  return tokens;
}
// Return the id for `word`, assigning the next free id (current vocab
// size) when the word has not been seen before.
WORD_ID Vocabulary::storeIfNew( const WORD& word ) {
  map<WORD, WORD_ID>::iterator found = lookup.find( word );
  if( found == lookup.end() ) {
    const WORD_ID freshId = vocab.size();
    vocab.push_back( word );
    lookup[ word ] = freshId;
    return freshId;
  }
  return found->second;
}
// Return the id for `word`, or 0 when unknown (note: 0 is also the id
// of the first stored word).
WORD_ID Vocabulary::getWordID( const WORD& word ) {
  map<WORD, WORD_ID>::iterator found = lookup.find( word );
  return ( found == lookup.end() ) ? 0 : found->second;
}
// Return the id for `phrase`, interning it with the next free id
// (current table size) when it has not been seen before.
PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase ) {
  map< PHRASE, PHRASE_ID >::iterator found = lookup.find( phrase );
  if( found == lookup.end() ) {
    const PHRASE_ID freshId = phraseTable.size();
    phraseTable.push_back( phrase );
    lookup[ phrase ] = freshId;
    return freshId;
  }
  return found->second;
}
// Return the id for `phrase`, or 0 when unknown (note: 0 is also the
// id of the first stored phrase).
PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase ) {
  map< PHRASE, PHRASE_ID >::iterator found = lookup.find( phrase );
  return ( found == lookup.end() ) ? 0 : found->second;
}
// Drop every interned phrase and the forward lookup index.
void PhraseTable::clear() {
  phraseTable.clear();
  lookup.clear();
}
// Seed the distortion table for distances -10..9 with cost -|d|.
void DTable::init() {
  int d = -10;
  while (d < 10) {
    dtable[ d ] = -abs( d );
    ++d;
  }
}
/*
void DTable::load( const string& fileName ) {
ifstream inFile;
inFile.open(fileName.c_str());
istream *inFileP = &inFile;
char line[TABLE_LINE_MAX_LENGTH];
int i=0;
while(true) {
i++;
SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
vector<string> token = tokenize( line );
if (token.size() < 2) {
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
continue;
}
int d = atoi( token[0].c_str() );
double prob = log( atof( token[1].c_str() ) );
dtable[ d ] = prob;
}
}
*/
double DTable::get( int distortion ) {
if (dtable.find( distortion ) == dtable.end())
return log( 0.00001 );
return dtable[ distortion ];
}

View File

@ -0,0 +1,72 @@
#pragma once
// $Id: tables-core.h 2416 2009-07-30 11:07:38Z hieuhoang1972 $
#include <iostream>
#include <fstream>
#include <assert.h>
#include <stdlib.h>
#include <string>
#include <queue>
#include <map>
#include <cmath>
using namespace std;
#define TABLE_LINE_MAX_LENGTH 1000
#define UNKNOWNSTR "UNK"
vector<string> tokenize( const char[] );
//! delete and remove every element of a collection object such as map, set, list etc
//! Delete every pointer element of a collection (map, set, list, ...)
//! and then empty the collection itself.
template<class COLL>
void RemoveAllInColl(COLL &coll)
{
  typename COLL::const_iterator it = coll.begin();
  while (it != coll.end()) {
    delete (*it);
    ++it;
  }
  coll.clear();
}
typedef std::string WORD;        // a surface word
typedef unsigned int WORD_ID;    // its interned integer id

// Bidirectional word <-> integer-id mapping (ids are assigned densely
// from 0 in insertion order; see tables-core.cpp).
class Vocabulary {
public:
  std::map<WORD, WORD_ID> lookup;   // word -> id
  std::vector< WORD > vocab;        // id -> word
  // Return the id for a word, assigning a fresh one if unseen.
  WORD_ID storeIfNew( const WORD& );
  // Return the id for a word, or 0 if unknown (0 is also a valid id).
  WORD_ID getWordID( const WORD& );
  // Return the word for an id.  FIX: the C-style cast `(WORD&)` that
  // stripped const is replaced by an explicit, greppable const_cast;
  // the signature (non-const ref from a const method) is kept for
  // compatibility with existing callers.
  inline WORD &getWord( WORD_ID id ) const {
    return const_cast<WORD&>( vocab[ id ] );
  }
};
typedef vector< WORD_ID > PHRASE;   // a phrase as a sequence of word ids
typedef unsigned int PHRASE_ID;     // interned phrase id

// Interns phrases: bidirectional phrase <-> integer-id mapping, with
// ids assigned densely from 0 in insertion order (see tables-core.cpp).
class PhraseTable {
public:
  map< PHRASE, PHRASE_ID > lookup;   // phrase -> id
  vector< PHRASE > phraseTable;      // id -> phrase
  // Return the id for a phrase, assigning a fresh one if unseen.
  PHRASE_ID storeIfNew( const PHRASE& );
  // Return the id for a phrase, or 0 if unknown.  NOTE(review): 0 is
  // also the id of the first stored phrase -- callers cannot tell the
  // two cases apart.
  PHRASE_ID getPhraseID( const PHRASE& );
  // Drop all interned phrases and the lookup index.
  void clear();
  // Return the phrase for an id (unchecked index).
  inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; }
};
typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC;   // (target phrase id, score) list

// Translation table: maps a phrase id to candidate phrase ids with a
// single score (ttable) or a vector of scores (ttableMulti).
// NOTE(review): no member functions are visible in this chunk; the
// exact semantics of the scores are defined by the code that fills it.
class TTable {
public:
  map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable;
  map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti;
};
// Distortion cost table: maps a distortion distance to a log-domain
// cost (see tables-core.cpp for the implementations).
class DTable {
public:
  map< int, double > dtable;   // distortion distance -> log cost
  // Fill dtable for distances in [-10, 10) with cost -|d|.
  void init();
  // Declared here, but the definition in tables-core.cpp is commented
  // out -- calling this will fail to link.
  void load( const string& );
  // Return the stored cost, or log(0.00001) for unseen distances.
  double get( int );
};

View File

@ -0,0 +1,123 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.499747849" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.798364121" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-ordering}/Debug" id="cdt.managedbuild.builder.gnu.cross.1976289814" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1324749613" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1750299246" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.719498215" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.251118848" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.99297656" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1844372739" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1178164658" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1434184833" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
<listOptionValue builtIn="false" value="boost_system-mt"/>
<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.974811544" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.904916320" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1005231499" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1318928675" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.604255673" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.818331963">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.818331963" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.818331963." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1489025499" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1052477856" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/extract-ordering}/Release" id="cdt.managedbuild.builder.gnu.cross.33925527" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.1884790737" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.197048136" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.106898878" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1920378037" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.37950410" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1197641703" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1356351201" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2053623412" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1988048517" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1494470963" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1553727957" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="extract-ordering.cdt.managedbuild.target.gnu.cross.exe.1840421491" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.c.compiler.1505710417;cdt.managedbuild.tool.gnu.c.compiler.input.106898878">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.818331963;cdt.managedbuild.config.gnu.cross.exe.release.818331963.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.157115446;cdt.managedbuild.tool.gnu.cpp.compiler.input.683027595">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964;cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127;cdt.managedbuild.config.gnu.cross.exe.debug.1624346127.;cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827;cdt.managedbuild.tool.gnu.c.compiler.input.719498215">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -0,0 +1,74 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>extract-ordering</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
<linkedResources>
<link>
<name>InputFileStream.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.cpp</locationURI>
</link>
<link>
<name>InputFileStream.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.h</locationURI>
</link>
<link>
<name>OutputFileStream.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp</locationURI>
</link>
<link>
<name>OutputFileStream.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
<name>SentenceAlignment.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.cpp</locationURI>
</link>
<link>
<name>SentenceAlignment.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h</locationURI>
</link>
<link>
<name>extract-ordering-main.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ordering-main.cpp</locationURI>
</link>
<link>
<name>tables-core.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.cpp</locationURI>
</link>
<link>
<name>tables-core.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.h</locationURI>
</link>
</linkedResources>
</projectDescription>

View File

@ -0,0 +1,124 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.1899954923" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1645930772" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/manual-label/Debug}" id="cdt.managedbuild.builder.gnu.cross.1703642277" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1888648788" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.1838052643" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.798368516" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.153015988" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.418888584" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.406065865" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1741441821" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.1626431978" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<option id="gnu.cpp.link.option.libs.1886912770" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="boost_program_options-mt"/>
</option>
<option id="gnu.cpp.link.option.paths.1541583695" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/lib64&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1367999206" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.31522559" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.826957235" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.350181339" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1335379815." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.97427761" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.564169339" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
<builder buildPath="${workspace_loc:/manual-label/Release}" id="cdt.managedbuild.builder.gnu.cross.663164336" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.2135645103" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.option.debugging.level.764935013" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1841809129" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
<option id="gnu.cpp.compiler.option.optimization.level.1877584345" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.debugging.level.935490779" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.355530813" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.940299092" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.17718999" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1527322008" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
<tool id="cdt.managedbuild.tool.gnu.cross.assembler.480337803" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1788533940" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="manual-label.cdt.managedbuild.target.gnu.cross.exe.2117548180" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1180544943;cdt.managedbuild.tool.gnu.cpp.compiler.input.1084298301">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.c.compiler.1938374607;cdt.managedbuild.tool.gnu.c.compiler.input.798368516">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1335379815;cdt.managedbuild.config.gnu.cross.exe.release.1335379815.;cdt.managedbuild.tool.gnu.cross.c.compiler.2104943437;cdt.managedbuild.tool.gnu.c.compiler.input.1841809129">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1096604639;cdt.managedbuild.config.gnu.cross.exe.debug.1096604639.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.950686503;cdt.managedbuild.tool.gnu.cpp.compiler.input.596589558">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>manual-label</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

View File

@ -0,0 +1,86 @@
#include <list>
#include "DeEn.h"
#include "moses/Util.h"
using namespace std;
extern bool g_debug;
bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str)
{
pos += offset;
if (pos >= source.size() || pos < 0) {
return false;
}
const string &word = source[pos][factor];
vector<string> soughts = Moses::Tokenize(str, " ");
for (int i = 0; i < soughts.size(); ++i) {
string &sought = soughts[i];
bool found = (word == sought);
if (found) {
return true;
}
}
return false;
}
// Return true iff any position in the inclusive range [start, end] matches
// str in the given factor (see IsA for the matching rules).
bool Contains(const Phrase &source, int start, int end, int factor, const string &str)
{
  int cursor = start;
  while (cursor <= end) {
    if (IsA(source, cursor, 0, factor, str)) {
      return true;
    }
    ++cursor;
  }
  return false;
}
// Write the sentence to out (surface factor only) with
// <tree label="reorder-label"> ... </tree> spans inserted around regions that
// look like German reordering candidates:
//  (1) a span preceded by a finite auxiliary (VAFIN) and followed by an
//      infinitive/participle (VVINF/VVPP), containing no other verb; or
//  (2) a span starting at the sentence start or after a comma ($,), followed
//      by "zu" + VVINF, containing no comma.
void LabelDeEn(const Phrase &source, ostream &out)
{
  typedef pair<int,int> Range;
  typedef list<Range> Ranges;
  Ranges ranges;

  // int copy of the length so the loop indices (which are also passed to
  // IsA/Contains as ints) compare without a signed/unsigned mismatch.
  const int size = static_cast<int>(source.size());

  // find ranges to label
  for (int start = 0; start < size; ++start) {
    for (int end = start; end < size; ++end) {
      if (IsA(source, start, -1, 1, "VAFIN")
          && IsA(source, end, +1, 1, "VVINF VVPP")
          && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) {
        ranges.push_back(Range(start, end));
      } else if ((start == 0 || IsA(source, start, -1, 1, "$,"))
                 && IsA(source, end, +1, 0, "zu")
                 && IsA(source, end, +2, 1, "VVINF")
                 && !Contains(source, start, end, 1, "$,")) {
        ranges.push_back(Range(start, end));
      }
    }
  }

  // output sentence, with labels
  for (int pos = 0; pos < size; ++pos) {
    // open every range that begins at this position
    for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
      if (iter->first == pos) {
        out << "<tree label=\"reorder-label\"> ";
      }
    }

    const Word &word = source[pos];
    out << word[0] << " ";

    // close every range that ends at this position
    for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) {
      if (iter->second == pos) {
        out << "</tree> ";
      }
    }
  }
  out << endl;
}

View File

@ -0,0 +1,10 @@
#pragma once
#include <iostream>
#include <vector>
#include <string>
// A word is a vector of factors; usage in DeEn.cpp treats factor 0 as the
// surface form and factor 1 as the POS tag -- TODO confirm factor layout.
typedef std::vector<std::string> Word;
// A sentence is a sequence of factored words.
typedef std::vector<Word> Phrase;
// Write the sentence (surface factor only) to out, wrapping detected
// reordering-candidate spans in <tree label="reorder-label"> markup.
void LabelDeEn(const Phrase &source, std::ostream &out);

View File

@ -0,0 +1,13 @@
# Standalone build for the manual-label tool.
all: manual-label
clean:
	rm -f *.o manual-label
# Suffix rule: compile any .cpp against the Moses source tree root.
# NOTE(review): g++ clamps -O6 to its maximum optimization level (-O3).
.cpp.o:
	g++ -I../../../ -O6 -g -c $<
# Link; needs zlib and Boost.Program_options (multithreaded variant).
manual-label: DeEn.o manual-label.o
	g++ DeEn.o manual-label.o -lz -lboost_program_options-mt -o manual-label

View File

@ -0,0 +1,88 @@
#include <iostream>
#include <cstdlib>
#include <boost/program_options.hpp>
#include "moses/Util.h"
#include "DeEn.h"
using namespace std;
bool g_debug = false;
Phrase Tokenize(const string &line);
int main(int argc, char** argv)
{
cerr << "Starting" << endl;
namespace po = boost::program_options;
po::options_description desc("Options");
desc.add_options()
("help", "Print help messages")
("add", "additional options")
("source-language,s", po::value<string>()->required(), "Source Language")
("target-language,t", po::value<string>()->required(), "Target Language");
po::variables_map vm;
try
{
po::store(po::parse_command_line(argc, argv, desc),
vm); // can throw
/** --help option
*/
if ( vm.count("help") )
{
std::cout << "Basic Command Line Parameter App" << std::endl
<< desc << std::endl;
return EXIT_SUCCESS;
}
po::notify(vm); // throws on error, so do after help in case
// there are any problems
}
catch(po::error& e)
{
std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return EXIT_FAILURE;
}
string sourceLang = vm["source-language"].as<string>();
string targetLang = vm["target-language"].as<string>();
cerr << sourceLang << " " << targetLang << endl;
string line;
size_t lineNum = 1;
while (getline(cin, line)) {
//cerr << lineNum << ":" << line << endl;
if (lineNum % 1000 == 0) {
cerr << lineNum << " ";
}
Phrase source = Tokenize(line);
LabelDeEn(source, cout);
++lineNum;
}
cerr << "Finished" << endl;
return EXIT_SUCCESS;
}
// Split a whitespace-separated line into words, then split each word into
// its '|'-separated factors.
Phrase Tokenize(const string &line)
{
  Phrase phrase;
  const vector<string> tokens = Moses::Tokenize(line);
  for (vector<string>::const_iterator it = tokens.begin(); it != tokens.end(); ++it) {
    phrase.push_back(Moses::Tokenize(*it, "|"));
  }
  return phrase;
}

View File

@ -11,11 +11,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -64,11 +64,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -88,13 +88,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.401150096" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -12,12 +12,13 @@ Building the RPM SPEC file
The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information:
- The Git repository from which an installer will be built,
- The branch in the Git repository to build, and
- The branch in the Git repository to build,
- The location of Boost on the build machine, and
- The version of the installed Moses distribution.
For example, to build the RELEASE-1.0 branch in the mosesdecode repository (git://github.com/moses-smt/mosesdecoder.git):
For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0 -t /usr
This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.

View File

@ -1,11 +1,13 @@
#!/bin/bash
BRANCH="master"
BOOST="/usr"
declare -i NO_RPM_BUILD=0
declare -r RPM_VERSION_TAG="___RPM_VERSION__"
declare -r BOOST_TAG="___BOOST_LOCATION__"
function usage() {
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version] -t [Boost install: default ${BOOST}]"
exit 1
}
@ -13,11 +15,12 @@ if [ $# -lt 4 ]; then
usage
fi
while getopts r:b:v:nh OPTION
while getopts r:b:t:v:nh OPTION
do
case "$OPTION" in
r) REPO="${OPTARG}";;
b) BRANCH="${OPTARG}";;
t) BOOST="${OPTARG}";;
v) VERSION="${OPTARG}";;
n) NO_RPM_BUILD=1;;
[h\?]) usage;;
@ -53,7 +56,8 @@ if [ ${NO_RPM_BUILD} -eq 0 ]; then
if [ ! -d ${HOME}/rpmbuild/SPECS ]; then
mkdir -p ${HOME}/rpmbuild/SPECS
fi
eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
ESC_BOOST=`echo ${BOOST} | gawk '{gsub(/\//, "\\\\/"); print}'`
eval sed -e \"s/${RPM_VERSION_TAG}/${VERSION}/\" -e \"s/${BOOST_TAG}/${ESC_BOOST}/\" ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then
mkdir -p ${HOME}/rpmbuild/SOURCES
fi

View File

@ -8,7 +8,7 @@ License: LGPL
Group: Development/Tools
Vendor: Capita Translation and Interpreting
Packager: Ian Johnson <ian.johnson@capita-ti.com>
Requires: boost >= 1.48, python >= 2.6, perl >= 5
Requires: python >= 2.6, perl >= 5
BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
%description
Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
@ -35,16 +35,17 @@ cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
%build
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
./bjam --with-boost=___BOOST_LOCATION__ --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
%install
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
cp -R scripts/OSM $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/Transliteration $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
@ -52,12 +53,13 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
%files
%defattr(-,root,root)
/opt/moses/bin/*
/opt/moses/scripts/OSM/*
/opt/moses/scripts/Transliteration/*
/opt/moses/scripts/analysis/*
/opt/moses/scripts/ems/*
/opt/moses/scripts/generic/*
/opt/moses/scripts/other/*
/opt/moses/scripts/recaser/*
/opt/moses/scripts/regression-testing/*
/opt/moses/scripts/share/*
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*

View File

@ -106,7 +106,7 @@ class Moses():
scores = line[2].split()
if len(scores) <self.number_of_features:
sys.stderr.write('Error: model only has {0} features. Expected {1}.\n'.format(len(scores),self.number_of_features))
exit()
exit(1)
scores = scores[:self.number_of_features]
model_probabilities = map(float,scores)
@ -179,7 +179,7 @@ class Moses():
reordering_probabilities[j][i] = p
except IndexError:
sys.stderr.write('\nIndexError: Did you correctly specify the number of reordering features? (--number_of_features N in command line)\n')
exit()
exit(1)
def traverse_incrementally(self,table,models,load_lines,store_flag,mode='interpolate',inverted=False,lowmem=False,flags=None):
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
@ -307,13 +307,13 @@ class Moses():
elif len(line) == 4:
if self.require_alignment:
sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n')
exit()
exit(1)
self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')]
else:
sys.stderr.write('Error: unexpected phrase table format. Are you using a very old/new version of Moses with different formatting?\n')
exit()
exit(1)
def get_word_alignments(self,src,target,cache=False,mycache={}):
@ -515,7 +515,7 @@ class TigerXML():
if not src or not target:
sys.stderr.write('Error: Source and/or target language not specified. Required for TigerXML extraction.\n')
exit()
exit(1)
alignments = self._get_aligned_ids(src,target)
self._textualize_alignments(src,target,alignments)
@ -1261,7 +1261,7 @@ def handle_file(filename,action,fileobj=None,mode='r'):
sys.stderr.write('For a weighted counts combination, we need statistics that Moses doesn\'t write to disk by default.\n')
sys.stderr.write('Repeat step 4 of Moses training for all models with the option -write-lexical-counts.\n')
exit()
exit(1)
if filename.endswith('.gz'):
fileobj = gzip.open(filename,mode)
@ -1435,7 +1435,7 @@ class Combine_TMs():
if mode not in ['interpolate','loglinear','counts']:
sys.stderr.write('Error: mode must be either "interpolate", "loglinear" or "counts"\n')
sys.exit()
sys.exit(1)
models,number_of_features,weights = self._sanity_checks(models,number_of_features,weights)

View File

@ -1,4 +1,6 @@
#include "lm/bhiksha.hh"
#include "lm/binary_format.hh"
#include "lm/config.hh"
#include "util/file.hh"
#include "util/exception.hh"
@ -15,11 +17,11 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_
const uint8_t kArrayBhikshaVersion = 0;
// TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) {
uint8_t version;
uint8_t configured_bits;
util::ReadOrThrow(fd, &version, 1);
util::ReadOrThrow(fd, &configured_bits, 1);
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
uint8_t buffer[2];
file.ReadForConfig(buffer, 2, offset);
uint8_t version = buffer[0];
uint8_t configured_bits = buffer[1];
if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
config.pointer_bhiksha_bits = configured_bits;
}
@ -87,9 +89,6 @@ void ArrayBhiksha::FinishedLoading(const Config &config) {
*(head_write++) = config.pointer_bhiksha_bits;
}
void ArrayBhiksha::LoadedBinary() {
}
} // namespace trie
} // namespace ngram
} // namespace lm

View File

@ -24,6 +24,7 @@
namespace lm {
namespace ngram {
struct Config;
class BinaryFormat;
namespace trie {
@ -31,7 +32,7 @@ class DontBhiksha {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {}
static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {}
static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
@ -53,8 +54,6 @@ class DontBhiksha {
void FinishedLoading(const Config &/*config*/) {}
void LoadedBinary() {}
uint8_t InlineBits() const { return next_.bits; }
private:
@ -65,7 +64,7 @@ class ArrayBhiksha {
public:
static const ModelType kModelTypeAdd = kArrayAdd;
static void UpdateConfigFromBinary(int fd, Config &config);
static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
@ -93,8 +92,6 @@ class ArrayBhiksha {
void FinishedLoading(const Config &config);
void LoadedBinary();
uint8_t InlineBits() const { return next_inline_.bits; }
private:

View File

@ -14,6 +14,9 @@
namespace lm {
namespace ngram {
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
@ -58,8 +61,6 @@ struct Sanity {
}
};
const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
std::size_t TotalHeaderSize(unsigned char order) {
return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
}
@ -81,83 +82,6 @@ void WriteHeader(void *to, const Parameters &params) {
} // namespace
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
std::size_t total = TotalHeaderSize(order) + memory_size;
backing.file.reset(util::CreateOrThrow(config.write_mmap));
if (config.write_method == Config::WRITE_MMAP) {
backing.vocab.reset(util::MapZeroedWrite(backing.file.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
} else {
util::ResizeOrThrow(backing.file.get(), 0);
util::MapAnonymous(total, backing.vocab);
}
strncpy(reinterpret_cast<char*>(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order));
return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order);
} else {
util::MapAnonymous(memory_size, backing.vocab);
return reinterpret_cast<uint8_t*>(backing.vocab.get());
}
}
// Extend the backing storage to make room for the search data structures,
// which are laid out after the (possibly padded) vocabulary table.  Returns
// the address where the search structures should be written.
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
  std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
  if (config.write_mmap) {
    // Grow the file to accommodate the search, using zeros.
    try {
      util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
    } catch (util::ErrnoException &e) {
      // Tag the exception with the file name for a better error message.
      e << " for file " << config.write_mmap;
      throw e;
    }
    if (config.write_method == Config::WRITE_AFTER) {
      // WRITE_AFTER builds in anonymous memory; the file is written later.
      util::MapAnonymous(memory_size, backing.search);
      return reinterpret_cast<uint8_t*>(backing.search.get());
    }
    // mmap it now.
    // We're skipping over the header and vocab for the search space mmap.
    // mmap likes page aligned offsets, so some arithmetic to round the
    // offset down to a page boundary and compensate in the returned pointer.
    std::size_t page_size = util::SizePage();
    std::size_t alignment_cruft = adjusted_vocab % page_size;
    backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
    return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
  } else {
    // Memory-only build.
    util::MapAnonymous(memory_size, backing.search);
    return reinterpret_cast<uint8_t*>(backing.search.get());
  }
}
// Flush the built model to disk (when file-backed) and write the header
// last, so that an interrupted build never leaves a file whose header
// claims it is complete.
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
  if (!config.write_mmap) return;
  switch (config.write_method) {
    case Config::WRITE_MMAP:
      // Data was written through the mappings; just sync them to disk.
      util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
      util::SyncOrThrow(backing.search.get(), backing.search.size());
      break;
    case Config::WRITE_AFTER:
      // Data lives in anonymous memory; dump vocab then search to the file.
      util::SeekOrThrow(backing.file.get(), 0);
      util::WriteOrThrow(backing.file.get(), backing.vocab.get(), backing.vocab.size());
      util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad);
      util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size());
      util::FSyncOrThrow(backing.file.get());
      break;
  }
  // header and vocab share the same mmap.  The header is written here because we know the counts.
  Parameters params = Parameters();
  params.counts = counts;
  params.fixed.order = counts.size();
  params.fixed.probing_multiplier = config.probing_multiplier;
  params.fixed.model_type = model_type;
  params.fixed.has_vocabulary = config.include_vocab;
  params.fixed.search_version = search_version;
  WriteHeader(backing.vocab.get(), params);
  if (config.write_method == Config::WRITE_AFTER) {
    // The header only exists in anonymous memory; push it into the file too.
    util::SeekOrThrow(backing.file.get(), 0);
    util::WriteOrThrow(backing.file.get(), backing.vocab.get(), TotalHeaderSize(counts.size()));
  }
}
namespace detail {
bool IsBinaryFormat(int fd) {
const uint64_t size = util::SizeFile(fd);
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
@ -209,44 +133,164 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
}
void SeekPastHeader(int fd, const Parameters &params) {
util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
// Capture write/load settings from the Config.  Sizes and offsets start as
// sentinel "invalid" values until Initialize/Setup calls fill them in.
BinaryFormat::BinaryFormat(const Config &config)
  : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
    header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}
// Adopt an already-open binary model file (takes ownership of fd): read and
// validate its header, then record where the header ends.
void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params) {
  file_.reset(fd);
  write_mmap_ = NULL; // Ignore write requests; this is already in binary format.
  ReadHeader(fd, params);
  MatchCheck(model_type, search_version, params);
  header_size_ = TotalHeaderSize(params.counts.size());
}
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing) {
const uint64_t file_size = util::SizeFile(backing.file.get());
// pread from the binary file at an offset relative to the end of the
// header.  Used to peek at parts of the file while updating the Config
// before the full size is known.  Requires InitializeBinary to have run.
void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
  assert(header_size_ != kInvalidSize);
  util::PReadOrThrow(file_.get(), to, amount, offset_excluding_header + header_size_);
}
void *BinaryFormat::LoadBinary(std::size_t size) {
assert(header_size_ != kInvalidSize);
const uint64_t file_size = util::SizeFile(file_.get());
// The header is smaller than a page, so we have to map the whole header as well.
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size);
UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search);
util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_);
if (config.enumerate_vocab && !params.fixed.has_vocabulary)
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
// Seek to vocabulary words
util::SeekOrThrow(backing.file.get(), total_map);
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
vocab_string_offset_ = total_map;
return reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
}
void ComplainAboutARPA(const Config &config, ModelType model_type) {
if (config.write_mmap || !config.messages) return;
if (config.arpa_complain == Config::ALL) {
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
} else if (config.arpa_complain == Config::EXPENSIVE &&
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
// Reserve memory for the vocabulary lookup table, optionally backed by the
// output file (with header space in front).  Returns the address where the
// vocab table should be written.
void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
  vocab_size_ = memory_size;
  if (!write_mmap_) {
    // Memory-only build: no header.
    header_size_ = 0;
    util::MapAnonymous(memory_size, memory_vocab_);
    return reinterpret_cast<uint8_t*>(memory_vocab_.get());
  }
  header_size_ = TotalHeaderSize(order);
  std::size_t total = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size));
  file_.reset(util::CreateOrThrow(write_mmap_));
  // some gccs complain about uninitialized variables even though all enum values are covered.
  void *vocab_base = NULL;
  switch (write_method_) {
    case Config::WRITE_MMAP:
      // Map the zeroed file directly; writes go straight to disk.
      mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
      vocab_base = mapping_.get();
      break;
    case Config::WRITE_AFTER:
      // Build in anonymous memory; the file is populated later.
      util::ResizeOrThrow(file_.get(), 0);
      util::MapAnonymous(total, memory_vocab_);
      vocab_base = memory_vocab_.get();
      break;
  }
  // Stamp the incomplete-file magic so a crash mid-build is detectable;
  // strncpy NUL-pads the remainder of the header region.
  strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
  return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
}
// Make room for the search data structures after the (padded) vocabulary
// table.  May remap the file, which is why vocab_base is passed by
// reference and can change.  Returns where the search structures go.
void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) {
  assert(vocab_size_ != kInvalidSize);
  vocab_pad_ = vocab_pad;
  std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
  vocab_string_offset_ = new_size;
  if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
    // Search lives in its own anonymous mapping; vocab stays where it was.
    util::MapAnonymous(memory_size, memory_search_);
    assert(header_size_ == 0 || write_mmap_);
    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
    return reinterpret_cast<uint8_t*>(memory_search_.get());
  }
  assert(write_method_ == Config::WRITE_MMAP);
  // Also known as total size without vocab words.
  // Grow the file to accommodate the search, using zeros.
  // According to man mmap, behavior is undefined when the file is resized
  // underneath a mmap that is not a multiple of the page size.  So to be
  // safe, we'll unmap it and map it again.
  mapping_.reset();
  util::ResizeOrThrow(file_.get(), new_size);
  void *ret;
  MapFile(vocab_base, ret);
  return ret;
}
// Append the vocabulary word strings after the search section.  Remapping
// can move both the vocab table and the search area, so both base pointers
// are passed by reference and updated.
void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) {
  // Checking Config's include_vocab is the responsibility of the caller.
  assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize);
  if (!write_mmap_) {
    // Unchanged base.
    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get());
    search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
    return;
  }
  if (write_method_ == Config::WRITE_MMAP) {
    // Unmap before appending to the file (see GrowForSearch for why
    // resizing under a live mapping is avoided).
    mapping_.reset();
  }
  util::SeekOrThrow(file_.get(), VocabStringReadingOffset());
  util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
  if (write_method_ == Config::WRITE_MMAP) {
    MapFile(vocab_base, search_base);
  } else {
    vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
    search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
  }
}
} // namespace detail
// Flush the built model to disk and write the header last, so that an
// interrupted build never leaves a file whose header claims completeness.
// No-op for memory-only builds.
void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) {
  if (!write_mmap_) return;
  // First sync/write the body of the file (vocab table + search structures).
  switch (write_method_) {
    case Config::WRITE_MMAP:
      util::SyncOrThrow(mapping_.get(), mapping_.size());
      break;
    case Config::WRITE_AFTER:
      // Everything was built in anonymous memory; dump it to the file now.
      util::SeekOrThrow(file_.get(), 0);
      util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size());
      util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_);
      util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size());
      util::FSyncOrThrow(file_.get());
      break;
  }
  // header and vocab share the same mmap.
  Parameters params = Parameters();
  // Zero only the fixed-width (POD) part so padding bytes in the written
  // header are deterministic.  A memset over all of Parameters would clobber
  // the internals of the counts vector, which is undefined behavior.
  memset(&params.fixed, 0, sizeof(params.fixed));
  params.counts = counts;
  params.fixed.order = counts.size();
  params.fixed.probing_multiplier = config.probing_multiplier;
  params.fixed.model_type = model_type;
  params.fixed.has_vocabulary = config.include_vocab;
  params.fixed.search_version = search_version;
  switch (write_method_) {
    case Config::WRITE_MMAP:
      // Header goes through the mapping; sync again so it reaches the disk
      // after the body did.
      WriteHeader(mapping_.get(), params);
      util::SyncOrThrow(mapping_.get(), mapping_.size());
      break;
    case Config::WRITE_AFTER:
      {
        // Serialize the header into a temporary buffer, then overwrite the
        // incomplete-magic stamp at the start of the file.
        std::vector<uint8_t> buffer(TotalHeaderSize(counts.size()));
        WriteHeader(&buffer[0], params);
        util::SeekOrThrow(file_.get(), 0);
        util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
      }
      break;
  }
}
// (Re)map the file from the beginning through the end of the search
// section, then hand back the vocab table and search base addresses
// inside the new mapping.
void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) {
  mapping_.reset(util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()), vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED);
  vocab_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
  search_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_ + vocab_size_ + vocab_pad_;
}
bool RecognizeBinary(const char *file, ModelType &recognized) {
util::scoped_fd fd(util::OpenReadOrThrow(file));
if (!detail::IsBinaryFormat(fd.get())) return false;
if (!IsBinaryFormat(fd.get())) {
return false;
}
Parameters params;
detail::ReadHeader(fd.get(), params);
ReadHeader(fd.get(), params);
recognized = params.fixed.model_type;
return true;
}

View File

@ -17,6 +17,8 @@
namespace lm {
namespace ngram {
extern const char *kModelNames[6];
/*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
@ -42,67 +44,63 @@ struct Parameters {
std::vector<uint64_t> counts;
};
struct Backing {
// File behind memory, if any.
util::scoped_fd file;
// Vocabulary lookup table. Not to be confused with the vocab words themselves.
util::scoped_memory vocab;
// Raw block of memory backing the language model data structures
util::scoped_memory search;
// Manages the storage behind a model: a binary file being read, a binary
// file being written, or anonymous memory.  The on-disk layout is:
// header, vocab lookup table, optional padding, search structures, then
// the vocabulary word strings.
class BinaryFormat {
  public:
    explicit BinaryFormat(const Config &config);

    // Reading a binary file:
    // Takes ownership of fd
    void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);
    // Used to read parts of the file to update the config object before figuring out full size.
    void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
    // Actually load the binary file and return a pointer to the beginning of the search area.
    void *LoadBinary(std::size_t size);

    // File offset where the vocabulary word strings begin (end of search).
    uint64_t VocabStringReadingOffset() const {
      assert(vocab_string_offset_ != kInvalidOffset);
      return vocab_string_offset_;
    }

    // Writing a binary file or initializing in RAM from ARPA:
    // Size for vocabulary.
    void *SetupJustVocab(std::size_t memory_size, uint8_t order);
    // Warning: can change the vocabulary base pointer.
    void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
    // Warning: can change vocabulary and search base addresses.
    void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
    // Write the header at the beginning of the file.
    void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);

  private:
    void MapFile(void *&vocab_base, void *&search_base);

    // Copied from configuration.
    const Config::WriteMethod write_method_;
    const char *write_mmap_;
    util::LoadMethod load_method_;

    // File behind memory, if any.
    util::scoped_fd file_;

    // If there is a file involved, a single mapping.
    util::scoped_memory mapping_;

    // If the data is only in memory, separately allocate each because the trie
    // knows vocab's size before it knows search's size (because SRILM might
    // have pruned).
    util::scoped_memory memory_vocab_, memory_search_;

    // Memory ranges.  Note that these may not be contiguous and may not all
    // exist.
    std::size_t header_size_, vocab_size_, vocab_pad_;
    // aka end of search.
    uint64_t vocab_string_offset_;

    static const uint64_t kInvalidOffset = (uint64_t)-1;
};
// Create just enough of a binary file to write vocabulary to it.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing);
// Write header to binary file. This is done last to prevent incomplete files
// from loading.
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing);
namespace detail {
bool IsBinaryFormat(int fd);
void ReadHeader(int fd, Parameters &params);
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params);
void SeekPastHeader(int fd, const Parameters &params);
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing);
void ComplainAboutARPA(const Config &config, ModelType model_type);
} // namespace detail
// Load a model from a file, dispatching between the fast binary path and
// the ARPA text path.  To is the concrete model class being constructed.
template <class To> void LoadLM(const char *file, const Config &config, To &to) {
  Backing &backing = to.MutableBacking();
  backing.file.reset(util::OpenReadOrThrow(file));
  try {
    if (detail::IsBinaryFormat(backing.file.get())) {
      Parameters params;
      detail::ReadHeader(backing.file.get(), params);
      detail::MatchCheck(To::kModelType, To::kVersion, params);
      // Replace the run-time configured probing_multiplier with the one in the file.
      Config new_config(config);
      new_config.probing_multiplier = params.fixed.probing_multiplier;
      detail::SeekPastHeader(backing.file.get(), params);
      To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config);
      uint64_t memory_size = To::Size(params.counts, new_config);
      uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
      to.InitializeFromBinary(start, params, new_config, backing.file.get());
    } else {
      // Not binary: possibly warn that ARPA loading is slow, then parse it.
      detail::ComplainAboutARPA(config, To::kModelType);
      to.InitializeFromARPA(file, config);
    }
  } catch (util::Exception &e) {
    // Tag the exception with the file name for a better error message.
    e << " File: " << file;
    throw;
  }
}
} // namespace ngram
} // namespace lm
#endif // LM_BINARY_FORMAT__

View File

@ -87,7 +87,7 @@ class VocabHandout {
Table table_;
std::size_t double_cutoff_;
util::FakeOFStream word_list_;
};
@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_);
}
private:
const std::size_t size_;
};
@ -106,11 +106,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_);
}
}
private:
const std::size_t size_;
};
@ -131,7 +131,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
class Writer {
public:
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@ -140,7 +140,7 @@ class Writer {
dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) {
// Add special words. AdjustCounts is responsible if order != 1.
// Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK);
AddUnigramWord(kBOS);
}
@ -170,16 +170,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return;
}
// Complete the write.
// Complete the write.
gram_.Count() = 1;
// Prepare the next n-gram.
// Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_);
gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin());
return;
}
// Block end. Need to store the context in a temporary buffer.
// Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear();
block_->SetValidSize(block_size_);
@ -207,7 +207,7 @@ class Writer {
// Hash table combiner implementation.
Dedupe dedupe_;
// Small buffer to hold existing ngrams when shifting across a block boundary.
// Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_;
const std::size_t block_size_;
@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
return VocabHandout::MemUsage(vocab_estimate);
}
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
: from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
@ -240,7 +240,10 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
uint64_t count = 0;
bool delimiters[256];
memset(delimiters, 0, sizeof(delimiters));
delimiters['\0'] = delimiters['\t'] = delimiters['\n'] = delimiters['\r'] = delimiters[' '] = true;
const char kDelimiterSet[] = "\0\t\n\r ";
for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) {
delimiters[static_cast<unsigned char>(*i)] = true;
}
try {
while(true) {
StringPiece line(from_.ReadLine());

View File

@ -33,12 +33,12 @@ class Callback {
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
probs_[order_minus_1 + 1] = pay.complete.prob;
pay.complete.prob = log10(pay.complete.prob);
// TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check valid pointer at tht end
// TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
++backoffs_[order_minus_1];
} else {
// Not a context.
// Not a context.
pay.complete.backoff = 0.0;
}
}
@ -52,7 +52,7 @@ class Callback {
};
} // namespace
Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
: uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
// perform order-wise interpolation

View File

@ -11,11 +11,7 @@ Config::Config() :
enumerate_vocab(NULL),
unknown_missing(COMPLAIN),
sentence_marker_missing(THROW_UP),
#if defined(_WIN32) || defined(_WIN64)
positive_log_probability(SILENT),
#else
positive_log_probability(THROW_UP),
#endif
unknown_missing_logprob(-100.0),
probing_multiplier(1.5),
building_memory(1073741824ULL), // 1 GB

View File

@ -17,14 +17,14 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
typedef VocabularyT Vocabulary;
/* Translate from void* to State */
FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->FullScore(
*reinterpret_cast<const State*>(in_state),
new_word,
*reinterpret_cast<State*>(out_state));
}
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->FullScoreForgotState(
context_rbegin,
context_rend,
@ -37,7 +37,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}
float Score(const void *in_state, const WordIndex new_word, void *out_state) const {
float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const {
return static_cast<const Child*>(this)->Score(
*reinterpret_cast<const State*>(in_state),
new_word,

View File

@ -14,10 +14,6 @@
#include <string>
#include <vector>
#if !defined __MINGW32__
#include <err.h>
#endif
#include <string.h>
#include <stdint.h>

View File

@ -5,27 +5,18 @@
#include <iostream>
#include <string>
#if !defined __MINGW32__
#include <err.h>
#endif
#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
namespace lm {
class CountOutput : boost::noncopyable {
public:
explicit CountOutput(const char *name) : file_(name, std::ios::out) {}
explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {}
void AddNGram(const StringPiece &line) {
if (!(file_ << line << '\n')) {
#if defined __MINGW32__
std::cerr<<"Writing counts file failed"<<std::endl;
exit(3);
#else
err(3, "Writing counts file failed");
#endif
}
file_ << line << '\n';
}
template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
@ -37,12 +28,12 @@ class CountOutput : boost::noncopyable {
}
private:
std::fstream file_;
util::FakeOFStream file_;
};
class CountBatch {
public:
explicit CountBatch(std::streamsize initial_read)
explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) {
buffer_.reserve(initial_read);
}
@ -75,7 +66,7 @@ class CountBatch {
private:
std::streamsize initial_read_;
// This could have been a std::string but that's less happy with raw writes.
// This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_;
};

View File

@ -6,6 +6,7 @@
#endif
#include "lm/filter/vocab.hh"
#include "lm/filter/wrapper.hh"
#include "util/exception.hh"
#include "util/file_piece.hh"
#include <boost/ptr_container/ptr_vector.hpp>
@ -57,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
struct Config {
Config() :
Config() :
#ifndef NTHREAD
batch_size(25000),
threads(boost::thread::hardware_concurrency()),
@ -157,102 +158,96 @@ template <class Format> void DispatchFilterModes(const Config &config, std::istr
} // namespace lm
int main(int argc, char *argv[]) {
if (argc < 4) {
lm::DisplayHelp(argv[0]);
return 1;
}
// I used to have boost::program_options, but some users didn't want to compile boost.
lm::Config config;
config.mode = lm::MODE_UNSET;
for (int i = 1; i < argc - 2; ++i) {
const char *str = argv[i];
if (!std::strcmp(str, "copy")) {
config.mode = lm::MODE_COPY;
} else if (!std::strcmp(str, "single")) {
config.mode = lm::MODE_SINGLE;
} else if (!std::strcmp(str, "multiple")) {
config.mode = lm::MODE_MULTIPLE;
} else if (!std::strcmp(str, "union")) {
config.mode = lm::MODE_UNION;
} else if (!std::strcmp(str, "phrase")) {
config.phrase = true;
} else if (!std::strcmp(str, "context")) {
config.context = true;
} else if (!std::strcmp(str, "arpa")) {
config.format = lm::FORMAT_ARPA;
} else if (!std::strcmp(str, "raw")) {
config.format = lm::FORMAT_COUNT;
#ifndef NTHREAD
} else if (!std::strncmp(str, "threads:", 8)) {
config.threads = boost::lexical_cast<size_t>(str + 8);
if (!config.threads) {
std::cerr << "Specify at least one thread." << std::endl;
return 1;
}
} else if (!std::strncmp(str, "batch_size:", 11)) {
config.batch_size = boost::lexical_cast<size_t>(str + 11);
if (config.batch_size < 5000) {
std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
if (!config.batch_size) return 1;
}
#endif
} else {
try {
if (argc < 4) {
lm::DisplayHelp(argv[0]);
return 1;
}
}
if (config.mode == lm::MODE_UNSET) {
lm::DisplayHelp(argv[0]);
return 1;
}
if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl;
return 1;
}
bool cmd_is_model = true;
const char *cmd_input = argv[argc - 2];
if (!strncmp(cmd_input, "vocab:", 6)) {
cmd_is_model = false;
cmd_input += 6;
} else if (!strncmp(cmd_input, "model:", 6)) {
cmd_input += 6;
} else if (strchr(cmd_input, ':')) {
#if defined __MINGW32__
std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
exit(1);
#else
errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
#endif // defined
} else {
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
}
std::ifstream cmd_file;
std::istream *vocab;
if (cmd_is_model) {
vocab = &std::cin;
} else {
cmd_file.open(cmd_input, std::ios::in);
if (!cmd_file) {
#if defined __MINGW32__
std::cerr << "Could not open input file " << cmd_input << std::endl;
exit(2);
#else
err(2, "Could not open input file %s", cmd_input);
#endif // defined
// I used to have boost::program_options, but some users didn't want to compile boost.
lm::Config config;
config.mode = lm::MODE_UNSET;
for (int i = 1; i < argc - 2; ++i) {
const char *str = argv[i];
if (!std::strcmp(str, "copy")) {
config.mode = lm::MODE_COPY;
} else if (!std::strcmp(str, "single")) {
config.mode = lm::MODE_SINGLE;
} else if (!std::strcmp(str, "multiple")) {
config.mode = lm::MODE_MULTIPLE;
} else if (!std::strcmp(str, "union")) {
config.mode = lm::MODE_UNION;
} else if (!std::strcmp(str, "phrase")) {
config.phrase = true;
} else if (!std::strcmp(str, "context")) {
config.context = true;
} else if (!std::strcmp(str, "arpa")) {
config.format = lm::FORMAT_ARPA;
} else if (!std::strcmp(str, "raw")) {
config.format = lm::FORMAT_COUNT;
#ifndef NTHREAD
} else if (!std::strncmp(str, "threads:", 8)) {
config.threads = boost::lexical_cast<size_t>(str + 8);
if (!config.threads) {
std::cerr << "Specify at least one thread." << std::endl;
return 1;
}
} else if (!std::strncmp(str, "batch_size:", 11)) {
config.batch_size = boost::lexical_cast<size_t>(str + 11);
if (config.batch_size < 5000) {
std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl;
if (!config.batch_size) return 1;
}
#endif
} else {
lm::DisplayHelp(argv[0]);
return 1;
}
}
vocab = &cmd_file;
}
util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
if (config.mode == lm::MODE_UNSET) {
lm::DisplayHelp(argv[0]);
return 1;
}
if (config.format == lm::FORMAT_ARPA) {
lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
} else if (config.format == lm::FORMAT_COUNT) {
lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl;
return 1;
}
bool cmd_is_model = true;
const char *cmd_input = argv[argc - 2];
if (!strncmp(cmd_input, "vocab:", 6)) {
cmd_is_model = false;
cmd_input += 6;
} else if (!strncmp(cmd_input, "model:", 6)) {
cmd_input += 6;
} else if (strchr(cmd_input, ':')) {
std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
return 1;
} else {
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
}
std::ifstream cmd_file;
std::istream *vocab;
if (cmd_is_model) {
vocab = &std::cin;
} else {
cmd_file.open(cmd_input, std::ios::in);
UTIL_THROW_IF(!cmd_file, util::ErrnoException, "Failed to open " << cmd_input);
vocab = &cmd_file;
}
util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr);
if (config.format == lm::FORMAT_ARPA) {
lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]);
} else if (config.format == lm::FORMAT_COUNT) {
lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]);
}
return 0;
} catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
return 0;
}

View File

@ -1,5 +1,5 @@
#ifndef LM_FILTER_FORMAT_H__
#define LM_FITLER_FORMAT_H__
#define LM_FILTER_FORMAT_H__
#include "lm/filter/arpa_io.hh"
#include "lm/filter/count_io.hh"

View File

@ -5,10 +5,6 @@
#include <ctype.h>
#if !defined __MINGW32__
#include <err.h>
#endif
namespace lm {
namespace vocab {
@ -34,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
}// namespace
// Read space separated words in enter separated lines. These lines can be
// very long, so don't read an entire line at a time.
// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
in.exceptions(std::istream::badbit);
unsigned int sentence = 0;

View File

@ -34,8 +34,47 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
if (static_cast<std::size_t>(start - static_cast<uint8_t*>(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast<uint8_t*>(base)) << " but Size says they should take " << goal_size);
}
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
LoadLM(file, config, *this);
namespace {
void ComplainAboutARPA(const Config &config, ModelType model_type) {
if (config.write_mmap || !config.messages) return;
if (config.arpa_complain == Config::ALL) {
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
} else if (config.arpa_complain == Config::EXPENSIVE &&
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
}
}
void CheckCounts(const std::vector<uint64_t> &counts) {
UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
if (sizeof(uint64_t) > sizeof(std::size_t)) {
for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) {
UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines.");
}
}
}
} // namespace
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &init_config) : backing_(init_config) {
util::scoped_fd fd(util::OpenReadOrThrow(file));
if (IsBinaryFormat(fd.get())) {
Parameters parameters;
int fd_shallow = fd.release();
backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters);
CheckCounts(parameters.counts);
Config new_config(init_config);
new_config.probing_multiplier = parameters.fixed.probing_multiplier;
Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config);
UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config);
vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset());
} else {
ComplainAboutARPA(init_config, kModelType);
InitializeFromARPA(fd.release(), file, init_config);
}
// g++ prints warnings unless these are fully initialized.
State begin_sentence = State();
@ -50,27 +89,9 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
P::Init(begin_sentence, null_context, vocab_, search_.Order());
}
namespace {
void CheckCounts(const std::vector<uint64_t> &counts) {
UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE);
if (sizeof(uint64_t) > sizeof(std::size_t)) {
for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) {
UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines.");
}
}
}
} // namespace
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
CheckCounts(params.counts);
SetupMemory(start, params.counts, config);
vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
search_.LoadedBinary();
}
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(int fd, const char *file, const Config &config) {
// Backing file is the ARPA.
util::FilePiece f(fd, file, config.ProgressMessages());
try {
std::vector<uint64_t> counts;
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
@ -81,13 +102,17 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
vocab_.SetupMemory(backing_.SetupJustVocab(vocab_size, counts.size()), vocab_size, counts[0], config);
if (config.write_mmap) {
if (config.write_mmap && config.include_vocab) {
WriteWordsWrapper wrap(config.enumerate_vocab);
vocab_.ConfigureEnumerate(&wrap, counts[0]);
search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + Search::Size(counts, config));
void *vocab_rebase, *search_rebase;
backing_.WriteVocabWords(wrap.Buffer(), vocab_rebase, search_rebase);
// Due to writing at the end of file, mmap may have relocated data. So remap.
vocab_.Relocate(vocab_rebase);
search_.SetupMemory(reinterpret_cast<uint8_t*>(search_rebase), counts, config);
} else {
vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]);
search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
@ -99,18 +124,13 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
search_.UnknownUnigram().backoff = 0.0;
search_.UnknownUnigram().prob = config.unknown_missing_logprob;
}
FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_);
backing_.FinishFile(config, kModelType, kVersion, counts);
} catch (util::Exception &e) {
e << " Byte: " << f.Offset();
throw;
}
}
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
Search::UpdateConfigFromBinary(fd, counts, config);
}
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) {

View File

@ -104,10 +104,6 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
}
private:
friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
// Score bigrams and above. Do not include backoff.
@ -116,15 +112,11 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
// Appears after Size in the cc file.
void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);
void InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd);
void InitializeFromARPA(const char *file, const Config &config);
void InitializeFromARPA(int fd, const char *file, const Config &config);
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
Backing &MutableBacking() { return backing_; }
Backing backing_;
BinaryFormat backing_;
VocabularyT vocab_;

View File

@ -360,10 +360,11 @@ BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) {
LoadingTest<QuantArrayTrieModel>();
}
template <class ModelT> void BinaryTest() {
template <class ModelT> void BinaryTest(Config::WriteMethod write_method) {
Config config;
config.write_mmap = "test.binary";
config.messages = NULL;
config.write_method = write_method;
ExpectEnumerateVocab enumerate;
config.enumerate_vocab = &enumerate;
@ -406,6 +407,11 @@ template <class ModelT> void BinaryTest() {
unlink("test_nounk.binary");
}
template <class ModelT> void BinaryTest() {
BinaryTest<ModelT>(Config::WRITE_MMAP);
BinaryTest<ModelT>(Config::WRITE_AFTER);
}
BOOST_AUTO_TEST_CASE(write_and_read_probing) {
BinaryTest<ProbingModel>();
}

View File

@ -38,13 +38,13 @@ const char kSeparatelyQuantizeVersion = 2;
} // namespace
void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
char version;
util::ReadOrThrow(fd, &version, 1);
util::ReadOrThrow(fd, &config.prob_bits, 1);
util::ReadOrThrow(fd, &config.backoff_bits, 1);
void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
unsigned char buffer[3];
file.ReadForConfig(buffer, 3, offset);
char version = buffer[0];
config.prob_bits = buffer[1];
config.backoff_bits = buffer[2];
if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
util::AdvanceOrThrow(fd, -3);
}
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {

View File

@ -18,12 +18,13 @@ namespace lm {
namespace ngram {
struct Config;
class BinaryFormat;
/* Store values directly and don't quantize. */
class DontQuantize {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &) {}
static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; }
static uint8_t MiddleBits(const Config &/*config*/) { return 63; }
static uint8_t LongestBits(const Config &/*config*/) { return 31; }
@ -136,7 +137,7 @@ class SeparatelyQuantize {
public:
static const ModelType kModelTypeAdd = kQuantAdd;
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
static uint64_t Size(uint8_t order, const Config &config) {
uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float);

View File

@ -204,9 +204,10 @@ template <class Build, class Activate, class Store> void ReadNGrams(
namespace detail {
template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
std::size_t allocated = Unigram::Size(counts[0]);
unigram_ = Unigram(start, counts[0], allocated);
start += allocated;
unigram_ = Unigram(start, counts[0]);
start += Unigram::Size(counts[0]);
std::size_t allocated;
middle_.clear();
for (unsigned int n = 2; n < counts.size(); ++n) {
allocated = Middle::Size(counts[n - 1], config.probing_multiplier);
middle_.push_back(Middle(start, allocated));
@ -218,9 +219,21 @@ template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start,
return start;
}
template <class Value> void HashedSearch<Value>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing) {
// TODO: fix sorted.
SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config);
/*template <class Value> void HashedSearch<Value>::Relocate(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
unigram_ = Unigram(start, counts[0]);
start += Unigram::Size(counts[0]);
for (unsigned int n = 2; n < counts.size(); ++n) {
middle[n-2].Relocate(start);
start += Middle::Size(counts[n - 1], config.probing_multiplier)
}
longest_.Relocate(start);
}*/
template <class Value> void HashedSearch<Value>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing) {
void *vocab_rebase;
void *search_base = backing.GrowForSearch(Size(counts, config), vocab.UnkCountChangePadding(), vocab_rebase);
vocab.Relocate(vocab_rebase);
SetupMemory(reinterpret_cast<uint8_t*>(search_base), counts, config);
PositiveProbWarn warn(config.positive_log_probability);
Read1Grams(f, counts[0], vocab, unigram_.Raw(), warn);
@ -277,14 +290,6 @@ template <class Value> template <class Build> void HashedSearch<Value>::ApplyBui
ReadEnd(f);
}
template <class Value> void HashedSearch<Value>::LoadedBinary() {
unigram_.LoadedBinary();
for (typename std::vector<Middle>::iterator i = middle_.begin(); i != middle_.end(); ++i) {
i->LoadedBinary();
}
longest_.LoadedBinary();
}
template class HashedSearch<BackoffValue>;
template class HashedSearch<RestValue>;

View File

@ -18,7 +18,7 @@ namespace util { class FilePiece; }
namespace lm {
namespace ngram {
struct Backing;
class BinaryFormat;
class ProbingVocabulary;
namespace detail {
@ -72,7 +72,7 @@ template <class Value> class HashedSearch {
static const unsigned int kVersion = 0;
// TODO: move probing_multiplier here with next binary file format update.
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static void UpdateConfigFromBinary(const BinaryFormat &, const std::vector<uint64_t> &, uint64_t, Config &) {}
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
uint64_t ret = Unigram::Size(counts[0]);
@ -84,9 +84,7 @@ template <class Value> class HashedSearch {
uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config);
void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing);
void LoadedBinary();
void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing);
unsigned char Order() const {
return middle_.size() + 2;
@ -148,7 +146,7 @@ template <class Value> class HashedSearch {
public:
Unigram() {}
Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
Unigram(void *start, uint64_t count) :
unigram_(static_cast<typename Value::Weights*>(start))
#ifdef DEBUG
, count_(count)
@ -168,8 +166,6 @@ template <class Value> class HashedSearch {
typename Value::Weights &Unknown() { return unigram_[0]; }
void LoadedBinary() {}
// For building.
typename Value::Weights *Raw() { return unigram_; }

View File

@ -459,7 +459,7 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
} // namespace
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing) {
RecordReader inputs[KENLM_MAX_ORDER - 1];
RecordReader contexts[KENLM_MAX_ORDER - 1];
@ -488,7 +488,10 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), backing), fixed_counts, config);
void *vocab_relocate;
void *search_base = backing.GrowForSearch(TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), vocab.UnkCountChangePadding(), vocab_relocate);
vocab.Relocate(vocab_relocate);
out.SetupMemory(reinterpret_cast<uint8_t*>(search_base), fixed_counts, config);
for (unsigned char i = 2; i <= counts.size(); ++i) {
inputs[i-2].Rewind();
@ -571,15 +574,7 @@ template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::Setup
return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]);
}
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::LoadedBinary() {
unigram_.LoadedBinary();
for (Middle *i = middle_begin_; i != middle_end_; ++i) {
i->LoadedBinary();
}
longest_.LoadedBinary();
}
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) {
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing) {
std::string temporary_prefix;
if (config.temporary_directory_prefix) {
temporary_prefix = config.temporary_directory_prefix;

View File

@ -17,13 +17,13 @@
namespace lm {
namespace ngram {
struct Backing;
class BinaryFormat;
class SortedVocabulary;
namespace trie {
template <class Quant, class Bhiksha> class TrieSearch;
class SortedFiles;
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing);
template <class Quant, class Bhiksha> class TrieSearch {
public:
@ -39,11 +39,11 @@ template <class Quant, class Bhiksha> class TrieSearch {
static const unsigned int kVersion = 1;
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
Quant::UpdateConfigFromBinary(fd, counts, config);
util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0]));
static void UpdateConfigFromBinary(const BinaryFormat &file, const std::vector<uint64_t> &counts, uint64_t offset, Config &config) {
Quant::UpdateConfigFromBinary(file, offset, config);
// Currently the unigram pointers are not compresssed, so there will only be a header for order > 2.
if (counts.size() > 2) Bhiksha::UpdateConfigFromBinary(fd, config);
if (counts.size() > 2)
Bhiksha::UpdateConfigFromBinary(file, offset + Quant::Size(counts.size(), config) + Unigram::Size(counts[0]), config);
}
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
@ -60,9 +60,7 @@ template <class Quant, class Bhiksha> class TrieSearch {
uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config);
void LoadedBinary();
void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing);
void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing);
unsigned char Order() const {
return middle_end_ - middle_begin_ + 2;
@ -103,7 +101,7 @@ template <class Quant, class Bhiksha> class TrieSearch {
}
private:
friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing);
// Middles are managed manually so we can delay construction and they don't have to be copyable.
void FreeMiddles() {

View File

@ -62,8 +62,6 @@ class Unigram {
return unigram_;
}
void LoadedBinary() {}
UnigramPointer Find(WordIndex word, NodeRange &next) const {
UnigramValue *val = unigram_ + word;
next.begin = val->next;
@ -108,8 +106,6 @@ template <class Bhiksha> class BitPackedMiddle : public BitPacked {
void FinishedLoading(uint64_t next_end, const Config &config);
void LoadedBinary() { bhiksha_.LoadedBinary(); }
util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) const;
util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) {
@ -138,14 +134,9 @@ class BitPackedLongest : public BitPacked {
BaseInit(base, max_vocab, quant_bits);
}
void LoadedBinary() {}
util::BitAddress Insert(WordIndex word);
util::BitAddress Find(WordIndex word, const NodeRange &node) const;
private:
uint8_t quant_bits_;
};
} // namespace trie

View File

@ -50,6 +50,10 @@ class PartialViewProxy {
const void *Data() const { return inner_.Data(); }
void *Data() { return inner_.Data(); }
friend void swap(PartialViewProxy first, PartialViewProxy second) {
std::swap_ranges(reinterpret_cast<char*>(first.Data()), reinterpret_cast<char*>(first.Data()) + first.attention_size_, reinterpret_cast<char*>(second.Data()));
}
private:
friend class util::ProxyIterator<PartialViewProxy>;

View File

@ -125,13 +125,13 @@ class Model {
void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }
// Requires in_state != out_state
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
// Requires in_state != out_state
virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
// Prefer to use FullScore. The context words should be provided in reverse order.
virtual FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0;
virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0;
unsigned char Order() const { return order_; }

View File

@ -32,7 +32,8 @@ const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
// Sadly some LMs have <UNK>.
const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count) {
void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint64_t offset) {
util::SeekOrThrow(fd, offset);
// Check that we're at the right place by reading <unk> which is always first.
char check_unk[6];
util::ReadOrThrow(fd, check_unk, 6);
@ -80,11 +81,6 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
buffer_.push_back(0);
}
void WriteWordsWrapper::Write(int fd, uint64_t start) {
util::SeekOrThrow(fd, start);
util::WriteOrThrow(fd, buffer_.data(), buffer_.size());
}
SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {}
uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) {
@ -100,6 +96,12 @@ void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size
saw_unk_ = false;
}
void SortedVocabulary::Relocate(void *new_start) {
std::size_t delta = end_ - begin_;
begin_ = reinterpret_cast<uint64_t*>(new_start) + 1;
end_ = begin_ + delta;
}
void SortedVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries) {
enumerate_ = to;
if (enumerate_) {
@ -147,11 +149,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
bound_ = end_ - begin_ + 1;
}
void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) {
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
SetSpecial(Index("<s>"), Index("</s>"), 0);
bound_ = end_ - begin_ + 1;
if (have_words) ReadWords(fd, to, bound_);
if (have_words) ReadWords(fd, to, bound_, offset);
}
namespace {
@ -179,6 +181,11 @@ void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::siz
saw_unk_ = false;
}
void ProbingVocabulary::Relocate(void *new_start) {
header_ = static_cast<detail::ProbingVocabularyHeader*>(new_start);
lookup_.Relocate(static_cast<uint8_t*>(new_start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)));
}
void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max_entries*/) {
enumerate_ = to;
if (enumerate_) {
@ -206,12 +213,11 @@ void ProbingVocabulary::InternalFinishedLoading() {
SetSpecial(Index("<s>"), Index("</s>"), 0);
}
void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) {
UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
lookup_.LoadedBinary();
bound_ = header_->bound;
SetSpecial(Index("<s>"), Index("</s>"), 0);
if (have_words) ReadWords(fd, to, bound_);
if (have_words) ReadWords(fd, to, bound_, offset);
}
void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {

View File

@ -36,7 +36,7 @@ class WriteWordsWrapper : public EnumerateVocab {
void Add(WordIndex index, const StringPiece &str);
void Write(int fd, uint64_t start);
const std::string &Buffer() const { return buffer_; }
private:
EnumerateVocab *inner_;
@ -71,6 +71,8 @@ class SortedVocabulary : public base::Vocabulary {
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
void Relocate(void *new_start);
void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries);
WordIndex Insert(const StringPiece &str);
@ -83,15 +85,13 @@ class SortedVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
private:
uint64_t *begin_, *end_;
WordIndex bound_;
WordIndex highest_value_;
bool saw_unk_;
EnumerateVocab *enumerate_;
@ -140,6 +140,8 @@ class ProbingVocabulary : public base::Vocabulary {
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
void Relocate(void *new_start);
void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries);
WordIndex Insert(const StringPiece &str);
@ -152,7 +154,7 @@ class ProbingVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
private:
void InternalFinishedLoading();

View File

@ -242,9 +242,9 @@ void FeatureRegistry::PrintFF() const
Map::const_iterator iter;
for (iter = registry_.begin(); iter != registry_.end(); ++iter) {
const string &ffName = iter->first;
std::cerr << ffName << std::endl;
std::cerr << ffName << " ";
}
std::cerr << std::endl;
}
} // namespace Moses

View File

@ -12,6 +12,7 @@ if $(with-dlib) {
alias headers : ../util//kenutil : : : $(max-factors) $(dlib) ;
alias ThreadPool : ThreadPool.cpp ;
alias Util : Util.cpp Timer.cpp ;
if [ option.get "with-synlm" : no : yes ] = yes
{

View File

@ -94,9 +94,16 @@ if $(with-nplm) {
local with-dalm = [ option.get "with-dalm" ] ;
if $(with-dalm) {
lib dalm : : <search>$(with-dalm)/lib ;
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
obj DALM.o : DALMWrapper.cpp dalm MurmurHash3 ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
alias dalmALIAS : DALM.o dalm MurmurHash3 : : : <define>LM_DALM ;
if [ path.exists $(with-dalm)/lib/libMurmurHash3.a ] {
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
alias dalm-libs : dalm MurmurHash3 ;
} else {
alias dalm-libs : dalm ;
}
obj DALM.o : DALMWrapper.cpp dalm-libs ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
alias dalmALIAS : DALM.o dalm-libs : : : <define>LM_DALM ;
dependencies += dalmALIAS ;
lmmacros += LM_DALM ;
}

View File

@ -202,6 +202,7 @@ Parameter::Parameter()
AddParam("placeholder-factor", "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model");
AddParam("no-cache", "Disable all phrase-table caching. Default = false (ie. enable caching)");
AddParam("adjacent-only", "Only allow hypotheses which are adjacent to current derivation. ITG without block moves");
}

View File

@ -250,6 +250,11 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
return true;
}
if (StaticData::Instance().AdjacentOnly() &&
!hypoBitmap.IsAdjacent(range.GetStartPos(), range.GetEndPos())) {
return false;
}
bool leftMostEdge = (hypoFirstGapPos == startPos);
// any length extension is okay if starting at left-most edge
if (leftMostEdge) {

View File

@ -253,6 +253,11 @@ void SearchNormal::ExpandAllHypotheses(const Hypothesis &hypothesis, size_t star
expectedScore += m_transOptColl.GetFutureScore().CalcFutureScore( hypothesis.GetWordsBitmap(), startPos, endPos );
}
if (StaticData::Instance().AdjacentOnly() &&
!hypothesis.GetWordsBitmap().IsAdjacent(startPos, endPos)) {
return;
}
// loop through all translation options
const TranslationOptionList &transOptList = m_transOptColl.GetTranslationOptionList(WordsRange(startPos, endPos));
TranslationOptionList::const_iterator iter;

View File

@ -385,6 +385,8 @@ bool StaticData::LoadData(Parameter *parameter)
SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false);
SetBooleanParameter( &m_adjacentOnly, "adjacent-only", false);
// minimum Bayes risk decoding
SetBooleanParameter( &m_mbr, "minimum-bayes-risk", false );
m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ?

View File

@ -197,6 +197,7 @@ protected:
FactorType m_placeHolderFactor;
bool m_useLegacyPT;
bool m_adjacentOnly;
FeatureRegistry m_registry;
@ -753,6 +754,8 @@ public:
return &m_soft_matches_map_reverse;
}
bool AdjacentOnly() const
{ return m_adjacentOnly; }
};
}

View File

@ -58,8 +58,7 @@ const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY(
size_t hash = hash_value(src);
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);
if (iter == cache.end()) {
@ -179,7 +178,7 @@ void PhraseDictionary::ReduceCache() const
// find cutoff for last used time
priority_queue< clock_t > lastUsedTimes;
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.begin();
while( iter != cache.end() ) {
lastUsedTimes.push( iter->second.second );
@ -193,7 +192,7 @@ void PhraseDictionary::ReduceCache() const
iter = cache.begin();
while( iter != cache.end() ) {
if (iter->second.second < cutoffLastUsedTime) {
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iterRemove = iter++;
CacheColl::iterator iterRemove = iter++;
delete iterRemove->second.first;
cache.erase(iterRemove);
} else iter++;

View File

@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <stdexcept>
#include <vector>
#include <string>
#include <boost/unordered_map.hpp>
#ifdef WITH_THREADS
#include <boost/thread/tss.hpp>
@ -54,7 +55,7 @@ class ChartCellCollectionBase;
class ChartRuleLookupManager;
class ChartParser;
class CacheColl : public std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
class CacheColl : public boost::unordered_map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
{
// 1st = hash of source phrase/ address of phrase-table node
// 2nd = all translations

View File

@ -59,7 +59,7 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
CacheColl &cache = GetCache();
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);
if (iter != cache.end()) {

View File

@ -165,7 +165,7 @@ const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollection(
CacheColl &cache = GetCache();
size_t hash = (size_t) ptNode->GetFilePos();
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);

View File

@ -63,6 +63,22 @@ int WordsBitmap::GetFutureCosts(int lastPos) const
return sum;
}
// Whether the span [startPos, endPos] is adjacent to the words covered so
// far: trivially true while nothing is covered, otherwise the span must
// either start at the last gap position or end at the first gap position.
bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const
{
  if (GetNumWordsCovered() == 0) {
    return true; // empty hypothesis: any expansion counts as adjacent
  }

  const size_t firstGap = GetFirstGapPos();
  const size_t lastGap = GetLastGapPos();

  return (startPos == lastGap) || (endPos == firstGap);
}
}

View File

@ -132,6 +132,8 @@ public:
return NOT_FOUND;
}
bool IsAdjacent(size_t startPos, size_t endPos) const;
//! whether a word has been translated at a particular position
bool GetValue(size_t pos) const {
return m_bitmap[pos];

View File

@ -1,6 +1,5 @@
// $Id$
//#include "beammain.h"
#include "domain.h"
#include "DomainFeature.h"
#include "ExtractionPhrasePair.h"
#include "tables-core.h"
#include "InputFileStream.h"
#include "SafeGetline.h"
@ -26,7 +25,7 @@ void Domain::load( const std::string &domainFileName )
int lineNumber;
if (domainSpecLine.size() != 2 ||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
exit(1);
}
// store
@ -50,29 +49,34 @@ string Domain::getDomainOfSentence( int sentenceId ) const
return "undefined";
}
DomainFeature::DomainFeature(const string& domainFile)
DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
{
//process domain file
m_domain.load(domainFile);
}
void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
float count,
int sentenceId) const
{
std::string value = m_domain.getDomainOfSentence(sentenceId);
phrasePair.AddProperty(m_propertyKey, value, count);
}
void DomainFeature::add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const
{
map< string, float > domainCount;
for(size_t i=0; i<context.phrasePair.size(); i++) {
string d = m_domain.getDomainOfSentence(context.phrasePair[i]->sentenceId );
if (domainCount.find( d ) == domainCount.end()) {
domainCount[d] = context.phrasePair[i]->count;
} else {
domainCount[d] += context.phrasePair[i]->count;
}
}
add(domainCount, context.count, context.maybeLog, denseValues, sparseValues);
const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
assert( domainCount != NULL );
add(*domainCount,
context.phrasePair.GetCount(),
context.maybeLog,
denseValues, sparseValues);
}
void SubsetDomainFeature::add(const map<string,float>& domainCount,float count,
void SubsetDomainFeature::add(const map<string,float>& domainCount,
float count,
const MaybeLog& maybeLog,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const
@ -152,7 +156,6 @@ void IndicatorDomainFeature::add(const map<string,float>& domainCount,float coun
denseValues.push_back(maybeLog(2.718));
}
}
}
void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
@ -166,12 +169,5 @@ void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,floa
}
}
bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
{
return m_domain.getDomainOfSentence(lhs.sentenceId) ==
m_domain.getDomainOfSentence( rhs.sentenceId);
}
}

View File

@ -34,13 +34,17 @@ class DomainFeature : public ScoreFeature
public:
DomainFeature(const std::string& domainFile);
bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
float count,
int sentenceId) const;
void add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
protected:
/** Overriden in subclass */
/** Overridden in subclass */
virtual void add(const std::map<std::string,float>& domainCounts, float count,
const MaybeLog& maybeLog,
std::vector<float>& denseValues,
@ -49,6 +53,8 @@ protected:
Domain m_domain;
const std::string m_propertyKey;
};
class SubsetDomainFeature : public DomainFeature

View File

@ -0,0 +1,327 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <sstream>
#include "ExtractionPhrasePair.h"
#include "SafeGetline.h"
#include "tables-core.h"
#include "score.h"
#include "moses/Util.h"
#include <cstdlib>
using namespace std;
namespace MosesTraining {
extern Vocabulary vcbT;
extern Vocabulary vcbS;
extern bool hierarchicalFlag;
// Construct a phrase pair from its source/target phrases and the first
// observed target-to-source alignment with its count and pcfg score sum.
// The alignment pointer is stored (and later deleted in Clear()).
ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
    const PHRASE *phraseTarget,
    ALIGNMENT *targetToSourceAlignment,
    float count, float pcfgSum ) :
  m_phraseSource(phraseSource),
  m_phraseTarget(phraseTarget),
  m_count(count),
  m_pcfgSum(pcfgSum)
{
  // NOTE(review): these asserts require the phrases to be *empty* at
  // construction time -- presumably they are filled in afterwards by the
  // caller; confirm against the call sites.
  assert(phraseSource->empty());
  assert(phraseTarget->empty());

  // (removed redundant re-assignment of m_count/m_pcfgSum -- both are
  // already set in the initializer list above)

  // Register the initial alignment and remember it as the most recently
  // seen one, so IncrementPrevious() can update it cheaply.
  std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
    m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );

  m_lastTargetToSourceAlignment = insertedAlignment.first;
  m_lastCount = m_count;
  m_lastPcfgSum = m_pcfgSum;

  m_isValid = true;
}
// Destructor: releases the phrases, alignments and property maps via Clear().
ExtractionPhrasePair::~ExtractionPhrasePair( ) {
  Clear();
}
// Record another observation of this phrase pair.
// return value: true if the given alignment was seen for the first time and thus will be stored,
// false if it was present already (the pointer may thus be deleted)
bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
                                float count, float pcfgSum )
{
  m_count += count;
  m_pcfgSum += pcfgSum;

  m_lastCount = count;
  m_lastPcfgSum = pcfgSum;

  // fast path: compare (by content) against the most recently seen alignment
  std::map<ALIGNMENT*,float>::iterator iter = m_lastTargetToSourceAlignment;
  if ( *(iter->first) == *targetToSourceAlignment ) {
    iter->second += count;
    return false;
  } else {
    // NOTE(review): the map is keyed on the ALIGNMENT pointer, so this
    // insert only reports "already exists" for the *same* pointer; equal
    // alignments at different addresses are stored as separate entries --
    // confirm this is intended.
    std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
      m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
    if ( !insertedAlignment.second ) {
      // the alignment already exists: increment count
      insertedAlignment.first->second += count;
      return false;
    }
    m_lastTargetToSourceAlignment = insertedAlignment.first;
  }

  return true;
}
// Add count/pcfgSum to the totals, to the most recently registered
// alignment, and to the last-seen value of every property (i.e. the same
// observation occurred again).
void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
{
  m_count += count;
  m_pcfgSum += pcfgSum;
  // the alignment registered by the last Add()/constructor call
  m_lastTargetToSourceAlignment->second += count;

  // properties: bump the cached last-seen value of every property key
  for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
        iter !=m_properties.end(); ++iter ) {
    LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
    (*lastPropertyValue)->second += count;
  }

  m_lastCount = count;
  m_lastPcfgSum = pcfgSum;
}
// Check for lexical match
// and in case of SCFG rules for equal non-terminal alignment.
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
                                    const PHRASE *otherPhraseTarget,
                                    ALIGNMENT *otherTargetToSourceAlignment ) const
{
  // Short-circuit in the same order as before: target phrase first,
  // then source phrase, then the (SCFG-only) alignment check.
  return (*otherPhraseTarget == *m_phraseTarget)
         && (*otherPhraseSource == *m_phraseSource)
         && MatchesAlignment( otherTargetToSourceAlignment );
}
// Check for lexical match
// and in case of SCFG rules for equal non-terminal alignment,
// reporting which comparison failed first via the boolean indicators.
// (Checked in the order source - target - alignment; indicators that come
// after the first failing check are left untouched.)
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
                                    const PHRASE *otherPhraseTarget,
                                    ALIGNMENT *otherTargetToSourceAlignment,
                                    bool &sourceMatch,
                                    bool &targetMatch,
                                    bool &alignmentMatch ) const
{
  sourceMatch = (*otherPhraseSource == *m_phraseSource);
  if (!sourceMatch) {
    return false;
  }

  targetMatch = (*otherPhraseTarget == *m_phraseTarget);
  if (!targetMatch) {
    return false;
  }

  alignmentMatch = MatchesAlignment(otherTargetToSourceAlignment);
  return alignmentMatch;
}
// Check for equal non-terminal alignment in case of SCFG rules.
// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
{
  // phrase-based extraction: word alignments are not compared
  if (!hierarchicalFlag) return true;

  // all or none of the phrasePair's word alignment matrices match, so just pick one
  const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;

  // the target phrase carries one extra symbol (skipped by the loop below)
  assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
  assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());

  // loop over all symbols but the left hand side of the rule
  for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) {
    if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
      // NOTE(review): begin() is dereferenced before the size()==1 check
      // below -- assumes a non-terminal always has at least one alignment
      // point; confirm.
      size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin());
      size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin());
      // non-terminals must be aligned one-to-one and to the same position
      if (thisTargetToSourceAlignment->at(i).size() != 1 ||
          otherTargetToSourceAlignment->at(i).size() != 1 ||
          thisAlign != otherAlign) {
        return false;
      }
    }
  }

  return true;
}
// Release all owned memory (phrases, alignments, property maps) and reset
// the counts; the object is marked invalid afterwards.
void ExtractionPhrasePair::Clear()
{
  delete m_phraseSource;
  delete m_phraseTarget;

  m_count = 0.0f;
  m_pcfgSum = 0.0f;

  // alignment objects are owned by this phrase pair
  for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
        iter!=m_targetToSourceAlignments.end(); ++iter) {
    delete iter->first;
  }
  m_targetToSourceAlignments.clear();

  // each property owns its value map and its cached last-value iterator
  for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
        iter!=m_properties.end(); ++iter) {
    delete (iter->second).second;
    delete (iter->second).first;
  }
  m_properties.clear();

  m_lastCount = 0.0f;
  m_lastPcfgSum = 0.0f;
  // begin() of the now-empty map equals end(): there is no "last" alignment
  m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();

  m_isValid = false;
}
// Parse a "{{key value}} {{key value}} ..." properties string and register
// each key/value pair with the given count.
void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
{
  if (propertiesString.empty()) {
    return;
  }

  vector<std::string> toks;
  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
  // toks[0] is whatever precedes the first "{{"; start at 1
  for (size_t i = 1; i < toks.size(); ++i) {
    std::string &tok = toks[i];
    if (tok.empty()) {
      continue;
    }
    // strip the trailing "}}": rfind locates the last brace at len-1, and
    // substr(0, endPos-1) keeps everything before the two closing braces.
    // NOTE(review): assumes a closing "}}" is present -- endPos would be
    // npos otherwise; confirm the input format guarantees it.
    size_t endPos = tok.rfind("}");
    tok = tok.substr(0, endPos - 1);
    // split into property key and value at the first space
    vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
    assert(keyValue.size() == 2);
    AddProperty(keyValue[0], keyValue[1], count);
  }
}
// Return the stored alignment with the highest count (ties broken by the
// larger alignment per operator> on ALIGNMENT), or NULL if none exists.
const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
{
  float bestAlignmentCount = -1;

  std::map<ALIGNMENT*,float>::const_iterator bestAlignment = m_targetToSourceAlignments.end();

  for (std::map<ALIGNMENT*,float>::const_iterator iter=m_targetToSourceAlignments.begin();
       iter!=m_targetToSourceAlignments.end(); ++iter) {
    // NOTE(review): if a stored count ever equals the -1 sentinel, the tie
    // branch can dereference bestAlignment while it is still end() --
    // assumes counts are non-negative; confirm.
    if ( (iter->second > bestAlignmentCount) ||
         ( (iter->second == bestAlignmentCount) &&
           (*(iter->first) > *(bestAlignment->first)) ) ) {
      bestAlignmentCount = iter->second;
      bestAlignment = iter;
    }
  }

  if ( bestAlignment == m_targetToSourceAlignments.end()) {
    return NULL;
  }

  return bestAlignment->first;
}
// Return a pointer to the value string with the highest count for the given
// property key (ties broken by the lexicographically larger value), or NULL
// if the key is unknown or has no values.
const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
{
  float bestPropertyCount = -1;

  const PROPERTY_VALUES *allPropertyValues = GetProperty( key );

  if ( allPropertyValues == NULL ) {
    return NULL;
  }

  PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end();

  for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
       iter!=allPropertyValues->end(); ++iter) {
    // NOTE(review): if a count ever equals the -1 sentinel, the tie branch
    // can dereference bestPropertyValue while it is still end() -- assumes
    // counts are non-negative; confirm.
    if ( (iter->second > bestPropertyCount) ||
         ( (iter->second == bestPropertyCount) &&
           (iter->first > bestPropertyValue->first) ) ) {
      bestPropertyCount = iter->second;
      bestPropertyValue = iter;
    }
  }

  if ( bestPropertyValue == allPropertyValues->end()) {
    return NULL;
  }

  return &(bestPropertyValue->first);
}
std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
{
const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
if ( allPropertyValues == NULL ) {
return "";
}
std::ostringstream oss;
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
iter!=allPropertyValues->end(); ++iter) {
if (iter!=allPropertyValues->begin()) {
oss << " ";
}
oss << iter->first;
oss << " ";
oss << iter->second;
}
std::string allPropertyValuesString(oss.str());
return allPropertyValuesString;
}
}

View File

@ -0,0 +1,162 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include "tables-core.h"
#include <vector>
#include <set>
#include <map>
namespace MosesTraining {
typedef std::vector< std::set<size_t> > ALIGNMENT;
// A phrase pair collected during phrase/rule extraction, together with its
// accumulated count, pcfg score sum, the set of observed target-to-source
// alignments, and arbitrary string-valued properties (each value carrying a
// count of its own).
class ExtractionPhrasePair {

protected:

  // per-key property storage: value string -> accumulated count
  typedef std::map<std::string,float> PROPERTY_VALUES;
  // cached iterator pointing at the most recently updated value of a key
  typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;

  bool m_isValid;                // set to false by Clear()

  const PHRASE *m_phraseSource;  // owned; deleted in Clear()
  const PHRASE *m_phraseTarget;  // owned; deleted in Clear()

  float m_count;
  float m_pcfgSum;

  // owned ALIGNMENT objects with their counts (keyed by pointer)
  std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
  // property key -> (owned value/count map, owned last-seen-value iterator)
  std::map<std::string,
      std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;

  // count/pcfgSum of the most recent observation (used by IncrementPrevious)
  float m_lastCount;
  float m_lastPcfgSum;
  std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;

public:

  // Stores the alignment pointer (and the phrases, deleted in Clear()).
  ExtractionPhrasePair( const PHRASE *phraseSource,
                        const PHRASE *phraseTarget,
                        ALIGNMENT *targetToSourceAlignment,
                        float count, float pcfgSum );

  ~ExtractionPhrasePair();

  // Record another observation; returns true iff the alignment pointer was
  // newly stored (on false the caller may delete it).
  bool Add( ALIGNMENT *targetToSourceAlignment,
            float count, float pcfgSum );

  // Re-count the most recent observation with an additional count/pcfgSum.
  void IncrementPrevious( float count, float pcfgSum );

  // Lexical match (and, for SCFG rules, equal non-terminal alignment).
  bool Matches( const PHRASE *otherPhraseSource,
                const PHRASE *otherPhraseTarget,
                ALIGNMENT *otherTargetToSourceAlignment ) const;

  // As above, additionally reporting which comparison failed first.
  bool Matches( const PHRASE *otherPhraseSource,
                const PHRASE *otherPhraseTarget,
                ALIGNMENT *otherTargetToSourceAlignment,
                bool &sourceMatch,
                bool &targetMatch,
                bool &alignmentMatch ) const;

  // Equal non-terminal alignment check (always true for non-SCFG).
  bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;

  // Release all owned memory; the object is invalid afterwards.
  void Clear();

  bool IsValid() const {
    return m_isValid;
  }

  const PHRASE *GetSource() const {
    return m_phraseSource;
  }

  const PHRASE *GetTarget() const {
    return m_phraseTarget;
  }

  float GetCount() const {
    return m_count;
  }

  float GetPcfgScore() const {
    return m_pcfgSum;
  }

  // Number of distinct property keys.
  // (fixed: dropped the meaningless top-level const on the return type)
  size_t GetNumberOfProperties() const {
    return m_properties.size();
  }

  // Value->count map for the given property key, or NULL if unknown.
  const std::map<std::string,float> *GetProperty( const std::string &key ) const {
    std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
    iter = m_properties.find(key);
    if (iter == m_properties.end()) {
      return NULL;
    } else {
      return iter->second.first;
    }
  }

  const ALIGNMENT *FindBestAlignmentTargetToSource() const;

  const std::string *FindBestPropertyValue(const std::string &key) const;

  std::string CollectAllPropertyValues(const std::string &key) const;

  // Parse a "{{key value}}..." string and register each pair with count.
  void AddProperties( const std::string &str, float count );

  // Register a single key/value observation with the given count, keeping
  // the cached last-seen-value iterator up to date for IncrementPrevious().
  void AddProperty( const std::string &key, const std::string &value, float count )
  {
    std::map<std::string,
             std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
    if ( iter == m_properties.end() ) {
      // key not found: insert property key and value
      PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
      std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
      LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
      m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
    } else {
      LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
      if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before
        // property key-value pair exists already: add count
        (*lastPropertyValue)->second += count;
      } else { // need to check whether the property key-value pair has appeared before (insert if not)
        // property key exists, but not in combination with this value:
        // add new value with count
        PROPERTY_VALUES *propertyValues = (iter->second).first;
        std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
        if ( !insertedProperty.second ) { // property value for this key appeared before: add count
          insertedProperty.first->second += count;
        }
        LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
        delete (iter->second).second;
        (iter->second).second = lastPropertyValue;
      }
    }
  }

};
}

View File

@ -1,50 +1,30 @@
#include "InternalStructFeature.h"
#include <map>
using namespace std;
namespace MosesTraining
{
InternalStructFeature::InternalStructFeature()
:m_type(0){
//cout<<"InternalStructFeature: Construct "<<m_type<<"\n";
}
bool InternalStructFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const{
//cout<<"InternalStructFeature: Equals\n";
//don't know what it's used for and what we should compare
//-> if the dense score is the same
//-> if the sparse feature is set
// compare phrases? with the internalStrucutre string?
/** Return true if the two phrase pairs are equal from the point of this feature. Assume
that they already compare true according to PhraseAlignment.equals()
**/
/* if(lhs.ghkmParse==rhs.ghkmParse)
return true;
else
return false;
*/
//return true;
}
void InternalStructFeature::add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const{
for(size_t i=0; i<context.phrasePair.size(); i++) {
add(&context.phrasePair[i]->treeFragment, denseValues, sparseValues);
}
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const {
const std::map<std::string,float> *allTrees = context.phrasePair.GetProperty("Tree"); // or would we rather want to take the most frequent one only?
for ( std::map<std::string,float>::const_iterator iter=allTrees->begin();
iter!=allTrees->end(); ++iter ) {
add(&(iter->first), iter->second, denseValues, sparseValues);
}
}
void InternalStructFeatureDense::add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const{
void InternalStructFeatureDense::add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const {
//cout<<"Dense: "<<*internalStruct<<endl;
size_t start=0;
int countNP=0;
while((start = internalStruct->find("NP", start)) != string::npos) {
countNP++;
while((start = treeFragment->find("NP", start)) != string::npos) {
countNP += count;
start+=2; //length of "NP"
}
//should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
@ -53,21 +33,21 @@ void InternalStructFeatureDense::add(std::string *internalStruct,
}
void InternalStructFeatureSparse::add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const{
//cout<<"Sparse: "<<*internalStruct<<endl;
if(internalStruct->find("VBZ")!=std::string::npos)
sparseValues["NTVBZ"] = 1;
if(internalStruct->find("VBD")!=std::string::npos)
sparseValues["NTVBD"] = 1;
if(internalStruct->find("VBP")!=std::string::npos)
sparseValues["NTVBP"] = 1;
if(internalStruct->find("PP")!=std::string::npos)
sparseValues["NTPP"] = 1;
if(internalStruct->find("SBAR")!=std::string::npos)
sparseValues["NTSBAR"] = 1;
void InternalStructFeatureSparse::add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const {
//cout<<"Sparse: "<<*internalStruct<<endl;
if(treeFragment->find("VBZ")!=std::string::npos)
sparseValues["NTVBZ"] += count;
if(treeFragment->find("VBD")!=std::string::npos)
sparseValues["NTVBD"] += count;
if(treeFragment->find("VBP")!=std::string::npos)
sparseValues["NTVBP"] += count;
if(treeFragment->find("PP")!=std::string::npos)
sparseValues["NTPP"] += count;
if(treeFragment->find("SBAR")!=std::string::npos)
sparseValues["NTSBAR"] += count;
}

View File

@ -21,22 +21,19 @@ namespace MosesTraining
class InternalStructFeature : public ScoreFeature
{
public:
InternalStructFeature();
/** Return true if the two phrase pairs are equal from the point of this feature. Assume
that they already compare true according to PhraseAlignment.equals()
**/
bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
InternalStructFeature() : m_type(0) {};
/** Add the values for this feature function. */
void add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
protected:
/** Overriden in subclass */
virtual void add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const = 0;
/** Overridden in subclass */
virtual void add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const = 0;
int m_type;
};
@ -47,9 +44,10 @@ public:
InternalStructFeatureDense()
:InternalStructFeature(){m_type=1;} //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
protected:
virtual void add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
virtual void add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
};
class InternalStructFeatureSparse : public InternalStructFeature
@ -58,9 +56,10 @@ public:
InternalStructFeatureSparse()
:InternalStructFeature(){m_type=2;}// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
protected:
virtual void add(std::string *internalStruct,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
virtual void add(const std::string *treeFragment,
float count,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
};
}

Some files were not shown because too many files have changed in this diff Show More