mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
improvements to web analysis, fixes to syntax wrappers
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3633 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
88eaf49c5e
commit
85a5a13e4c
171
scripts/ems/biconcor/Alignment.cpp
Normal file
171
scripts/ems/biconcor/Alignment.cpp
Normal file
@ -0,0 +1,171 @@
|
||||
#include "Alignment.h"
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Read a word-alignment file ("s-t s-t ..." per sentence) in two passes:
// pass 1 counts alignment points and sentences, pass 2 fills m_array
// (flat s,t,s,t,... byte pairs) and m_sentenceEnd (per sentence, the index
// of its last alignment point's source entry in m_array).
void Alignment::Create( string fileName )
{
  ifstream textFile;
  char line[LINE_MAX_LENGTH];

  // pass 1: count the number of alignment points and sentences
  textFile.open(fileName.c_str());
  if (!textFile) {
    cerr << "ERROR: could not open alignment file " << fileName << endl;
    return;
  }
  istream *fileP = &textFile;
  m_size = 0;
  m_sentenceCount = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector<string> alignmentSequence = Tokenize( line );
    m_size += alignmentSequence.size();
    m_sentenceCount++;
  }
  textFile.close();
  // BUGFIX: close() does not clear the stream's eof/fail flags; without
  // clear() the second read loop below can terminate immediately.
  textFile.clear();
  cerr << m_size << " alignment points" << endl;

  // allocate memory
  m_array = (char*) calloc( sizeof( char ), m_size*2 );
  m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );

  // pass 2: fill the arrays
  int alignmentPointIndex = 0;
  int sentenceId = 0;
  textFile.open(fileName.c_str());
  fileP = &textFile;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector<string> alignmentSequence = Tokenize( line );
    for(size_t i=0; i<alignmentSequence.size(); i++) {
      int s,t;
      // each token must be "source-target"; sscanf must convert BOTH fields
      // (the original only rejected a zero-conversion result and then stored
      // uninitialized s/t for partially matched tokens)
      if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2) {
        cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceId << endl;
        continue; // skip malformed points instead of storing garbage
      }
      m_array[alignmentPointIndex++] = (char) s;
      m_array[alignmentPointIndex++] = (char) t;
    }
    m_sentenceEnd[ sentenceId++ ] = alignmentPointIndex - 2;
  }
  textFile.close();
  cerr << "done reading " << (alignmentPointIndex/2) << " alignment points, " << sentenceId << " sentences." << endl;
}
|
||||
|
||||
// Release the buffers allocated by Create()/Load().
// NOTE(review): if neither Create() nor Load() was ever called, m_array and
// m_sentenceEnd are uninitialized and these free() calls are undefined
// behavior — presumably callers always load first; verify.
Alignment::~Alignment()
{
  free(m_array);
  free(m_sentenceEnd);
}
|
||||
|
||||
// Split a C string on runs of spaces/tabs into a vector of tokens.
// Leading, trailing and repeated separators produce no empty tokens.
vector<string> Alignment::Tokenize( const char input[] ) {
  vector< string > token;
  int pos = 0;
  while (input[pos] != '\0') {
    // skip any run of separators
    while (input[pos] == ' ' || input[pos] == '\t') {
      pos++;
    }
    if (input[pos] == '\0') {
      break;
    }
    // scan one token
    int tokenStart = pos;
    while (input[pos] != '\0' && input[pos] != ' ' && input[pos] != '\t') {
      pos++;
    }
    token.push_back( string( input+tokenStart, pos-tokenStart ) );
  }
  return token;
}
|
||||
|
||||
// Map the source span [source_start,source_end] of the given sentence to the
// minimal target span [target_start,target_end] covering all its alignment
// points. Returns false if the span has no alignment points, or if a target
// word inside the span aligns outside the source span (inconsistent pair).
// On success, pre_null/post_null report how many unaligned target words
// directly precede/follow the target span.
bool Alignment::PhraseAlignment( INDEX sentence, char target_length,
                                 char source_start, char source_end,
                                 char &target_start, char &target_end,
                                 char &pre_null, char &post_null ) {
  vector< char > alignedTargetWords;

  // get index of the first alignment point of this sentence
  INDEX sentenceStart = 0;
  if (sentence > 0) {
    sentenceStart = m_sentenceEnd[ sentence-1 ] + 2;
  }

  // get target phrase boundaries
  target_start = target_length; // sentinel: no alignment point seen yet
  target_end = 0;
  for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
    char source = m_array[ ap ];
    if (source >= source_start && source <= source_end ) {
      char target = m_array[ ap+1 ];
      if (target < target_start) target_start = target;
      if (target > target_end ) target_end = target;
    }
  }
  if (target_start == target_length) {
    return false; // done if no alignment points
  }

  // consistency check: no target word inside the span may align to a source
  // word outside the span
  for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
    char target = m_array[ ap+1 ];
    if (target >= target_start && target <= target_end ) {
      char source = m_array[ ap ];
      if (source < source_start || source > source_end) {
        return false; // alignment point out of range
      }
    }
  }

  // mark which target words are unaligned
  for( int i=0; i<target_length; i++ ) {
    m_unaligned[i] = true;
  }
  for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
    char target = m_array[ ap+1 ];
    m_unaligned[ target ] = false;
  }

  // prior unaligned words
  // BUGFIX: loop variable is int, not char — plain char is unsigned on some
  // platforms, where "target >= 0" would always hold and the loop would not
  // terminate correctly.
  pre_null = 0;
  for(int target = target_start-1; target >= 0 && m_unaligned[ target ]; target--) {
    pre_null++;
  }

  // post unaligned words
  post_null = 0;
  for(int target = target_end+1; target < target_length && m_unaligned[ target ]; target++) {
    post_null++;
  }
  return true;
}
|
||||
|
||||
// Serialize the alignment to <fileName>.align (binary layout:
// m_size, m_array, m_sentenceCount, m_sentenceEnd) — counterpart of Load().
void Alignment::Save( string fileName ) {
  // "wb": binary mode, so the dump is not mangled on platforms that
  // translate line endings in text mode
  FILE *pFile = fopen ( (fileName + ".align").c_str() , "wb" );
  if (pFile == NULL) {
    // BUGFIX: the handle was previously passed to fwrite unchecked
    cerr << "ERROR: could not open " << fileName << ".align for writing" << endl;
    return;
  }

  fwrite( &m_size, sizeof(INDEX), 1, pFile );
  fwrite( m_array, sizeof(char), m_size*2, pFile ); // corpus

  fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
  fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
  fclose( pFile );
}
|
||||
|
||||
// Deserialize alignment data from <fileName>.align — counterpart of Save().
void Alignment::Load( string fileName ) {
  FILE *pFile = fopen ( (fileName + ".align").c_str() , "rb" );
  cerr << "loading from " << fileName << ".align" << endl;
  if (pFile == NULL) {
    // BUGFIX: a NULL handle was previously passed straight to fread
    cerr << "ERROR: could not open " << fileName << ".align" << endl;
    return;
  }

  // BUGFIX: check the header reads so a truncated/corrupt file does not
  // silently yield garbage sizes (and huge calloc requests)
  if (fread( &m_size, sizeof(INDEX), 1, pFile ) != 1) {
    cerr << "ERROR: truncated alignment file " << fileName << ".align" << endl;
    fclose( pFile );
    return;
  }
  cerr << "alignment points in corpus: " << m_size << endl;
  m_array = (char*) calloc( sizeof(char), m_size*2 );
  fread( m_array, sizeof(char), m_size*2, pFile ); // corpus

  if (fread( &m_sentenceCount, sizeof(INDEX), 1, pFile ) != 1) {
    cerr << "ERROR: truncated alignment file " << fileName << ".align" << endl;
    fclose( pFile );
    return;
  }
  cerr << "sentences in corpus: " << m_sentenceCount << endl;
  m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount );
  fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
  fclose( pFile );
  cerr << "done loading\n";
}
|
30
scripts/ems/biconcor/Alignment.h
Normal file
30
scripts/ems/biconcor/Alignment.h
Normal file
@ -0,0 +1,30 @@
|
||||
#include "Vocabulary.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
class Alignment
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
char *m_array;
|
||||
INDEX *m_sentenceEnd;
|
||||
INDEX m_size;
|
||||
INDEX m_sentenceCount;
|
||||
char m_unaligned[ 256 ];
|
||||
|
||||
public:
|
||||
~Alignment();
|
||||
|
||||
void Create( string fileName );
|
||||
bool PhraseAlignment( INDEX sentence, char target_length,
|
||||
char source_start, char source_end,
|
||||
char &target_start, char &target_end,
|
||||
char &pre_null, char &post_null );
|
||||
void Load( string fileName );
|
||||
void Save( string fileName );
|
||||
vector<string> Tokenize( const char input[] );
|
||||
};
|
10
scripts/ems/biconcor/Makefile
Normal file
10
scripts/ems/biconcor/Makefile
Normal file
@ -0,0 +1,10 @@
|
||||
all: biconcor

clean:
	rm -f *.o

# 'all' and 'clean' are not files; declare them phony so a stray file with
# the same name never makes these targets appear up to date
.PHONY: all clean

.cpp.o:
	g++ -O3 -g -c $<

biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o
	g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o -o biconcor
|
198
scripts/ems/biconcor/PhrasePair.cpp
Normal file
198
scripts/ems/biconcor/PhrasePair.cpp
Normal file
@ -0,0 +1,198 @@
|
||||
#include "PhrasePair.h"
|
||||
#include "Vocabulary.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Print the phrase pair as one fixed-width text line:
// "<source context> | <target context>", each side centered on its phrase.
void PhrasePair::Print( ostream* out, int width ) {
  // source: context before / the phrase itself / context after
  int sentence_start = m_source_position - m_source_start;
  int source_width = (width-3)/2;
  string source_pre = "";
  string source = "";
  string source_post = "";
  for( int space=0; space<source_width/2; space++ ) source_pre += " "; // left padding
  for( char i=0; i<m_source_start; i++ ) {
    source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
  }
  for( char i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) source += " ";
    source += m_suffixArray->GetWord( sentence_start + i );
  }
  char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
  for( char i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) source_post += " ";
    source_post += m_suffixArray->GetWord( sentence_start + i );
  }
  for( int space=0; space<source_width/2; space++ ) source_post += " "; // right padding

  int source_pre_width = (source_width-(int)source.size()-2)/2;
  int source_post_width = (source_width-(int)source.size()-2+1)/2;

  if ((int)source.size() > width) {
    source_pre_width = 0;
    source_post_width = 0;
  }
  // BUGFIX: for phrases slightly shorter than the column the widths went
  // negative, and substr( size()-negative, ... ) below threw
  // std::out_of_range — clamp both to zero
  if (source_pre_width < 0) source_pre_width = 0;
  if (source_post_width < 0) source_post_width = 0;

  *out << source_pre.substr( source_pre.size()-source_pre_width, source_pre_width ) << " "
       << source.substr( 0, source_width -2 ) << " "
       << source_post.substr( 0, source_post_width ) << " | ";

  // target: same layout on the target side
  int target_width = (width-3)/2;
  string target_pre = "";
  string target = "";
  string target_post = "";
  for( int space=0; space<target_width/2; space++ ) target_pre += " ";
  for( char i=0; i<m_target_start; i++ ) {
    target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
  }
  for( char i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) target += " ";
    target += m_targetCorpus->GetWord( m_sentence_id, i);
  }
  for( char i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) target_post += " ";
    target_post += m_targetCorpus->GetWord( m_sentence_id, i);
  }

  int target_pre_width = (target_width-(int)target.size()-2)/2;
  int target_post_width = (target_width-(int)target.size()-2+1)/2;

  if ((int)target.size() > width) {
    target_pre_width = 0;
    target_post_width = 0;
  }
  // same clamp as on the source side
  if (target_pre_width < 0) target_pre_width = 0;
  if (target_post_width < 0) target_post_width = 0;

  *out << target_pre.substr( target_pre.size()-target_pre_width, target_pre_width ) << " "
       << target.substr( 0, target_width -2 ) << " "
       << target_post.substr( 0, target_post_width ) << endl;
}
|
||||
|
||||
// Write the target phrase (words m_target_start..m_target_end of the
// matched sentence) to *out, space-separated, without a trailing newline.
void PhrasePair::PrintTarget( ostream* out ) {
  bool firstWord = true;
  for( char pos=m_target_start; pos<=m_target_end; pos++ ) {
    if (!firstWord) {
      *out << " ";
    }
    firstWord = false;
    *out << m_targetCorpus->GetWord( m_sentence_id, pos);
  }
}
|
||||
|
||||
// Render the phrase pair as one HTML table row with six cells:
// source left-context / source phrase / source right-context,
// then the same three cells for the target side. No clipping.
void PhrasePair::PrintHTML( ostream* out ) {
  // source
  int sentence_start = m_source_position - m_source_start;
  char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );

  *out << "<tr><td align=right class=\"pp_source_left\">";
  for( char i=0; i<m_source_start; i++ ) {
    if (i>0) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }
  *out << "</td><td class=\"pp_source\">";
  for( char i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }
  *out << "</td><td class=\"pp_source_right\">";
  for( char i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }

  // target
  *out << "</td><td class=\"pp_target_left\">";
  for( char i=0; i<m_target_start; i++ ) {
    if (i>0) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td><td class=\"pp_target\">";
  for( char i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td><td class=\"pp_target_right\">";
  for( char i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td></tr>\n";
}
|
||||
|
||||
// Render the phrase pair as one HTML table row like PrintHTML(), but clip
// the left/right contexts to fit an overall character budget of 'width'
// (half for the source side, half for the target side), marking clipped
// context with "...".
void PhrasePair::PrintClippedHTML( ostream* out, int width ) {
  vector< WORD_ID >::iterator t;

  // source
  int sentence_start = m_source_position - m_source_start;
  int source_width = (width+1)/2;
  string source_pre = "";
  string source = "";
  string source_post = "";
  for( char i=0; i<m_source_start; i++ ) {
    source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
  }
  for( char i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) source += " ";
    source += m_suffixArray->GetWord( sentence_start + i );
  }
  char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
  for( char i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) source_post += " ";
    source_post += m_suffixArray->GetWord( sentence_start + i );
  }
  // character budget for left/right context around the phrase
  int source_pre_width = (source_width-source.size())/2;
  int source_post_width = (source_width-source.size()+1)/2;

  if (source.size() > width) {
    source_pre_width = 0;
    source_post_width = 0;
  }
  // NOTE(review): these size_t-vs-int comparisons promote a negative width
  // to a huge unsigned value, so the clipping branch is (perhaps by luck)
  // skipped when the phrase overflows the budget — the substr calls below
  // are only reached with non-negative widths. Verify before touching.
  if (source_pre.size()>source_pre_width)
    source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
  if (source_post.size() > source_post_width)
    source_post = source_post.substr( 0, source_post_width ) + "...";

  *out << "<tr><td class=\"pp_source_left\">"
       << source_pre
       << "</td><td class=\"pp_source\">"
       << source.substr( 0, source_width -2 )
       << "</td><td class=\"pp_source_right\">"
       << source_post
       << "</td>";

  // target (same scheme, with the remaining half of the budget)
  int target_width = width/2;
  string target_pre = "";
  string target = "";
  string target_post = "";
  for( char i=0; i<m_target_start; i++ ) {
    target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
  }
  for( char i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) target += " ";
    target += m_targetCorpus->GetWord( m_sentence_id, i);
  }
  for( char i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) target_post += " ";
    target_post += m_targetCorpus->GetWord( m_sentence_id, i);
  }

  int target_pre_width = (target_width-target.size())/2;
  int target_post_width = (target_width-target.size()+1)/2;

  if (target.size() > width) {
    target_pre_width = 0;
    target_post_width = 0;
  }
  if (target_pre.size() > target_pre_width)
    target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
  if (target_post.size() > target_post_width)
    target_post = target_post.substr( 0, target_post_width ) + "...";

  *out << "<td class=\"pp_target_left\">"
       << target_pre
       << "</td><td class=\"pp_target\">"
       << target.substr( 0, target_width -2 )
       << "</td><td class=\"pp_target_right\">"
       << target_post
       << "</td></tr>"<< endl;
}
|
||||
|
54
scripts/ems/biconcor/PhrasePair.h
Normal file
54
scripts/ems/biconcor/PhrasePair.h
Normal file
@ -0,0 +1,54 @@
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include "SuffixArray.h"
|
||||
#include "TargetCorpus.h"
|
||||
#include "Alignment.h"
|
||||
#pragma once
|
||||
|
||||
using namespace std;
|
||||
|
||||
class PhrasePair
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
SuffixArray *m_suffixArray;
|
||||
TargetCorpus *m_targetCorpus;
|
||||
Alignment *m_alignment;
|
||||
INDEX m_sentence_id;
|
||||
char m_target_length;
|
||||
SuffixArray::INDEX m_source_position;
|
||||
char m_source_start, m_source_end;
|
||||
char m_target_start, m_target_end;
|
||||
char m_start_null, m_end_null;
|
||||
char m_pre_null, m_post_null;
|
||||
|
||||
public:
|
||||
PhrasePair( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, char target_length, INDEX position, char source_start, char source_end, char target_start, char target_end, char start_null, char end_null, char pre_null, char post_null)
|
||||
:m_suffixArray(sa)
|
||||
,m_targetCorpus(tc)
|
||||
,m_alignment(a)
|
||||
,m_sentence_id(sentence_id)
|
||||
,m_source_position(position)
|
||||
,m_target_length(target_length)
|
||||
,m_source_start(source_start)
|
||||
,m_source_end(source_end)
|
||||
,m_target_start(target_start)
|
||||
,m_target_end(target_end)
|
||||
,m_start_null(start_null)
|
||||
,m_end_null(end_null)
|
||||
,m_pre_null(pre_null)
|
||||
,m_post_null(post_null)
|
||||
{}
|
||||
~PhrasePair () {}
|
||||
|
||||
void PrintTarget( ostream* out );
|
||||
void Print( ostream* out, int width );
|
||||
void PrintHTML( ostream* out );
|
||||
void PrintClippedHTML( ostream* out, int width );
|
||||
};
|
111
scripts/ems/biconcor/PhrasePairCollection.cpp
Normal file
111
scripts/ems/biconcor/PhrasePairCollection.cpp
Normal file
@ -0,0 +1,111 @@
|
||||
#include "PhrasePairCollection.h"
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Wire up the suffix array, target corpus and alignment (presumably owned
// by the caller) and set the sampling/display limits.
PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a )
  :m_suffixArray(sa)
  ,m_targetCorpus(tc)
  ,m_alignment(a)
  ,m_size(0)            // phrase pairs collected so far
  ,m_max_lookup(10000)  // cap on suffix-array matches to inspect (rest sampled)
  ,m_max_pp_target(50)  // cap on distinct target phrases displayed
  ,m_max_pp(50)         // cap on phrase pairs displayed per target phrase
{}
|
||||
|
||||
// BUGFIX: the PhrasePair objects are allocated with new in GetCollection()
// and were never freed — release them here to plug the memory leak.
PhrasePairCollection::~PhrasePairCollection()
{
  for( size_t i=0; i<m_collection.size(); i++ ) {
    for( size_t j=0; j<m_collection[i].size(); j++ ) {
      delete m_collection[i][j];
    }
  }
}
|
||||
|
||||
bool PhrasePairCollection::GetCollection( const vector< string > sourceString ) {
|
||||
INDEX first_match, last_match;
|
||||
if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) {
|
||||
return false;
|
||||
}
|
||||
cerr << "\tfirst match " << first_match << endl;
|
||||
cerr << "\tlast match " << last_match << endl;
|
||||
|
||||
INDEX found = last_match - first_match +1;
|
||||
|
||||
map< vector< WORD_ID >, INDEX > index;
|
||||
for( INDEX i=first_match; i<=last_match; i++ ) {
|
||||
int position = m_suffixArray->GetPosition( i );
|
||||
int source_start = m_suffixArray->GetWordInSentence( position );
|
||||
int source_end = source_start + sourceString.size()-1;
|
||||
INDEX sentence_id = m_suffixArray->GetSentence( position );
|
||||
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
|
||||
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
|
||||
cerr << "match " << (i-first_match)
|
||||
<< " in sentence " << sentence_id
|
||||
<< ", starting at word " << source_start
|
||||
<< " of " << sentence_length
|
||||
<< ". target sentence has " << target_length << " words.";
|
||||
char target_start, target_end, pre_null, post_null;
|
||||
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
|
||||
cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
|
||||
cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
|
||||
for( char pre = 0; pre <= pre_null; pre++ ) {
|
||||
for( char post = 0; post <= post_null; post++ ) {
|
||||
vector< WORD_ID > targetString;
|
||||
cerr << "; ";
|
||||
for( char target = target_start-pre; target <= target_end+post; target++ ) {
|
||||
targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) );
|
||||
cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
|
||||
}
|
||||
PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post);
|
||||
// matchCollection.Add( sentence_id, )
|
||||
if (index.find( targetString ) == index.end()) {
|
||||
index[targetString] = m_collection.size();
|
||||
vector< PhrasePair* > emptyVector;
|
||||
m_collection.push_back( emptyVector );
|
||||
}
|
||||
m_collection[ index[targetString] ].push_back( phrasePair );
|
||||
m_size++;
|
||||
}
|
||||
}
|
||||
}
|
||||
cerr << endl;
|
||||
|
||||
if (found > m_max_lookup) {
|
||||
i += found/m_max_lookup-1;
|
||||
}
|
||||
}
|
||||
sort(m_collection.begin(), m_collection.end(), CompareBySize());
|
||||
}
|
||||
|
||||
// Dump every target-phrase group to stdout as plain text: the target
// phrase, its occurrence count, then each phrase pair printed at width 100.
void PhrasePairCollection::Print() {
  vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) {
    // groups are created non-empty in GetCollection, so begin() is valid
    (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
    int count = ppWithSameTarget->size();
    cout << "(" << count << ")" << endl;
    vector< PhrasePair* >::iterator p;
    for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) {
      (*p)->Print( &cout, 100 );
    }
  }
}
|
||||
|
||||
// Render the collection as HTML on stdout: one header plus table per
// distinct target phrase, limited to m_max_pp_target groups and roughly
// m_max_pp rows per group (large groups are subsampled by striding).
void PhrasePairCollection::PrintHTML() {
  vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
  int pp_target = 0;
  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
    cout << "<p class=\"pp_target_header\">";
    (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
    int count = ppWithSameTarget->size();
    cout << "(" << count << "/" << m_size << ")" << endl;
    cout << "<p><table align=\"center\">";
    vector< PhrasePair* >::iterator p;
    int pp = 0;
    for(p = ppWithSameTarget->begin(); pp<count && p != ppWithSameTarget->end(); p++, pp++ ) {
      (*p)->PrintClippedHTML( &cout, 160 );
      // subsample: advance iterator and counter by the same stride so at
      // most ~m_max_pp rows are emitted for a large group
      if (count > m_max_pp) {
        p += count/m_max_pp-1;
        pp += count/m_max_pp-1;
      }
    }
    cout << "</table>\n";
  }
}
|
40
scripts/ems/biconcor/PhrasePairCollection.h
Normal file
40
scripts/ems/biconcor/PhrasePairCollection.h
Normal file
@ -0,0 +1,40 @@
|
||||
#include "Vocabulary.h"
|
||||
#include "SuffixArray.h"
|
||||
#include "TargetCorpus.h"
|
||||
#include "Alignment.h"
|
||||
#include "PhrasePair.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
// Collects all phrase pairs found for one source-phrase query, grouped by
// their target phrase, and prints them as text or HTML.
class PhrasePairCollection
{
public:
  typedef unsigned int INDEX;

private:
  SuffixArray *m_suffixArray;     // source corpus index (presumably not owned)
  TargetCorpus *m_targetCorpus;   // target corpus (presumably not owned)
  Alignment *m_alignment;         // word alignment (presumably not owned)
  vector< vector<PhrasePair*> > m_collection; // phrase pairs grouped by target phrase
  int m_size;                     // total phrase pairs collected
  int m_max_lookup;               // cap on suffix-array matches to inspect
  int m_max_pp_target;            // cap on target-phrase groups to display
  int m_max_pp;                   // cap on phrase pairs shown per group

public:
  PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment * );
  ~PhrasePairCollection ();

  // look up sourceString and fill m_collection; false if no match
  bool GetCollection( const vector< string > sourceString );
  void Print();
  void PrintHTML();
};
|
||||
|
||||
// sorting helper
|
||||
struct CompareBySize
|
||||
{
|
||||
bool operator()(const vector<PhrasePair*> a, const vector<PhrasePair*> b ) const
|
||||
{
|
||||
return a.size() > b.size();
|
||||
}
|
||||
};
|
287
scripts/ems/biconcor/SuffixArray.cpp
Normal file
287
scripts/ems/biconcor/SuffixArray.cpp
Normal file
@ -0,0 +1,287 @@
|
||||
#include "SuffixArray.h"
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Build the suffix array over the tokenized source corpus in fileName:
// pass 1 counts words and sentences, pass 2 fills the corpus/index arrays
// (one "<s>" boundary token terminates each sentence), then the index is
// sorted.
void SuffixArray::Create( string fileName )
{
  m_vcb.StoreIfNew( "<uNk>" );
  m_endOfSentence = m_vcb.StoreIfNew( "<s>" );

  ifstream textFile;
  char line[LINE_MAX_LENGTH];

  // pass 1: count the number of words and sentences
  textFile.open(fileName.c_str());
  if (!textFile) {
    cerr << "ERROR: could not open corpus file " << fileName << endl;
    return;
  }
  istream *fileP = &textFile;
  m_size = 0;
  m_sentenceCount = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    m_size += words.size() + 1; // +1 for the sentence boundary token
    m_sentenceCount++;
  }
  textFile.close();
  // BUGFIX: close() does not clear the stream's eof/fail flags; without
  // clear() the second read loop below can terminate immediately.
  textFile.clear();
  cerr << m_size << " words (incl. sentence boundaries)" << endl;

  // allocate memory
  m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
  m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
  m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
  m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
  m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );

  // pass 2: fill the arrays
  int wordIndex = 0;
  int sentenceId = 0;
  textFile.open(fileName.c_str());
  fileP = &textFile;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    vector< WORD_ID >::const_iterator i;

    for( i=words.begin(); i!=words.end(); i++)
    {
      m_index[ wordIndex ] = wordIndex;
      m_sentence[ wordIndex ] = sentenceId;
      m_wordInSentence[ wordIndex ] = i-words.begin();
      m_array[ wordIndex++ ] = *i;
    }
    // sentence boundary token closes each sentence
    m_index[ wordIndex ] = wordIndex;
    m_array[ wordIndex++ ] = m_endOfSentence;
    m_sentenceLength[ sentenceId++ ] = words.size();
  }
  textFile.close();
  cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
  // List(0,9);

  // sort the index (merge sort using a scratch buffer)
  m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
  Sort( 0, m_size-1 );
  free( m_buffer );
  cerr << "done sorting" << endl;
}
|
||||
|
||||
// Recursive merge sort of m_index[start..end] by suffix order.
// (The original comment called this "quick sort", but it splits the range
// in half, recurses on both halves, and merges them through m_buffer —
// a textbook merge sort. m_buffer must be allocated by the caller.)
void SuffixArray::Sort(INDEX start, INDEX end) {
  if (start == end) return;
  INDEX mid = (start+end+1)/2;
  Sort( start, mid-1 );
  Sort( mid, end );

  // merge the two sorted halves into m_buffer
  int i = start;
  int j = mid;
  int k = 0;
  int length = end-start+1;
  while( k<length )
  {
    if (i == mid )
    {
      // left half exhausted
      m_buffer[ k++ ] = m_index[ j++ ];
    }
    else if (j > end )
    {
      // right half exhausted
      m_buffer[ k++ ] = m_index[ i++ ];
    }
    else {
      if (CompareIndex( m_index[i], m_index[j] ) < 0)
      {
        m_buffer[ k++ ] = m_index[ i++ ];
      }
      else
      {
        m_buffer[ k++ ] = m_index[ j++ ];
      }
    }
  }

  // copy the merged run back into place
  memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
          ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
}
|
||||
|
||||
// Release all buffers allocated by Create()/Load().
SuffixArray::~SuffixArray()
{
  free(m_index);
  free(m_array);
  // BUGFIX: these three arrays are allocated in Create()/Load() but were
  // never freed (memory leak)
  free(m_wordInSentence);
  free(m_sentence);
  free(m_sentenceLength);
}
|
||||
|
||||
// Compare the corpus suffixes starting at positions a and b, word by word
// (strcmp-style result: <0, 0, >0). A suffix that runs off the end of the
// corpus sorts before any longer suffix.
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
  // skip over identical words
  INDEX offset = 0;
  while( a+offset < m_size &&
         b+offset < m_size &&
         m_array[ a+offset ] == m_array[ b+offset ] )
  { offset++; }

  // shorter suffix (hit corpus end) sorts first
  if( a+offset == m_size ) return -1;
  if( b+offset == m_size ) return 1;
  return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
}
|
||||
|
||||
// Compare two vocabulary entries by their surface strings
// (strcmp-style result via std::string::compare).
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
  return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
|
||||
|
||||
// Total number of occurrences of phrase in the corpus.
int SuffixArray::Count( const vector< WORD > &phrase )
{
  INDEX dummy;
  return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
}
|
||||
|
||||
// Does phrase occur at least min times in the corpus?
bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
{
  INDEX dummy;
  return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
}
|
||||
|
||||
// Does phrase occur at all? (min==1 lets LimitedCount stop at the first hit)
bool SuffixArray::Exists( const vector< WORD > &phrase )
{
  INDEX dummy;
  return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
}
|
||||
|
||||
// Locate the contiguous range of suffix-array entries that match phrase;
// returns the number of matches and sets firstMatch/lastMatch to the range
// bounds. (search_start/search_end presumably carry defaults in the header
// — verify; search_end == -1 means "to the end", see LimitedCount.)
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
  return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
}
|
||||
|
||||
// Core lookup: binary-search for any match of phrase in
// [search_start, search_end], then expand to the full contiguous match
// range. Returns 0 if there is no match; returns 1 early when min==1
// (existence check); otherwise returns the match count and sets
// firstMatch/lastMatch.
// NOTE(review): INDEX is unsigned, so "search_end == -1" relies on the
// implicit conversion of -1 to the max unsigned value as an "until the end"
// sentinel — intentional but fragile.
int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
  INDEX start = search_start;
  INDEX end = (search_end == -1) ? (m_size-1) : search_end;
  INDEX mid = FindFirst( phrase, start, end );
  if (mid == m_size) return 0; // no matches
  if (min == 1) return 1; // only existance check

  int matchCount = 1;

  // expand downwards to the first matching entry
  firstMatch = FindLast( phrase, mid, start, -1 );
  matchCount += mid - firstMatch;

  // expand upwards to the last matching entry
  lastMatch = FindLast( phrase, mid, end, 1 );
  matchCount += lastMatch - mid;

  return matchCount;
}
|
||||
|
||||
// From a known matching entry 'start', binary-search towards 'end' in the
// given direction (+1 or -1) for the last suffix-array entry that still
// matches phrase. Assumes 'start' matches; matching entries are contiguous.
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
  end += direction;
  while(true)
  {
    INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;

    int match = Match( phrase, mid );
    int matchNext = Match( phrase, mid+direction );

    // boundary: mid matches but its neighbor in 'direction' does not
    if (match == 0 && matchNext != 0) return mid;

    if (match == 0) // mid point is a match
      start = mid;
    else
      end = mid;
  }
}
|
||||
|
||||
// Binary-search for any suffix-array entry matching phrase within
// [start,end]; returns m_size if there is none. start/end are passed by
// reference and narrowed in place as the search proceeds.
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
  while(true)
  {
    INDEX mid = ( start + end + 1 )/2;
    int match = Match( phrase, mid );

    if (match == 0) return mid;
    // range exhausted without a match
    if (start >= end && match != 0 ) return m_size;

    if (match > 0)
      start = mid+1;
    else
      end = mid-1;
  }
}
|
||||
|
||||
// Compare phrase against the corpus suffix selected by suffix-array entry
// 'index'. Returns a strcmp-style result; 0 means the suffix begins with
// the whole phrase (the suffix may be longer).
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
{
  INDEX pos = m_index[ index ];
  for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++)
  {
    int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
    if (match != 0)
      return match;
  }
  return 0;
}
|
||||
|
||||
// Debug helper: print (to stdout) the first up-to-5 words of each suffix
// for suffix-array entries start..end inclusive.
void SuffixArray::List(INDEX start, INDEX end)
{
  for(INDEX i=start; i<=end; i++)
  {
    INDEX pos = m_index[ i ];
    // cerr << i << ":" << pos << "\t";
    for(int j=0; j<5 && j+pos<m_size; j++)
    {
      cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
    }
    // cerr << "\n";
  }
}
|
||||
|
||||
void SuffixArray::Save( string fileName ) {
|
||||
FILE *pFile = fopen ( fileName.c_str() , "w" );
|
||||
|
||||
fwrite( &m_size, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
fwrite( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
|
||||
fwrite( m_wordInSentence, sizeof(char), m_size, pFile); // word index
|
||||
fwrite( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
|
||||
|
||||
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
|
||||
fclose( pFile );
|
||||
|
||||
m_vcb.Save( fileName + ".src-vcb" );
|
||||
}
|
||||
|
||||
void SuffixArray::Load( string fileName ) {
|
||||
FILE *pFile = fopen ( fileName.c_str() , "r" );
|
||||
cerr << "loading from " << fileName << endl;
|
||||
|
||||
fread( &m_size, sizeof(INDEX), 1, pFile );
|
||||
cerr << "words in corpus: " << m_size << endl;
|
||||
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
|
||||
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
|
||||
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
|
||||
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
|
||||
|
||||
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
|
||||
fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
|
||||
fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
|
||||
|
||||
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
cerr << "sentences in corpus: " << m_sentenceCount << endl;
|
||||
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
|
||||
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
|
||||
fclose( pFile );
|
||||
|
||||
m_vcb.Load( fileName + ".src-vcb" );
|
||||
}
|
||||
|
||||
|
49
scripts/ems/biconcor/SuffixArray.h
Normal file
49
scripts/ems/biconcor/SuffixArray.h
Normal file
@ -0,0 +1,49 @@
|
||||
#include "Vocabulary.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
|
||||
class SuffixArray
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
WORD_ID *m_array;
|
||||
INDEX *m_index;
|
||||
INDEX *m_buffer;
|
||||
char *m_wordInSentence;
|
||||
INDEX *m_sentence;
|
||||
char *m_sentenceLength;
|
||||
WORD_ID m_endOfSentence;
|
||||
Vocabulary m_vcb;
|
||||
INDEX m_size;
|
||||
INDEX m_sentenceCount;
|
||||
|
||||
public:
|
||||
~SuffixArray();
|
||||
|
||||
void Create( string fileName );
|
||||
void Sort(INDEX start, INDEX end);
|
||||
int CompareIndex( INDEX a, INDEX b ) const;
|
||||
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
|
||||
int Count( const vector< WORD > &phrase );
|
||||
bool MinCount( const vector< WORD > &phrase, INDEX min );
|
||||
bool Exists( const vector< WORD > &phrase );
|
||||
int FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
|
||||
int LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
|
||||
INDEX FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end );
|
||||
INDEX FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction );
|
||||
int Match( const vector< WORD > &phrase, INDEX index );
|
||||
void List( INDEX start, INDEX end );
|
||||
inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; }
|
||||
inline INDEX GetSentence( INDEX position ) { return m_sentence[position]; }
|
||||
inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; }
|
||||
inline char GetSentenceLength( INDEX sentenceId ) { return m_sentenceLength[sentenceId]; }
|
||||
inline INDEX GetSize() { return m_size; }
|
||||
inline WORD GetWord( INDEX position ) { return m_vcb.GetWord( m_array[position] ); }
|
||||
void Save( string fileName );
|
||||
void Load( string fileName );
|
||||
};
|
107
scripts/ems/biconcor/TargetCorpus.cpp
Normal file
107
scripts/ems/biconcor/TargetCorpus.cpp
Normal file
@ -0,0 +1,107 @@
|
||||
#include "TargetCorpus.h"
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
void TargetCorpus::Create( string fileName )
|
||||
{
|
||||
ifstream textFile;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
// count the number of words first;
|
||||
textFile.open(fileName.c_str());
|
||||
istream *fileP = &textFile;
|
||||
m_size = 0;
|
||||
m_sentenceCount = 0;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line );
|
||||
m_size += words.size();
|
||||
m_sentenceCount++;
|
||||
}
|
||||
textFile.close();
|
||||
cerr << m_size << " words" << endl;
|
||||
|
||||
// allocate memory
|
||||
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
|
||||
m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );
|
||||
|
||||
// fill the array
|
||||
int wordIndex = 0;
|
||||
int sentenceId = 0;
|
||||
textFile.open(fileName.c_str());
|
||||
fileP = &textFile;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line );
|
||||
vector< WORD_ID >::const_iterator i;
|
||||
|
||||
for( i=words.begin(); i!=words.end(); i++)
|
||||
{
|
||||
m_array[ wordIndex++ ] = *i;
|
||||
}
|
||||
m_sentenceEnd[ sentenceId++ ] = wordIndex-1;
|
||||
}
|
||||
textFile.close();
|
||||
cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
|
||||
}
|
||||
|
||||
TargetCorpus::~TargetCorpus()
|
||||
{
|
||||
free(m_array);
|
||||
free(m_sentenceEnd);
|
||||
}
|
||||
|
||||
WORD TargetCorpus::GetWordFromId( const WORD_ID id ) const {
|
||||
return m_vcb.GetWord( id );
|
||||
}
|
||||
|
||||
WORD TargetCorpus::GetWord( INDEX sentence, char word ) {
|
||||
return m_vcb.GetWord( GetWordId( sentence, word ) );
|
||||
}
|
||||
|
||||
WORD_ID TargetCorpus::GetWordId( INDEX sentence, char word ) {
|
||||
if (sentence == 0) {
|
||||
return m_array[ word ];
|
||||
}
|
||||
return m_array[ m_sentenceEnd[ sentence-1 ] + 1 + word ] ;
|
||||
}
|
||||
|
||||
char TargetCorpus::GetSentenceLength( INDEX sentence ) {
|
||||
if (sentence == 0) {
|
||||
return (char) m_sentenceEnd[ 0 ]+1;
|
||||
}
|
||||
return (char) ( m_sentenceEnd[ sentence ] - m_sentenceEnd[ sentence-1 ] );
|
||||
}
|
||||
|
||||
void TargetCorpus::Save( string fileName ) {
|
||||
FILE *pFile = fopen ( (fileName + ".tgt").c_str() , "w" );
|
||||
|
||||
fwrite( &m_size, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
|
||||
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
|
||||
fclose( pFile );
|
||||
|
||||
m_vcb.Save( fileName + ".tgt-vcb" );
|
||||
}
|
||||
|
||||
void TargetCorpus::Load( string fileName ) {
|
||||
FILE *pFile = fopen ( (fileName + ".tgt").c_str() , "r" );
|
||||
cerr << "loading from " << fileName << ".tgt" << endl;
|
||||
|
||||
fread( &m_size, sizeof(INDEX), 1, pFile );
|
||||
cerr << "words in corpus: " << m_size << endl;
|
||||
m_array = (WORD_ID*) calloc( sizeof(WORD_ID), m_size );
|
||||
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
|
||||
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
cerr << "sentences in corpus: " << m_sentenceCount << endl;
|
||||
m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount );
|
||||
fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
|
||||
fclose( pFile );
|
||||
m_vcb.Load( fileName + ".tgt-vcb" );
|
||||
}
|
29
scripts/ems/biconcor/TargetCorpus.h
Normal file
29
scripts/ems/biconcor/TargetCorpus.h
Normal file
@ -0,0 +1,29 @@
|
||||
#include "Vocabulary.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
class TargetCorpus
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
WORD_ID *m_array;
|
||||
INDEX *m_sentenceEnd;
|
||||
Vocabulary m_vcb;
|
||||
INDEX m_size;
|
||||
INDEX m_sentenceCount;
|
||||
|
||||
public:
|
||||
~TargetCorpus();
|
||||
|
||||
void Create( string fileName );
|
||||
WORD GetWordFromId( const WORD_ID id ) const;
|
||||
WORD GetWord( INDEX sentence, char word );
|
||||
WORD_ID GetWordId( INDEX sentence, char word );
|
||||
char GetSentenceLength( INDEX sentence );
|
||||
void Load( string fileName );
|
||||
void Save( string fileName );
|
||||
};
|
75
scripts/ems/biconcor/Vocabulary.cpp
Normal file
75
scripts/ems/biconcor/Vocabulary.cpp
Normal file
@ -0,0 +1,75 @@
|
||||
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
|
||||
#include "Vocabulary.h"
|
||||
|
||||
// as in beamdecoder/tables.cpp
|
||||
vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
|
||||
vector< WORD_ID > token;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; input[i] != '\0'; i++) {
|
||||
bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
|
||||
if (!isSpace && betweenWords) {
|
||||
start = i;
|
||||
betweenWords = false;
|
||||
}
|
||||
else if (isSpace && !betweenWords) {
|
||||
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
|
||||
betweenWords = true;
|
||||
}
|
||||
}
|
||||
if (!betweenWords)
|
||||
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
|
||||
return token;
|
||||
}
|
||||
|
||||
WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
|
||||
map<WORD, WORD_ID>::iterator i = lookup.find( word );
|
||||
|
||||
if( i != lookup.end() )
|
||||
return i->second;
|
||||
|
||||
WORD_ID id = vocab.size();
|
||||
vocab.push_back( word );
|
||||
lookup[ word ] = id;
|
||||
return id;
|
||||
}
|
||||
|
||||
WORD_ID Vocabulary::GetWordID( const WORD &word ) {
|
||||
map<WORD, WORD_ID>::iterator i = lookup.find( word );
|
||||
if( i == lookup.end() )
|
||||
return 0;
|
||||
WORD_ID w= (WORD_ID) i->second;
|
||||
return w;
|
||||
}
|
||||
|
||||
void Vocabulary::Save( string fileName ) {
|
||||
ofstream vcbFile;
|
||||
vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);
|
||||
vector< WORD >::iterator i;
|
||||
for(i = vocab.begin(); i != vocab.end(); i++) {
|
||||
const string &word = *i;
|
||||
vcbFile << word << endl;
|
||||
}
|
||||
vcbFile.close();
|
||||
}
|
||||
|
||||
void Vocabulary::Load( string fileName ) {
|
||||
ifstream vcbFile;
|
||||
char line[MAX_LENGTH];
|
||||
vcbFile.open(fileName.c_str());
|
||||
cerr << "loading from " << fileName << endl;
|
||||
istream *fileP = &vcbFile;
|
||||
int count = 0;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
int length = 0;
|
||||
for(; line[length] != '\0'; length++);
|
||||
StoreIfNew( string( line, length ) );
|
||||
count++;
|
||||
}
|
||||
vcbFile.close();
|
||||
cerr << count << " word read, vocabulary size " << vocab.size() << endl;
|
||||
}
|
42
scripts/ems/biconcor/Vocabulary.h
Normal file
42
scripts/ems/biconcor/Vocabulary.h
Normal file
@ -0,0 +1,42 @@
|
||||
// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
#include <queue>
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define MAX_LENGTH 10000
|
||||
|
||||
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
|
||||
_IS.getline(_LINE, _SIZE, _DELIM); \
|
||||
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
|
||||
if (_IS.gcount() == _SIZE-1) { \
|
||||
cerr << "Line too long! Buffer overflow. Delete lines >=" \
|
||||
<< _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
|
||||
<< endl; \
|
||||
exit(1); \
|
||||
} \
|
||||
}
|
||||
|
||||
typedef string WORD;
|
||||
typedef unsigned int WORD_ID;
|
||||
|
||||
class Vocabulary {
|
||||
public:
|
||||
map<WORD, WORD_ID> lookup;
|
||||
vector< WORD > vocab;
|
||||
WORD_ID StoreIfNew( const WORD& );
|
||||
WORD_ID GetWordID( const WORD& );
|
||||
vector<WORD_ID> Tokenize( const char[] );
|
||||
inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; }
|
||||
void Save( string fileName );
|
||||
void Load( string fileName );
|
||||
};
|
116
scripts/ems/biconcor/biconcor.cpp
Normal file
116
scripts/ems/biconcor/biconcor.cpp
Normal file
@ -0,0 +1,116 @@
|
||||
#include "SuffixArray.h"
|
||||
#include "TargetCorpus.h"
|
||||
#include "Alignment.h"
|
||||
#include "PhrasePairCollection.h"
|
||||
#include <getopt.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// handle parameters
|
||||
string query;
|
||||
string fileNameSuffix;
|
||||
string fileNameSource;
|
||||
string fileNameTarget = "";
|
||||
string fileNameAlignment = "";
|
||||
int loadFlag = false;
|
||||
int saveFlag = false;
|
||||
int createFlag = false;
|
||||
int queryFlag = false;
|
||||
int htmlFlag = false;
|
||||
string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n";
|
||||
while(1) {
|
||||
static struct option long_options[] = {
|
||||
{"load", required_argument, 0, 'l'},
|
||||
{"save", required_argument, 0, 's'},
|
||||
{"create", required_argument, 0, 'c'},
|
||||
{"query", required_argument, 0, 'q'},
|
||||
{"target", required_argument, 0, 't'},
|
||||
{"alignment", required_argument, 0, 'a'},
|
||||
{"html", no_argument, &htmlFlag, 0},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
int option_index = 0;
|
||||
int c = getopt_long (argc, argv, "l:s:c:q:t:a:h", long_options, &option_index);
|
||||
if (c == -1) break;
|
||||
switch (c) {
|
||||
case 'l':
|
||||
fileNameSuffix = string(optarg);
|
||||
loadFlag = true;
|
||||
break;
|
||||
case 't':
|
||||
fileNameTarget = string(optarg);
|
||||
break;
|
||||
case 'a':
|
||||
fileNameAlignment = string(optarg);
|
||||
break;
|
||||
case 's':
|
||||
fileNameSuffix = string(optarg);
|
||||
saveFlag = true;
|
||||
break;
|
||||
case 'c':
|
||||
fileNameSource = string(optarg);
|
||||
createFlag = true;
|
||||
break;
|
||||
case 'q':
|
||||
query = string(optarg);
|
||||
queryFlag = true;
|
||||
break;
|
||||
default:
|
||||
cerr << info;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// check if parameter settings are legal
|
||||
if (saveFlag && !createFlag) {
|
||||
cerr << "error: cannot save without creating\n" << info;
|
||||
exit(1);
|
||||
}
|
||||
if (saveFlag && loadFlag) {
|
||||
cerr << "error: cannot load and save at the same time\n" << info;
|
||||
exit(1);
|
||||
}
|
||||
if (!loadFlag && !createFlag) {
|
||||
cerr << "error: neither load or create - i have no info!\n" << info;
|
||||
exit(1);
|
||||
}
|
||||
if (createFlag && (fileNameTarget == "" || fileNameAlignment == "")) {
|
||||
cerr << "error: i have no target corpus or alignment\n" << info;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// do your thing
|
||||
SuffixArray suffixArray;
|
||||
TargetCorpus targetCorpus;
|
||||
Alignment alignment;
|
||||
if (createFlag) {
|
||||
cerr << "will create\n";
|
||||
cerr << "source corpus is in " << fileNameSource << endl;
|
||||
suffixArray.Create( fileNameSource );
|
||||
cerr << "target corpus is in " << fileNameTarget << endl;
|
||||
targetCorpus.Create( fileNameTarget );
|
||||
cerr << "alignment is in " << fileNameAlignment << endl;
|
||||
alignment.Create( fileNameAlignment );
|
||||
if (saveFlag) {
|
||||
suffixArray.Save( fileNameSuffix );
|
||||
targetCorpus.Save( fileNameSuffix );
|
||||
alignment.Save( fileNameSuffix );
|
||||
cerr << "will save in " << fileNameSuffix << endl;
|
||||
}
|
||||
}
|
||||
if (loadFlag) {
|
||||
cerr << "will load from " << fileNameSuffix << endl;
|
||||
suffixArray.Load( fileNameSuffix );
|
||||
targetCorpus.Load( fileNameSuffix );
|
||||
alignment.Load( fileNameSuffix );
|
||||
}
|
||||
if (queryFlag) {
|
||||
cerr << "query is " << query << endl;
|
||||
vector< string > queryString = alignment.Tokenize( query.c_str() );
|
||||
PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment );
|
||||
ppCollection.GetCollection( queryString );
|
||||
ppCollection.PrintHTML();
|
||||
}
|
||||
}
|
@ -274,6 +274,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -419,6 +423,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -294,6 +294,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -439,6 +443,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -274,6 +274,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -419,6 +423,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
#decoder-settings = ""
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -278,6 +278,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -423,6 +427,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
#decoder-settings = ""
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -258,6 +258,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -399,6 +403,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -319,6 +319,12 @@ symmetrize-giza
|
||||
rerun-on-change: alignment-symmetrization-method training-options script
|
||||
default-name: model/aligned
|
||||
error: skip=<[1-9]
|
||||
build-biconcor
|
||||
in: word-alignment corpus
|
||||
out: biconcor-model
|
||||
default-name: model/biconcor
|
||||
ignore-unless: biconcor
|
||||
error: usage
|
||||
build-lex-trans
|
||||
in: word-alignment corpus
|
||||
out: lexical-translation-table
|
||||
@ -354,14 +360,14 @@ build-generation
|
||||
ignore-unless: generation-factors
|
||||
default-name: model/generation-table
|
||||
create-config
|
||||
in: reordering-table phrase-translation-table generation-table LM:binlm
|
||||
in: reordering-table phrase-translation-table generation-table LM:binlm biconcor-model
|
||||
out: config
|
||||
ignore-if: use-hiero INTERPOLATED-LM:script
|
||||
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script
|
||||
default-name: model/moses.ini
|
||||
error: Unknown option
|
||||
create-config-interpolated-lm
|
||||
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm
|
||||
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm biconcor-model
|
||||
out: config
|
||||
ignore-if: use-hiero
|
||||
ignore-unless: INTERPOLATED-LM:script
|
||||
@ -617,6 +623,7 @@ remove-markup
|
||||
in: system-output
|
||||
out: cleaned-output
|
||||
default-name: evaluation/cleaned
|
||||
pass-if: TRAINING:hierarchical-rule-set
|
||||
pass-unless: report-segmentation
|
||||
template: $moses-script-dir/ems/support/remove-segmenation-markup.perl < IN > OUT
|
||||
recase-output
|
||||
|
@ -49,6 +49,7 @@ my (@MODULE,
|
||||
%STEP_OUT,
|
||||
%STEP_OUTNAME,
|
||||
%STEP_PASS, # config parameters that have to be set, otherwise pass
|
||||
%STEP_PASS_IF, # config parameters that have to be not set, otherwise pass
|
||||
%STEP_IGNORE, # config parameters that have to be set, otherwise ignore
|
||||
%STEP_IGNORE_IF, # config parameters that have to be not set, otherwise ignore
|
||||
%QSUB_SCRIPT, # flag if script contains qsub's when run on cluster
|
||||
@ -208,6 +209,10 @@ sub read_meta {
|
||||
@{$STEP_PASS{"$module:$step"}} = split(/\s+/,$2);
|
||||
push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2);
|
||||
}
|
||||
elsif ($1 eq "pass-if") {
|
||||
@{$STEP_PASS_IF{"$module:$step"}} = split(/\s+/,$2);
|
||||
push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2);
|
||||
}
|
||||
elsif ($1 eq "ignore-unless") {
|
||||
$STEP_IGNORE{"$module:$step"} = $2;
|
||||
}
|
||||
@ -485,6 +490,15 @@ sub find_steps_for_module {
|
||||
}
|
||||
$PASS{$#DO_STEP}++ if $flag;
|
||||
}
|
||||
|
||||
if (defined($STEP_PASS_IF{$defined_step})) {
|
||||
my $flag = 0;
|
||||
foreach my $pass (@{$STEP_PASS_IF{$defined_step}}) {
|
||||
$flag = 1
|
||||
if &backoff_and_get(&extend_local_name($module,$set,$pass));
|
||||
}
|
||||
$PASS{$#DO_STEP}++ if $flag;
|
||||
}
|
||||
|
||||
# special case for passing: steps that only affect factor 0
|
||||
if (defined($ONLY_FACTOR_0{$defined_step})) {
|
||||
@ -737,6 +751,7 @@ sub find_re_use {
|
||||
|
||||
# summarize and convert hashes into integers for to be re-used
|
||||
print "\nSTEP SUMMARY:\n";
|
||||
open(RE_USE,">".&steps_file("re-use.$VERSION",$VERSION));
|
||||
for(my $i=$#DO_STEP;$i>=0;$i--) {
|
||||
if ($PASS{$i}) {
|
||||
$RE_USE[$i] = 0;
|
||||
@ -747,12 +762,16 @@ sub find_re_use {
|
||||
my @ALL = sort { $a <=> $b} keys %{$RE_USE[$i]};
|
||||
print "re-using (".join(" ",@ALL).")\n";
|
||||
$RE_USE[$i] = $ALL[0];
|
||||
if ($ALL[0] != $VERSION) {
|
||||
print RE_USE "$DO_STEP[$i] $ALL[0]\n";
|
||||
}
|
||||
}
|
||||
else {
|
||||
print "run\n";
|
||||
$RE_USE[$i] = 0;
|
||||
}
|
||||
}
|
||||
close(RE_USE);
|
||||
}
|
||||
|
||||
sub find_dependencies {
|
||||
@ -816,10 +835,10 @@ sub draw_agenda_graph {
|
||||
$step .= " (".$RE_USE[$i].")" if $RE_USE[$i];
|
||||
|
||||
my $color = "green";
|
||||
$color = "#0000ff" if defined($DO{$i}) && $DO{$i} >= 1;
|
||||
$color = "#8080ff" if defined($DONE{$i});
|
||||
$color = "red" if defined($CRASHED{$i});
|
||||
$color = "lightblue" if $RE_USE[$i];
|
||||
$color = "#0000ff" if defined($DO{$i}) && $DO{$i} >= 1;
|
||||
$color = "#8080ff" if defined($DONE{$i}) || ($RE_USE[$i] && $RE_USE[$i] == $VERSION);
|
||||
$color = "red" if defined($CRASHED{$i});
|
||||
$color = "lightyellow" if defined($PASS{$i});
|
||||
|
||||
print DOT " $i [label=\"$step\",shape=box,fontsize=10,height=0,style=filled,fillcolor=\"$color\"];\n";
|
||||
@ -893,6 +912,9 @@ sub define_step {
|
||||
elsif ($DO_STEP[$i] eq 'TRAINING:symmetrize-giza') {
|
||||
&define_training_symmetrize_giza($i);
|
||||
}
|
||||
elsif ($DO_STEP[$i] eq 'TRAINING:build-biconcor') {
|
||||
&define_training_build_biconcor($i);
|
||||
}
|
||||
elsif ($DO_STEP[$i] eq 'TRAINING:build-lex-trans') {
|
||||
&define_training_build_lex_trans($i);
|
||||
}
|
||||
@ -1128,13 +1150,12 @@ sub check_info {
|
||||
print "\tcheck parameter count current: ".(scalar keys %VALUE).", old: ".(scalar keys %INFO)."\n" if $VERBOSE;
|
||||
return 0 unless scalar keys %INFO == scalar keys %VALUE;
|
||||
foreach my $parameter (keys %VALUE) {
|
||||
if (! defined($VALUE{$parameter})) {
|
||||
print "\tcurrent has not '$parameter' -> not re-usable\n" if $VERBOSE;
|
||||
if (! defined($INFO{$parameter})) {
|
||||
print "\told has no '$parameter' -> not re-usable\n" if $VERBOSE;
|
||||
return 0;
|
||||
}
|
||||
print "\tcheck '$VALUE{$parameter}' eq '$INFO{$parameter}' -> " if $VERBOSE;
|
||||
if (defined($INFO{$parameter})
|
||||
&& &match_info_strings($VALUE{$parameter},$INFO{$parameter})) {
|
||||
if (&match_info_strings($VALUE{$parameter},$INFO{$parameter})) {
|
||||
print "ok\n" if $VERBOSE;
|
||||
}
|
||||
else {
|
||||
@ -1148,6 +1169,8 @@ sub check_info {
|
||||
|
||||
sub match_info_strings {
|
||||
my ($current,$old) = @_;
|
||||
$current =~ s/ $//;
|
||||
$old =~ s/ $//;
|
||||
return 1 if $current eq $old;
|
||||
# ignore time stamps, if that option is used
|
||||
if (defined($IGNORE_TIME)) {
|
||||
@ -1469,14 +1492,21 @@ sub factorize_one_language {
|
||||
my $script = &check_and_get("$type:$factor:factor-script");
|
||||
my $out = "$outfile.$factor";
|
||||
if ($parallelizer && defined($PARALLELIZE{&defined_step($DO_STEP[$step_id])})
|
||||
&& &get("$module:jobs") && $CLUSTER) {
|
||||
&& ( (&get("$module:jobs") && $CLUSTER)
|
||||
|| (&get("$module:cores") && $MULTICORE))) {
|
||||
my $subdir = $module;
|
||||
$subdir =~ tr/A-Z/a-z/;
|
||||
$subdir .= "/tmp.$set.$stepname.$type.$factor.$VERSION";
|
||||
my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
|
||||
my $qflags = "--queue-flags \"$qsub_args\"";
|
||||
$cmd .= "$parallelizer $qflags -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -jobs ".&get("$module:jobs")." -tmpdir $temp_dir/$subdir\n";
|
||||
$QSUB_STEP{$step_id}++;
|
||||
if ($CLUSTER) {
|
||||
my $qflags = "";
|
||||
my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
|
||||
$qflags="--queue-flags \"$qsub_args\"" if ($CLUSTER && $qsub_args);
|
||||
$cmd .= "$parallelizer $qflags -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -jobs ".&get("$module:jobs")." -tmpdir $temp_dir/$subdir\n";
|
||||
$QSUB_STEP{$step_id}++;
|
||||
}
|
||||
elsif ($MULTICORE) {
|
||||
$cmd .= "$parallelizer -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -cores ".&get("$module:cores")." -tmpdir $temp_dir/$subdir\n";
|
||||
}
|
||||
}
|
||||
else {
|
||||
$cmd .= "$script $infile $out $temp_dir\n";
|
||||
@ -1597,6 +1627,19 @@ sub define_training_symmetrize_giza {
|
||||
&create_step($step_id,$cmd);
|
||||
}
|
||||
|
||||
sub define_training_build_biconcor {
|
||||
my ($step_id) = @_;
|
||||
|
||||
my ($model, $aligned,$corpus) = &get_output_and_input($step_id);
|
||||
my $biconcor = &check_and_get("TRAINING:biconcor");
|
||||
my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
|
||||
my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
|
||||
my $method = &check_and_get("TRAINING:alignment-symmetrization-method");
|
||||
|
||||
my $cmd = "$biconcor -c $corpus.$input_extension -t $corpus.$output_extension -a $aligned.$method -s $model";
|
||||
&create_step($step_id,$cmd);
|
||||
}
|
||||
|
||||
sub define_training_build_lex_trans {
|
||||
my ($step_id) = @_;
|
||||
|
||||
@ -1683,6 +1726,7 @@ sub define_training_create_config {
|
||||
my ($config,
|
||||
$reordering_table,$phrase_translation_table,$generation_table,@LM)
|
||||
= &get_output_and_input($step_id);
|
||||
if ($LM[$#LM] =~ /biconcor/) { pop @LM; }
|
||||
|
||||
my $cmd = &get_training_setting(9);
|
||||
|
||||
@ -1737,7 +1781,7 @@ sub define_training_create_config {
|
||||
$cmd .= "-lm $factor:$order:$LM[0]:$type ";
|
||||
}
|
||||
else {
|
||||
die("ERROR: number of defined LM sets (".(scalar @LM_SETS).") and LM files (".(scalar @LM).") does not match")
|
||||
die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
|
||||
unless scalar @LM == scalar @LM_SETS;
|
||||
foreach my $lm (@LM) {
|
||||
my $set = shift @LM_SETS;
|
||||
@ -2020,11 +2064,15 @@ sub define_evaluation_decode {
|
||||
$cmd .= " -queue-parameters \"$qsub_args\"" if ($CLUSTER && $qsub_args);
|
||||
$cmd .= " -decoder $decoder -config $dir/evaluation/filtered.$set.$VERSION/moses.ini -input-file $input --jobs $jobs -decoder-parameters \"$settings\" > $system_output";
|
||||
|
||||
$cmd .= " -n-best-file $system_output.best$nbest -n-best-size $nbest" if $nbest;
|
||||
my $nbest_size;
|
||||
$nbest_size = $nbest + 0 if $nbest;
|
||||
$cmd .= " -n-best-file $system_output.best$nbest_size -n-best-size $nbest" if $nbest;
|
||||
}
|
||||
else {
|
||||
$cmd = $filter."\n$decoder $settings -v 0 -f $dir/evaluation/filtered.$set.$VERSION/moses.ini < $input > $system_output";
|
||||
$cmd .= " -n-best-list $system_output.best$nbest $nbest" if $nbest;
|
||||
my $nbest_size;
|
||||
$nbest_size = $nbest + 0 if $nbest;
|
||||
$cmd .= " -n-best-list $system_output.best$nbest_size $nbest" if $nbest;
|
||||
}
|
||||
|
||||
&create_step($step_id,$cmd);
|
||||
|
@ -5,7 +5,7 @@ use Getopt::Long "GetOptions";
|
||||
|
||||
my $MAX_LENGTH = 4;
|
||||
|
||||
my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable,$hierarchical);
|
||||
my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable,$hierarchical,$output_corpus,$alignment,$biconcor);
|
||||
if (!&GetOptions('system=s' => \$system, # raw output from decoder
|
||||
'reference=s' => \$reference, # tokenized reference
|
||||
'dir=s' => \$dir, # directory for storing results
|
||||
@ -13,9 +13,12 @@ if (!&GetOptions('system=s' => \$system, # raw output from decoder
|
||||
'segmentation=s' => \$segmentation, # system output with segmentation markup
|
||||
'input-corpus=s' => \$corpus, # input side of parallel training corpus
|
||||
'ttable=s' => \$ttable, # phrase translation table used for decoding
|
||||
'output-corpus=s' => \$output_corpus, # output side of parallel training corpus
|
||||
'alignment-file=s' => \$alignment, # alignment of parallel corpus
|
||||
'biconcor=s' => \$biconcor, # binary for bilingual concordancer
|
||||
'hierarchical' => \$hierarchical) || # hierarchical model?
|
||||
!defined($dir)) {
|
||||
die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE]");
|
||||
die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE] [-output-corpus FILE] [-alignment-file FILE] [-biconcor BIN]");
|
||||
}
|
||||
|
||||
`mkdir -p $dir`;
|
||||
@ -84,6 +87,11 @@ if (defined($ttable) || defined($corpus)) {
|
||||
&input_annotation();
|
||||
}
|
||||
|
||||
# bilingual concordance -- not used by experiment.perl
|
||||
if (defined($corpus) && defined($output_corpus) && defined($alignment) && defined($biconcor)) {
|
||||
`$biconcor -s $dir/biconcor -c $corpus -t $output_corpus -a $alignment`;
|
||||
}
|
||||
|
||||
sub best_matches {
|
||||
my ($CORRECT,$TOTAL,$out) = @_;
|
||||
my $type = ($out =~ /precision/) ? "precision" : "recall";
|
||||
@ -208,6 +216,9 @@ sub ttable_coverage {
|
||||
if (! -e $ttable && -e $ttable.".gz") {
|
||||
open(TTABLE,"gzip -cd $ttable.gz|");
|
||||
}
|
||||
elsif ($ttable =~ /.gz$/) {
|
||||
open(TTABLE,"gzip -cd $ttable|");
|
||||
}
|
||||
else {
|
||||
open(TTABLE,$ttable) or die "Can't read ttable $ttable";
|
||||
}
|
||||
@ -219,7 +230,7 @@ sub ttable_coverage {
|
||||
my @COLUMN = split(/ \|\|\| /);
|
||||
my ($in,$out,$scores) = @COLUMN;
|
||||
# handling hierarchical
|
||||
$in =~ s/\[[^ \]]+\]$//; # remove lhs nt
|
||||
$in =~ s/ \[[^ \]]+\]$//; # remove lhs nt
|
||||
next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/; # only consider flat rules
|
||||
$scores = $COLUMN[4] if scalar @COLUMN == 5;
|
||||
my @IN = split(/ /,$in);
|
||||
@ -255,6 +266,7 @@ sub compute_entropy {
|
||||
}
|
||||
my $entropy = 0;
|
||||
foreach my $p (@_) {
|
||||
next if $p == 0;
|
||||
$entropy -= ($p/$z)*log($p/$z)/log(2);
|
||||
}
|
||||
return $entropy;
|
||||
@ -465,7 +477,7 @@ sub hierarchical_segmentation {
|
||||
open(OUTPUT_TREE,">$dir/output-tree");
|
||||
open(NODE,">$dir/node");
|
||||
while(<TRACE>) {
|
||||
/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_");
|
||||
/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_");
|
||||
my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
|
||||
if ($last_sentence >= 0 && $sentence != $last_sentence) {
|
||||
&hs_process($last_sentence,\@DERIVATION,\%STATS);
|
||||
@ -481,7 +493,7 @@ sub hierarchical_segmentation {
|
||||
@{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs);
|
||||
|
||||
foreach (split(/ /,$alignment)) {
|
||||
/(\d+)\-(\d+)/ || die("funny alignment: $_\n");
|
||||
/(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
|
||||
$ITEM{'alignment'}{$2} = $1; # target non-terminal to source span
|
||||
$ITEM{'alignedSpan'}{$1} = 1;
|
||||
}
|
||||
@ -528,12 +540,14 @@ sub hs_process {
|
||||
my $x=0;
|
||||
while(1) {
|
||||
my $RULE = shift @{$DERIVATION};
|
||||
if ($$RULE{'rule_lhs'} eq "S" &&
|
||||
scalar(@{$$RULE{'rule_rhs'}}) == 2 &&
|
||||
$$RULE{'rule_rhs'}[0] eq "S" &&
|
||||
$$RULE{'rule_rhs'}[1] eq "X") {
|
||||
if (scalar(@{$$RULE{'rule_rhs'}}) == 2 &&
|
||||
($$RULE{'rule_lhs'} eq "S" &&
|
||||
$$RULE{'rule_rhs'}[0] eq "S" &&
|
||||
$$RULE{'rule_rhs'}[1] eq "X") ||
|
||||
($$RULE{'rule_lhs'} eq "Q" &&
|
||||
$$RULE{'rule_rhs'}[0] eq "Q")) {
|
||||
unshift @{$GLUE_RULE{'spans'}},$$RULE{'spans'}[1];
|
||||
push @{$GLUE_RULE{'rule_rhs'}}, "X";
|
||||
push @{$GLUE_RULE{'rule_rhs'}}, $$RULE{'rule_rhs'}[1];
|
||||
$GLUE_RULE{'alignment'}{$x} = $x;
|
||||
$GLUE_RULE{'alignedSpan'}{$x} = 1;
|
||||
$x++;
|
||||
|
@ -33,9 +33,9 @@ function generic_show(field,parameters) {
|
||||
}
|
||||
function highlight_phrase(sentence,phrase) {
|
||||
var input = "input-"+sentence+"-"+phrase;
|
||||
$(input).setStyle({ borderWidth: '3px', borderColor: 'red' });
|
||||
$(input).setStyle({ borderColor: 'red' });
|
||||
var output = "output-"+sentence+"-"+phrase;
|
||||
$(output).setStyle({ borderWidth: '3px', borderColor: 'red' });
|
||||
$(output).setStyle({ borderColor: 'red' });
|
||||
}
|
||||
function show_word_info(sentence,cc,tc,te) {
|
||||
var info = "info-"+sentence;
|
||||
@ -44,14 +44,30 @@ function show_word_info(sentence,cc,tc,te) {
|
||||
}
|
||||
function lowlight_phrase(sentence,phrase) {
|
||||
var input = "input-"+sentence+"-"+phrase;
|
||||
$(input).setStyle({ borderWidth: '1px', borderColor: 'black' });
|
||||
$(input).setStyle({ borderColor: 'black' });
|
||||
var output = "output-"+sentence+"-"+phrase;
|
||||
$(output).setStyle({ borderWidth: '1px', borderColor: 'black' });
|
||||
$(output).setStyle({ borderColor: 'black' });
|
||||
}
|
||||
function hide_word_info(sentence) {
|
||||
var info = "info-"+sentence;
|
||||
$(info).setStyle({ opacity: 0 });
|
||||
}
|
||||
function show_biconcor(sentence,phrase) {
|
||||
var div = "biconcor-"+sentence;
|
||||
var url = '?analysis=biconcor'
|
||||
+ '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$id); ?>&set=<?php print $set ?>'
|
||||
+ '&sentence=' + sentence
|
||||
+ '&phrase=' + encodeURIComponent(phrase);
|
||||
document.getElementById(div).innerHTML = "<center><img src=\"spinner.gif\" width=48 height=48></center>";
|
||||
$(div).setStyle({ borderStyle: 'solid', 'border-width': '3px', borderColor: 'black' });
|
||||
new Ajax.Updater(div, url, { method: 'get', evalScripts: true });
|
||||
}
|
||||
function close_biconcor(sentence) {
|
||||
var div = "biconcor-"+sentence;
|
||||
document.getElementById(div).innerHTML = "";
|
||||
$(div).setStyle({ borderStyle: 'none', 'border-width': '0px', borderColor: 'white' });
|
||||
}
|
||||
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
@ -586,7 +602,7 @@ function bleu_show() {
|
||||
|
||||
// annotated sentences core: reads data, sorts sentences, displays them
|
||||
function sentence_annotation() {
|
||||
global $set,$id,$dir;
|
||||
global $set,$id,$dir,$biconcor;
|
||||
|
||||
// load data
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/bleu-annotation");
|
||||
@ -635,19 +651,19 @@ function sentence_annotation() {
|
||||
if ($sentence != $last_sentence) { $span = 0; }
|
||||
$last_sentence = $sentence;
|
||||
$segmentation[$sentence][$span]["brackets"] = $brackets;
|
||||
$segmentation[$sentence][$span]["nt"] = $nt;
|
||||
# $segmentation[$sentence][$span]["nt"] = $nt;
|
||||
$segmentation[$sentence][$span]["words"] = rtrim($words);
|
||||
if ($nt != "") { $nt_count[$nt]++; }
|
||||
$span++;
|
||||
}
|
||||
$hierarchical = 1;
|
||||
if (count($nt_count) <= 2) {
|
||||
foreach ($segmentation as $sentence => $segmentation_span) {
|
||||
foreach ($segmentation_span as $span => $type) {
|
||||
$segmentation[$sentence][$span]["nt"]="";
|
||||
}
|
||||
}
|
||||
}
|
||||
# if (count($nt_count) <= 2) {
|
||||
# foreach ($segmentation as $sentence => $segmentation_span) {
|
||||
# foreach ($segmentation_span as $span => $type) {
|
||||
# $segmentation[$sentence][$span]["nt"]="";
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
}
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/output-tree")) {
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/output-tree");
|
||||
@ -690,6 +706,8 @@ function sentence_annotation() {
|
||||
}
|
||||
}
|
||||
|
||||
$biconcor = get_biconcor_version($dir,$id);
|
||||
|
||||
// sort
|
||||
global $sort;
|
||||
$sort = $_GET['sort'];
|
||||
@ -739,6 +757,10 @@ function sentence_annotation() {
|
||||
}
|
||||
if ($input) {
|
||||
print "<div id=\"info-$i\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">8364 occ. in corpus, 56 translations, entropy: 5.54</div>\n";
|
||||
if ($biconcor) {
|
||||
//print "<div id=\"biconcor-$i\" style=\"display: none;\">xxx</div>";
|
||||
print "<div id=\"biconcor-$i\" class=\"biconcor\">xxx</div>";
|
||||
}
|
||||
if ($hierarchical) {
|
||||
sentence_annotation_hierarchical("#".$line["id"],$line["id"],$input[$line["id"]],$segmentation[$line["id"]],"in");
|
||||
}
|
||||
@ -761,8 +783,25 @@ function sentence_annotation() {
|
||||
}
|
||||
}
|
||||
|
||||
function coverage($coverage_vector) {
|
||||
# get information from line in input annotation file
|
||||
$coverage = array();
|
||||
foreach (split(" ",$coverage_vector) as $item) {
|
||||
if (preg_match("/[\-:]/",$item)) {
|
||||
list($from,$to,$corpus_count,$ttable_count,$ttable_entropy) = preg_split("/[\-:]/",$item);
|
||||
$coverage[$from][$to]["corpus_count"] = $corpus_count;
|
||||
$coverage[$from][$to]["ttable_count"] = $ttable_count;
|
||||
$coverage[$from][$to]["ttable_entropy"] = $ttable_entropy;
|
||||
}
|
||||
}
|
||||
$word = split(" ",$words);
|
||||
|
||||
return $coverage;
|
||||
}
|
||||
|
||||
// annotate an inpute sentence
|
||||
function input_annotation($sentence,$input,$segmentation) {
|
||||
global $biconcor;
|
||||
list($words,$coverage_vector) = split("\t",$input);
|
||||
|
||||
# get information from line in input annotation file
|
||||
@ -840,7 +879,7 @@ function input_annotation($sentence,$input,$segmentation) {
|
||||
$highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
|
||||
$lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
|
||||
}
|
||||
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords\">";
|
||||
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($phrase)."');\"":"").">";
|
||||
}
|
||||
print "</div></td>";
|
||||
$from += $size-1;
|
||||
@ -868,7 +907,7 @@ function input_annotation($sentence,$input,$segmentation) {
|
||||
$color = '#ffffff';
|
||||
$cc = 0; $tc = 0; $te = 0;
|
||||
}
|
||||
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\">$word[$j]</span>";
|
||||
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($word[$j])."');\"":"").">$word[$j]</span>";
|
||||
if ($segmentation && array_key_exists($j,$segmentation["input_end"])) {
|
||||
print "</span>";
|
||||
}
|
||||
@ -945,7 +984,10 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
function sentence_annotation_hierarchical($info,$sentence,$sequence,$segmentation,$in_out) {
|
||||
$In_Out = $in_out == "out" ? "Out" : "In";
|
||||
|
||||
$word = split(" ",$sequence);
|
||||
list($words,$coverage_vector) = split("\t",$input);
|
||||
$coverage = coverage($sequence);
|
||||
$word = preg_split("/\s/",$sequence);
|
||||
|
||||
$color = array("#ffe0e0","#f0e0ff","#e0e0ff","#c0c0ff","#a0a0ff");
|
||||
#$color = array("#FFC0C0","#FFC0FF","#C0C0FF","#C0FFFF","#C0FFC0");
|
||||
#$color = array("#c0c0c0","#e0e0ff","#b0b0ff","#8080ff","#4040ff");
|
||||
@ -983,7 +1025,9 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
for($w=0;$w<count($span_word);$w++) {
|
||||
if ($w > 0) { print " "; }
|
||||
if ($in_out == "in") {
|
||||
#print "<span style=\"background-color: ".coverage_color($coverage[$word_count][$word_count]).";\">";
|
||||
print $word[$word_count];
|
||||
#print "</span>";
|
||||
}
|
||||
else {
|
||||
list($surface,$correct) = split("\|", $word[$word_count]);
|
||||
@ -1000,3 +1044,22 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
}
|
||||
print "</td></tr></table>\n";
|
||||
}
|
||||
|
||||
function biconcor($query) {
|
||||
global $set,$id,$dir;
|
||||
$sentence = $_GET['sentence'];
|
||||
$biconcor = get_biconcor_version($dir,$id);
|
||||
print "<center>
|
||||
<form action=\"...\" method=get>
|
||||
<img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\">
|
||||
<input width=20 value=\"$query\">
|
||||
<input type=submit value=\"look up\">
|
||||
</form>
|
||||
<div class=\"biconcor-content\">";
|
||||
$cmd = "./biconcor -l $dir/model/biconcor.$biconcor -q ".escapeshellarg($query)." 2>/dev/null";
|
||||
# print $cmd."<p>";
|
||||
system($cmd);
|
||||
# print "<p>done.";
|
||||
print "</div></center>";
|
||||
|
||||
}
|
||||
|
@ -250,12 +250,12 @@ function bleu_diff_annotation() {
|
||||
$matched_with_score = string_edit_distance($word_with_score0,$word_with_score1);
|
||||
$matched = string_edit_distance($word0,$word1);
|
||||
|
||||
print "<font size=-2>[".$line["id"].":".$line["bleu1"]."]</font> ";
|
||||
print "<font size=-2>[".$id2."-".$line["id"].":".$line["bleu1"]."]</font> ";
|
||||
$matched1 = preg_replace('/D/',"",$matched);
|
||||
$matched_with_score1 = preg_replace('/D/',"",$matched_with_score);
|
||||
bleu_line_diff( $word_with_score1, $matched1, $matched_with_score1 );
|
||||
|
||||
print "<font size=-2>[".$line["id"].":".$line["bleu0"]."]</font> ";
|
||||
print "<font size=-2>[".$id."-".$line["id"].":".$line["bleu0"]."]</font> ";
|
||||
$matched0 = preg_replace('/I/',"",$matched);
|
||||
$matched_with_score0 = preg_replace('/I/',"",$matched_with_score);
|
||||
bleu_line_diff( $word_with_score0, $matched0, $matched_with_score0 );
|
||||
|
51
scripts/ems/web/bilingual-concordance.css
Normal file
51
scripts/ems/web/bilingual-concordance.css
Normal file
@ -0,0 +1,51 @@
|
||||
.pp_head {
|
||||
font-size: 150%;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.pp_target_header {
|
||||
font-size: 120%;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
table.biconcor {
|
||||
table-layout: fixed;
|
||||
padding: 0px;
|
||||
margin: 0px;
|
||||
}
|
||||
|
||||
tr.biconcor {
|
||||
padding: 0px;
|
||||
margin: 0px;
|
||||
}
|
||||
|
||||
td.biconcor {
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
padding: 0px;
|
||||
margin: 0px;
|
||||
}
|
||||
|
||||
td.pp_source_left {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
td.pp_target_left {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
td.pp_source {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
td.pp_target {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
td.pp_source_right {
|
||||
border-style:solid;
|
||||
border-width:0px 2px 0px 0px ;
|
||||
border-color: black;
|
||||
}
|
@ -2,7 +2,7 @@
|
||||
|
||||
function diff() {
|
||||
global $experiment;
|
||||
$display = $_GET[run];
|
||||
$display = $_GET["run"];
|
||||
sort($display);
|
||||
while (list($i,$run) = each($display)) {
|
||||
if ($i==0) {
|
||||
@ -22,12 +22,15 @@ function compute_diff($base,$change) {
|
||||
$parameter_change = load_parameter($change);
|
||||
print "<H3>Experiment $change</H3><TABLE>";
|
||||
while (list($parameter,$base_value) = each($parameter_base)) {
|
||||
if (!array_key_exists($parameter,$parameter_change)) {
|
||||
$parameter_change[$parameter] = "";
|
||||
}
|
||||
if ($base_value != $parameter_change[$parameter]) {
|
||||
output_diff_line($parameter,$base_value,$parameter_change[$parameter]);
|
||||
}
|
||||
}
|
||||
while (list($parameter,$change_value) = each($parameter_change)) {
|
||||
if (!$parameter_base[$parameter]) {
|
||||
if (!array_key_exists($parameter,$parameter_base)) {
|
||||
output_diff_line($parameter,"",$change_value);
|
||||
}
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ function head($title) {
|
||||
<script language="javascript" src="/javascripts/scriptaculous.js"></script>
|
||||
<script language="javascript" src="hierarchical-segmentation.js"></script>
|
||||
<link href="hierarchical-segmentation.css" rel="stylesheet" type="text/css">
|
||||
<link href="bilingual-concordance.css" rel="stylesheet" type="text/css">
|
||||
</head>
|
||||
<body><h2>'.$title."</h2>\n";
|
||||
}
|
||||
@ -35,6 +36,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) {
|
||||
else if ($action == "PrecisionRecallDetails_show") { precision_recall_details(); }
|
||||
else if ($action == "CoverageDetails_show") { coverage_details(); }
|
||||
else if ($action == "SegmentationSummary_show") { segmentation_summary(); }
|
||||
else if ($action == "biconcor") { biconcor($_GET["phrase"]); }
|
||||
else { print "ERROR! $action"; }
|
||||
}
|
||||
else if (array_key_exists("analysis_diff_home",$_GET)) {
|
||||
|
@ -39,7 +39,7 @@ function load_experiment_info() {
|
||||
reset($experiment);
|
||||
while (list($id,$info) = each($experiment)) {
|
||||
if (file_exists($dir."/steps/new") ||
|
||||
file_exists($dir."/steps/1")) {
|
||||
file_exists($dir."/steps/$id")) {
|
||||
$stat = stat("$dir/steps/$id/parameter.$id");
|
||||
}
|
||||
else {
|
||||
@ -71,7 +71,7 @@ function load_experiment_info() {
|
||||
function load_parameter($run) {
|
||||
global $dir;
|
||||
if (file_exists($dir."/steps/new") ||
|
||||
file_exists($dir."/steps/1")) {
|
||||
file_exists($dir."/steps/$run")) {
|
||||
$file = file("$dir/steps/$run/parameter.$run");
|
||||
}
|
||||
else {
|
||||
@ -123,3 +123,49 @@ function process_file_entry($dir,$entry) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function get_coverage_analysis_version($dir,$set,$id) {
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/input-annotation")) {
|
||||
return $id;
|
||||
}
|
||||
if (file_exists("$dir/steps/$id/re-use.$id")) {
|
||||
$re_use = file("$dir/steps/$id/re-use.$id");
|
||||
foreach($re_use as $line) {
|
||||
if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) &&
|
||||
$match[1] == $set &&
|
||||
file_exists("$dir/evaluation/$set.analysis.$match[2]/input-annotation")) {
|
||||
return $match[2];
|
||||
}
|
||||
}
|
||||
}
|
||||
# legacy stuff below...
|
||||
if (! file_exists("$dir/steps/$id/REPORTING_report.$id")) {
|
||||
return 0;
|
||||
}
|
||||
$report = file("$dir/steps/$id/REPORTING_report.$id.INFO");
|
||||
foreach ($report as $line) {
|
||||
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) &&
|
||||
$match[2] == $set) {
|
||||
$reuse_id = $match[1];
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$reuse_id/input-annotation")) {
|
||||
return $reuse_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
function get_biconcor_version($dir,$id) {
|
||||
if (file_exists("$dir/model/biconcor.$id")) {
|
||||
return $id;
|
||||
}
|
||||
$re_use = file("$dir/steps/$id/re-use.$id");
|
||||
foreach($re_use as $line) {
|
||||
if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) &&
|
||||
file_exists("$dir/model/biconcor.$match[1]")) {
|
||||
return $match[1];
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,7 @@ function setup() {
|
||||
print "<TR><TD><A HREF=\"?setup=$dir[0]\">$dir[0]</A></TD><TD>$dir[1]</TD><TD>$dir[2]</TD><TD>$dir[3]</TD></TR>\n";
|
||||
}
|
||||
print "</TABLE>\n";
|
||||
print "<P>To add experiment, edit /fs/thor4/html/experiment/setup";
|
||||
print "<P>To add experiment, edit setup";
|
||||
}
|
||||
|
||||
function overview() {
|
||||
|
@ -6,6 +6,7 @@ use Getopt::Long;
|
||||
my $help;
|
||||
my $lc = 0; # lowercase the corpus?
|
||||
my $ignore_ratio = 0;
|
||||
my $ignore_xml = 0;
|
||||
my $enc = "utf8"; # encoding of the input and output files
|
||||
# set to anything else you wish, but I have not tested it yet
|
||||
my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
|
||||
@ -17,6 +18,7 @@ GetOptions(
|
||||
"lowercase|lc" => \$lc,
|
||||
"encoding=s" => \$enc,
|
||||
"ignore-ratio" => \$ignore_ratio,
|
||||
"ignore-xml" => \$ignore_xml,
|
||||
"max-word-length|mwl=s" => \$max_word_length
|
||||
) or exit(1);
|
||||
|
||||
@ -108,14 +110,15 @@ while(my $f = <F>) {
|
||||
$f =~ s/ $//;
|
||||
next if $f eq '';
|
||||
next if $e eq '';
|
||||
my @E = split(/ /,$e);
|
||||
my @F = split(/ /,$f);
|
||||
next if scalar(@E) > $max;
|
||||
next if scalar(@F) > $max;
|
||||
next if scalar(@E) < $min;
|
||||
next if scalar(@F) < $min;
|
||||
next if !$ignore_ratio && scalar(@E)/scalar(@F) > 9;
|
||||
next if !$ignore_ratio && scalar(@F)/scalar(@E) > 9;
|
||||
|
||||
my $ec = &word_count($e);
|
||||
my $fc = &word_count($f);
|
||||
next if $ec > $max;
|
||||
next if $fc > $max;
|
||||
next if $ec < $min;
|
||||
next if $fc < $min;
|
||||
next if !$ignore_ratio && $ec/$fc > 9;
|
||||
next if !$ignore_ratio && $fc/$ec > 9;
|
||||
# Skip this segment if any factor is longer than $max_word_length
|
||||
my $max_word_length_plus_one = $max_word_length + 1;
|
||||
next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
|
||||
@ -126,7 +129,6 @@ while(my $f = <F>) {
|
||||
if $f =~ /[ \|]\|/;
|
||||
die "There is a blank factor in $corpus.$l2 on line $innr: $e"
|
||||
if $e =~ /[ \|]\|/;
|
||||
|
||||
|
||||
$outnr++;
|
||||
print FO $f."\n";
|
||||
@ -146,3 +148,15 @@ my $e = <E>;
|
||||
die "$corpus.$l2 is too long!" if defined $e;
|
||||
|
||||
print STDERR "Input sentences: $innr Output sentences: $outnr\n";
|
||||
|
||||
sub word_count {
|
||||
my ($line) = @_;
|
||||
if ($ignore_xml) {
|
||||
$line =~ s/<\S[^>]*\S>//g;
|
||||
$line =~ s/\s+/ /g;
|
||||
$line =~ s/^ //g;
|
||||
$line =~ s/ $//g;
|
||||
}
|
||||
my @w = split(/ /,$line);
|
||||
return scalar @w;
|
||||
}
|
||||
|
@ -15,9 +15,9 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define MAX_WORD 1000 //maximum lengthsource/target strings
|
||||
#define MAX_M 200 //maximum length of source strings
|
||||
#define MAX_N 200 //maximum length of target strings
|
||||
#define MAX_WORD 10000 // maximum lengthsource/target strings
|
||||
#define MAX_M 200 // maximum length of source strings
|
||||
#define MAX_N 200 // maximum length of target strings
|
||||
|
||||
#define UNION 1
|
||||
#define INTERSECT 2
|
||||
|
@ -12,7 +12,9 @@ if (!&GetOptions('mxpost=s' => \$MXPOST) ||
|
||||
exit(1);
|
||||
}
|
||||
|
||||
open(TAGGER,"cat $IN | perl -ne 's/—/-/g; s/\\p{Dash_Punctuation}/-/g; s/\\p{Open_Punctuation}/\(/g; s/\\p{Close_Punctuation}/\)/g; s/\\p{Initial_Punctuation}/\"/g; s/\\p{Final_Punctuation}/\"/g; s/\\p{Connector_Punctuation}/-/g; s/•/*/g; s/\\p{Currency_Symbol}/\\\$/g; s/\\p{Math_Symbol}/*/g; print \$_;' | $MXPOST/mxpost |");
|
||||
my $pipeline = "perl -ne 'chop; tr/\\x20-\\x7f/\?/c; print \$_.\"\\n\";' | tee debug | ";
|
||||
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
|
||||
open(TAGGER,"cat $IN | $pipeline");
|
||||
open(OUT,">$OUT");
|
||||
while(<TAGGER>) {
|
||||
foreach my $word_pos (split) {
|
||||
|
26
scripts/training/wrappers/make-factor-suffix.perl
Executable file
26
scripts/training/wrappers/make-factor-suffix.perl
Executable file
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
|
||||
my ($size,$in,$out) = @ARGV;
|
||||
|
||||
open(IN,$in);
|
||||
open(OUT,">$out");
|
||||
binmode(IN, ":utf8");
|
||||
binmode(OUT, ":utf8");
|
||||
|
||||
while(<IN>) {
|
||||
my $first = 1;
|
||||
chomp; s/\s+/ /g; s/^ //; s/ $//;
|
||||
foreach my $word (split) {
|
||||
if (length($word) > $size) {
|
||||
$word = substr($word,length($word)-$size);
|
||||
}
|
||||
print OUT " " unless $first;
|
||||
$first = 0;
|
||||
print OUT lc($word);
|
||||
}
|
||||
print OUT "\n";
|
||||
}
|
||||
close(OUT);
|
||||
close(IN);
|
@ -24,7 +24,7 @@ GetOptions(
|
||||
|
||||
# parser settings
|
||||
my $MaxChar=10000;
|
||||
my $MaxWord=200;
|
||||
my $MaxWord=120;
|
||||
my $ParserBin="$COLLINS/code/parser";
|
||||
my $ParserEvn="$COLLINS/models/model2/events.gz";
|
||||
my $ParserGrm="$COLLINS/models/model2/grammar";
|
||||
@ -37,8 +37,13 @@ $pipeline .= "perl -ne 'tr/\\x20-\\x7f//cd; print \$_.\"\\n\";' | ";
|
||||
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
|
||||
|
||||
open(TAG,$pipeline);
|
||||
open(PARSER_IN,">$tmpfile");
|
||||
my $sentence_count=0;
|
||||
while(<TAG>) {
|
||||
if ($sentence_count % 2000 == 0) {
|
||||
close(PARSER_IN) if $sentence_count;
|
||||
open(PARSER_IN,sprintf(">%s.%05d",$tmpfile,$sentence_count/2000));
|
||||
}
|
||||
$sentence_count++;
|
||||
chop;
|
||||
|
||||
# convert tagged sequence into parser format
|
||||
@ -53,14 +58,16 @@ while(<TAG>) {
|
||||
close(TAG);
|
||||
close(PARSER_IN);
|
||||
|
||||
# parse and process output of parser
|
||||
`rm $RAW` if defined($RAW) && -e $RAW;
|
||||
$pipeline = "gunzip -c $ParserEvn | $ParserBin $tmpfile $ParserGrm 10000 1 1 1 1 |";
|
||||
$pipeline .= "tee -a \"$RAW\" |" if defined($RAW);
|
||||
# parse
|
||||
for(my $i=0;$i * 2000 < $sentence_count;$i++) {
|
||||
my $i_formatted = sprintf("%05d",$i);
|
||||
`gunzip -c $ParserEvn | $ParserBin $tmpfile.$i_formatted $ParserGrm 10000 1 1 1 1 > $tmpfile.$i_formatted.out`;
|
||||
}
|
||||
|
||||
# process output of parser
|
||||
my $DEBUG = 0;
|
||||
my $DEBUG_SPACE = " ";
|
||||
open(PARSER,$pipeline);
|
||||
open(PARSER,"cat $tmpfile.?????.out|");
|
||||
while(my $line = <PARSER>) {
|
||||
next unless $line =~ /^\(/;
|
||||
if ($line =~ /SentenceTooLong/) {
|
||||
@ -112,7 +119,7 @@ while(my $line = <PARSER>) {
|
||||
my $first=1;
|
||||
foreach (@OUT) {
|
||||
print " " unless $first;
|
||||
s/\\//;
|
||||
# s/\\//; #why?
|
||||
print $_;
|
||||
$first = 0;
|
||||
}
|
||||
@ -129,14 +136,15 @@ sub escape {
|
||||
|
||||
sub check_length {
|
||||
my ($line) = @_;
|
||||
my ($ret,$numc,$numw,@words);
|
||||
my ($numc,$numw,@words);
|
||||
|
||||
return 0 if $line =~ /^\d+ [^a-z0-9]+$/i || $line eq "0" || $line eq "0 ";
|
||||
|
||||
$numc = length($line);
|
||||
@words = split(" ",$line);
|
||||
$numw = ($#words+1)/2;
|
||||
|
||||
$ret = (($numc <= $MaxChar) && ($numw <= $MaxWord));
|
||||
$ret;
|
||||
return ($numc <= $MaxChar) && ($numw <= $MaxWord);
|
||||
}
|
||||
|
||||
sub conv_posfmt {
|
||||
|
Loading…
Reference in New Issue
Block a user