improvements to web analysis, fixes to syntax wrappers

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3633 1f5c12ca-751b-0410-a591-d2e778427230
phkoehn 2010-10-21 09:49:27 +00:00
parent 88eaf49c5e
commit 85a5a13e4c
34 changed files with 1717 additions and 74 deletions

View File: Alignment.cpp

@ -0,0 +1,171 @@
#include "Alignment.h"
#include <string>
#include <stdlib.h>
#include <cstring>
using namespace std;
void Alignment::Create( string fileName )
{
ifstream textFile;
char line[LINE_MAX_LENGTH];
// count the number of words first;
textFile.open(fileName.c_str());
istream *fileP = &textFile;
m_size = 0;
m_sentenceCount = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector<string> alignmentSequence = Tokenize( line );
m_size += alignmentSequence.size();
m_sentenceCount++;
}
textFile.close();
cerr << m_size << " alignment points" << endl;
// allocate memory
m_array = (char*) calloc( sizeof( char ), m_size*2 );
m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );
// fill the array
int alignmentPointIndex = 0;
int sentenceId = 0;
textFile.open(fileName.c_str());
fileP = &textFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector<string> alignmentSequence = Tokenize( line );
for(int i=0; i<alignmentSequence.size(); i++) {
int s,t;
// cout << "scanning " << alignmentSequence[i].c_str() << endl;
if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2) {
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceId << endl;
}
m_array[alignmentPointIndex++] = (char) s;
m_array[alignmentPointIndex++] = (char) t;
}
m_sentenceEnd[ sentenceId++ ] = alignmentPointIndex - 2;
}
textFile.close();
cerr << "done reading " << (alignmentPointIndex/2) << " alignment points, " << sentenceId << " sentences." << endl;
}
Alignment::~Alignment()
{
free(m_array);
free(m_sentenceEnd);
}
vector<string> Alignment::Tokenize( const char input[] ) {
vector< string > token;
bool betweenWords = true;
int start=0;
int i=0;
for(; input[i] != '\0'; i++) {
bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;
betweenWords = false;
}
else if (isSpace && !betweenWords) {
token.push_back( string( input+start, i-start ) );
betweenWords = true;
}
}
if (!betweenWords)
token.push_back( string( input+start, i-start ) );
return token;
}
bool Alignment::PhraseAlignment( INDEX sentence, char target_length,
char source_start, char source_end,
char &target_start, char &target_end,
char &pre_null, char &post_null ) {
vector< char > alignedTargetWords;
// get index for first alignment point
INDEX sentenceStart = 0;
if (sentence > 0) {
sentenceStart = m_sentenceEnd[ sentence-1 ] + 2;
}
// get target phrase boundaries
target_start = target_length;
target_end = 0;
for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
char source = m_array[ ap ];
if (source >= source_start && source <= source_end ) {
char target = m_array[ ap+1 ];
if (target < target_start) target_start = target;
if (target > target_end ) target_end = target;
}
}
if (target_start == target_length) {
return false; // done if no alignment points
}
// check consistency
for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
char target = m_array[ ap+1 ];
if (target >= target_start && target <= target_end ) {
char source = m_array[ ap ];
if (source < source_start || source > source_end) {
return false; // alignment point out of range
}
}
}
// create array for unaligned words
for( int i=0; i<target_length; i++ ) {
m_unaligned[i] = true;
}
for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
char target = m_array[ ap+1 ];
m_unaligned[ target ] = false;
}
// prior unaligned words
pre_null = 0;
for(char target = target_start-1; target >= 0 && m_unaligned[ target ]; target--) {
pre_null++;
}
// post unaligned words;
post_null = 0;
for(char target = target_end+1; target < target_length && m_unaligned[ target ]; target++) {
post_null++;
}
return true;
}
void Alignment::Save( string fileName ) {
FILE *pFile = fopen ( (fileName + ".align").c_str() , "w" );
fwrite( &m_size, sizeof(INDEX), 1, pFile );
fwrite( m_array, sizeof(char), m_size*2, pFile ); // corpus
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
fclose( pFile );
}
void Alignment::Load( string fileName ) {
FILE *pFile = fopen ( (fileName + ".align").c_str() , "r" );
cerr << "loading from " << fileName << ".align" << endl;
fread( &m_size, sizeof(INDEX), 1, pFile );
cerr << "alignment points in corpus: " << m_size << endl;
m_array = (char*) calloc( sizeof(char), m_size*2 );
fread( m_array, sizeof(char), m_size*2, pFile ); // corpus
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
cerr << "sentences in corpus: " << m_sentenceCount << endl;
m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount );
fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
fclose( pFile );
cerr << "done loading\n";
}
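
For orientation: Create() packs each alignment point into two consecutive bytes of m_array (source position, then target position), and m_sentenceEnd[s] records the offset of the final pair belonging to sentence s; word positions are stored as chars. A minimal standalone sketch of walking that layout (it mirrors the private members for illustration rather than calling the class):

#include <iostream>

// Mirrors Alignment's packed layout: (source, target) byte pairs,
// with sentenceEnd[s] = offset of the last pair of sentence s.
void PrintSentenceAlignment(const char *array, const unsigned int *sentenceEnd,
                            unsigned int sentence) {
  unsigned int start = (sentence == 0) ? 0 : sentenceEnd[sentence - 1] + 2;
  for (unsigned int ap = start; ap <= sentenceEnd[sentence]; ap += 2)
    std::cout << (int)array[ap] << "-" << (int)array[ap + 1] << " ";
  std::cout << std::endl;
}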

View File: Alignment.h

@ -0,0 +1,30 @@
#include "Vocabulary.h"
#pragma once
#define LINE_MAX_LENGTH 10000
class Alignment
{
public:
typedef unsigned int INDEX;
private:
char *m_array;
INDEX *m_sentenceEnd;
INDEX m_size;
INDEX m_sentenceCount;
char m_unaligned[ 256 ];
public:
~Alignment();
void Create( string fileName );
bool PhraseAlignment( INDEX sentence, char target_length,
char source_start, char source_end,
char &target_start, char &target_end,
char &pre_null, char &post_null );
void Load( string fileName );
void Save( string fileName );
vector<string> Tokenize( const char input[] );
};

View File: Makefile

@ -0,0 +1,10 @@
all: biconcor
clean:
rm -f *.o
.cpp.o:
g++ -O6 -g -c $<
biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o
g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o -o biconcor

View File: PhrasePair.cpp

@ -0,0 +1,198 @@
#include "PhrasePair.h"
#include "Vocabulary.h"
using namespace std;
void PhrasePair::Print( ostream* out, int width ) {
vector< WORD_ID >::iterator t;
// source
int sentence_start = m_source_position - m_source_start;
int source_width = (width-3)/2;
string source_pre = "";
string source = "";
string source_post = "";
for( int space=0; space<source_width/2; space++ ) source_pre += " ";
for( char i=0; i<m_source_start; i++ ) {
source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
}
for( char i=m_source_start; i<=m_source_end; i++ ) {
if (i>m_source_start) source += " ";
source += m_suffixArray->GetWord( sentence_start + i );
}
char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
for( char i=m_source_end+1; i<source_length; i++ ) {
if (i>m_source_end+1) source_post += " ";
source_post += m_suffixArray->GetWord( sentence_start + i );
}
for( int space=0; space<source_width/2; space++ ) source_post += " ";
int source_pre_width = (source_width-source.size()-2)/2;
int source_post_width = (source_width-source.size()-2+1)/2;
if (source.size() > width) {
source_pre_width = 0;
source_post_width = 0;
}
*out << source_pre.substr( source_pre.size()-source_pre_width, source_pre_width ) << " "
<< source.substr( 0, source_width -2 ) << " "
<< source_post.substr( 0, source_post_width ) << " | ";
// target
int target_width = (width-3)/2;
string target_pre = "";
string target = "";
string target_post = "";
for( int space=0; space<target_width/2; space++ ) target_pre += " ";
for( char i=0; i<m_target_start; i++ ) {
target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
}
for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) target += " ";
target += m_targetCorpus->GetWord( m_sentence_id, i);
}
for( char i=m_target_end+1; i<m_target_length; i++ ) {
if (i>m_target_end+1) target_post += " ";
target_post += m_targetCorpus->GetWord( m_sentence_id, i);
}
int target_pre_width = (target_width-target.size()-2)/2;
int target_post_width = (target_width-target.size()-2+1)/2;
if (target.size() > width) {
target_pre_width = 0;
target_post_width = 0;
}
*out << target_pre.substr( target_pre.size()-target_pre_width, target_pre_width ) << " "
<< target.substr( 0, target_width -2 ) << " "
<< target_post.substr( 0, target_post_width ) << endl;
}
void PhrasePair::PrintTarget( ostream* out ) {
for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) *out << " ";
*out << m_targetCorpus->GetWord( m_sentence_id, i);
}
}
void PhrasePair::PrintHTML( ostream* out ) {
// source
int sentence_start = m_source_position - m_source_start;
char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
*out << "<tr><td align=right class=\"pp_source_left\">";
for( char i=0; i<m_source_start; i++ ) {
if (i>0) *out << " ";
*out << m_suffixArray->GetWord( sentence_start + i );
}
*out << "</td><td class=\"pp_source\">";
for( char i=m_source_start; i<=m_source_end; i++ ) {
if (i>m_source_start) *out << " ";
*out << m_suffixArray->GetWord( sentence_start + i );
}
*out << "</td><td class=\"pp_source_right\">";
for( char i=m_source_end+1; i<source_length; i++ ) {
if (i>m_source_end+1) *out << " ";
*out << m_suffixArray->GetWord( sentence_start + i );
}
// target
*out << "</td><td class=\"pp_target_left\">";
for( char i=0; i<m_target_start; i++ ) {
if (i>0) *out << " ";
*out << m_targetCorpus->GetWord( m_sentence_id, i);
}
*out << "</td><td class=\"pp_target\">";
for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) *out << " ";
*out << m_targetCorpus->GetWord( m_sentence_id, i);
}
*out << "</td><td class=\"pp_target_right\">";
for( char i=m_target_end+1; i<m_target_length; i++ ) {
if (i>m_target_end+1) *out << " ";
*out << m_targetCorpus->GetWord( m_sentence_id, i);
}
*out << "</td></tr>\n";
}
void PhrasePair::PrintClippedHTML( ostream* out, int width ) {
vector< WORD_ID >::iterator t;
// source
int sentence_start = m_source_position - m_source_start;
int source_width = (width+1)/2;
string source_pre = "";
string source = "";
string source_post = "";
for( char i=0; i<m_source_start; i++ ) {
source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
}
for( char i=m_source_start; i<=m_source_end; i++ ) {
if (i>m_source_start) source += " ";
source += m_suffixArray->GetWord( sentence_start + i );
}
char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
for( char i=m_source_end+1; i<source_length; i++ ) {
if (i>m_source_end+1) source_post += " ";
source_post += m_suffixArray->GetWord( sentence_start + i );
}
int source_pre_width = (source_width-source.size())/2;
int source_post_width = (source_width-source.size()+1)/2;
if (source.size() > width) {
source_pre_width = 0;
source_post_width = 0;
}
if (source_pre.size()>source_pre_width)
source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
if (source_post.size() > source_post_width)
source_post = source_post.substr( 0, source_post_width ) + "...";
*out << "<tr><td class=\"pp_source_left\">"
<< source_pre
<< "</td><td class=\"pp_source\">"
<< source.substr( 0, source_width -2 )
<< "</td><td class=\"pp_source_right\">"
<< source_post
<< "</td>";
// target
int target_width = width/2;
string target_pre = "";
string target = "";
string target_post = "";
for( char i=0; i<m_target_start; i++ ) {
target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
}
for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) target += " ";
target += m_targetCorpus->GetWord( m_sentence_id, i);
}
for( char i=m_target_end+1; i<m_target_length; i++ ) {
if (i>m_target_end+1) target_post += " ";
target_post += m_targetCorpus->GetWord( m_sentence_id, i);
}
int target_pre_width = (target_width-target.size())/2;
int target_post_width = (target_width-target.size()+1)/2;
if (target.size() > width) {
target_pre_width = 0;
target_post_width = 0;
}
if (target_pre.size() > target_pre_width)
target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
if (target_post.size() > target_post_width)
target_post = target_post.substr( 0, target_post_width ) + "...";
*out << "<td class=\"pp_target_left\">"
<< target_pre
<< "</td><td class=\"pp_target\">"
<< target.substr( 0, target_width -2 )
<< "</td><td class=\"pp_target_right\">"
<< target_post
<< "</td></tr>"<< endl;
}

View File: PhrasePair.h

@ -0,0 +1,54 @@
#include <string>
#include <stdlib.h>
#include <cstring>
#include <fstream>
#include <sstream>
#include <iostream>
#include "SuffixArray.h"
#include "TargetCorpus.h"
#include "Alignment.h"
#pragma once
using namespace std;
class PhrasePair
{
public:
typedef unsigned int INDEX;
private:
SuffixArray *m_suffixArray;
TargetCorpus *m_targetCorpus;
Alignment *m_alignment;
INDEX m_sentence_id;
char m_target_length;
SuffixArray::INDEX m_source_position;
char m_source_start, m_source_end;
char m_target_start, m_target_end;
char m_start_null, m_end_null;
char m_pre_null, m_post_null;
public:
PhrasePair( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, char target_length, INDEX position, char source_start, char source_end, char target_start, char target_end, char start_null, char end_null, char pre_null, char post_null)
:m_suffixArray(sa)
,m_targetCorpus(tc)
,m_alignment(a)
,m_sentence_id(sentence_id)
,m_source_position(position)
,m_target_length(target_length)
,m_source_start(source_start)
,m_source_end(source_end)
,m_target_start(target_start)
,m_target_end(target_end)
,m_start_null(start_null)
,m_end_null(end_null)
,m_pre_null(pre_null)
,m_post_null(post_null)
{}
~PhrasePair () {}
void PrintTarget( ostream* out );
void Print( ostream* out, int width );
void PrintHTML( ostream* out );
void PrintClippedHTML( ostream* out, int width );
};

View File: PhrasePairCollection.cpp

@ -0,0 +1,111 @@
#include "PhrasePairCollection.h"
#include <string>
#include <stdlib.h>
#include <cstring>
#include <algorithm>
using namespace std;
PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a )
:m_suffixArray(sa)
,m_targetCorpus(tc)
,m_alignment(a)
,m_size(0)
,m_max_lookup(10000)
,m_max_pp_target(50)
,m_max_pp(50)
{}
PhrasePairCollection::~PhrasePairCollection()
{}
bool PhrasePairCollection::GetCollection( const vector< string > sourceString ) {
INDEX first_match, last_match;
if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) {
return false;
}
cerr << "\tfirst match " << first_match << endl;
cerr << "\tlast match " << last_match << endl;
INDEX found = last_match - first_match +1;
map< vector< WORD_ID >, INDEX > index;
for( INDEX i=first_match; i<=last_match; i++ ) {
int position = m_suffixArray->GetPosition( i );
int source_start = m_suffixArray->GetWordInSentence( position );
int source_end = source_start + sourceString.size()-1;
INDEX sentence_id = m_suffixArray->GetSentence( position );
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
cerr << "match " << (i-first_match)
<< " in sentence " << sentence_id
<< ", starting at word " << source_start
<< " of " << sentence_length
<< ". target sentence has " << target_length << " words.";
char target_start, target_end, pre_null, post_null;
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
for( char pre = 0; pre <= pre_null; pre++ ) {
for( char post = 0; post <= post_null; post++ ) {
vector< WORD_ID > targetString;
cerr << "; ";
for( char target = target_start-pre; target <= target_end+post; target++ ) {
targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) );
cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
}
PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post);
// matchCollection.Add( sentence_id, )
if (index.find( targetString ) == index.end()) {
index[targetString] = m_collection.size();
vector< PhrasePair* > emptyVector;
m_collection.push_back( emptyVector );
}
m_collection[ index[targetString] ].push_back( phrasePair );
m_size++;
}
}
}
cerr << endl;
if (found > m_max_lookup) {
i += found/m_max_lookup-1;
}
}
sort(m_collection.begin(), m_collection.end(), CompareBySize());
return true;
}
void PhrasePairCollection::Print() {
vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) {
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
int count = ppWithSameTarget->size();
cout << "(" << count << ")" << endl;
vector< PhrasePair* >::iterator p;
for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) {
(*p)->Print( &cout, 100 );
}
}
}
void PhrasePairCollection::PrintHTML() {
vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
int pp_target = 0;
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
cout << "<p class=\"pp_target_header\">";
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
int count = ppWithSameTarget->size();
cout << "(" << count << "/" << m_size << ")" << endl;
cout << "<p><table align=\"center\">";
vector< PhrasePair* >::iterator p;
int pp = 0;
for(p = ppWithSameTarget->begin(); pp<count && p != ppWithSameTarget->end(); p++, pp++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_pp) {
p += count/m_max_pp-1;
pp += count/m_max_pp-1;
}
}
cout << "</table>\n";
}
}
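
Taken together with the classes above, the query path is short. A minimal usage sketch, assuming models were already built and saved under a common stem (the file name here is hypothetical; biconcor.cpp below does the same via command-line flags):

#include "SuffixArray.h"
#include "TargetCorpus.h"
#include "Alignment.h"
#include "PhrasePairCollection.h"

int main() {
  SuffixArray suffixArray;
  TargetCorpus targetCorpus;
  Alignment alignment;
  suffixArray.Load("model/biconcor");    // hypothetical file stem
  targetCorpus.Load("model/biconcor");
  alignment.Load("model/biconcor");
  vector<string> query = alignment.Tokenize("the house");
  PhrasePairCollection collection(&suffixArray, &targetCorpus, &alignment);
  if (collection.GetCollection(query))
    collection.Print();                  // or PrintHTML() for the web view
  return 0;
}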

View File: PhrasePairCollection.h

@ -0,0 +1,40 @@
#include "Vocabulary.h"
#include "SuffixArray.h"
#include "TargetCorpus.h"
#include "Alignment.h"
#include "PhrasePair.h"
#pragma once
class PhrasePairCollection
{
public:
typedef unsigned int INDEX;
private:
SuffixArray *m_suffixArray;
TargetCorpus *m_targetCorpus;
Alignment *m_alignment;
vector< vector<PhrasePair*> > m_collection;
int m_size;
int m_max_lookup;
int m_max_pp_target;
int m_max_pp;
public:
PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment * );
~PhrasePairCollection ();
bool GetCollection( const vector< string > sourceString );
void Print();
void PrintHTML();
};
// sorting helper
struct CompareBySize
{
bool operator()(const vector<PhrasePair*> &a, const vector<PhrasePair*> &b ) const
{
return a.size() > b.size();
}
};

View File: SuffixArray.cpp

@ -0,0 +1,287 @@
#include "SuffixArray.h"
#include <string>
#include <stdlib.h>
#include <cstring>
using namespace std;
void SuffixArray::Create( string fileName )
{
m_vcb.StoreIfNew( "<uNk>" );
m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
ifstream textFile;
char line[LINE_MAX_LENGTH];
// count the number of words first;
textFile.open(fileName.c_str());
istream *fileP = &textFile;
m_size = 0;
m_sentenceCount = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector< WORD_ID > words = m_vcb.Tokenize( line );
m_size += words.size() + 1;
m_sentenceCount++;
}
textFile.close();
cerr << m_size << " words (incl. sentence boundaries)" << endl;
// allocate memory
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
// fill the array
int wordIndex = 0;
int sentenceId = 0;
textFile.open(fileName.c_str());
fileP = &textFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector< WORD_ID > words = m_vcb.Tokenize( line );
vector< WORD_ID >::const_iterator i;
for( i=words.begin(); i!=words.end(); i++)
{
m_index[ wordIndex ] = wordIndex;
m_sentence[ wordIndex ] = sentenceId;
m_wordInSentence[ wordIndex ] = i-words.begin();
m_array[ wordIndex++ ] = *i;
}
m_index[ wordIndex ] = wordIndex;
m_array[ wordIndex++ ] = m_endOfSentence;
m_sentenceLength[ sentenceId++ ] = words.size();
}
textFile.close();
cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
// List(0,9);
// sort
m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
Sort( 0, m_size-1 );
free( m_buffer );
cerr << "done sorting" << endl;
}
// good ol' merge sort
void SuffixArray::Sort(INDEX start, INDEX end) {
if (start == end) return;
INDEX mid = (start+end+1)/2;
Sort( start, mid-1 );
Sort( mid, end );
// merge
int i = start;
int j = mid;
int k = 0;
int length = end-start+1;
while( k<length )
{
if (i == mid )
{
m_buffer[ k++ ] = m_index[ j++ ];
}
else if (j > end )
{
m_buffer[ k++ ] = m_index[ i++ ];
}
else {
if (CompareIndex( m_index[i], m_index[j] ) < 0)
{
m_buffer[ k++ ] = m_index[ i++ ];
}
else
{
m_buffer[ k++ ] = m_index[ j++ ];
}
}
}
memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
}
SuffixArray::~SuffixArray()
{
free(m_index);
free(m_array);
}
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
// skip over identical words
INDEX offset = 0;
while( a+offset < m_size &&
b+offset < m_size &&
m_array[ a+offset ] == m_array[ b+offset ] )
{ offset++; }
if( a+offset == m_size ) return -1;
if( b+offset == m_size ) return 1;
return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
}
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
// cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
int SuffixArray::Count( const vector< WORD > &phrase )
{
INDEX dummy;
return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
}
bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
{
INDEX dummy;
return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
}
bool SuffixArray::Exists( const vector< WORD > &phrase )
{
INDEX dummy;
return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
}
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
}
int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
// cerr << "FindFirst\n";
INDEX start = search_start;
INDEX end = (search_end == -1) ? (m_size-1) : search_end;
INDEX mid = FindFirst( phrase, start, end );
// cerr << "done\n";
if (mid == m_size) return 0; // no matches
if (min == 1) return 1; // only existence check
int matchCount = 1;
//cerr << "before...\n";
firstMatch = FindLast( phrase, mid, start, -1 );
matchCount += mid - firstMatch;
//cerr << "after...\n";
lastMatch = FindLast( phrase, mid, end, 1 );
matchCount += lastMatch - mid;
return matchCount;
}
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
end += direction;
while(true)
{
INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
int match = Match( phrase, mid );
int matchNext = Match( phrase, mid+direction );
//cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;
if (match == 0 && matchNext != 0) return mid;
if (match == 0) // mid point is a match
start = mid;
else
end = mid;
}
}
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
while(true)
{
INDEX mid = ( start + end + 1 )/2;
//cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
int match = Match( phrase, mid );
if (match == 0) return mid;
if (start >= end && match != 0 ) return m_size;
if (match > 0)
start = mid+1;
else
end = mid-1;
}
}
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
{
INDEX pos = m_index[ index ];
for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++)
{
int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
// cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
if (match != 0)
return match;
}
return 0;
}
void SuffixArray::List(INDEX start, INDEX end)
{
for(INDEX i=start; i<=end; i++)
{
INDEX pos = m_index[ i ];
// cerr << i << ":" << pos << "\t";
for(int j=0; j<5 && j+pos<m_size; j++)
{
cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
}
// cerr << "\n";
}
}
void SuffixArray::Save( string fileName ) {
FILE *pFile = fopen ( fileName.c_str() , "w" );
fwrite( &m_size, sizeof(INDEX), 1, pFile );
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
fwrite( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
fwrite( m_wordInSentence, sizeof(char), m_size, pFile); // word index
fwrite( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
fclose( pFile );
m_vcb.Save( fileName + ".src-vcb" );
}
void SuffixArray::Load( string fileName ) {
FILE *pFile = fopen ( fileName.c_str() , "r" );
cerr << "loading from " << fileName << endl;
fread( &m_size, sizeof(INDEX), 1, pFile );
cerr << "words in corpus: " << m_size << endl;
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
cerr << "sentences in corpus: " << m_sentenceCount << endl;
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
fclose( pFile );
m_vcb.Load( fileName + ".src-vcb" );
}
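
Count(), MinCount(), Exists() and FindMatches() all funnel into LimitedCount(), which binary-searches the sorted m_index via FindFirst()/FindLast(). A minimal sketch of building an array from a tokenized corpus and querying it (the file name is hypothetical):

#include "SuffixArray.h"

int main() {
  SuffixArray sa;
  sa.Create("corpus.src");            // hypothetical tokenized source corpus
  vector<WORD> phrase;
  phrase.push_back("the");
  phrase.push_back("house");
  cout << sa.Count(phrase) << " occurrences" << endl;
  SuffixArray::INDEX first, last;
  if (sa.FindMatches(phrase, first, last))
    sa.List(first, last);             // print the start of each matching suffix
  return 0;
}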

View File: SuffixArray.h

@ -0,0 +1,49 @@
#include "Vocabulary.h"
#pragma once
#define LINE_MAX_LENGTH 10000
class SuffixArray
{
public:
typedef unsigned int INDEX;
private:
WORD_ID *m_array;
INDEX *m_index;
INDEX *m_buffer;
char *m_wordInSentence;
INDEX *m_sentence;
char *m_sentenceLength;
WORD_ID m_endOfSentence;
Vocabulary m_vcb;
INDEX m_size;
INDEX m_sentenceCount;
public:
~SuffixArray();
void Create( string fileName );
void Sort(INDEX start, INDEX end);
int CompareIndex( INDEX a, INDEX b ) const;
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
int Count( const vector< WORD > &phrase );
bool MinCount( const vector< WORD > &phrase, INDEX min );
bool Exists( const vector< WORD > &phrase );
int FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
int LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
INDEX FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end );
INDEX FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction );
int Match( const vector< WORD > &phrase, INDEX index );
void List( INDEX start, INDEX end );
inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; }
inline INDEX GetSentence( INDEX position ) { return m_sentence[position]; }
inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; }
inline char GetSentenceLength( INDEX sentenceId ) { return m_sentenceLength[sentenceId]; }
inline INDEX GetSize() { return m_size; }
inline WORD GetWord( INDEX position ) { return m_vcb.GetWord( m_array[position] ); }
void Save( string fileName );
void Load( string fileName );
};

View File: TargetCorpus.cpp

@ -0,0 +1,107 @@
#include "TargetCorpus.h"
#include <string>
#include <stdlib.h>
#include <cstring>
void TargetCorpus::Create( string fileName )
{
ifstream textFile;
char line[LINE_MAX_LENGTH];
// count the number of words first;
textFile.open(fileName.c_str());
istream *fileP = &textFile;
m_size = 0;
m_sentenceCount = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector< WORD_ID > words = m_vcb.Tokenize( line );
m_size += words.size();
m_sentenceCount++;
}
textFile.close();
cerr << m_size << " words" << endl;
// allocate memory
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );
// fill the array
int wordIndex = 0;
int sentenceId = 0;
textFile.open(fileName.c_str());
fileP = &textFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector< WORD_ID > words = m_vcb.Tokenize( line );
vector< WORD_ID >::const_iterator i;
for( i=words.begin(); i!=words.end(); i++)
{
m_array[ wordIndex++ ] = *i;
}
m_sentenceEnd[ sentenceId++ ] = wordIndex-1;
}
textFile.close();
cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
}
TargetCorpus::~TargetCorpus()
{
free(m_array);
free(m_sentenceEnd);
}
WORD TargetCorpus::GetWordFromId( const WORD_ID id ) const {
return m_vcb.GetWord( id );
}
WORD TargetCorpus::GetWord( INDEX sentence, char word ) {
return m_vcb.GetWord( GetWordId( sentence, word ) );
}
WORD_ID TargetCorpus::GetWordId( INDEX sentence, char word ) {
if (sentence == 0) {
return m_array[ word ];
}
return m_array[ m_sentenceEnd[ sentence-1 ] + 1 + word ] ;
}
char TargetCorpus::GetSentenceLength( INDEX sentence ) {
if (sentence == 0) {
return (char) m_sentenceEnd[ 0 ]+1;
}
return (char) ( m_sentenceEnd[ sentence ] - m_sentenceEnd[ sentence-1 ] );
}
void TargetCorpus::Save( string fileName ) {
FILE *pFile = fopen ( (fileName + ".tgt").c_str() , "w" );
fwrite( &m_size, sizeof(INDEX), 1, pFile );
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
fclose( pFile );
m_vcb.Save( fileName + ".tgt-vcb" );
}
void TargetCorpus::Load( string fileName ) {
FILE *pFile = fopen ( (fileName + ".tgt").c_str() , "r" );
cerr << "loading from " << fileName << ".tgt" << endl;
fread( &m_size, sizeof(INDEX), 1, pFile );
cerr << "words in corpus: " << m_size << endl;
m_array = (WORD_ID*) calloc( sizeof(WORD_ID), m_size );
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
cerr << "sentences in corpus: " << m_sentenceCount << endl;
m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount );
fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
fclose( pFile );
m_vcb.Load( fileName + ".tgt-vcb" );
}
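
GetWordId() and GetSentenceLength() recover everything from m_sentenceEnd: sentence 0 starts at offset 0, sentence s at m_sentenceEnd[s-1]+1. A small sketch that reprints one sentence of a loaded corpus, using only the public accessors:

#include "TargetCorpus.h"

// Reprint sentence `s` word by word.
void PrintSentence(TargetCorpus &tc, TargetCorpus::INDEX s) {
  char length = tc.GetSentenceLength(s);
  for (char w = 0; w < length; w++) {
    if (w > 0) cout << " ";
    cout << tc.GetWord(s, w);
  }
  cout << endl;
}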

View File: TargetCorpus.h

@ -0,0 +1,29 @@
#include "Vocabulary.h"
#pragma once
#define LINE_MAX_LENGTH 10000
class TargetCorpus
{
public:
typedef unsigned int INDEX;
private:
WORD_ID *m_array;
INDEX *m_sentenceEnd;
Vocabulary m_vcb;
INDEX m_size;
INDEX m_sentenceCount;
public:
~TargetCorpus();
void Create( string fileName );
WORD GetWordFromId( const WORD_ID id ) const;
WORD GetWord( INDEX sentence, char word );
WORD_ID GetWordId( INDEX sentence, char word );
char GetSentenceLength( INDEX sentence );
void Load( string fileName );
void Save( string fileName );
};

View File: Vocabulary.cpp

@ -0,0 +1,75 @@
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
#include "Vocabulary.h"
// as in beamdecoder/tables.cpp
vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
vector< WORD_ID > token;
bool betweenWords = true;
int start=0;
int i=0;
for(; input[i] != '\0'; i++) {
bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;
betweenWords = false;
}
else if (isSpace && !betweenWords) {
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
betweenWords = true;
}
}
if (!betweenWords)
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
return token;
}
WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
map<WORD, WORD_ID>::iterator i = lookup.find( word );
if( i != lookup.end() )
return i->second;
WORD_ID id = vocab.size();
vocab.push_back( word );
lookup[ word ] = id;
return id;
}
WORD_ID Vocabulary::GetWordID( const WORD &word ) {
map<WORD, WORD_ID>::iterator i = lookup.find( word );
if( i == lookup.end() )
return 0;
WORD_ID w= (WORD_ID) i->second;
return w;
}
void Vocabulary::Save( string fileName ) {
ofstream vcbFile;
vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);
vector< WORD >::iterator i;
for(i = vocab.begin(); i != vocab.end(); i++) {
const string &word = *i;
vcbFile << word << endl;
}
vcbFile.close();
}
void Vocabulary::Load( string fileName ) {
ifstream vcbFile;
char line[MAX_LENGTH];
vcbFile.open(fileName.c_str());
cerr << "loading from " << fileName << endl;
istream *fileP = &vcbFile;
int count = 0;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n');
if (fileP->eof()) break;
int length = 0;
for(; line[length] != '\0'; length++);
StoreIfNew( string( line, length ) );
count++;
}
vcbFile.close();
cerr << count << " words read, vocabulary size " << vocab.size() << endl;
}
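
Note that GetWordID() returns 0 for unknown words, which collides with the id of the first stored word; SuffixArray::Create() above sidesteps this by reserving the first slot for "<uNk>". A minimal round-trip sketch:

#include "Vocabulary.h"

int main() {
  Vocabulary vcb;
  vcb.StoreIfNew("<unk>");                      // reserve id 0, as SuffixArray does
  vector<WORD_ID> ids = vcb.Tokenize("the house is small");
  cout << vcb.GetWord(ids[1])                   // prints "house"
       << " = " << vcb.GetWordID("house") << endl;
  return 0;
}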

View File: Vocabulary.h

@ -0,0 +1,42 @@
// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
#pragma once
#include <iostream>
#include <fstream>
#include <assert.h>
#include <stdlib.h>
#include <string>
#include <queue>
#include <map>
#include <vector>
#include <cmath>
using namespace std;
#define MAX_LENGTH 10000
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
_IS.getline(_LINE, _SIZE, _DELIM); \
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
if (_IS.gcount() == _SIZE-1) { \
cerr << "Line too long! Buffer overflow. Delete lines >=" \
<< _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
<< endl; \
exit(1); \
} \
}
typedef string WORD;
typedef unsigned int WORD_ID;
class Vocabulary {
public:
map<WORD, WORD_ID> lookup;
vector< WORD > vocab;
WORD_ID StoreIfNew( const WORD& );
WORD_ID GetWordID( const WORD& );
vector<WORD_ID> Tokenize( const char[] );
inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; }
void Save( string fileName );
void Load( string fileName );
};

View File: biconcor.cpp

@ -0,0 +1,116 @@
#include "SuffixArray.h"
#include "TargetCorpus.h"
#include "Alignment.h"
#include "PhrasePairCollection.h"
#include <getopt.h>
using namespace std;
int main(int argc, char* argv[])
{
// handle parameters
string query;
string fileNameSuffix;
string fileNameSource;
string fileNameTarget = "";
string fileNameAlignment = "";
int loadFlag = false;
int saveFlag = false;
int createFlag = false;
int queryFlag = false;
int htmlFlag = false;
string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n";
while(1) {
static struct option long_options[] = {
{"load", required_argument, 0, 'l'},
{"save", required_argument, 0, 's'},
{"create", required_argument, 0, 'c'},
{"query", required_argument, 0, 'q'},
{"target", required_argument, 0, 't'},
{"alignment", required_argument, 0, 'a'},
{"html", no_argument, &htmlFlag, 1},
{0, 0, 0, 0}
};
int option_index = 0;
int c = getopt_long (argc, argv, "l:s:c:q:t:a:h", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 'l':
fileNameSuffix = string(optarg);
loadFlag = true;
break;
case 't':
fileNameTarget = string(optarg);
break;
case 'a':
fileNameAlignment = string(optarg);
break;
case 's':
fileNameSuffix = string(optarg);
saveFlag = true;
break;
case 'c':
fileNameSource = string(optarg);
createFlag = true;
break;
case 'q':
query = string(optarg);
queryFlag = true;
break;
case 0: // a long option set a flag (--html); nothing more to do
break;
default:
cerr << info;
exit(1);
}
}
// check if parameter settings are legal
if (saveFlag && !createFlag) {
cerr << "error: cannot save without creating\n" << info;
exit(1);
}
if (saveFlag && loadFlag) {
cerr << "error: cannot load and save at the same time\n" << info;
exit(1);
}
if (!loadFlag && !createFlag) {
cerr << "error: neither load nor create - I have no info!\n" << info;
exit(1);
}
if (createFlag && (fileNameTarget == "" || fileNameAlignment == "")) {
cerr << "error: I have no target corpus or alignment\n" << info;
exit(1);
}
// do your thing
SuffixArray suffixArray;
TargetCorpus targetCorpus;
Alignment alignment;
if (createFlag) {
cerr << "will create\n";
cerr << "source corpus is in " << fileNameSource << endl;
suffixArray.Create( fileNameSource );
cerr << "target corpus is in " << fileNameTarget << endl;
targetCorpus.Create( fileNameTarget );
cerr << "alignment is in " << fileNameAlignment << endl;
alignment.Create( fileNameAlignment );
if (saveFlag) {
suffixArray.Save( fileNameSuffix );
targetCorpus.Save( fileNameSuffix );
alignment.Save( fileNameSuffix );
cerr << "will save in " << fileNameSuffix << endl;
}
}
if (loadFlag) {
cerr << "will load from " << fileNameSuffix << endl;
suffixArray.Load( fileNameSuffix );
targetCorpus.Load( fileNameSuffix );
alignment.Load( fileNameSuffix );
}
if (queryFlag) {
cerr << "query is " << query << endl;
vector< string > queryString = alignment.Tokenize( query.c_str() );
PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment );
ppCollection.GetCollection( queryString );
ppCollection.PrintHTML();
}
}
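
Typical invocations, with illustrative file names (flags as defined in long_options above; experiment.perl below calls the short forms -c/-t/-a/-s):

biconcor --create corpus.src --target corpus.tgt --alignment aligned.grow-diag-final-and --save model/biconcor
biconcor --load model/biconcor --query "the house"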

View File: EMS example config

@ -274,6 +274,10 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
@ -419,6 +423,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
### specify size of n-best list, if produced
#
#nbest = 100
### multiple reference translations
#
multiref = yes

View File: EMS example config

@ -294,6 +294,10 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
@ -439,6 +443,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
### specify size of n-best list, if produced
#
#nbest = 100
### multiple reference translations
#
multiref = yes

View File: EMS example config

@ -274,6 +274,10 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
@ -419,6 +423,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
#
#decoder-settings = ""
### specify size of n-best list, if produced
#
#nbest = 100
### multiple reference translations
#
multiref = yes

View File: EMS example config

@ -278,6 +278,10 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
@ -423,6 +427,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
#
#decoder-settings = ""
### specify size of n-best list, if produced
#
#nbest = 100
### multiple reference translations
#
multiref = yes

View File: EMS example config

@ -258,6 +258,10 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
@ -399,6 +403,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
### specify size of n-best list, if produced
#
#nbest = 100
### multiple reference translations
#
multiref = yes

View File: experiment.meta

@ -319,6 +319,12 @@ symmetrize-giza
rerun-on-change: alignment-symmetrization-method training-options script
default-name: model/aligned
error: skip=<[1-9]
build-biconcor
in: word-alignment corpus
out: biconcor-model
default-name: model/biconcor
ignore-unless: biconcor
error: usage
build-lex-trans
in: word-alignment corpus
out: lexical-translation-table
@ -354,14 +360,14 @@ build-generation
ignore-unless: generation-factors
default-name: model/generation-table
create-config
in: reordering-table phrase-translation-table generation-table LM:binlm
in: reordering-table phrase-translation-table generation-table LM:binlm biconcor-model
out: config
ignore-if: use-hiero INTERPOLATED-LM:script
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script
default-name: model/moses.ini
error: Unknown option
create-config-interpolated-lm
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm biconcor-model
out: config
ignore-if: use-hiero
ignore-unless: INTERPOLATED-LM:script
@ -617,6 +623,7 @@ remove-markup
in: system-output
out: cleaned-output
default-name: evaluation/cleaned
pass-if: TRAINING:hierarchical-rule-set
pass-unless: report-segmentation
template: $moses-script-dir/ems/support/remove-segmenation-markup.perl < IN > OUT
recase-output

View File: experiment.perl

@ -49,6 +49,7 @@ my (@MODULE,
%STEP_OUT,
%STEP_OUTNAME,
%STEP_PASS, # config parameters that have to be set, otherwise pass
%STEP_PASS_IF, # config parameters that have to be not set, otherwise pass
%STEP_IGNORE, # config parameters that have to be set, otherwise ignore
%STEP_IGNORE_IF, # config parameters that have to be not set, otherwise ignore
%QSUB_SCRIPT, # flag if script contains qsub's when run on cluster
@ -208,6 +209,10 @@ sub read_meta {
@{$STEP_PASS{"$module:$step"}} = split(/\s+/,$2);
push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2);
}
elsif ($1 eq "pass-if") {
@{$STEP_PASS_IF{"$module:$step"}} = split(/\s+/,$2);
push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2);
}
elsif ($1 eq "ignore-unless") {
$STEP_IGNORE{"$module:$step"} = $2;
}
@ -486,6 +491,15 @@ sub find_steps_for_module {
$PASS{$#DO_STEP}++ if $flag;
}
if (defined($STEP_PASS_IF{$defined_step})) {
my $flag = 0;
foreach my $pass (@{$STEP_PASS_IF{$defined_step}}) {
$flag = 1
if &backoff_and_get(&extend_local_name($module,$set,$pass));
}
$PASS{$#DO_STEP}++ if $flag;
}
# special case for passing: steps that only affect factor 0
if (defined($ONLY_FACTOR_0{$defined_step})) {
my $FACTOR = &backoff_and_get_array("LM:$set:factors");
@ -737,6 +751,7 @@ sub find_re_use {
# summarize and convert hashes into integers for to be re-used
print "\nSTEP SUMMARY:\n";
open(RE_USE,">".&steps_file("re-use.$VERSION",$VERSION));
for(my $i=$#DO_STEP;$i>=0;$i--) {
if ($PASS{$i}) {
$RE_USE[$i] = 0;
@ -747,12 +762,16 @@ sub find_re_use {
my @ALL = sort { $a <=> $b} keys %{$RE_USE[$i]};
print "re-using (".join(" ",@ALL).")\n";
$RE_USE[$i] = $ALL[0];
if ($ALL[0] != $VERSION) {
print RE_USE "$DO_STEP[$i] $ALL[0]\n";
}
}
else {
print "run\n";
$RE_USE[$i] = 0;
}
}
close(RE_USE);
}
sub find_dependencies {
@ -816,10 +835,10 @@ sub draw_agenda_graph {
$step .= " (".$RE_USE[$i].")" if $RE_USE[$i];
my $color = "green";
$color = "#0000ff" if defined($DO{$i}) && $DO{$i} >= 1;
$color = "#8080ff" if defined($DONE{$i});
$color = "red" if defined($CRASHED{$i});
$color = "lightblue" if $RE_USE[$i];
$color = "#0000ff" if defined($DO{$i}) && $DO{$i} >= 1;
$color = "#8080ff" if defined($DONE{$i}) || ($RE_USE[$i] && $RE_USE[$i] == $VERSION);
$color = "red" if defined($CRASHED{$i});
$color = "lightyellow" if defined($PASS{$i});
print DOT " $i [label=\"$step\",shape=box,fontsize=10,height=0,style=filled,fillcolor=\"$color\"];\n";
@ -893,6 +912,9 @@ sub define_step {
elsif ($DO_STEP[$i] eq 'TRAINING:symmetrize-giza') {
&define_training_symmetrize_giza($i);
}
elsif ($DO_STEP[$i] eq 'TRAINING:build-biconcor') {
&define_training_build_biconcor($i);
}
elsif ($DO_STEP[$i] eq 'TRAINING:build-lex-trans') {
&define_training_build_lex_trans($i);
}
@ -1128,13 +1150,12 @@ sub check_info {
print "\tcheck parameter count current: ".(scalar keys %VALUE).", old: ".(scalar keys %INFO)."\n" if $VERBOSE;
return 0 unless scalar keys %INFO == scalar keys %VALUE;
foreach my $parameter (keys %VALUE) {
if (! defined($VALUE{$parameter})) {
print "\tcurrent has not '$parameter' -> not re-usable\n" if $VERBOSE;
if (! defined($INFO{$parameter})) {
print "\told has no '$parameter' -> not re-usable\n" if $VERBOSE;
return 0;
}
print "\tcheck '$VALUE{$parameter}' eq '$INFO{$parameter}' -> " if $VERBOSE;
if (defined($INFO{$parameter})
&& &match_info_strings($VALUE{$parameter},$INFO{$parameter})) {
if (&match_info_strings($VALUE{$parameter},$INFO{$parameter})) {
print "ok\n" if $VERBOSE;
}
else {
@ -1148,6 +1169,8 @@ sub check_info {
sub match_info_strings {
my ($current,$old) = @_;
$current =~ s/ $//;
$old =~ s/ $//;
return 1 if $current eq $old;
# ignore time stamps, if that option is used
if (defined($IGNORE_TIME)) {
@ -1469,15 +1492,22 @@ sub factorize_one_language {
my $script = &check_and_get("$type:$factor:factor-script");
my $out = "$outfile.$factor";
if ($parallelizer && defined($PARALLELIZE{&defined_step($DO_STEP[$step_id])})
&& &get("$module:jobs") && $CLUSTER) {
&& ( (&get("$module:jobs") && $CLUSTER)
|| (&get("$module:cores") && $MULTICORE))) {
my $subdir = $module;
$subdir =~ tr/A-Z/a-z/;
$subdir .= "/tmp.$set.$stepname.$type.$factor.$VERSION";
if ($CLUSTER) {
my $qflags = "";
my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
my $qflags = "--queue-flags \"$qsub_args\"";
$qflags="--queue-flags \"$qsub_args\"" if ($CLUSTER && $qsub_args);
$cmd .= "$parallelizer $qflags -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -jobs ".&get("$module:jobs")." -tmpdir $temp_dir/$subdir\n";
$QSUB_STEP{$step_id}++;
}
elsif ($MULTICORE) {
$cmd .= "$parallelizer -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -cores ".&get("$module:cores")." -tmpdir $temp_dir/$subdir\n";
}
}
else {
$cmd .= "$script $infile $out $temp_dir\n";
}
@ -1597,6 +1627,19 @@ sub define_training_symmetrize_giza {
&create_step($step_id,$cmd);
}
sub define_training_build_biconcor {
my ($step_id) = @_;
my ($model, $aligned,$corpus) = &get_output_and_input($step_id);
my $biconcor = &check_and_get("TRAINING:biconcor");
my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
my $method = &check_and_get("TRAINING:alignment-symmetrization-method");
my $cmd = "$biconcor -c $corpus.$input_extension -t $corpus.$output_extension -a $aligned.$method -s $model";
&create_step($step_id,$cmd);
}
sub define_training_build_lex_trans {
my ($step_id) = @_;
@ -1683,6 +1726,7 @@ sub define_training_create_config {
my ($config,
$reordering_table,$phrase_translation_table,$generation_table,@LM)
= &get_output_and_input($step_id);
if ($LM[$#LM] =~ /biconcor/) { pop @LM; }
my $cmd = &get_training_setting(9);
@ -1737,7 +1781,7 @@ sub define_training_create_config {
$cmd .= "-lm $factor:$order:$LM[0]:$type ";
}
else {
die("ERROR: number of defined LM sets (".(scalar @LM_SETS).") and LM files (".(scalar @LM).") does not match")
die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
unless scalar @LM == scalar @LM_SETS;
foreach my $lm (@LM) {
my $set = shift @LM_SETS;
@ -2020,11 +2064,15 @@ sub define_evaluation_decode {
$cmd .= " -queue-parameters \"$qsub_args\"" if ($CLUSTER && $qsub_args);
$cmd .= " -decoder $decoder -config $dir/evaluation/filtered.$set.$VERSION/moses.ini -input-file $input --jobs $jobs -decoder-parameters \"$settings\" > $system_output";
$cmd .= " -n-best-file $system_output.best$nbest -n-best-size $nbest" if $nbest;
my $nbest_size;
$nbest_size = $nbest + 0 if $nbest;
$cmd .= " -n-best-file $system_output.best$nbest_size -n-best-size $nbest" if $nbest;
}
else {
$cmd = $filter."\n$decoder $settings -v 0 -f $dir/evaluation/filtered.$set.$VERSION/moses.ini < $input > $system_output";
$cmd .= " -n-best-list $system_output.best$nbest $nbest" if $nbest;
my $nbest_size;
$nbest_size = $nbest + 0 if $nbest;
$cmd .= " -n-best-list $system_output.best$nbest_size $nbest" if $nbest;
}
&create_step($step_id,$cmd);

View File

@ -5,7 +5,7 @@ use Getopt::Long "GetOptions";
my $MAX_LENGTH = 4;
my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable,$hierarchical);
my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable,$hierarchical,$output_corpus,$alignment,$biconcor);
if (!&GetOptions('system=s' => \$system, # raw output from decoder
'reference=s' => \$reference, # tokenized reference
'dir=s' => \$dir, # directory for storing results
@ -13,9 +13,12 @@ if (!&GetOptions('system=s' => \$system, # raw output from decoder
'segmentation=s' => \$segmentation, # system output with segmentation markup
'input-corpus=s' => \$corpus, # input side of parallel training corpus
'ttable=s' => \$ttable, # phrase translation table used for decoding
'output-corpus=s' => \$output_corpus, # output side of parallel training corpus
'alignment-file=s' => \$alignment, # alignment of parallel corpus
'biconcor=s' => \$biconcor, # binary for bilingual concordancer
'hierarchical' => \$hierarchical) || # hierarchical model?
!defined($dir)) {
die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE]");
die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE] [-output-corpus FILE] [-alignment-file FILE] [-biconcor BIN]");
}
`mkdir -p $dir`;
@ -84,6 +87,11 @@ if (defined($ttable) || defined($corpus)) {
&input_annotation();
}
# bilingual concordance -- not used by experiment.perl
if (defined($corpus) && defined($output_corpus) && defined($alignment) && defined($biconcor)) {
`$biconcor -s $dir/biconcor -c $corpus -t $output_corpus -a $alignment`;
}
sub best_matches {
my ($CORRECT,$TOTAL,$out) = @_;
my $type = ($out =~ /precision/) ? "precision" : "recall";
@ -208,6 +216,9 @@ sub ttable_coverage {
if (! -e $ttable && -e $ttable.".gz") {
open(TTABLE,"gzip -cd $ttable.gz|");
}
elsif ($ttable =~ /.gz$/) {
open(TTABLE,"gzip -cd $ttable|");
}
else {
open(TTABLE,$ttable) or die "Can't read ttable $ttable";
}
@ -219,7 +230,7 @@ sub ttable_coverage {
my @COLUMN = split(/ \|\|\| /);
my ($in,$out,$scores) = @COLUMN;
# handling hierarchical
$in =~ s/\[[^ \]]+\]$//; # remove lhs nt
$in =~ s/ \[[^ \]]+\]$//; # remove lhs nt
next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/; # only consider flat rules
$scores = $COLUMN[4] if scalar @COLUMN == 5;
my @IN = split(/ /,$in);
@ -255,6 +266,7 @@ sub compute_entropy {
}
my $entropy = 0;
foreach my $p (@_) {
next if $p == 0;
$entropy -= ($p/$z)*log($p/$z)/log(2);
}
return $entropy;
@ -465,7 +477,7 @@ sub hierarchical_segmentation {
open(OUTPUT_TREE,">$dir/output-tree");
open(NODE,">$dir/node");
while(<TRACE>) {
/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_");
/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_");
my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
if ($last_sentence >= 0 && $sentence != $last_sentence) {
&hs_process($last_sentence,\@DERIVATION,\%STATS);
@ -481,7 +493,7 @@ sub hierarchical_segmentation {
@{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs);
foreach (split(/ /,$alignment)) {
/(\d+)\-(\d+)/ || die("funny alignment: $_\n");
/(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
$ITEM{'alignment'}{$2} = $1; # target non-terminal to source span
$ITEM{'alignedSpan'}{$1} = 1;
}
@ -528,12 +540,14 @@ sub hs_process {
my $x=0;
while(1) {
my $RULE = shift @{$DERIVATION};
if ($$RULE{'rule_lhs'} eq "S" &&
scalar(@{$$RULE{'rule_rhs'}}) == 2 &&
if (scalar(@{$$RULE{'rule_rhs'}}) == 2 &&
($$RULE{'rule_lhs'} eq "S" &&
$$RULE{'rule_rhs'}[0] eq "S" &&
$$RULE{'rule_rhs'}[1] eq "X") {
$$RULE{'rule_rhs'}[1] eq "X") ||
($$RULE{'rule_lhs'} eq "Q" &&
$$RULE{'rule_rhs'}[0] eq "Q")) {
unshift @{$GLUE_RULE{'spans'}},$$RULE{'spans'}[1];
push @{$GLUE_RULE{'rule_rhs'}}, "X";
push @{$GLUE_RULE{'rule_rhs'}}, $$RULE{'rule_rhs'}[1];
$GLUE_RULE{'alignment'}{$x} = $x;
$GLUE_RULE{'alignedSpan'}{$x} = 1;
$x++;

View File: web analysis (PHP)

@ -33,9 +33,9 @@ function generic_show(field,parameters) {
}
function highlight_phrase(sentence,phrase) {
var input = "input-"+sentence+"-"+phrase;
$(input).setStyle({ borderWidth: '3px', borderColor: 'red' });
$(input).setStyle({ borderColor: 'red' });
var output = "output-"+sentence+"-"+phrase;
$(output).setStyle({ borderWidth: '3px', borderColor: 'red' });
$(output).setStyle({ borderColor: 'red' });
}
function show_word_info(sentence,cc,tc,te) {
var info = "info-"+sentence;
@ -44,14 +44,30 @@ function show_word_info(sentence,cc,tc,te) {
}
function lowlight_phrase(sentence,phrase) {
var input = "input-"+sentence+"-"+phrase;
$(input).setStyle({ borderWidth: '1px', borderColor: 'black' });
$(input).setStyle({ borderColor: 'black' });
var output = "output-"+sentence+"-"+phrase;
$(output).setStyle({ borderWidth: '1px', borderColor: 'black' });
$(output).setStyle({ borderColor: 'black' });
}
function hide_word_info(sentence) {
var info = "info-"+sentence;
$(info).setStyle({ opacity: 0 });
}
function show_biconcor(sentence,phrase) {
var div = "biconcor-"+sentence;
var url = '?analysis=biconcor'
+ '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$id); ?>&set=<?php print $set ?>'
+ '&sentence=' + sentence
+ '&phrase=' + encodeURIComponent(phrase);
document.getElementById(div).innerHTML = "<center><img src=\"spinner.gif\" width=48 height=48></center>";
$(div).setStyle({ borderStyle: 'solid', 'border-width': '3px', borderColor: 'black' });
new Ajax.Updater(div, url, { method: 'get', evalScripts: true });
}
function close_biconcor(sentence) {
var div = "biconcor-"+sentence;
document.getElementById(div).innerHTML = "";
$(div).setStyle({ borderStyle: 'none', 'border-width': '0px', borderColor: 'white' });
}
</script>
</head>
<body>
@ -586,7 +602,7 @@ function bleu_show() {
// annotated sentences core: reads data, sorts sentences, displays them
function sentence_annotation() {
global $set,$id,$dir;
global $set,$id,$dir,$biconcor;
// load data
$data = file("$dir/evaluation/$set.analysis.$id/bleu-annotation");
@ -635,19 +651,19 @@ function sentence_annotation() {
if ($sentence != $last_sentence) { $span = 0; }
$last_sentence = $sentence;
$segmentation[$sentence][$span]["brackets"] = $brackets;
$segmentation[$sentence][$span]["nt"] = $nt;
# $segmentation[$sentence][$span]["nt"] = $nt;
$segmentation[$sentence][$span]["words"] = rtrim($words);
if ($nt != "") { $nt_count[$nt]++; }
$span++;
}
$hierarchical = 1;
if (count($nt_count) <= 2) {
foreach ($segmentation as $sentence => $segmentation_span) {
foreach ($segmentation_span as $span => $type) {
$segmentation[$sentence][$span]["nt"]="";
}
}
}
# if (count($nt_count) <= 2) {
# foreach ($segmentation as $sentence => $segmentation_span) {
# foreach ($segmentation_span as $span => $type) {
# $segmentation[$sentence][$span]["nt"]="";
# }
# }
# }
}
if (file_exists("$dir/evaluation/$set.analysis.$id/output-tree")) {
$data = file("$dir/evaluation/$set.analysis.$id/output-tree");
@ -690,6 +706,8 @@ function sentence_annotation() {
}
}
$biconcor = get_biconcor_version($dir,$id);
// sort
global $sort;
$sort = $_GET['sort'];
@ -739,6 +757,10 @@ function sentence_annotation() {
}
if ($input) {
print "<div id=\"info-$i\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">8364 occ. in corpus, 56 translations, entropy: 5.54</div>\n";
if ($biconcor) {
//print "<div id=\"biconcor-$i\" style=\"display: none;\">xxx</div>";
print "<div id=\"biconcor-$i\" class=\"biconcor\">xxx</div>";
}
if ($hierarchical) {
sentence_annotation_hierarchical("#".$line["id"],$line["id"],$input[$line["id"]],$segmentation[$line["id"]],"in");
}
@ -761,8 +783,25 @@ function sentence_annotation() {
}
}
function coverage($coverage_vector) {
# get information from line in input annotation file
$coverage = array();
foreach (split(" ",$coverage_vector) as $item) {
if (preg_match("/[\-:]/",$item)) {
list($from,$to,$corpus_count,$ttable_count,$ttable_entropy) = preg_split("/[\-:]/",$item);
$coverage[$from][$to]["corpus_count"] = $corpus_count;
$coverage[$from][$to]["ttable_count"] = $ttable_count;
$coverage[$from][$to]["ttable_entropy"] = $ttable_entropy;
}
}
return $coverage;
}
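# Example (field layout assumed from the preg_split above): the item "0-1:8364:56:5.54"
# in a coverage vector yields
#   $coverage[0][1] = array("corpus_count"=>8364, "ttable_count"=>56, "ttable_entropy"=>5.54);
# i.e. source span 0..1 occurs 8364 times in the corpus and has 56 ttable entries with entropy 5.54.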
// annotate an input sentence
function input_annotation($sentence,$input,$segmentation) {
global $biconcor;
list($words,$coverage_vector) = split("\t",$input);
# get information from line in input annotation file
@ -840,7 +879,7 @@ function input_annotation($sentence,$input,$segmentation) {
$highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
$lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
}
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords\">";
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($phrase)."');\"":"").">";
}
print "</div></td>";
$from += $size-1;
@ -868,7 +907,7 @@ function input_annotation($sentence,$input,$segmentation) {
$color = '#ffffff';
$cc = 0; $tc = 0; $te = 0;
}
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\">$word[$j]</span>";
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($word[$j])."');\"":"").">$word[$j]</span>";
if ($segmentation && array_key_exists($j,$segmentation["input_end"])) {
print "</span>";
}
@ -945,7 +984,10 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
function sentence_annotation_hierarchical($info,$sentence,$sequence,$segmentation,$in_out) {
$In_Out = $in_out == "out" ? "Out" : "In";
$word = split(" ",$sequence);
list($words,$coverage_vector) = split("\t",$sequence);
$coverage = coverage($coverage_vector);
$word = preg_split("/\s/",$words);
$color = array("#ffe0e0","#f0e0ff","#e0e0ff","#c0c0ff","#a0a0ff");
#$color = array("#FFC0C0","#FFC0FF","#C0C0FF","#C0FFFF","#C0FFC0");
#$color = array("#c0c0c0","#e0e0ff","#b0b0ff","#8080ff","#4040ff");
@ -983,7 +1025,9 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
for($w=0;$w<count($span_word);$w++) {
if ($w > 0) { print " "; }
if ($in_out == "in") {
#print "<span style=\"background-color: ".coverage_color($coverage[$word_count][$word_count]).";\">";
print $word[$word_count];
#print "</span>";
}
else {
list($surface,$correct) = split("\|", $word[$word_count]);
@ -1000,3 +1044,22 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
}
print "</td></tr></table>\n";
}
function biconcor($query) {
global $set,$id,$dir;
$sentence = $_GET['sentence'];
$biconcor = get_biconcor_version($dir,$id);
print "<center>
<form action=\"...\" method=get>
<img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\">
<input size=20 value=\"".htmlspecialchars($query)."\">
<input type=submit value=\"look up\">
</form>
<div class=\"biconcor-content\">";
$cmd = "./biconcor -l $dir/model/biconcor.$biconcor -q ".escapeshellarg($query)." 2>/dev/null";
# print $cmd."<p>";
system($cmd);
# print "<p>done.";
print "</div></center>";
}
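# Example (paths hypothetical): with $dir="/data/exp", $biconcor=3 and query "the house",
# the command run by system() above is
#   ./biconcor -l /data/exp/model/biconcor.3 -q 'the house' 2>/dev/null
# and its stdout is streamed straight into the page as the concordance content.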

View File

@ -250,12 +250,12 @@ function bleu_diff_annotation() {
$matched_with_score = string_edit_distance($word_with_score0,$word_with_score1);
$matched = string_edit_distance($word0,$word1);
print "<font size=-2>[".$line["id"].":".$line["bleu1"]."]</font> ";
print "<font size=-2>[".$id2."-".$line["id"].":".$line["bleu1"]."]</font> ";
$matched1 = preg_replace('/D/',"",$matched);
$matched_with_score1 = preg_replace('/D/',"",$matched_with_score);
bleu_line_diff( $word_with_score1, $matched1, $matched_with_score1 );
print "<font size=-2>[".$line["id"].":".$line["bleu0"]."]</font> ";
print "<font size=-2>[".$id."-".$line["id"].":".$line["bleu0"]."]</font> ";
$matched0 = preg_replace('/I/',"",$matched);
$matched_with_score0 = preg_replace('/I/',"",$matched_with_score);
bleu_line_diff( $word_with_score0, $matched0, $matched_with_score0 );

View File

@ -0,0 +1,51 @@
.pp_head {
font-size: 150%;
font-weight: bold;
text-align: center;
}
.pp_target_header {
font-size: 120%;
font-weight: bold;
text-align: center;
}
table.biconcor {
table-layout: fixed;
padding: 0px;
margin: 0px;
}
tr.biconcor {
padding: 0px;
margin: 0px;
}
td.biconcor {
white-space: nowrap;
overflow: hidden;
padding: 0px;
margin: 0px;
}
td.pp_source_left {
text-align: right;
}
td.pp_target_left {
text-align: right;
}
td.pp_source {
font-weight: bold;
}
td.pp_target {
font-weight: bold;
}
td.pp_source_right {
border-style: solid;
border-width: 0px 2px 0px 0px;
border-color: black;
}

View File

@ -2,7 +2,7 @@
function diff() {
global $experiment;
$display = $_GET[run];
$display = $_GET["run"];
sort($display);
while (list($i,$run) = each($display)) {
if ($i==0) {
@ -22,12 +22,15 @@ function compute_diff($base,$change) {
$parameter_change = load_parameter($change);
print "<H3>Experiment $change</H3><TABLE>";
while (list($parameter,$base_value) = each($parameter_base)) {
if (!array_key_exists($parameter,$parameter_change)) {
$parameter_change[$parameter] = "";
}
if ($base_value != $parameter_change[$parameter]) {
output_diff_line($parameter,$base_value,$parameter_change[$parameter]);
}
}
while (list($parameter,$change_value) = each($parameter_change)) {
if (!$parameter_base[$parameter]) {
if (!array_key_exists($parameter,$parameter_base)) {
output_diff_line($parameter,"",$change_value);
}
}

View File

@ -13,6 +13,7 @@ function head($title) {
<script language="javascript" src="/javascripts/scriptaculous.js"></script>
<script language="javascript" src="hierarchical-segmentation.js"></script>
<link href="hierarchical-segmentation.css" rel="stylesheet" type="text/css">
<link href="bilingual-concordance.css" rel="stylesheet" type="text/css">
</head>
<body><h2>'.$title."</h2>\n";
}
@ -35,6 +36,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) {
else if ($action == "PrecisionRecallDetails_show") { precision_recall_details(); }
else if ($action == "CoverageDetails_show") { coverage_details(); }
else if ($action == "SegmentationSummary_show") { segmentation_summary(); }
else if ($action == "biconcor") { biconcor($_GET["phrase"]); }
else { print "ERROR! $action"; }
}
else if (array_key_exists("analysis_diff_home",$_GET)) {

View File

@ -39,7 +39,7 @@ function load_experiment_info() {
reset($experiment);
while (list($id,$info) = each($experiment)) {
if (file_exists($dir."/steps/new") ||
file_exists($dir."/steps/1")) {
file_exists($dir."/steps/$id")) {
$stat = stat("$dir/steps/$id/parameter.$id");
}
else {
@ -71,7 +71,7 @@ function load_experiment_info() {
function load_parameter($run) {
global $dir;
if (file_exists($dir."/steps/new") ||
file_exists($dir."/steps/1")) {
file_exists($dir."/steps/$run")) {
$file = file("$dir/steps/$run/parameter.$run");
}
else {
@ -123,3 +123,49 @@ function process_file_entry($dir,$entry) {
}
}
}
function get_coverage_analysis_version($dir,$set,$id) {
if (file_exists("$dir/evaluation/$set.analysis.$id/input-annotation")) {
return $id;
}
if (file_exists("$dir/steps/$id/re-use.$id")) {
$re_use = file("$dir/steps/$id/re-use.$id");
foreach($re_use as $line) {
if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) &&
$match[1] == $set &&
file_exists("$dir/evaluation/$set.analysis.$match[2]/input-annotation")) {
return $match[2];
}
}
}
# legacy stuff below...
if (! file_exists("$dir/steps/$id/REPORTING_report.$id")) {
return 0;
}
$report = file("$dir/steps/$id/REPORTING_report.$id.INFO");
foreach ($report as $line) {
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) &&
$match[2] == $set) {
$reuse_id = $match[1];
if (file_exists("$dir/evaluation/$set.analysis.$reuse_id/input-annotation")) {
return $reuse_id;
}
}
}
return 0;
}
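# Example (file format assumed from the regex above): a line
#   EVALUATION:test:analysis-coverage 2
# in steps/5/re-use.5 makes run 5 fall back to the input-annotation produced by run 2 for set "test".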
function get_biconcor_version($dir,$id) {
if (file_exists("$dir/model/biconcor.$id")) {
return $id;
}
if (! file_exists("$dir/steps/$id/re-use.$id")) {
return 0;
}
$re_use = file("$dir/steps/$id/re-use.$id");
foreach($re_use as $line) {
if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) &&
file_exists("$dir/model/biconcor.$match[1]")) {
return $match[1];
}
}
return 0;
}
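# Example (format assumed from the regex above): a re-use line "TRAINING:build-biconcor 1"
# points the current run at model/biconcor.1, provided that file exists.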

View File

@ -11,7 +11,7 @@ function setup() {
print "<TR><TD><A HREF=\"?setup=$dir[0]\">$dir[0]</A></TD><TD>$dir[1]</TD><TD>$dir[2]</TD><TD>$dir[3]</TD></TR>\n";
}
print "</TABLE>\n";
print "<P>To add experiment, edit /fs/thor4/html/experiment/setup";
print "<P>To add experiment, edit setup";
}
function overview() {

View File

@ -6,6 +6,7 @@ use Getopt::Long;
my $help;
my $lc = 0; # lowercase the corpus?
my $ignore_ratio = 0;
my $ignore_xml = 0;
my $enc = "utf8"; # encoding of the input and output files
# set to anything else you wish, but I have not tested it yet
my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
@ -17,6 +18,7 @@ GetOptions(
"lowercase|lc" => \$lc,
"encoding=s" => \$enc,
"ignore-ratio" => \$ignore_ratio,
"ignore-xml" => \$ignore_xml,
"max-word-length|mwl=s" => \$max_word_length
) or exit(1);
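# Usage sketch (corpus/file names hypothetical; positional argument order assumed from
# the variables used in this script): keep sentence pairs of 1-100 words, ignoring XML
# markup when counting:
#   clean-corpus-n.perl --ignore-xml corpus de en corpus.clean 1 100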
@ -108,14 +110,15 @@ while(my $f = <F>) {
$f =~ s/ $//;
next if $f eq '';
next if $e eq '';
my @E = split(/ /,$e);
my @F = split(/ /,$f);
next if scalar(@E) > $max;
next if scalar(@F) > $max;
next if scalar(@E) < $min;
next if scalar(@F) < $min;
next if !$ignore_ratio && scalar(@E)/scalar(@F) > 9;
next if !$ignore_ratio && scalar(@F)/scalar(@E) > 9;
my $ec = &word_count($e);
my $fc = &word_count($f);
next if $ec > $max;
next if $fc > $max;
next if $ec < $min;
next if $fc < $min;
next if !$ignore_ratio && $ec/$fc > 9;
next if !$ignore_ratio && $fc/$ec > 9;
# Skip this segment if any factor is longer than $max_word_length
my $max_word_length_plus_one = $max_word_length + 1;
next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
@ -127,7 +130,6 @@ while(my $f = <F>) {
die "There is a blank factor in $corpus.$l2 on line $innr: $e"
if $e =~ /[ \|]\|/;
$outnr++;
print FO $f."\n";
print EO $e."\n";
@ -146,3 +148,15 @@ my $e = <E>;
die "$corpus.$l2 is too long!" if defined $e;
print STDERR "Input sentences: $innr Output sentences: $outnr\n";
sub word_count {
my ($line) = @_;
if ($ignore_xml) {
$line =~ s/<\S[^>]*\S>//g;
$line =~ s/\s+/ /g;
$line =~ s/^ //g;
$line =~ s/ $//g;
}
my @w = split(/ /,$line);
return scalar @w;
}
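# Example (hypothetical input): with --ignore-xml set, markup is stripped before counting, so
#   &word_count("<seg id=\"1\"> hello world </seg>")   # returns 2 words, not 5 raw tokens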

View File

@ -15,9 +15,9 @@
using namespace std;
#define MAX_WORD 1000 //maximum length of source/target strings
#define MAX_M 200 //maximum length of source strings
#define MAX_N 200 //maximum length of target strings
#define MAX_WORD 10000 // maximum length of source/target strings
#define MAX_M 200 // maximum length of source strings
#define MAX_N 200 // maximum length of target strings
#define UNION 1
#define INTERSECT 2

View File

@ -12,7 +12,9 @@ if (!&GetOptions('mxpost=s' => \$MXPOST) ||
exit(1);
}
open(TAGGER,"cat $IN | perl -ne 's/—/-/g; s/\\p{Dash_Punctuation}/-/g; s/\\p{Open_Punctuation}/\(/g; s/\\p{Close_Punctuation}/\)/g; s/\\p{Initial_Punctuation}/\"/g; s/\\p{Final_Punctuation}/\"/g; s/\\p{Connector_Punctuation}/-/g; s/•/*/g; s/\\p{Currency_Symbol}/\\\$/g; s/\\p{Math_Symbol}/*/g; print \$_;' | $MXPOST/mxpost |");
my $pipeline = "perl -ne 'chop; tr/\\x20-\\x7f/\?/c; print \$_.\"\\n\";' | tee debug | ";
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
open(TAGGER,"cat $IN | $pipeline");
open(OUT,">$OUT");
while(<TAGGER>) {
foreach my $word_pos (split) {

View File

@ -0,0 +1,26 @@
#!/usr/bin/perl -w
use strict;
my ($size,$in,$out) = @ARGV;
open(IN,$in) || die "ERROR: could not open input file '$in'";
open(OUT,">$out") || die "ERROR: could not open output file '$out'";
binmode(IN, ":utf8");
binmode(OUT, ":utf8");
while(<IN>) {
my $first = 1;
chomp; s/\s+/ /g; s/^ //; s/ $//;
foreach my $word (split) {
if (length($word) > $size) {
$word = substr($word,length($word)-$size);
}
print OUT " " unless $first;
$first = 0;
print OUT lc($word);
}
print OUT "\n";
}
close(OUT);
close(IN);
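# Usage sketch (script/file names hypothetical): truncate every word to its last 4
# characters, lowercased -- e.g. "Hausaufgabe" becomes "gabe":
#   perl truncate-words.perl 4 corpus.tok corpus.suffix4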

View File

@ -24,7 +24,7 @@ GetOptions(
# parser settings
my $MaxChar=10000;
my $MaxWord=200;
my $MaxWord=120;
my $ParserBin="$COLLINS/code/parser";
my $ParserEvn="$COLLINS/models/model2/events.gz";
my $ParserGrm="$COLLINS/models/model2/grammar";
@ -37,8 +37,13 @@ $pipeline .= "perl -ne 'tr/\\x20-\\x7f//cd; print \$_.\"\\n\";' | ";
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
open(TAG,$pipeline);
open(PARSER_IN,">$tmpfile");
my $sentence_count=0;
while(<TAG>) {
if ($sentence_count % 2000 == 0) {
close(PARSER_IN) if $sentence_count;
open(PARSER_IN,sprintf(">%s.%05d",$tmpfile,$sentence_count/2000));
}
$sentence_count++;
chop;
# convert tagged sequence into parser format
@ -53,14 +58,16 @@ while(<TAG>) {
close(TAG);
close(PARSER_IN);
# parse and process output of parser
`rm $RAW` if defined($RAW) && -e $RAW;
$pipeline = "gunzip -c $ParserEvn | $ParserBin $tmpfile $ParserGrm 10000 1 1 1 1 |";
$pipeline .= "tee -a \"$RAW\" |" if defined($RAW);
# parse
for(my $i=0;$i * 2000 < $sentence_count;$i++) {
my $i_formatted = sprintf("%05d",$i);
`gunzip -c $ParserEvn | $ParserBin $tmpfile.$i_formatted $ParserGrm 10000 1 1 1 1 > $tmpfile.$i_formatted.out`;
}
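# Example (sentence count assumed): 4500 tagged sentences yield chunks $tmpfile.00000,
# $tmpfile.00001 and $tmpfile.00002; each is parsed separately and the .out files are
# concatenated again below.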
# process output of parser
my $DEBUG = 0;
my $DEBUG_SPACE = " ";
open(PARSER,$pipeline);
open(PARSER,"cat $tmpfile.?????.out|");
while(my $line = <PARSER>) {
next unless $line =~ /^\(/;
if ($line =~ /SentenceTooLong/) {
@ -112,7 +119,7 @@ while(my $line = <PARSER>) {
my $first=1;
foreach (@OUT) {
print " " unless $first;
s/\\//;
# s/\\//; #why?
print $_;
$first = 0;
}
@ -129,14 +136,15 @@ sub escape {
sub check_length {
my ($line) = @_;
my ($ret,$numc,$numw,@words);
my ($numc,$numw,@words);
return 0 if $line =~ /^\d+ [^a-z0-9]+$/i || $line eq "0" || $line eq "0 ";
$numc = length($line);
@words = split(" ",$line);
$numw = ($#words+1)/2;
$ret = (($numc <= $MaxChar) && ($numw <= $MaxWord));
$ret;
return ($numc <= $MaxChar) && ($numw <= $MaxWord);
}
sub conv_posfmt {