mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
improvements to web analysis, fixes to syntax wrappers
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3633 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
88eaf49c5e
commit
85a5a13e4c
171
scripts/ems/biconcor/Alignment.cpp
Normal file
171
scripts/ems/biconcor/Alignment.cpp
Normal file
@ -0,0 +1,171 @@
|
||||
#include "Alignment.h"
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Read a word-alignment file ("s-t s-t ..." per sentence) in two passes:
// pass 1 counts alignment points and sentences, pass 2 fills m_array
// (flat s,t,s,t,... byte pairs) and m_sentenceEnd (per sentence, the index
// of its last alignment point's source entry in m_array).
void Alignment::Create( string fileName )
{
  ifstream textFile;
  char line[LINE_MAX_LENGTH];

  // pass 1: count the number of alignment points and sentences
  textFile.open(fileName.c_str());
  if (!textFile) {
    cerr << "ERROR: could not open alignment file " << fileName << endl;
    return;
  }
  istream *fileP = &textFile;
  m_size = 0;
  m_sentenceCount = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector<string> alignmentSequence = Tokenize( line );
    m_size += alignmentSequence.size();
    m_sentenceCount++;
  }
  textFile.close();
  // BUGFIX: close() does not clear the stream's eof/fail flags; without
  // clear() the second read loop below can terminate immediately.
  textFile.clear();
  cerr << m_size << " alignment points" << endl;

  // allocate memory
  m_array = (char*) calloc( sizeof( char ), m_size*2 );
  m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );

  // pass 2: fill the arrays
  int alignmentPointIndex = 0;
  int sentenceId = 0;
  textFile.open(fileName.c_str());
  fileP = &textFile;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector<string> alignmentSequence = Tokenize( line );
    for(size_t i=0; i<alignmentSequence.size(); i++) {
      int s,t;
      // each token must be "source-target"; sscanf must convert BOTH fields
      // (the original only rejected a zero-conversion result and then stored
      // uninitialized s/t for partially matched tokens)
      if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2) {
        cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceId << endl;
        continue; // skip malformed points instead of storing garbage
      }
      m_array[alignmentPointIndex++] = (char) s;
      m_array[alignmentPointIndex++] = (char) t;
    }
    m_sentenceEnd[ sentenceId++ ] = alignmentPointIndex - 2;
  }
  textFile.close();
  cerr << "done reading " << (alignmentPointIndex/2) << " alignment points, " << sentenceId << " sentences." << endl;
}
|
||||
|
||||
// Release the buffers allocated by Create()/Load().
// NOTE(review): if neither Create() nor Load() was ever called, m_array and
// m_sentenceEnd are uninitialized and these free() calls are undefined
// behavior — presumably callers always load first; verify.
Alignment::~Alignment()
{
  free(m_array);
  free(m_sentenceEnd);
}
|
||||
|
||||
// Split a C string on runs of spaces/tabs into a vector of tokens.
// Leading, trailing and repeated separators produce no empty tokens.
vector<string> Alignment::Tokenize( const char input[] ) {
  vector< string > token;
  int pos = 0;
  while (input[pos] != '\0') {
    // skip any run of separators
    while (input[pos] == ' ' || input[pos] == '\t') {
      pos++;
    }
    if (input[pos] == '\0') {
      break;
    }
    // scan one token
    int tokenStart = pos;
    while (input[pos] != '\0' && input[pos] != ' ' && input[pos] != '\t') {
      pos++;
    }
    token.push_back( string( input+tokenStart, pos-tokenStart ) );
  }
  return token;
}
|
||||
|
||||
// Map the source span [source_start,source_end] of the given sentence to the
// minimal target span [target_start,target_end] covering all its alignment
// points. Returns false if the span has no alignment points, or if a target
// word inside the span aligns outside the source span (inconsistent pair).
// On success, pre_null/post_null report how many unaligned target words
// directly precede/follow the target span.
bool Alignment::PhraseAlignment( INDEX sentence, char target_length,
                                 char source_start, char source_end,
                                 char &target_start, char &target_end,
                                 char &pre_null, char &post_null ) {
  vector< char > alignedTargetWords;

  // get index of the first alignment point of this sentence
  INDEX sentenceStart = 0;
  if (sentence > 0) {
    sentenceStart = m_sentenceEnd[ sentence-1 ] + 2;
  }

  // get target phrase boundaries
  target_start = target_length; // sentinel: no alignment point seen yet
  target_end = 0;
  for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
    char source = m_array[ ap ];
    if (source >= source_start && source <= source_end ) {
      char target = m_array[ ap+1 ];
      if (target < target_start) target_start = target;
      if (target > target_end ) target_end = target;
    }
  }
  if (target_start == target_length) {
    return false; // done if no alignment points
  }

  // consistency check: no target word inside the span may align to a source
  // word outside the span
  for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
    char target = m_array[ ap+1 ];
    if (target >= target_start && target <= target_end ) {
      char source = m_array[ ap ];
      if (source < source_start || source > source_end) {
        return false; // alignment point out of range
      }
    }
  }

  // mark which target words are unaligned
  for( int i=0; i<target_length; i++ ) {
    m_unaligned[i] = true;
  }
  for(INDEX ap = sentenceStart; ap <= m_sentenceEnd[ sentence ]; ap += 2 ) {
    char target = m_array[ ap+1 ];
    m_unaligned[ target ] = false;
  }

  // prior unaligned words
  // BUGFIX: loop variable is int, not char — plain char is unsigned on some
  // platforms, where "target >= 0" would always hold and the loop would not
  // terminate correctly.
  pre_null = 0;
  for(int target = target_start-1; target >= 0 && m_unaligned[ target ]; target--) {
    pre_null++;
  }

  // post unaligned words
  post_null = 0;
  for(int target = target_end+1; target < target_length && m_unaligned[ target ]; target++) {
    post_null++;
  }
  return true;
}
|
||||
|
||||
// Serialize the alignment to <fileName>.align (binary layout:
// m_size, m_array, m_sentenceCount, m_sentenceEnd) — counterpart of Load().
void Alignment::Save( string fileName ) {
  // "wb": binary mode, so the dump is not mangled on platforms that
  // translate line endings in text mode
  FILE *pFile = fopen ( (fileName + ".align").c_str() , "wb" );
  if (pFile == NULL) {
    // BUGFIX: the handle was previously passed to fwrite unchecked
    cerr << "ERROR: could not open " << fileName << ".align for writing" << endl;
    return;
  }

  fwrite( &m_size, sizeof(INDEX), 1, pFile );
  fwrite( m_array, sizeof(char), m_size*2, pFile ); // corpus

  fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
  fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
  fclose( pFile );
}
|
||||
|
||||
// Deserialize alignment data from <fileName>.align — counterpart of Save().
void Alignment::Load( string fileName ) {
  FILE *pFile = fopen ( (fileName + ".align").c_str() , "rb" );
  cerr << "loading from " << fileName << ".align" << endl;
  if (pFile == NULL) {
    // BUGFIX: a NULL handle was previously passed straight to fread
    cerr << "ERROR: could not open " << fileName << ".align" << endl;
    return;
  }

  // BUGFIX: check the header reads so a truncated/corrupt file does not
  // silently yield garbage sizes (and huge calloc requests)
  if (fread( &m_size, sizeof(INDEX), 1, pFile ) != 1) {
    cerr << "ERROR: truncated alignment file " << fileName << ".align" << endl;
    fclose( pFile );
    return;
  }
  cerr << "alignment points in corpus: " << m_size << endl;
  m_array = (char*) calloc( sizeof(char), m_size*2 );
  fread( m_array, sizeof(char), m_size*2, pFile ); // corpus

  if (fread( &m_sentenceCount, sizeof(INDEX), 1, pFile ) != 1) {
    cerr << "ERROR: truncated alignment file " << fileName << ".align" << endl;
    fclose( pFile );
    return;
  }
  cerr << "sentences in corpus: " << m_sentenceCount << endl;
  m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount );
  fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
  fclose( pFile );
  cerr << "done loading\n";
}
|
30
scripts/ems/biconcor/Alignment.h
Normal file
30
scripts/ems/biconcor/Alignment.h
Normal file
@ -0,0 +1,30 @@
|
||||
#include "Vocabulary.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
class Alignment
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
char *m_array;
|
||||
INDEX *m_sentenceEnd;
|
||||
INDEX m_size;
|
||||
INDEX m_sentenceCount;
|
||||
char m_unaligned[ 256 ];
|
||||
|
||||
public:
|
||||
~Alignment();
|
||||
|
||||
void Create( string fileName );
|
||||
bool PhraseAlignment( INDEX sentence, char target_length,
|
||||
char source_start, char source_end,
|
||||
char &target_start, char &target_end,
|
||||
char &pre_null, char &post_null );
|
||||
void Load( string fileName );
|
||||
void Save( string fileName );
|
||||
vector<string> Tokenize( const char input[] );
|
||||
};
|
10
scripts/ems/biconcor/Makefile
Normal file
10
scripts/ems/biconcor/Makefile
Normal file
@ -0,0 +1,10 @@
|
||||
all: biconcor

clean:
	rm -f *.o

# 'all' and 'clean' are not files; declare them phony so a stray file with
# the same name never makes these targets appear up to date
.PHONY: all clean

.cpp.o:
	g++ -O3 -g -c $<

biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o
	g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o -o biconcor
|
198
scripts/ems/biconcor/PhrasePair.cpp
Normal file
198
scripts/ems/biconcor/PhrasePair.cpp
Normal file
@ -0,0 +1,198 @@
|
||||
#include "PhrasePair.h"
|
||||
#include "Vocabulary.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Print the phrase pair as one fixed-width text line:
// "<source context> | <target context>", each side centered on its phrase.
void PhrasePair::Print( ostream* out, int width ) {
  // source: context before / the phrase itself / context after
  int sentence_start = m_source_position - m_source_start;
  int source_width = (width-3)/2;
  string source_pre = "";
  string source = "";
  string source_post = "";
  for( int space=0; space<source_width/2; space++ ) source_pre += " "; // left padding
  for( char i=0; i<m_source_start; i++ ) {
    source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
  }
  for( char i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) source += " ";
    source += m_suffixArray->GetWord( sentence_start + i );
  }
  char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
  for( char i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) source_post += " ";
    source_post += m_suffixArray->GetWord( sentence_start + i );
  }
  for( int space=0; space<source_width/2; space++ ) source_post += " "; // right padding

  int source_pre_width = (source_width-(int)source.size()-2)/2;
  int source_post_width = (source_width-(int)source.size()-2+1)/2;

  if ((int)source.size() > width) {
    source_pre_width = 0;
    source_post_width = 0;
  }
  // BUGFIX: for phrases slightly shorter than the column the widths went
  // negative, and substr( size()-negative, ... ) below threw
  // std::out_of_range — clamp both to zero
  if (source_pre_width < 0) source_pre_width = 0;
  if (source_post_width < 0) source_post_width = 0;

  *out << source_pre.substr( source_pre.size()-source_pre_width, source_pre_width ) << " "
       << source.substr( 0, source_width -2 ) << " "
       << source_post.substr( 0, source_post_width ) << " | ";

  // target: same layout on the target side
  int target_width = (width-3)/2;
  string target_pre = "";
  string target = "";
  string target_post = "";
  for( int space=0; space<target_width/2; space++ ) target_pre += " ";
  for( char i=0; i<m_target_start; i++ ) {
    target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
  }
  for( char i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) target += " ";
    target += m_targetCorpus->GetWord( m_sentence_id, i);
  }
  for( char i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) target_post += " ";
    target_post += m_targetCorpus->GetWord( m_sentence_id, i);
  }

  int target_pre_width = (target_width-(int)target.size()-2)/2;
  int target_post_width = (target_width-(int)target.size()-2+1)/2;

  if ((int)target.size() > width) {
    target_pre_width = 0;
    target_post_width = 0;
  }
  // same clamp as on the source side
  if (target_pre_width < 0) target_pre_width = 0;
  if (target_post_width < 0) target_post_width = 0;

  *out << target_pre.substr( target_pre.size()-target_pre_width, target_pre_width ) << " "
       << target.substr( 0, target_width -2 ) << " "
       << target_post.substr( 0, target_post_width ) << endl;
}
|
||||
|
||||
// Write the target phrase (words m_target_start..m_target_end of the
// matched sentence) to *out, space-separated, without a trailing newline.
void PhrasePair::PrintTarget( ostream* out ) {
  bool firstWord = true;
  for( char pos=m_target_start; pos<=m_target_end; pos++ ) {
    if (!firstWord) {
      *out << " ";
    }
    firstWord = false;
    *out << m_targetCorpus->GetWord( m_sentence_id, pos);
  }
}
|
||||
|
||||
// Render the phrase pair as one HTML table row with six cells:
// source left-context / source phrase / source right-context,
// then the same three cells for the target side. No clipping.
void PhrasePair::PrintHTML( ostream* out ) {
  // source
  int sentence_start = m_source_position - m_source_start;
  char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );

  *out << "<tr><td align=right class=\"pp_source_left\">";
  for( char i=0; i<m_source_start; i++ ) {
    if (i>0) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }
  *out << "</td><td class=\"pp_source\">";
  for( char i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }
  *out << "</td><td class=\"pp_source_right\">";
  for( char i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }

  // target
  *out << "</td><td class=\"pp_target_left\">";
  for( char i=0; i<m_target_start; i++ ) {
    if (i>0) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td><td class=\"pp_target\">";
  for( char i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td><td class=\"pp_target_right\">";
  for( char i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td></tr>\n";
}
|
||||
|
||||
// Render the phrase pair as one HTML table row like PrintHTML(), but clip
// the left/right contexts to fit an overall character budget of 'width'
// (half for the source side, half for the target side), marking clipped
// context with "...".
void PhrasePair::PrintClippedHTML( ostream* out, int width ) {
  vector< WORD_ID >::iterator t;

  // source
  int sentence_start = m_source_position - m_source_start;
  int source_width = (width+1)/2;
  string source_pre = "";
  string source = "";
  string source_post = "";
  for( char i=0; i<m_source_start; i++ ) {
    source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
  }
  for( char i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) source += " ";
    source += m_suffixArray->GetWord( sentence_start + i );
  }
  char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
  for( char i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) source_post += " ";
    source_post += m_suffixArray->GetWord( sentence_start + i );
  }
  // character budget for left/right context around the phrase
  int source_pre_width = (source_width-source.size())/2;
  int source_post_width = (source_width-source.size()+1)/2;

  if (source.size() > width) {
    source_pre_width = 0;
    source_post_width = 0;
  }
  // NOTE(review): these size_t-vs-int comparisons promote a negative width
  // to a huge unsigned value, so the clipping branch is (perhaps by luck)
  // skipped when the phrase overflows the budget — the substr calls below
  // are only reached with non-negative widths. Verify before touching.
  if (source_pre.size()>source_pre_width)
    source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
  if (source_post.size() > source_post_width)
    source_post = source_post.substr( 0, source_post_width ) + "...";

  *out << "<tr><td class=\"pp_source_left\">"
       << source_pre
       << "</td><td class=\"pp_source\">"
       << source.substr( 0, source_width -2 )
       << "</td><td class=\"pp_source_right\">"
       << source_post
       << "</td>";

  // target (same scheme, with the remaining half of the budget)
  int target_width = width/2;
  string target_pre = "";
  string target = "";
  string target_post = "";
  for( char i=0; i<m_target_start; i++ ) {
    target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
  }
  for( char i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) target += " ";
    target += m_targetCorpus->GetWord( m_sentence_id, i);
  }
  for( char i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) target_post += " ";
    target_post += m_targetCorpus->GetWord( m_sentence_id, i);
  }

  int target_pre_width = (target_width-target.size())/2;
  int target_post_width = (target_width-target.size()+1)/2;

  if (target.size() > width) {
    target_pre_width = 0;
    target_post_width = 0;
  }
  if (target_pre.size() > target_pre_width)
    target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
  if (target_post.size() > target_post_width)
    target_post = target_post.substr( 0, target_post_width ) + "...";

  *out << "<td class=\"pp_target_left\">"
       << target_pre
       << "</td><td class=\"pp_target\">"
       << target.substr( 0, target_width -2 )
       << "</td><td class=\"pp_target_right\">"
       << target_post
       << "</td></tr>"<< endl;
}
|
||||
|
54
scripts/ems/biconcor/PhrasePair.h
Normal file
54
scripts/ems/biconcor/PhrasePair.h
Normal file
@ -0,0 +1,54 @@
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include "SuffixArray.h"
|
||||
#include "TargetCorpus.h"
|
||||
#include "Alignment.h"
|
||||
#pragma once
|
||||
|
||||
using namespace std;
|
||||
|
||||
class PhrasePair
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
SuffixArray *m_suffixArray;
|
||||
TargetCorpus *m_targetCorpus;
|
||||
Alignment *m_alignment;
|
||||
INDEX m_sentence_id;
|
||||
char m_target_length;
|
||||
SuffixArray::INDEX m_source_position;
|
||||
char m_source_start, m_source_end;
|
||||
char m_target_start, m_target_end;
|
||||
char m_start_null, m_end_null;
|
||||
char m_pre_null, m_post_null;
|
||||
|
||||
public:
|
||||
PhrasePair( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, char target_length, INDEX position, char source_start, char source_end, char target_start, char target_end, char start_null, char end_null, char pre_null, char post_null)
|
||||
:m_suffixArray(sa)
|
||||
,m_targetCorpus(tc)
|
||||
,m_alignment(a)
|
||||
,m_sentence_id(sentence_id)
|
||||
,m_source_position(position)
|
||||
,m_target_length(target_length)
|
||||
,m_source_start(source_start)
|
||||
,m_source_end(source_end)
|
||||
,m_target_start(target_start)
|
||||
,m_target_end(target_end)
|
||||
,m_start_null(start_null)
|
||||
,m_end_null(end_null)
|
||||
,m_pre_null(pre_null)
|
||||
,m_post_null(post_null)
|
||||
{}
|
||||
~PhrasePair () {}
|
||||
|
||||
void PrintTarget( ostream* out );
|
||||
void Print( ostream* out, int width );
|
||||
void PrintHTML( ostream* out );
|
||||
void PrintClippedHTML( ostream* out, int width );
|
||||
};
|
111
scripts/ems/biconcor/PhrasePairCollection.cpp
Normal file
111
scripts/ems/biconcor/PhrasePairCollection.cpp
Normal file
@ -0,0 +1,111 @@
|
||||
#include "PhrasePairCollection.h"
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Wire up the suffix array, target corpus and alignment (presumably owned
// by the caller) and set the sampling/display limits.
PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a )
  :m_suffixArray(sa)
  ,m_targetCorpus(tc)
  ,m_alignment(a)
  ,m_size(0)            // phrase pairs collected so far
  ,m_max_lookup(10000)  // cap on suffix-array matches to inspect (rest sampled)
  ,m_max_pp_target(50)  // cap on distinct target phrases displayed
  ,m_max_pp(50)         // cap on phrase pairs displayed per target phrase
{}
|
||||
|
||||
// BUGFIX: the PhrasePair objects are allocated with new in GetCollection()
// and were never freed — release them here to plug the memory leak.
PhrasePairCollection::~PhrasePairCollection()
{
  for( size_t i=0; i<m_collection.size(); i++ ) {
    for( size_t j=0; j<m_collection[i].size(); j++ ) {
      delete m_collection[i][j];
    }
  }
}
|
||||
|
||||
bool PhrasePairCollection::GetCollection( const vector< string > sourceString ) {
|
||||
INDEX first_match, last_match;
|
||||
if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) {
|
||||
return false;
|
||||
}
|
||||
cerr << "\tfirst match " << first_match << endl;
|
||||
cerr << "\tlast match " << last_match << endl;
|
||||
|
||||
INDEX found = last_match - first_match +1;
|
||||
|
||||
map< vector< WORD_ID >, INDEX > index;
|
||||
for( INDEX i=first_match; i<=last_match; i++ ) {
|
||||
int position = m_suffixArray->GetPosition( i );
|
||||
int source_start = m_suffixArray->GetWordInSentence( position );
|
||||
int source_end = source_start + sourceString.size()-1;
|
||||
INDEX sentence_id = m_suffixArray->GetSentence( position );
|
||||
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
|
||||
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
|
||||
cerr << "match " << (i-first_match)
|
||||
<< " in sentence " << sentence_id
|
||||
<< ", starting at word " << source_start
|
||||
<< " of " << sentence_length
|
||||
<< ". target sentence has " << target_length << " words.";
|
||||
char target_start, target_end, pre_null, post_null;
|
||||
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
|
||||
cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
|
||||
cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
|
||||
for( char pre = 0; pre <= pre_null; pre++ ) {
|
||||
for( char post = 0; post <= post_null; post++ ) {
|
||||
vector< WORD_ID > targetString;
|
||||
cerr << "; ";
|
||||
for( char target = target_start-pre; target <= target_end+post; target++ ) {
|
||||
targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) );
|
||||
cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
|
||||
}
|
||||
PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post);
|
||||
// matchCollection.Add( sentence_id, )
|
||||
if (index.find( targetString ) == index.end()) {
|
||||
index[targetString] = m_collection.size();
|
||||
vector< PhrasePair* > emptyVector;
|
||||
m_collection.push_back( emptyVector );
|
||||
}
|
||||
m_collection[ index[targetString] ].push_back( phrasePair );
|
||||
m_size++;
|
||||
}
|
||||
}
|
||||
}
|
||||
cerr << endl;
|
||||
|
||||
if (found > m_max_lookup) {
|
||||
i += found/m_max_lookup-1;
|
||||
}
|
||||
}
|
||||
sort(m_collection.begin(), m_collection.end(), CompareBySize());
|
||||
}
|
||||
|
||||
// Dump every target-phrase group to stdout as plain text: the target
// phrase, its occurrence count, then each phrase pair printed at width 100.
void PhrasePairCollection::Print() {
  vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) {
    // groups are created non-empty in GetCollection, so begin() is valid
    (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
    int count = ppWithSameTarget->size();
    cout << "(" << count << ")" << endl;
    vector< PhrasePair* >::iterator p;
    for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) {
      (*p)->Print( &cout, 100 );
    }
  }
}
|
||||
|
||||
// Render the collection as HTML on stdout: one header plus table per
// distinct target phrase, limited to m_max_pp_target groups and roughly
// m_max_pp rows per group (large groups are subsampled by striding).
void PhrasePairCollection::PrintHTML() {
  vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
  int pp_target = 0;
  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
    cout << "<p class=\"pp_target_header\">";
    (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
    int count = ppWithSameTarget->size();
    cout << "(" << count << "/" << m_size << ")" << endl;
    cout << "<p><table align=\"center\">";
    vector< PhrasePair* >::iterator p;
    int pp = 0;
    for(p = ppWithSameTarget->begin(); pp<count && p != ppWithSameTarget->end(); p++, pp++ ) {
      (*p)->PrintClippedHTML( &cout, 160 );
      // subsample: advance iterator and counter by the same stride so at
      // most ~m_max_pp rows are emitted for a large group
      if (count > m_max_pp) {
        p += count/m_max_pp-1;
        pp += count/m_max_pp-1;
      }
    }
    cout << "</table>\n";
  }
}
|
40
scripts/ems/biconcor/PhrasePairCollection.h
Normal file
40
scripts/ems/biconcor/PhrasePairCollection.h
Normal file
@ -0,0 +1,40 @@
|
||||
#include "Vocabulary.h"
|
||||
#include "SuffixArray.h"
|
||||
#include "TargetCorpus.h"
|
||||
#include "Alignment.h"
|
||||
#include "PhrasePair.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
// Collects all phrase pairs found for one source-phrase query, grouped by
// their target phrase, and prints them as text or HTML.
class PhrasePairCollection
{
public:
  typedef unsigned int INDEX;

private:
  SuffixArray *m_suffixArray;     // source corpus index (presumably not owned)
  TargetCorpus *m_targetCorpus;   // target corpus (presumably not owned)
  Alignment *m_alignment;         // word alignment (presumably not owned)
  vector< vector<PhrasePair*> > m_collection; // phrase pairs grouped by target phrase
  int m_size;                     // total phrase pairs collected
  int m_max_lookup;               // cap on suffix-array matches to inspect
  int m_max_pp_target;            // cap on target-phrase groups to display
  int m_max_pp;                   // cap on phrase pairs shown per group

public:
  PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment * );
  ~PhrasePairCollection ();

  // look up sourceString and fill m_collection; false if no match
  bool GetCollection( const vector< string > sourceString );
  void Print();
  void PrintHTML();
};
|
||||
|
||||
// sorting helper
|
||||
struct CompareBySize
|
||||
{
|
||||
bool operator()(const vector<PhrasePair*> a, const vector<PhrasePair*> b ) const
|
||||
{
|
||||
return a.size() > b.size();
|
||||
}
|
||||
};
|
287
scripts/ems/biconcor/SuffixArray.cpp
Normal file
287
scripts/ems/biconcor/SuffixArray.cpp
Normal file
@ -0,0 +1,287 @@
|
||||
#include "SuffixArray.h"
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Build the suffix array over the tokenized source corpus in fileName:
// pass 1 counts words and sentences, pass 2 fills the corpus/index arrays
// (one "<s>" boundary token terminates each sentence), then the index is
// sorted.
void SuffixArray::Create( string fileName )
{
  m_vcb.StoreIfNew( "<uNk>" );
  m_endOfSentence = m_vcb.StoreIfNew( "<s>" );

  ifstream textFile;
  char line[LINE_MAX_LENGTH];

  // pass 1: count the number of words and sentences
  textFile.open(fileName.c_str());
  if (!textFile) {
    cerr << "ERROR: could not open corpus file " << fileName << endl;
    return;
  }
  istream *fileP = &textFile;
  m_size = 0;
  m_sentenceCount = 0;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    m_size += words.size() + 1; // +1 for the sentence boundary token
    m_sentenceCount++;
  }
  textFile.close();
  // BUGFIX: close() does not clear the stream's eof/fail flags; without
  // clear() the second read loop below can terminate immediately.
  textFile.clear();
  cerr << m_size << " words (incl. sentence boundaries)" << endl;

  // allocate memory
  m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
  m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
  m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
  m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
  m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );

  // pass 2: fill the arrays
  int wordIndex = 0;
  int sentenceId = 0;
  textFile.open(fileName.c_str());
  fileP = &textFile;
  while(!fileP->eof()) {
    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
    if (fileP->eof()) break;
    vector< WORD_ID > words = m_vcb.Tokenize( line );
    vector< WORD_ID >::const_iterator i;

    for( i=words.begin(); i!=words.end(); i++)
    {
      m_index[ wordIndex ] = wordIndex;
      m_sentence[ wordIndex ] = sentenceId;
      m_wordInSentence[ wordIndex ] = i-words.begin();
      m_array[ wordIndex++ ] = *i;
    }
    // sentence boundary token closes each sentence
    m_index[ wordIndex ] = wordIndex;
    m_array[ wordIndex++ ] = m_endOfSentence;
    m_sentenceLength[ sentenceId++ ] = words.size();
  }
  textFile.close();
  cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
  // List(0,9);

  // sort the index (merge sort using a scratch buffer)
  m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
  Sort( 0, m_size-1 );
  free( m_buffer );
  cerr << "done sorting" << endl;
}
|
||||
|
||||
// Recursive merge sort of m_index[start..end] by suffix order.
// (The original comment called this "quick sort", but it splits the range
// in half, recurses on both halves, and merges them through m_buffer —
// a textbook merge sort. m_buffer must be allocated by the caller.)
void SuffixArray::Sort(INDEX start, INDEX end) {
  if (start == end) return;
  INDEX mid = (start+end+1)/2;
  Sort( start, mid-1 );
  Sort( mid, end );

  // merge the two sorted halves into m_buffer
  int i = start;
  int j = mid;
  int k = 0;
  int length = end-start+1;
  while( k<length )
  {
    if (i == mid )
    {
      // left half exhausted
      m_buffer[ k++ ] = m_index[ j++ ];
    }
    else if (j > end )
    {
      // right half exhausted
      m_buffer[ k++ ] = m_index[ i++ ];
    }
    else {
      if (CompareIndex( m_index[i], m_index[j] ) < 0)
      {
        m_buffer[ k++ ] = m_index[ i++ ];
      }
      else
      {
        m_buffer[ k++ ] = m_index[ j++ ];
      }
    }
  }

  // copy the merged run back into place
  memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
          ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
}
|
||||
|
||||
// Release all buffers allocated by Create()/Load().
SuffixArray::~SuffixArray()
{
  free(m_index);
  free(m_array);
  // BUGFIX: these three arrays are allocated in Create()/Load() but were
  // never freed (memory leak)
  free(m_wordInSentence);
  free(m_sentence);
  free(m_sentenceLength);
}
|
||||
|
||||
// Compare the corpus suffixes starting at positions a and b, word by word
// (strcmp-style result: <0, 0, >0). A suffix that runs off the end of the
// corpus sorts before any longer suffix.
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
  // skip over identical words
  INDEX offset = 0;
  while( a+offset < m_size &&
         b+offset < m_size &&
         m_array[ a+offset ] == m_array[ b+offset ] )
  { offset++; }

  // shorter suffix (hit corpus end) sorts first
  if( a+offset == m_size ) return -1;
  if( b+offset == m_size ) return 1;
  return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
}
|
||||
|
||||
// Compare two vocabulary entries by their surface strings
// (strcmp-style result via std::string::compare).
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
  return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
|
||||
|
||||
// Total number of occurrences of phrase in the corpus.
int SuffixArray::Count( const vector< WORD > &phrase )
{
  INDEX dummy;
  return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
}
|
||||
|
||||
// Does phrase occur at least min times in the corpus?
bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
{
  INDEX dummy;
  return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
}
|
||||
|
||||
// Does phrase occur at all? (min==1 lets LimitedCount stop at the first hit)
bool SuffixArray::Exists( const vector< WORD > &phrase )
{
  INDEX dummy;
  return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
}
|
||||
|
||||
// Locate the contiguous range of suffix-array entries that match phrase;
// returns the number of matches and sets firstMatch/lastMatch to the range
// bounds. (search_start/search_end presumably carry defaults in the header
// — verify; search_end == -1 means "to the end", see LimitedCount.)
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
  return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
}
|
||||
|
||||
// Core lookup: binary-search for any match of phrase in
// [search_start, search_end], then expand to the full contiguous match
// range. Returns 0 if there is no match; returns 1 early when min==1
// (existence check); otherwise returns the match count and sets
// firstMatch/lastMatch.
// NOTE(review): INDEX is unsigned, so "search_end == -1" relies on the
// implicit conversion of -1 to the max unsigned value as an "until the end"
// sentinel — intentional but fragile.
int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
  INDEX start = search_start;
  INDEX end = (search_end == -1) ? (m_size-1) : search_end;
  INDEX mid = FindFirst( phrase, start, end );
  if (mid == m_size) return 0; // no matches
  if (min == 1) return 1; // only existance check

  int matchCount = 1;

  // expand downwards to the first matching entry
  firstMatch = FindLast( phrase, mid, start, -1 );
  matchCount += mid - firstMatch;

  // expand upwards to the last matching entry
  lastMatch = FindLast( phrase, mid, end, 1 );
  matchCount += lastMatch - mid;

  return matchCount;
}
|
||||
|
||||
// From a known matching entry 'start', binary-search towards 'end' in the
// given direction (+1 or -1) for the last suffix-array entry that still
// matches phrase. Assumes 'start' matches; matching entries are contiguous.
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
  end += direction;
  while(true)
  {
    INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;

    int match = Match( phrase, mid );
    int matchNext = Match( phrase, mid+direction );

    // boundary: mid matches but its neighbor in 'direction' does not
    if (match == 0 && matchNext != 0) return mid;

    if (match == 0) // mid point is a match
      start = mid;
    else
      end = mid;
  }
}
|
||||
|
||||
// Binary-search for any suffix-array entry matching phrase within
// [start,end]; returns m_size if there is none. start/end are passed by
// reference and narrowed in place as the search proceeds.
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
  while(true)
  {
    INDEX mid = ( start + end + 1 )/2;
    int match = Match( phrase, mid );

    if (match == 0) return mid;
    // range exhausted without a match
    if (start >= end && match != 0 ) return m_size;

    if (match > 0)
      start = mid+1;
    else
      end = mid-1;
  }
}
|
||||
|
||||
// Compare phrase against the corpus suffix selected by suffix-array entry
// 'index'. Returns a strcmp-style result; 0 means the suffix begins with
// the whole phrase (the suffix may be longer).
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
{
  INDEX pos = m_index[ index ];
  for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++)
  {
    int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
    if (match != 0)
      return match;
  }
  return 0;
}
|
||||
|
||||
// Debug helper: print (to stdout) the first up-to-5 words of each suffix
// for suffix-array entries start..end inclusive.
void SuffixArray::List(INDEX start, INDEX end)
{
  for(INDEX i=start; i<=end; i++)
  {
    INDEX pos = m_index[ i ];
    // cerr << i << ":" << pos << "\t";
    for(int j=0; j<5 && j+pos<m_size; j++)
    {
      cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
    }
    // cerr << "\n";
  }
}
|
||||
|
||||
void SuffixArray::Save( string fileName ) {
|
||||
FILE *pFile = fopen ( fileName.c_str() , "w" );
|
||||
|
||||
fwrite( &m_size, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
fwrite( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
|
||||
fwrite( m_wordInSentence, sizeof(char), m_size, pFile); // word index
|
||||
fwrite( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
|
||||
|
||||
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
|
||||
fclose( pFile );
|
||||
|
||||
m_vcb.Save( fileName + ".src-vcb" );
|
||||
}
|
||||
|
||||
void SuffixArray::Load( string fileName ) {
|
||||
FILE *pFile = fopen ( fileName.c_str() , "r" );
|
||||
cerr << "loading from " << fileName << endl;
|
||||
|
||||
fread( &m_size, sizeof(INDEX), 1, pFile );
|
||||
cerr << "words in corpus: " << m_size << endl;
|
||||
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
|
||||
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
|
||||
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
|
||||
m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size );
|
||||
|
||||
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array
|
||||
fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index
|
||||
fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index
|
||||
|
||||
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
cerr << "sentences in corpus: " << m_sentenceCount << endl;
|
||||
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
|
||||
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
|
||||
fclose( pFile );
|
||||
|
||||
m_vcb.Load( fileName + ".src-vcb" );
|
||||
}
|
||||
|
||||
|
49
scripts/ems/biconcor/SuffixArray.h
Normal file
49
scripts/ems/biconcor/SuffixArray.h
Normal file
@ -0,0 +1,49 @@
|
||||
#include "Vocabulary.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
|
||||
class SuffixArray
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
WORD_ID *m_array;
|
||||
INDEX *m_index;
|
||||
INDEX *m_buffer;
|
||||
char *m_wordInSentence;
|
||||
INDEX *m_sentence;
|
||||
char *m_sentenceLength;
|
||||
WORD_ID m_endOfSentence;
|
||||
Vocabulary m_vcb;
|
||||
INDEX m_size;
|
||||
INDEX m_sentenceCount;
|
||||
|
||||
public:
|
||||
~SuffixArray();
|
||||
|
||||
void Create( string fileName );
|
||||
void Sort(INDEX start, INDEX end);
|
||||
int CompareIndex( INDEX a, INDEX b ) const;
|
||||
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
|
||||
int Count( const vector< WORD > &phrase );
|
||||
bool MinCount( const vector< WORD > &phrase, INDEX min );
|
||||
bool Exists( const vector< WORD > &phrase );
|
||||
int FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
|
||||
int LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
|
||||
INDEX FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end );
|
||||
INDEX FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction );
|
||||
int Match( const vector< WORD > &phrase, INDEX index );
|
||||
void List( INDEX start, INDEX end );
|
||||
inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; }
|
||||
inline INDEX GetSentence( INDEX position ) { return m_sentence[position]; }
|
||||
inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; }
|
||||
inline char GetSentenceLength( INDEX sentenceId ) { return m_sentenceLength[sentenceId]; }
|
||||
inline INDEX GetSize() { return m_size; }
|
||||
inline WORD GetWord( INDEX position ) { return m_vcb.GetWord( m_array[position] ); }
|
||||
void Save( string fileName );
|
||||
void Load( string fileName );
|
||||
};
|
107
scripts/ems/biconcor/TargetCorpus.cpp
Normal file
107
scripts/ems/biconcor/TargetCorpus.cpp
Normal file
@ -0,0 +1,107 @@
|
||||
#include "TargetCorpus.h"
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
|
||||
void TargetCorpus::Create( string fileName )
|
||||
{
|
||||
ifstream textFile;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
|
||||
// count the number of words first;
|
||||
textFile.open(fileName.c_str());
|
||||
istream *fileP = &textFile;
|
||||
m_size = 0;
|
||||
m_sentenceCount = 0;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line );
|
||||
m_size += words.size();
|
||||
m_sentenceCount++;
|
||||
}
|
||||
textFile.close();
|
||||
cerr << m_size << " words" << endl;
|
||||
|
||||
// allocate memory
|
||||
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
|
||||
m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount );
|
||||
|
||||
// fill the array
|
||||
int wordIndex = 0;
|
||||
int sentenceId = 0;
|
||||
textFile.open(fileName.c_str());
|
||||
fileP = &textFile;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
vector< WORD_ID > words = m_vcb.Tokenize( line );
|
||||
vector< WORD_ID >::const_iterator i;
|
||||
|
||||
for( i=words.begin(); i!=words.end(); i++)
|
||||
{
|
||||
m_array[ wordIndex++ ] = *i;
|
||||
}
|
||||
m_sentenceEnd[ sentenceId++ ] = wordIndex-1;
|
||||
}
|
||||
textFile.close();
|
||||
cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
|
||||
}
|
||||
|
||||
TargetCorpus::~TargetCorpus()
|
||||
{
|
||||
free(m_array);
|
||||
free(m_sentenceEnd);
|
||||
}
|
||||
|
||||
WORD TargetCorpus::GetWordFromId( const WORD_ID id ) const {
|
||||
return m_vcb.GetWord( id );
|
||||
}
|
||||
|
||||
WORD TargetCorpus::GetWord( INDEX sentence, char word ) {
|
||||
return m_vcb.GetWord( GetWordId( sentence, word ) );
|
||||
}
|
||||
|
||||
WORD_ID TargetCorpus::GetWordId( INDEX sentence, char word ) {
|
||||
if (sentence == 0) {
|
||||
return m_array[ word ];
|
||||
}
|
||||
return m_array[ m_sentenceEnd[ sentence-1 ] + 1 + word ] ;
|
||||
}
|
||||
|
||||
char TargetCorpus::GetSentenceLength( INDEX sentence ) {
|
||||
if (sentence == 0) {
|
||||
return (char) m_sentenceEnd[ 0 ]+1;
|
||||
}
|
||||
return (char) ( m_sentenceEnd[ sentence ] - m_sentenceEnd[ sentence-1 ] );
|
||||
}
|
||||
|
||||
void TargetCorpus::Save( string fileName ) {
|
||||
FILE *pFile = fopen ( (fileName + ".tgt").c_str() , "w" );
|
||||
|
||||
fwrite( &m_size, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
|
||||
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
|
||||
fclose( pFile );
|
||||
|
||||
m_vcb.Save( fileName + ".tgt-vcb" );
|
||||
}
|
||||
|
||||
void TargetCorpus::Load( string fileName ) {
|
||||
FILE *pFile = fopen ( (fileName + ".tgt").c_str() , "r" );
|
||||
cerr << "loading from " << fileName << ".tgt" << endl;
|
||||
|
||||
fread( &m_size, sizeof(INDEX), 1, pFile );
|
||||
cerr << "words in corpus: " << m_size << endl;
|
||||
m_array = (WORD_ID*) calloc( sizeof(WORD_ID), m_size );
|
||||
fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus
|
||||
|
||||
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile );
|
||||
cerr << "sentences in corpus: " << m_sentenceCount << endl;
|
||||
m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount );
|
||||
fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); // sentence index
|
||||
fclose( pFile );
|
||||
m_vcb.Load( fileName + ".tgt-vcb" );
|
||||
}
|
29
scripts/ems/biconcor/TargetCorpus.h
Normal file
29
scripts/ems/biconcor/TargetCorpus.h
Normal file
@ -0,0 +1,29 @@
|
||||
#include "Vocabulary.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
#define LINE_MAX_LENGTH 10000
|
||||
|
||||
class TargetCorpus
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
WORD_ID *m_array;
|
||||
INDEX *m_sentenceEnd;
|
||||
Vocabulary m_vcb;
|
||||
INDEX m_size;
|
||||
INDEX m_sentenceCount;
|
||||
|
||||
public:
|
||||
~TargetCorpus();
|
||||
|
||||
void Create( string fileName );
|
||||
WORD GetWordFromId( const WORD_ID id ) const;
|
||||
WORD GetWord( INDEX sentence, char word );
|
||||
WORD_ID GetWordId( INDEX sentence, char word );
|
||||
char GetSentenceLength( INDEX sentence );
|
||||
void Load( string fileName );
|
||||
void Save( string fileName );
|
||||
};
|
75
scripts/ems/biconcor/Vocabulary.cpp
Normal file
75
scripts/ems/biconcor/Vocabulary.cpp
Normal file
@ -0,0 +1,75 @@
|
||||
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
|
||||
#include "Vocabulary.h"
|
||||
|
||||
// as in beamdecoder/tables.cpp
|
||||
vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
|
||||
vector< WORD_ID > token;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; input[i] != '\0'; i++) {
|
||||
bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
|
||||
if (!isSpace && betweenWords) {
|
||||
start = i;
|
||||
betweenWords = false;
|
||||
}
|
||||
else if (isSpace && !betweenWords) {
|
||||
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
|
||||
betweenWords = true;
|
||||
}
|
||||
}
|
||||
if (!betweenWords)
|
||||
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
|
||||
return token;
|
||||
}
|
||||
|
||||
WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
|
||||
map<WORD, WORD_ID>::iterator i = lookup.find( word );
|
||||
|
||||
if( i != lookup.end() )
|
||||
return i->second;
|
||||
|
||||
WORD_ID id = vocab.size();
|
||||
vocab.push_back( word );
|
||||
lookup[ word ] = id;
|
||||
return id;
|
||||
}
|
||||
|
||||
WORD_ID Vocabulary::GetWordID( const WORD &word ) {
|
||||
map<WORD, WORD_ID>::iterator i = lookup.find( word );
|
||||
if( i == lookup.end() )
|
||||
return 0;
|
||||
WORD_ID w= (WORD_ID) i->second;
|
||||
return w;
|
||||
}
|
||||
|
||||
void Vocabulary::Save( string fileName ) {
|
||||
ofstream vcbFile;
|
||||
vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc);
|
||||
vector< WORD >::iterator i;
|
||||
for(i = vocab.begin(); i != vocab.end(); i++) {
|
||||
const string &word = *i;
|
||||
vcbFile << word << endl;
|
||||
}
|
||||
vcbFile.close();
|
||||
}
|
||||
|
||||
void Vocabulary::Load( string fileName ) {
|
||||
ifstream vcbFile;
|
||||
char line[MAX_LENGTH];
|
||||
vcbFile.open(fileName.c_str());
|
||||
cerr << "loading from " << fileName << endl;
|
||||
istream *fileP = &vcbFile;
|
||||
int count = 0;
|
||||
while(!fileP->eof()) {
|
||||
SAFE_GETLINE((*fileP), line, MAX_LENGTH, '\n');
|
||||
if (fileP->eof()) break;
|
||||
int length = 0;
|
||||
for(; line[length] != '\0'; length++);
|
||||
StoreIfNew( string( line, length ) );
|
||||
count++;
|
||||
}
|
||||
vcbFile.close();
|
||||
cerr << count << " word read, vocabulary size " << vocab.size() << endl;
|
||||
}
|
42
scripts/ems/biconcor/Vocabulary.h
Normal file
42
scripts/ems/biconcor/Vocabulary.h
Normal file
@ -0,0 +1,42 @@
|
||||
// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
#include <queue>
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define MAX_LENGTH 10000
|
||||
|
||||
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
|
||||
_IS.getline(_LINE, _SIZE, _DELIM); \
|
||||
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
|
||||
if (_IS.gcount() == _SIZE-1) { \
|
||||
cerr << "Line too long! Buffer overflow. Delete lines >=" \
|
||||
<< _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
|
||||
<< endl; \
|
||||
exit(1); \
|
||||
} \
|
||||
}
|
||||
|
||||
typedef string WORD;
|
||||
typedef unsigned int WORD_ID;
|
||||
|
||||
class Vocabulary {
|
||||
public:
|
||||
map<WORD, WORD_ID> lookup;
|
||||
vector< WORD > vocab;
|
||||
WORD_ID StoreIfNew( const WORD& );
|
||||
WORD_ID GetWordID( const WORD& );
|
||||
vector<WORD_ID> Tokenize( const char[] );
|
||||
inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; }
|
||||
void Save( string fileName );
|
||||
void Load( string fileName );
|
||||
};
|
116
scripts/ems/biconcor/biconcor.cpp
Normal file
116
scripts/ems/biconcor/biconcor.cpp
Normal file
@ -0,0 +1,116 @@
|
||||
#include "SuffixArray.h"
|
||||
#include "TargetCorpus.h"
|
||||
#include "Alignment.h"
|
||||
#include "PhrasePairCollection.h"
|
||||
#include <getopt.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// handle parameters
|
||||
string query;
|
||||
string fileNameSuffix;
|
||||
string fileNameSource;
|
||||
string fileNameTarget = "";
|
||||
string fileNameAlignment = "";
|
||||
int loadFlag = false;
|
||||
int saveFlag = false;
|
||||
int createFlag = false;
|
||||
int queryFlag = false;
|
||||
int htmlFlag = false;
|
||||
string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n";
|
||||
while(1) {
|
||||
static struct option long_options[] = {
|
||||
{"load", required_argument, 0, 'l'},
|
||||
{"save", required_argument, 0, 's'},
|
||||
{"create", required_argument, 0, 'c'},
|
||||
{"query", required_argument, 0, 'q'},
|
||||
{"target", required_argument, 0, 't'},
|
||||
{"alignment", required_argument, 0, 'a'},
|
||||
{"html", no_argument, &htmlFlag, 0},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
int option_index = 0;
|
||||
int c = getopt_long (argc, argv, "l:s:c:q:t:a:h", long_options, &option_index);
|
||||
if (c == -1) break;
|
||||
switch (c) {
|
||||
case 'l':
|
||||
fileNameSuffix = string(optarg);
|
||||
loadFlag = true;
|
||||
break;
|
||||
case 't':
|
||||
fileNameTarget = string(optarg);
|
||||
break;
|
||||
case 'a':
|
||||
fileNameAlignment = string(optarg);
|
||||
break;
|
||||
case 's':
|
||||
fileNameSuffix = string(optarg);
|
||||
saveFlag = true;
|
||||
break;
|
||||
case 'c':
|
||||
fileNameSource = string(optarg);
|
||||
createFlag = true;
|
||||
break;
|
||||
case 'q':
|
||||
query = string(optarg);
|
||||
queryFlag = true;
|
||||
break;
|
||||
default:
|
||||
cerr << info;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// check if parameter settings are legal
|
||||
if (saveFlag && !createFlag) {
|
||||
cerr << "error: cannot save without creating\n" << info;
|
||||
exit(1);
|
||||
}
|
||||
if (saveFlag && loadFlag) {
|
||||
cerr << "error: cannot load and save at the same time\n" << info;
|
||||
exit(1);
|
||||
}
|
||||
if (!loadFlag && !createFlag) {
|
||||
cerr << "error: neither load or create - i have no info!\n" << info;
|
||||
exit(1);
|
||||
}
|
||||
if (createFlag && (fileNameTarget == "" || fileNameAlignment == "")) {
|
||||
cerr << "error: i have no target corpus or alignment\n" << info;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// do your thing
|
||||
SuffixArray suffixArray;
|
||||
TargetCorpus targetCorpus;
|
||||
Alignment alignment;
|
||||
if (createFlag) {
|
||||
cerr << "will create\n";
|
||||
cerr << "source corpus is in " << fileNameSource << endl;
|
||||
suffixArray.Create( fileNameSource );
|
||||
cerr << "target corpus is in " << fileNameTarget << endl;
|
||||
targetCorpus.Create( fileNameTarget );
|
||||
cerr << "alignment is in " << fileNameAlignment << endl;
|
||||
alignment.Create( fileNameAlignment );
|
||||
if (saveFlag) {
|
||||
suffixArray.Save( fileNameSuffix );
|
||||
targetCorpus.Save( fileNameSuffix );
|
||||
alignment.Save( fileNameSuffix );
|
||||
cerr << "will save in " << fileNameSuffix << endl;
|
||||
}
|
||||
}
|
||||
if (loadFlag) {
|
||||
cerr << "will load from " << fileNameSuffix << endl;
|
||||
suffixArray.Load( fileNameSuffix );
|
||||
targetCorpus.Load( fileNameSuffix );
|
||||
alignment.Load( fileNameSuffix );
|
||||
}
|
||||
if (queryFlag) {
|
||||
cerr << "query is " << query << endl;
|
||||
vector< string > queryString = alignment.Tokenize( query.c_str() );
|
||||
PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment );
|
||||
ppCollection.GetCollection( queryString );
|
||||
ppCollection.PrintHTML();
|
||||
}
|
||||
}
|
@ -274,6 +274,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -419,6 +423,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -294,6 +294,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -439,6 +443,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -274,6 +274,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -419,6 +423,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
#decoder-settings = ""
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -278,6 +278,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -423,6 +427,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
#decoder-settings = ""
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -258,6 +258,10 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#word-alignment = $working-dir/model/aligned.1
|
||||
|
||||
### create a bilingual concordancer for the model
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
@ -399,6 +403,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
### specify size of n-best list, if produced
|
||||
#
|
||||
#nbest = 100
|
||||
|
||||
### multiple reference translations
|
||||
#
|
||||
multiref = yes
|
||||
|
@ -319,6 +319,12 @@ symmetrize-giza
|
||||
rerun-on-change: alignment-symmetrization-method training-options script
|
||||
default-name: model/aligned
|
||||
error: skip=<[1-9]
|
||||
build-biconcor
|
||||
in: word-alignment corpus
|
||||
out: biconcor-model
|
||||
default-name: model/biconcor
|
||||
ignore-unless: biconcor
|
||||
error: usage
|
||||
build-lex-trans
|
||||
in: word-alignment corpus
|
||||
out: lexical-translation-table
|
||||
@ -354,14 +360,14 @@ build-generation
|
||||
ignore-unless: generation-factors
|
||||
default-name: model/generation-table
|
||||
create-config
|
||||
in: reordering-table phrase-translation-table generation-table LM:binlm
|
||||
in: reordering-table phrase-translation-table generation-table LM:binlm biconcor-model
|
||||
out: config
|
||||
ignore-if: use-hiero INTERPOLATED-LM:script
|
||||
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script
|
||||
default-name: model/moses.ini
|
||||
error: Unknown option
|
||||
create-config-interpolated-lm
|
||||
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm
|
||||
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm biconcor-model
|
||||
out: config
|
||||
ignore-if: use-hiero
|
||||
ignore-unless: INTERPOLATED-LM:script
|
||||
@ -617,6 +623,7 @@ remove-markup
|
||||
in: system-output
|
||||
out: cleaned-output
|
||||
default-name: evaluation/cleaned
|
||||
pass-if: TRAINING:hierarchical-rule-set
|
||||
pass-unless: report-segmentation
|
||||
template: $moses-script-dir/ems/support/remove-segmenation-markup.perl < IN > OUT
|
||||
recase-output
|
||||
|
@ -49,6 +49,7 @@ my (@MODULE,
|
||||
%STEP_OUT,
|
||||
%STEP_OUTNAME,
|
||||
%STEP_PASS, # config parameters that have to be set, otherwise pass
|
||||
%STEP_PASS_IF, # config parameters that have to be not set, otherwise pass
|
||||
%STEP_IGNORE, # config parameters that have to be set, otherwise ignore
|
||||
%STEP_IGNORE_IF, # config parameters that have to be not set, otherwise ignore
|
||||
%QSUB_SCRIPT, # flag if script contains qsub's when run on cluster
|
||||
@ -208,6 +209,10 @@ sub read_meta {
|
||||
@{$STEP_PASS{"$module:$step"}} = split(/\s+/,$2);
|
||||
push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2);
|
||||
}
|
||||
elsif ($1 eq "pass-if") {
|
||||
@{$STEP_PASS_IF{"$module:$step"}} = split(/\s+/,$2);
|
||||
push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2);
|
||||
}
|
||||
elsif ($1 eq "ignore-unless") {
|
||||
$STEP_IGNORE{"$module:$step"} = $2;
|
||||
}
|
||||
@ -485,6 +490,15 @@ sub find_steps_for_module {
|
||||
}
|
||||
$PASS{$#DO_STEP}++ if $flag;
|
||||
}
|
||||
|
||||
if (defined($STEP_PASS_IF{$defined_step})) {
|
||||
my $flag = 0;
|
||||
foreach my $pass (@{$STEP_PASS_IF{$defined_step}}) {
|
||||
$flag = 1
|
||||
if &backoff_and_get(&extend_local_name($module,$set,$pass));
|
||||
}
|
||||
$PASS{$#DO_STEP}++ if $flag;
|
||||
}
|
||||
|
||||
# special case for passing: steps that only affect factor 0
|
||||
if (defined($ONLY_FACTOR_0{$defined_step})) {
|
||||
@ -737,6 +751,7 @@ sub find_re_use {
|
||||
|
||||
# summarize and convert hashes into integers for to be re-used
|
||||
print "\nSTEP SUMMARY:\n";
|
||||
open(RE_USE,">".&steps_file("re-use.$VERSION",$VERSION));
|
||||
for(my $i=$#DO_STEP;$i>=0;$i--) {
|
||||
if ($PASS{$i}) {
|
||||
$RE_USE[$i] = 0;
|
||||
@ -747,12 +762,16 @@ sub find_re_use {
|
||||
my @ALL = sort { $a <=> $b} keys %{$RE_USE[$i]};
|
||||
print "re-using (".join(" ",@ALL).")\n";
|
||||
$RE_USE[$i] = $ALL[0];
|
||||
if ($ALL[0] != $VERSION) {
|
||||
print RE_USE "$DO_STEP[$i] $ALL[0]\n";
|
||||
}
|
||||
}
|
||||
else {
|
||||
print "run\n";
|
||||
$RE_USE[$i] = 0;
|
||||
}
|
||||
}
|
||||
close(RE_USE);
|
||||
}
|
||||
|
||||
sub find_dependencies {
|
||||
@ -816,10 +835,10 @@ sub draw_agenda_graph {
|
||||
$step .= " (".$RE_USE[$i].")" if $RE_USE[$i];
|
||||
|
||||
my $color = "green";
|
||||
$color = "#0000ff" if defined($DO{$i}) && $DO{$i} >= 1;
|
||||
$color = "#8080ff" if defined($DONE{$i});
|
||||
$color = "red" if defined($CRASHED{$i});
|
||||
$color = "lightblue" if $RE_USE[$i];
|
||||
$color = "#0000ff" if defined($DO{$i}) && $DO{$i} >= 1;
|
||||
$color = "#8080ff" if defined($DONE{$i}) || ($RE_USE[$i] && $RE_USE[$i] == $VERSION);
|
||||
$color = "red" if defined($CRASHED{$i});
|
||||
$color = "lightyellow" if defined($PASS{$i});
|
||||
|
||||
print DOT " $i [label=\"$step\",shape=box,fontsize=10,height=0,style=filled,fillcolor=\"$color\"];\n";
|
||||
@ -893,6 +912,9 @@ sub define_step {
|
||||
elsif ($DO_STEP[$i] eq 'TRAINING:symmetrize-giza') {
|
||||
&define_training_symmetrize_giza($i);
|
||||
}
|
||||
elsif ($DO_STEP[$i] eq 'TRAINING:build-biconcor') {
|
||||
&define_training_build_biconcor($i);
|
||||
}
|
||||
elsif ($DO_STEP[$i] eq 'TRAINING:build-lex-trans') {
|
||||
&define_training_build_lex_trans($i);
|
||||
}
|
||||
@ -1128,13 +1150,12 @@ sub check_info {
|
||||
print "\tcheck parameter count current: ".(scalar keys %VALUE).", old: ".(scalar keys %INFO)."\n" if $VERBOSE;
|
||||
return 0 unless scalar keys %INFO == scalar keys %VALUE;
|
||||
foreach my $parameter (keys %VALUE) {
|
||||
if (! defined($VALUE{$parameter})) {
|
||||
print "\tcurrent has not '$parameter' -> not re-usable\n" if $VERBOSE;
|
||||
if (! defined($INFO{$parameter})) {
|
||||
print "\told has no '$parameter' -> not re-usable\n" if $VERBOSE;
|
||||
return 0;
|
||||
}
|
||||
print "\tcheck '$VALUE{$parameter}' eq '$INFO{$parameter}' -> " if $VERBOSE;
|
||||
if (defined($INFO{$parameter})
|
||||
&& &match_info_strings($VALUE{$parameter},$INFO{$parameter})) {
|
||||
if (&match_info_strings($VALUE{$parameter},$INFO{$parameter})) {
|
||||
print "ok\n" if $VERBOSE;
|
||||
}
|
||||
else {
|
||||
@ -1148,6 +1169,8 @@ sub check_info {
|
||||
|
||||
sub match_info_strings {
|
||||
my ($current,$old) = @_;
|
||||
$current =~ s/ $//;
|
||||
$old =~ s/ $//;
|
||||
return 1 if $current eq $old;
|
||||
# ignore time stamps, if that option is used
|
||||
if (defined($IGNORE_TIME)) {
|
||||
@ -1469,14 +1492,21 @@ sub factorize_one_language {
|
||||
my $script = &check_and_get("$type:$factor:factor-script");
|
||||
my $out = "$outfile.$factor";
|
||||
if ($parallelizer && defined($PARALLELIZE{&defined_step($DO_STEP[$step_id])})
|
||||
&& &get("$module:jobs") && $CLUSTER) {
|
||||
&& ( (&get("$module:jobs") && $CLUSTER)
|
||||
|| (&get("$module:cores") && $MULTICORE))) {
|
||||
my $subdir = $module;
|
||||
$subdir =~ tr/A-Z/a-z/;
|
||||
$subdir .= "/tmp.$set.$stepname.$type.$factor.$VERSION";
|
||||
my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
|
||||
my $qflags = "--queue-flags \"$qsub_args\"";
|
||||
$cmd .= "$parallelizer $qflags -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -jobs ".&get("$module:jobs")." -tmpdir $temp_dir/$subdir\n";
|
||||
$QSUB_STEP{$step_id}++;
|
||||
if ($CLUSTER) {
|
||||
my $qflags = "";
|
||||
my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
|
||||
$qflags="--queue-flags \"$qsub_args\"" if ($CLUSTER && $qsub_args);
|
||||
$cmd .= "$parallelizer $qflags -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -jobs ".&get("$module:jobs")." -tmpdir $temp_dir/$subdir\n";
|
||||
$QSUB_STEP{$step_id}++;
|
||||
}
|
||||
elsif ($MULTICORE) {
|
||||
$cmd .= "$parallelizer -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -cores ".&get("$module:cores")." -tmpdir $temp_dir/$subdir\n";
|
||||
}
|
||||
}
|
||||
else {
|
||||
$cmd .= "$script $infile $out $temp_dir\n";
|
||||
@ -1597,6 +1627,19 @@ sub define_training_symmetrize_giza {
|
||||
&create_step($step_id,$cmd);
|
||||
}
|
||||
|
||||
sub define_training_build_biconcor {
|
||||
my ($step_id) = @_;
|
||||
|
||||
my ($model, $aligned,$corpus) = &get_output_and_input($step_id);
|
||||
my $biconcor = &check_and_get("TRAINING:biconcor");
|
||||
my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
|
||||
my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
|
||||
my $method = &check_and_get("TRAINING:alignment-symmetrization-method");
|
||||
|
||||
my $cmd = "$biconcor -c $corpus.$input_extension -t $corpus.$output_extension -a $aligned.$method -s $model";
|
||||
&create_step($step_id,$cmd);
|
||||
}
|
||||
|
||||
sub define_training_build_lex_trans {
|
||||
my ($step_id) = @_;
|
||||
|
||||
@ -1683,6 +1726,7 @@ sub define_training_create_config {
|
||||
my ($config,
|
||||
$reordering_table,$phrase_translation_table,$generation_table,@LM)
|
||||
= &get_output_and_input($step_id);
|
||||
if ($LM[$#LM] =~ /biconcor/) { pop @LM; }
|
||||
|
||||
my $cmd = &get_training_setting(9);
|
||||
|
||||
@ -1737,7 +1781,7 @@ sub define_training_create_config {
|
||||
$cmd .= "-lm $factor:$order:$LM[0]:$type ";
|
||||
}
|
||||
else {
|
||||
die("ERROR: number of defined LM sets (".(scalar @LM_SETS).") and LM files (".(scalar @LM).") does not match")
|
||||
die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
|
||||
unless scalar @LM == scalar @LM_SETS;
|
||||
foreach my $lm (@LM) {
|
||||
my $set = shift @LM_SETS;
|
||||
@ -2020,11 +2064,15 @@ sub define_evaluation_decode {
|
||||
$cmd .= " -queue-parameters \"$qsub_args\"" if ($CLUSTER && $qsub_args);
|
||||
$cmd .= " -decoder $decoder -config $dir/evaluation/filtered.$set.$VERSION/moses.ini -input-file $input --jobs $jobs -decoder-parameters \"$settings\" > $system_output";
|
||||
|
||||
$cmd .= " -n-best-file $system_output.best$nbest -n-best-size $nbest" if $nbest;
|
||||
my $nbest_size;
|
||||
$nbest_size = $nbest + 0 if $nbest;
|
||||
$cmd .= " -n-best-file $system_output.best$nbest_size -n-best-size $nbest" if $nbest;
|
||||
}
|
||||
else {
|
||||
$cmd = $filter."\n$decoder $settings -v 0 -f $dir/evaluation/filtered.$set.$VERSION/moses.ini < $input > $system_output";
|
||||
$cmd .= " -n-best-list $system_output.best$nbest $nbest" if $nbest;
|
||||
my $nbest_size;
|
||||
$nbest_size = $nbest + 0 if $nbest;
|
||||
$cmd .= " -n-best-list $system_output.best$nbest_size $nbest" if $nbest;
|
||||
}
|
||||
|
||||
&create_step($step_id,$cmd);
|
||||
|
@ -5,7 +5,7 @@ use Getopt::Long "GetOptions";
|
||||
|
||||
my $MAX_LENGTH = 4;
|
||||
|
||||
my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable,$hierarchical);
|
||||
my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable,$hierarchical,$output_corpus,$alignment,$biconcor);
|
||||
if (!&GetOptions('system=s' => \$system, # raw output from decoder
|
||||
'reference=s' => \$reference, # tokenized reference
|
||||
'dir=s' => \$dir, # directory for storing results
|
||||
@ -13,9 +13,12 @@ if (!&GetOptions('system=s' => \$system, # raw output from decoder
|
||||
'segmentation=s' => \$segmentation, # system output with segmentation markup
|
||||
'input-corpus=s' => \$corpus, # input side of parallel training corpus
|
||||
'ttable=s' => \$ttable, # phrase translation table used for decoding
|
||||
'output-corpus=s' => \$output_corpus, # output side of parallel training corpus
|
||||
'alignment-file=s' => \$alignment, # alignment of parallel corpus
|
||||
'biconcor=s' => \$biconcor, # binary for bilingual concordancer
|
||||
'hierarchical' => \$hierarchical) || # hierarchical model?
|
||||
!defined($dir)) {
|
||||
die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE]");
|
||||
die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE] [-output-corpus FILE] [-alignment-file FILE] [-biconcor BIN]");
|
||||
}
|
||||
|
||||
`mkdir -p $dir`;
|
||||
@ -84,6 +87,11 @@ if (defined($ttable) || defined($corpus)) {
|
||||
&input_annotation();
|
||||
}
|
||||
|
||||
# bilingual concordance -- not used by experiment.perl
|
||||
if (defined($corpus) && defined($output_corpus) && defined($alignment) && defined($biconcor)) {
|
||||
`$biconcor -s $dir/biconcor -c $corpus -t $output_corpus -a $alignment`;
|
||||
}
|
||||
|
||||
sub best_matches {
|
||||
my ($CORRECT,$TOTAL,$out) = @_;
|
||||
my $type = ($out =~ /precision/) ? "precision" : "recall";
|
||||
@ -208,6 +216,9 @@ sub ttable_coverage {
|
||||
if (! -e $ttable && -e $ttable.".gz") {
|
||||
open(TTABLE,"gzip -cd $ttable.gz|");
|
||||
}
|
||||
elsif ($ttable =~ /.gz$/) {
|
||||
open(TTABLE,"gzip -cd $ttable|");
|
||||
}
|
||||
else {
|
||||
open(TTABLE,$ttable) or die "Can't read ttable $ttable";
|
||||
}
|
||||
@ -219,7 +230,7 @@ sub ttable_coverage {
|
||||
my @COLUMN = split(/ \|\|\| /);
|
||||
my ($in,$out,$scores) = @COLUMN;
|
||||
# handling hierarchical
|
||||
$in =~ s/\[[^ \]]+\]$//; # remove lhs nt
|
||||
$in =~ s/ \[[^ \]]+\]$//; # remove lhs nt
|
||||
next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/; # only consider flat rules
|
||||
$scores = $COLUMN[4] if scalar @COLUMN == 5;
|
||||
my @IN = split(/ /,$in);
|
||||
@ -255,6 +266,7 @@ sub compute_entropy {
|
||||
}
|
||||
my $entropy = 0;
|
||||
foreach my $p (@_) {
|
||||
next if $p == 0;
|
||||
$entropy -= ($p/$z)*log($p/$z)/log(2);
|
||||
}
|
||||
return $entropy;
|
||||
@ -465,7 +477,7 @@ sub hierarchical_segmentation {
|
||||
open(OUTPUT_TREE,">$dir/output-tree");
|
||||
open(NODE,">$dir/node");
|
||||
while(<TRACE>) {
|
||||
/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_");
|
||||
/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_");
|
||||
my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
|
||||
if ($last_sentence >= 0 && $sentence != $last_sentence) {
|
||||
&hs_process($last_sentence,\@DERIVATION,\%STATS);
|
||||
@ -481,7 +493,7 @@ sub hierarchical_segmentation {
|
||||
@{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs);
|
||||
|
||||
foreach (split(/ /,$alignment)) {
|
||||
/(\d+)\-(\d+)/ || die("funny alignment: $_\n");
|
||||
/(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
|
||||
$ITEM{'alignment'}{$2} = $1; # target non-terminal to source span
|
||||
$ITEM{'alignedSpan'}{$1} = 1;
|
||||
}
|
||||
@ -528,12 +540,14 @@ sub hs_process {
|
||||
my $x=0;
|
||||
while(1) {
|
||||
my $RULE = shift @{$DERIVATION};
|
||||
if ($$RULE{'rule_lhs'} eq "S" &&
|
||||
scalar(@{$$RULE{'rule_rhs'}}) == 2 &&
|
||||
$$RULE{'rule_rhs'}[0] eq "S" &&
|
||||
$$RULE{'rule_rhs'}[1] eq "X") {
|
||||
if (scalar(@{$$RULE{'rule_rhs'}}) == 2 &&
|
||||
($$RULE{'rule_lhs'} eq "S" &&
|
||||
$$RULE{'rule_rhs'}[0] eq "S" &&
|
||||
$$RULE{'rule_rhs'}[1] eq "X") ||
|
||||
($$RULE{'rule_lhs'} eq "Q" &&
|
||||
$$RULE{'rule_rhs'}[0] eq "Q")) {
|
||||
unshift @{$GLUE_RULE{'spans'}},$$RULE{'spans'}[1];
|
||||
push @{$GLUE_RULE{'rule_rhs'}}, "X";
|
||||
push @{$GLUE_RULE{'rule_rhs'}}, $$RULE{'rule_rhs'}[1];
|
||||
$GLUE_RULE{'alignment'}{$x} = $x;
|
||||
$GLUE_RULE{'alignedSpan'}{$x} = 1;
|
||||
$x++;
|
||||
|
@ -33,9 +33,9 @@ function generic_show(field,parameters) {
|
||||
}
|
||||
function highlight_phrase(sentence,phrase) {
|
||||
var input = "input-"+sentence+"-"+phrase;
|
||||
$(input).setStyle({ borderWidth: '3px', borderColor: 'red' });
|
||||
$(input).setStyle({ borderColor: 'red' });
|
||||
var output = "output-"+sentence+"-"+phrase;
|
||||
$(output).setStyle({ borderWidth: '3px', borderColor: 'red' });
|
||||
$(output).setStyle({ borderColor: 'red' });
|
||||
}
|
||||
function show_word_info(sentence,cc,tc,te) {
|
||||
var info = "info-"+sentence;
|
||||
@ -44,14 +44,30 @@ function show_word_info(sentence,cc,tc,te) {
|
||||
}
|
||||
function lowlight_phrase(sentence,phrase) {
|
||||
var input = "input-"+sentence+"-"+phrase;
|
||||
$(input).setStyle({ borderWidth: '1px', borderColor: 'black' });
|
||||
$(input).setStyle({ borderColor: 'black' });
|
||||
var output = "output-"+sentence+"-"+phrase;
|
||||
$(output).setStyle({ borderWidth: '1px', borderColor: 'black' });
|
||||
$(output).setStyle({ borderColor: 'black' });
|
||||
}
|
||||
function hide_word_info(sentence) {
|
||||
var info = "info-"+sentence;
|
||||
$(info).setStyle({ opacity: 0 });
|
||||
}
|
||||
function show_biconcor(sentence,phrase) {
|
||||
var div = "biconcor-"+sentence;
|
||||
var url = '?analysis=biconcor'
|
||||
+ '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$id); ?>&set=<?php print $set ?>'
|
||||
+ '&sentence=' + sentence
|
||||
+ '&phrase=' + encodeURIComponent(phrase);
|
||||
document.getElementById(div).innerHTML = "<center><img src=\"spinner.gif\" width=48 height=48></center>";
|
||||
$(div).setStyle({ borderStyle: 'solid', 'border-width': '3px', borderColor: 'black' });
|
||||
new Ajax.Updater(div, url, { method: 'get', evalScripts: true });
|
||||
}
|
||||
function close_biconcor(sentence) {
|
||||
var div = "biconcor-"+sentence;
|
||||
document.getElementById(div).innerHTML = "";
|
||||
$(div).setStyle({ borderStyle: 'none', 'border-width': '0px', borderColor: 'white' });
|
||||
}
|
||||
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
@ -586,7 +602,7 @@ function bleu_show() {
|
||||
|
||||
// annotated sentences core: reads data, sorts sentences, displays them
|
||||
function sentence_annotation() {
|
||||
global $set,$id,$dir;
|
||||
global $set,$id,$dir,$biconcor;
|
||||
|
||||
// load data
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/bleu-annotation");
|
||||
@ -635,19 +651,19 @@ function sentence_annotation() {
|
||||
if ($sentence != $last_sentence) { $span = 0; }
|
||||
$last_sentence = $sentence;
|
||||
$segmentation[$sentence][$span]["brackets"] = $brackets;
|
||||
$segmentation[$sentence][$span]["nt"] = $nt;
|
||||
# $segmentation[$sentence][$span]["nt"] = $nt;
|
||||
$segmentation[$sentence][$span]["words"] = rtrim($words);
|
||||
if ($nt != "") { $nt_count[$nt]++; }
|
||||
$span++;
|
||||
}
|
||||
$hierarchical = 1;
|
||||
if (count($nt_count) <= 2) {
|
||||
foreach ($segmentation as $sentence => $segmentation_span) {
|
||||
foreach ($segmentation_span as $span => $type) {
|
||||
$segmentation[$sentence][$span]["nt"]="";
|
||||
}
|
||||
}
|
||||
}
|
||||
# if (count($nt_count) <= 2) {
|
||||
# foreach ($segmentation as $sentence => $segmentation_span) {
|
||||
# foreach ($segmentation_span as $span => $type) {
|
||||
# $segmentation[$sentence][$span]["nt"]="";
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
}
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/output-tree")) {
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/output-tree");
|
||||
@ -690,6 +706,8 @@ function sentence_annotation() {
|
||||
}
|
||||
}
|
||||
|
||||
$biconcor = get_biconcor_version($dir,$id);
|
||||
|
||||
// sort
|
||||
global $sort;
|
||||
$sort = $_GET['sort'];
|
||||
@ -739,6 +757,10 @@ function sentence_annotation() {
|
||||
}
|
||||
if ($input) {
|
||||
print "<div id=\"info-$i\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">8364 occ. in corpus, 56 translations, entropy: 5.54</div>\n";
|
||||
if ($biconcor) {
|
||||
//print "<div id=\"biconcor-$i\" style=\"display: none;\">xxx</div>";
|
||||
print "<div id=\"biconcor-$i\" class=\"biconcor\">xxx</div>";
|
||||
}
|
||||
if ($hierarchical) {
|
||||
sentence_annotation_hierarchical("#".$line["id"],$line["id"],$input[$line["id"]],$segmentation[$line["id"]],"in");
|
||||
}
|
||||
@ -761,8 +783,25 @@ function sentence_annotation() {
|
||||
}
|
||||
}
|
||||
|
||||
function coverage($coverage_vector) {
|
||||
# get information from line in input annotation file
|
||||
$coverage = array();
|
||||
foreach (split(" ",$coverage_vector) as $item) {
|
||||
if (preg_match("/[\-:]/",$item)) {
|
||||
list($from,$to,$corpus_count,$ttable_count,$ttable_entropy) = preg_split("/[\-:]/",$item);
|
||||
$coverage[$from][$to]["corpus_count"] = $corpus_count;
|
||||
$coverage[$from][$to]["ttable_count"] = $ttable_count;
|
||||
$coverage[$from][$to]["ttable_entropy"] = $ttable_entropy;
|
||||
}
|
||||
}
|
||||
$word = split(" ",$words);
|
||||
|
||||
return $coverage;
|
||||
}
|
||||
|
||||
// annotate an inpute sentence
|
||||
function input_annotation($sentence,$input,$segmentation) {
|
||||
global $biconcor;
|
||||
list($words,$coverage_vector) = split("\t",$input);
|
||||
|
||||
# get information from line in input annotation file
|
||||
@ -840,7 +879,7 @@ function input_annotation($sentence,$input,$segmentation) {
|
||||
$highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
|
||||
$lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
|
||||
}
|
||||
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords\">";
|
||||
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($phrase)."');\"":"").">";
|
||||
}
|
||||
print "</div></td>";
|
||||
$from += $size-1;
|
||||
@ -868,7 +907,7 @@ function input_annotation($sentence,$input,$segmentation) {
|
||||
$color = '#ffffff';
|
||||
$cc = 0; $tc = 0; $te = 0;
|
||||
}
|
||||
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\">$word[$j]</span>";
|
||||
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($word[$j])."');\"":"").">$word[$j]</span>";
|
||||
if ($segmentation && array_key_exists($j,$segmentation["input_end"])) {
|
||||
print "</span>";
|
||||
}
|
||||
@ -945,7 +984,10 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
function sentence_annotation_hierarchical($info,$sentence,$sequence,$segmentation,$in_out) {
|
||||
$In_Out = $in_out == "out" ? "Out" : "In";
|
||||
|
||||
$word = split(" ",$sequence);
|
||||
list($words,$coverage_vector) = split("\t",$input);
|
||||
$coverage = coverage($sequence);
|
||||
$word = preg_split("/\s/",$sequence);
|
||||
|
||||
$color = array("#ffe0e0","#f0e0ff","#e0e0ff","#c0c0ff","#a0a0ff");
|
||||
#$color = array("#FFC0C0","#FFC0FF","#C0C0FF","#C0FFFF","#C0FFC0");
|
||||
#$color = array("#c0c0c0","#e0e0ff","#b0b0ff","#8080ff","#4040ff");
|
||||
@ -983,7 +1025,9 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
for($w=0;$w<count($span_word);$w++) {
|
||||
if ($w > 0) { print " "; }
|
||||
if ($in_out == "in") {
|
||||
#print "<span style=\"background-color: ".coverage_color($coverage[$word_count][$word_count]).";\">";
|
||||
print $word[$word_count];
|
||||
#print "</span>";
|
||||
}
|
||||
else {
|
||||
list($surface,$correct) = split("\|", $word[$word_count]);
|
||||
@ -1000,3 +1044,22 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
}
|
||||
print "</td></tr></table>\n";
|
||||
}
|
||||
|
||||
function biconcor($query) {
|
||||
global $set,$id,$dir;
|
||||
$sentence = $_GET['sentence'];
|
||||
$biconcor = get_biconcor_version($dir,$id);
|
||||
print "<center>
|
||||
<form action=\"...\" method=get>
|
||||
<img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\">
|
||||
<input width=20 value=\"$query\">
|
||||
<input type=submit value=\"look up\">
|
||||
</form>
|
||||
<div class=\"biconcor-content\">";
|
||||
$cmd = "./biconcor -l $dir/model/biconcor.$biconcor -q ".escapeshellarg($query)." 2>/dev/null";
|
||||
# print $cmd."<p>";
|
||||
system($cmd);
|
||||
# print "<p>done.";
|
||||
print "</div></center>";
|
||||
|
||||
}
|
||||
|
@ -250,12 +250,12 @@ function bleu_diff_annotation() {
|
||||
$matched_with_score = string_edit_distance($word_with_score0,$word_with_score1);
|
||||
$matched = string_edit_distance($word0,$word1);
|
||||
|
||||
print "<font size=-2>[".$line["id"].":".$line["bleu1"]."]</font> ";
|
||||
print "<font size=-2>[".$id2."-".$line["id"].":".$line["bleu1"]."]</font> ";
|
||||
$matched1 = preg_replace('/D/',"",$matched);
|
||||
$matched_with_score1 = preg_replace('/D/',"",$matched_with_score);
|
||||
bleu_line_diff( $word_with_score1, $matched1, $matched_with_score1 );
|
||||
|
||||
print "<font size=-2>[".$line["id"].":".$line["bleu0"]."]</font> ";
|
||||
print "<font size=-2>[".$id."-".$line["id"].":".$line["bleu0"]."]</font> ";
|
||||
$matched0 = preg_replace('/I/',"",$matched);
|
||||
$matched_with_score0 = preg_replace('/I/',"",$matched_with_score);
|
||||
bleu_line_diff( $word_with_score0, $matched0, $matched_with_score0 );
|
||||
|
51
scripts/ems/web/bilingual-concordance.css
Normal file
51
scripts/ems/web/bilingual-concordance.css
Normal file
@ -0,0 +1,51 @@
|
||||
.pp_head {
|
||||
font-size: 150%;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.pp_target_header {
|
||||
font-size: 120%;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
table.biconcor {
|
||||
table-layout: fixed;
|
||||
padding: 0px;
|
||||
margin: 0px;
|
||||
}
|
||||
|
||||
tr.biconcor {
|
||||
padding: 0px;
|
||||
margin: 0px;
|
||||
}
|
||||
|
||||
td.biconcor {
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
padding: 0px;
|
||||
margin: 0px;
|
||||
}
|
||||
|
||||
td.pp_source_left {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
td.pp_target_left {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
td.pp_source {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
td.pp_target {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
td.pp_source_right {
|
||||
border-style:solid;
|
||||
border-width:0px 2px 0px 0px ;
|
||||
border-color: black;
|
||||
}
|
@ -2,7 +2,7 @@
|
||||
|
||||
function diff() {
|
||||
global $experiment;
|
||||
$display = $_GET[run];
|
||||
$display = $_GET["run"];
|
||||
sort($display);
|
||||
while (list($i,$run) = each($display)) {
|
||||
if ($i==0) {
|
||||
@ -22,12 +22,15 @@ function compute_diff($base,$change) {
|
||||
$parameter_change = load_parameter($change);
|
||||
print "<H3>Experiment $change</H3><TABLE>";
|
||||
while (list($parameter,$base_value) = each($parameter_base)) {
|
||||
if (!array_key_exists($parameter,$parameter_change)) {
|
||||
$parameter_change[$parameter] = "";
|
||||
}
|
||||
if ($base_value != $parameter_change[$parameter]) {
|
||||
output_diff_line($parameter,$base_value,$parameter_change[$parameter]);
|
||||
}
|
||||
}
|
||||
while (list($parameter,$change_value) = each($parameter_change)) {
|
||||
if (!$parameter_base[$parameter]) {
|
||||
if (!array_key_exists($parameter,$parameter_base)) {
|
||||
output_diff_line($parameter,"",$change_value);
|
||||
}
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ function head($title) {
|
||||
<script language="javascript" src="/javascripts/scriptaculous.js"></script>
|
||||
<script language="javascript" src="hierarchical-segmentation.js"></script>
|
||||
<link href="hierarchical-segmentation.css" rel="stylesheet" type="text/css">
|
||||
<link href="bilingual-concordance.css" rel="stylesheet" type="text/css">
|
||||
</head>
|
||||
<body><h2>'.$title."</h2>\n";
|
||||
}
|
||||
@ -35,6 +36,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) {
|
||||
else if ($action == "PrecisionRecallDetails_show") { precision_recall_details(); }
|
||||
else if ($action == "CoverageDetails_show") { coverage_details(); }
|
||||
else if ($action == "SegmentationSummary_show") { segmentation_summary(); }
|
||||
else if ($action == "biconcor") { biconcor($_GET["phrase"]); }
|
||||
else { print "ERROR! $action"; }
|
||||
}
|
||||
else if (array_key_exists("analysis_diff_home",$_GET)) {
|
||||
|
@ -39,7 +39,7 @@ function load_experiment_info() {
|
||||
reset($experiment);
|
||||
while (list($id,$info) = each($experiment)) {
|
||||
if (file_exists($dir."/steps/new") ||
|
||||
file_exists($dir."/steps/1")) {
|
||||
file_exists($dir."/steps/$id")) {
|
||||
$stat = stat("$dir/steps/$id/parameter.$id");
|
||||
}
|
||||
else {
|
||||
@ -71,7 +71,7 @@ function load_experiment_info() {
|
||||
function load_parameter($run) {
|
||||
global $dir;
|
||||
if (file_exists($dir."/steps/new") ||
|
||||
file_exists($dir."/steps/1")) {
|
||||
file_exists($dir."/steps/$run")) {
|
||||
$file = file("$dir/steps/$run/parameter.$run");
|
||||
}
|
||||
else {
|
||||
@ -123,3 +123,49 @@ function process_file_entry($dir,$entry) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function get_coverage_analysis_version($dir,$set,$id) {
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/input-annotation")) {
|
||||
return $id;
|
||||
}
|
||||
if (file_exists("$dir/steps/$id/re-use.$id")) {
|
||||
$re_use = file("$dir/steps/$id/re-use.$id");
|
||||
foreach($re_use as $line) {
|
||||
if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) &&
|
||||
$match[1] == $set &&
|
||||
file_exists("$dir/evaluation/$set.analysis.$match[2]/input-annotation")) {
|
||||
return $match[2];
|
||||
}
|
||||
}
|
||||
}
|
||||
# legacy stuff below...
|
||||
if (! file_exists("$dir/steps/$id/REPORTING_report.$id")) {
|
||||
return 0;
|
||||
}
|
||||
$report = file("$dir/steps/$id/REPORTING_report.$id.INFO");
|
||||
foreach ($report as $line) {
|
||||
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) &&
|
||||
$match[2] == $set) {
|
||||
$reuse_id = $match[1];
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$reuse_id/input-annotation")) {
|
||||
return $reuse_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
function get_biconcor_version($dir,$id) {
|
||||
if (file_exists("$dir/model/biconcor.$id")) {
|
||||
return $id;
|
||||
}
|
||||
$re_use = file("$dir/steps/$id/re-use.$id");
|
||||
foreach($re_use as $line) {
|
||||
if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) &&
|
||||
file_exists("$dir/model/biconcor.$match[1]")) {
|
||||
return $match[1];
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,7 @@ function setup() {
|
||||
print "<TR><TD><A HREF=\"?setup=$dir[0]\">$dir[0]</A></TD><TD>$dir[1]</TD><TD>$dir[2]</TD><TD>$dir[3]</TD></TR>\n";
|
||||
}
|
||||
print "</TABLE>\n";
|
||||
print "<P>To add experiment, edit /fs/thor4/html/experiment/setup";
|
||||
print "<P>To add experiment, edit setup";
|
||||
}
|
||||
|
||||
function overview() {
|
||||
|
@ -6,6 +6,7 @@ use Getopt::Long;
|
||||
my $help;
|
||||
my $lc = 0; # lowercase the corpus?
|
||||
my $ignore_ratio = 0;
|
||||
my $ignore_xml = 0;
|
||||
my $enc = "utf8"; # encoding of the input and output files
|
||||
# set to anything else you wish, but I have not tested it yet
|
||||
my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
|
||||
@ -17,6 +18,7 @@ GetOptions(
|
||||
"lowercase|lc" => \$lc,
|
||||
"encoding=s" => \$enc,
|
||||
"ignore-ratio" => \$ignore_ratio,
|
||||
"ignore-xml" => \$ignore_xml,
|
||||
"max-word-length|mwl=s" => \$max_word_length
|
||||
) or exit(1);
|
||||
|
||||
@ -108,14 +110,15 @@ while(my $f = <F>) {
|
||||
$f =~ s/ $//;
|
||||
next if $f eq '';
|
||||
next if $e eq '';
|
||||
my @E = split(/ /,$e);
|
||||
my @F = split(/ /,$f);
|
||||
next if scalar(@E) > $max;
|
||||
next if scalar(@F) > $max;
|
||||
next if scalar(@E) < $min;
|
||||
next if scalar(@F) < $min;
|
||||
next if !$ignore_ratio && scalar(@E)/scalar(@F) > 9;
|
||||
next if !$ignore_ratio && scalar(@F)/scalar(@E) > 9;
|
||||
|
||||
my $ec = &word_count($e);
|
||||
my $fc = &word_count($f);
|
||||
next if $ec > $max;
|
||||
next if $fc > $max;
|
||||
next if $ec < $min;
|
||||
next if $fc < $min;
|
||||
next if !$ignore_ratio && $ec/$fc > 9;
|
||||
next if !$ignore_ratio && $fc/$ec > 9;
|
||||
# Skip this segment if any factor is longer than $max_word_length
|
||||
my $max_word_length_plus_one = $max_word_length + 1;
|
||||
next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
|
||||
@ -126,7 +129,6 @@ while(my $f = <F>) {
|
||||
if $f =~ /[ \|]\|/;
|
||||
die "There is a blank factor in $corpus.$l2 on line $innr: $e"
|
||||
if $e =~ /[ \|]\|/;
|
||||
|
||||
|
||||
$outnr++;
|
||||
print FO $f."\n";
|
||||
@ -146,3 +148,15 @@ my $e = <E>;
|
||||
die "$corpus.$l2 is too long!" if defined $e;
|
||||
|
||||
print STDERR "Input sentences: $innr Output sentences: $outnr\n";
|
||||
|
||||
sub word_count {
|
||||
my ($line) = @_;
|
||||
if ($ignore_xml) {
|
||||
$line =~ s/<\S[^>]*\S>//g;
|
||||
$line =~ s/\s+/ /g;
|
||||
$line =~ s/^ //g;
|
||||
$line =~ s/ $//g;
|
||||
}
|
||||
my @w = split(/ /,$line);
|
||||
return scalar @w;
|
||||
}
|
||||
|
@ -15,9 +15,9 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define MAX_WORD 1000 //maximum lengthsource/target strings
|
||||
#define MAX_M 200 //maximum length of source strings
|
||||
#define MAX_N 200 //maximum length of target strings
|
||||
#define MAX_WORD 10000 // maximum lengthsource/target strings
|
||||
#define MAX_M 200 // maximum length of source strings
|
||||
#define MAX_N 200 // maximum length of target strings
|
||||
|
||||
#define UNION 1
|
||||
#define INTERSECT 2
|
||||
|
@ -12,7 +12,9 @@ if (!&GetOptions('mxpost=s' => \$MXPOST) ||
|
||||
exit(1);
|
||||
}
|
||||
|
||||
open(TAGGER,"cat $IN | perl -ne 's/—/-/g; s/\\p{Dash_Punctuation}/-/g; s/\\p{Open_Punctuation}/\(/g; s/\\p{Close_Punctuation}/\)/g; s/\\p{Initial_Punctuation}/\"/g; s/\\p{Final_Punctuation}/\"/g; s/\\p{Connector_Punctuation}/-/g; s/•/*/g; s/\\p{Currency_Symbol}/\\\$/g; s/\\p{Math_Symbol}/*/g; print \$_;' | $MXPOST/mxpost |");
|
||||
my $pipeline = "perl -ne 'chop; tr/\\x20-\\x7f/\?/c; print \$_.\"\\n\";' | tee debug | ";
|
||||
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
|
||||
open(TAGGER,"cat $IN | $pipeline");
|
||||
open(OUT,">$OUT");
|
||||
while(<TAGGER>) {
|
||||
foreach my $word_pos (split) {
|
||||
|
26
scripts/training/wrappers/make-factor-suffix.perl
Executable file
26
scripts/training/wrappers/make-factor-suffix.perl
Executable file
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
|
||||
my ($size,$in,$out) = @ARGV;
|
||||
|
||||
open(IN,$in);
|
||||
open(OUT,">$out");
|
||||
binmode(IN, ":utf8");
|
||||
binmode(OUT, ":utf8");
|
||||
|
||||
while(<IN>) {
|
||||
my $first = 1;
|
||||
chomp; s/\s+/ /g; s/^ //; s/ $//;
|
||||
foreach my $word (split) {
|
||||
if (length($word) > $size) {
|
||||
$word = substr($word,length($word)-$size);
|
||||
}
|
||||
print OUT " " unless $first;
|
||||
$first = 0;
|
||||
print OUT lc($word);
|
||||
}
|
||||
print OUT "\n";
|
||||
}
|
||||
close(OUT);
|
||||
close(IN);
|
@ -24,7 +24,7 @@ GetOptions(
|
||||
|
||||
# parser settings
|
||||
my $MaxChar=10000;
|
||||
my $MaxWord=200;
|
||||
my $MaxWord=120;
|
||||
my $ParserBin="$COLLINS/code/parser";
|
||||
my $ParserEvn="$COLLINS/models/model2/events.gz";
|
||||
my $ParserGrm="$COLLINS/models/model2/grammar";
|
||||
@ -37,8 +37,13 @@ $pipeline .= "perl -ne 'tr/\\x20-\\x7f//cd; print \$_.\"\\n\";' | ";
|
||||
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
|
||||
|
||||
open(TAG,$pipeline);
|
||||
open(PARSER_IN,">$tmpfile");
|
||||
my $sentence_count=0;
|
||||
while(<TAG>) {
|
||||
if ($sentence_count % 2000 == 0) {
|
||||
close(PARSER_IN) if $sentence_count;
|
||||
open(PARSER_IN,sprintf(">%s.%05d",$tmpfile,$sentence_count/2000));
|
||||
}
|
||||
$sentence_count++;
|
||||
chop;
|
||||
|
||||
# convert tagged sequence into parser format
|
||||
@ -53,14 +58,16 @@ while(<TAG>) {
|
||||
close(TAG);
|
||||
close(PARSER_IN);
|
||||
|
||||
# parse and process output of parser
|
||||
`rm $RAW` if defined($RAW) && -e $RAW;
|
||||
$pipeline = "gunzip -c $ParserEvn | $ParserBin $tmpfile $ParserGrm 10000 1 1 1 1 |";
|
||||
$pipeline .= "tee -a \"$RAW\" |" if defined($RAW);
|
||||
# parse
|
||||
for(my $i=0;$i * 2000 < $sentence_count;$i++) {
|
||||
my $i_formatted = sprintf("%05d",$i);
|
||||
`gunzip -c $ParserEvn | $ParserBin $tmpfile.$i_formatted $ParserGrm 10000 1 1 1 1 > $tmpfile.$i_formatted.out`;
|
||||
}
|
||||
|
||||
# process output of parser
|
||||
my $DEBUG = 0;
|
||||
my $DEBUG_SPACE = " ";
|
||||
open(PARSER,$pipeline);
|
||||
open(PARSER,"cat $tmpfile.?????.out|");
|
||||
while(my $line = <PARSER>) {
|
||||
next unless $line =~ /^\(/;
|
||||
if ($line =~ /SentenceTooLong/) {
|
||||
@ -112,7 +119,7 @@ while(my $line = <PARSER>) {
|
||||
my $first=1;
|
||||
foreach (@OUT) {
|
||||
print " " unless $first;
|
||||
s/\\//;
|
||||
# s/\\//; #why?
|
||||
print $_;
|
||||
$first = 0;
|
||||
}
|
||||
@ -129,14 +136,15 @@ sub escape {
|
||||
|
||||
sub check_length {
|
||||
my ($line) = @_;
|
||||
my ($ret,$numc,$numw,@words);
|
||||
my ($numc,$numw,@words);
|
||||
|
||||
return 0 if $line =~ /^\d+ [^a-z0-9]+$/i || $line eq "0" || $line eq "0 ";
|
||||
|
||||
$numc = length($line);
|
||||
@words = split(" ",$line);
|
||||
$numw = ($#words+1)/2;
|
||||
|
||||
$ret = (($numc <= $MaxChar) && ($numw <= $MaxWord));
|
||||
$ret;
|
||||
return ($numc <= $MaxChar) && ($numw <= $MaxWord);
|
||||
}
|
||||
|
||||
sub conv_posfmt {
|
||||
|
Loading…
Reference in New Issue
Block a user