Improvements to EMS analysis

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4026 1f5c12ca-751b-0410-a591-d2e778427230
phkoehn 2011-06-21 21:52:13 +00:00
parent 2cdc39f63f
commit 6acd6a8684
19 changed files with 1142 additions and 312 deletions

View File

@ -14,7 +14,7 @@ private:
INDEX *m_sentenceEnd;
INDEX m_size;
INDEX m_sentenceCount;
char m_unaligned[ 256 ];
char m_unaligned[ 256 ]; // here for speed (local to PhraseAlignment)
public:
~Alignment();
@ -27,4 +27,18 @@ public:
void Load( string fileName );
void Save( string fileName );
vector<string> Tokenize( const char input[] );
INDEX GetSentenceStart( INDEX sentence ) {
if (sentence == 0) return 0;
return m_sentenceEnd[ sentence-1 ] + 2;
}
INDEX GetNumberOfAlignmentPoints( INDEX sentence ) {
return ( m_sentenceEnd[ sentence ] - GetSentenceStart( sentence ) ) / 2;
}
char GetSourceWord( INDEX sentence, INDEX alignment_point ) {
return m_array[ GetSentenceStart( sentence ) + alignment_point*2 ];
}
char GetTargetWord( INDEX sentence, INDEX alignment_point ) {
return m_array[ GetSentenceStart( sentence ) + alignment_point*2 + 1 ];
}
};
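The accessors above hide the packed layout of the alignment array (per sentence, source/target word indexes stored as consecutive pairs). A minimal usage sketch, not part of the commit, assuming an Alignment object already loaded with Load() and that INDEX is the class's public index type:
// Sketch: dump all alignment points of one sentence as "src-tgt" pairs.
void DumpAlignmentPoints( Alignment &alignment, Alignment::INDEX sentence )
{
  Alignment::INDEX points = alignment.GetNumberOfAlignmentPoints( sentence );
  for( Alignment::INDEX ap = 0; ap < points; ap++ ) {
    cout << (int) alignment.GetSourceWord( sentence, ap ) << "-"
         << (int) alignment.GetTargetWord( sentence, ap ) << " ";
  }
  cout << endl;
}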

View File

@ -6,5 +6,5 @@ clean:
.cpp.o:
g++ -O6 -g -c $<
biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o
g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o -o biconcor
biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o Mismatch.o PhrasePair.o PhrasePairCollection.o biconcor.o base64.o
g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o Mismatch.o PhrasePair.o PhrasePairCollection.o biconcor.o base64.o -o biconcor

View File

@ -0,0 +1,246 @@
#include "Mismatch.h"
#include "Vocabulary.h"
using namespace std;
#define UNANNOTATED 0
#define PRE_ALIGNED 1
#define POST_ALIGNED 2
#define UNALIGNED 3
#define MISALIGNED 4
#define ALIGNED 5
void Mismatch::PrintClippedHTML( ostream* out, int width )
{
char source_annotation[256], target_annotation[256];
vector< string > label_class;
label_class.push_back( "" );
label_class.push_back( "mismatch_pre_aligned" );
label_class.push_back( "mismatch_post_aligned" );
label_class.push_back( "null_aligned" );
label_class.push_back( "mismatch_misaligned" );
label_class.push_back( "mismatch_aligned" );
for(int i=0; i<m_source_length;i++) source_annotation[i] = UNANNOTATED;
for(int i=0; i<m_target_length;i++) target_annotation[i] = UNANNOTATED;
if (m_unaligned) {
// find alignment points for prior and next word(s) and
// center target phrase around those.
bool found_aligned = false;
for(int i=1; i<m_source_length && !found_aligned; i++) {
if (m_source_start-i >= 0) {
int word_id = m_source_start-i;
source_annotation[ word_id ] = UNALIGNED;
if (!m_source_unaligned[ word_id ]) {
found_aligned = true;
LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
}
}
if (m_source_end+i < m_source_length) {
int word_id = m_source_end+i;
source_annotation[ word_id ] = UNALIGNED;
if (!m_source_unaligned[ word_id ]) {
found_aligned = true;
LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
}
}
}
}
// misalignment
else {
// label aligned output words
for(int i=m_source_start; i<=m_source_end; i++)
LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
// find first and last
int target_start = -1;
int target_end;
for(int i=0; i<m_target_length; i++)
if (target_annotation[i] == ALIGNED) {
if (target_start == -1)
target_start = i;
target_end = i;
}
// go over all enclosed target words
for(int i=target_start; i<=target_end; i++) {
// label other target words as unaligned or misaligned
if (m_target_unaligned[ i ])
target_annotation[ i ] = UNALIGNED;
else {
if (target_annotation[ i ] != ALIGNED)
target_annotation[ i ] = MISALIGNED;
// loop over aligned source words
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
// if not part of the source phrase -> also misaligned
if (source_word < m_source_start || source_word > m_source_end)
source_annotation[ source_word ] = MISALIGNED;
}
}
}
}
// closure
bool change = true;
while(change) {
change = false;
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
if (source_annotation[source_word] != UNANNOTATED &&
target_annotation[target_word] == UNANNOTATED) {
target_annotation[target_word] = MISALIGNED;
change = true;
}
if (source_annotation[source_word] == UNANNOTATED &&
target_annotation[target_word] != UNANNOTATED) {
source_annotation[source_word] = MISALIGNED;
change = true;
}
}
}
}
// print source
// shorten source context if too long
int sentence_start = m_source_position - m_source_start;
int context_space = width/2;
for(int i=m_source_start;i<=m_source_end;i++)
context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
context_space /= 2;
int remaining = context_space;
int start_word = m_source_start;
for(;start_word>0 && remaining>0; start_word--)
remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
if (remaining<0 || start_word == -1) start_word++;
remaining = context_space;
int end_word = m_source_end;
for(;end_word<m_source_length && remaining>0; end_word++)
remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
end_word--;
// output with markup
*out << "<tr><td class=\"pp_source_left\">";
char current_label = UNANNOTATED;
if (start_word>0) {
current_label = source_annotation[start_word-1];
*out << "... ";
}
for(int i=start_word; i<=end_word; i++) {
// change to phrase block
if (i == m_source_start) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
*out << "</td><td class=\"pp_source\">";
current_label = UNANNOTATED;
}
// change to labeled word
else if (source_annotation[i] != current_label &&
source_annotation[i] != ALIGNED) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
if (source_annotation[i] != UNANNOTATED)
*out << "<span class=\""
<< label_class[ source_annotation[i] ]
<< "\">";
current_label = source_annotation[i];
}
// output word
*out << m_suffixArray->GetWord( sentence_start + i ) << " ";
// change to right context block
if (i == m_source_end) {
*out << "</td><td class=\"pp_source_right\">";
current_label = UNANNOTATED;
}
}
if (current_label != UNANNOTATED && end_word>m_source_end)
*out << "</span>";
if (end_word<m_source_length-1)
*out << "... ";
// print target
// shorten target context if too long
int target_start = -1;
int target_end;
for(int i=0; i<m_target_length; i++)
if (target_annotation[i] != UNANNOTATED) {
if (target_start == -1)
target_start = i;
target_end = i;
}
context_space = width/2;
for(int i=target_start;i<=target_end;i++)
context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
while (context_space < 0) { // shorten matched part, if too long
context_space +=
m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
target_start++;
target_end--;
}
context_space /= 2;
remaining = context_space;
start_word = target_start;
for(;start_word>0 && remaining>0; start_word--) {
//cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
}
if (remaining<0 || start_word == -1) start_word++;
remaining = context_space;
end_word = target_end;
for(;end_word<m_target_length && remaining>0; end_word++) {
//cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
}
end_word--;
// output with markup
*out << "</td><td class=\"mismatch_target\">";
current_label = UNANNOTATED;
if (start_word>0) {
current_label = target_annotation[start_word-1];
*out << "... ";
}
for(int i=start_word; i<=end_word; i++) {
if (target_annotation[i] != current_label) {
if (current_label != UNANNOTATED && i!=start_word)
*out << "</span>";
if (target_annotation[i] != UNANNOTATED)
*out << "<span class=\""
<< label_class[ target_annotation[i] ]
<< "\">";
current_label = target_annotation[i];
}
// output word
*out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
}
if (current_label != UNANNOTATED && end_word>target_end)
*out << "</span>";
if (end_word<m_target_length-1)
*out << "... ";
*out << "</td></tr>";
}
void Mismatch::LabelSourceMatches( char *source_annotation, char *target_annotation, char source_id, char label ) {
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
source_annotation[ source_id ] = label;
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
}
}
}
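In short, PrintClippedHTML assigns each source and target word one of the labels defined at the top of the file: for an unaligned phrase it marks the surrounding unaligned words and the nearest aligned neighbors (PRE_ALIGNED/POST_ALIGNED); for a misaligned phrase it labels the words covered by the phrase alignment (ALIGNED), the words inside the enclosed target span (UNALIGNED or MISALIGNED), and then runs a fixed-point closure over all alignment points so that any word linked to an annotated word on the other side also ends up MISALIGNED. The labels map one-to-one onto the CSS classes in label_class.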

View File

@ -0,0 +1,70 @@
#include <string>
#include <stdlib.h>
#include <cstring>
#include <fstream>
#include <sstream>
#include <iostream>
#include "SuffixArray.h"
#include "TargetCorpus.h"
#include "Alignment.h"
#pragma once
using namespace std;
class Mismatch
{
public:
typedef unsigned int INDEX;
private:
SuffixArray *m_suffixArray;
TargetCorpus *m_targetCorpus;
Alignment *m_alignment;
INDEX m_sentence_id;
INDEX m_num_alignment_points;
char m_source_length;
char m_target_length;
SuffixArray::INDEX m_source_position;
char m_source_start, m_source_end;
char m_source_unaligned[ 256 ];
char m_target_unaligned[ 256 ];
char m_unaligned;
public:
Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, char source_length, char target_length, char source_start, char source_end )
:m_suffixArray(sa)
,m_targetCorpus(tc)
,m_alignment(a)
,m_sentence_id(sentence_id)
,m_source_position(position)
,m_source_length(source_length)
,m_target_length(target_length)
,m_source_start(source_start)
,m_source_end(source_end)
{
// initialize unaligned indexes
for(char i=0; i<m_source_length; i++) {
m_source_unaligned[i] = true;
}
for(char i=0; i<m_target_length; i++) {
m_target_unaligned[i] = true;
}
m_num_alignment_points =
m_alignment->GetNumberOfAlignmentPoints( sentence_id );
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
m_source_unaligned[ m_alignment->GetSourceWord( sentence_id, ap ) ] = false;
m_target_unaligned[ m_alignment->GetTargetWord( sentence_id, ap ) ] = false;
}
m_unaligned = true;
for(char i=source_start; i<=source_end; i++) {
if (!m_source_unaligned[ i ]) {
m_unaligned = false;
}
}
}
~Mismatch () {}
bool Unaligned() { return m_unaligned; }
void PrintClippedHTML( ostream* out, int width );
void LabelSourceMatches( char *source_annotation, char *target_annotation, char source_id, char label );
};
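For reference, the intended call pattern mirrors the new code in PhrasePairCollection::GetCollection and PrintHTML below: build a Mismatch for a sentence where phrase alignment failed, split the cases with Unaligned(), and render with PrintClippedHTML. A condensed sketch (container names assumed, not part of the commit):
Mismatch *mismatch = new Mismatch( suffixArray, targetCorpus, alignment,
                                   sentence_id, position,
                                   sentence_length, target_length,
                                   source_start, source_end );
if (mismatch->Unaligned())
  unaligned.push_back( mismatch );    // no alignment point touches the source phrase
else
  mismatched.push_back( mismatch );   // aligned, but not to a clean target phrase
// later, when building the HTML report:
mismatch->PrintClippedHTML( &cout, 160 );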

View File

@ -145,14 +145,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width )
int source_pre_width = (source_width-source.size())/2;
int source_post_width = (source_width-source.size()+1)/2;
// if phrase is too long, don't show any context
if (source.size() > width) {
source_pre_width = 0;
source_post_width = 0;
}
if (source_pre.size()>source_pre_width)
// too long -> truncate and add "..."
if (source_pre.size()>source_pre_width) {
// first skip up to a space
while(source_pre_width>0 &&
source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
source_pre_width--;
}
source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
if (source_post.size() > source_post_width)
}
if (source_post.size() > source_post_width) {
while(source_post_width>0 &&
source_post.substr(source_post_width-1,1) != " ") {
source_post_width--;
}
source_post = source_post.substr( 0, source_post_width ) + "...";
}
*out << "<tr><td class=\"pp_source_left\">"
<< source_pre
@ -167,8 +180,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width )
string target_pre = "";
string target = "";
string target_post = "";
int target_pre_null_width = 0;
int target_post_null_width = 0;
for( char i=0; i<m_target_start; i++ ) {
target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_pre += " " + word;
if (i >= m_target_start-m_pre_null)
target_pre_null_width += word.size() + 1;
}
for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) target += " ";
@ -176,7 +194,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width )
}
for( char i=m_target_end+1; i<m_target_length; i++ ) {
if (i>m_target_end+1) target_post += " ";
target_post += m_targetCorpus->GetWord( m_sentence_id, i);
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_post += word;
if (i-(m_target_end+1) < m_post_null) {
target_post_null_width += word.size() + 1;
}
}
int target_pre_width = (target_width-target.size())/2;
@ -186,10 +208,47 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width )
target_pre_width = 0;
target_post_width = 0;
}
if (target_pre.size() > target_pre_width)
if (target_pre.size() < target_pre_width)
target_pre_width = target_pre.size();
else {
while(target_pre_width>0 &&
target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
target_pre_width--;
}
target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
if (target_post.size() > target_post_width)
target_post = target_post.substr( 0, target_post_width ) + "...";
}
if (target_post.size() < target_post_width) {
target_post_width = target_post.size();
}
else {
while(target_post_width>0 &&
target_post.substr(target_post_width-1,1) != " ") {
target_post_width--;
}
target_post = target_post.substr( 0, target_post_width ) + "...";
}
if (m_pre_null) {
//cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
if (target_pre_width < target_pre.size())
target_pre_null_width -= target_pre.size()-target_pre_width;
target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
+ "<span class=\"null_aligned\">"
+ target_pre.substr(target_pre_width-target_pre_null_width)
+ "</span>";
}
if (m_post_null) {
//cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
if (target_post_null_width>target_post.size()) {
target_post_null_width = target_post.size();
}
target_post = "<span class=\"null_aligned\">"
+ target_post.substr(0,target_post_null_width)
+ "</span>"
+ target_post.substr(target_post_null_width);
}
*out << "<td class=\"pp_target_left\">"
<< target_pre

View File

@ -47,8 +47,9 @@ bool PhrasePairCollection::GetCollection( const vector< string > sourceString )
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
for( char pre = 0; pre <= pre_null; pre++ ) {
for( char post = 0; post <= post_null; post++ ) {
bool null_boundary_words = false;
for( char pre = 0; pre <= pre_null && (pre==0||null_boundary_words); pre++ ) {
for( char post = 0; post <= post_null && (post==0||null_boundary_words); post++ ) {
vector< WORD_ID > targetString;
cerr << "; ";
for( char target = target_start-pre; target <= target_end+post; target++ ) {
@ -67,6 +68,18 @@ bool PhrasePairCollection::GetCollection( const vector< string > sourceString )
}
}
}
else {
cerr << "mismatch " << (i-first_match)
<< " in sentence " << sentence_id
<< ", starting at word " << source_start
<< " of " << sentence_length
<< ". target sentence has " << target_length << " words.";
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
if (mismatch->Unaligned())
m_unaligned.push_back( mismatch );
else
m_mismatch.push_back( mismatch );
}
cerr << endl;
if (found > m_max_lookup) {
@ -92,23 +105,89 @@ void PhrasePairCollection::Print()
void PhrasePairCollection::PrintHTML()
{
vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
int pp_target = 0;
bool singleton = false;
// loop over all translations
vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
cout << "<p class=\"pp_target_header\">";
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
int count = ppWithSameTarget->size();
cout << "(" << count << "/" << m_size << ")" << endl;
cout << "<p><table align=\"center\">";
int count = ppWithSameTarget->size();
if (!singleton) {
if (count == 1) {
singleton = true;
cout << "<p class=\"pp_singleton_header\">singleton"
<< (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
<< (m_collection.end() - ppWithSameTarget)
<< "/" << m_size << ")</p>";
}
else {
cout << "<p class=\"pp_target_header\">";
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
cout << " (" << count << "/" << m_size << ")" << endl;
cout << "<p><div id=\"pp_" << pp_target << "\">";
}
cout << "<table align=\"center\">";
}
vector< PhrasePair* >::iterator p;
int pp = 0;
for(p = ppWithSameTarget->begin(); pp<count && p != ppWithSameTarget->end(); p++, pp++ ) {
// loop over all sentences where translation occurs
int pp=0;
int i=0;
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_pp) {
p += count/m_max_pp-1;
pp += count/m_max_pp-1;
}
}
cout << "</table>\n";
if (i == 10 && pp < count) {
// extended table
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
cout << "<table align=\"center\">";
for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_pp && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_pp) {
p += count/m_max_pp-1;
pp += count/m_max_pp-1;
}
}
}
if (!singleton) cout << "</table></div>\n";
if (!singleton && pp_target == 9) {
cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
cout << "<p class=\"pp_target_header\">(more)</p></div>";
cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
}
}
if (singleton) cout << "</table></div>\n";
else if (pp_target > 9) cout << "</div>";
int max_mismatch = m_max_pp/3;
// unaligned phrases
if (m_unaligned.size() > 0) {
cout << "<p class=\"pp_singleton_header\">unaligned"
<< " (" << (m_unaligned.size()) << ")</p>";
cout << "<table align=\"center\">";
int step_size = 1;
if (m_unaligned.size() > max_mismatch)
step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
for(int i=0;i<m_unaligned.size();i+=step_size)
m_unaligned[i]->PrintClippedHTML( &cout, 160 );
cout << "</table>";
}
// mismatched phrases
if (m_mismatch.size() > 0) {
cout << "<p class=\"pp_singleton_header\">mismatched"
<< " (" << (m_mismatch.size()) << ")</p>";
cout << "<table align=\"center\">";
int step_size = 1;
if (m_mismatch.size() > max_mismatch)
step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
for(int i=0;i<m_mismatch.size();i+=step_size)
m_mismatch[i]->PrintClippedHTML( &cout, 160 );
cout << "</table>";
}
}
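The step_size logic above keeps both tables bounded: ceiling division spreads at most max_mismatch (= m_max_pp/3) rows evenly over the whole list. Worked example with assumed numbers: 25 unaligned phrases and max_mismatch = 10 give step_size = (25 + 10 - 1) / 10 = 3, so entries 0, 3, 6, ..., 24 are printed, nine rows in total.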

View File

@ -3,6 +3,7 @@
#include "TargetCorpus.h"
#include "Alignment.h"
#include "PhrasePair.h"
#include "Mismatch.h"
#pragma once
@ -16,6 +17,7 @@ private:
TargetCorpus *m_targetCorpus;
Alignment *m_alignment;
vector< vector<PhrasePair*> > m_collection;
vector< Mismatch* > m_mismatch, m_unaligned;
int m_size;
int m_max_lookup;
int m_max_pp_target;

View File

@ -0,0 +1,123 @@
/*
base64.cpp and base64.h
Copyright (C) 2004-2008 René Nyffenegger
This source code is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this source code must not be misrepresented; you must not
claim that you wrote the original source code. If you use this source code
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original source code.
3. This notice may not be removed or altered from any source distribution.
René Nyffenegger rene.nyffenegger@adp-gmbh.ch
*/
#include "base64.h"
#include <iostream>
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(unsigned char c) {
return (isalnum(c) || (c == '+') || (c == '/'));
}
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
std::string ret;
int i = 0;
int j = 0;
unsigned char char_array_3[3];
unsigned char char_array_4[4];
while (in_len--) {
char_array_3[i++] = *(bytes_to_encode++);
if (i == 3) {
char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
char_array_4[3] = char_array_3[2] & 0x3f;
for(i = 0; (i <4) ; i++)
ret += base64_chars[char_array_4[i]];
i = 0;
}
}
if (i)
{
for(j = i; j < 3; j++)
char_array_3[j] = '\0';
char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
char_array_4[3] = char_array_3[2] & 0x3f;
for (j = 0; (j < i + 1); j++)
ret += base64_chars[char_array_4[j]];
while((i++ < 3))
ret += '=';
}
return ret;
}
std::string base64_decode(std::string const& encoded_string) {
int in_len = encoded_string.size();
int i = 0;
int j = 0;
int in_ = 0;
unsigned char char_array_4[4], char_array_3[3];
std::string ret;
while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
char_array_4[i++] = encoded_string[in_]; in_++;
if (i ==4) {
for (i = 0; i <4; i++)
char_array_4[i] = base64_chars.find(char_array_4[i]);
char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
ret += char_array_3[i];
i = 0;
}
}
if (i) {
for (j = i; j <4; j++)
char_array_4[j] = 0;
for (j = 0; j <4; j++)
char_array_4[j] = base64_chars.find(char_array_4[j]);
char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++) ret += char_array_3[j];
}
return ret;
}

View File

@ -0,0 +1,4 @@
#include <string>
std::string base64_encode(unsigned char const* , unsigned int len);
std::string base64_decode(std::string const& s);
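base64 is used here to pass arbitrary query strings (spaces, quotes, non-ASCII bytes) safely through URLs and shell arguments between the web frontend and the biconcor binary. A minimal round-trip sketch, not part of the commit:
#include <iostream>
#include "base64.h"
int main() {
  std::string query = "ein Beispiel";
  std::string encoded = base64_encode(
      reinterpret_cast<const unsigned char*>(query.c_str()), query.size());
  std::cout << encoded << "\n";                 // prints ZWluIEJlaXNwaWVs
  std::cout << base64_decode(encoded) << "\n";  // prints ein Beispiel
  return 0;
}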

View File

@ -3,6 +3,7 @@
#include "Alignment.h"
#include "PhrasePairCollection.h"
#include <getopt.h>
#include "base64.h"
using namespace std;
@ -32,7 +33,7 @@ int main(int argc, char* argv[])
{0, 0, 0, 0}
};
int option_index = 0;
int c = getopt_long (argc, argv, "l:s:c:q:t:a:h", long_options, &option_index);
int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:h", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 'l':
@ -53,6 +54,10 @@ int main(int argc, char* argv[])
fileNameSource = string(optarg);
createFlag = true;
break;
case 'Q':
query = base64_decode(string(optarg));
queryFlag = true;
break;
case 'q':
query = string(optarg);
queryFlag = true;
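The new -Q option is the consumer of this: it takes the query base64-encoded and decodes it before lookup, so the web frontend can hand over phrases containing spaces or shell-unsafe characters without quoting trouble (the PHP change further down switches from escapeshellarg with -q to base64_encode with -Q). Illustrative invocation, with an assumed model path and the encoding of "ein Beispiel":
./biconcor -l model/biconcor.5 -Q ZWluIEJlaXNwaWVs
behaves like ./biconcor -l model/biconcor.5 -q "ein Beispiel".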

View File

@ -370,14 +370,14 @@ build-generation-custom
ignore-unless: AND generation-factors generation-corpus
default-name: model/generation-table
create-config
in: reordering-table phrase-translation-table generation-table LM:binlm biconcor-model
in: reordering-table phrase-translation-table generation-table LM:binlm
out: config
ignore-if: use-hiero INTERPOLATED-LM:script
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings
default-name: model/moses.ini
error: Unknown option
create-config-interpolated-lm
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm biconcor-model
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm
out: config
ignore-if: use-hiero
ignore-unless: INTERPOLATED-LM:script
@ -777,6 +777,6 @@ analysis-precision
[REPORTING] single
report
in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec
in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model
out: report
default-name: evaluation/report

View File

@ -1730,7 +1730,6 @@ sub define_training_create_config {
my ($config,
$reordering_table,$phrase_translation_table,$generation_table,@LM)
= &get_output_and_input($step_id);
if ($LM[$#LM] =~ /biconcor/ || $LM[$#LM] eq '') { pop @LM; }
my $cmd = &get_training_setting(9);

View File

@ -1,6 +1,6 @@
<?php
// main page frame, triggers the loading of parts
# main page frame, triggers the loading of parts
function show_analysis() {
global $task,$user,$setup,$id,$set;
global $dir;
@ -8,13 +8,14 @@ function show_analysis() {
head("Analysis: $task ($user), Set $set, Run $id");
?><script>
function show(field,sort,count) {
function show(field,sort,count,filter) {
var url = '?analysis=' + field + '_show'
+ '&setup=<?php print $setup ?>'
+ '&id=<?php print $id ?>'
+ '&set=<?php print $set ?>'
+ '&sort=' + sort
+ '&count=' + count;
+ '&count=' + count
+ '&filter=' + filter;
new Ajax.Updater(field, url, { method: 'get', evalScripts: true });
}
function ngram_show(type,order,count,sort,smooth) {
@ -61,7 +62,7 @@ function hide_word_info(sentence) {
function show_biconcor(sentence,phrase) {
var div = "biconcor-"+sentence;
var url = '?analysis=biconcor'
+ '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$id); ?>&set=<?php print $set ?>'
+ '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$set,$id); ?>&set=<?php print $set ?>'
+ '&sentence=' + sentence
+ '&phrase=' + encodeURIComponent(phrase);
document.getElementById(div).innerHTML = "<center><img src=\"spinner.gif\" width=48 height=48></center>";
@ -83,7 +84,7 @@ function close_biconcor(sentence) {
<div id="PrecisionRecallDetails"></div>
<div id="bleu">(loading...)</div>
<script language="javascript">
show('bleu','',5);
show('bleu','',5,'');
</script>
</body></html>
<?php
@ -93,13 +94,14 @@ function precision_by_coverage() {
global $experiment,$evalset,$dir,$set,$id;
$img_width = 1000;
print "<h3>Precision by Coverage</h3>";
print "<h3>Precision of Input Words by Coverage</h3>";
print "The graphs display what ratio of words of a specific type are translated correctly (yellow), and what ratio is deleted (blue).";
print " The extend of the boxes is scaled on the x-axis by the number of tokens of the displayed type.";
// load data
$data = file("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage");
$data = file(get_current_analysis_filename("precision","precision-by-corpus-coverage"));
$total = 0;
$log_info = array();
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$info[$item[0]]["precision"] = $item[1];
@ -125,8 +127,8 @@ function precision_by_coverage() {
print "<h4>By log<sub>2</sub>-count in the training corpus</h4>";
precision_by_coverage_graph("byCoverage",$log_info,$total,$img_width,SORT_NUMERIC);
// load factored data
$d = dir("$dir/evaluation/$set.analysis.$id");
# load factored data
$d = dir("$dir/evaluation/$set.analysis.".get_precision_analysis_version($dir,$set,$id));
while (false !== ($file = $d->read())) {
if (preg_match('/precision-by-corpus-coverage.(.+)$/',$file, $match)) {
precision_by_coverage_factored($img_width,$total,$file,$match[1]);
@ -136,7 +138,7 @@ function precision_by_coverage() {
function precision_by_coverage_factored($img_width,$total,$file,$factor_id) {
global $dir,$set,$id;
$data = file("$dir/evaluation/$set.analysis.$id/$file");
$data = file(get_current_analysis_filename("precision",$file));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$factor = $item[0];
@ -187,7 +189,7 @@ function precision_by_word($type) {
$byFactor = $match[1];
}
$data = file("$dir/evaluation/$set.analysis.$id/precision-by-input-word");
$data = file(get_current_analysis_filename("precision","precision-by-input-word"));
for($i=0;$i<count($data);$i++) {
$line = rtrim($data[$i]);
$item = split("\t",$line);
@ -204,8 +206,7 @@ function precision_by_word($type) {
//# filter for factor
$word = $item[5];
$factor = $item[6];
if ($byFactor != "false" && $byFactor != $factor) {
if ($byFactor != "false" && $byFactor != $item[6]) {
continue;
}
@ -218,7 +219,7 @@ function precision_by_word($type) {
print "<table border=1><tr><td align=center>Count</td><td align=center colspan=2>Precision</td><td align=center colspan=2>Delete</td><td align=center>Length</td></tr>\n";
foreach ($info as $word => $wordinfo) {
print "<tr><td align=center>$word</td>";
print "<tr><td align=center><a href=\"javascript:show('bleu','order',5,'".base64_encode($word)."')\">$word</a></td>";
printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%.1f/%d</font></td>",$wordinfo["precision"]/$wordinfo["total"]*100,"%",$wordinfo["precision"],$wordinfo["total"]);
printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%d/%d</font></td>",$wordinfo["delete"]/$wordinfo["total"]*100,"%",$wordinfo["delete"],$wordinfo["total"]);
printf("<td align=right>%.3f</td>",$wordinfo["length"]/$wordinfo["total"]);
@ -361,7 +362,7 @@ ctx.font = '9px serif';
print "</script>";
}
// stats on precision and recall
//# stats on precision and recall
function precision_recall_details() {
?>
<table width=100%>
@ -389,20 +390,20 @@ ngram_show('recall',4,5,'',0);
<?php
}
// stats on ngram precision
//# stats on ngram precision
function ngram_summary() {
global $experiment,$evalset,$dir,$set,$id;
// load data
$data = file("$dir/evaluation/$set.analysis.$id/summary");
//# load data
$data = file(get_current_analysis_filename("basic","summary"));
for($i=0;$i<count($data);$i++) {
$item = split(": ",$data[$i]);
$info[$item[0]] = $item[1];
}
print "<table cellspacing=5 width=100%><tr><td valign=top align=center bgcolor=#eeeeee>";
//foreach (array("precision","recall") as $type) {
print "<b>Precision</b>\n";
//#foreach (array("precision","recall") as $type) {
print "<b>Precision of Output</b>\n";
$type = "precision";
print "<table><tr><td>$type</td><td>1-gram</td><td>2-gram</td><td>3-gram</td><td>4-gram</td></tr>\n";
printf("<tr><td>correct</td><td>%d</td><td>%d</td><td>%d</td><td>%d</td></tr>\n",
@ -424,8 +425,8 @@ function ngram_summary() {
//}
print "<A HREF=\"javascript:generic_show('PrecisionRecallDetails','')\">details</A> ";
if (file_exists("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage")) {
print "| <A HREF=\"javascript:generic_show('PrecisionByCoverage','')\">breakdown by coverage</A> ";
if (file_exists(get_current_analysis_filename("precision","precision-by-corpus-coverage"))) {
print "| <A HREF=\"javascript:generic_show('PrecisionByCoverage','')\">precision of input by coverage</A> ";
}
print "</td><td valign=top valign=top align=center bgcolor=#eeeeee>";
@ -445,8 +446,7 @@ function ngram_summary() {
printf("<p>length-diff: %d (%.1f%s)",$info["precision-1-total"]-$info["recall-1-total"],($info["precision-1-total"]-$info["recall-1-total"])/$info["recall-1-total"]*100,"%");
// coverage
$coverage_id = get_coverage_analysis_version($dir,$set,$id);
if (file_exists("$dir/evaluation/$set.analysis.$coverage_id/corpus-coverage-summary")) {
if (file_exists(get_current_analysis_filename("coverage","corpus-coverage-summary"))) {
print "</td><td valign=top align=center bgcolor=#eeeeee>";
print "<div id=\"CoverageSummary\">";
coverage_summary();
@ -454,8 +454,8 @@ function ngram_summary() {
}
// phrase segmentation
if (file_exists("$dir/evaluation/$set.analysis.$id/segmentation") ||
file_exists("$dir/evaluation/$set.analysis.$id/rule")) {
if (file_exists(get_current_analysis_filename("basic","segmentation")) ||
file_exists(get_current_analysis_filename("basic","rule"))) {
print "</td><td valign=top align=center bgcolor=#eeeeee>";
print "<div id=\"SegmentationSummary\">";
segmentation_summary();
@ -463,7 +463,7 @@ function ngram_summary() {
}
// rules
if (file_exists("$dir/evaluation/$set.analysis.$id/rule")) {
if (file_exists(get_current_analysis_filename("basic","rule"))) {
print "</td><td valign=top align=center bgcolor=#eeeeee>";
print "<div id=\"RuleSummary\">";
rule_summary();
@ -479,7 +479,7 @@ function ngram_show($type) {
// load data
$order = $_GET['order'];
$data = file("$dir/evaluation/$set.analysis.$id/n-gram-$type.$order");
$data = file(get_current_analysis_filename("basic","n-gram-$type.$order"));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$line["total"] = $item[0];
@ -572,7 +572,7 @@ function coverage_details() {
$total[$corpus][$b][$i] = 0;
}
}
$data = file(filename_fallback_to_factored("$dir/evaluation/$set.analysis.$id/$corpus-coverage-summary"));
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","$corpus-coverage-summary")));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
if ($item[1]>5) {
@ -614,7 +614,7 @@ function coverage_details() {
}
print "</tr></table>\n";
$data = file(filename_fallback_to_factored("$dir/evaluation/$set.analysis.$id/ttable-unknown"));
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","ttable-unknown")));
for($i=0;$i<count($data);$i++) {
list($word,$count) = split("\t",$data[$i]);
$item["word"] = $word;
@ -678,8 +678,7 @@ function filename_fallback_to_factored($file) {
function factor_name($input_output,$factor_id) {
global $dir,$set,$id;
$coverage_id = get_coverage_analysis_version($dir,$set,$id);
$file = "$dir/evaluation/$set.analysis.$coverage_id/factor-names";
$file = get_current_analysis_filename("coverage","factor-names");
if (!file_exists($file)) {
return $factor_id;
}
@ -703,8 +702,7 @@ function coverage_summary() {
}
$total[$corpus][$b] = 0;
}
$coverage_id = get_coverage_analysis_version($dir,$set,$id);
$data = file(filename_fallback_to_factored("$dir/evaluation/$set.analysis.$coverage_id/$corpus-coverage-summary"));
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","$corpus-coverage-summary")));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
if ($item[0] == 1) {
@ -768,8 +766,9 @@ function segmentation_summary() {
}
$total = 0;
if (file_exists("$dir/evaluation/$set.analysis.$id/segmentation")) {
$data = file("$dir/evaluation/$set.analysis.$id/segmentation");
$file = get_current_analysis_filename("basic","segmentation");
if (file_exists($file)) {
$data = file($file);
for($i=0;$i<count($data);$i++) {
list($in,$out,$c) = split("\t",$data[$i]);
if ($by == "word") { $c *= $in; }
@ -780,9 +779,12 @@ function segmentation_summary() {
}
}
else {
$data = file("$dir/evaluation/$set.analysis.$id/rule");
$data = file(get_current_analysis_filename("basic","rule"));
for($i=0;$i<count($data);$i++) {
list($type,$rule,$c) = split("\t",$data[$i]);
$field = split("\t",$data[$i]);
$type = $field[0];
$rule = $field[1];
if (count($field) > 2) { $c = $field[2]; } else { $c = 0; }
if ($type == "rule") {
list($rule_in,$in,$nt,$rule_out,$out) = split(":",$rule);
if ($by == "word") { $c *= $in; }
@ -822,9 +824,14 @@ function segmentation_summary() {
// hierarchical rules used in translation
function rule_summary() {
global $dir,$set,$id;
$data = file("$dir/evaluation/$set.analysis.$id/rule");
$data = file(get_current_analysis_filename("basic","rule"));
$rule = array(); $count = array(); $count_nt = array(); $count_w = array();
$nt_count = 0; $total = 0;
foreach ($data as $item) {
list($type,$d,$d2) = split("\t",$item);
$field = split("\t",$item);
$type = $field[0];
$d = $field[1];
if (count($field) > 2) { $d2 = $field[2]; } else { $d2 = 0; }
if ($type == "sentence-count") {
$sentence_count = $d;
}
@ -843,12 +850,16 @@ function rule_summary() {
$rule_out = preg_replace("/b/","y",$rule_out);
$rule_out = preg_replace("/c/","z",$rule_out);
$nt_count += $d2 * $nt;
if (!array_key_exists($d,$rule)) { $rule[$d] = 0; }
$rule[$d] += $d2;
if (!array_key_exists($nt,$count)) { $count[$nt] = 0; }
$count[$nt] += $d2;
$just_nt = preg_replace("/\d/","",$rule_in)."-".preg_replace("/\d/","",$rule_out);
$no_wc = preg_replace("/\d/","W",$rule_in)."-".preg_replace("/\d/","",$rule_out);
if ($just_nt == "-") { $just_nt = "lexical"; }
if (!array_key_exists($just_nt,$count_nt)) { $count_nt[$just_nt] = 0; }
$count_nt[$just_nt] += $d2;
if (!array_key_exists($no_wc,$count_w)) { $count_w[$no_wc] = 0; }
$count_w[$no_wc] += $d2;
$total += $d2;
}
@ -866,108 +877,189 @@ function rule_summary() {
// annotated sentences, navigation
function bleu_show() {
$count = $_GET['count'];
if ($count == 0) { $count = 5; }
print "<b>annotated sentences</b><br><font size=-1>sorted by ";
if ($_GET['sort'] == "order" || $_GET['sort'] == "") {
print "order ";
}
else {
print "<A HREF=\"javascript:show('bleu','order',$count)\">order</A> ";
}
if ($_GET['sort'] == "best") {
print "order ";
}
else {
print "<A HREF=\"javascript:show('bleu','best',$count)\">best</A> ";
}
if ($_GET['sort'] == "worst") {
print "order ";
}
else {
print "<A HREF=\"javascript:show('bleu','worst',$count)\">worst</A> ";
}
#print "display <A HREF=\"\">fullscreen</A> ";
$count = $_GET['count'];
if ($count == 0) { $count = 5; }
print "showing $count ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count)\">more</A> ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999)\">all</A> ";
print "</font><BR>\n";
$filter = "";
if (array_key_exists("filter",$_GET)) {
$filter = base64_decode($_GET['filter']);
}
sentence_annotation();
print "<p align=center><A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count)\">5 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',10+$count)\">10 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',20+$count)\">20 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',50+$count)\">50 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',100+$count)\">100 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999)\">all</A> ";
print "<b>annotated sentences</b><br><font size=-1>sorted by: ";
if ($_GET['sort'] == "order" || $_GET['sort'] == "") { print "order "; }
else {
print "<A HREF=\"javascript:show('bleu','order',$count,'".base64_encode($filter)."')\">order</A> ";
}
if ($_GET['sort'] == "best") { print "best "; }
else {
print "<A HREF=\"javascript:show('bleu','best',$count,'".base64_encode($filter)."')\">best</A> ";
}
if ($_GET['sort'] == "25") { print "25% "; }
else {
print "<A HREF=\"javascript:show('bleu','25',$count,'".base64_encode($filter)."')\">25%</A> ";
}
if ($_GET['sort'] == "avg") { print "avg "; }
else {
print "<A HREF=\"javascript:show('bleu','avg',$count,'".base64_encode($filter)."')\">avg</A> ";
}
if ($_GET['sort'] == "75") { print "75% "; }
else {
print "<A HREF=\"javascript:show('bleu','75',$count,'".base64_encode($filter)."')\">75%</A> ";
}
if ($_GET['sort'] == "worst") { print "worst; "; }
else {
print "<A HREF=\"javascript:show('bleu','worst',$count,'".base64_encode($filter)."')\">worst</A>; ";
}
print "showing: $count ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count,'".base64_encode($filter)."')\">more</A> ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999,'".base64_encode($filter)."')\">all</A>";
if ($filter != "") {
print "; filter: '$filter'";
}
sentence_annotation($count,$filter);
print "<p align=center><A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count,'".base64_encode($filter)."')\">5 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',10+$count,'".base64_encode($filter)."')\">10 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',20+$count,'".base64_encode($filter)."')\">20 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',50+$count,'".base64_encode($filter)."')\">50 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',100+$count,'".base64_encode($filter)."')\">100 more</A> | ";
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999,'".base64_encode($filter)."')\">all</A> ";
}
// annotated sentences core: reads data, sorts sentences, displays them
function sentence_annotation() {
function sentence_annotation($count,$filter) {
global $set,$id,$dir,$biconcor;
// load data
$data = file("$dir/evaluation/$set.analysis.$id/bleu-annotation");
# get input
$filtered = array();
$file = get_current_analysis_filename("coverage","input-annotation");
if (file_exists($file)) {
$input = file($file);
# filter if so specified
if ($filter != "") {
for($i=0;$i<count($input);$i++) {
$item = explode("\t",$input[$i]);
$word = explode(" ",$item[0]);
$keep = 0;
for($j=0;$j<count($word);$j++) {
if ($word[$j] == $filter) {
$keep = 1;
}
}
if (!$keep) { $filtered[$i] = 1; }
}
}
}
# load bleu scores
$data = file(get_current_analysis_filename("basic","bleu-annotation"));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$line["bleu"] = $item[0];
$line["id"] = $item[1];
$line["system"] = $item[2];
$line["reference"] = "";
for($j=3;$j<count($item);$j++) {
if ($j>3) { $line["reference"] .= "<br>"; };
$line["reference"] .= $item[$j];
}
$bleu[] = $line;
$item = split("\t",$data[$i]);
if (! array_key_exists($item[1],$filtered)) {
$line["bleu"] = $item[0];
$line["id"] = $item[1];
$line["system"] = $item[2];
$line["reference"] = "";
for($j=3;$j<count($item);$j++) {
if ($j>3) { $line["reference"] .= "<br>"; };
$line["reference"] .= $item[$j];
}
$bleu[] = $line;
}
}
$coverage_id = get_coverage_analysis_version($dir,$set,$id);
if (file_exists("$dir/evaluation/$set.analysis.$coverage_id/input-annotation")) {
$input = file("$dir/evaluation/$set.analysis.$coverage_id/input-annotation");
# sort and label additional sentences as filtered
global $sort;
function cmp($a, $b) {
global $sort;
if ($sort == "order") {
$a_idx = $a["id"];
$b_idx = $b["id"];
}
else if ($sort == "worst" || $sort == "75") {
$a_idx = $a["bleu"];
$b_idx = $b["bleu"];
if ($a_idx == $b_idx) {
$a_idx = $b["id"];
$b_idx = $a["id"];
}
}
else if ($sort == "best" || $sort == "avg" || $sort == "25") {
$a_idx = -$a["bleu"];
$b_idx = -$b["bleu"];
if ($a_idx == $b_idx) {
$a_idx = $a["id"];
$b_idx = $b["id"];
}
}
if ($a_idx == $b_idx) {
return 0;
}
return ($a_idx < $b_idx) ? -1 : 1;
}
$sort = $_GET['sort'];
if ($sort == '') {
$sort = "order";
}
usort($bleu, 'cmp');
$offset = 0;
if ($sort == "25" || $sort == "75") {
$offset = (int) (count($bleu)/4);
}
else if ($sort == "avg") {
$offset = (int) (count($bleu)/2);
}
if (file_exists("$dir/evaluation/$set.analysis.$id/segmentation-annotation")) {
$data = file("$dir/evaluation/$set.analysis.$id/segmentation-annotation");
for($i=0;$i<count($data);$i++) {
$segment = 0;
foreach (split(" ",$data[$i]) as $item) {
list($in_start,$in_end,$out_start,$out_end) = split(":",$item);
$segment++;
$segmentation[$i]["input_start"][$in_start] = $segment;
$segmentation[$i]["input_end"][$in_end] = $segment;
$segmentation[$i]["output_start"][$out_start] = $segment;
$segmentation[$i]["output_end"][$out_end+0] = $segment;
$retained = array();
for($i=$offset;$i<$count+$offset && $i<count($bleu);$i++) {
$line = $bleu[$i];
$retained[$line["id"]] = 1;
}
# get segmentation (phrase alignment)
$file = get_current_analysis_filename("basic","segmentation-annotation");
if (file_exists($file)) {
$data = file($file);
for($i=0;$i<count($data);$i++) {
if ($filter == "" || array_key_exists($i,$retained)) {
$segment = 0;
foreach (split(" ",$data[$i]) as $item) {
list($in_start,$in_end,$out_start,$out_end) = split(":",$item);
$segment++;
$segmentation[$i]["input_start"][$in_start] = $segment;
$segmentation[$i]["input_end"][$in_end] = $segment;
$segmentation[$i]["output_start"][$out_start] = $segment;
$segmentation[$i]["output_end"][$out_end+0] = $segment;
}
}
}
}
// hierarchical data
# get hierarchical data
$hierarchical = 0;
if (file_exists("$dir/evaluation/$set.analysis.$id/input-tree")) {
$data = file("$dir/evaluation/$set.analysis.$id/input-tree");
$span = 0;
$last_sentence = -1;
$nt_count = array();
for($i=0;$i<count($data);$i++) {
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
if ($sentence != $last_sentence) { $span = 0; }
$last_sentence = $sentence;
$segmentation[$sentence][$span]["brackets"] = $brackets;
$file = get_current_analysis_filename("basic","input-tree");
if (file_exists($file)) {
$data = file($file);
$span = 0;
$last_sentence = -1;
$nt_count = array();
for($i=0;$i<count($data);$i++) {
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
if ($sentence != $last_sentence) { $span = 0; }
$last_sentence = $sentence;
if ($filter == "" || array_key_exists($sentence,$retained)) {
$segmentation[$sentence][$span]["brackets"] = $brackets;
# $segmentation[$sentence][$span]["nt"] = $nt;
$segmentation[$sentence][$span]["words"] = rtrim($words);
if ($nt != "") { $nt_count[$nt]++; }
$span++;
$segmentation[$sentence][$span]["words"] = rtrim($words);
if ($nt != "") { $nt_count[$nt]=1; }
$span++;
}
$hierarchical = 1;
}
$hierarchical = 1;
# if (count($nt_count) <= 2) {
# foreach ($segmentation as $sentence => $segmentation_span) {
# foreach ($segmentation_span as $span => $type) {
@ -976,108 +1068,78 @@ function sentence_annotation() {
# }
# }
}
if (file_exists("$dir/evaluation/$set.analysis.$id/output-tree")) {
$data = file("$dir/evaluation/$set.analysis.$id/output-tree");
$span = 0;
$last_sentence = -1;
$nt_count = array();
for($i=0;$i<count($data);$i++) {
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
if ($sentence != $last_sentence) { $span = 0; }
$last_sentence = $sentence;
$segmentation_out[$sentence][$span]["brackets"] = $brackets;
$segmentation_out[$sentence][$span]["nt"] = $nt;
$segmentation_out[$sentence][$span]["words"] = rtrim($words);
if ($nt != "") { $nt_count[$nt]++; }
$span++;
$file = get_current_analysis_filename("basic","output-tree");
if (file_exists($file)) {
$data = file($file);
$span = 0;
$last_sentence = -1;
$nt_count = array();
for($i=0;$i<count($data);$i++) {
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
if ($sentence != $last_sentence) { $span = 0; }
$last_sentence = $sentence;
if ($filter == "" || array_key_exists($sentence,$retained)) {
$segmentation_out[$sentence][$span]["brackets"] = $brackets;
$segmentation_out[$sentence][$span]["nt"] = $nt;
$segmentation_out[$sentence][$span]["words"] = rtrim($words);
if ($nt != "") { $nt_count[$nt]=1; }
$span++;
}
if (count($nt_count) <= 2) {
foreach ($segmentation_out as $sentence => $segmentation_span) {
foreach ($segmentation_span as $span => $type) {
$segmentation_out[$sentence][$span]["nt"]="";
}
}
}
# no non-terminal markup, if there are two or less non-terminals (X,S)
if (count($nt_count) <= 2) {
foreach ($segmentation_out as $sentence => $segmentation_span) {
foreach ($segmentation_span as $span => $type) {
$segmentation_out[$sentence][$span]["nt"]="";
}
}
}
}
if (file_exists("$dir/evaluation/$set.analysis.$id/node")) {
$data = file("$dir/evaluation/$set.analysis.$id/node");
$n = 0;
$last_sentence = -1;
for($i=0;$i<count($data);$i++) {
list($sentence,$depth,$start_div,$end_div,$start_div_in,$end_div_in,$children) = split(" ",$data[$i]);
if ($sentence != $last_sentence) { $n = 0; }
$last_sentence = $sentence;
$node[$sentence][$n]['depth'] = $depth;
$node[$sentence][$n]['start_div'] = $start_div;
$node[$sentence][$n]['end_div'] = $end_div;
$node[$sentence][$n]['start_div_in'] = $start_div_in;
$node[$sentence][$n]['end_div_in'] = $end_div_in;
$node[$sentence][$n]['children'] = rtrim($children);
$n++;
$file = get_current_analysis_filename("basic","node");
if (file_exists($file)) {
$data = file($file);
$n = 0;
$last_sentence = -1;
for($i=0;$i<count($data);$i++) {
list($sentence,$depth,$start_div,$end_div,$start_div_in,$end_div_in,$children) = split(" ",$data[$i]);
if ($sentence != $last_sentence) { $n = 0; }
$last_sentence = $sentence;
if ($filter == "" || array_key_exists($sentence,$retained)) {
$node[$sentence][$n]['depth'] = $depth;
$node[$sentence][$n]['start_div'] = $start_div;
$node[$sentence][$n]['end_div'] = $end_div;
$node[$sentence][$n]['start_div_in'] = $start_div_in;
$node[$sentence][$n]['end_div_in'] = $end_div_in;
$node[$sentence][$n]['children'] = rtrim($children);
$n++;
}
}
}
$biconcor = get_biconcor_version($dir,$id);
// sort
global $sort;
$sort = $_GET['sort'];
if ($sort == '') {
$sort = "order";
# display
if ($filter != "") {
print " (".(count($input)-count($filtered))." retaining)";
}
function cmp($a, $b) {
global $sort;
if ($sort == "order") {
$a_idx = $a["id"];
$b_idx = $b["id"];
}
else if ($sort == "worst") {
$a_idx = $a["bleu"];
$b_idx = $b["bleu"];
if ($a_idx == $b_idx) {
$a_idx = $b["id"];
$b_idx = $a["id"];
}
}
else if ($sort == "best") {
$a_idx = -$a["bleu"];
$b_idx = -$b["bleu"];
if ($a_idx == $b_idx) {
$a_idx = $a["id"];
$b_idx = $b["id"];
}
}
print "</font><BR>\n";
if ($a_idx == $b_idx) {
return 0;
}
return ($a_idx < $b_idx) ? -1 : 1;
}
usort($bleu, 'cmp');
$count = $_GET['count'];
if ($count == 0) { $count = 5; }
// display
//print "<div id=\"debug\"></div>";
for($i=0;$i<$count && $i<count($bleu);$i++) {
$biconcor = get_biconcor_version($dir,$set,$id);
//print "<div id=\"debug\">$sort / $offset</div>";
for($i=$offset;$i<$count+$offset && $i<count($bleu);$i++) {
$line = $bleu[$i];
if ($hierarchical) {
annotation_hierarchical($line["id"],$segmentation[$line["id"]],$segmentation_out[$line["id"]],$node[$line["id"]]);
}
if ($input) {
print "<div id=\"info-$i\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">8364 occ. in corpus, 56 translations, entropy: 5.54</div>\n";
print "<div id=\"info-".$line["id"]."\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">0 occ. in corpus, 0 translations, entropy: 0.00</div>\n";
if ($biconcor) {
//print "<div id=\"biconcor-$i\" style=\"display: none;\">xxx</div>";
print "<div id=\"biconcor-$i\" class=\"biconcor\">xxx</div>";
print "<div id=\"biconcor-".$line["id"]."\" class=\"biconcor\"><font size=-2>(click on input phrase for bilingual concordancer)</font></div>";
}
if ($hierarchical) {
sentence_annotation_hierarchical("#".$line["id"],$line["id"],$input[$line["id"]],$segmentation[$line["id"]],"in");
}
else {
print "<font size=-2>[#".$line["id"]."]</font> ";
input_annotation($line["id"],$input[$line["id"]],$segmentation[$line["id"]]);
input_annotation($line["id"],$input[$line["id"]],$segmentation[$line["id"]],$filter);
}
}
//else {
@ -1099,19 +1161,20 @@ function coverage($coverage_vector) {
$coverage = array();
foreach (split(" ",$coverage_vector) as $item) {
if (preg_match("/[\-:]/",$item)) {
list($from,$to,$corpus_count,$ttable_count,$ttable_entropy) = preg_split("/[\-:]/",$item);
$coverage[$from][$to]["corpus_count"] = $corpus_count;
$coverage[$from][$to]["ttable_count"] = $ttable_count;
$coverage[$from][$to]["ttable_entropy"] = $ttable_entropy;
$field = preg_split("/[\-:]/",$item);
$from = $field[0];
$to = $field[1];
if (count($field)>2){ $coverage[$from][$to]["corpus_count"]=$field[2]; }
if (count($field)>3){ $coverage[$from][$to]["ttable_count"]=$field[3]; }
if (count($field)>4){ $coverage[$from][$to]["ttable_entropy"]=$field[4]; }
}
}
$word = split(" ",$words);
return $coverage;
}
// annotate an inpute sentence
function input_annotation($sentence,$input,$segmentation) {
function input_annotation($sentence,$input,$segmentation,$filter) {
global $biconcor;
list($words,$coverage_vector) = split("\t",$input);
@ -1187,10 +1250,10 @@ function input_annotation($sentence,$input,$segmentation) {
for($j=$from;$j<=$to;$j++) {
if ($j>$from) { $phrase .= " "; }
$phrase .= $word[$j];
$highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
$lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
$highlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='#ffff80';";
$lowlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
}
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($phrase)."');\"":"").">";
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($phrase)."');\"":"").">";
}
print "</div></td>";
$from += $size-1;
@ -1218,7 +1281,14 @@ function input_annotation($sentence,$input,$segmentation) {
$color = '#ffffff';
$cc = 0; $tc = 0; $te = 0;
}
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($word[$j])."');\"":"").">$word[$j]</span>";
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($word[$j])."');\"":"").">";
if ($word[$j] == $filter) {
print "<b><font color=#ff0000>".$word[$j]."</font></b>";
}
else {
print $word[$j];
}
print "</span>";
if ($segmentation && array_key_exists($j,$segmentation["input_end"])) {
print "</span>";
}
@ -1295,7 +1365,7 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
function sentence_annotation_hierarchical($info,$sentence,$sequence,$segmentation,$in_out) {
$In_Out = $in_out == "out" ? "Out" : "In";
list($words,$coverage_vector) = split("\t",$input);
#list($words,$coverage_vector) = split("\t",$input);
$coverage = coverage($sequence);
$word = preg_split("/\s/",$sequence);
@ -1322,7 +1392,8 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
$words = $segmentation[$span]["words"];
# non terminal
if ($segmentation[$span]["nt"]) {
if (array_key_exists("nt",$segmentation[$span]) &&
$segmentation[$span]["nt"] != "") {
print $segmentation[$span]["nt"].": ";
}
@ -1359,16 +1430,16 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
function biconcor($query) {
global $set,$id,$dir;
$sentence = $_GET['sentence'];
$biconcor = get_biconcor_version($dir,$id);
$biconcor = get_biconcor_version($dir,$set,$id);
print "<center>
<form action=\"...\" method=get>
<form method=get id=\"BiconcorForm\">
<img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\">
<input width=20 value=\"$query\">
<input type=submit value=\"look up\">
<input width=20 id=\"BiconcorQuery\" value=\"$query\">
<input type=submit onclick=\"show_biconcor($sentence,encodeBase64(document.getElementById('BiconcorQuery').value));\" value=\"look up\">
</form>
<div class=\"biconcor-content\">";
$cmd = "./biconcor -l $dir/model/biconcor.$biconcor -q ".escapeshellarg($query)." 2>/dev/null";
# print $cmd."<p>";
$cmd = "./biconcor -l $dir/model/biconcor.$biconcor -Q ".base64_encode($query)." 2>/dev/null";
#print $cmd."<p>";
system($cmd);
# print "<p>done.";
print "</div></center>";

View File

@ -73,8 +73,9 @@ function precision_by_coverage_diff() {
print "The graphs display what ratio of words of a specific type are translated correctly (yellow), and what ratio is deleted (blue).";
print " The extend of the boxes is scaled on the x-axis by the number of tokens of the displayed type.";
// load data
$data = file("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage");
$data = file(get_current_analysis_filename2("precision","precision-by-corpus-coverage"));
$total = 0;
$log_info = array();
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$info[$item[0]]["precision"] = $item[1];
@ -100,7 +101,7 @@ function precision_by_coverage_diff() {
$log_info_new = $log_info;
// load base data
$data = file("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage");
$data = file(get_current_analysis_filename("precision","precision-by-corpus-coverage"));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$info[$item[0]]["precision"] -= $item[1];
@ -119,10 +120,10 @@ function precision_by_coverage_diff() {
precision_by_coverage_diff_graph("byCoverage",$log_info,$log_info_new,$total,$img_width,SORT_NUMERIC);
// load factored data
$d = dir("$dir/evaluation/$set.analysis.$id");
$d = dir("$dir/evaluation/$set.analysis.".get_precision_analysis_version($dir,$set,$id));
while (false !== ($file = $d->read())) {
if (preg_match('/precision-by-corpus-coverage.(.+)$/',$file, $match) &&
file_exists("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage.$match[1]")) {
file_exists(get_current_analysis_filename2("precision","precision-by-corpus-coverage.$match[1]"))) {
precision_by_coverage_diff_factored($img_width,$total,$file,$match[1]);
}
}
@ -130,7 +131,7 @@ function precision_by_coverage_diff() {
function precision_by_coverage_diff_factored($img_width,$total,$file,$factor_id) {
global $dir,$set,$id,$id2;
$data = file("$dir/evaluation/$set.analysis.$id2/$file");
$data = file(get_current_analysis_filename2("precision",$file));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$factor = $item[0];
@ -158,7 +159,7 @@ function precision_by_coverage_diff_factored($img_width,$total,$file,$factor_id)
$log_info_factored_new = $log_info_factored;
// baseline data
$data = file("$dir/evaluation/$set.analysis.$id/$file");
$data = file(get_current_analysis_filename("precision",$file));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$factor = $item[0];
@ -205,7 +206,9 @@ function precision_by_word_diff($type) {
$byFactor = $match[1];
}
$data = file("$dir/evaluation/$set.analysis.$id2/precision-by-input-word");
$data = file(get_current_analysis_filename2("precision","precision-by-input-word"));
$total = 0;
$info = array();
for($i=0;$i<count($data);$i++) {
$line = rtrim($data[$i]);
$item = split("\t",$line);
@ -215,19 +218,23 @@ function precision_by_word_diff($type) {
$count = $item[4];
$log_count = -1;
if ($count>0) {
$log_count = (int) (log($count)/log(2));
$log_count = (int) (log($count)/log(2));
}
if ($byCoverage != -2 && $byCoverage != $log_count) {
continue;
continue;
}
//# filter for factor
$word = $item[5];
$factor = $item[6];
if ($byFactor != "false" && $byFactor != $factor) {
continue;
if ($byFactor != "false" && $byFactor != $item[6]) {
continue;
}
if (!array_key_exists($word,$info)) {
$info[$word]["precision"] = 0;
$info[$word]["delete"] = 0;
$info[$word]["length"] = 0;
$info[$word]["total"] = 0;
}
$info[$word]["precision"] += $item[0];
$info[$word]["delete"] += $item[1];
$info[$word]["length"] += $item[2];
@ -235,7 +242,7 @@ function precision_by_word_diff($type) {
}
$info_new = $info;
$data = file("$dir/evaluation/$set.analysis.$id/precision-by-input-word");
$data = file(get_current_analysis_filename("precision","precision-by-input-word"));
for($i=0;$i<count($data);$i++) {
$line = rtrim($data[$i]);
$item = split("\t",$line);
@ -252,11 +259,19 @@ function precision_by_word_diff($type) {
//# filter for factor
$word = $item[5];
$factor = $item[6];
if ($byFactor != "false" && $byFactor != $factor) {
if ($byFactor != "false" && $byFactor != $item[6]) {
continue;
}
if (!array_key_exists($word,$info)) {
$info[$word]["precision"] = 0;
$info[$word]["delete"] = 0;
$info[$word]["length"] = 0;
$info_new[$word]["length"] = 0;
$info_new[$word]["delete"] = 0;
$info_new[$word]["precision"] = 0;
$info_new[$word]["total"] = 0;
$info[$word]["total"] = -$item[3];
}
$info[$word]["precision"] -= $item[0];
$info[$word]["delete"] -= $item[1];
$info[$word]["length"] -= $item[2];
@ -308,14 +323,14 @@ ctx.font = '9px serif';
$height = 90-$line/2*180;
print "ctx.moveTo(20, $height);\n";
print "ctx.lineTo($img_width, $height);\n";
print "ctx.fillText(\"".sprintf("%d",10*${line}*1.001)."\%\", 0, $height+4);";
print "ctx.fillText(\"".sprintf("%d",10 * $line * 1.001)."\%\", 0, $height+4);";
}
for($line=-0.4;$line<=0.4;$line+=.2) {
$height = 250+$line/2*180;
print "ctx.moveTo(20, $height);\n";
print "ctx.lineTo($img_width, $height);\n";
if ($line != 0) {
print "ctx.fillText(\"".sprintf("%d",10*${line}*1.001)."\%\", 0, $height+4);";
print "ctx.fillText(\"".sprintf("%d",10 * $line * 1.001)."\%\", 0, $height+4);";
}
}
print "ctx.strokeStyle = \"rgb(100,100,100)\"; ctx.stroke();\n";
@ -385,7 +400,7 @@ function ngram_summary_diff() {
// load data
for($idx=0;$idx<2;$idx++) {
$data = file("$dir/evaluation/$set.analysis.".($idx?$id2:$id)."/summary");
$data = file(get_analysis_filename($dir,$set,$idx?$id2:$id,"basic","summary"));
for($i=0;$i<count($data);$i++) {
$item = split(": ",$data[$i]);
$info[$idx][$item[0]] = $item[1];
@ -393,7 +408,7 @@ function ngram_summary_diff() {
}
print "<table cellspacing=5 width=100%><tr><td valign=top align=center bgcolor=#eeeeee>";
print "<b>Precision</b><br>";
print "<b>Precision of Output</b><br>";
//foreach (array("precision","recall") as $type) {
$type = "precision";
print "<table><tr><td>$type</td><td>1-gram</td><td>2-gram</td><td>3-gram</td><td>4-gram</td></tr>\n";
@ -416,12 +431,11 @@ function ngram_summary_diff() {
//}
print "<A HREF=\"javascript:generic_show_diff('PrecisionRecallDetailsDiff','')\">details</A> ";
if (file_exists("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage") &&
file_exists("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage")) {
print "| <A HREF=\"javascript:generic_show_diff('PrecisionByCoverageDiff','')\">breakdown by coverage</A> ";
if (file_exists(get_current_analysis_filename("precision","precision-by-corpus-coverage")) &&
file_exists(get_current_analysis_filename2("precision","precision-by-corpus-coverage"))) {
print "| <A HREF=\"javascript:generic_show_diff('PrecisionByCoverageDiff','')\">precision of input by coverage</A> ";
}
print "</td><td valign=top align=center bgcolor=#eeeeee>";
print "<b>Metrics</b><br>\n";
@ -434,6 +448,7 @@ function ngram_summary_diff() {
}
}
}
$header = ""; $score_line = ""; $diff_line = "";
foreach ($score as $name => $value) {
$header .= "<td>$name</td>";
$score_line .= "<td>".$score[$name][1]."</td>";
@ -494,7 +509,7 @@ function bleu_diff_annotation() {
// load data
for($idx=0;$idx<2;$idx++) {
$data = file("$dir/evaluation/$set.analysis.".($idx?$id2:$id)."/bleu-annotation");
$data = file(get_analysis_filename($dir,$set,$idx?$id2:$id,"basic","bleu-annotation"));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$annotation[$item[1]]["bleu$idx"] = $item[0];
@ -505,6 +520,7 @@ function bleu_diff_annotation() {
}
$data = array();
$identical=0; $same=0; $better=0; $worse=0;
for($i=0;$i<count($annotation);$i++) {
if ($annotation[$i]["system1"] == $annotation[$i]["system0"]) {
$identical++;
@ -609,7 +625,7 @@ function ngram_diff($type) {
$order = $_GET['order'];
for($idx=0;$idx<2;$idx++) {
$data = file("$dir/evaluation/$set.analysis.".($idx?$id2:$id)."/n-gram-$type.$order");
$data = file(get_analysis_filename($dir,$set,$idx?$id2:$id,"basic","n-gram-$type.$order"));
for($i=0;$i<count($data);$i++) {
$item = split("\t",$data[$i]);
$ngram_hash[$item[2]]["total$idx"] = $item[0];

View File

@ -1,11 +1,18 @@
.pp_head {
font-size: 150%;
font-size: 90%;
font-weight: bold;
text-align: center;
}
.pp_target_header {
font-size: 120%;
font-size: 80%;
font-weight: bold;
text-align: center;
}
.pp_singleton_header {
font-size: 80%;
font-variant: small-caps;
font-weight: bold;
text-align: center;
}
@ -29,23 +36,62 @@ td.biconcor {
}
td.pp_source_left {
font-size: 70%;
text-align: right;
}
td.pp_target_left {
font-size: 70%;
text-align: right;
}
td.pp_source {
font-size: 70%;
font-weight: bold;
}
td.pp_target {
font-size: 70%;
font-weight: bold;
}
td.mismatch_target {
font-size: 70%;
text-align: center;
}
td.pp_source_right {
font-size: 70%;
border-style:solid;
border-width:0px 2px 0px 0px ;
border-color: black;
}
td.pp_target_right {
font-size: 70%;
}
span.null_aligned {
color: blue;
}
span.mismatch_pre_aligned {
color: purple;
}
span.mismatch_post_aligned {
color: olive;
}
span.mismatch_misaligned {
color: red;
}
span.mismatch_aligned {
font-weight: bold;
}
td.pp_more {
font-size: 70%;
text-align: center;
}

View File

@ -1,4 +1,5 @@
h2 {
font:italic x-large/1.75 'Essays 1743','Times New Roman',serif;text-shadow:0 0 1px #667
font:italic x-large/1.75 'Essays 1743','Times New Roman',serif;
text-shadow:0 0 1px #667
}

View File

@ -12,6 +12,7 @@ function head($title) {
<script language="javascript" src="/javascripts/prototype.js"></script>
<script language="javascript" src="/javascripts/scriptaculous.js"></script>
<script language="javascript" src="hierarchical-segmentation.js"></script>
<script language="javascript" src="base64.js"></script>
<link href="general.css" rel="stylesheet" type="text/css">
<link href="hierarchical-segmentation.css" rel="stylesheet" type="text/css">
<link href="bilingual-concordance.css" rel="stylesheet" type="text/css">
@ -29,7 +30,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) {
$action = $_GET["analysis"];
$set = $_GET["set"];
$id = $_GET["id"];
$id2 = $_GET["id2"];
if (array_key_exists("id2",$_GET)) { $id2 = $_GET["id2"]; }
if ($action == "show") { show_analysis(); }
else if ($action == "bleu_show") { bleu_show(); }
else if ($action == "ngram_precision_show") { ngram_show("precision");}
@ -43,7 +44,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) {
else if (preg_match("/PrecisionByWord(.+)_show/",$action,$match)) { precision_by_word($match[1]); }
else if ($action == "CoverageDetails_show") { coverage_details(); }
else if ($action == "SegmentationSummary_show") { segmentation_summary(); }
else if ($action == "biconcor") { biconcor($_GET["phrase"]); }
else if ($action == "biconcor") { biconcor(base64_decode($_GET["phrase"])); }
else { print "ERROR! $action"; }
}
else if (array_key_exists("analysis_diff_home",$_GET)) {

View File

@ -124,48 +124,136 @@ function process_file_entry($dir,$entry) {
}
}
function get_coverage_analysis_version($dir,$set,$id) {
if (file_exists("$dir/evaluation/$set.analysis.$id/input-annotation")) {
return $id;
function get_analysis_version($dir,$set,$id) {
global $analysis_version;
if ($analysis_version
&& array_key_exists($id,$analysis_version)
&& array_key_exists($set,$analysis_version[$id])) {
#reset($analysis_version[$id][$set]);
#print "$id,$set ( ";
#while(list($type,$i) = each($analysis_version[$id][$set])) {
# print "$type=$i ";
#}
#print ") FROM CACHE<br>";
return $analysis_version[$id][$set];
}
$analysis_version[$id][$set]["basic"] = 0;
$analysis_version[$id][$set]["biconcor"] = 0;
$analysis_version[$id][$set]["coverage"] = 0;
$analysis_version[$id][$set]["precision"] = 0;
$prefix = "$dir/evaluation/$set.analysis";
# produced by the run itself ?
if (file_exists("$prefix.$id/summary")) {
$analysis_version[$id][$set]["basic"] = $id;
}
if (file_exists("$prefix.$id/input-annotation")) {
$analysis_version[$id][$set]["coverage"] = $id;
}
if (file_exists("$prefix.$id/precision-by-input-word")) {
$analysis_version[$id][$set]["precision"] = $id;
}
if (file_exists("$dir/model/biconcor.$id")) {
$analysis_version[$id][$set]["biconcor"] = $id;
}
# re-use ?
if (file_exists("$dir/steps/$id/re-use.$id")) {
$re_use = file("$dir/steps/$id/re-use.$id");
foreach($re_use as $line) {
if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) &&
if (preg_match("/EVALUATION:(.+):analysis (\d+)/",$line,$match) &&
$match[1] == $set &&
file_exists("$dir/evaluation/$set.analysis.$match[2]/input-annotation")) {
return $match[2];
file_exists("$prefix.$match[2]/summary")) {
$analysis_version[$id][$set]["basic"] = $match[2];
}
else if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) &&
$match[1] == $set &&
file_exists("$prefix.$match[2]/input-annotation")) {
$analysis_version[$id][$set]["coverage"] = $match[2];
}
else if (preg_match("/EVALUATION:(.+):analysis-precision (\d+)/",$line,$match) &&
$match[1] == $set &&
file_exists("$prefix.$match[2]/precision-by-input-word")) {
$analysis_version[$id][$set]["precision"] = $match[2];
}
else if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) &&
file_exists("$dir/model/biconcor.$match[1]")) {
$analysis_version[$id][$set]["biconcor"] = $match[1];
}
}
}
# legacy stuff below...
if (! file_exists("$dir/steps/$id/REPORTING_report.$id")) {
return 0;
}
$report = file("$dir/steps/$id/REPORTING_report.$id.INFO");
foreach ($report as $line) {
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) &&
if (file_exists("$dir/steps/$id/REPORTING_report.$id")) {
$report = file("$dir/steps/$id/REPORTING_report.$id.INFO");
foreach ($report as $line) {
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis/",$line,$match) &&
$match[2] == $set) {
$reuse_id = $match[1];
if (file_exists("$dir/evaluation/$set.analysis.$reuse_id/input-annotation")) {
return $reuse_id;
if (file_exists("$prefix.$match[1]/summary")) {
$analysis_version[$id][$set]["basic"] = $match[1];
}
}
}
return 0;
}
function get_biconcor_version($dir,$id) {
if (file_exists("$dir/model/biconcor.$id")) {
return $id;
}
$re_use = file("$dir/steps/$id/re-use.$id");
foreach($re_use as $line) {
if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) &&
file_exists("$dir/model/biconcor.$match[1]")) {
return $match[1];
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) &&
$match[2] == $set) {
if (file_exists("$prefix.$match[1]/input-annotation")) {
$analysis_version[$id][$set]["coverage"] = $match[1];
}
}
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-precision/",$line,$match) &&
$match[2] == $set) {
if (file_exists("$prefix.$match[1]/precision-by-input-word")) {
$analysis_version[$id][$set]["precision"] = $match[1];
}
}
if (preg_match("/\# reuse run (\d+) for TRAINING:biconcor/",$line,$match)){
if (file_exists("$dir/model/biconcor.$match[1]")) {
$analysis_version[$id][$set]["biconcor"] = $match[1];
}
}
}
}
return 0;
#print "$id,$set ( ";
#reset($analysis_version[$id][$set]);
#while(list($type,$i) = each($analysis_version[$id][$set])) {
# print "$type=$i ";
#}
#print ") ZZ<br>";
return $analysis_version[$id][$set];
}
function get_precision_analysis_version($dir,$set,$id) {
$version = get_analysis_version($dir,$set,$id);
return $version["precision"];
}
function get_basic_analysis_version($dir,$set,$id) {
$version = get_analysis_version($dir,$set,$id);
return $version["basic"];
}
function get_coverage_analysis_version($dir,$set,$id) {
$version = get_analysis_version($dir,$set,$id);
return $version["coverage"];
}
function get_biconcor_version($dir,$set,$id) {
$version = get_analysis_version($dir,$set,$id);
return $version["biconcor"];
}
function get_analysis_filename($dir,$set,$id,$type,$file) {
$version = get_analysis_version($dir,$set,$id);
return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file;
}
function get_current_analysis_filename($type,$file) {
global $dir,$set,$id;
$version = get_analysis_version($dir,$set,$id);
return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file;
}
function get_current_analysis_filename2($type,$file) {
global $dir,$set,$id2;
$version = get_analysis_version($dir,$set,$id2);
return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file;
}
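Taken together, the rewritten get_analysis_version caches, per run and test set, one version number for each analysis component (basic, coverage, precision, biconcor), and every caller now builds paths through get_analysis_filename or the get_current_analysis_filename wrappers. A minimal usage sketch, assuming the same globals ($dir, $set, $id) as the surrounding code; the version numbers in the comment are purely illustrative:

// Sketch: resolve which run produced each analysis component, then build paths
// with the helpers defined above (the version numbers below are illustrative only).
$version = get_analysis_version($dir, $set, $id);
// e.g. array("basic" => 12, "coverage" => 9, "precision" => 9, "biconcor" => 7)
$summary_file  = get_analysis_filename($dir, $set, $id, "basic", "summary");
$coverage_file = get_analysis_filename($dir, $set, $id, "coverage", "input-annotation");
if ($version["biconcor"]) {   // a value of 0 means no biconcor model was found
  $model_file = "$dir/model/biconcor.".$version["biconcor"];
}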

View File

@ -1,5 +1,7 @@
<?php
date_default_timezone_set('Europe/London');
function setup() {
$setup = file("setup");
head("All Experimental Setups");
@ -11,7 +13,7 @@ function setup() {
print "<TR><TD><A HREF=\"?setup=$dir[0]\">$dir[0]</A></TD><TD>$dir[1]</TD><TD>$dir[2]</TD><TD>$dir[3]</TD></TR>\n";
}
print "</TABLE>\n";
print "<P>To add experiment, edit setup file on web server";
print "<P>To add experiment, edit setup in web directory";
}
function overview() {
@ -29,10 +31,14 @@ function overview() {
print "<form action=\"\" method=get>\n";
output_state_for_form();
// count how many analyses there are for each test set
while (list($id,$info) = each($experiment)) {
reset($evalset);
while (list($set,$dummy) = each($evalset)) {
$analysis = "$dir/evaluation/$set.analysis.$id";
$report_info = "$dir/steps/$id/REPORTING_report.$id.INFO";
// does the analysis file exist?
if (file_exists($analysis)) {
if (!array_key_exists($set,$has_analysis)) {
$has_analysis[$set] = 0;
@ -117,7 +123,7 @@ function overview() {
list($score) = sscanf($info->result[$set],"%f%s");
if ($score > 0) {
print "score[$id][\"$set\"] = $score;\n";
if ($score > $best[$set]) {
if (!array_key_exists($set,$best) || $score > $best[$set]) {
$best[$set] = $score;
}
}
@ -303,8 +309,8 @@ function output_score($id,$info) {
if ($has_analysis && array_key_exists($set,$has_analysis)) {
print "<td align=center>";
global $dir;
$analysis = "$dir/evaluation/$set.analysis.$id";
if (file_exists($analysis)) {
$analysis = get_analysis_version($dir,$set,$id);
if ($analysis["basic"]) {
print "<a href=\"?analysis=show&setup=$setup&set=$set&id=$id\">&#x24B6;</a> <input type=checkbox name=analysis-$id-$set value=1>";
}
print "</td>";