mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
improvements to ems analysis
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4026 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
2cdc39f63f
commit
6acd6a8684
@ -14,7 +14,7 @@ private:
|
||||
INDEX *m_sentenceEnd;
|
||||
INDEX m_size;
|
||||
INDEX m_sentenceCount;
|
||||
char m_unaligned[ 256 ];
|
||||
char m_unaligned[ 256 ]; // here for speed (local to PhraseAlignment)
|
||||
|
||||
public:
|
||||
~Alignment();
|
||||
@ -27,4 +27,18 @@ public:
|
||||
void Load( string fileName );
|
||||
void Save( string fileName );
|
||||
vector<string> Tokenize( const char input[] );
|
||||
|
||||
// Offset in m_array at which the alignment points of `sentence` begin.
// Sentence 0 begins at offset 0; every later sentence begins two slots
// past the end offset recorded in m_sentenceEnd for its predecessor.
INDEX GetSentenceStart( INDEX sentence ) {
  return (sentence == 0) ? 0 : m_sentenceEnd[ sentence-1 ] + 2;
}
|
||||
// Number of alignment points reported for `sentence`: the distance between
// the sentence's start offset and its m_sentenceEnd entry, halved because
// each alignment point takes two consecutive slots (source, target).
// NOTE(review): if m_sentenceEnd holds the offset of the last pair's first
// slot (as the "+ 2" in GetSentenceStart suggests), this is one less than
// the number of stored pairs -- confirm against the code that fills
// m_sentenceEnd.
INDEX GetNumberOfAlignmentPoints( INDEX sentence ) {
  const INDEX first_slot = GetSentenceStart( sentence );
  const INDEX end_slot = m_sentenceEnd[ sentence ];
  return ( end_slot - first_slot ) / 2;
}
|
||||
// Source-side word index of alignment point `alignment_point` in
// `sentence`: the first slot of that point's pair in m_array.
char GetSourceWord( INDEX sentence, INDEX alignment_point ) {
  return m_array[ 2*alignment_point + GetSentenceStart( sentence ) ];
}
|
||||
// Target-side word index of alignment point `alignment_point` in
// `sentence`: the second slot of that point's pair in m_array.
char GetTargetWord( INDEX sentence, INDEX alignment_point ) {
  return m_array[ 2*alignment_point + 1 + GetSentenceStart( sentence ) ];
}
|
||||
};
|
||||
|
@ -6,5 +6,5 @@ clean:
|
||||
.cpp.o:
|
||||
g++ -O6 -g -c $<
|
||||
|
||||
biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o
|
||||
g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o -o biconcor
|
||||
biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o Mismatch.o PhrasePair.o PhrasePairCollection.o biconcor.o base64.o
|
||||
g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o Mismatch.o PhrasePair.o PhrasePairCollection.o biconcor.o base64.o -o biconcor
|
||||
|
246
scripts/ems/biconcor/Mismatch.cpp
Normal file
246
scripts/ems/biconcor/Mismatch.cpp
Normal file
@ -0,0 +1,246 @@
|
||||
#include "Mismatch.h"
|
||||
#include "Vocabulary.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define UNANNOTATED 0
|
||||
#define PRE_ALIGNED 1
|
||||
#define POST_ALIGNED 2
|
||||
#define UNALIGNED 3
|
||||
#define MISALIGNED 4
|
||||
#define ALIGNED 5
|
||||
|
||||
|
||||
// Writes one HTML table row (<tr>...</tr>) for this mismatch to *out.
//
// The row has four cells: left source context, the source phrase, right
// source context, and the clipped target sentence. Words are wrapped in
// <span class="..."> according to an annotation computed here:
//  - if the phrase is unaligned (m_unaligned), the neighbouring source words
//    are scanned outwards until one with an alignment point is found; its
//    target words are labeled PRE_ALIGNED / POST_ALIGNED;
//  - otherwise the target words linked to the phrase are labeled ALIGNED,
//    the words enclosed by them UNALIGNED / MISALIGNED, and the labeling is
//    propagated along alignment points until nothing changes.
//
// out    stream receiving the HTML; must not be null.
// width  character budget for the row; half of it (width/2) is spent on
//        the source side and half on the target side.
void Mismatch::PrintClippedHTML( ostream* out, int width )
{
  char source_annotation[256], target_annotation[256];

  // CSS class for each annotation label; the position in this table is the
  // numeric value of the label (UNANNOTATED .. ALIGNED). Kept as a constant
  // table so it is not rebuilt for every printed row.
  static const char* const label_class[] = {
    "",
    "mismatch_pre_aligned",
    "mismatch_post_aligned",
    "null_aligned",
    "mismatch_misaligned",
    "mismatch_aligned"
  };

  for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
  for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;

  if (m_unaligned) {
    // find alignment points for prior and next word(s) and
    // center target phrase around those.
    bool found_aligned = false;
    for(int i=1; i<m_source_length && !found_aligned; i++) {
      if (m_source_start-i >= 0) {
        int word_id = m_source_start-i;
        source_annotation[ word_id ] = UNALIGNED;
        if (!m_source_unaligned[ word_id ]) {
          found_aligned = true;
          LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
        }
      }

      if (m_source_end+i < m_source_length) {
        int word_id = m_source_end+i;
        source_annotation[ word_id ] = UNALIGNED;
        if (!m_source_unaligned[ word_id ]) {
          found_aligned = true;
          LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
        }
      }
    }
  }
  // misalignment
  else {
    // label aligned output words
    for(int i=m_source_start; i<=m_source_end; i++)
      LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );

    // find first and last
    int target_start = -1;
    int target_end = -1;
    for(int i=0; i<m_target_length; i++)
      if (target_annotation[i] == ALIGNED) {
        if (target_start == -1)
          target_start = i;
        target_end = i;
      }

    // go over all enclosed target words; skipped entirely when no target
    // word was labeled, so index -1 is never touched
    if (target_start != -1) {
      for(int i=target_start; i<=target_end; i++) {
        // label other target words as unaligned or misaligned
        if (m_target_unaligned[ i ])
          target_annotation[ i ] = UNALIGNED;
        else {
          if (target_annotation[ i ] != ALIGNED)
            target_annotation[ i ] = MISALIGNED;
          // loop over aligned source words
          for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
            if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
              int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
              // if not part of the source phrase -> also misaligned
              if (source_word < m_source_start || source_word > m_source_end)
                source_annotation[ source_word ] = MISALIGNED;
            }
          }
        }
      }
    }

    // closure: spread MISALIGNED across alignment points until stable
    bool change = true;
    while(change) {
      change = false;
      for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
        int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
        int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
        if (source_annotation[source_word] != UNANNOTATED &&
            target_annotation[target_word] == UNANNOTATED) {
          target_annotation[target_word] = MISALIGNED;
          change = true;
        }
        if (source_annotation[source_word] == UNANNOTATED &&
            target_annotation[target_word] != UNANNOTATED) {
          source_annotation[source_word] = MISALIGNED;
          change = true;
        }
      }
    }
  }

  // print source
  // shorten source context if too long
  int sentence_start = m_source_position - m_source_start;
  int context_space = width/2;
  for(int i=m_source_start; i<=m_source_end; i++)
    context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
  context_space /= 2;

  int remaining = context_space;
  int start_word = m_source_start;
  for(; start_word>0 && remaining>0; start_word--)
    remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
  if (remaining<0 || start_word == -1) start_word++;
  // never clip into the phrase itself: when the phrase alone exhausts the
  // budget the adjustment above would otherwise skip its first word and the
  // phrase cell would never be opened
  if (start_word > m_source_start) start_word = m_source_start;

  remaining = context_space;
  int end_word = m_source_end;
  for(; end_word<m_source_length && remaining>0; end_word++)
    remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
  end_word--;
  // same guarantee on the right: the last phrase word is always printed so
  // the right-context cell is always opened
  if (end_word < m_source_end) end_word = m_source_end;

  // output with markup
  *out << "<tr><td class=\"pp_source_left\">";
  char current_label = UNANNOTATED;
  if (start_word>0) {
    // NOTE(review): seeding the label from the clipped-away word means no
    // <span> is opened when the first printed word carries the same label;
    // confirm this is the intended rendering.
    current_label = source_annotation[start_word-1];
    *out << "... ";
  }
  for(int i=start_word; i<=end_word; i++) {
    // change to phrase block
    if (i == m_source_start) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      *out << "</td><td class=\"pp_source\">";
      current_label = UNANNOTATED;
    }

    // change to labeled word
    else if (source_annotation[i] != current_label &&
             source_annotation[i] != ALIGNED) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      if (source_annotation[i] != UNANNOTATED)
        *out << "<span class=\""
             << label_class[ static_cast<int>( source_annotation[i] ) ]
             << "\">";
      current_label = source_annotation[i];
    }

    // output word
    *out << m_suffixArray->GetWord( sentence_start + i ) << " ";

    // change to right context block
    if (i == m_source_end) {
      *out << "</td><td class=\"pp_source_right\">";
      current_label = UNANNOTATED;
    }
  }

  if (current_label != UNANNOTATED && end_word>m_source_end)
    *out << "</span>";
  if (end_word<m_source_length-1)
    *out << "... ";

  // print target
  // shorten target context if too long
  int target_start = -1;
  int target_end = -1;
  for(int i=0; i<m_target_length; i++)
    if (target_annotation[i] != UNANNOTATED) {
      if (target_start == -1)
        target_start = i;
      target_end = i;
    }
  // no annotated target word at all (e.g. unaligned phrase without any
  // aligned neighbour): use an empty span at the sentence start instead of
  // indexing word -1 / reading an unset end position
  const bool any_target_label = (target_start != -1);
  if (!any_target_label) {
    target_start = 0;
    target_end = -1;
  }

  context_space = width/2;
  for(int i=target_start; i<=target_end; i++)
    context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
  // shorten matched part, if too long; stop once a single word is left so
  // the two ends cannot cross and one word is never counted twice
  while (context_space < 0 && target_start < target_end) {
    context_space +=
      m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
      m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
    target_start++;
    target_end--;
  }
  context_space /= 2;

  remaining = context_space;
  start_word = target_start;
  for(; start_word>0 && remaining>0; start_word--) {
    remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
  }
  if (remaining<0 || start_word == -1) start_word++;
  if (start_word > target_start) start_word = target_start;

  remaining = context_space;
  end_word = any_target_label ? target_end : 0;
  for(; end_word<m_target_length && remaining>0; end_word++) {
    remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
  }
  end_word--;
  if (end_word < target_end) end_word = target_end;

  // output with markup
  *out << "</td><td class=\"mismatch_target\">";
  current_label = UNANNOTATED;
  if (start_word>0) {
    current_label = target_annotation[start_word-1];
    *out << "... ";
  }
  for(int i=start_word; i<=end_word; i++) {
    if (target_annotation[i] != current_label) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      if (target_annotation[i] != UNANNOTATED)
        *out << "<span class=\""
             << label_class[ static_cast<int>( target_annotation[i] ) ]
             << "\">";
      current_label = target_annotation[i];
    }

    // output word
    *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
  }

  if (current_label != UNANNOTATED && end_word>target_end)
    *out << "</span>";
  if (end_word<m_target_length-1)
    *out << "... ";
  *out << "</td></tr>";
}
|
||||
|
||||
void Mismatch::LabelSourceMatches( char *source_annotation, char *target_annotation, char source_id, char label ) {
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
|
||||
source_annotation[ source_id ] = label;
|
||||
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
|
||||
}
|
||||
}
|
||||
}
|
70
scripts/ems/biconcor/Mismatch.h
Normal file
70
scripts/ems/biconcor/Mismatch.h
Normal file
@ -0,0 +1,70 @@
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include "SuffixArray.h"
|
||||
#include "TargetCorpus.h"
|
||||
#include "Alignment.h"
|
||||
#pragma once
|
||||
|
||||
using namespace std;
|
||||
|
||||
class Mismatch
|
||||
{
|
||||
public:
|
||||
typedef unsigned int INDEX;
|
||||
|
||||
private:
|
||||
SuffixArray *m_suffixArray;
|
||||
TargetCorpus *m_targetCorpus;
|
||||
Alignment *m_alignment;
|
||||
INDEX m_sentence_id;
|
||||
INDEX m_num_alignment_points;
|
||||
char m_source_length;
|
||||
char m_target_length;
|
||||
SuffixArray::INDEX m_source_position;
|
||||
char m_source_start, m_source_end;
|
||||
char m_source_unaligned[ 256 ];
|
||||
char m_target_unaligned[ 256 ];
|
||||
char m_unaligned;
|
||||
|
||||
public:
|
||||
Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, char source_length, char target_length, char source_start, char source_end )
|
||||
:m_suffixArray(sa)
|
||||
,m_targetCorpus(tc)
|
||||
,m_alignment(a)
|
||||
,m_sentence_id(sentence_id)
|
||||
,m_source_position(position)
|
||||
,m_source_length(source_length)
|
||||
,m_target_length(target_length)
|
||||
,m_source_start(source_start)
|
||||
,m_source_end(source_end)
|
||||
{
|
||||
// initialize unaligned indexes
|
||||
for(char i=0; i<m_source_length; i++) {
|
||||
m_source_unaligned[i] = true;
|
||||
}
|
||||
for(char i=0; i<m_target_length; i++) {
|
||||
m_target_unaligned[i] = true;
|
||||
}
|
||||
m_num_alignment_points =
|
||||
m_alignment->GetNumberOfAlignmentPoints( sentence_id );
|
||||
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
||||
m_source_unaligned[ m_alignment->GetSourceWord( sentence_id, ap ) ] = false;
|
||||
m_target_unaligned[ m_alignment->GetTargetWord( sentence_id, ap ) ] = false;
|
||||
}
|
||||
m_unaligned = true;
|
||||
for(char i=source_start; i<=source_end; i++) {
|
||||
if (!m_source_unaligned[ i ]) {
|
||||
m_unaligned = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
~Mismatch () {}
|
||||
|
||||
bool Unaligned() { return m_unaligned; }
|
||||
void PrintClippedHTML( ostream* out, int width );
|
||||
void LabelSourceMatches( char *source_annotation, char *target_annotation, char source_id, char label );
|
||||
};
|
@ -145,14 +145,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width )
|
||||
int source_pre_width = (source_width-source.size())/2;
|
||||
int source_post_width = (source_width-source.size()+1)/2;
|
||||
|
||||
// if phrase is too long, don't show any context
|
||||
if (source.size() > width) {
|
||||
source_pre_width = 0;
|
||||
source_post_width = 0;
|
||||
}
|
||||
if (source_pre.size()>source_pre_width)
|
||||
// too long -> truncate and add "..."
|
||||
if (source_pre.size()>source_pre_width) {
|
||||
// first skip up to a space
|
||||
while(source_pre_width>0 &&
|
||||
source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
|
||||
source_pre_width--;
|
||||
}
|
||||
source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
|
||||
if (source_post.size() > source_post_width)
|
||||
}
|
||||
if (source_post.size() > source_post_width) {
|
||||
while(source_post_width>0 &&
|
||||
source_post.substr(source_post_width-1,1) != " ") {
|
||||
source_post_width--;
|
||||
}
|
||||
source_post = source_post.substr( 0, source_post_width ) + "...";
|
||||
}
|
||||
|
||||
*out << "<tr><td class=\"pp_source_left\">"
|
||||
<< source_pre
|
||||
@ -167,8 +180,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width )
|
||||
string target_pre = "";
|
||||
string target = "";
|
||||
string target_post = "";
|
||||
int target_pre_null_width = 0;
|
||||
int target_post_null_width = 0;
|
||||
for( char i=0; i<m_target_start; i++ ) {
|
||||
target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
|
||||
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
|
||||
target_pre += " " + word;
|
||||
if (i >= m_target_start-m_pre_null)
|
||||
target_pre_null_width += word.size() + 1;
|
||||
}
|
||||
for( char i=m_target_start; i<=m_target_end; i++ ) {
|
||||
if (i>m_target_start) target += " ";
|
||||
@ -176,7 +194,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width )
|
||||
}
|
||||
for( char i=m_target_end+1; i<m_target_length; i++ ) {
|
||||
if (i>m_target_end+1) target_post += " ";
|
||||
target_post += m_targetCorpus->GetWord( m_sentence_id, i);
|
||||
WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
|
||||
target_post += word;
|
||||
if (i-(m_target_end+1) < m_post_null) {
|
||||
target_post_null_width += word.size() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
int target_pre_width = (target_width-target.size())/2;
|
||||
@ -186,10 +208,47 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width )
|
||||
target_pre_width = 0;
|
||||
target_post_width = 0;
|
||||
}
|
||||
if (target_pre.size() > target_pre_width)
|
||||
|
||||
if (target_pre.size() < target_pre_width)
|
||||
target_pre_width = target_pre.size();
|
||||
else {
|
||||
while(target_pre_width>0 &&
|
||||
target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
|
||||
target_pre_width--;
|
||||
}
|
||||
target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
|
||||
if (target_post.size() > target_post_width)
|
||||
target_post = target_post.substr( 0, target_post_width ) + "...";
|
||||
}
|
||||
|
||||
if (target_post.size() < target_post_width) {
|
||||
target_post_width = target_post.size();
|
||||
}
|
||||
else {
|
||||
while(target_post_width>0 &&
|
||||
target_post.substr(target_post_width-1,1) != " ") {
|
||||
target_post_width--;
|
||||
}
|
||||
target_post = target_post.substr( 0, target_post_width ) + "...";
|
||||
}
|
||||
|
||||
if (m_pre_null) {
|
||||
//cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
|
||||
if (target_pre_width < target_pre.size())
|
||||
target_pre_null_width -= target_pre.size()-target_pre_width;
|
||||
target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
|
||||
+ "<span class=\"null_aligned\">"
|
||||
+ target_pre.substr(target_pre_width-target_pre_null_width)
|
||||
+ "</span>";
|
||||
}
|
||||
if (m_post_null) {
|
||||
//cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
|
||||
if (target_post_null_width>target_post.size()) {
|
||||
target_post_null_width = target_post.size();
|
||||
}
|
||||
target_post = "<span class=\"null_aligned\">"
|
||||
+ target_post.substr(0,target_post_null_width)
|
||||
+ "</span>"
|
||||
+ target_post.substr(target_post_null_width);
|
||||
}
|
||||
|
||||
*out << "<td class=\"pp_target_left\">"
|
||||
<< target_pre
|
||||
|
@ -47,8 +47,9 @@ bool PhrasePairCollection::GetCollection( const vector< string > sourceString )
|
||||
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
|
||||
cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
|
||||
cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
|
||||
for( char pre = 0; pre <= pre_null; pre++ ) {
|
||||
for( char post = 0; post <= post_null; post++ ) {
|
||||
bool null_boundary_words = false;
|
||||
for( char pre = 0; pre <= pre_null && (pre==0||null_boundary_words); pre++ ) {
|
||||
for( char post = 0; post <= post_null && (post==0||null_boundary_words); post++ ) {
|
||||
vector< WORD_ID > targetString;
|
||||
cerr << "; ";
|
||||
for( char target = target_start-pre; target <= target_end+post; target++ ) {
|
||||
@ -67,6 +68,18 @@ bool PhrasePairCollection::GetCollection( const vector< string > sourceString )
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
cerr << "mismatch " << (i-first_match)
|
||||
<< " in sentence " << sentence_id
|
||||
<< ", starting at word " << source_start
|
||||
<< " of " << sentence_length
|
||||
<< ". target sentence has " << target_length << " words.";
|
||||
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
|
||||
if (mismatch->Unaligned())
|
||||
m_unaligned.push_back( mismatch );
|
||||
else
|
||||
m_mismatch.push_back( mismatch );
|
||||
}
|
||||
cerr << endl;
|
||||
|
||||
if (found > m_max_lookup) {
|
||||
@ -92,23 +105,89 @@ void PhrasePairCollection::Print()
|
||||
|
||||
void PhrasePairCollection::PrintHTML()
|
||||
{
|
||||
vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
|
||||
int pp_target = 0;
|
||||
bool singleton = false;
|
||||
// loop over all translations
|
||||
vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
|
||||
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
|
||||
cout << "<p class=\"pp_target_header\">";
|
||||
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
|
||||
int count = ppWithSameTarget->size();
|
||||
cout << "(" << count << "/" << m_size << ")" << endl;
|
||||
cout << "<p><table align=\"center\">";
|
||||
|
||||
int count = ppWithSameTarget->size();
|
||||
if (!singleton) {
|
||||
if (count == 1) {
|
||||
singleton = true;
|
||||
cout << "<p class=\"pp_singleton_header\">singleton"
|
||||
<< (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
|
||||
<< (m_collection.end() - ppWithSameTarget)
|
||||
<< "/" << m_size << ")</p>";
|
||||
}
|
||||
else {
|
||||
cout << "<p class=\"pp_target_header\">";
|
||||
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
|
||||
cout << " (" << count << "/" << m_size << ")" << endl;
|
||||
cout << "<p><div id=\"pp_" << pp_target << "\">";
|
||||
}
|
||||
cout << "<table align=\"center\">";
|
||||
}
|
||||
|
||||
vector< PhrasePair* >::iterator p;
|
||||
int pp = 0;
|
||||
for(p = ppWithSameTarget->begin(); pp<count && p != ppWithSameTarget->end(); p++, pp++ ) {
|
||||
// loop over all sentences where translation occurs
|
||||
int pp=0;
|
||||
int i=0;
|
||||
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
|
||||
(*p)->PrintClippedHTML( &cout, 160 );
|
||||
if (count > m_max_pp) {
|
||||
p += count/m_max_pp-1;
|
||||
pp += count/m_max_pp-1;
|
||||
}
|
||||
}
|
||||
cout << "</table>\n";
|
||||
if (i == 10 && pp < count) {
|
||||
// extended table
|
||||
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
|
||||
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
|
||||
cout << "<table align=\"center\">";
|
||||
for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_pp && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
|
||||
(*p)->PrintClippedHTML( &cout, 160 );
|
||||
if (count > m_max_pp) {
|
||||
p += count/m_max_pp-1;
|
||||
pp += count/m_max_pp-1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!singleton) cout << "</table></div>\n";
|
||||
|
||||
if (!singleton && pp_target == 9) {
|
||||
cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
|
||||
cout << "<p class=\"pp_target_header\">(more)</p></div>";
|
||||
cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
|
||||
}
|
||||
}
|
||||
if (singleton) cout << "</table></div>\n";
|
||||
else if (pp_target > 9) cout << "</div>";
|
||||
|
||||
int max_mismatch = m_max_pp/3;
|
||||
// unaligned phrases
|
||||
if (m_unaligned.size() > 0) {
|
||||
cout << "<p class=\"pp_singleton_header\">unaligned"
|
||||
<< " (" << (m_unaligned.size()) << ")</p>";
|
||||
cout << "<table align=\"center\">";
|
||||
int step_size = 1;
|
||||
if (m_unaligned.size() > max_mismatch)
|
||||
step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
|
||||
for(int i=0;i<m_unaligned.size();i+=step_size)
|
||||
m_unaligned[i]->PrintClippedHTML( &cout, 160 );
|
||||
cout << "</table>";
|
||||
}
|
||||
|
||||
// mismatched phrases
|
||||
if (m_mismatch.size() > 0) {
|
||||
cout << "<p class=\"pp_singleton_header\">mismatched"
|
||||
<< " (" << (m_mismatch.size()) << ")</p>";
|
||||
cout << "<table align=\"center\">";
|
||||
int step_size = 1;
|
||||
if (m_mismatch.size() > max_mismatch)
|
||||
step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
|
||||
for(int i=0;i<m_mismatch.size();i+=step_size)
|
||||
m_mismatch[i]->PrintClippedHTML( &cout, 160 );
|
||||
cout << "</table>";
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "TargetCorpus.h"
|
||||
#include "Alignment.h"
|
||||
#include "PhrasePair.h"
|
||||
#include "Mismatch.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
@ -16,6 +17,7 @@ private:
|
||||
TargetCorpus *m_targetCorpus;
|
||||
Alignment *m_alignment;
|
||||
vector< vector<PhrasePair*> > m_collection;
|
||||
vector< Mismatch* > m_mismatch, m_unaligned;
|
||||
int m_size;
|
||||
int m_max_lookup;
|
||||
int m_max_pp_target;
|
||||
|
123
scripts/ems/biconcor/base64.cpp
Normal file
123
scripts/ems/biconcor/base64.cpp
Normal file
@ -0,0 +1,123 @@
|
||||
/*
|
||||
base64.cpp and base64.h
|
||||
|
||||
Copyright (C) 2004-2008 René Nyffenegger
|
||||
|
||||
This source code is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the author be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this source code must not be misrepresented; you must not
|
||||
claim that you wrote the original source code. If you use this source code
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original source code.
|
||||
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
René Nyffenegger rene.nyffenegger@adp-gmbh.ch
|
||||
|
||||
*/
|
||||
|
||||
#include "base64.h"
|
||||
#include <iostream>
|
||||
|
||||
static const std::string base64_chars =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"0123456789+/";
|
||||
|
||||
|
||||
// True if c belongs to the standard base64 alphabet: A-Z, a-z, 0-9, '+'
// or '/'. The padding character '=' is not part of the alphabet.
// Uses explicit ranges instead of isalnum(), whose result depends on the
// current C locale; this assumes an ASCII-compatible execution character
// set in which letters and digits are contiguous.
static inline bool is_base64(unsigned char c) {
  return (c >= 'A' && c <= 'Z') ||
         (c >= 'a' && c <= 'z') ||
         (c >= '0' && c <= '9') ||
         (c == '+') || (c == '/');
}
|
||||
|
||||
// Encodes in_len bytes starting at bytes_to_encode as standard base64
// (alphabet A-Z a-z 0-9 + /), padding the result with '=' to a multiple of
// four characters. Returns an empty string when in_len is 0.
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
  // Constant-initialized alphabet: unlike a namespace-scope std::string it
  // is usable even when this function is called during static
  // initialization of another translation unit.
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

  const std::string::size_type len = in_len;
  std::string ret;
  ret.reserve((len + 2) / 3 * 4);  // exact output size, one allocation

  // full 3-byte groups -> 4 output characters each
  std::string::size_type pos = 0;
  for (; len - pos >= 3; pos += 3) {
    const unsigned char b0 = bytes_to_encode[pos];
    const unsigned char b1 = bytes_to_encode[pos + 1];
    const unsigned char b2 = bytes_to_encode[pos + 2];
    ret += alphabet[b0 >> 2];
    ret += alphabet[((b0 & 0x03) << 4) | (b1 >> 4)];
    ret += alphabet[((b1 & 0x0f) << 2) | (b2 >> 6)];
    ret += alphabet[b2 & 0x3f];
  }

  // trailing 1 or 2 bytes -> 2 or 3 characters plus '=' padding
  const std::string::size_type rest = len - pos;
  if (rest != 0) {
    const unsigned char b0 = bytes_to_encode[pos];
    const unsigned char b1 = (rest == 2) ? bytes_to_encode[pos + 1] : 0;
    ret += alphabet[b0 >> 2];
    ret += alphabet[((b0 & 0x03) << 4) | (b1 >> 4)];
    if (rest == 2)
      ret += alphabet[(b1 & 0x0f) << 2];
    else
      ret += '=';
    ret += '=';
  }

  return ret;
}
|
||||
|
||||
// Decodes standard base64 text (alphabet A-Z a-z 0-9 + /).
// Decoding stops at the first '=' or at the first character that is not in
// the alphabet; whatever was decoded up to that point is returned. A
// trailing incomplete group of k characters (k = 2 or 3) yields k-1 bytes,
// a single left-over character yields nothing.
std::string base64_decode(std::string const& encoded_string) {
  // Reverse lookup built once: value[c] is the 6-bit value of character c,
  // or -1 if c is not in the alphabet. One table serves both validation and
  // value lookup, so the two can never disagree and the result does not
  // depend on the C locale.
  struct DecodeTable {
    signed char value[256];
    DecodeTable() {
      static const char alphabet[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789+/";
      for (int c = 0; c < 256; c++)
        value[c] = -1;
      for (int v = 0; v < 64; v++)
        value[static_cast<unsigned char>(alphabet[v])] = static_cast<signed char>(v);
    }
  };
  static const DecodeTable table;

  std::string ret;
  ret.reserve(encoded_string.size() / 4 * 3 + 2);

  unsigned char quad[4];
  int filled = 0;
  for (std::string::size_type pos = 0; pos < encoded_string.size(); ++pos) {
    const signed char v = table.value[static_cast<unsigned char>(encoded_string[pos])];
    if (v < 0)
      break;  // '=' padding or a foreign character ends the data
    quad[filled++] = static_cast<unsigned char>(v);
    if (filled == 4) {
      ret += static_cast<char>((quad[0] << 2) | ((quad[1] & 0x30) >> 4));
      ret += static_cast<char>(((quad[1] & 0x0f) << 4) | ((quad[2] & 0x3c) >> 2));
      ret += static_cast<char>(((quad[2] & 0x03) << 6) | quad[3]);
      filled = 0;
    }
  }

  // incomplete final group: zero-fill and emit only the fully defined bytes
  if (filled != 0) {
    for (int j = filled; j < 4; j++)
      quad[j] = 0;
    const unsigned char bytes[3] = {
      static_cast<unsigned char>((quad[0] << 2) | ((quad[1] & 0x30) >> 4)),
      static_cast<unsigned char>(((quad[1] & 0x0f) << 4) | ((quad[2] & 0x3c) >> 2)),
      static_cast<unsigned char>(((quad[2] & 0x03) << 6) | quad[3])
    };
    for (int j = 0; j < filled - 1; j++)
      ret += static_cast<char>(bytes[j]);
  }

  return ret;
}
|
4
scripts/ems/biconcor/base64.h
Normal file
4
scripts/ems/biconcor/base64.h
Normal file
@ -0,0 +1,4 @@
|
||||
#include <string>
|
||||
|
||||
std::string base64_encode(unsigned char const* , unsigned int len);
|
||||
std::string base64_decode(std::string const& s);
|
@ -3,6 +3,7 @@
|
||||
#include "Alignment.h"
|
||||
#include "PhrasePairCollection.h"
|
||||
#include <getopt.h>
|
||||
#include "base64.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -32,7 +33,7 @@ int main(int argc, char* argv[])
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
int option_index = 0;
|
||||
int c = getopt_long (argc, argv, "l:s:c:q:t:a:h", long_options, &option_index);
|
||||
int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:h", long_options, &option_index);
|
||||
if (c == -1) break;
|
||||
switch (c) {
|
||||
case 'l':
|
||||
@ -53,6 +54,10 @@ int main(int argc, char* argv[])
|
||||
fileNameSource = string(optarg);
|
||||
createFlag = true;
|
||||
break;
|
||||
case 'Q':
|
||||
query = base64_decode(string(optarg));
|
||||
queryFlag = true;
|
||||
break;
|
||||
case 'q':
|
||||
query = string(optarg);
|
||||
queryFlag = true;
|
||||
|
@ -370,14 +370,14 @@ build-generation-custom
|
||||
ignore-unless: AND generation-factors generation-corpus
|
||||
default-name: model/generation-table
|
||||
create-config
|
||||
in: reordering-table phrase-translation-table generation-table LM:binlm biconcor-model
|
||||
in: reordering-table phrase-translation-table generation-table LM:binlm
|
||||
out: config
|
||||
ignore-if: use-hiero INTERPOLATED-LM:script
|
||||
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings
|
||||
default-name: model/moses.ini
|
||||
error: Unknown option
|
||||
create-config-interpolated-lm
|
||||
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm biconcor-model
|
||||
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm
|
||||
out: config
|
||||
ignore-if: use-hiero
|
||||
ignore-unless: INTERPOLATED-LM:script
|
||||
@ -777,6 +777,6 @@ analysis-precision
|
||||
|
||||
[REPORTING] single
|
||||
report
|
||||
in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec
|
||||
in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model
|
||||
out: report
|
||||
default-name: evaluation/report
|
||||
|
@ -1730,7 +1730,6 @@ sub define_training_create_config {
|
||||
my ($config,
|
||||
$reordering_table,$phrase_translation_table,$generation_table,@LM)
|
||||
= &get_output_and_input($step_id);
|
||||
if ($LM[$#LM] =~ /biconcor/ || $LM[$#LM] eq '') { pop @LM; }
|
||||
|
||||
my $cmd = &get_training_setting(9);
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
<?php
|
||||
|
||||
// main page frame, triggers the loading of parts
|
||||
# main page frame, triggers the loading of parts
|
||||
function show_analysis() {
|
||||
global $task,$user,$setup,$id,$set;
|
||||
global $dir;
|
||||
@ -8,13 +8,14 @@ function show_analysis() {
|
||||
head("Analysis: $task ($user), Set $set, Run $id");
|
||||
|
||||
?><script>
|
||||
function show(field,sort,count) {
|
||||
function show(field,sort,count,filter) {
|
||||
var url = '?analysis=' + field + '_show'
|
||||
+ '&setup=<?php print $setup ?>'
|
||||
+ '&id=<?php print $id ?>'
|
||||
+ '&set=<?php print $set ?>'
|
||||
+ '&sort=' + sort
|
||||
+ '&count=' + count;
|
||||
+ '&count=' + count
|
||||
+ '&filter=' + filter;
|
||||
new Ajax.Updater(field, url, { method: 'get', evalScripts: true });
|
||||
}
|
||||
function ngram_show(type,order,count,sort,smooth) {
|
||||
@ -61,7 +62,7 @@ function hide_word_info(sentence) {
|
||||
function show_biconcor(sentence,phrase) {
|
||||
var div = "biconcor-"+sentence;
|
||||
var url = '?analysis=biconcor'
|
||||
+ '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$id); ?>&set=<?php print $set ?>'
|
||||
+ '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$set,$id); ?>&set=<?php print $set ?>'
|
||||
+ '&sentence=' + sentence
|
||||
+ '&phrase=' + encodeURIComponent(phrase);
|
||||
document.getElementById(div).innerHTML = "<center><img src=\"spinner.gif\" width=48 height=48></center>";
|
||||
@ -83,7 +84,7 @@ function close_biconcor(sentence) {
|
||||
<div id="PrecisionRecallDetails"></div>
|
||||
<div id="bleu">(loading...)</div>
|
||||
<script language="javascript">
|
||||
show('bleu','',5);
|
||||
show('bleu','',5,'');
|
||||
</script>
|
||||
</body></html>
|
||||
<?php
|
||||
@ -93,13 +94,14 @@ function precision_by_coverage() {
|
||||
global $experiment,$evalset,$dir,$set,$id;
|
||||
$img_width = 1000;
|
||||
|
||||
print "<h3>Precision by Coverage</h3>";
|
||||
print "<h3>Precision of Input Words by Coverage</h3>";
|
||||
print "The graphs display what ratio of words of a specific type are translated correctly (yellow), and what ratio is deleted (blue).";
|
||||
print " The extend of the boxes is scaled on the x-axis by the number of tokens of the displayed type.";
|
||||
|
||||
// load data
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage");
|
||||
$data = file(get_current_analysis_filename("precision","precision-by-corpus-coverage"));
|
||||
$total = 0;
|
||||
$log_info = array();
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$info[$item[0]]["precision"] = $item[1];
|
||||
@ -125,8 +127,8 @@ function precision_by_coverage() {
|
||||
print "<h4>By log<sub>2</sub>-count in the training corpus</h4>";
|
||||
precision_by_coverage_graph("byCoverage",$log_info,$total,$img_width,SORT_NUMERIC);
|
||||
|
||||
// load factored data
|
||||
$d = dir("$dir/evaluation/$set.analysis.$id");
|
||||
# load factored data
|
||||
$d = dir("$dir/evaluation/$set.analysis.".get_precision_analysis_version($dir,$set,$id));
|
||||
while (false !== ($file = $d->read())) {
|
||||
if (preg_match('/precision-by-corpus-coverage.(.+)$/',$file, $match)) {
|
||||
precision_by_coverage_factored($img_width,$total,$file,$match[1]);
|
||||
@ -136,7 +138,7 @@ function precision_by_coverage() {
|
||||
|
||||
function precision_by_coverage_factored($img_width,$total,$file,$factor_id) {
|
||||
global $dir,$set,$id;
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/$file");
|
||||
$data = file(get_current_analysis_filename("precision",$file));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$factor = $item[0];
|
||||
@ -187,7 +189,7 @@ function precision_by_word($type) {
|
||||
$byFactor = $match[1];
|
||||
}
|
||||
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/precision-by-input-word");
|
||||
$data = file(get_current_analysis_filename("precision","precision-by-input-word"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$line = rtrim($data[$i]);
|
||||
$item = split("\t",$line);
|
||||
@ -204,8 +206,7 @@ function precision_by_word($type) {
|
||||
|
||||
//# filter for factor
|
||||
$word = $item[5];
|
||||
$factor = $item[6];
|
||||
if ($byFactor != "false" && $byFactor != $factor) {
|
||||
if ($byFactor != "false" && $byFactor != $item[6]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -218,7 +219,7 @@ function precision_by_word($type) {
|
||||
|
||||
print "<table border=1><tr><td align=center>Count</td><td align=center colspan=2>Precision</td><td align=center colspan=2>Delete</td><td align=center>Length</td></tr>\n";
|
||||
foreach ($info as $word => $wordinfo) {
|
||||
print "<tr><td align=center>$word</td>";
|
||||
print "<tr><td align=center><a href=\"javascript:show('bleu','order',5,'".base64_encode($word)."')\">$word</a></td>";
|
||||
printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%.1f/%d</font></td>",$wordinfo["precision"]/$wordinfo["total"]*100,"%",$wordinfo["precision"],$wordinfo["total"]);
|
||||
printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%d/%d</font></td>",$wordinfo["delete"]/$wordinfo["total"]*100,"%",$wordinfo["delete"],$wordinfo["total"]);
|
||||
printf("<td align=right>%.3f</td>",$wordinfo["length"]/$wordinfo["total"]);
|
||||
@ -361,7 +362,7 @@ ctx.font = '9px serif';
|
||||
print "</script>";
|
||||
}
|
||||
|
||||
// stats on precision and recall
|
||||
//# stats on precision and recall
|
||||
function precision_recall_details() {
|
||||
?>
|
||||
<table width=100%>
|
||||
@ -389,20 +390,20 @@ ngram_show('recall',4,5,'',0);
|
||||
<?php
|
||||
}
|
||||
|
||||
// stats on ngram precision
|
||||
//# stats on ngram precision
|
||||
function ngram_summary() {
|
||||
global $experiment,$evalset,$dir,$set,$id;
|
||||
|
||||
// load data
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/summary");
|
||||
//# load data
|
||||
$data = file(get_current_analysis_filename("basic","summary"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split(": ",$data[$i]);
|
||||
$info[$item[0]] = $item[1];
|
||||
}
|
||||
|
||||
print "<table cellspacing=5 width=100%><tr><td valign=top align=center bgcolor=#eeeeee>";
|
||||
//foreach (array("precision","recall") as $type) {
|
||||
print "<b>Precision</b>\n";
|
||||
//#foreach (array("precision","recall") as $type) {
|
||||
print "<b>Precision of Output</b>\n";
|
||||
$type = "precision";
|
||||
print "<table><tr><td>$type</td><td>1-gram</td><td>2-gram</td><td>3-gram</td><td>4-gram</td></tr>\n";
|
||||
printf("<tr><td>correct</td><td>%d</td><td>%d</td><td>%d</td><td>%d</td></tr>\n",
|
||||
@ -424,8 +425,8 @@ function ngram_summary() {
|
||||
//}
|
||||
|
||||
print "<A HREF=\"javascript:generic_show('PrecisionRecallDetails','')\">details</A> ";
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage")) {
|
||||
print "| <A HREF=\"javascript:generic_show('PrecisionByCoverage','')\">breakdown by coverage</A> ";
|
||||
if (file_exists(get_current_analysis_filename("precision","precision-by-corpus-coverage"))) {
|
||||
print "| <A HREF=\"javascript:generic_show('PrecisionByCoverage','')\">precision of input by coverage</A> ";
|
||||
}
|
||||
|
||||
print "</td><td valign=top valign=top align=center bgcolor=#eeeeee>";
|
||||
@ -445,8 +446,7 @@ function ngram_summary() {
|
||||
printf("<p>length-diff: %d (%.1f%s)",$info["precision-1-total"]-$info["recall-1-total"],($info["precision-1-total"]-$info["recall-1-total"])/$info["recall-1-total"]*100,"%");
|
||||
|
||||
// coverage
|
||||
$coverage_id = get_coverage_analysis_version($dir,$set,$id);
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$coverage_id/corpus-coverage-summary")) {
|
||||
if (file_exists(get_current_analysis_filename("coverage","corpus-coverage-summary"))) {
|
||||
print "</td><td valign=top align=center bgcolor=#eeeeee>";
|
||||
print "<div id=\"CoverageSummary\">";
|
||||
coverage_summary();
|
||||
@ -454,8 +454,8 @@ function ngram_summary() {
|
||||
}
|
||||
|
||||
// phrase segmentation
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/segmentation") ||
|
||||
file_exists("$dir/evaluation/$set.analysis.$id/rule")) {
|
||||
if (file_exists(get_current_analysis_filename("basic","segmentation")) ||
|
||||
file_exists(get_current_analysis_filename("basic","rule"))) {
|
||||
print "</td><td valign=top align=center bgcolor=#eeeeee>";
|
||||
print "<div id=\"SegmentationSummary\">";
|
||||
segmentation_summary();
|
||||
@ -463,7 +463,7 @@ function ngram_summary() {
|
||||
}
|
||||
|
||||
// rules
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/rule")) {
|
||||
if (file_exists(get_current_analysis_filename("basic","rule"))) {
|
||||
print "</td><td valign=top align=center bgcolor=#eeeeee>";
|
||||
print "<div id=\"RuleSummary\">";
|
||||
rule_summary();
|
||||
@ -479,7 +479,7 @@ function ngram_show($type) {
|
||||
|
||||
// load data
|
||||
$order = $_GET['order'];
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/n-gram-$type.$order");
|
||||
$data = file(get_current_analysis_filename("basic","n-gram-$type.$order"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$line["total"] = $item[0];
|
||||
@ -572,7 +572,7 @@ function coverage_details() {
|
||||
$total[$corpus][$b][$i] = 0;
|
||||
}
|
||||
}
|
||||
$data = file(filename_fallback_to_factored("$dir/evaluation/$set.analysis.$id/$corpus-coverage-summary"));
|
||||
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","$corpus-coverage-summary")));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
if ($item[1]>5) {
|
||||
@ -614,7 +614,7 @@ function coverage_details() {
|
||||
}
|
||||
print "</tr></table>\n";
|
||||
|
||||
$data = file(filename_fallback_to_factored("$dir/evaluation/$set.analysis.$id/ttable-unknown"));
|
||||
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","ttable-unknown")));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($word,$count) = split("\t",$data[$i]);
|
||||
$item["word"] = $word;
|
||||
@ -678,8 +678,7 @@ function filename_fallback_to_factored($file) {
|
||||
|
||||
function factor_name($input_output,$factor_id) {
|
||||
global $dir,$set,$id;
|
||||
$coverage_id = get_coverage_analysis_version($dir,$set,$id);
|
||||
$file = "$dir/evaluation/$set.analysis.$coverage_id/factor-names";
|
||||
$file = get_current_analysis_filename("coverage","factor-names");
|
||||
if (!file_exists($file)) {
|
||||
return $factor_id;
|
||||
}
|
||||
@ -703,8 +702,7 @@ function coverage_summary() {
|
||||
}
|
||||
$total[$corpus][$b] = 0;
|
||||
}
|
||||
$coverage_id = get_coverage_analysis_version($dir,$set,$id);
|
||||
$data = file(filename_fallback_to_factored("$dir/evaluation/$set.analysis.$coverage_id/$corpus-coverage-summary"));
|
||||
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","$corpus-coverage-summary")));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
if ($item[0] == 1) {
|
||||
@ -768,8 +766,9 @@ function segmentation_summary() {
|
||||
}
|
||||
|
||||
$total = 0;
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/segmentation")) {
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/segmentation");
|
||||
$file = get_current_analysis_filename("basic","segmentation");
|
||||
if (file_exists($file)) {
|
||||
$data = file($file);
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($in,$out,$c) = split("\t",$data[$i]);
|
||||
if ($by == "word") { $c *= $in; }
|
||||
@ -780,9 +779,12 @@ function segmentation_summary() {
|
||||
}
|
||||
}
|
||||
else {
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/rule");
|
||||
$data = file(get_current_analysis_filename("basic","rule"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($type,$rule,$c) = split("\t",$data[$i]);
|
||||
$field = split("\t",$data[$i]);
|
||||
$type = $field[0];
|
||||
$rule = $field[1];
|
||||
if (count($field) > 2) { $c = $field[2]; } else { $c = 0; }
|
||||
if ($type == "rule") {
|
||||
list($rule_in,$in,$nt,$rule_out,$out) = split(":",$rule);
|
||||
if ($by == "word") { $c *= $in; }
|
||||
@ -822,9 +824,14 @@ function segmentation_summary() {
|
||||
// hierarchical rules used in translation
|
||||
function rule_summary() {
|
||||
global $dir,$set,$id;
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/rule");
|
||||
$data = file(get_current_analysis_filename("basic","rule"));
|
||||
$rule = array(); $count = array(); $count_nt = array(); $count_w = array();
|
||||
$nt_count = 0; $total = 0;
|
||||
foreach ($data as $item) {
|
||||
list($type,$d,$d2) = split("\t",$item);
|
||||
$field = split("\t",$item);
|
||||
$type = $field[0];
|
||||
$d = $field[1];
|
||||
if (count($field) > 2) { $d2 = $field[2]; } else { $d2 = 0; }
|
||||
if ($type == "sentence-count") {
|
||||
$sentence_count = $d;
|
||||
}
|
||||
@ -843,12 +850,16 @@ function rule_summary() {
|
||||
$rule_out = preg_replace("/b/","y",$rule_out);
|
||||
$rule_out = preg_replace("/c/","z",$rule_out);
|
||||
$nt_count += $d2 * $nt;
|
||||
if (!array_key_exists($d,$rule)) { $rule[$d] = 0; }
|
||||
$rule[$d] += $d2;
|
||||
if (!array_key_exists($nt,$count)) { $count[$nt] = 0; }
|
||||
$count[$nt] += $d2;
|
||||
$just_nt = preg_replace("/\d/","",$rule_in)."-".preg_replace("/\d/","",$rule_out);
|
||||
$no_wc = preg_replace("/\d/","W",$rule_in)."-".preg_replace("/\d/","",$rule_out);
|
||||
if ($just_nt == "-") { $just_nt = "lexical"; }
|
||||
if (!array_key_exists($just_nt,$count_nt)) { $count_nt[$just_nt] = 0; }
|
||||
$count_nt[$just_nt] += $d2;
|
||||
if (!array_key_exists($no_wc,$count_w)) { $count_w[$no_wc] = 0; }
|
||||
$count_w[$no_wc] += $d2;
|
||||
$total += $d2;
|
||||
}
|
||||
@ -866,108 +877,189 @@ function rule_summary() {
|
||||
|
||||
// annotated sentences, navigation
|
||||
function bleu_show() {
|
||||
$count = $_GET['count'];
|
||||
if ($count == 0) { $count = 5; }
|
||||
|
||||
print "<b>annotated sentences</b><br><font size=-1>sorted by ";
|
||||
|
||||
if ($_GET['sort'] == "order" || $_GET['sort'] == "") {
|
||||
print "order ";
|
||||
}
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','order',$count)\">order</A> ";
|
||||
}
|
||||
|
||||
if ($_GET['sort'] == "best") {
|
||||
print "order ";
|
||||
}
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','best',$count)\">best</A> ";
|
||||
}
|
||||
|
||||
if ($_GET['sort'] == "worst") {
|
||||
print "order ";
|
||||
}
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','worst',$count)\">worst</A> ";
|
||||
}
|
||||
|
||||
#print "display <A HREF=\"\">fullscreen</A> ";
|
||||
|
||||
$count = $_GET['count'];
|
||||
if ($count == 0) { $count = 5; }
|
||||
print "showing $count ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count)\">more</A> ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999)\">all</A> ";
|
||||
|
||||
print "</font><BR>\n";
|
||||
$filter = "";
|
||||
if (array_key_exists("filter",$_GET)) {
|
||||
$filter = base64_decode($_GET['filter']);
|
||||
}
|
||||
|
||||
sentence_annotation();
|
||||
print "<p align=center><A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count)\">5 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',10+$count)\">10 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',20+$count)\">20 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',50+$count)\">50 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',100+$count)\">100 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999)\">all</A> ";
|
||||
print "<b>annotated sentences</b><br><font size=-1>sorted by: ";
|
||||
|
||||
if ($_GET['sort'] == "order" || $_GET['sort'] == "") { print "order "; }
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','order',$count,'".base64_encode($filter)."')\">order</A> ";
|
||||
}
|
||||
if ($_GET['sort'] == "best") { print "best "; }
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','best',$count,'".base64_encode($filter)."')\">best</A> ";
|
||||
}
|
||||
if ($_GET['sort'] == "25") { print "25% "; }
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','25',$count,'".base64_encode($filter)."')\">25%</A> ";
|
||||
}
|
||||
if ($_GET['sort'] == "avg") { print "avg "; }
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','avg',$count,'".base64_encode($filter)."')\">avg</A> ";
|
||||
}
|
||||
if ($_GET['sort'] == "75") { print "75% "; }
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','75',$count,'".base64_encode($filter)."')\">75%</A> ";
|
||||
}
|
||||
if ($_GET['sort'] == "worst") { print "worst; "; }
|
||||
else {
|
||||
print "<A HREF=\"javascript:show('bleu','worst',$count,'".base64_encode($filter)."')\">worst</A>; ";
|
||||
}
|
||||
|
||||
print "showing: $count ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count,'".base64_encode($filter)."')\">more</A> ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999,'".base64_encode($filter)."')\">all</A>";
|
||||
|
||||
# Echo the active filter word next to the navigation links.
# $filter is base64-decoded from the request (see above), i.e. it is
# user-controlled text, so it must be HTML-escaped before being printed.
if ($filter != "") {
  print "; filter: '".htmlspecialchars($filter)."'";
}
|
||||
sentence_annotation($count,$filter);
|
||||
print "<p align=center><A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count,'".base64_encode($filter)."')\">5 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',10+$count,'".base64_encode($filter)."')\">10 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',20+$count,'".base64_encode($filter)."')\">20 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',50+$count,'".base64_encode($filter)."')\">50 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',100+$count,'".base64_encode($filter)."')\">100 more</A> | ";
|
||||
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999,'".base64_encode($filter)."')\">all</A> ";
|
||||
}
|
||||
|
||||
// annotated sentences core: reads data, sorts sentences, displays them
|
||||
function sentence_annotation() {
|
||||
function sentence_annotation($count,$filter) {
|
||||
global $set,$id,$dir,$biconcor;
|
||||
|
||||
// load data
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/bleu-annotation");
|
||||
# get input
# $filtered collects the indices of input sentences that do NOT contain the
# filter word; it stays empty when no filter is given or no input file exists.
$filtered = array();
$file = get_current_analysis_filename("coverage","input-annotation");
if (file_exists($file)) {
  $input = file($file);
  # filter if so specified
  if ($filter != "") {
    for($i=0;$i<count($input);$i++) {
      # only the text before the first tab (the words) is searched
      $item = explode("\t",$input[$i]);
      $word = explode(" ",$item[0]);
      # strict match: a loose == treats numeric strings such as "10" and
      # "10.0" as equal and would retain sentences without the filter word
      if (!in_array($filter,$word,true)) { $filtered[$i] = 1; }
    }
  }
}
|
||||
|
||||
# load bleu scores
|
||||
$data = file(get_current_analysis_filename("basic","bleu-annotation"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$line["bleu"] = $item[0];
|
||||
$line["id"] = $item[1];
|
||||
$line["system"] = $item[2];
|
||||
$line["reference"] = "";
|
||||
for($j=3;$j<count($item);$j++) {
|
||||
if ($j>3) { $line["reference"] .= "<br>"; };
|
||||
$line["reference"] .= $item[$j];
|
||||
}
|
||||
$bleu[] = $line;
|
||||
$item = split("\t",$data[$i]);
|
||||
if (! array_key_exists($item[1],$filtered)) {
|
||||
$line["bleu"] = $item[0];
|
||||
$line["id"] = $item[1];
|
||||
$line["system"] = $item[2];
|
||||
$line["reference"] = "";
|
||||
for($j=3;$j<count($item);$j++) {
|
||||
if ($j>3) { $line["reference"] .= "<br>"; };
|
||||
$line["reference"] .= $item[$j];
|
||||
}
|
||||
$bleu[] = $line;
|
||||
}
|
||||
}
|
||||
|
||||
$coverage_id = get_coverage_analysis_version($dir,$set,$id);
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$coverage_id/input-annotation")) {
|
||||
$input = file("$dir/evaluation/$set.analysis.$coverage_id/input-annotation");
|
||||
# sort and label additional sentences as filtered
|
||||
global $sort;
|
||||
# Comparison callback for usort() over the sentence records.
# $a, $b: arrays with at least the keys "id" and "bleu".
# The ordering is selected by the global $sort:
#   "worst", "75"        - ascending BLEU, ties broken by descending id
#   "best", "avg", "25"  - descending BLEU, ties broken by ascending id
#   "order" or anything else - ascending id
# Returns -1, 0 or 1 as usort() expects.
function cmp($a, $b) {
  global $sort;
  if ($sort == "worst" || $sort == "75") {
    $a_idx = $a["bleu"];
    $b_idx = $b["bleu"];
    if ($a_idx == $b_idx) {
      # equal scores: compare the ids the other way round
      $a_idx = $b["id"];
      $b_idx = $a["id"];
    }
  }
  else if ($sort == "best" || $sort == "avg" || $sort == "25") {
    # negate the scores so that the higher BLEU sorts first
    $a_idx = -$a["bleu"];
    $b_idx = -$b["bleu"];
    if ($a_idx == $b_idx) {
      $a_idx = $a["id"];
      $b_idx = $b["id"];
    }
  }
  else {
    # "order", and also the fallback for unrecognised sort values, which
    # previously left both keys undefined
    $a_idx = $a["id"];
    $b_idx = $b["id"];
  }
  if ($a_idx == $b_idx) {
    return 0;
  }
  return ($a_idx < $b_idx) ? -1 : 1;
}
|
||||
$sort = $_GET['sort'];
|
||||
if ($sort == '') {
|
||||
$sort = "order";
|
||||
}
|
||||
usort($bleu, 'cmp');
|
||||
|
||||
$offset = 0;
|
||||
if ($sort == "25" || $sort == "75") {
|
||||
$offset = (int) (count($bleu)/4);
|
||||
}
|
||||
else if ($sort == "avg") {
|
||||
$offset = (int) (count($bleu)/2);
|
||||
}
|
||||
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/segmentation-annotation")) {
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/segmentation-annotation");
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$segment = 0;
|
||||
foreach (split(" ",$data[$i]) as $item) {
|
||||
list($in_start,$in_end,$out_start,$out_end) = split(":",$item);
|
||||
$segment++;
|
||||
$segmentation[$i]["input_start"][$in_start] = $segment;
|
||||
$segmentation[$i]["input_end"][$in_end] = $segment;
|
||||
$segmentation[$i]["output_start"][$out_start] = $segment;
|
||||
$segmentation[$i]["output_end"][$out_end+0] = $segment;
|
||||
$retained = array();
|
||||
for($i=$offset;$i<$count+$offset && $i<count($bleu);$i++) {
|
||||
$line = $bleu[$i];
|
||||
$retained[$line["id"]] = 1;
|
||||
}
|
||||
|
||||
# get segmentation (phrase alignment)
|
||||
$file = get_current_analysis_filename("basic","segmentation-annotation");
|
||||
if (file_exists($file)) {
|
||||
$data = file($file);
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
if ($filter == "" || array_key_exists($i,$retained)) {
|
||||
$segment = 0;
|
||||
foreach (split(" ",$data[$i]) as $item) {
|
||||
list($in_start,$in_end,$out_start,$out_end) = split(":",$item);
|
||||
$segment++;
|
||||
$segmentation[$i]["input_start"][$in_start] = $segment;
|
||||
$segmentation[$i]["input_end"][$in_end] = $segment;
|
||||
$segmentation[$i]["output_start"][$out_start] = $segment;
|
||||
$segmentation[$i]["output_end"][$out_end+0] = $segment;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// hierarchical data
|
||||
# get hierarchical data
|
||||
$hierarchical = 0;
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/input-tree")) {
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/input-tree");
|
||||
$span = 0;
|
||||
$last_sentence = -1;
|
||||
$nt_count = array();
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
|
||||
if ($sentence != $last_sentence) { $span = 0; }
|
||||
$last_sentence = $sentence;
|
||||
$segmentation[$sentence][$span]["brackets"] = $brackets;
|
||||
$file = get_current_analysis_filename("basic","input-tree");
|
||||
if (file_exists($file)) {
|
||||
$data = file($file);
|
||||
$span = 0;
|
||||
$last_sentence = -1;
|
||||
$nt_count = array();
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
|
||||
if ($sentence != $last_sentence) { $span = 0; }
|
||||
$last_sentence = $sentence;
|
||||
if ($filter == "" || array_key_exists($sentence,$retained)) {
|
||||
$segmentation[$sentence][$span]["brackets"] = $brackets;
|
||||
# $segmentation[$sentence][$span]["nt"] = $nt;
|
||||
$segmentation[$sentence][$span]["words"] = rtrim($words);
|
||||
if ($nt != "") { $nt_count[$nt]++; }
|
||||
$span++;
|
||||
$segmentation[$sentence][$span]["words"] = rtrim($words);
|
||||
if ($nt != "") { $nt_count[$nt]=1; }
|
||||
$span++;
|
||||
}
|
||||
$hierarchical = 1;
|
||||
}
|
||||
$hierarchical = 1;
|
||||
# if (count($nt_count) <= 2) {
|
||||
# foreach ($segmentation as $sentence => $segmentation_span) {
|
||||
# foreach ($segmentation_span as $span => $type) {
|
||||
@ -976,108 +1068,78 @@ function sentence_annotation() {
|
||||
# }
|
||||
# }
|
||||
}
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/output-tree")) {
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/output-tree");
|
||||
$span = 0;
|
||||
$last_sentence = -1;
|
||||
$nt_count = array();
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
|
||||
if ($sentence != $last_sentence) { $span = 0; }
|
||||
$last_sentence = $sentence;
|
||||
$segmentation_out[$sentence][$span]["brackets"] = $brackets;
|
||||
$segmentation_out[$sentence][$span]["nt"] = $nt;
|
||||
$segmentation_out[$sentence][$span]["words"] = rtrim($words);
|
||||
if ($nt != "") { $nt_count[$nt]++; }
|
||||
$span++;
|
||||
$file = get_current_analysis_filename("basic","output-tree");
|
||||
if (file_exists($file)) {
|
||||
$data = file($file);
|
||||
$span = 0;
|
||||
$last_sentence = -1;
|
||||
$nt_count = array();
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
|
||||
if ($sentence != $last_sentence) { $span = 0; }
|
||||
$last_sentence = $sentence;
|
||||
if ($filter == "" || array_key_exists($sentence,$retained)) {
|
||||
$segmentation_out[$sentence][$span]["brackets"] = $brackets;
|
||||
$segmentation_out[$sentence][$span]["nt"] = $nt;
|
||||
$segmentation_out[$sentence][$span]["words"] = rtrim($words);
|
||||
if ($nt != "") { $nt_count[$nt]=1; }
|
||||
$span++;
|
||||
}
|
||||
if (count($nt_count) <= 2) {
|
||||
foreach ($segmentation_out as $sentence => $segmentation_span) {
|
||||
foreach ($segmentation_span as $span => $type) {
|
||||
$segmentation_out[$sentence][$span]["nt"]="";
|
||||
}
|
||||
}
|
||||
}
|
||||
# no non-terminal markup, if there are two or less non-terminals (X,S)
|
||||
if (count($nt_count) <= 2) {
|
||||
foreach ($segmentation_out as $sentence => $segmentation_span) {
|
||||
foreach ($segmentation_span as $span => $type) {
|
||||
$segmentation_out[$sentence][$span]["nt"]="";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/node")) {
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/node");
|
||||
$n = 0;
|
||||
$last_sentence = -1;
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($sentence,$depth,$start_div,$end_div,$start_div_in,$end_div_in,$children) = split(" ",$data[$i]);
|
||||
if ($sentence != $last_sentence) { $n = 0; }
|
||||
$last_sentence = $sentence;
|
||||
$node[$sentence][$n]['depth'] = $depth;
|
||||
$node[$sentence][$n]['start_div'] = $start_div;
|
||||
$node[$sentence][$n]['end_div'] = $end_div;
|
||||
$node[$sentence][$n]['start_div_in'] = $start_div_in;
|
||||
$node[$sentence][$n]['end_div_in'] = $end_div_in;
|
||||
$node[$sentence][$n]['children'] = rtrim($children);
|
||||
$n++;
|
||||
$file = get_current_analysis_filename("basic","node");
|
||||
if (file_exists($file)) {
|
||||
$data = file($file);
|
||||
$n = 0;
|
||||
$last_sentence = -1;
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
list($sentence,$depth,$start_div,$end_div,$start_div_in,$end_div_in,$children) = split(" ",$data[$i]);
|
||||
if ($sentence != $last_sentence) { $n = 0; }
|
||||
$last_sentence = $sentence;
|
||||
if ($filter == "" || array_key_exists($sentence,$retained)) {
|
||||
$node[$sentence][$n]['depth'] = $depth;
|
||||
$node[$sentence][$n]['start_div'] = $start_div;
|
||||
$node[$sentence][$n]['end_div'] = $end_div;
|
||||
$node[$sentence][$n]['start_div_in'] = $start_div_in;
|
||||
$node[$sentence][$n]['end_div_in'] = $end_div_in;
|
||||
$node[$sentence][$n]['children'] = rtrim($children);
|
||||
$n++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$biconcor = get_biconcor_version($dir,$id);
|
||||
|
||||
// sort
|
||||
global $sort;
|
||||
$sort = $_GET['sort'];
|
||||
if ($sort == '') {
|
||||
$sort = "order";
|
||||
# display
|
||||
if ($filter != "") {
|
||||
print " (".(count($input)-count($filtered))." retaining)";
|
||||
}
|
||||
function cmp($a, $b) {
|
||||
global $sort;
|
||||
if ($sort == "order") {
|
||||
$a_idx = $a["id"];
|
||||
$b_idx = $b["id"];
|
||||
}
|
||||
else if ($sort == "worst") {
|
||||
$a_idx = $a["bleu"];
|
||||
$b_idx = $b["bleu"];
|
||||
if ($a_idx == $b_idx) {
|
||||
$a_idx = $b["id"];
|
||||
$b_idx = $a["id"];
|
||||
}
|
||||
}
|
||||
else if ($sort == "best") {
|
||||
$a_idx = -$a["bleu"];
|
||||
$b_idx = -$b["bleu"];
|
||||
if ($a_idx == $b_idx) {
|
||||
$a_idx = $a["id"];
|
||||
$b_idx = $b["id"];
|
||||
}
|
||||
}
|
||||
print "</font><BR>\n";
|
||||
|
||||
if ($a_idx == $b_idx) {
|
||||
return 0;
|
||||
}
|
||||
return ($a_idx < $b_idx) ? -1 : 1;
|
||||
}
|
||||
|
||||
usort($bleu, 'cmp');
|
||||
|
||||
$count = $_GET['count'];
|
||||
if ($count == 0) { $count = 5; }
|
||||
|
||||
// display
|
||||
//print "<div id=\"debug\"></div>";
|
||||
for($i=0;$i<$count && $i<count($bleu);$i++) {
|
||||
$biconcor = get_biconcor_version($dir,$set,$id);
|
||||
//print "<div id=\"debug\">$sort / $offset</div>";
|
||||
for($i=$offset;$i<$count+$offset && $i<count($bleu);$i++) {
|
||||
$line = $bleu[$i];
|
||||
if ($hierarchical) {
|
||||
annotation_hierarchical($line["id"],$segmentation[$line["id"]],$segmentation_out[$line["id"]],$node[$line["id"]]);
|
||||
}
|
||||
if ($input) {
|
||||
print "<div id=\"info-$i\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">8364 occ. in corpus, 56 translations, entropy: 5.54</div>\n";
|
||||
print "<div id=\"info-".$line["id"]."\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">0 occ. in corpus, 0 translations, entropy: 0.00</div>\n";
|
||||
if ($biconcor) {
|
||||
//print "<div id=\"biconcor-$i\" style=\"display: none;\">xxx</div>";
|
||||
print "<div id=\"biconcor-$i\" class=\"biconcor\">xxx</div>";
|
||||
print "<div id=\"biconcor-".$line["id"]."\" class=\"biconcor\"><font size=-2>(click on input phrase for bilingual concordancer)</font></div>";
|
||||
}
|
||||
if ($hierarchical) {
|
||||
sentence_annotation_hierarchical("#".$line["id"],$line["id"],$input[$line["id"]],$segmentation[$line["id"]],"in");
|
||||
}
|
||||
else {
|
||||
print "<font size=-2>[#".$line["id"]."]</font> ";
|
||||
input_annotation($line["id"],$input[$line["id"]],$segmentation[$line["id"]]);
|
||||
input_annotation($line["id"],$input[$line["id"]],$segmentation[$line["id"]],$filter);
|
||||
}
|
||||
}
|
||||
//else {
|
||||
@ -1099,19 +1161,20 @@ function coverage($coverage_vector) {
|
||||
$coverage = array();
|
||||
foreach (split(" ",$coverage_vector) as $item) {
|
||||
if (preg_match("/[\-:]/",$item)) {
|
||||
list($from,$to,$corpus_count,$ttable_count,$ttable_entropy) = preg_split("/[\-:]/",$item);
|
||||
$coverage[$from][$to]["corpus_count"] = $corpus_count;
|
||||
$coverage[$from][$to]["ttable_count"] = $ttable_count;
|
||||
$coverage[$from][$to]["ttable_entropy"] = $ttable_entropy;
|
||||
// Split the item on '-' or ':' into from, to and up to three statistics
// (corpus count, ttable count, ttable entropy); the statistics are optional.
$field = preg_split("/[\-:]/",$item);
$from = $field[0];
$to = $field[1];
if (count($field)>2){ $coverage[$from][$to]["corpus_count"]=$field[2]; }
if (count($field)>3){ $coverage[$from][$to]["ttable_count"]=$field[3]; }
// the key must be "ttable_entropy": that is what input_annotation() reads
if (count($field)>4){ $coverage[$from][$to]["ttable_entropy"]=$field[4]; }
|
||||
}
|
||||
}
|
||||
$word = split(" ",$words);
|
||||
|
||||
return $coverage;
|
||||
}
|
||||
|
||||
// annotate an input sentence
|
||||
function input_annotation($sentence,$input,$segmentation) {
|
||||
function input_annotation($sentence,$input,$segmentation,$filter) {
|
||||
global $biconcor;
|
||||
list($words,$coverage_vector) = split("\t",$input);
|
||||
|
||||
@ -1187,10 +1250,10 @@ function input_annotation($sentence,$input,$segmentation) {
|
||||
for($j=$from;$j<=$to;$j++) {
|
||||
if ($j>$from) { $phrase .= " "; }
|
||||
$phrase .= $word[$j];
|
||||
$highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
|
||||
$lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
|
||||
$highlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='#ffff80';";
|
||||
$lowlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
|
||||
}
|
||||
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($phrase)."');\"":"").">";
|
||||
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($phrase)."');\"":"").">";
|
||||
}
|
||||
print "</div></td>";
|
||||
$from += $size-1;
|
||||
@ -1218,7 +1281,14 @@ function input_annotation($sentence,$input,$segmentation) {
|
||||
$color = '#ffffff';
|
||||
$cc = 0; $tc = 0; $te = 0;
|
||||
}
|
||||
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".htmlspecialchars($word[$j])."');\"":"").">$word[$j]</span>";
|
||||
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($word[$j])."');\"":"").">";
|
||||
if ($word[$j] == $filter) {
|
||||
print "<b><font color=#ff0000>".$word[$j]."</font></b>";
|
||||
}
|
||||
else {
|
||||
print $word[$j];
|
||||
}
|
||||
print "</span>";
|
||||
if ($segmentation && array_key_exists($j,$segmentation["input_end"])) {
|
||||
print "</span>";
|
||||
}
|
||||
@ -1295,7 +1365,7 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
function sentence_annotation_hierarchical($info,$sentence,$sequence,$segmentation,$in_out) {
|
||||
$In_Out = $in_out == "out" ? "Out" : "In";
|
||||
|
||||
list($words,$coverage_vector) = split("\t",$input);
|
||||
#list($words,$coverage_vector) = split("\t",$input);
|
||||
$coverage = coverage($sequence);
|
||||
$word = preg_split("/\s/",$sequence);
|
||||
|
||||
@ -1322,7 +1392,8 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
$words = $segmentation[$span]["words"];
|
||||
|
||||
# non terminal
|
||||
if ($segmentation[$span]["nt"]) {
|
||||
if (array_key_exists("nt",$segmentation[$span]) &&
|
||||
$segmentation[$span]["nt"] != "") {
|
||||
print $segmentation[$span]["nt"].": ";
|
||||
}
|
||||
|
||||
@ -1359,16 +1430,16 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node
|
||||
function biconcor($query) {
|
||||
global $set,$id,$dir;
|
||||
$sentence = $_GET['sentence'];
|
||||
$biconcor = get_biconcor_version($dir,$id);
|
||||
$biconcor = get_biconcor_version($dir,$set,$id);
|
||||
print "<center>
|
||||
<form action=\"...\" method=get>
|
||||
<form method=get id=\"BiconcorForm\">
|
||||
<img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\">
|
||||
<input width=20 value=\"$query\">
|
||||
<input type=submit value=\"look up\">
|
||||
<input width=20 id=\"BiconcorQuery\" value=\"$query\">
|
||||
<input type=submit onclick=\"show_biconcor($sentence,encodeBase64(document.getElementById('BiconcorQuery').value));\" value=\"look up\">
|
||||
</form>
|
||||
<div class=\"biconcor-content\">";
|
||||
$cmd = "./biconcor -l $dir/model/biconcor.$biconcor -q ".escapeshellarg($query)." 2>/dev/null";
|
||||
# print $cmd."<p>";
|
||||
$cmd = "./biconcor -l $dir/model/biconcor.$biconcor -Q ".base64_encode($query)." 2>/dev/null";
|
||||
#print $cmd."<p>";
|
||||
system($cmd);
|
||||
# print "<p>done.";
|
||||
print "</div></center>";
|
||||
|
@ -73,8 +73,9 @@ function precision_by_coverage_diff() {
|
||||
print "The graphs display what ratio of words of a specific type are translated correctly (yellow), and what ratio is deleted (blue).";
|
||||
print " The extend of the boxes is scaled on the x-axis by the number of tokens of the displayed type.";
|
||||
// load data
|
||||
$data = file("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage");
|
||||
$data = file(get_current_analysis_filename2("precision","precision-by-corpus-coverage"));
|
||||
$total = 0;
|
||||
$log_info = array();
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$info[$item[0]]["precision"] = $item[1];
|
||||
@ -100,7 +101,7 @@ function precision_by_coverage_diff() {
|
||||
$log_info_new = $log_info;
|
||||
|
||||
// load base data
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage");
|
||||
$data = file(get_current_analysis_filename("precision","precision-by-corpus-coverage"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$info[$item[0]]["precision"] -= $item[1];
|
||||
@ -119,10 +120,10 @@ function precision_by_coverage_diff() {
|
||||
precision_by_coverage_diff_graph("byCoverage",$log_info,$log_info_new,$total,$img_width,SORT_NUMERIC);
|
||||
|
||||
// load factored data
|
||||
$d = dir("$dir/evaluation/$set.analysis.$id");
|
||||
$d = dir("$dir/evaluation/$set.analysis.".get_precision_analysis_version($dir,$set,$id));
|
||||
while (false !== ($file = $d->read())) {
|
||||
if (preg_match('/precision-by-corpus-coverage.(.+)$/',$file, $match) &&
|
||||
file_exists("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage.$match[1]")) {
|
||||
file_exists(get_current_analysis_filename2("precision","precision-by-corpus-coverage.$match[1]"))) {
|
||||
precision_by_coverage_diff_factored($img_width,$total,$file,$match[1]);
|
||||
}
|
||||
}
|
||||
@ -130,7 +131,7 @@ function precision_by_coverage_diff() {
|
||||
|
||||
function precision_by_coverage_diff_factored($img_width,$total,$file,$factor_id) {
|
||||
global $dir,$set,$id,$id2;
|
||||
$data = file("$dir/evaluation/$set.analysis.$id2/$file");
|
||||
$data = file(get_current_analysis_filename2("precision",$file));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$factor = $item[0];
|
||||
@ -158,7 +159,7 @@ function precision_by_coverage_diff_factored($img_width,$total,$file,$factor_id)
|
||||
$log_info_factored_new = $log_info_factored;
|
||||
|
||||
// baseline data
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/$file");
|
||||
$data = file(get_current_analysis_filename("precision",$file));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$factor = $item[0];
|
||||
@ -205,7 +206,9 @@ function precision_by_word_diff($type) {
|
||||
$byFactor = $match[1];
|
||||
}
|
||||
|
||||
$data = file("$dir/evaluation/$set.analysis.$id2/precision-by-input-word");
|
||||
$data = file(get_current_analysis_filename2("precision","precision-by-input-word"));
|
||||
$total = 0;
|
||||
$info = array();
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$line = rtrim($data[$i]);
|
||||
$item = split("\t",$line);
|
||||
@ -215,19 +218,23 @@ function precision_by_word_diff($type) {
|
||||
$count = $item[4];
|
||||
$log_count = -1;
|
||||
if ($count>0) {
|
||||
$log_count = (int) (log($count)/log(2));
|
||||
$log_count = (int) (log($count)/log(2));
|
||||
}
|
||||
if ($byCoverage != -2 && $byCoverage != $log_count) {
|
||||
continue;
|
||||
continue;
|
||||
}
|
||||
|
||||
//# filter for factor
|
||||
$word = $item[5];
|
||||
$factor = $item[6];
|
||||
if ($byFactor != "false" && $byFactor != $factor) {
|
||||
continue;
|
||||
if ($byFactor != "false" && $byFactor != $item[6]) {
|
||||
continue;
|
||||
}
|
||||
if (!array_key_exists($word,$info)) {
|
||||
$info[$word]["precision"] = 0;
|
||||
$info[$word]["delete"] = 0;
|
||||
$info[$word]["length"] = 0;
|
||||
$info[$word]["total"] = 0;
|
||||
}
|
||||
|
||||
$info[$word]["precision"] += $item[0];
|
||||
$info[$word]["delete"] += $item[1];
|
||||
$info[$word]["length"] += $item[2];
|
||||
@ -235,7 +242,7 @@ function precision_by_word_diff($type) {
|
||||
}
|
||||
$info_new = $info;
|
||||
|
||||
$data = file("$dir/evaluation/$set.analysis.$id/precision-by-input-word");
|
||||
$data = file(get_current_analysis_filename("precision","precision-by-input-word"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$line = rtrim($data[$i]);
|
||||
$item = split("\t",$line);
|
||||
@ -252,11 +259,19 @@ function precision_by_word_diff($type) {
|
||||
|
||||
//# filter for factor
|
||||
$word = $item[5];
|
||||
$factor = $item[6];
|
||||
if ($byFactor != "false" && $byFactor != $factor) {
|
||||
if ($byFactor != "false" && $byFactor != $item[6]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!array_key_exists($word,$info)) {
|
||||
$info[$word]["precision"] = 0;
|
||||
$info[$word]["delete"] = 0;
|
||||
$info[$word]["length"] = 0;
|
||||
$info_new[$word]["length"] = 0;
|
||||
$info_new[$word]["delete"] = 0;
|
||||
$info_new[$word]["precision"] = 0;
|
||||
$info_new[$word]["total"] = 0;
|
||||
$info[$word]["total"] = -$item[3];
|
||||
}
|
||||
$info[$word]["precision"] -= $item[0];
|
||||
$info[$word]["delete"] -= $item[1];
|
||||
$info[$word]["length"] -= $item[2];
|
||||
@ -308,14 +323,14 @@ ctx.font = '9px serif';
|
||||
$height = 90-$line/2*180;
|
||||
print "ctx.moveTo(20, $height);\n";
|
||||
print "ctx.lineTo($img_width, $height);\n";
|
||||
print "ctx.fillText(\"".sprintf("%d",10*${line}*1.001)."\%\", 0, $height+4);";
|
||||
print "ctx.fillText(\"".sprintf("%d",10 * $line * 1.001)."\%\", 0, $height+4);";
|
||||
}
|
||||
for($line=-0.4;$line<=0.4;$line+=.2) {
|
||||
$height = 250+$line/2*180;
|
||||
print "ctx.moveTo(20, $height);\n";
|
||||
print "ctx.lineTo($img_width, $height);\n";
|
||||
if ($line != 0) {
|
||||
print "ctx.fillText(\"".sprintf("%d",10*${line}*1.001)."\%\", 0, $height+4);";
|
||||
print "ctx.fillText(\"".sprintf("%d",10 * $line * 1.001)."\%\", 0, $height+4);";
|
||||
}
|
||||
}
|
||||
print "ctx.strokeStyle = \"rgb(100,100,100)\"; ctx.stroke();\n";
|
||||
@ -385,7 +400,7 @@ function ngram_summary_diff() {
|
||||
|
||||
// load data
|
||||
for($idx=0;$idx<2;$idx++) {
|
||||
$data = file("$dir/evaluation/$set.analysis.".($idx?$id2:$id)."/summary");
|
||||
$data = file(get_analysis_filename($dir,$set,$idx?$id2:$id,"basic","summary"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split(": ",$data[$i]);
|
||||
$info[$idx][$item[0]] = $item[1];
|
||||
@ -393,7 +408,7 @@ function ngram_summary_diff() {
|
||||
}
|
||||
|
||||
print "<table cellspacing=5 width=100%><tr><td valign=top align=center bgcolor=#eeeeee>";
|
||||
print "<b>Precision</b><br>";
|
||||
print "<b>Precision of Output</b><br>";
|
||||
//foreach (array("precision","recall") as $type) {
|
||||
$type = "precision";
|
||||
print "<table><tr><td>$type</td><td>1-gram</td><td>2-gram</td><td>3-gram</td><td>4-gram</td></tr>\n";
|
||||
@ -416,12 +431,11 @@ function ngram_summary_diff() {
|
||||
//}
|
||||
|
||||
print "<A HREF=\"javascript:generic_show_diff('PrecisionRecallDetailsDiff','')\">details</A> ";
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage") &&
|
||||
file_exists("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage")) {
|
||||
print "| <A HREF=\"javascript:generic_show_diff('PrecisionByCoverageDiff','')\">breakdown by coverage</A> ";
|
||||
if (file_exists(get_current_analysis_filename("precision","precision-by-corpus-coverage")) &&
|
||||
file_exists(get_current_analysis_filename2("precision","precision-by-corpus-coverage"))) {
|
||||
print "| <A HREF=\"javascript:generic_show_diff('PrecisionByCoverageDiff','')\">precision of input by coverage</A> ";
|
||||
}
|
||||
|
||||
|
||||
print "</td><td valign=top align=center bgcolor=#eeeeee>";
|
||||
print "<b>Metrics</b><br>\n";
|
||||
|
||||
@ -434,6 +448,7 @@ function ngram_summary_diff() {
|
||||
}
|
||||
}
|
||||
}
|
||||
$header = ""; $score_line = ""; $diff_line = "";
|
||||
foreach ($score as $name => $value) {
|
||||
$header .= "<td>$name</td>";
|
||||
$score_line .= "<td>".$score[$name][1]."</td>";
|
||||
@ -494,7 +509,7 @@ function bleu_diff_annotation() {
|
||||
|
||||
// load data
|
||||
for($idx=0;$idx<2;$idx++) {
|
||||
$data = file("$dir/evaluation/$set.analysis.".($idx?$id2:$id)."/bleu-annotation");
|
||||
$data = file(get_analysis_filename($dir,$set,$idx?$id2:$id,"basic","bleu-annotation"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$annotation[$item[1]]["bleu$idx"] = $item[0];
|
||||
@ -505,6 +520,7 @@ function bleu_diff_annotation() {
|
||||
}
|
||||
$data = array();
|
||||
|
||||
$identical=0; $same=0; $better=0; $worse=0;
|
||||
for($i=0;$i<count($annotation);$i++) {
|
||||
if ($annotation[$i]["system1"] == $annotation[$i]["system0"]) {
|
||||
$identical++;
|
||||
@ -609,7 +625,7 @@ function ngram_diff($type) {
|
||||
$order = $_GET['order'];
|
||||
|
||||
for($idx=0;$idx<2;$idx++) {
|
||||
$data = file("$dir/evaluation/$set.analysis.".($idx?$id2:$id)."/n-gram-$type.$order");
|
||||
$data = file(get_analysis_filename($dir,$set,$idx?$id2:$id,"basic","n-gram-$type.$order"));
|
||||
for($i=0;$i<count($data);$i++) {
|
||||
$item = split("\t",$data[$i]);
|
||||
$ngram_hash[$item[2]]["total$idx"] = $item[0];
|
||||
|
@ -1,11 +1,18 @@
|
||||
.pp_head {
|
||||
font-size: 150%;
|
||||
font-size: 90%;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.pp_target_header {
|
||||
font-size: 120%;
|
||||
font-size: 80%;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.pp_singleton_header {
|
||||
font-size: 80%;
|
||||
font-variant: small-caps;
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
@ -29,23 +36,62 @@ td.biconcor {
|
||||
}
|
||||
|
||||
td.pp_source_left {
|
||||
font-size: 70%;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
td.pp_target_left {
|
||||
font-size: 70%;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
td.pp_source {
|
||||
font-size: 70%;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
td.pp_target {
|
||||
font-size: 70%;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
td.mismatch_target {
|
||||
font-size: 70%;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
td.pp_source_right {
|
||||
font-size: 70%;
|
||||
border-style:solid;
|
||||
border-width:0px 2px 0px 0px ;
|
||||
border-color: black;
|
||||
}
|
||||
|
||||
td.pp_target_right {
|
||||
font-size: 70%;
|
||||
}
|
||||
|
||||
span.null_aligned {
|
||||
color: blue;
|
||||
}
|
||||
|
||||
span.mismatch_pre_aligned {
|
||||
color: purple;
|
||||
}
|
||||
|
||||
span.mismatch_post_aligned {
|
||||
color: olive;
|
||||
}
|
||||
|
||||
span.mismatch_misaligned {
|
||||
color: red;
|
||||
}
|
||||
|
||||
span.mismatch_aligned {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
td.pp_more {
|
||||
font-size: 70%;
|
||||
text-align: center;
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
h2 {
|
||||
font:italic x-large/1.75 'Essays 1743','Times New Roman',serif;text-shadow:0 0 1px #667
|
||||
font:italic x-large/1.75 'Essays 1743','Times New Roman',serif;
|
||||
text-shadow:0 0 1px #667
|
||||
}
|
||||
~
|
||||
|
||||
|
@ -12,6 +12,7 @@ function head($title) {
|
||||
<script language="javascript" src="/javascripts/prototype.js"></script>
|
||||
<script language="javascript" src="/javascripts/scriptaculous.js"></script>
|
||||
<script language="javascript" src="hierarchical-segmentation.js"></script>
|
||||
<script language="javascript" src="base64.js"></script>
|
||||
<link href="general.css" rel="stylesheet" type="text/css">
|
||||
<link href="hierarchical-segmentation.css" rel="stylesheet" type="text/css">
|
||||
<link href="bilingual-concordance.css" rel="stylesheet" type="text/css">
|
||||
@ -29,7 +30,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) {
|
||||
$action = $_GET["analysis"];
|
||||
$set = $_GET["set"];
|
||||
$id = $_GET["id"];
|
||||
$id2 = $_GET["id2"];
|
||||
if (array_key_exists("id2",$_GET)) { $id2 = $_GET["id2"]; }
|
||||
if ($action == "show") { show_analysis(); }
|
||||
else if ($action == "bleu_show") { bleu_show(); }
|
||||
else if ($action == "ngram_precision_show") { ngram_show("precision");}
|
||||
@ -43,7 +44,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) {
|
||||
else if (preg_match("/PrecisionByWord(.+)_show/",$action,$match)) { precision_by_word($match[1]); }
|
||||
else if ($action == "CoverageDetails_show") { coverage_details(); }
|
||||
else if ($action == "SegmentationSummary_show") { segmentation_summary(); }
|
||||
else if ($action == "biconcor") { biconcor($_GET["phrase"]); }
|
||||
else if ($action == "biconcor") { biconcor(base64_decode($_GET["phrase"])); }
|
||||
else { print "ERROR! $action"; }
|
||||
}
|
||||
else if (array_key_exists("analysis_diff_home",$_GET)) {
|
||||
|
@ -124,48 +124,136 @@ function process_file_entry($dir,$entry) {
|
||||
}
|
||||
}
|
||||
|
||||
function get_coverage_analysis_version($dir,$set,$id) {
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$id/input-annotation")) {
|
||||
return $id;
|
||||
function get_analysis_version($dir,$set,$id) {
|
||||
global $analysis_version;
|
||||
if ($analysis_version
|
||||
&& array_key_exists($id,$analysis_version)
|
||||
&& array_key_exists($set,$analysis_version[$id])) {
|
||||
#reset($analysis_version[$id][$set]);
|
||||
#print "$id,$set ( ";
|
||||
#while(list($type,$i) = each($analysis_version[$id][$set])) {
|
||||
# print "$type=$i ";
|
||||
#}
|
||||
#print ") FROM CACHE<br>";
|
||||
return $analysis_version[$id][$set];
|
||||
}
|
||||
$analysis_version[$id][$set]["basic"] = 0;
|
||||
$analysis_version[$id][$set]["biconcor"] = 0;
|
||||
$analysis_version[$id][$set]["coverage"] = 0;
|
||||
$analysis_version[$id][$set]["precision"] = 0;
|
||||
$prefix = "$dir/evaluation/$set.analysis";
|
||||
|
||||
# produced by the run itself ?
|
||||
if (file_exists("$prefix.$id/summary")) {
|
||||
$analysis_version[$id][$set]["basic"] = $id;
|
||||
}
|
||||
if (file_exists("$prefix.$id/input-annotation")) {
|
||||
$analysis_version[$id][$set]["coverage"] = $id;
|
||||
}
|
||||
if (file_exists("$prefix.$id/precision-by-input-word")) {
|
||||
$analysis_version[$id][$set]["precision"] = $id;
|
||||
}
|
||||
if (file_exists("$dir/model/biconcor.$id")) {
|
||||
$analysis_version[$id][$set]["biconcor"] = $id;
|
||||
}
|
||||
|
||||
# re-use ?
|
||||
if (file_exists("$dir/steps/$id/re-use.$id")) {
|
||||
$re_use = file("$dir/steps/$id/re-use.$id");
|
||||
foreach($re_use as $line) {
|
||||
if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) &&
|
||||
if (preg_match("/EVALUATION:(.+):analysis (\d+)/",$line,$match) &&
|
||||
$match[1] == $set &&
|
||||
file_exists("$dir/evaluation/$set.analysis.$match[2]/input-annotation")) {
|
||||
return $match[2];
|
||||
file_exists("$prefix.$match[2]/summary")) {
|
||||
$analysis_version[$id][$set]["basic"] = $match[2];
|
||||
}
|
||||
else if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) &&
|
||||
$match[1] == $set &&
|
||||
file_exists("$prefix.$match[2]/input-annotation")) {
|
||||
$analysis_version[$id][$set]["coverage"] = $match[2];
|
||||
}
|
||||
else if (preg_match("/EVALUATION:(.+):analysis-precision (\d+)/",$line,$match) &&
|
||||
$match[1] == $set &&
|
||||
file_exists("$prefix.$match[2]/precision-by-input-word")) {
|
||||
$analysis_version[$id][$set]["precision"] = $match[2];
|
||||
}
|
||||
else if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) &&
|
||||
file_exists("$dir/model/biconcor.$match[1]")) {
|
||||
$analysis_version[$id][$set]["biconcor"] = $match[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# legacy stuff below...
|
||||
if (! file_exists("$dir/steps/$id/REPORTING_report.$id")) {
|
||||
return 0;
|
||||
}
|
||||
$report = file("$dir/steps/$id/REPORTING_report.$id.INFO");
|
||||
foreach ($report as $line) {
|
||||
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) &&
|
||||
if (file_exists("$dir/steps/$id/REPORTING_report.$id")) {
|
||||
$report = file("$dir/steps/$id/REPORTING_report.$id.INFO");
|
||||
foreach ($report as $line) {
|
||||
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis/",$line,$match) &&
|
||||
$match[2] == $set) {
|
||||
$reuse_id = $match[1];
|
||||
if (file_exists("$dir/evaluation/$set.analysis.$reuse_id/input-annotation")) {
|
||||
return $reuse_id;
|
||||
if (file_exists("$prefix.$match[1]/summary")) {
|
||||
$analysis_version[$id][$set]["basic"] = $match[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
function get_biconcor_version($dir,$id) {
|
||||
if (file_exists("$dir/model/biconcor.$id")) {
|
||||
return $id;
|
||||
}
|
||||
$re_use = file("$dir/steps/$id/re-use.$id");
|
||||
foreach($re_use as $line) {
|
||||
if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) &&
|
||||
file_exists("$dir/model/biconcor.$match[1]")) {
|
||||
return $match[1];
|
||||
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) &&
|
||||
$match[2] == $set) {
|
||||
if (file_exists("$prefix.$match[1]/input-annotation")) {
|
||||
$analysis_version[$id][$set]["coverage"] = $match[1];
|
||||
}
|
||||
}
|
||||
if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-precision/",$line,$match) &&
|
||||
$match[2] == $set) {
|
||||
if (file_exists("$prefix.$match[1]/precision-by-input-word")) {
|
||||
$analysis_version[$id][$set]["precision"] = $match[1];
|
||||
}
|
||||
}
|
||||
if (preg_match("/\# reuse run (\d+) for TRAINING:biconcor/",$line,$match)){
|
||||
if (file_exists("$dir/model/biconcor.$match[1]")) {
|
||||
$analysis_version[$id][$set]["biconcor"] = $match[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
#print "$id,$set ( ";
|
||||
#reset($analysis_version[$id][$set]);
|
||||
#while(list($type,$i) = each($analysis_version[$id][$set])) {
|
||||
# print "$type=$i ";
|
||||
#}
|
||||
#print ") ZZ<br>";
|
||||
return $analysis_version[$id][$set];
|
||||
}
|
||||
|
||||
// Returns the "precision" entry of get_analysis_version($dir,$set,$id):
// the run id whose analysis directory contains precision-by-input-word
// (this run or a re-used one), or 0 if none was found.
function get_precision_analysis_version($dir,$set,$id) {
|
||||
$version = get_analysis_version($dir,$set,$id);
|
||||
return $version["precision"];
|
||||
}
|
||||
|
||||
// Returns the "basic" entry of get_analysis_version($dir,$set,$id):
// the run id whose analysis directory contains the summary file
// (this run or a re-used one), or 0 if none was found.
function get_basic_analysis_version($dir,$set,$id) {
|
||||
$version = get_analysis_version($dir,$set,$id);
|
||||
return $version["basic"];
|
||||
}
|
||||
|
||||
// Returns the "coverage" entry of get_analysis_version($dir,$set,$id):
// the run id whose analysis directory contains input-annotation
// (this run or a re-used one), or 0 if none was found.
function get_coverage_analysis_version($dir,$set,$id) {
|
||||
$version = get_analysis_version($dir,$set,$id);
|
||||
return $version["coverage"];
|
||||
}
|
||||
|
||||
// Returns the "biconcor" entry of get_analysis_version($dir,$set,$id):
// the run id for which $dir/model/biconcor.<id> exists
// (this run or a re-used one), or 0 if none was found.
function get_biconcor_version($dir,$set,$id) {
|
||||
$version = get_analysis_version($dir,$set,$id);
|
||||
return $version["biconcor"];
|
||||
}
|
||||
|
||||
// Builds the path of $file inside the analysis directory that holds the
// analysis of kind $type for run $id on test set $set.
// $type is a key of the array returned by get_analysis_version()
// ("basic", "biconcor", "coverage" or "precision").
// No existence check is done here: if the resolved version is 0 the
// returned path points into "$set.analysis.0".
function get_analysis_filename($dir,$set,$id,$type,$file) {
|
||||
$version = get_analysis_version($dir,$set,$id);
|
||||
return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file;
|
||||
}
|
||||
|
||||
// Same path construction as get_analysis_filename(), but taking the
// experiment directory, test set and run id from the globals
// $dir, $set and $id.
function get_current_analysis_filename($type,$file) {
|
||||
global $dir,$set,$id;
|
||||
$version = get_analysis_version($dir,$set,$id);
|
||||
return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file;
|
||||
}
|
||||
|
||||
// Variant of get_current_analysis_filename() that resolves the path for
// the run id held in the global $id2 instead of $id
// (globals used: $dir, $set, $id2).
function get_current_analysis_filename2($type,$file) {
|
||||
global $dir,$set,$id2;
|
||||
$version = get_analysis_version($dir,$set,$id2);
|
||||
return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file;
|
||||
}
|
||||
|
@ -1,5 +1,7 @@
|
||||
<?php
|
||||
|
||||
date_default_timezone_set('Europe/London');
|
||||
|
||||
function setup() {
|
||||
$setup = file("setup");
|
||||
head("All Experimental Setups");
|
||||
@ -11,7 +13,7 @@ function setup() {
|
||||
print "<TR><TD><A HREF=\"?setup=$dir[0]\">$dir[0]</A></TD><TD>$dir[1]</TD><TD>$dir[2]</TD><TD>$dir[3]</TD></TR>\n";
|
||||
}
|
||||
print "</TABLE>\n";
|
||||
print "<P>To add experiment, edit setup file on web server";
|
||||
print "<P>To add experiment, edit setup in web directory";
|
||||
}
|
||||
|
||||
function overview() {
|
||||
@ -29,10 +31,14 @@ function overview() {
|
||||
|
||||
print "<form action=\"\" method=get>\n";
|
||||
output_state_for_form();
|
||||
|
||||
// count how many analyses there are for each test set
|
||||
while (list($id,$info) = each($experiment)) {
|
||||
reset($evalset);
|
||||
while (list($set,$dummy) = each($evalset)) {
|
||||
$analysis = "$dir/evaluation/$set.analysis.$id";
|
||||
$report_info = "$dir/steps/$id/REPORTING_report.$id.INFO";
|
||||
// does the analysis file exist?
|
||||
if (file_exists($analysis)) {
|
||||
if (!array_key_exists($set,$has_analysis)) {
|
||||
$has_analysis[$set] = 0;
|
||||
@ -117,7 +123,7 @@ function overview() {
|
||||
list($score) = sscanf($info->result[$set],"%f%s");
|
||||
if ($score > 0) {
|
||||
print "score[$id][\"$set\"] = $score;\n";
|
||||
if ($score > $best[$set]) {
|
||||
if (!array_key_exists($set,$best) || $score > $best[$set]) {
|
||||
$best[$set] = $score;
|
||||
}
|
||||
}
|
||||
@ -303,8 +309,8 @@ function output_score($id,$info) {
|
||||
if ($has_analysis && array_key_exists($set,$has_analysis)) {
|
||||
print "<td align=center>";
|
||||
global $dir;
|
||||
$analysis = "$dir/evaluation/$set.analysis.$id";
|
||||
if (file_exists($analysis)) {
|
||||
$analysis = get_analysis_version($dir,$set,$id);
|
||||
if ($analysis["basic"]) {
|
||||
print "<a href=\"?analysis=show&setup=$setup&set=$set&id=$id\">Ⓐ</a> <input type=checkbox name=analysis-$id-$set value=1>";
|
||||
}
|
||||
print "</td>";
|
||||
|
Loading…
Reference in New Issue
Block a user