diff --git a/scripts/ems/biconcor/Alignment.h b/scripts/ems/biconcor/Alignment.h index 5e3890b0f..38e4dab17 100644 --- a/scripts/ems/biconcor/Alignment.h +++ b/scripts/ems/biconcor/Alignment.h @@ -14,7 +14,7 @@ private: INDEX *m_sentenceEnd; INDEX m_size; INDEX m_sentenceCount; - char m_unaligned[ 256 ]; + char m_unaligned[ 256 ]; // here for speed (local to PhraseAlignment) public: ~Alignment(); @@ -27,4 +27,18 @@ public: void Load( string fileName ); void Save( string fileName ); vector Tokenize( const char input[] ); + + INDEX GetSentenceStart( INDEX sentence ) { + if (sentence == 0) return 0; + return m_sentenceEnd[ sentence-1 ] + 2; + } + INDEX GetNumberOfAlignmentPoints( INDEX sentence ) { + return ( m_sentenceEnd[ sentence ] - GetSentenceStart( sentence ) ) / 2; + } + char GetSourceWord( INDEX sentence, INDEX alignment_point ) { + return m_array[ GetSentenceStart( sentence ) + alignment_point*2 ]; + } + char GetTargetWord( INDEX sentence, INDEX alignment_point ) { + return m_array[ GetSentenceStart( sentence ) + alignment_point*2 + 1 ]; + } }; diff --git a/scripts/ems/biconcor/Makefile b/scripts/ems/biconcor/Makefile index b6ac5a4a7..3b2aa9636 100644 --- a/scripts/ems/biconcor/Makefile +++ b/scripts/ems/biconcor/Makefile @@ -6,5 +6,5 @@ clean: .cpp.o: g++ -O6 -g -c $< -biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o - g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o PhrasePair.o PhrasePairCollection.o biconcor.o -o biconcor +biconcor: Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o Mismatch.o PhrasePair.o PhrasePairCollection.o biconcor.o base64.o + g++ Vocabulary.o SuffixArray.o TargetCorpus.o Alignment.o Mismatch.o PhrasePair.o PhrasePairCollection.o biconcor.o base64.o -o biconcor diff --git a/scripts/ems/biconcor/Mismatch.cpp b/scripts/ems/biconcor/Mismatch.cpp new file mode 100644 index 000000000..43e83e921 --- /dev/null +++ b/scripts/ems/biconcor/Mismatch.cpp @@ -0,0 +1,246 @@ +#include "Mismatch.h" +#include "Vocabulary.h" + +using namespace std; + +#define UNANNOTATED 0 +#define PRE_ALIGNED 1 +#define POST_ALIGNED 2 +#define UNALIGNED 3 +#define MISALIGNED 4 +#define ALIGNED 5 + + +void Mismatch::PrintClippedHTML( ostream* out, int width ) +{ + char source_annotation[256], target_annotation[256]; + vector< string > label_class; + label_class.push_back( "" ); + label_class.push_back( "mismatch_pre_aligned" ); + label_class.push_back( "mismatch_post_aligned" ); + label_class.push_back( "null_aligned" ); + label_class.push_back( "mismatch_misaligned" ); + label_class.push_back( "mismatch_aligned" ); + + for(int i=0; i= 0) { + int word_id = m_source_start-i; + source_annotation[ word_id ] = UNALIGNED; + if (!m_source_unaligned[ word_id ]) { + found_aligned = true; + LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED ); + } + } + + if (m_source_end+i < m_source_length) { + int word_id = m_source_end+i; + source_annotation[ word_id ] = UNALIGNED; + if (!m_source_unaligned[ word_id ]) { + found_aligned = true; + LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED ); + } + } + } + + } + // misalignment + else { + // label aligned output words + for(int i=m_source_start; i<=m_source_end; i++) + LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED ); + + // find first and last + int target_start = -1; + int target_end; + for(int i=0; iGetTargetWord( m_sentence_id, ap ) == i) { + int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); + // if not part of the source phrase -> also misaligned + if (source_word < m_source_start || source_word > m_source_end) + source_annotation[ source_word ] = MISALIGNED; + } + } + } + } + // closure + bool change = true; + while(change) { + change = false; + for(INDEX ap=0; apGetSourceWord( m_sentence_id, ap ); + int target_word = m_alignment->GetTargetWord( m_sentence_id, ap ); + if (source_annotation[source_word] != UNANNOTATED && + target_annotation[target_word] == UNANNOTATED) { + target_annotation[target_word] = MISALIGNED; + change = true; + } + if (source_annotation[source_word] == UNANNOTATED && + target_annotation[target_word] != UNANNOTATED) { + source_annotation[source_word] = MISALIGNED; + change = true; + } + } + } + } + + // print source + // shorten source context if too long + int sentence_start = m_source_position - m_source_start; + int context_space = width/2; + for(int i=m_source_start;i<=m_source_end;i++) + context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1; + context_space /= 2; + + int remaining = context_space; + int start_word = m_source_start; + for(;start_word>0 && remaining>0; start_word--) + remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1; + if (remaining<0 || start_word == -1) start_word++; + + remaining = context_space; + int end_word = m_source_end; + for(;end_word0; end_word++) + remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1; + end_word--; + + // output with markup + *out << ""; + char current_label = UNANNOTATED; + if (start_word>0) { + current_label = source_annotation[start_word-1]; + *out << "... "; + } + for(int i=start_word; i<=end_word; i++) { + // change to phrase block + if (i == m_source_start) { + if (current_label != UNANNOTATED && i!=start_word) + *out << ""; + *out << ""; + current_label = UNANNOTATED; + } + + // change to labeled word + else if (source_annotation[i] != current_label && + source_annotation[i] != ALIGNED) { + if (current_label != UNANNOTATED && i!=start_word) + *out << ""; + if (source_annotation[i] != UNANNOTATED) + *out << ""; + current_label = source_annotation[i]; + } + + // output word + *out << m_suffixArray->GetWord( sentence_start + i ) << " "; + + // change to right context block + if (i == m_source_end) { + *out << ""; + current_label = UNANNOTATED; + } + } + + if (current_label != UNANNOTATED && end_word>m_source_end) + *out << ""; + if (end_wordGetWord( m_sentence_id, i ).size() + 1; + while (context_space < 0) { // shorten matched part, if too long + context_space += + m_targetCorpus->GetWord( m_sentence_id, target_start ).size() + + m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2; + target_start++; + target_end--; + } + context_space /= 2; + + remaining = context_space; + start_word = target_start; + for(;start_word>0 && remaining>0; start_word--) { + //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl; + remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1; + } + if (remaining<0 || start_word == -1) start_word++; + + remaining = context_space; + end_word = target_end; + for(;end_word0; end_word++) { + //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl; + remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1; + } + end_word--; + + // output with markup + *out << ""; + current_label = UNANNOTATED; + if (start_word>0) { + current_label = target_annotation[start_word-1]; + *out << "... "; + } + for(int i=start_word; i<=end_word; i++) { + if (target_annotation[i] != current_label) { + if (current_label != UNANNOTATED && i!=start_word) + *out << ""; + if (target_annotation[i] != UNANNOTATED) + *out << ""; + current_label = target_annotation[i]; + } + + // output word + *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " "; + } + + if (current_label != UNANNOTATED && end_word>target_end) + *out << ""; + if (end_word"; +} + +void Mismatch::LabelSourceMatches( char *source_annotation, char *target_annotation, char source_id, char label ) { + for(INDEX ap=0; apGetSourceWord( m_sentence_id, ap ) == source_id) { + source_annotation[ source_id ] = label; + target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label; + } + } +} diff --git a/scripts/ems/biconcor/Mismatch.h b/scripts/ems/biconcor/Mismatch.h new file mode 100644 index 000000000..bfcbf4fd8 --- /dev/null +++ b/scripts/ems/biconcor/Mismatch.h @@ -0,0 +1,70 @@ +#include +#include +#include +#include +#include +#include +#include "SuffixArray.h" +#include "TargetCorpus.h" +#include "Alignment.h" +#pragma once + +using namespace std; + +class Mismatch +{ +public: + typedef unsigned int INDEX; + +private: + SuffixArray *m_suffixArray; + TargetCorpus *m_targetCorpus; + Alignment *m_alignment; + INDEX m_sentence_id; + INDEX m_num_alignment_points; + char m_source_length; + char m_target_length; + SuffixArray::INDEX m_source_position; + char m_source_start, m_source_end; + char m_source_unaligned[ 256 ]; + char m_target_unaligned[ 256 ]; + char m_unaligned; + +public: + Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, char source_length, char target_length, char source_start, char source_end ) + :m_suffixArray(sa) + ,m_targetCorpus(tc) + ,m_alignment(a) + ,m_sentence_id(sentence_id) + ,m_source_position(position) + ,m_source_length(source_length) + ,m_target_length(target_length) + ,m_source_start(source_start) + ,m_source_end(source_end) + { + // initialize unaligned indexes + for(char i=0; iGetNumberOfAlignmentPoints( sentence_id ); + for(INDEX ap=0; apGetSourceWord( sentence_id, ap ) ] = false; + m_target_unaligned[ m_alignment->GetTargetWord( sentence_id, ap ) ] = false; + } + m_unaligned = true; + for(char i=source_start; i<=source_end; i++) { + if (!m_source_unaligned[ i ]) { + m_unaligned = false; + } + } + } + ~Mismatch () {} + + bool Unaligned() { return m_unaligned; } + void PrintClippedHTML( ostream* out, int width ); + void LabelSourceMatches( char *source_annotation, char *target_annotation, char source_id, char label ); +}; diff --git a/scripts/ems/biconcor/PhrasePair.cpp b/scripts/ems/biconcor/PhrasePair.cpp index 5ca6b5f6e..b5330a5fb 100644 --- a/scripts/ems/biconcor/PhrasePair.cpp +++ b/scripts/ems/biconcor/PhrasePair.cpp @@ -145,14 +145,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) int source_pre_width = (source_width-source.size())/2; int source_post_width = (source_width-source.size()+1)/2; + // if phrase is too long, don't show any context if (source.size() > width) { source_pre_width = 0; source_post_width = 0; } - if (source_pre.size()>source_pre_width) + // too long -> truncate and add "..." + if (source_pre.size()>source_pre_width) { + // first skip up to a space + while(source_pre_width>0 && + source_pre.substr(source_pre.size()-source_pre_width,1) != " ") { + source_pre_width--; + } source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width ); - if (source_post.size() > source_post_width) + } + if (source_post.size() > source_post_width) { + while(source_post_width>0 && + source_post.substr(source_post_width-1,1) != " ") { + source_post_width--; + } source_post = source_post.substr( 0, source_post_width ) + "..."; + } *out << "" << source_pre @@ -167,8 +180,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) string target_pre = ""; string target = ""; string target_post = ""; + int target_pre_null_width = 0; + int target_post_null_width = 0; for( char i=0; iGetWord( m_sentence_id, i); + WORD word = m_targetCorpus->GetWord( m_sentence_id, i); + target_pre += " " + word; + if (i >= m_target_start-m_pre_null) + target_pre_null_width += word.size() + 1; } for( char i=m_target_start; i<=m_target_end; i++ ) { if (i>m_target_start) target += " "; @@ -176,7 +194,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) } for( char i=m_target_end+1; im_target_end+1) target_post += " "; - target_post += m_targetCorpus->GetWord( m_sentence_id, i); + WORD word = m_targetCorpus->GetWord( m_sentence_id, i); + target_post += word; + if (i-(m_target_end+1) < m_post_null) { + target_post_null_width += word.size() + 1; + } } int target_pre_width = (target_width-target.size())/2; @@ -186,10 +208,47 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) target_pre_width = 0; target_post_width = 0; } - if (target_pre.size() > target_pre_width) + + if (target_pre.size() < target_pre_width) + target_pre_width = target_pre.size(); + else { + while(target_pre_width>0 && + target_pre.substr(target_pre.size()-target_pre_width,1) != " ") { + target_pre_width--; + } target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width ); - if (target_post.size() > target_post_width) - target_post = target_post.substr( 0, target_post_width ) + "..."; + } + + if (target_post.size() < target_post_width) { + target_post_width = target_post.size(); + } + else { + while(target_post_width>0 && + target_post.substr(target_post_width-1,1) != " ") { + target_post_width--; + } + target_post = target_post.substr( 0, target_post_width ) + "..."; + } + + if (m_pre_null) { + //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl; + if (target_pre_width < target_pre.size()) + target_pre_null_width -= target_pre.size()-target_pre_width; + target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width) + + "" + + target_pre.substr(target_pre_width-target_pre_null_width) + + ""; + } + if (m_post_null) { + //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl; + if (target_post_null_width>target_post.size()) { + target_post_null_width = target_post.size(); + } + target_post = "" + + target_post.substr(0,target_post_null_width) + + "" + + target_post.substr(target_post_null_width); + } *out << "" << target_pre diff --git a/scripts/ems/biconcor/PhrasePairCollection.cpp b/scripts/ems/biconcor/PhrasePairCollection.cpp index 6647fe985..10d4e37c4 100644 --- a/scripts/ems/biconcor/PhrasePairCollection.cpp +++ b/scripts/ems/biconcor/PhrasePairCollection.cpp @@ -47,8 +47,9 @@ bool PhrasePairCollection::GetCollection( const vector< string > sourceString ) if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) { cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; - for( char pre = 0; pre <= pre_null; pre++ ) { - for( char post = 0; post <= post_null; post++ ) { + bool null_boundary_words = false; + for( char pre = 0; pre <= pre_null && (pre==0||null_boundary_words); pre++ ) { + for( char post = 0; post <= post_null && (post==0||null_boundary_words); post++ ) { vector< WORD_ID > targetString; cerr << "; "; for( char target = target_start-pre; target <= target_end+post; target++ ) { @@ -67,6 +68,18 @@ bool PhrasePairCollection::GetCollection( const vector< string > sourceString ) } } } + else { + cerr << "mismatch " << (i-first_match) + << " in sentence " << sentence_id + << ", starting at word " << source_start + << " of " << sentence_length + << ". target sentence has " << target_length << " words."; + Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); + if (mismatch->Unaligned()) + m_unaligned.push_back( mismatch ); + else + m_mismatch.push_back( mismatch ); + } cerr << endl; if (found > m_max_lookup) { @@ -92,23 +105,89 @@ void PhrasePairCollection::Print() void PhrasePairCollection::PrintHTML() { - vector< vector >::iterator ppWithSameTarget; int pp_target = 0; + bool singleton = false; + // loop over all translations + vector< vector >::iterator ppWithSameTarget; for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target"; - (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); - int count = ppWithSameTarget->size(); - cout << "(" << count << "/" << m_size << ")" << endl; - cout << "

"; + + int count = ppWithSameTarget->size(); + if (!singleton) { + if (count == 1) { + singleton = true; + cout << "

singleton" + << (m_collection.end() - ppWithSameTarget==1?"":"s") << " (" + << (m_collection.end() - ppWithSameTarget) + << "/" << m_size << ")

"; + } + else { + cout << "

"; + (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); + cout << " (" << count << "/" << m_size << ")" << endl; + cout << "

"; + } + cout << "
"; + } + vector< PhrasePair* >::iterator p; - int pp = 0; - for(p = ppWithSameTarget->begin(); ppend(); p++, pp++ ) { + // loop over all sentences where translation occurs + int pp=0; + int i=0; + for(p = ppWithSameTarget->begin(); i<10 && ppend(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); if (count > m_max_pp) { p += count/m_max_pp-1; pp += count/m_max_pp-1; } } - cout << "
\n"; + if (i == 10 && pp < count) { + // extended table + cout << "(more)"; + cout << "

"; + cout << ""; + for(i=0, pp=0, p = ppWithSameTarget->begin(); iend(); p++, pp++, i++ ) { + (*p)->PrintClippedHTML( &cout, 160 ); + if (count > m_max_pp) { + p += count/m_max_pp-1; + pp += count/m_max_pp-1; + } + } + } + if (!singleton) cout << "
\n"; + + if (!singleton && pp_target == 9) { + cout << "
"; + cout << "

(more)

"; + cout << "
"; + } } + if (singleton) cout << "
\n"; + else if (pp_target > 9) cout << ""; + + int max_mismatch = m_max_pp/3; + // unaligned phrases + if (m_unaligned.size() > 0) { + cout << "

unaligned" + << " (" << (m_unaligned.size()) << ")

"; + cout << ""; + int step_size = 1; + if (m_unaligned.size() > max_mismatch) + step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch; + for(int i=0;iPrintClippedHTML( &cout, 160 ); + cout << "
"; + } + + // mismatched phrases + if (m_mismatch.size() > 0) { + cout << "

mismatched" + << " (" << (m_mismatch.size()) << ")

"; + cout << ""; + int step_size = 1; + if (m_mismatch.size() > max_mismatch) + step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch; + for(int i=0;iPrintClippedHTML( &cout, 160 ); + cout << "
"; + } } diff --git a/scripts/ems/biconcor/PhrasePairCollection.h b/scripts/ems/biconcor/PhrasePairCollection.h index e90122d3d..b2473515e 100644 --- a/scripts/ems/biconcor/PhrasePairCollection.h +++ b/scripts/ems/biconcor/PhrasePairCollection.h @@ -3,6 +3,7 @@ #include "TargetCorpus.h" #include "Alignment.h" #include "PhrasePair.h" +#include "Mismatch.h" #pragma once @@ -16,6 +17,7 @@ private: TargetCorpus *m_targetCorpus; Alignment *m_alignment; vector< vector > m_collection; + vector< Mismatch* > m_mismatch, m_unaligned; int m_size; int m_max_lookup; int m_max_pp_target; diff --git a/scripts/ems/biconcor/base64.cpp b/scripts/ems/biconcor/base64.cpp new file mode 100644 index 000000000..2a863d161 --- /dev/null +++ b/scripts/ems/biconcor/base64.cpp @@ -0,0 +1,123 @@ +/* + base64.cpp and base64.h + + Copyright (C) 2004-2008 René Nyffenegger + + This source code is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + + 3. This notice may not be removed or altered from any source distribution. + + René Nyffenegger rene.nyffenegger@adp-gmbh.ch + +*/ + +#include "base64.h" +#include + +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + + +static inline bool is_base64(unsigned char c) { + return (isalnum(c) || (c == '+') || (c == '/')); +} + +std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) { + std::string ret; + int i = 0; + int j = 0; + unsigned char char_array_3[3]; + unsigned char char_array_4[4]; + + while (in_len--) { + char_array_3[i++] = *(bytes_to_encode++); + if (i == 3) { + char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; + char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4); + char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6); + char_array_4[3] = char_array_3[2] & 0x3f; + + for(i = 0; (i <4) ; i++) + ret += base64_chars[char_array_4[i]]; + i = 0; + } + } + + if (i) + { + for(j = i; j < 3; j++) + char_array_3[j] = '\0'; + + char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; + char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4); + char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6); + char_array_4[3] = char_array_3[2] & 0x3f; + + for (j = 0; (j < i + 1); j++) + ret += base64_chars[char_array_4[j]]; + + while((i++ < 3)) + ret += '='; + + } + + return ret; + +} + +std::string base64_decode(std::string const& encoded_string) { + int in_len = encoded_string.size(); + int i = 0; + int j = 0; + int in_ = 0; + unsigned char char_array_4[4], char_array_3[3]; + std::string ret; + + while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i ==4) { + for (i = 0; i <4; i++) + char_array_4[i] = base64_chars.find(char_array_4[i]); + + char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) + ret += char_array_3[i]; + i = 0; + } + } + + if (i) { + for (j = i; j <4; j++) + char_array_4[j] = 0; + + for (j = 0; j <4; j++) + char_array_4[j] = base64_chars.find(char_array_4[j]); + + char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; (j < i - 1); j++) ret += char_array_3[j]; + } + + return ret; +} diff --git a/scripts/ems/biconcor/base64.h b/scripts/ems/biconcor/base64.h new file mode 100644 index 000000000..65d5db8b2 --- /dev/null +++ b/scripts/ems/biconcor/base64.h @@ -0,0 +1,4 @@ +#include + +std::string base64_encode(unsigned char const* , unsigned int len); +std::string base64_decode(std::string const& s); diff --git a/scripts/ems/biconcor/biconcor.cpp b/scripts/ems/biconcor/biconcor.cpp index e518ae28c..0b6218623 100644 --- a/scripts/ems/biconcor/biconcor.cpp +++ b/scripts/ems/biconcor/biconcor.cpp @@ -3,6 +3,7 @@ #include "Alignment.h" #include "PhrasePairCollection.h" #include +#include "base64.h" using namespace std; @@ -32,7 +33,7 @@ int main(int argc, char* argv[]) {0, 0, 0, 0} }; int option_index = 0; - int c = getopt_long (argc, argv, "l:s:c:q:t:a:h", long_options, &option_index); + int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:h", long_options, &option_index); if (c == -1) break; switch (c) { case 'l': @@ -53,6 +54,10 @@ int main(int argc, char* argv[]) fileNameSource = string(optarg); createFlag = true; break; + case 'Q': + query = base64_decode(string(optarg)); + queryFlag = true; + break; case 'q': query = string(optarg); queryFlag = true; diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index c19872f15..87d44a7d3 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -370,14 +370,14 @@ build-generation-custom ignore-unless: AND generation-factors generation-corpus default-name: model/generation-table create-config - in: reordering-table phrase-translation-table generation-table LM:binlm biconcor-model + in: reordering-table phrase-translation-table generation-table LM:binlm out: config ignore-if: use-hiero INTERPOLATED-LM:script rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings default-name: model/moses.ini error: Unknown option create-config-interpolated-lm - in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm biconcor-model + in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm out: config ignore-if: use-hiero ignore-unless: INTERPOLATED-LM:script @@ -777,6 +777,6 @@ analysis-precision [REPORTING] single report - in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec + in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model out: report default-name: evaluation/report diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index fc8ac7cec..366456223 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1730,7 +1730,6 @@ sub define_training_create_config { my ($config, $reordering_table,$phrase_translation_table,$generation_table,@LM) = &get_output_and_input($step_id); - if ($LM[$#LM] =~ /biconcor/ || $LM[$#LM] eq '') { pop @LM; } my $cmd = &get_training_setting(9); diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index 71368e15b..489bdc50d 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -1,6 +1,6 @@ Precision by Coverage"; + print "

Precision of Input Words by Coverage

"; print "The graphs display what ratio of words of a specific type are translated correctly (yellow), and what ratio is deleted (blue)."; print " The extend of the boxes is scaled on the x-axis by the number of tokens of the displayed type."; // load data - $data = file("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage"); + $data = file(get_current_analysis_filename("precision","precision-by-corpus-coverage")); $total = 0; + $log_info = array(); for($i=0;$iBy log2-count in the training corpus"; precision_by_coverage_graph("byCoverage",$log_info,$total,$img_width,SORT_NUMERIC); - // load factored data - $d = dir("$dir/evaluation/$set.analysis.$id"); + # load factored data + $d = dir("$dir/evaluation/$set.analysis.".get_precision_analysis_version($dir,$set,$id)); while (false !== ($file = $d->read())) { if (preg_match('/precision-by-corpus-coverage.(.+)$/',$file, $match)) { precision_by_coverage_factored($img_width,$total,$file,$match[1]); @@ -136,7 +138,7 @@ function precision_by_coverage() { function precision_by_coverage_factored($img_width,$total,$file,$factor_id) { global $dir,$set,$id; - $data = file("$dir/evaluation/$set.analysis.$id/$file"); + $data = file(get_current_analysis_filename("precision",$file)); for($i=0;$iCountPrecisionDeleteLength\n"; foreach ($info as $word => $wordinfo) { - print "$word"; + print "$word"; printf("%.1f%s%.1f/%d",$wordinfo["precision"]/$wordinfo["total"]*100,"%",$wordinfo["precision"],$wordinfo["total"]); printf("%.1f%s%d/%d",$wordinfo["delete"]/$wordinfo["total"]*100,"%",$wordinfo["delete"],$wordinfo["total"]); printf("%.3f",$wordinfo["length"]/$wordinfo["total"]); @@ -361,7 +362,7 @@ ctx.font = '9px serif'; print ""; } -// stats on precision and recall +//# stats on precision and recall function precision_recall_details() { ?> @@ -389,20 +390,20 @@ ngram_show('recall',4,5,'',0); "; $from += $size-1; @@ -1218,7 +1281,14 @@ function input_annotation($sentence,$input,$segmentation) { $color = '#ffffff'; $cc = 0; $tc = 0; $te = 0; } - print "$word[$j]"; + print ""; + if ($word[$j] == $filter) { + print "".$word[$j].""; + } + else { + print $word[$j]; + } + print ""; if ($segmentation && array_key_exists($j,$segmentation["input_end"])) { print ""; } @@ -1295,7 +1365,7 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node function sentence_annotation_hierarchical($info,$sentence,$sequence,$segmentation,$in_out) { $In_Out = $in_out == "out" ? "Out" : "In"; - list($words,$coverage_vector) = split("\t",$input); + #list($words,$coverage_vector) = split("\t",$input); $coverage = coverage($sequence); $word = preg_split("/\s/",$sequence); @@ -1322,7 +1392,8 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node $words = $segmentation[$span]["words"]; # non terminal - if ($segmentation[$span]["nt"]) { + if (array_key_exists("nt",$segmentation[$span]) && + $segmentation[$span]["nt"] != "") { print $segmentation[$span]["nt"].": "; } @@ -1359,16 +1430,16 @@ function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node function biconcor($query) { global $set,$id,$dir; $sentence = $_GET['sentence']; - $biconcor = get_biconcor_version($dir,$id); + $biconcor = get_biconcor_version($dir,$set,$id); print "
-
+ - - + +
"; - $cmd = "./biconcor -l $dir/model/biconcor.$biconcor -q ".escapeshellarg($query)." 2>/dev/null"; - # print $cmd."

"; + $cmd = "./biconcor -l $dir/model/biconcor.$biconcor -Q ".base64_encode($query)." 2>/dev/null"; + #print $cmd."

"; system($cmd); # print "

done."; print "

"; diff --git a/scripts/ems/web/analysis_diff.php b/scripts/ems/web/analysis_diff.php index 1c74387ba..9cb853030 100644 --- a/scripts/ems/web/analysis_diff.php +++ b/scripts/ems/web/analysis_diff.php @@ -73,8 +73,9 @@ function precision_by_coverage_diff() { print "The graphs display what ratio of words of a specific type are translated correctly (yellow), and what ratio is deleted (blue)."; print " The extend of the boxes is scaled on the x-axis by the number of tokens of the displayed type."; // load data - $data = file("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage"); + $data = file(get_current_analysis_filename2("precision","precision-by-corpus-coverage")); $total = 0; + $log_info = array(); for($i=0;$iread())) { if (preg_match('/precision-by-corpus-coverage.(.+)$/',$file, $match) && - file_exists("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage.$match[1]")) { + file_exists(get_current_analysis_filename2("precision","precision-by-corpus-coverage.$match[1]"))) { precision_by_coverage_diff_factored($img_width,$total,$file,$match[1]); } } @@ -130,7 +131,7 @@ function precision_by_coverage_diff() { function precision_by_coverage_diff_factored($img_width,$total,$file,$factor_id) { global $dir,$set,$id,$id2; - $data = file("$dir/evaluation/$set.analysis.$id2/$file"); + $data = file(get_current_analysis_filename2("precision",$file)); for($i=0;$i0) { - $log_count = (int) (log($count)/log(2)); + $log_count = (int) (log($count)/log(2)); } if ($byCoverage != -2 && $byCoverage != $log_count) { - continue; + continue; } //# filter for factor $word = $item[5]; - $factor = $item[6]; - if ($byFactor != "false" && $byFactor != $factor) { - continue; + if ($byFactor != "false" && $byFactor != $item[6]) { + continue; + } + if (!array_key_exists($word,$info)) { + $info[$word]["precision"] = 0; + $info[$word]["delete"] = 0; + $info[$word]["length"] = 0; + $info[$word]["total"] = 0; } - $info[$word]["precision"] += $item[0]; $info[$word]["delete"] += $item[1]; $info[$word]["length"] += $item[2]; @@ -235,7 +242,7 @@ function precision_by_word_diff($type) { } $info_new = $info; - $data = file("$dir/evaluation/$set.analysis.$id/precision-by-input-word"); + $data = file(get_current_analysis_filename("precision","precision-by-input-word")); for($i=0;$i";
"; - //foreach (array("precision","recall") as $type) { - print "Precision\n"; + //#foreach (array("precision","recall") as $type) { + print "Precision of Output\n"; $type = "precision"; print "\n"; printf("\n", @@ -424,8 +425,8 @@ function ngram_summary() { //} print "details "; - if (file_exists("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage")) { - print "| breakdown by coverage "; + if (file_exists(get_current_analysis_filename("precision","precision-by-corpus-coverage"))) { + print "| precision of input by coverage "; } print "
$type1-gram2-gram3-gram4-gram
correct%d%d%d%d
"; @@ -445,8 +446,7 @@ function ngram_summary() { printf("

length-diff: %d (%.1f%s)",$info["precision-1-total"]-$info["recall-1-total"],($info["precision-1-total"]-$info["recall-1-total"])/$info["recall-1-total"]*100,"%"); // coverage - $coverage_id = get_coverage_analysis_version($dir,$set,$id); - if (file_exists("$dir/evaluation/$set.analysis.$coverage_id/corpus-coverage-summary")) { + if (file_exists(get_current_analysis_filename("coverage","corpus-coverage-summary"))) { print "

"; print "
"; coverage_summary(); @@ -454,8 +454,8 @@ function ngram_summary() { } // phrase segmentation - if (file_exists("$dir/evaluation/$set.analysis.$id/segmentation") || - file_exists("$dir/evaluation/$set.analysis.$id/rule")) { + if (file_exists(get_current_analysis_filename("basic","segmentation")) || + file_exists(get_current_analysis_filename("basic","rule"))) { print "
"; print "
"; segmentation_summary(); @@ -463,7 +463,7 @@ function ngram_summary() { } // rules - if (file_exists("$dir/evaluation/$set.analysis.$id/rule")) { + if (file_exists(get_current_analysis_filename("basic","rule"))) { print "
"; print "
"; rule_summary(); @@ -479,7 +479,7 @@ function ngram_show($type) { // load data $order = $_GET['order']; - $data = file("$dir/evaluation/$set.analysis.$id/n-gram-$type.$order"); + $data = file(get_current_analysis_filename("basic","n-gram-$type.$order")); for($i=0;$i5) { @@ -614,7 +614,7 @@ function coverage_details() { } print "
\n"; - $data = file(filename_fallback_to_factored("$dir/evaluation/$set.analysis.$id/ttable-unknown")); + $data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","ttable-unknown"))); for($i=0;$i 2) { $c = $field[2]; } else { $c = 0; } if ($type == "rule") { list($rule_in,$in,$nt,$rule_out,$out) = split(":",$rule); if ($by == "word") { $c *= $in; } @@ -822,9 +824,14 @@ function segmentation_summary() { // hierarchical rules used in translation function rule_summary() { global $dir,$set,$id; - $data = file("$dir/evaluation/$set.analysis.$id/rule"); + $data = file(get_current_analysis_filename("basic","rule")); + $rule = array(); $count = array(); $count_nt = array(); $count_w = array(); + $nt_count = 0; $total = 0; foreach ($data as $item) { - list($type,$d,$d2) = split("\t",$item); + $field = split("\t",$item); + $type = $field[0]; + $d = $field[1]; + if (count($field) > 2) { $d2 = $field[2]; } else { $d2 = 0; } if ($type == "sentence-count") { $sentence_count = $d; } @@ -843,12 +850,16 @@ function rule_summary() { $rule_out = preg_replace("/b/","y",$rule_out); $rule_out = preg_replace("/c/","z",$rule_out); $nt_count += $d2 * $nt; + if (!array_key_exists($d,$rule)) { $rule[$d] = 0; } $rule[$d] += $d2; + if (!array_key_exists($nt,$count)) { $count[$nt] = 0; } $count[$nt] += $d2; $just_nt = preg_replace("/\d/","",$rule_in)."-".preg_replace("/\d/","",$rule_out); $no_wc = preg_replace("/\d/","W",$rule_in)."-".preg_replace("/\d/","",$rule_out); if ($just_nt == "-") { $just_nt = "lexical"; } + if (!array_key_exists($just_nt,$count_nt)) { $count_nt[$just_nt] = 0; } $count_nt[$just_nt] += $d2; + if (!array_key_exists($no_wc,$count_w)) { $count_w[$no_wc] = 0; } $count_w[$no_wc] += $d2; $total += $d2; } @@ -866,108 +877,189 @@ function rule_summary() { // annotated sentences, navigation function bleu_show() { - $count = $_GET['count']; - if ($count == 0) { $count = 5; } - - print "annotated sentences
sorted by "; - - if ($_GET['sort'] == "order" || $_GET['sort'] == "") { - print "order "; - } - else { - print "order "; - } - - if ($_GET['sort'] == "best") { - print "order "; - } - else { - print "best "; - } - - if ($_GET['sort'] == "worst") { - print "order "; - } - else { - print "worst "; - } - - #print "display fullscreen "; $count = $_GET['count']; if ($count == 0) { $count = 5; } - print "showing $count "; - print "more "; - print "all "; - print "
\n"; + $filter = ""; + if (array_key_exists("filter",$_GET)) { + $filter = base64_decode($_GET['filter']); + } - sentence_annotation(); - print "

5 more | "; - print "10 more | "; - print "20 more | "; - print "50 more | "; - print "100 more | "; - print "all "; + print "annotated sentences
sorted by: "; + + if ($_GET['sort'] == "order" || $_GET['sort'] == "") { print "order "; } + else { + print "order "; + } + if ($_GET['sort'] == "best") { print "best "; } + else { + print "best "; + } + if ($_GET['sort'] == "25") { print "25% "; } + else { + print "25% "; + } + if ($_GET['sort'] == "avg") { print "avg "; } + else { + print "avg "; + } + if ($_GET['sort'] == "75") { print "75% "; } + else { + print "75% "; + } + if ($_GET['sort'] == "worst") { print "worst; "; } + else { + print "worst; "; + } + + print "showing: $count "; + print "more "; + print "all"; + + if ($filter != "") { + print "; filter: '$filter'"; + } + sentence_annotation($count,$filter); + print "

5 more | "; + print "10 more | "; + print "20 more | "; + print "50 more | "; + print "100 more | "; + print "all "; } // annotated sentences core: reads data, sorts sentences, displays them -function sentence_annotation() { +function sentence_annotation($count,$filter) { global $set,$id,$dir,$biconcor; - // load data - $data = file("$dir/evaluation/$set.analysis.$id/bleu-annotation"); + # get input + $filtered = array(); + $file = get_current_analysis_filename("coverage","input-annotation"); + if (file_exists($file)) { + $input = file($file); + # filter is so specified + if ($filter != "") { + for($i=0;$i3) { $line["reference"] .= "
"; }; - $line["reference"] .= $item[$j]; - } - $bleu[] = $line; + $item = split("\t",$data[$i]); + if (! array_key_exists($item[1],$filtered)) { + $line["bleu"] = $item[0]; + $line["id"] = $item[1]; + $line["system"] = $item[2]; + $line["reference"] = ""; + for($j=3;$j3) { $line["reference"] .= "
"; }; + $line["reference"] .= $item[$j]; + } + $bleu[] = $line; + } } - $coverage_id = get_coverage_analysis_version($dir,$set,$id); - if (file_exists("$dir/evaluation/$set.analysis.$coverage_id/input-annotation")) { - $input = file("$dir/evaluation/$set.analysis.$coverage_id/input-annotation"); + # sort and label additional sentences as filtered + global $sort; + function cmp($a, $b) { + global $sort; + if ($sort == "order") { + $a_idx = $a["id"]; + $b_idx = $b["id"]; + } + else if ($sort == "worst" || $sort == "75") { + $a_idx = $a["bleu"]; + $b_idx = $b["bleu"]; + if ($a_idx == $b_idx) { + $a_idx = $b["id"]; + $b_idx = $a["id"]; + } + } + else if ($sort == "best" || $sort == "avg" || $sort == "25") { + $a_idx = -$a["bleu"]; + $b_idx = -$b["bleu"]; + if ($a_idx == $b_idx) { + $a_idx = $a["id"]; + $b_idx = $b["id"]; + } + } + if ($a_idx == $b_idx) { + return 0; + } + return ($a_idx < $b_idx) ? -1 : 1; + } + $sort = $_GET['sort']; + if ($sort == '') { + $sort = "order"; + } + usort($bleu, 'cmp'); + + $offset = 0; + if ($sort == "25" || $sort == "75") { + $offset = (int) (count($bleu)/4); + } + else if ($sort == "avg") { + $offset = (int) (count($bleu)/2); } - if (file_exists("$dir/evaluation/$set.analysis.$id/segmentation-annotation")) { - $data = file("$dir/evaluation/$set.analysis.$id/segmentation-annotation"); - for($i=0;$i $segmentation_span) { # foreach ($segmentation_span as $span => $type) { @@ -976,108 +1068,78 @@ function sentence_annotation() { # } # } } - if (file_exists("$dir/evaluation/$set.analysis.$id/output-tree")) { - $data = file("$dir/evaluation/$set.analysis.$id/output-tree"); - $span = 0; - $last_sentence = -1; - $nt_count = array(); - for($i=0;$i $segmentation_span) { - foreach ($segmentation_span as $span => $type) { - $segmentation_out[$sentence][$span]["nt"]=""; - } - } + } + # no non-terminal markup, if there are two or less non-terminals (X,S) + if (count($nt_count) <= 2) { + foreach ($segmentation_out as $sentence => $segmentation_span) { + foreach ($segmentation_span as $span => $type) { + $segmentation_out[$sentence][$span]["nt"]=""; + } } + } } - if (file_exists("$dir/evaluation/$set.analysis.$id/node")) { - $data = file("$dir/evaluation/$set.analysis.$id/node"); - $n = 0; - $last_sentence = -1; - for($i=0;$i
\n"; - if ($a_idx == $b_idx) { - return 0; - } - return ($a_idx < $b_idx) ? -1 : 1; - } - - usort($bleu, 'cmp'); - - $count = $_GET['count']; - if ($count == 0) { $count = 5; } - - // display - //print "

"; - for($i=0;$i<$count && $i$sort / $offset"; + for($i=$offset;$i<$count+$offset && $i8364 occ. in corpus, 56 translations, entropy: 5.54\n"; + print "
0 occ. in corpus, 0 translations, entropy: 0.00
\n"; if ($biconcor) { - //print "
xxx
"; - print "
xxx
"; + print "
(click on input phrase for bilingual concordancer)
"; } if ($hierarchical) { sentence_annotation_hierarchical("#".$line["id"],$line["id"],$input[$line["id"]],$segmentation[$line["id"]],"in"); } else { print "[#".$line["id"]."] "; - input_annotation($line["id"],$input[$line["id"]],$segmentation[$line["id"]]); + input_annotation($line["id"],$input[$line["id"]],$segmentation[$line["id"]],$filter); } } //else { @@ -1099,19 +1161,20 @@ function coverage($coverage_vector) { $coverage = array(); foreach (split(" ",$coverage_vector) as $item) { if (preg_match("/[\-:]/",$item)) { - list($from,$to,$corpus_count,$ttable_count,$ttable_entropy) = preg_split("/[\-:]/",$item); - $coverage[$from][$to]["corpus_count"] = $corpus_count; - $coverage[$from][$to]["ttable_count"] = $ttable_count; - $coverage[$from][$to]["ttable_entropy"] = $ttable_entropy; + $field = preg_split("/[\-:]/",$item); + $from = $field[0]; + $to = $field[1]; + if (count($field)>2){ $coverage[$from][$to]["corpus_count"]=$field[2]; } + if (count($field)>3){ $coverage[$from][$to]["ttable_count"]=$field[3]; } + if (count($field)>4){ $coverage[$from][$to]["ttabel_entropy"]=$field[4]; } } } - $word = split(" ",$words); return $coverage; } // annotate an inpute sentence -function input_annotation($sentence,$input,$segmentation) { +function input_annotation($sentence,$input,$segmentation,$filter) { global $biconcor; list($words,$coverage_vector) = split("\t",$input); @@ -1187,10 +1250,10 @@ function input_annotation($sentence,$input,$segmentation) { for($j=$from;$j<=$to;$j++) { if ($j>$from) { $phrase .= " "; } $phrase .= $word[$j]; - $highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';"; - $lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';"; + $highlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='#ffff80';"; + $lowlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';"; } - print "
"; + print "
"; } print "
"; - print "Precision
"; + print "Precision of Output
"; //foreach (array("precision","recall") as $type) { $type = "precision"; print "\n"; @@ -416,12 +431,11 @@ function ngram_summary_diff() { //} print "details "; - if (file_exists("$dir/evaluation/$set.analysis.$id/precision-by-corpus-coverage") && - file_exists("$dir/evaluation/$set.analysis.$id2/precision-by-corpus-coverage")) { - print "| breakdown by coverage "; + if (file_exists(get_current_analysis_filename("precision","precision-by-corpus-coverage")) && + file_exists(get_current_analysis_filename2("precision","precision-by-corpus-coverage"))) { + print "| precision of input by coverage "; } - print ""; $score_line .= ""; @@ -494,7 +509,7 @@ function bleu_diff_annotation() { // load data for($idx=0;$idx<2;$idx++) { - $data = file("$dir/evaluation/$set.analysis.".($idx?$id2:$id)."/bleu-annotation"); + $data = file(get_analysis_filename($dir,$set,$idx?$id2:$id,"basic","bleu-annotation")); for($i=0;$i + @@ -29,7 +30,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) { $action = $_GET["analysis"]; $set = $_GET["set"]; $id = $_GET["id"]; - $id2 = $_GET["id2"]; + if (array_key_exists("id2",$_GET)) { $id2 = $_GET["id2"]; } if ($action == "show") { show_analysis(); } else if ($action == "bleu_show") { bleu_show(); } else if ($action == "ngram_precision_show") { ngram_show("precision");} @@ -43,7 +44,7 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) { else if (preg_match("/PrecisionByWord(.+)_show/",$action,$match)) { precision_by_word($match[1]); } else if ($action == "CoverageDetails_show") { coverage_details(); } else if ($action == "SegmentationSummary_show") { segmentation_summary(); } - else if ($action == "biconcor") { biconcor($_GET["phrase"]); } + else if ($action == "biconcor") { biconcor(base64_decode($_GET["phrase"])); } else { print "ERROR! $action"; } } else if (array_key_exists("analysis_diff_home",$_GET)) { diff --git a/scripts/ems/web/lib.php b/scripts/ems/web/lib.php index 45f2636cb..440940d9c 100644 --- a/scripts/ems/web/lib.php +++ b/scripts/ems/web/lib.php @@ -124,48 +124,136 @@ function process_file_entry($dir,$entry) { } } -function get_coverage_analysis_version($dir,$set,$id) { - if (file_exists("$dir/evaluation/$set.analysis.$id/input-annotation")) { - return $id; +function get_analysis_version($dir,$set,$id) { + global $analysis_version; + if ($analysis_version + && array_key_exists($id,$analysis_version) + && array_key_exists($set,$analysis_version[$id])) { + #reset($analysis_version[$id][$set]); + #print "$id,$set ( "; + #while(list($type,$i) = each($analysis_version[$id][$set])) { + # print "$type=$i "; + #} + #print ") FROM CACHE
"; + return $analysis_version[$id][$set]; } + $analysis_version[$id][$set]["basic"] = 0; + $analysis_version[$id][$set]["biconcor"] = 0; + $analysis_version[$id][$set]["coverage"] = 0; + $analysis_version[$id][$set]["precision"] = 0; + $prefix = "$dir/evaluation/$set.analysis"; + + # produced by the run itself ? + if (file_exists("$prefix.$id/summary")) { + $analysis_version[$id][$set]["basic"] = $id; + } + if (file_exists("$prefix.$id/input-annotation")) { + $analysis_version[$id][$set]["coverage"] = $id; + } + if (file_exists("$prefix.$id/precision-by-input-word")) { + $analysis_version[$id][$set]["precision"] = $id; + } + if (file_exists("$dir/model/biconcor.$id")) { + $analysis_version[$id][$set]["biconcor"] = $id; + } + + # re-use ? if (file_exists("$dir/steps/$id/re-use.$id")) { $re_use = file("$dir/steps/$id/re-use.$id"); foreach($re_use as $line) { - if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) && + if (preg_match("/EVALUATION:(.+):analysis (\d+)/",$line,$match) && $match[1] == $set && - file_exists("$dir/evaluation/$set.analysis.$match[2]/input-annotation")) { - return $match[2]; + file_exists("$prefix.$match[2]/summary")) { + $analysis_version[$id][$set]["basic"] = $match[2]; + } + else if (preg_match("/EVALUATION:(.+):analysis-coverage (\d+)/",$line,$match) && + $match[1] == $set && + file_exists("$prefix.$match[2]/input-annotation")) { + $analysis_version[$id][$set]["coverage"] = $match[2]; + } + else if (preg_match("/EVALUATION:(.+):analysis-precision (\d+)/",$line,$match) && + $match[1] == $set && + file_exists("$prefix.$match[2]/precision-by-input-word")) { + $analysis_version[$id][$set]["precision"] = $match[2]; + } + else if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) && + file_exists("$dir/model/biconcor.$match[1]")) { + $analysis_version[$id][$set]["biconcor"] = $match[1]; } } } + # legacy stuff below... - if (! file_exists("$dir/steps/$id/REPORTING_report.$id")) { - return 0; - } - $report = file("$dir/steps/$id/REPORTING_report.$id.INFO"); - foreach ($report as $line) { - if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) && + if (file_exists("$dir/steps/$id/REPORTING_report.$id")) { + $report = file("$dir/steps/$id/REPORTING_report.$id.INFO"); + foreach ($report as $line) { + if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis/",$line,$match) && $match[2] == $set) { - $reuse_id = $match[1]; - if (file_exists("$dir/evaluation/$set.analysis.$reuse_id/input-annotation")) { - return $reuse_id; + if (file_exists("$prefix.$match[1]/summary")) { + $analysis_version[$id][$set]["basic"] = $match[1]; } } - } - return 0; -} - -function get_biconcor_version($dir,$id) { - if (file_exists("$dir/model/biconcor.$id")) { - return $id; - } - $re_use = file("$dir/steps/$id/re-use.$id"); - foreach($re_use as $line) { - if (preg_match("/TRAINING:build-biconcor (\d+)/",$line,$match) && - file_exists("$dir/model/biconcor.$match[1]")) { - return $match[1]; + if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-coverage/",$line,$match) && + $match[2] == $set) { + if (file_exists("$prefix.$match[1]/input-annotation")) { + $analysis_version[$id][$set]["coverage"] = $match[1]; + } } + if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis-precision/",$line,$match) && + $match[2] == $set) { + if (file_exists("$prefix.$match[1]/precision-by-input-word")) { + $analysis_version[$id][$set]["precision"] = $match[1]; + } + } + if (preg_match("/\# reuse run (\d+) for TRAINING:biconcor/",$line,$match)){ + if (file_exists("$dir/model/biconcor.$match[1]")) { + $analysis_version[$id][$set]["biconcor"] = $match[1]; + } + } + } } - return 0; + #print "$id,$set ( "; + #reset($analysis_version[$id][$set]); + #while(list($type,$i) = each($analysis_version[$id][$set])) { + # print "$type=$i "; + #} + #print ") ZZ
"; + return $analysis_version[$id][$set]; } +function get_precision_analysis_version($dir,$set,$id) { + $version = get_analysis_version($dir,$set,$id); + return $version["precision"]; +} + +function get_basic_analysis_version($dir,$set,$id) { + $version = get_analysis_version($dir,$set,$id); + return $version["basic"]; +} + +function get_coverage_analysis_version($dir,$set,$id) { + $version = get_analysis_version($dir,$set,$id); + return $version["coverage"]; +} + +function get_biconcor_version($dir,$set,$id) { + $version = get_analysis_version($dir,$set,$id); + return $version["biconcor"]; +} + +function get_analysis_filename($dir,$set,$id,$type,$file) { + $version = get_analysis_version($dir,$set,$id); + return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file; +} + +function get_current_analysis_filename($type,$file) { + global $dir,$set,$id; + $version = get_analysis_version($dir,$set,$id); + return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file; +} + +function get_current_analysis_filename2($type,$file) { + global $dir,$set,$id2; + $version = get_analysis_version($dir,$set,$id2); + return "$dir/evaluation/$set.analysis.".$version[$type]."/".$file; +} diff --git a/scripts/ems/web/overview.php b/scripts/ems/web/overview.php index db85a2c7e..7280bc35b 100644 --- a/scripts/ems/web/overview.php +++ b/scripts/ems/web/overview.php @@ -1,5 +1,7 @@
\n"; } print "
$type1-gram2-gram3-gram4-gram
"; print "Metrics
\n"; @@ -434,6 +448,7 @@ function ngram_summary_diff() { } } } + $header = ""; $score_line = ""; $diff_line = ""; foreach ($score as $name => $value) { $header .= "
$name".$score[$name][1]."$dir[0]$dir[1]$dir[2]$dir[3]
\n"; - print "

To add experiment, edit setup file on web server"; + print "

To add experiment, edit setup in web directory"; } function overview() { @@ -29,10 +31,14 @@ function overview() { print "

\n"; output_state_for_form(); + + // count how many analyses there are for each test set while (list($id,$info) = each($experiment)) { reset($evalset); while (list($set,$dummy) = each($evalset)) { $analysis = "$dir/evaluation/$set.analysis.$id"; + $report_info = "$dir/steps/$id/REPORTING_report.$id.INFO"; + // does the analysis file exist? if (file_exists($analysis)) { if (!array_key_exists($set,$has_analysis)) { $has_analysis[$set] = 0; @@ -117,7 +123,7 @@ function overview() { list($score) = sscanf($info->result[$set],"%f%s"); if ($score > 0) { print "score[$id][\"$set\"] = $score;\n"; - if ($score > $best[$set]) { + if (!array_key_exists($set,$best) || $score > $best[$set]) { $best[$set] = $score; } } @@ -303,8 +309,8 @@ function output_score($id,$info) { if ($has_analysis && array_key_exists($set,$has_analysis)) { print "
"; global $dir; - $analysis = "$dir/evaluation/$set.analysis.$id"; - if (file_exists($analysis)) { + $analysis = get_analysis_version($dir,$set,$id); + if ($analysis["basic"]) { print " "; } print "