#include "Mismatch.h"
#include "Vocabulary.h"

using namespace std;

// Per-word annotation labels stored in the source/target annotation arrays.
// The numeric value is also the index into the CSS class table used by
// PrintClippedHTML, so the two must be kept in the same order.
enum {
  UNANNOTATED = 0,
  PRE_ALIGNED = 1,
  POST_ALIGNED = 2,
  UNALIGNED = 3,
  MISALIGNED = 4,
  ALIGNED = 5
};

/**
 * Writes this mismatch to *out as HTML: the source sentence split into
 * left context / phrase / right context, followed by the target sentence,
 * both clipped to roughly width/2 characters and with words wrapped in
 * spans according to their annotation label.
 *
 * @param out    stream the markup is written to (must not be null)
 * @param width  total character budget; each side gets width/2
 *
 * NOTE(review): the copy of this file that was reviewed had every "<...>"
 * span removed by a markup stripper. The following were reconstructed and
 * must be checked against the pristine file / Mismatch.h:
 *   - the loop bounds m_source_length, m_target_length and
 *     m_num_alignment_points,
 *   - the member name m_target_unaligned,
 *   - the statements marked "reconstructed" below,
 *   - all HTML tag literals and CSS class names in the output section.
 */
void Mismatch::PrintClippedHTML( ostream* out, int width )
{
  char source_annotation[256], target_annotation[256];

  // CSS class per annotation label, indexed by the enum above.
  static const char* const label_class[] = {
    "",
    "mismatch_pre_aligned",
    "mismatch_post_aligned",
    "null_aligned",
    "mismatch_misaligned",
    "mismatch_aligned"
  };

  // reconstructed: clear the annotations of both sentences
  for(int i=0; i<m_source_length; i++)
    source_annotation[i] = UNANNOTATED;
  for(int i=0; i<m_target_length; i++)
    target_annotation[i] = UNANNOTATED;

  if (m_unaligned) {
    // reconstructed loop header: walk outwards from the source phrase, one
    // word further per iteration, until an aligned neighbour is found on
    // either side; the target words it maps to anchor the display.
    bool found_aligned = false;
    for(int i=1; i<m_source_length && !found_aligned; i++) {
      if (m_source_start-i >= 0) {
        int word_id = m_source_start-i;
        source_annotation[ word_id ] = UNALIGNED;
        if (!m_source_unaligned[ word_id ]) {
          found_aligned = true;
          LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
        }
      }

      if (m_source_end+i < m_source_length) {
        int word_id = m_source_end+i;
        source_annotation[ word_id ] = UNALIGNED;
        if (!m_source_unaligned[ word_id ]) {
          found_aligned = true;
          LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
        }
      }
    }
  }
  // misalignment
  else {
    // label aligned output words
    for(int i=m_source_start; i<=m_source_end; i++)
      LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );

    // find first and last
    // (target_end now has a defined value even if nothing is ALIGNED)
    int target_start = -1;
    int target_end = -1;
    for(int i=0; i<m_target_length; i++) {
      // reconstructed
      if (target_annotation[i] == ALIGNED) {
        if (target_start == -1)
          target_start = i;
        target_end = i;
      }
    }

    // reconstructed: go over all target words enclosed by the aligned span.
    // The i>=0 test skips the loop entirely when no target word was
    // labelled, instead of indexing the arrays at -1.
    for(int i=target_start; i>=0 && i<=target_end; i++) {
      if (m_target_unaligned[ i ]) {
        target_annotation[ i ] = UNALIGNED;
      } else {
        if (target_annotation[ i ] != ALIGNED)
          target_annotation[ i ] = MISALIGNED;
        // loop over the source words aligned to this target word
        for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
          if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
            int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
            // if not part of the source phrase -> also misaligned
            if (source_word < m_source_start || source_word > m_source_end)
              source_annotation[ source_word ] = MISALIGNED;
          }
        }
      }
    }

    // closure: propagate MISALIGNED across alignment points until neither
    // side of any point is annotated while the other is not
    bool change = true;
    while(change) {
      change = false;
      for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
        int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
        int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
        if (source_annotation[source_word] != UNANNOTATED &&
            target_annotation[target_word] == UNANNOTATED) {
          target_annotation[target_word] = MISALIGNED;
          change = true;
        }
        if (source_annotation[source_word] == UNANNOTATED &&
            target_annotation[target_word] != UNANNOTATED) {
          source_annotation[source_word] = MISALIGNED;
          change = true;
        }
      }
    }
  }

  // print source
  // shorten source context if too long
  int sentence_start = m_source_position - m_source_start;
  int context_space = width/2;
  for(int i=m_source_start; i<=m_source_end; i++)
    context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
  context_space /= 2;

  int remaining = context_space;
  int start_word = m_source_start;
  for(; start_word>0 && remaining>0; start_word--)
    remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
  if (remaining<0 || start_word == -1) start_word++;

  remaining = context_space;
  int end_word = m_source_end;
  for(; end_word<m_source_length && remaining>0; end_word++)
    remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
  end_word--;

  // When the phrase alone uses up width/2 the adjustments above would move
  // the window inside the phrase; the cell switches at m_source_start and
  // m_source_end would then never be written. Keep the whole phrase visible.
  if (start_word > m_source_start) start_word = m_source_start;
  if (end_word < m_source_end) end_word = m_source_end;

  // output with markup
  *out << "<tr><td class=\"pp_source_left\">"; // NOTE(review): reconstructed literal
  char current_label = UNANNOTATED;
  if (start_word>0) {
    current_label = source_annotation[start_word-1];
    *out << "... ";
  }
  for(int i=start_word; i<=end_word; i++) {
    // change to phrase block
    if (i == m_source_start) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      *out << "</td><td class=\"pp_source\">"; // NOTE(review): reconstructed literal
      current_label = UNANNOTATED;
    }
    // change to labeled word
    else if (source_annotation[i] != current_label &&
             source_annotation[i] != ALIGNED) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      if (source_annotation[i] != UNANNOTATED)
        *out << "<span class=\""
             << label_class[ static_cast<int>( source_annotation[i] ) ]
             << "\">";
      current_label = source_annotation[i];
    }

    // output word
    *out << m_suffixArray->GetWord( sentence_start + i ) << " ";

    // change to right context block
    if (i == m_source_end) {
      *out << "</td><td class=\"pp_source_right\">"; // NOTE(review): reconstructed literal
      current_label = UNANNOTATED;
    }
  }

  if (current_label != UNANNOTATED && end_word>m_source_end)
    *out << "</span>";
  if (end_word<m_source_length-1)
    *out << "... ";

  // print target
  // shorten target context if too long
  // reconstructed: the displayed target span is the range of annotated words
  int target_start = -1;
  int target_end = 0;
  for(int i=0; i<m_target_length; i++) {
    if (target_annotation[i] != UNANNOTATED) {
      if (target_start == -1)
        target_start = i;
      target_end = i;
    }
  }
  // Nothing annotated: anchor on the first word rather than reading word -1
  // during the width accounting below.
  // NOTE(review): assumes the target sentence has at least one word.
  if (target_start == -1)
    target_start = 0;

  context_space = width/2;
  for(int i=target_start; i<=target_end; i++)
    context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
  while (context_space < 0) { // shorten matched part, if too long
    context_space +=
      m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
      m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
    target_start++;
    target_end--;
  }
  context_space /= 2;

  remaining = context_space;
  start_word = target_start;
  for(; start_word>0 && remaining>0; start_word--) {
    remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
  }
  if (remaining<0 || start_word == -1) start_word++;

  remaining = context_space;
  end_word = target_end;
  for(; end_word<m_target_length && remaining>0; end_word++) {
    remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
  }
  end_word--;

  // output with markup
  *out << "</td><td class=\"mismatch_target\">"; // NOTE(review): reconstructed literal
  current_label = UNANNOTATED;
  if (start_word>0) {
    current_label = target_annotation[start_word-1];
    *out << "... ";
  }
  for(int i=start_word; i<=end_word; i++) {
    if (target_annotation[i] != current_label) {
      if (current_label != UNANNOTATED && i!=start_word)
        *out << "</span>";
      if (target_annotation[i] != UNANNOTATED)
        *out << "<span class=\""
             << label_class[ static_cast<int>( target_annotation[i] ) ]
             << "\">";
      current_label = target_annotation[i];
    }

    // output word
    *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
  }

  if (current_label != UNANNOTATED && end_word>target_end)
    *out << "</span>";
  if (end_word<m_target_length-1)
    *out << "... ";
  *out << "</td></tr>"; // NOTE(review): reconstructed literal
}

/**
 * Labels a source word and every target word aligned to it.
 *
 * For each alignment point of the current sentence whose source side is
 * source_id, writes label into source_annotation[source_id] and into the
 * target_annotation slot of the point's target word. If the source word has
 * no alignment point, neither array is modified.
 *
 * @param source_annotation  per-source-word labels, updated in place
 * @param target_annotation  per-target-word labels, updated in place
 * @param source_id          index of the source word to look up
 * @param label              annotation value to store
 *
 * NOTE(review): the loop bound m_num_alignment_points is reconstructed (the
 * reviewed copy had it stripped) - confirm against Mismatch.h.
 */
void Mismatch::LabelSourceMatches( char *source_annotation, char *target_annotation, char source_id, char label )
{
  for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
    if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
      source_annotation[ static_cast<int>( source_id ) ] = label;
      target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
    }
  }
}