mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
6fa57a1dac
This is one of those little chores in managing a long-lived C++ project: standard C headers like stdio.h and math.h now have their own places in the C++ standard as cstdio and cmath, respectively, and so on. In this branch the #include names are updated for the biconcor/ subdirectory. C++11 adds cstdint, but to support compilation with the previous standard, that change is left for later.
293 lines
9.3 KiB
C++
293 lines
9.3 KiB
C++
#include "Mismatch.h"
|
|
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <cstring>
|
|
#include <string>
|
|
#include <cstdlib>
|
|
|
|
#include "SuffixArray.h"
|
|
#include "TargetCorpus.h"
|
|
#include "Alignment.h"
|
|
#include "Vocabulary.h"
|
|
|
|
using namespace std;
|
|
|
|
// Annotation labels attached to individual source/target words while an
// HTML row is rendered. The numeric values are spelled out because
// PrintClippedHTML uses them as indexes into its table of CSS class names.
enum {
  UNANNOTATED  = 0,  // no label; printed without a <span>
  PRE_ALIGNED  = 1,  // aligned word found before an unaligned source phrase
  POST_ALIGNED = 2,  // aligned word found after an unaligned source phrase
  UNALIGNED    = 3,  // word without an alignment point
  MISALIGNED   = 4,  // word linked to something outside the source phrase
  ALIGNED      = 5   // word linked to the source phrase
};
|
|
|
|
// Records one occurrence of a source phrase together with the alignment
// information needed to render it later.
//
//   sa, tc, a        suffix array, target corpus and alignment; the pointers
//                    are stored as given
//   sentence_id      sentence whose alignment points are consulted
//   position         stored as m_source_position (PrintClippedHTML subtracts
//                    m_source_start from it to find the sentence start)
//   source_length,
//   target_length    number of words in the source / target sentence
//   source_start,
//   source_end       first and last word (inclusive) of the source phrase
//
// After construction m_unaligned is true only if none of the words in
// [source_start, source_end] has an alignment point.
Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end )
  : m_suffixArray(sa)
  , m_targetCorpus(tc)
  , m_alignment(a)
  , m_sentence_id(sentence_id)
  , m_source_length(source_length)
  , m_target_length(target_length)
  , m_source_position(position)
  , m_source_start(source_start)
  , m_source_end(source_end)
  , m_unaligned(true)
{
  // Assume every word is unaligned until an alignment point says otherwise.
  for (int src = 0; src < m_source_length; ++src) {
    m_source_unaligned[src] = true;
  }
  for (int tgt = 0; tgt < m_target_length; ++tgt) {
    m_target_unaligned[tgt] = true;
  }

  // Clear the flag on both ends of every alignment point of the sentence.
  m_num_alignment_points = m_alignment->GetNumberOfAlignmentPoints( sentence_id );
  for (INDEX point = 0; point < m_num_alignment_points; ++point) {
    const int linked_source = static_cast<int>( m_alignment->GetSourceWord( sentence_id, point ) );
    m_source_unaligned[ linked_source ] = false;
    const int linked_target = static_cast<int>( m_alignment->GetTargetWord( sentence_id, point ) );
    m_target_unaligned[ linked_target ] = false;
  }

  // The phrase as a whole counts as aligned as soon as one of its words is;
  // the scan can stop at the first such word.
  for (int word = source_start; word <= source_end && m_unaligned; ++word) {
    if (!m_source_unaligned[ word ]) {
      m_unaligned = false;
    }
  }
}
|
|
|
|
// The destructor does no work: the pointers stored by the constructor are
// not deleted here.
Mismatch::~Mismatch()
{
}
|
|
|
|
// Writes this mismatch as one HTML table row (<tr>...</tr>) to *out.
//
// The row has four cells: the source sentence split into left context
// (class "pp_source_left"), the phrase itself ("pp_source") and right context
// ("pp_source_right"), followed by the target sentence ("mismatch_target").
// Words are wrapped in <span class="..."> according to their annotation
// label, and both sentences are clipped with "... " so that each side uses
// roughly width/2 characters.
//
//   out    stream that receives the markup
//   width  character budget for the whole row; half is spent on each side
//
// Guarantees of this implementation:
//   - the source phrase and the (possibly shortened) annotated target region
//     are always printed in full, so all four cells are always emitted;
//   - every <span> that is opened is closed again;
//   - if no target word carries an annotation, the beginning of the target
//     sentence is shown without highlighting.
void Mismatch::PrintClippedHTML( ostream* out, int width )
{
  // NOTE(review): fixed-size scratch buffers. A sentence with more than 256
  // words, or an alignment index >= 256, would overrun them; presumably the
  // corpus is limited accordingly -- confirm against the callers.
  int source_annotation[256], target_annotation[256];

  // CSS class for each annotation label, indexed by the label's enum value.
  static const char* const label_class[] = {
    "",                       // UNANNOTATED (never printed)
    "mismatch_pre_aligned",   // PRE_ALIGNED
    "mismatch_post_aligned",  // POST_ALIGNED
    "null_aligned",           // UNALIGNED
    "mismatch_misaligned",    // MISALIGNED
    "mismatch_aligned"        // ALIGNED
  };

  for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
  for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;

  if (m_unaligned) {
    // The phrase has no alignment point: walk outwards from the phrase, one
    // word further on each side per step, until an aligned source word is
    // found, and centre the target side around what that word is linked to.
    bool found_aligned = false;
    for(int i=1; i<m_source_length && !found_aligned; i++) {
      if (m_source_start-i >= 0) {
        const int word_id = m_source_start-i;
        source_annotation[ word_id ] = UNALIGNED;
        if (!m_source_unaligned[ word_id ]) {
          found_aligned = true;
          LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
        }
      }

      if (m_source_end+i < m_source_length) {
        const int word_id = m_source_end+i;
        source_annotation[ word_id ] = UNALIGNED;
        if (!m_source_unaligned[ word_id ]) {
          found_aligned = true;
          LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
        }
      }
    }
  }
  // misalignment
  else {
    // label the target words linked to the phrase
    for(int i=m_source_start; i<=m_source_end; i++)
      LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );

    // find the first and last target word labelled ALIGNED
    int first_aligned = -1;
    int last_aligned = -1;
    for(int i=0; i<m_target_length; i++)
      if (target_annotation[i] == ALIGNED) {
        if (first_aligned == -1)
          first_aligned = i;
        last_aligned = i;
      }

    // go over all enclosed target words; skipped entirely if nothing was
    // labelled, so that no index of -1 is ever used
    if (first_aligned != -1) {
      for(int i=first_aligned; i<=last_aligned; i++) {
        // label other target words as unaligned or misaligned
        if (m_target_unaligned[ i ])
          target_annotation[ i ] = UNALIGNED;
        else {
          if (target_annotation[ i ] != ALIGNED)
            target_annotation[ i ] = MISALIGNED;
          // loop over the source words aligned to this target word
          for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
            const int linked_target = m_alignment->GetTargetWord( m_sentence_id, ap );
            if (linked_target == i) {
              const int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
              // if not part of the source phrase -> also misaligned
              if (source_word < m_source_start || source_word > m_source_end)
                source_annotation[ source_word ] = MISALIGNED;
            }
          }
        }
      }
    }

    // closure: whenever an alignment point links an annotated word to an
    // unannotated one, mark the latter MISALIGNED; repeat until stable
    bool change = true;
    while(change) {
      change = false;
      for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
        const int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
        const int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
        if (source_annotation[source_word] != UNANNOTATED &&
            target_annotation[target_word] == UNANNOTATED) {
          target_annotation[target_word] = MISALIGNED;
          change = true;
        }
        if (source_annotation[source_word] == UNANNOTATED &&
            target_annotation[target_word] != UNANNOTATED) {
          source_annotation[source_word] = MISALIGNED;
          change = true;
        }
      }
    }
  }

  // ---- source side ----

  // Budget for each context side: half of what is left of width/2 after the
  // phrase itself (each word counts its length plus one space).
  const int sentence_start = m_source_position - m_source_start;
  int context_space = width/2;
  for(int i=m_source_start; i<=m_source_end; i++)
    context_space -= static_cast<int>( m_suffixArray->GetWord( sentence_start + i ).size() ) + 1;
  context_space /= 2;

  // extend to the left while there is room
  int remaining = context_space;
  int start_word = m_source_start;
  for(; start_word>0 && remaining>0; start_word--)
    remaining -= static_cast<int>( m_suffixArray->GetWord( sentence_start + start_word-1 ).size() ) + 1;
  if (remaining<0) start_word++;  // the last word taken did not fit
  // never clip into the phrase, even when the budget was negative from the start
  if (start_word > m_source_start) start_word = m_source_start;

  // extend to the right; the loop starts on the last phrase word and the
  // word that exhausts the budget is still included
  remaining = context_space;
  int end_word = m_source_end;
  for(; end_word<m_source_length && remaining>0; end_word++)
    remaining -= static_cast<int>( m_suffixArray->GetWord( sentence_start + end_word ).size() ) + 1;
  end_word--;
  // never clip into the phrase when the loop did not run at all
  if (end_word < m_source_end) end_word = m_source_end;

  // output with markup
  *out << "<tr><td class=\"pp_source_left\">";
  // No span is open yet, whatever the label of the clipped-off word before
  // the window is, so tracking always starts from UNANNOTATED.
  int current_label = UNANNOTATED;
  if (start_word>0)
    *out << "... ";
  for(int i=start_word; i<=end_word; i++) {
    // change to phrase block
    if (i == m_source_start) {
      if (current_label != UNANNOTATED)
        *out << "</span>";
      *out << "</td><td class=\"pp_source\">";
      current_label = UNANNOTATED;
    }
    // change to labeled word (ALIGNED words are not highlighted on this side)
    else if (source_annotation[i] != current_label &&
             source_annotation[i] != ALIGNED) {
      if (current_label != UNANNOTATED)
        *out << "</span>";
      if (source_annotation[i] != UNANNOTATED)
        *out << "<span class=\""
             << label_class[ source_annotation[i] ]
             << "\">";
      current_label = source_annotation[i];
    }

    // output word
    *out << m_suffixArray->GetWord( sentence_start + i ) << " ";

    // change to right context block
    if (i == m_source_end) {
      *out << "</td><td class=\"pp_source_right\">";
      current_label = UNANNOTATED;
    }
  }

  if (current_label != UNANNOTATED)
    *out << "</span>";
  if (end_word<m_source_length-1)
    *out << "... ";

  // ---- target side ----

  // find the first and last annotated target word
  int target_start = -1;
  int target_end = -1;
  for(int i=0; i<m_target_length; i++)
    if (target_annotation[i] != UNANNOTATED) {
      if (target_start == -1)
        target_start = i;
      target_end = i;
    }
  // Nothing annotated (e.g. no aligned source word was found): use the empty
  // region [0,-1] so that the beginning of the sentence is shown instead of
  // indexing with uninitialised or negative positions.
  if (target_start == -1)
    target_start = 0;

  context_space = width/2;
  for(int i=target_start; i<=target_end; i++)
    context_space -= static_cast<int>( m_targetCorpus->GetWord( m_sentence_id, i ).size() ) + 1;
  // shorten matched part from both ends if too long; stop at a single word so
  // that the bounds cannot cross or leave the sentence
  while (context_space < 0 && target_start < target_end) {
    context_space +=
      static_cast<int>( m_targetCorpus->GetWord( m_sentence_id, target_start ).size() ) +
      static_cast<int>( m_targetCorpus->GetWord( m_sentence_id, target_end ).size() ) + 2;
    target_start++;
    target_end--;
  }
  context_space /= 2;

  // extend to the left while there is room
  remaining = context_space;
  start_word = target_start;
  for(; start_word>0 && remaining>0; start_word--)
    remaining -= static_cast<int>( m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() ) + 1;
  if (remaining<0) start_word++;  // the last word taken did not fit
  if (start_word > target_start) start_word = target_start;

  // extend to the right (from word 0 when nothing is annotated)
  remaining = context_space;
  end_word = (target_end < 0) ? 0 : target_end;
  for(; end_word<m_target_length && remaining>0; end_word++)
    remaining -= static_cast<int>( m_targetCorpus->GetWord( m_sentence_id, end_word ).size() ) + 1;
  end_word--;
  if (end_word < target_end) end_word = target_end;

  // output with markup
  *out << "</td><td class=\"mismatch_target\">";
  current_label = UNANNOTATED;
  if (start_word>0)
    *out << "... ";
  for(int i=start_word; i<=end_word; i++) {
    if (target_annotation[i] != current_label) {
      if (current_label != UNANNOTATED)
        *out << "</span>";
      if (target_annotation[i] != UNANNOTATED)
        *out << "<span class=\""
             << label_class[ target_annotation[i] ]
             << "\">";
      current_label = target_annotation[i];
    }

    // output word
    *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
  }

  // close the span of the last printed word, also when that word is the end
  // of the annotated region
  if (current_label != UNANNOTATED)
    *out << "</span>";
  if (end_word<m_target_length-1)
    *out << "... ";
  *out << "</td></tr>";
}
|
|
|
|
void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label )
|
|
{
|
|
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
|
|
if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
|
|
source_annotation[ source_id ] = label;
|
|
target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
|
|
}
|
|
}
|
|
}
|