2010-10-21 13:49:27 +04:00
|
|
|
#include "PhrasePairCollection.h"
|
2012-05-07 18:41:18 +04:00
|
|
|
|
2010-10-21 13:49:27 +04:00
|
|
|
#include <stdlib.h>
|
|
|
|
#include <cstring>
|
|
|
|
#include <algorithm>
|
|
|
|
|
2012-05-07 18:41:18 +04:00
|
|
|
#include "Vocabulary.h"
|
|
|
|
#include "SuffixArray.h"
|
|
|
|
#include "TargetCorpus.h"
|
|
|
|
#include "Alignment.h"
|
|
|
|
#include "PhrasePair.h"
|
|
|
|
#include "Mismatch.h"
|
|
|
|
|
2010-10-21 13:49:27 +04:00
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
// Builds an empty collection over the given suffix array, target corpus and
// word alignment. The three pointers are only stored here; nothing is looked
// up until GetCollection() is called.
//
// Default limits set here, as used elsewhere in this file:
//   m_max_lookup    - once a query has more matches than this,
//                     GetCollection() skips ahead between matches
//   m_max_pp_target - maximum number of distinct translations PrintHTML() walks
//   m_max_pp        - maximum number of example rows PrintHTML() writes per
//                     translation (and, divided by 3, per mismatch table)
PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a )
  : m_suffixArray( sa ),
    m_targetCorpus( tc ),
    m_alignment( a ),
    m_size( 0 ),
    m_max_lookup( 10000 ),
    m_max_pp_target( 50 ),
    m_max_pp( 50 )
{
}
|
|
|
|
|
|
|
|
// Releases every PhrasePair and Mismatch that GetCollection() allocated with
// new and stored in m_collection, m_unaligned and m_mismatch. Without this
// the objects leak whenever a collection is destroyed.
//
// The suffix array, target corpus and alignment passed to the constructor
// are not created by this class and are therefore left untouched.
//
// NOTE(review): this assumes a PhrasePairCollection is never copied (the
// class declaration is not visible from this file); a copy would share the
// same raw pointers and delete them twice.
PhrasePairCollection::~PhrasePairCollection()
{
  for( size_t i=0; i<m_collection.size(); i++ ) {
    for( size_t j=0; j<m_collection[i].size(); j++ ) {
      delete m_collection[i][j];
    }
  }
  for( size_t i=0; i<m_unaligned.size(); i++ ) {
    delete m_unaligned[i];
  }
  for( size_t i=0; i<m_mismatch.size(); i++ ) {
    delete m_mismatch[i];
  }
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
bool PhrasePairCollection::GetCollection( const vector< string > sourceString )
|
|
|
|
{
|
|
|
|
INDEX first_match, last_match;
|
|
|
|
if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
cerr << "\tfirst match " << first_match << endl;
|
|
|
|
cerr << "\tlast match " << last_match << endl;
|
2010-10-21 13:49:27 +04:00
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
INDEX found = last_match - first_match +1;
|
2010-10-21 13:49:27 +04:00
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
map< vector< WORD_ID >, INDEX > index;
|
|
|
|
for( INDEX i=first_match; i<=last_match; i++ ) {
|
|
|
|
int position = m_suffixArray->GetPosition( i );
|
|
|
|
int source_start = m_suffixArray->GetWordInSentence( position );
|
|
|
|
int source_end = source_start + sourceString.size()-1;
|
|
|
|
INDEX sentence_id = m_suffixArray->GetSentence( position );
|
|
|
|
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
|
|
|
|
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
|
|
|
|
cerr << "match " << (i-first_match)
|
|
|
|
<< " in sentence " << sentence_id
|
|
|
|
<< ", starting at word " << source_start
|
|
|
|
<< " of " << sentence_length
|
|
|
|
<< ". target sentence has " << target_length << " words.";
|
|
|
|
char target_start, target_end, pre_null, post_null;
|
|
|
|
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
|
|
|
|
cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
|
|
|
|
cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
|
2011-06-22 01:52:13 +04:00
|
|
|
bool null_boundary_words = false;
|
|
|
|
for( char pre = 0; pre <= pre_null && (pre==0||null_boundary_words); pre++ ) {
|
|
|
|
for( char post = 0; post <= post_null && (post==0||null_boundary_words); post++ ) {
|
2011-02-24 16:57:11 +03:00
|
|
|
vector< WORD_ID > targetString;
|
|
|
|
cerr << "; ";
|
|
|
|
for( char target = target_start-pre; target <= target_end+post; target++ ) {
|
|
|
|
targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) );
|
|
|
|
cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
|
|
|
|
}
|
|
|
|
PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post);
|
|
|
|
// matchCollection.Add( sentence_id, )
|
|
|
|
if (index.find( targetString ) == index.end()) {
|
|
|
|
index[targetString] = m_collection.size();
|
|
|
|
vector< PhrasePair* > emptyVector;
|
|
|
|
m_collection.push_back( emptyVector );
|
|
|
|
}
|
|
|
|
m_collection[ index[targetString] ].push_back( phrasePair );
|
|
|
|
m_size++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2011-06-22 01:52:13 +04:00
|
|
|
else {
|
|
|
|
cerr << "mismatch " << (i-first_match)
|
|
|
|
<< " in sentence " << sentence_id
|
|
|
|
<< ", starting at word " << source_start
|
|
|
|
<< " of " << sentence_length
|
|
|
|
<< ". target sentence has " << target_length << " words.";
|
|
|
|
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
|
|
|
|
if (mismatch->Unaligned())
|
|
|
|
m_unaligned.push_back( mismatch );
|
|
|
|
else
|
|
|
|
m_mismatch.push_back( mismatch );
|
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
cerr << endl;
|
|
|
|
|
|
|
|
if (found > m_max_lookup) {
|
|
|
|
i += found/m_max_lookup-1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
sort(m_collection.begin(), m_collection.end(), CompareBySize());
|
2012-02-01 17:03:49 +04:00
|
|
|
return true;
|
2010-10-21 13:49:27 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
void PhrasePairCollection::Print()
|
|
|
|
{
|
|
|
|
vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
|
|
|
|
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) {
|
|
|
|
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
|
|
|
|
int count = ppWithSameTarget->size();
|
|
|
|
cout << "(" << count << ")" << endl;
|
|
|
|
vector< PhrasePair* >::iterator p;
|
|
|
|
for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) {
|
|
|
|
(*p)->Print( &cout, 100 );
|
|
|
|
}
|
|
|
|
}
|
2010-10-21 13:49:27 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
void PhrasePairCollection::PrintHTML()
|
|
|
|
{
|
|
|
|
int pp_target = 0;
|
2011-06-22 01:52:13 +04:00
|
|
|
bool singleton = false;
|
|
|
|
// loop over all translations
|
|
|
|
vector< vector<PhrasePair*> >::iterator ppWithSameTarget;
|
2011-02-24 16:57:11 +03:00
|
|
|
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
|
2011-06-22 01:52:13 +04:00
|
|
|
|
|
|
|
int count = ppWithSameTarget->size();
|
|
|
|
if (!singleton) {
|
|
|
|
if (count == 1) {
|
|
|
|
singleton = true;
|
|
|
|
cout << "<p class=\"pp_singleton_header\">singleton"
|
|
|
|
<< (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
|
|
|
|
<< (m_collection.end() - ppWithSameTarget)
|
|
|
|
<< "/" << m_size << ")</p>";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
cout << "<p class=\"pp_target_header\">";
|
|
|
|
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
|
|
|
|
cout << " (" << count << "/" << m_size << ")" << endl;
|
|
|
|
cout << "<p><div id=\"pp_" << pp_target << "\">";
|
|
|
|
}
|
|
|
|
cout << "<table align=\"center\">";
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
vector< PhrasePair* >::iterator p;
|
2011-06-22 01:52:13 +04:00
|
|
|
// loop over all sentences where translation occurs
|
|
|
|
int pp=0;
|
|
|
|
int i=0;
|
|
|
|
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
|
2011-02-24 16:57:11 +03:00
|
|
|
(*p)->PrintClippedHTML( &cout, 160 );
|
|
|
|
if (count > m_max_pp) {
|
|
|
|
p += count/m_max_pp-1;
|
|
|
|
pp += count/m_max_pp-1;
|
|
|
|
}
|
|
|
|
}
|
2011-06-22 01:52:13 +04:00
|
|
|
if (i == 10 && pp < count) {
|
|
|
|
// extended table
|
|
|
|
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
|
|
|
|
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
|
|
|
|
cout << "<table align=\"center\">";
|
|
|
|
for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_pp && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
|
|
|
|
(*p)->PrintClippedHTML( &cout, 160 );
|
|
|
|
if (count > m_max_pp) {
|
|
|
|
p += count/m_max_pp-1;
|
|
|
|
pp += count/m_max_pp-1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!singleton) cout << "</table></div>\n";
|
|
|
|
|
|
|
|
if (!singleton && pp_target == 9) {
|
|
|
|
cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
|
|
|
|
cout << "<p class=\"pp_target_header\">(more)</p></div>";
|
|
|
|
cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
|
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
}
|
2011-06-22 01:52:13 +04:00
|
|
|
if (singleton) cout << "</table></div>\n";
|
|
|
|
else if (pp_target > 9) cout << "</div>";
|
|
|
|
|
|
|
|
int max_mismatch = m_max_pp/3;
|
|
|
|
// unaligned phrases
|
|
|
|
if (m_unaligned.size() > 0) {
|
|
|
|
cout << "<p class=\"pp_singleton_header\">unaligned"
|
|
|
|
<< " (" << (m_unaligned.size()) << ")</p>";
|
|
|
|
cout << "<table align=\"center\">";
|
|
|
|
int step_size = 1;
|
|
|
|
if (m_unaligned.size() > max_mismatch)
|
|
|
|
step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
|
|
|
|
for(int i=0;i<m_unaligned.size();i+=step_size)
|
|
|
|
m_unaligned[i]->PrintClippedHTML( &cout, 160 );
|
|
|
|
cout << "</table>";
|
|
|
|
}
|
|
|
|
|
|
|
|
// mismatched phrases
|
|
|
|
if (m_mismatch.size() > 0) {
|
|
|
|
cout << "<p class=\"pp_singleton_header\">mismatched"
|
|
|
|
<< " (" << (m_mismatch.size()) << ")</p>";
|
|
|
|
cout << "<table align=\"center\">";
|
|
|
|
int step_size = 1;
|
|
|
|
if (m_mismatch.size() > max_mismatch)
|
|
|
|
step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
|
|
|
|
for(int i=0;i<m_mismatch.size();i+=step_size)
|
|
|
|
m_mismatch[i]->PrintClippedHTML( &cout, 160 );
|
|
|
|
cout << "</table>";
|
|
|
|
}
|
2010-10-21 13:49:27 +04:00
|
|
|
}
|