mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
47ac8a474d
This change helps avoid duplicated header-guard names. Although the MERT programs are standalone applications, some header files such as data.h and point.h use common guard macro names like "DATA_H" and "POINT_H", which is a poor naming convention when you want to include external headers. Some files actually include headers from Moses and KenLM's util.
1036 lines
30 KiB
C++
1036 lines
30 KiB
C++
//
|
|
// C++ Implementation: tercalc
|
|
//
|
|
// Description:
|
|
//
|
|
//
|
|
// Author: <>, (C) 2010
|
|
//
|
|
// Copyright: See COPYING file that comes with this distribution
|
|
//
|
|
//
|
|
#include "tercalc.h"
|
|
using namespace std;
|
|
using namespace Tools;
|
|
namespace TERCpp
|
|
{
|
|
|
|
// Default constructor: installs the default TER configuration.
// Every edit operation costs 1.0 except a match (0.0), shifts are limited to
// 50 for both size and distance, the beam width is 20, the statistics
// counters start at zero and debug tracing is switched off.
terCalc::terCalc()
{
  // Edit-operation costs.
  match_cost = 0.0;
  substitute_cost = 1.0;
  insert_cost = 1.0;
  delete_cost = 1.0;
  shift_cost = 1.0;

  // Search limits. INF is the starting value used for best-score tracking
  // in MinEditDist.
  INF = 999999.0;
  BEAM_WIDTH = 20;
  MAX_SHIFT_SIZE = 50;
  MAX_SHIFT_DIST = 50;

  // Run statistics.
  NUM_SEGMENTS_SCORED = 0;
  NUM_SHIFTS_CONSIDERED = 0;
  NUM_BEAM_SEARCH_CALLS = 0;

  // Debug tracing (see setDebugMode).
  PRINT_DEBUG = false;
}
|
|
|
|
|
|
// terCalc::~terCalc()
|
|
// {
|
|
// }
|
|
// size_t* terCalc::hashVec ( vector<string> s )
|
|
// {
|
|
// size_t retour[ ( int ) s.size() ];
|
|
// int i=0;
|
|
// for ( i=0; i< ( int ) s.size(); i++ )
|
|
// {
|
|
// boost::hash<std::string> hasher;
|
|
// retour[i]=hasher ( s.at ( i ) );
|
|
// }
|
|
// return retour;
|
|
// }
|
|
|
|
|
|
int terCalc::WERCalculation ( size_t * ref, size_t * hyp )
|
|
{
|
|
int retour;
|
|
int REFSize = sizeof ( ref ) + 1;
|
|
int HYPSize = sizeof ( hyp ) + 1;
|
|
int WER[REFSize][HYPSize];
|
|
int i = 0;
|
|
int j = 0;
|
|
for ( i = 0; i < REFSize; i++ ) {
|
|
WER[i][0] = ( int ) i;
|
|
}
|
|
for ( j = 0; j < HYPSize; j++ ) {
|
|
WER[0][j] = ( int ) j;
|
|
}
|
|
for ( j = 1; j < HYPSize; j++ ) {
|
|
for ( i = 1; i < REFSize; i++ ) {
|
|
if ( i == 1 ) {
|
|
cerr << endl;
|
|
}
|
|
if ( ref[i-1] == hyp[j-1] ) {
|
|
WER[i][j] = WER[i-1][j-1];
|
|
cerr << "- ";
|
|
cerr << WER[i][j] << "-\t";
|
|
} else {
|
|
if ( ( ( WER[i-1][ j] + 1 ) < ( WER[i][j-1] + 1 ) ) && ( ( WER[i-1][j] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
|
|
WER[i][j] = ( WER[i-1][j] + 1 );
|
|
// cerr << "D ";
|
|
cerr << WER[i][j] << "D\t";
|
|
} else {
|
|
if ( ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j] + 1 ) ) && ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
|
|
WER[i][j] = ( WER[i][j-1] + 1 );
|
|
// cerr << "I ";
|
|
cerr << WER[i][j] << "I\t";
|
|
} else {
|
|
WER[i][j] = ( WER[i-1][j-1] + 1 );
|
|
// cerr << "S ";
|
|
cerr << WER[i][j] << "S\t";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
cerr << endl;
|
|
retour = WER[i-1][j-1];
|
|
cerr << "i : " << i - 1 << "\tj : " << j - 1 << endl;
|
|
return retour;
|
|
}
|
|
int terCalc::WERCalculation ( std::vector< int > ref, std::vector< int > hyp )
|
|
{
|
|
stringstream s;
|
|
s.str ( "" );
|
|
string stringRef ( "" );
|
|
string stringHyp ( "" );
|
|
for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) {
|
|
if ( l_it == ref.begin() ) {
|
|
s << ( *l_it );
|
|
} else {
|
|
s << " " << ( *l_it );
|
|
}
|
|
}
|
|
stringRef = s.str();
|
|
s.str ( "" );
|
|
for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) {
|
|
if ( l_itHyp == hyp.begin() ) {
|
|
s << ( *l_itHyp );
|
|
} else {
|
|
s << " " << ( *l_itHyp );
|
|
}
|
|
}
|
|
stringHyp = s.str();
|
|
s.str ( "" );
|
|
return WERCalculation ( stringToVector ( stringRef, " " ), stringToVector ( stringHyp , " " ) );
|
|
}
|
|
|
|
// Integer-id overload of TER: renders each id sequence as a space-separated
// string, splits it back into tokens with stringToVector and delegates to
// TER ( vector<string> hyp, vector<string> ref ).
//
// @param hyp  hypothesis word ids
// @param ref  reference word ids
// @return     the alignment computed by the string overload
terAlignment terCalc::TER ( std::vector< int > hyp, std::vector< int > ref )
{
  stringstream s;
  string stringRef ( "" );
  string stringHyp ( "" );

  for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); ++l_it ) {
    if ( l_it != ref.begin() ) {
      s << " ";
    }
    s << ( *l_it );
  }
  stringRef = s.str();
  s.str ( "" );

  for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); ++l_itHyp ) {
    if ( l_itHyp != hyp.begin() ) {
      s << " ";
    }
    s << ( *l_itHyp );
  }
  stringHyp = s.str();
  s.str ( "" );

  vector<string> hypWords = stringToVector ( stringHyp , " " );
  vector<string> refWords = stringToVector ( stringRef , " " );

  // The string overload takes the hypothesis first and the reference second.
  // The arguments used to be passed as (reference, hypothesis), which swapped
  // the two sides.
  return TER ( hypWords, refWords );
}
|
|
|
|
// Computes the word-level edit distance between a reference and a hypothesis,
// rebuilds an alignment by walking the cost matrix back from its bottom-right
// corner, appends that alignment to the member l_WERalignment and dumps the
// matrices and the alignment on stderr.
//
// Each alignment element holds four strings: the cost of the visited cell, its
// operation code ('-' match, 'S' substitution, 'D' deletion, 'I' insertion),
// the hypothesis word (or "***") and the reference word (or "***").
//
// @param ref  reference words
// @param hyp  hypothesis words
// @return     the edit distance (bottom-right cell of the cost matrix)
int terCalc::WERCalculation ( vector<string> ref, vector<string> hyp )
{
  const int REFSize = ( int ) ref.size() + 1;
  const int HYPSize = ( int ) hyp.size() + 1;

  // Heap-backed matrices: variable-length arrays are not standard C++.
  // WERchar is fully initialised because the backtrace below reads one
  // border cell (row or column 0) that the fill loops never write.
  vector< vector<int> > WER ( REFSize, vector<int> ( HYPSize, 0 ) );
  vector< vector<char> > WERchar ( REFSize, vector<char> ( HYPSize, ' ' ) );

  int i = 0;
  int j = 0;

  // First column / first row: distance to the empty sequence.
  for ( i = 0; i < REFSize; i++ ) {
    WER[i][0] = i;
  }
  for ( j = 0; j < HYPSize; j++ ) {
    WER[0][j] = j;
  }

  for ( j = 1; j < HYPSize; j++ ) {
    for ( i = 1; i < REFSize; i++ ) {
      if ( ref[i-1] == hyp[j-1] ) {
        WER[i][j] = WER[i-1][j-1];
        WERchar[i][j] = '-';
      } else {
        const int delCost = WER[i-1][j] + 1;
        const int insCost = WER[i][j-1] + 1;
        const int subCost = WER[i-1][j-1] + 1;
        // Always take the cheapest operation. Substitution wins ties, then
        // deletion, then insertion. The previous strict comparisons fell
        // through to substitution when deletion and insertion tied below it,
        // which stored a cost that was not the minimum.
        if ( ( subCost <= delCost ) && ( subCost <= insCost ) ) {
          WER[i][j] = subCost;
          WERchar[i][j] = 'S';
        } else if ( delCost <= insCost ) {
          WER[i][j] = delCost;
          WERchar[i][j] = 'D';
        } else {
          WER[i][j] = insCost;
          WERchar[i][j] = 'I';
        }
      }
    }
  }
  cerr << endl;

  const int retour = WER[REFSize-1][HYPSize-1];
  cerr << "i : " << REFSize - 1 << "\tj : " << HYPSize - 1 << endl;

  // Backtrace from the bottom-right corner. The corner cell is recorded
  // first; afterwards each step records the neighbouring cell it moves to,
  // preferring the diagonal, then the left neighbour, then the upper one.
  WERalignment local;
  stringstream s;
  i = REFSize - 1;
  j = HYPSize - 1;
  while ( j > 0 && i > 0 ) {
    cerr << "indice i : " << i << "\t";
    cerr << "indice j : " << j << endl;

    if ( ( j == HYPSize - 1 ) && ( i == REFSize - 1 ) ) {
      alignmentElement cornerInfos;
      s << WER[i][j];
      cornerInfos.push_back ( s.str() );
      s.str ( "" );
      s << WERchar[i][j];
      cornerInfos.push_back ( s.str() );
      s.str ( "" );
      local.push_back ( cornerInfos );
    }

    int nextI;
    int nextJ;
    if ( ( WER[i-1][j-1] <= WER[i-1][j] ) && ( WER[i-1][j-1] <= WER[i][j-1] ) ) {
      nextI = i - 1;
      nextJ = j - 1;
    } else if ( ( WER[i][j-1] <= WER[i-1][j] ) && ( WER[i][j-1] <= WER[i-1][j-1] ) ) {
      nextI = i;
      nextJ = j - 1;
    } else {
      nextI = i - 1;
      nextJ = j;
    }

    alignmentElement localInfos;
    s << WER[nextI][nextJ];
    localInfos.push_back ( s.str() );
    s.str ( "" );
    s << WERchar[nextI][nextJ];
    localInfos.push_back ( s.str() );
    s.str ( "" );
    local.push_back ( localInfos );

    i = nextI;
    j = nextJ;
  }

  // Debug dump: operation matrix, then cost matrix (borders omitted).
  for ( j = 1; j < HYPSize; j++ ) {
    for ( i = 1; i < REFSize; i++ ) {
      cerr << WERchar[i][j] << " ";
    }
    cerr << endl;
  }
  cerr << endl;
  for ( j = 1; j < HYPSize; j++ ) {
    for ( i = 1; i < REFSize; i++ ) {
      cerr << WER[i][j] << " ";
    }
    cerr << endl;
  }
  cerr << "=================" << endl;

  // Replay the backtrace in forward order (the last recorded element, which
  // lies on the matrix border, is skipped) and attach the words.
  int k = ( int ) local.size() - 1;
  int l = 0;  // next hypothesis word to attach
  int m = 0;  // next reference word to attach
  while ( k > 0 ) {
    alignmentElement localInfos = local.at ( k - 1 );

    // The guards used to be "l > HYPSize - 1" / "m > REFSize - 1", which let
    // the index reach size() and made at() throw std::out_of_range.
    if ( localInfos.at ( 1 ).compare ( "D" ) == 0 || l >= ( int ) hyp.size() ) {
      localInfos.push_back ( "***" );
    } else {
      localInfos.push_back ( hyp.at ( l ) );
      l++;
    }
    if ( localInfos.at ( 1 ).compare ( "I" ) == 0 || m >= ( int ) ref.size() ) {
      localInfos.push_back ( "***" );
    } else {
      localInfos.push_back ( ref.at ( m ) );
      m++;
    }
    l_WERalignment.push_back ( localInfos );
    k--;
  }
  cerr << endl;

  // Print every element stored in l_WERalignment (the member is appended to
  // above and is not cleared by this function).
  for ( k = 0; k < ( int ) l_WERalignment.size(); k++ ) {
    alignmentElement localInfos = l_WERalignment.at ( k );
    cerr << localInfos.at ( 0 ) << "\t" << localInfos.at ( 1 ) << "\t" << localInfos.at ( 2 ) << "\t" << localInfos.at ( 3 ) << endl;
  }
  cerr << endl;
  cerr << "=================" << endl;
  return retour;
}
|
|
|
|
// string terCalc::vectorToString(vector<string> vec)
|
|
// {
|
|
// string retour("");
|
|
// for (vector<string>::iterator vecIter=vec.begin();vecIter!=vec.end(); vecIter++)
|
|
// {
|
|
// retour+=(*vecIter)+"\t";
|
|
// }
|
|
// return retour;
|
|
// }
|
|
|
|
// vector<string> terCalc::subVector(vector<string> vec, int start, int end)
|
|
// {
|
|
// if (start>end)
|
|
// {
|
|
// cerr << "ERREUR : terCalc::subVector : end > start"<<endl;
|
|
// exit(0);
|
|
// }
|
|
// vector<string> retour;
|
|
// for (int i=start; ((i<end) && (i< vec.size())); i++)
|
|
// {
|
|
// retour.push_back(vec.at(i));
|
|
// }
|
|
// return retour;
|
|
// }
|
|
|
|
// Indexes every run of consecutive reference words whose words all occur
// somewhere in the hypothesis.
//
// For each start position in ref and each run length (the run is cut once
// end - start exceeds MAX_SHIFT_SIZE or a word that is absent from hyp is
// met), the run is rendered with vectorToString and its start position is
// appended to the list stored under that key.
//
// @param hyp  hypothesis words
// @param ref  reference words
// @return     map from rendered reference word run to its start positions
hashMapInfos terCalc::BuildWordMatches ( vector<string> hyp, vector<string> ref )
{
  hashMap tempHash;
  hashMapInfos retour;
  const int refLen = ( int ) ref.size();

  // Register every hypothesis word.
  for ( int i = 0; i < ( int ) hyp.size(); i++ ) {
    tempHash.addHasher ( hyp.at ( i ), "" );
  }

  // cor[i] is true when reference word i also occurs in the hypothesis.
  // A vector replaces the former variable-length array, which is not
  // standard C++ and had zero length for an empty reference.
  vector<bool> cor ( ref.size(), false );
  for ( int i = 0; i < refLen; i++ ) {
    if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) {
      cor[i] = true;
    }
  }

  for ( int start = 0; start < refLen; start++ ) {
    if ( !cor[start] ) {
      continue;
    }
    for ( int end = start; ( end < refLen ) && ( end - start <= MAX_SHIFT_SIZE ) && cor[end]; end++ ) {
      vector<string> ajouter = subVector ( ref, start, end + 1 );
      string ajouterString = vectorToString ( ajouter );
      vector<int> values = retour.getValue ( ajouterString );
      values.push_back ( start );
      // More than one position means the key was already stored: update it.
      // Otherwise this is its first occurrence: add it.
      if ( values.size() > 1 ) {
        retour.setValue ( ajouterString, values );
      } else {
        retour.addValue ( ajouterString, values );
      }
    }
  }
  return retour;
}
|
|
|
|
// Tells whether two spans overlap. Each span is read as a pair: element 0 is
// its first bound and element 1 its second bound, both inclusive.
//
// @return true when refSpan ends at or after the start of hypSpan and starts
//         at or before the end of hypSpan
bool terCalc::spanIntersection ( vecInt refSpan, vecInt hypSpan )
{
  return ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) );
}
|
|
|
|
|
|
// Beam-pruned minimum edit distance between hyp and ref.
//
// Fills the member matrices S (best cost so far) and P (operation that
// produced it: ' ' match, 'S' substitution, 'I' insertion, 'D' deletion)
// column by column, i.e. one hypothesis position at a time, then walks P
// back from the bottom-right cell to recover the operation sequence.
//
// @param hyp          hypothesis words
// @param ref          reference words
// @param curHypSpans  spans of the hypothesis words; only consulted when both
//                     members refSpans and hypSpans are non-empty
// @return alignment with numWords = ref.size(), the operation path and
//         numEdits = S[ref.size()][hyp.size()]
//
// NOTE(review): S and P are indexed up to [ref.size()][hyp.size()] without a
// capacity check (the original check is commented out in the source) —
// confirm their declared size covers the longest expected segment.
terAlignment terCalc::MinEditDist ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans )
{
  const int refLen = ( int ) ref.size();
  const int hypLen = ( int ) hyp.size();

  double current_best = INF;
  double last_best = INF;
  int first_good = 0;
  int current_first_good = 0;
  int last_good = -1;
  int cur_last_good = 0;
  int last_peak = 0;
  int cur_last_peak = 0;

  NUM_BEAM_SEARCH_CALLS++;

  // Reset the part of the matrices this call will use.
  for ( int row = 0; row <= refLen; row++ ) {
    for ( int col = 0; col <= hypLen; col++ ) {
      S[row][col] = -1.0;
      P[row][col] = '0';
    }
  }
  S[0][0] = 0.0;

  for ( int col = 0; col <= hypLen; col++ ) {
    // Roll the per-column bookkeeping over to the new column.
    last_best = current_best;
    current_best = INF;
    first_good = current_first_good;
    current_first_good = -1;
    last_good = cur_last_good;
    cur_last_good = -1;
    last_peak = cur_last_peak;
    cur_last_peak = 0;

    for ( int row = first_good; row <= refLen; row++ ) {
      if ( row > last_good ) {
        break;
      }
      if ( S[row][col] < 0 ) {
        continue;  // cell never reached
      }
      const double score = S[row][col];
      // Beam pruning (not applied in the last column).
      if ( ( col < hypLen ) && ( score > last_best + BEAM_WIDTH ) ) {
        continue;
      }
      if ( current_first_good == -1 ) {
        current_first_good = row;
      }

      // Diagonal move: match or substitution.
      if ( ( row < refLen ) && ( col < hypLen ) ) {
        if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || spanIntersection ( refSpans.at ( row ), curHypSpans.at ( col ) ) ) {
          if ( ref.at ( row ) == hyp.at ( col ) ) {
            const double cost = match_cost + score;
            if ( ( S[row+1][col+1] == -1 ) || ( cost < S[row+1][col+1] ) ) {
              S[row+1][col+1] = cost;
              P[row+1][col+1] = ' ';
            }
            if ( cost < current_best ) {
              current_best = cost;
            }
            if ( current_best == cost ) {
              cur_last_peak = row + 1;
            }
          } else {
            const double cost = substitute_cost + score;
            if ( ( S[row+1][col+1] < 0 ) || ( cost < S[row+1][col+1] ) ) {
              S[row+1][col+1] = cost;
              P[row+1][col+1] = 'S';
              if ( cost < current_best ) {
                current_best = cost;
              }
              if ( current_best == cost ) {
                cur_last_peak = row + 1;
              }
            }
          }
        }
      }

      cur_last_good = row + 1;

      // Horizontal move: insertion (consumes a hypothesis word).
      if ( col < hypLen ) {
        const double icost = score + insert_cost;
        if ( ( S[row][col+1] < 0 ) || ( S[row][col+1] > icost ) ) {
          S[row][col+1] = icost;
          P[row][col+1] = 'I';
          if ( ( cur_last_peak < row ) && ( current_best == icost ) ) {
            cur_last_peak = row;
          }
        }
      }

      // Vertical move: deletion (consumes a reference word). It stays in the
      // current column, so the row loop may have to be extended.
      if ( row < refLen ) {
        const double dcost = score + delete_cost;
        if ( ( S[row+1][col] < 0.0 ) || ( S[row+1][col] > dcost ) ) {
          S[row+1][col] = dcost;
          P[row+1][col] = 'D';
          if ( row >= last_good ) {
            last_good = row + 1;
          }
        }
      }
    }
  }

  // Walk P back from the bottom-right cell, collecting operations in reverse.
  vector<char> reversedOps;
  int row = refLen;
  int col = hypLen;
  while ( ( row > 0 ) || ( col > 0 ) ) {
    const int fromRow = row;
    const int fromCol = col;
    switch ( P[fromRow][fromCol] ) {
    case ' ':
    case 'S':
      row--;
      col--;
      break;
    case 'D':
      row--;
      break;
    case 'I':
      col--;
      break;
    default:
      cerr << "ERROR : terCalc::MinEditDist : Invalid path : " << P[fromRow][fromCol] << endl;
      exit ( -1 );
    }
    reversedOps.push_back ( P[fromRow][fromCol] );
  }
  vector<char> path ( reversedOps.rbegin(), reversedOps.rend() );

  terAlignment to_return;
  to_return.numWords = ref.size();
  to_return.alignment = path;
  to_return.numEdits = S[ref.size() ][hyp.size() ];
  if ( PRINT_DEBUG ) {
    cerr << "BEGIN DEBUG : terCalc::MinEditDist : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl;
  }
  return to_return;
}
|
|
// Computes the TER alignment of a hypothesis against a reference.
//
// Starts from the plain edit-distance alignment (MinEditDist), then keeps
// applying the shift chosen by CalcBestShift until it reports that no shift
// is left. The cost of every applied shift is added to the edit count of the
// final alignment.
//
// @param hyp  hypothesis words
// @param ref  reference words
// @return the final alignment, with allshifts holding the applied shifts in
//         order and numEdits including their costs
terAlignment terCalc::TER ( vector<string> hyp, vector<string> ref )
{
  hashMapInfos rloc = BuildWordMatches ( hyp, ref );

  terAlignment cur_align = MinEditDist ( hyp, ref, hypSpans );
  cur_align.hyp = hyp;
  cur_align.ref = ref;
  cur_align.aftershift = hyp;

  vector<string> cur = hyp;      // hypothesis after the shifts applied so far
  vector<terShift> allshifts;    // shifts applied so far
  double edits = 0;              // accumulated shift cost

  if ( PRINT_DEBUG ) {
    cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl;
  }

  for ( ;; ) {
    bestShiftStruct returns = CalcBestShift ( cur, hyp, ref, rloc, cur_align );
    if ( returns.m_empty ) {
      break;
    }
    cur_align = returns.m_best_align;

    terShift bestShift = returns.m_best_shift;
    edits += bestShift.cost;
    bestShift.alignment = cur_align.alignment;
    bestShift.aftershift = cur_align.aftershift;
    allshifts.push_back ( bestShift );

    cur = cur_align.aftershift;
  }

  cur_align.allshifts = allshifts;
  cur_align.numEdits += edits;
  NUM_SEGMENTS_SCORED++;
  return cur_align;
}
|
|
// Looks for the single shift of the current hypothesis that lowers the total
// cost (edits + shift cost) the most.
//
// Candidate shifts come from GatherAllPossShifts, grouped by length; longer
// shifts are tried first. Each candidate is applied with PerformShift and
// re-aligned with MinEditDist.
//
// @param cur        hypothesis after the shifts applied so far
// @param hyp        original hypothesis (stored in the returned alignment)
// @param ref        reference words
// @param rloc       word-run index built by BuildWordMatches
// @param med_align  alignment of cur against ref
// @return m_empty == true when no candidate was accepted; otherwise the
//         chosen shift and its alignment
bestShiftStruct terCalc::CalcBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment med_align )
{
  // Per-word error flags and the ref -> hyp position map, filled by
  // FindAlignErr from med_align.
  bool herr[ ( int ) hyp.size() ];
  bool rerr[ ( int ) ref.size() ];
  int ralign[ ( int ) ref.size() ];
  FindAlignErr ( med_align, herr, rerr, ralign );

  vector<vecTerShift> poss_shifts = GatherAllPossShifts ( cur, ref, rloc, med_align, herr, rerr, ralign );
  const double curerr = med_align.numEdits;

  if ( PRINT_DEBUG ) {
    cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
    cerr << "Possible Shifts:" << endl;
    for ( int len = ( int ) poss_shifts.size() - 1; len >= 0; len-- ) {
      vecTerShift& bucket = poss_shifts.at ( len );
      for ( int idx = 0; idx < ( int ) bucket.size(); idx++ ) {
        cerr << " [" << len << "] " << bucket.at ( idx ).toString() << endl;
      }
    }
    cerr << endl;
    cerr << "END DEBUG " << endl;
  }

  bool anygain = false;
  double cur_best_shift_cost = 0.0;
  terAlignment cur_best_align = med_align;
  terShift cur_best_shift;

  // Bucket index len holds the shifts that move len + 1 words.
  for ( int len = ( int ) poss_shifts.size() - 1; len >= 0; len-- ) {
    vecTerShift& bucket = poss_shifts.at ( len );
    if ( PRINT_DEBUG ) {
      cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
      cerr << "Considering shift of length " << len << " (" << bucket.size() << ")" << endl;
      cerr << "END DEBUG " << endl;
    }

    // Stop once the improvement already found exceeds maxfix, or equals it
    // when a shift has already been chosen.
    const double maxfix = ( 2 * ( 1 + len ) );
    double curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits );
    if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
      break;
    }

    for ( int idx = 0; idx < ( int ) bucket.size(); idx++ ) {
      curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits );
      if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
        break;
      }

      terShift curshift = bucket.at ( idx );

      // Apply the candidate and re-align the shifted hypothesis.
      alignmentStruct shiftReturns = PerformShift ( cur, curshift );
      vector<string> shiftarr = shiftReturns.nwords;
      vector<vecInt> curHypSpans = shiftReturns.aftershift;

      terAlignment curalign = MinEditDist ( shiftarr, ref, curHypSpans );
      curalign.hyp = hyp;
      curalign.ref = ref;
      curalign.aftershift = shiftarr;

      const double gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost );

      if ( PRINT_DEBUG ) {
        cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
        cerr << "Gain for " << curshift.toString() << " is " << gain << ". (result: [" << curalign.join ( " ", shiftarr ) << "]" << endl;
        cerr << "" << curalign.toString() << "\n" << endl;
        cerr << "END DEBUG " << endl;
      }

      // Accept a strict improvement, or a zero-gain candidate while nothing
      // has been chosen yet.
      if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) {
        anygain = true;
        cur_best_shift = curshift;
        cur_best_shift_cost = curshift.cost;
        cur_best_align = curalign;
        if ( PRINT_DEBUG ) {
          cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
          cerr << "Tmp Choosing shift: " << cur_best_shift.toString() << " gives:\n" << cur_best_align.toString() << "\n" << endl;
          cerr << "END DEBUG " << endl;
        }
      }
    }
  }

  bestShiftStruct to_return;
  if ( anygain ) {
    to_return.m_best_shift = cur_best_shift;
    to_return.m_best_align = cur_best_align;
    to_return.m_empty = false;
  } else {
    to_return.m_empty = true;
  }
  return to_return;
}
|
|
|
|
// Expands an alignment path into per-word information.
//
// Walks align.alignment and, for every operation, advances the hypothesis
// and/or reference position and records:
//   herr[h]   - true when hypothesis word h is part of an error
//   rerr[r]   - true when reference word r is part of an error
//   ralign[r] - hypothesis position current when reference word r was
//               consumed (-1 when a deletion occurs before any hypothesis
//               word has been consumed)
//
// The caller supplies the three output arrays. An unknown operation code is
// reported on stderr and terminates the program with exit ( -1 ).
void terCalc::FindAlignErr ( terAlignment align, bool* herr, bool* rerr, int* ralign )
{
  if ( PRINT_DEBUG ) {
    cerr << "BEGIN DEBUG : terCalc::FindAlignErr : " << endl << align.toString() << endl;
    cerr << "END DEBUG " << endl;
  }

  int hpos = -1;  // last hypothesis position consumed
  int rpos = -1;  // last reference position consumed
  const int pathLen = ( int ) align.alignment.size();
  for ( int i = 0; i < pathLen; i++ ) {
    const char sym = align.alignment[i];
    switch ( sym ) {
    case ' ':  // match: consumes one word on each side, no error
      hpos++;
      rpos++;
      herr[hpos] = false;
      rerr[rpos] = false;
      ralign[rpos] = hpos;
      break;
    case 'S':  // substitution: consumes one word on each side, both in error
      hpos++;
      rpos++;
      herr[hpos] = true;
      rerr[rpos] = true;
      ralign[rpos] = hpos;
      break;
    case 'I':  // insertion: consumes a hypothesis word only
      hpos++;
      herr[hpos] = true;
      break;
    case 'D':  // deletion: consumes a reference word only
      rpos++;
      rerr[rpos] = true;
      ralign[rpos] = hpos;
      break;
    default:
      cerr << "ERROR : terCalc::FindAlignErr : Invalid mini align sequence " << sym << " at pos " << i << endl;
      exit ( -1 );
    }
  }
}
|
|
|
|
// Enumerates the candidate shifts of the hypothesis.
//
// A candidate is a run hyp[start..end] that (a) also occurs in the reference
// according to rloc, (b) contains at least one hypothesis error and (c) has a
// reference occurrence that contains at least one reference error and whose
// aligned hypothesis position lies outside the run and within MAX_SHIFT_DIST.
// For each such occurrence one shift is produced per admissible target
// offset.
//
// @param hyp     current hypothesis words
// @param ref     reference words (not read by this function)
// @param rloc    index of reference word runs (see BuildWordMatches)
// @param align   current alignment (not read by this function)
// @param herr    per-hypothesis-word error flags
// @param rerr    per-reference-word error flags
// @param ralign  reference position -> hypothesis position
// @return MAX_SHIFT_SIZE + 1 buckets, bucket k holding the shifts with
//         end - start == k; an empty vector when shifting is disabled
vector<vecTerShift> terCalc::GatherAllPossShifts ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign )
{
  // Don't even bother to look if shifts can't be done.
  if ( ( MAX_SHIFT_SIZE <= 0 ) || ( MAX_SHIFT_DIST <= 0 ) ) {
    return vector<vecTerShift>();
  }

  vector<vecTerShift> allshifts ( MAX_SHIFT_SIZE + 1 );
  const int hypLen = ( int ) hyp.size();

  for ( int start = 0; start < hypLen; start++ ) {
    // The first word of the run must occur in the reference at all.
    string firstWordKey = vectorToString ( subVector ( hyp, start, start + 1 ) );
    if ( ! rloc.trouve ( firstWordKey ) ) {
      continue;
    }

    // At least one of its occurrences must be aligned elsewhere and lie
    // within shifting distance.
    vector<int> firstWordLocs = rloc.getValue ( firstWordKey );
    bool reachable = false;
    for ( vector<int>::iterator loc = firstWordLocs.begin(); loc != firstWordLocs.end() && ! reachable; ++loc ) {
      const int alignedAt = ralign[*loc];
      reachable = ( start != alignedAt ) && ( ( alignedAt - start ) <= MAX_SHIFT_DIST ) && ( ( start - alignedAt - 1 ) <= MAX_SHIFT_DIST );
    }
    if ( ! reachable ) {
      continue;
    }

    // Grow the run one word at a time. "ok" is cleared at the top of every
    // iteration, so any "continue" taken before it is set again ends the
    // loop through the loop condition.
    bool ok = true;
    for ( int end = start; ( ok && ( end < hypLen ) && ( end < start + MAX_SHIFT_SIZE ) ); end++ ) {
      /* check if cand is good if so, add it */
      vector<string> cand = subVector ( hyp, start, end + 1 );
      string candKey = vectorToString ( cand );
      ok = false;
      if ( ! ( rloc.trouve ( candKey ) ) ) {
        continue;
      }

      // A run without any hypothesis error is not worth moving, but a longer
      // run starting here may be.
      bool any_herr = false;
      for ( int i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) {
        if ( herr[start+i] ) {
          any_herr = true;
        }
      }
      if ( ! any_herr ) {
        ok = true;
        continue;
      }

      vector<int> movetoitVec = rloc.getValue ( candKey );
      for ( vector<int>::iterator movetoit = movetoitVec.begin(); movetoit != movetoitVec.end(); ++movetoit ) {
        const int moveto = ( *movetoit );
        const int alignedAt = ralign[moveto];

        // The occurrence must be aligned outside the run and within
        // shifting distance.
        const bool usable = ( alignedAt != start )
                            && ( ( alignedAt < start ) || ( alignedAt > end ) )
                            && ( ( alignedAt - start ) <= MAX_SHIFT_DIST )
                            && ( ( start - alignedAt ) <= MAX_SHIFT_DIST );
        if ( ! usable ) {
          continue;
        }
        ok = true;

        /* check to see if there are any errors in either string
           (only move if this is the case!)
        */
        bool any_rerr = false;
        for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) {
          if ( rerr[moveto+i] ) {
            any_rerr = true;
          }
        }
        if ( ! any_rerr ) {
          continue;
        }

        // One candidate per target offset around the reference occurrence.
        for ( int roff = -1; roff <= ( end - start ); roff++ ) {
          terShift topush;
          bool topushNull = true;

          if ( ( roff == -1 ) && ( moveto == 0 ) ) {
            // Target is in front of the first reference word.
            if ( PRINT_DEBUG ) {
              cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 01 : " << endl << "Consider making " << start << "..." << end << " moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl;
            }
            topush = terShift ( start, end, -1, -1 );
            topushNull = false;
          } else if ( ( start != ralign[moveto+roff] ) && ( ( roff == 0 ) || ( ralign[moveto+roff] != ralign[moveto] ) ) ) {
            const int newloc = ralign[moveto+roff];
            if ( PRINT_DEBUG ) {
              cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 02 : " << endl << "Consider making " << start << "..." << end << " moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl;
            }
            topush = terShift ( start, end, moveto + roff, newloc );
            topushNull = false;
          }

          if ( topushNull ) {
            continue;
          }
          topush.shifted = cand;
          topush.cost = shift_cost;
          if ( PRINT_DEBUG ) {
            cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 02 : " << endl;
            cerr << "start : " << start << endl;
            cerr << "end : " << end << endl;
            cerr << "end - start : " << end - start << endl;
            cerr << "END DEBUG " << endl;
          }
          ( allshifts.at ( end - start ) ).push_back ( topush );
        }
      }
    }
  }

  return allshifts;
}
|
|
|
|
|
|
// Convenience overload: applies the shift described by s (its start, end and
// newloc fields) to words by delegating to the four-argument PerformShift.
alignmentStruct terCalc::PerformShift ( vector<string> words, terShift s )
{
  const int shiftStart = s.start;
  const int shiftEnd = s.end;
  const int shiftTarget = s.newloc;
  return PerformShift ( words, shiftStart, shiftEnd, shiftTarget );
}
|
|
|
|
|
|
alignmentStruct terCalc::PerformShift ( vector<string> words, int start, int end, int newloc )
|
|
{
|
|
int c = 0;
|
|
vector<string> nwords ( words );
|
|
vector<vecInt> spans ( ( int ) hypSpans.size() );
|
|
alignmentStruct toreturn;
|
|
// ON EST ICI
|
|
// if((int)hypSpans.size()>0) spans = new TERintpair[(int)hypSpans.size()];
|
|
// if(DEBUG) {
|
|
if ( PRINT_DEBUG ) {
|
|
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
cerr << "BEGIN DEBUG : terCalc::PerformShift :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl << "END DEBUG " << endl;
|
|
} else {
|
|
cerr << "BEGIN DEBUG : terCalc::PerformShift :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl << "END DEBUG " << endl;
|
|
}
|
|
}
|
|
// }
|
|
|
|
if ( newloc == -1 ) {
|
|
for ( int i = start; i <= end; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = 0; i <= start - 1; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
} else {
|
|
if ( newloc < start ) {
|
|
for ( int i = 0; i <= newloc; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = start; i <= end; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = newloc + 1; i <= start - 1; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
} else {
|
|
if ( newloc > end ) {
|
|
for ( int i = 0; i <= start - 1; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = end + 1; i <= newloc; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = start; i <= end; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = newloc + 1; i < ( int ) words.size(); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
} else {
|
|
// we are moving inside of ourselves
|
|
for ( int i = 0; i <= start - 1; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = start; i <= end; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size(); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
NUM_SHIFTS_CONSIDERED++;
|
|
|
|
toreturn.nwords = nwords;
|
|
toreturn.aftershift = spans;
|
|
return toreturn;
|
|
}
|
|
void terCalc::setDebugMode ( bool b )
|
|
{
|
|
PRINT_DEBUG = b;
|
|
}
|
|
|
|
}
|