mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 22:45:50 +03:00
3ef02eb7e6
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4146 1f5c12ca-751b-0410-a591-d2e778427230
1036 lines
30 KiB
C++
1036 lines
30 KiB
C++
//
|
|
// C++ Implementation: tercalc
|
|
//
|
|
// Description:
|
|
//
|
|
//
|
|
// Author: <>, (C) 2010
|
|
//
|
|
// Copyright: See COPYING file that comes with this distribution
|
|
//
|
|
//
|
|
#include "tercalc.h"
|
|
using namespace std;
|
|
using namespace Tools;
|
|
namespace TERCpp
|
|
{
|
|
|
|
/**
 * Builds a TER calculator with its default configuration: unit cost for
 * shift, insert, delete and substitute, zero cost for a match, shift size
 * and shift distance limits of 50, a beam width of 20, all statistics
 * counters at zero and debug tracing switched off.
 */
terCalc::terCalc()
{
  // Cost charged for each kind of edit operation.
  match_cost = 0.0;
  substitute_cost = 1.0;
  delete_cost = 1.0;
  insert_cost = 1.0;
  shift_cost = 1.0;

  // Search limits; INF is the "no score yet" sentinel used by the beam search.
  INF = 999999.0;
  BEAM_WIDTH = 20;
  MAX_SHIFT_SIZE = 50;
  MAX_SHIFT_DIST = 50;

  // Statistics counters, incremented by the scoring methods.
  NUM_BEAM_SEARCH_CALLS = 0;
  NUM_SHIFTS_CONSIDERED = 0;
  NUM_SEGMENTS_SCORED = 0;

  // Debug tracing on std::cerr is off until setDebugMode ( true ) is called.
  PRINT_DEBUG = false;
}
|
|
|
|
|
|
// terCalc::~terCalc()
|
|
// {
|
|
// }
|
|
// size_t* terCalc::hashVec ( vector<string> s )
|
|
// {
|
|
// size_t retour[ ( int ) s.size() ];
|
|
// int i=0;
|
|
// for ( i=0; i< ( int ) s.size(); i++ )
|
|
// {
|
|
// boost::hash<std::string> hasher;
|
|
// retour[i]=hasher ( s.at ( i ) );
|
|
// }
|
|
// return retour;
|
|
// }
|
|
|
|
|
|
int terCalc::WERCalculation ( size_t * ref, size_t * hyp )
|
|
{
|
|
int retour;
|
|
int REFSize = sizeof ( ref ) + 1;
|
|
int HYPSize = sizeof ( hyp ) + 1;
|
|
int WER[REFSize][HYPSize];
|
|
int i = 0;
|
|
int j = 0;
|
|
for ( i = 0; i < REFSize; i++ ) {
|
|
WER[i][0] = ( int ) i;
|
|
}
|
|
for ( j = 0; j < HYPSize; j++ ) {
|
|
WER[0][j] = ( int ) j;
|
|
}
|
|
for ( j = 1; j < HYPSize; j++ ) {
|
|
for ( i = 1; i < REFSize; i++ ) {
|
|
if ( i == 1 ) {
|
|
cerr << endl;
|
|
}
|
|
if ( ref[i-1] == hyp[j-1] ) {
|
|
WER[i][j] = WER[i-1][j-1];
|
|
cerr << "- ";
|
|
cerr << WER[i][j] << "-\t";
|
|
} else {
|
|
if ( ( ( WER[i-1][ j] + 1 ) < ( WER[i][j-1] + 1 ) ) && ( ( WER[i-1][j] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
|
|
WER[i][j] = ( WER[i-1][j] + 1 );
|
|
// cerr << "D ";
|
|
cerr << WER[i][j] << "D\t";
|
|
} else {
|
|
if ( ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j] + 1 ) ) && ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
|
|
WER[i][j] = ( WER[i][j-1] + 1 );
|
|
// cerr << "I ";
|
|
cerr << WER[i][j] << "I\t";
|
|
} else {
|
|
WER[i][j] = ( WER[i-1][j-1] + 1 );
|
|
// cerr << "S ";
|
|
cerr << WER[i][j] << "S\t";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
cerr << endl;
|
|
retour = WER[i-1][j-1];
|
|
cerr << "i : " << i - 1 << "\tj : " << j - 1 << endl;
|
|
return retour;
|
|
}
|
|
int terCalc::WERCalculation ( std::vector< int > ref, std::vector< int > hyp )
|
|
{
|
|
stringstream s;
|
|
s.str ( "" );
|
|
string stringRef ( "" );
|
|
string stringHyp ( "" );
|
|
for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) {
|
|
if ( l_it == ref.begin() ) {
|
|
s << ( *l_it );
|
|
} else {
|
|
s << " " << ( *l_it );
|
|
}
|
|
}
|
|
stringRef = s.str();
|
|
s.str ( "" );
|
|
for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) {
|
|
if ( l_itHyp == hyp.begin() ) {
|
|
s << ( *l_itHyp );
|
|
} else {
|
|
s << " " << ( *l_itHyp );
|
|
}
|
|
}
|
|
stringHyp = s.str();
|
|
s.str ( "" );
|
|
return WERCalculation ( stringToVector ( stringRef, " " ), stringToVector ( stringHyp , " " ) );
|
|
}
|
|
|
|
/**
 * TER between two sequences of integer token ids.
 *
 * Each sequence is written out as one space-separated string, split back
 * into string tokens with stringToVector, and passed to the
 * vector<string> overload.
 *
 * NOTE(review): the tokens built from `ref` are passed first and those built
 * from `hyp` second, while the vector<string> overload declares its
 * parameters as ( hyp, ref ). The roles are therefore swapped relative to
 * this function's own parameter names. Existing callers may depend on this
 * order, so it is kept as is - confirm against the call sites before
 * changing it.
 *
 * @param hyp token ids named "hyp" by this signature
 * @param ref token ids named "ref" by this signature
 * @return    the alignment produced by the vector<string> overload
 */
terAlignment terCalc::TER ( std::vector< int > hyp, std::vector< int > ref )
{
  stringstream refBuffer;
  for ( size_t pos = 0; pos < ref.size(); ++pos ) {
    if ( pos != 0 ) {
      refBuffer << " ";
    }
    refBuffer << ref[pos];
  }
  string stringRef = refBuffer.str();

  stringstream hypBuffer;
  for ( size_t pos = 0; pos < hyp.size(); ++pos ) {
    if ( pos != 0 ) {
      hypBuffer << " ";
    }
    hypBuffer << hyp[pos];
  }
  string stringHyp = hypBuffer.str();

  return TER ( stringToVector ( stringRef , " " ), stringToVector ( stringHyp , " " ) );
}
|
|
|
|
int terCalc::WERCalculation ( vector<string> ref, vector<string> hyp )
|
|
{
|
|
int retour;
|
|
int REFSize = ( int ) ref.size() + 1;
|
|
int HYPSize = ( int ) hyp.size() + 1;
|
|
int WER[REFSize][HYPSize];
|
|
char WERchar[REFSize][HYPSize];
|
|
int i = 0;
|
|
int j = 0;
|
|
for ( i = 0; i < REFSize; i++ ) {
|
|
WER[i][0] = ( int ) i;
|
|
}
|
|
for ( j = 0; j < HYPSize; j++ ) {
|
|
WER[0][j] = ( int ) j;
|
|
}
|
|
for ( j = 1; j < HYPSize; j++ ) {
|
|
for ( i = 1; i < REFSize; i++ ) {
|
|
// if (i==1)
|
|
// {
|
|
// cerr << endl;
|
|
// }
|
|
if ( ref[i-1] == hyp[j-1] ) {
|
|
WER[i][j] = WER[i-1][j-1];
|
|
// cerr << "- ";
|
|
// cerr << WER[i][j]<< "-\t";
|
|
WERchar[i][j] = '-';
|
|
} else {
|
|
if ( ( ( WER[i-1][ j] + 1 ) < ( WER[i][j-1] + 1 ) ) && ( ( WER[i-1][j] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
|
|
WER[i][j] = ( WER[i-1][j] + 1 );
|
|
// cerr << "D ";
|
|
// cerr << WER[i][j]<< "D\t";
|
|
WERchar[i][j] = 'D';
|
|
} else {
|
|
if ( ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j] + 1 ) ) && ( ( WER[i][j-1] + 1 ) < ( WER[i-1][j-1] + 1 ) ) ) {
|
|
WER[i][j] = ( WER[i][j-1] + 1 );
|
|
// cerr << "I ";
|
|
// cerr << WER[i][j]<< "I\t";
|
|
WERchar[i][j] = 'I';
|
|
} else {
|
|
WER[i][j] = ( WER[i-1][j-1] + 1 );
|
|
// cerr << "S ";
|
|
// cerr << WER[i][j]<< "S\t";
|
|
WERchar[i][j] = 'S';
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
cerr << endl;
|
|
retour = WER[REFSize-1][HYPSize-1];
|
|
cerr << "i : " << i - 1 << "\tj : " << j - 1 << endl;
|
|
j = HYPSize - 1;
|
|
i = REFSize - 1;
|
|
int k;
|
|
stringstream s;
|
|
// WERalignment local[HYPSize];
|
|
if ( HYPSize > REFSize ) {
|
|
k = HYPSize;
|
|
} else {
|
|
k = REFSize;
|
|
}
|
|
WERalignment local;
|
|
while ( j > 0 && i > 0 ) {
|
|
cerr << "indice i : " << i << "\t";
|
|
cerr << "indice j : " << j << endl;
|
|
if ( ( j == HYPSize - 1 ) && ( i == REFSize - 1 ) ) {
|
|
alignmentElement localInfos;
|
|
s << WER[i][j];
|
|
localInfos.push_back ( s.str() );
|
|
s.str ( "" );
|
|
s << WERchar[i][j];
|
|
localInfos.push_back ( s.str() );
|
|
s.str ( "" );
|
|
local.push_back ( localInfos );
|
|
// // i--;
|
|
// j--;
|
|
}
|
|
// else
|
|
{
|
|
if ( ( ( WER[i-1][j-1] ) <= ( WER[i-1][j] ) ) && ( ( WER[i-1][j-1] ) <= ( WER[i][j-1] ) ) ) {
|
|
alignmentElement localInfos;
|
|
s << WER[i-1][j-1];
|
|
localInfos.push_back ( s.str() );
|
|
s.str ( "" );
|
|
s << WERchar[i-1][j-1];
|
|
localInfos.push_back ( s.str() );
|
|
s.str ( "" );
|
|
local.push_back ( localInfos );
|
|
i--;
|
|
j--;
|
|
} else {
|
|
if ( ( ( WER[i][j-1] ) <= ( WER[i-1][j] ) ) && ( ( WER[i][j-1] ) <= ( WER[i-1][j-1] ) ) ) {
|
|
alignmentElement localInfos;
|
|
s << WER[i][j-1];
|
|
localInfos.push_back ( s.str() );
|
|
s.str ( "" );
|
|
s << WERchar[i][j-1];
|
|
localInfos.push_back ( s.str() );
|
|
s.str ( "" );
|
|
local.push_back ( localInfos );
|
|
j--;
|
|
} else {
|
|
alignmentElement localInfos;
|
|
s << WER[i-1][j];
|
|
localInfos.push_back ( s.str() );
|
|
s.str ( "" );
|
|
s << WERchar[i-1][j];
|
|
localInfos.push_back ( s.str() );
|
|
s.str ( "" );
|
|
local.push_back ( localInfos );
|
|
i--;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for ( j = 1; j < HYPSize; j++ ) {
|
|
for ( i = 1; i < REFSize; i++ ) {
|
|
cerr << WERchar[i][j] << " ";
|
|
}
|
|
cerr << endl;
|
|
}
|
|
cerr << endl;
|
|
for ( j = 1; j < HYPSize; j++ ) {
|
|
for ( i = 1; i < REFSize; i++ ) {
|
|
cerr << WER[i][j] << " ";
|
|
}
|
|
cerr << endl;
|
|
}
|
|
|
|
cerr << "=================" << endl;
|
|
// k=local.size()-1;
|
|
// while (k>0)
|
|
// {
|
|
// alignmentElement localInfos;
|
|
// localInfos=local.at(k-1);
|
|
|
|
// l_WERalignment.push_back(localInfos);
|
|
// cerr << (string)localInfos.at(1)+"\t";
|
|
k--;
|
|
// }
|
|
// cerr<<endl;
|
|
k = local.size() - 1;
|
|
int l = 0;
|
|
int m = 0;
|
|
while ( k > 0 ) {
|
|
alignmentElement localInfos;
|
|
localInfos = local.at ( k - 1 );
|
|
if ( ( int ) ( localInfos.at ( 1 ).compare ( "D" ) ) == 0 || l > HYPSize - 1 ) {
|
|
localInfos.push_back ( "***" );
|
|
} else {
|
|
localInfos.push_back ( hyp.at ( l ) );
|
|
l++;
|
|
}
|
|
if ( ( int ) ( localInfos.at ( 1 ).compare ( "I" ) ) == 0 || m > REFSize - 1 ) {
|
|
localInfos.push_back ( "***" );
|
|
} else {
|
|
localInfos.push_back ( ref.at ( m ) );
|
|
m++;
|
|
}
|
|
// cerr << vectorToString(localInfos)<<endl;
|
|
// cerr <<localInfos.at(0)<<"\t"<<localInfos.at(1)<<"\t"<<localInfos.at(2)<<"\t"<<localInfos.at(3)<<endl;
|
|
l_WERalignment.push_back ( localInfos );
|
|
// cerr << (string)localInfos.at(1)+"\t";
|
|
k--;
|
|
}
|
|
cerr << endl;
|
|
/* k=local.size()-1;
|
|
while (k>0)
|
|
{
|
|
alignmentElement localInfos;
|
|
localInfos=local.at(k-1);
|
|
// l_WERalignment.push_back(localInfos);
|
|
cerr << (string)localInfos.at(0)+"\t";
|
|
k--;
|
|
}
|
|
cerr<<endl;*/
|
|
k = 0;
|
|
// k=l_WERalignment.size()-1;
|
|
m = 0;
|
|
while ( k < ( int ) l_WERalignment.size() ) {
|
|
alignmentElement localInfos;
|
|
localInfos = l_WERalignment.at ( k );
|
|
cerr << localInfos.at ( 0 ) << "\t" << localInfos.at ( 1 ) << "\t" << localInfos.at ( 2 ) << "\t" << localInfos.at ( 3 ) << endl;
|
|
/* if ((int)(localInfos.at(1).compare("I"))==0)
|
|
{
|
|
cerr << "***\t";
|
|
}
|
|
else
|
|
{
|
|
// if (m<ref.size())
|
|
{
|
|
cerr << ref.at(m) << "\t";
|
|
}
|
|
m++;
|
|
}
|
|
*/
|
|
k++;
|
|
}
|
|
cerr << endl;
|
|
/* k=local.size()-1;
|
|
l=0;
|
|
while (k>0)
|
|
{
|
|
alignmentElement localInfos;
|
|
localInfos=local.at(k-1);
|
|
// l_WERalignment.push_back(localInfos);
|
|
if ((int)(localInfos.at(1).compare("D"))==0)
|
|
{
|
|
cerr << "***\t";
|
|
}
|
|
else
|
|
{
|
|
cerr << hyp.at(l) << "\t";
|
|
l++;
|
|
}
|
|
k--;
|
|
}
|
|
cerr<<endl;*/
|
|
cerr << "=================" << endl;
|
|
return retour;
|
|
}
|
|
|
|
// string terCalc::vectorToString(vector<string> vec)
|
|
// {
|
|
// string retour("");
|
|
// for (vector<string>::iterator vecIter=vec.begin();vecIter!=vec.end(); vecIter++)
|
|
// {
|
|
// retour+=(*vecIter)+"\t";
|
|
// }
|
|
// return retour;
|
|
// }
|
|
|
|
// vector<string> terCalc::subVector(vector<string> vec, int start, int end)
|
|
// {
|
|
// if (start>end)
|
|
// {
|
|
// cerr << "ERREUR : terCalc::subVector : end > start"<<endl;
|
|
// exit(0);
|
|
// }
|
|
// vector<string> retour;
|
|
// for (int i=start; ((i<end) && (i< vec.size())); i++)
|
|
// {
|
|
// retour.push_back(vec.at(i));
|
|
// }
|
|
// return retour;
|
|
// }
|
|
|
|
/**
 * Indexes every run of reference words that could be the target of a shift.
 *
 * A reference position is "covered" when its word also occurs somewhere in
 * the hypothesis. For every start position and every run of consecutive
 * covered words beginning there (run length limited through MAX_SHIFT_SIZE),
 * the run is turned into a key with vectorToString and the start position
 * is appended to the list of positions stored under that key.
 *
 * The coverage flags are kept in a std::vector rather than a
 * variable-length array, which is not ISO C++ and had length zero for an
 * empty reference.
 *
 * @param hyp hypothesis tokens
 * @param ref reference tokens
 * @return    map from word-run key to the reference start positions of that run
 */
hashMapInfos terCalc::BuildWordMatches ( vector<string> hyp, vector<string> ref )
{
  hashMap tempHash;
  hashMapInfos retour;

  // Collect the hypothesis vocabulary.
  for ( int i = 0; i < ( int ) hyp.size(); i++ ) {
    tempHash.addHasher ( hyp.at ( i ), "" );
  }

  // cor[i] is true when reference word i occurs in the hypothesis.
  const int refCount = ( int ) ref.size();
  vector<bool> cor ( ref.size(), false );
  for ( int i = 0; i < refCount; i++ ) {
    if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) {
      cor[i] = true;
    }
  }

  for ( int start = 0; start < refCount; start++ ) {
    if ( !cor[start] ) {
      continue;
    }
    for ( int end = start; ( ( end < refCount ) && ( end - start <= MAX_SHIFT_SIZE ) && ( cor[end] ) ); end++ ) {
      vector<string> ajouter = subVector ( ref, start, end + 1 );
      string ajouterString = vectorToString ( ajouter );
      vector<int> values = retour.getValue ( ajouterString );
      values.push_back ( start );
      // More than one position means getValue returned earlier positions:
      // overwrite the stored list. Otherwise store the list with addValue.
      if ( values.size() > 1 ) {
        retour.setValue ( ajouterString, values );
      } else {
        retour.addValue ( ajouterString, values );
      }
    }
  }
  return retour;
}
|
|
|
|
/**
 * Tells whether two spans overlap. Element 0 of each span is compared as
 * its lower bound and element 1 as its upper bound, both inclusive.
 *
 * @param refSpan span with at least two elements
 * @param hypSpan span with at least two elements
 * @return true when refSpan[1] >= hypSpan[0] and refSpan[0] <= hypSpan[1]
 */
bool terCalc::spanIntersection ( vecInt refSpan, vecInt hypSpan )
{
  return ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) );
}
|
|
|
|
|
|
/**
 * Beam-limited minimum edit distance between a hypothesis and a reference.
 *
 * Uses the member tables S (best cost per cell, -1.0 meaning "not reached")
 * and P (operation that produced the cell: ' ' match, 'S' substitution,
 * 'D' deletion, 'I' insertion, '0' unset). The tables are indexed
 * [reference position][hypothesis position].
 *
 * The hypothesis positions are processed one column at a time. Within a
 * column only the rows between the first and last "good" row of the
 * previous step are visited, and (except in the final column) cells whose
 * score exceeds the previous column's best diagonal cost by more than
 * BEAM_WIDTH are skipped. When both refSpans and hypSpans are non-empty, a
 * match or substitution is considered only if the two spans intersect.
 *
 * The operation sequence is then recovered by walking P back from the
 * bottom-right cell; an unset or unknown operation aborts the process with
 * exit ( -1 ).
 *
 * NOTE(review): S and P are not sized here - presumably they are allocated
 * elsewhere large enough for ref.size() + 1 by hyp.size() + 1; confirm.
 *
 * @param hyp         hypothesis tokens (possibly already shifted)
 * @param ref         reference tokens
 * @param curHypSpans spans associated with the hypothesis positions
 * @return alignment with numWords = ref.size(), the operation path and
 *         numEdits = cost of the bottom-right cell
 */
terAlignment terCalc::MinEditDist ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans )
{
  const int refLen = ( int ) ref.size();
  const int hypLen = ( int ) hyp.size();

  double current_best = INF;
  double last_best = INF;
  int first_good = 0;
  int current_first_good = 0;
  int last_good = -1;
  int cur_last_good = 0;
  // The two "peak" trackers are maintained but never consulted below.
  int last_peak = 0;
  int cur_last_peak = 0;
  int i, j;

  NUM_BEAM_SEARCH_CALLS++;

  // Reset the part of the tables this call is going to use.
  for ( i = 0; i <= refLen; i++ ) {
    for ( j = 0; j <= hypLen; j++ ) {
      S[i][j] = -1.0;
      P[i][j] = '0';
    }
  }
  S[0][0] = 0.0;

  for ( j = 0; j <= hypLen; j++ ) {
    // Roll the per-column bookkeeping over from the previous column.
    last_best = current_best;
    current_best = INF;
    first_good = current_first_good;
    current_first_good = -1;
    last_good = cur_last_good;
    cur_last_good = -1;
    last_peak = cur_last_peak;
    cur_last_peak = 0;

    for ( i = first_good; i <= refLen; i++ ) {
      if ( i > last_good ) {
        break;
      }
      if ( S[i][j] < 0 ) {
        continue; // cell never reached
      }
      const double score = S[i][j];
      if ( ( j < hypLen ) && ( score > last_best + BEAM_WIDTH ) ) {
        continue; // outside the beam
      }
      if ( current_first_good == -1 ) {
        current_first_good = i;
      }

      // Diagonal move: match or substitution.
      if ( ( i < refLen ) && ( j < hypLen ) ) {
        if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || spanIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) {
          if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) {
            const double cost = match_cost + score;
            if ( ( S[i+1][j+1] == -1 ) || ( cost < S[i+1][j+1] ) ) {
              S[i+1][j+1] = cost;
              P[i+1][j+1] = ' ';
            }
            if ( cost < current_best ) {
              current_best = cost;
            }
            if ( current_best == cost ) {
              cur_last_peak = i + 1;
            }
          } else {
            const double cost = substitute_cost + score;
            if ( ( S[i+1][j+1] < 0 ) || ( cost < S[i+1][j+1] ) ) {
              S[i+1][j+1] = cost;
              P[i+1][j+1] = 'S';
              if ( cost < current_best ) {
                current_best = cost;
              }
              if ( current_best == cost ) {
                cur_last_peak = i + 1 ;
              }
            }
          }
        }
      }

      cur_last_good = i + 1;

      // Horizontal move: insertion (consumes a hypothesis word only).
      if ( j < hypLen ) {
        const double icost = score + insert_cost;
        if ( ( S[i][j+1] < 0 ) || ( S[i][j+1] > icost ) ) {
          S[i][j+1] = icost;
          P[i][j+1] = 'I';
          if ( ( cur_last_peak < i ) && ( current_best == icost ) ) {
            cur_last_peak = i;
          }
        }
      }

      // Vertical move: deletion (consumes a reference word only). It stays
      // in the current column, so the row limit is extended when needed.
      if ( i < refLen ) {
        const double dcost = score + delete_cost;
        if ( ( S[ i+1][ j] < 0.0 ) || ( S[i+1][j] > dcost ) ) {
          S[i+1][j] = dcost;
          P[i+1][j] = 'D';
          if ( i >= last_good ) {
            last_good = i + 1 ;
          }
        }
      }
    }
  }

  // Walk back from the bottom-right cell, collecting operations last-to-first.
  vector<char> reversedOps;
  i = refLen;
  j = hypLen;
  while ( ( i > 0 ) || ( j > 0 ) ) {
    const char op = P[i][j];
    switch ( op ) {
    case ' ':
    case 'S':
      i--;
      j--;
      break;
    case 'D':
      i--;
      break;
    case 'I':
      j--;
      break;
    default:
      cerr << "ERROR : terCalc::MinEditDist : Invalid path : " << op << endl;
      exit ( -1 );
    }
    reversedOps.push_back ( op );
  }
  vector<char> path ( reversedOps.rbegin(), reversedOps.rend() );

  terAlignment to_return;
  to_return.numWords = ref.size();
  to_return.alignment = path;
  to_return.numEdits = S[ref.size() ][hyp.size() ];
  if ( PRINT_DEBUG ) {
    cerr << "BEGIN DEBUG : terCalc::MinEditDist : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl;
  }
  return to_return;
}
|
|
/**
 * Computes the TER alignment of a hypothesis against a reference.
 *
 * Starts from the plain minimum edit distance alignment, then repeatedly
 * asks CalcBestShift for a shift to apply to the current hypothesis until it
 * reports none. The cost of every applied shift is added to the edit count
 * of the final alignment, and the applied shifts are returned in allshifts.
 * NUM_SEGMENTS_SCORED is incremented once per call.
 *
 * @param hyp hypothesis tokens
 * @param ref reference tokens
 * @return    final alignment including shifts and total number of edits
 */
terAlignment terCalc::TER ( vector<string> hyp, vector<string> ref )
{
  hashMapInfos rloc = BuildWordMatches ( hyp, ref );

  terAlignment cur_align = MinEditDist ( hyp, ref, hypSpans );
  cur_align.hyp = hyp;
  cur_align.ref = ref;
  cur_align.aftershift = hyp;

  vector<string> cur = hyp;     // hypothesis after the shifts applied so far
  vector<terShift> allshifts;   // shifts applied so far, in order
  double edits = 0;             // accumulated cost of those shifts

  if ( PRINT_DEBUG ) {
    cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl;
  }

  for ( ;; ) {
    bestShiftStruct returns;
    returns = CalcBestShift ( cur, hyp, ref, rloc, cur_align );
    if ( returns.m_empty ) {
      break; // no shift selected
    }
    terShift bestShift = returns.m_best_shift;
    cur_align = returns.m_best_align;
    edits += bestShift.cost;
    bestShift.alignment = cur_align.alignment;
    bestShift.aftershift = cur_align.aftershift;
    allshifts.push_back ( bestShift );
    cur = cur_align.aftershift;
  }

  terAlignment to_return;
  to_return = cur_align;
  to_return.allshifts = allshifts;
  to_return.numEdits += edits;
  NUM_SEGMENTS_SCORED++;
  return to_return;
}
|
|
/**
 * Searches for the single shift of the current hypothesis that gives the
 * best gain in ( edits + shift cost ) over the alignment med_align.
 *
 * Candidate shifts come from GatherAllPossShifts, grouped by length, and are
 * tried from the longest group to the shortest. Each candidate is applied
 * with PerformShift and re-aligned with MinEditDist. The search stops early
 * once the gain already obtained exceeds what a shift of the current length
 * could bring ( 2 * ( length index + 1 ) ), or equals it while a shift has
 * already been chosen.
 *
 * NOTE(review): herr, rerr and ralign are variable-length arrays, a compiler
 * extension rather than ISO C++. They are kept because FindAlignErr and
 * GatherAllPossShifts receive them as raw bool* / int*.
 *
 * @param cur       hypothesis after the shifts applied so far
 * @param hyp       original hypothesis (stored in the candidate alignments)
 * @param ref       reference tokens
 * @param rloc      word-run index built by BuildWordMatches
 * @param med_align current alignment of cur against ref
 * @return m_empty = true when no shift was selected, otherwise the selected
 *         shift and the alignment obtained after applying it
 */
bestShiftStruct terCalc::CalcBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment med_align )
{
  bestShiftStruct to_return;
  bool anygain = false;

  // Error flags per hypothesis / reference position and, per reference
  // position, the hypothesis position it is aligned to.
  bool herr[ ( int ) hyp.size() ];
  bool rerr[ ( int ) ref.size() ];
  int ralign[ ( int ) ref.size() ];
  FindAlignErr ( med_align, herr, rerr, ralign );

  vector<vecTerShift> poss_shifts;
  poss_shifts = GatherAllPossShifts ( cur, ref, rloc, med_align, herr, rerr, ralign );
  const double curerr = med_align.numEdits;

  if ( PRINT_DEBUG ) {
    cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
    cerr << "Possible Shifts:" << endl;
    for ( int len = ( int ) poss_shifts.size() - 1; len >= 0; len-- ) {
      for ( int idx = 0; idx < ( int ) ( poss_shifts.at ( len ) ).size(); idx++ ) {
        cerr << " [" << len << "] " << ( ( poss_shifts.at ( len ) ).at ( idx ) ).toString() << endl;
      }
    }
    cerr << endl;
    cerr << "END DEBUG " << endl;
  }

  double cur_best_shift_cost = 0.0;
  terAlignment cur_best_align = med_align;
  terShift cur_best_shift;

  // Group index i holds the shifts of length i + 1; longest groups first.
  for ( int i = ( int ) poss_shifts.size() - 1; i >= 0; i-- ) {
    if ( PRINT_DEBUG ) {
      cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
      cerr << "Considering shift of length " << i << " (" << ( poss_shifts.at ( i ) ).size() << ")" << endl;
      cerr << "END DEBUG " << endl;
    }

    double curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits );
    const double maxfix = ( 2 * ( 1 + i ) );
    if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
      break;
    }

    for ( int s = 0; s < ( int ) ( poss_shifts.at ( i ) ).size(); s++ ) {
      // Re-evaluate the bound: the best alignment may have improved.
      curfix = curerr - ( cur_best_shift_cost + cur_best_align.numEdits );
      if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
        break;
      }

      terShift curshift = ( poss_shifts.at ( i ) ).at ( s );
      alignmentStruct shiftReturns = PerformShift ( cur, curshift );
      vector<string> shiftarr = shiftReturns.nwords;
      vector<vecInt> curHypSpans = shiftReturns.aftershift;

      terAlignment curalign = MinEditDist ( shiftarr, ref, curHypSpans );
      curalign.hyp = hyp;
      curalign.ref = ref;
      curalign.aftershift = shiftarr;

      const double gain = ( cur_best_align.numEdits + cur_best_shift_cost ) - ( curalign.numEdits + curshift.cost );

      if ( PRINT_DEBUG ) {
        cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
        cerr << "Gain for " << curshift.toString() << " is " << gain << ". (result: [" << curalign.join ( " ", shiftarr ) << "]" << endl;
        cerr << "" << curalign.toString() << "\n" << endl;
        cerr << "END DEBUG " << endl;
      }

      // Accept a strict improvement, or a zero-gain shift while nothing has
      // been chosen yet.
      if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) {
        anygain = true;
        cur_best_shift = curshift;
        cur_best_shift_cost = curshift.cost;
        cur_best_align = curalign;
        if ( PRINT_DEBUG ) {
          cerr << "BEGIN DEBUG : terCalc::CalcBestShift :" << endl;
          cerr << "Tmp Choosing shift: " << cur_best_shift.toString() << " gives:\n" << cur_best_align.toString() << "\n" << endl;
          cerr << "END DEBUG " << endl;
        }
      }
    }
  }

  if ( !anygain ) {
    to_return.m_empty = true;
    return to_return;
  }
  to_return.m_best_shift = cur_best_shift;
  to_return.m_best_align = cur_best_align;
  to_return.m_empty = false;
  return to_return;
}
|
|
|
|
/**
 * Expands an alignment path into per-position information.
 *
 * Walks align.alignment and, for every operation, advances the hypothesis
 * and/or reference cursor and fills the output arrays:
 *   ' ' match        : both advance, both positions flagged error-free
 *   'S' substitution : both advance, both positions flagged as errors
 *   'I' insertion    : hypothesis advances, its position flagged as error
 *   'D' deletion     : reference advances, its position flagged as error
 * Whenever the reference cursor advances, ralign at that position receives
 * the current hypothesis cursor (which is -1 before the first hypothesis
 * word has been consumed).
 *
 * Any other symbol prints an error and terminates with exit ( -1 ).
 *
 * @param align  alignment whose operation path is expanded
 * @param herr   out: error flag per hypothesis position
 * @param rerr   out: error flag per reference position
 * @param ralign out: hypothesis position aligned to each reference position
 */
void terCalc::FindAlignErr ( terAlignment align, bool* herr, bool* rerr, int* ralign )
{
  if ( PRINT_DEBUG ) {
    cerr << "BEGIN DEBUG : terCalc::FindAlignErr : " << endl << align.toString() << endl;
    cerr << "END DEBUG " << endl;
  }

  int hpos = -1;
  int rpos = -1;
  const int steps = ( int ) align.alignment.size();
  for ( int i = 0; i < steps; i++ ) {
    const char sym = align.alignment[i];
    switch ( sym ) {
    case ' ':
    case 'S': {
      const bool isError = ( sym == 'S' );
      hpos++;
      rpos++;
      herr[hpos] = isError;
      rerr[rpos] = isError;
      ralign[rpos] = hpos;
      break;
    }
    case 'I':
      hpos++;
      herr[hpos] = true;
      break;
    case 'D':
      rpos++;
      rerr[rpos] = true;
      ralign[rpos] = hpos;
      break;
    default:
      cerr << "ERROR : terCalc::FindAlignErr : Invalid mini align sequence " << sym << " at pos " << i << endl;
      exit ( -1 );
    }
  }
}
|
|
|
|
/**
 * Enumerates every shift worth trying on the current hypothesis.
 *
 * A candidate is a run hyp[start..end] (at most MAX_SHIFT_SIZE words) whose
 * word sequence also occurs in the reference according to rloc, which
 * contains at least one hypothesis-side error, and for which at least one
 * reference occurrence lies within MAX_SHIFT_DIST, is currently aligned
 * outside the run and contains at least one reference-side error. For each
 * such occurrence a shift is produced for every distinct landing position
 * next to or inside the occurrence.
 *
 * The parameters `ref` and `align` are not used by this function.
 *
 * @param hyp    current hypothesis tokens
 * @param ref    reference tokens (unused)
 * @param rloc   word-run index built by BuildWordMatches
 * @param align  current alignment (unused)
 * @param herr   error flag per hypothesis position
 * @param rerr   error flag per reference position
 * @param ralign hypothesis position aligned to each reference position
 * @return MAX_SHIFT_SIZE + 1 groups; group n holds the shifts with
 *         end - start == n. Empty when shifting is disabled
 *         (MAX_SHIFT_SIZE <= 0 or MAX_SHIFT_DIST <= 0).
 */
vector<vecTerShift> terCalc::GatherAllPossShifts ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign )
{
  // Don't even bother to look if shifts can't be done.
  if ( ( MAX_SHIFT_SIZE <= 0 ) || ( MAX_SHIFT_DIST <= 0 ) ) {
    return vector<vecTerShift>();
  }

  vector<vecTerShift> allshifts ( MAX_SHIFT_SIZE + 1 );
  const int hypLen = ( int ) hyp.size();

  for ( int start = 0; start < hypLen; start++ ) {
    // Quick reject: the first word alone must occur in the reference...
    string firstWordKey = vectorToString ( subVector ( hyp, start, start + 1 ) );
    if ( ! rloc.trouve ( firstWordKey ) ) {
      continue;
    }

    // ...at a position that is aligned elsewhere and close enough.
    bool ok = false;
    vector<int> mtiVec = rloc.getValue ( firstWordKey );
    for ( vector<int>::iterator mti = mtiVec.begin(); mti != mtiVec.end() && ( ! ok ); ++mti ) {
      const int moveto = ( *mti );
      if ( ( start != ralign[moveto] ) && ( ( ralign[moveto] - start ) <= MAX_SHIFT_DIST ) && ( ( start - ralign[moveto] - 1 ) <= MAX_SHIFT_DIST ) ) {
        ok = true;
      }
    }
    if ( ! ok ) {
      continue;
    }

    // Grow the run one word at a time. `ok` is cleared at the top of every
    // iteration, so any `continue` that has not set it again ends the loop.
    ok = true;
    for ( int end = start; ( ok && ( end < hypLen ) && ( end < start + MAX_SHIFT_SIZE ) ); end++ ) {
      vector<string> cand = subVector ( hyp, start, end + 1 );
      string candKey = vectorToString ( cand );
      ok = false;
      if ( ! ( rloc.trouve ( candKey ) ) ) {
        continue;
      }

      bool any_herr = false;
      for ( int i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) {
        if ( herr[start+i] ) {
          any_herr = true;
        }
      }
      if ( any_herr == false ) {
        // Nothing to gain from moving an error-free run; try a longer one.
        ok = true;
        continue;
      }

      vector<int> movetoitVec;
      movetoitVec = rloc.getValue ( candKey );
      for ( vector<int>::iterator movetoit = movetoitVec.begin(); movetoit != movetoitVec.end(); ++movetoit ) {
        const int moveto = ( *movetoit );
        if ( ! ( ( ralign[moveto] != start ) && ( ( ralign[moveto] < start ) || ( ralign[moveto] > end ) ) && ( ( ralign[moveto] - start ) <= MAX_SHIFT_DIST ) && ( ( start - ralign[moveto] ) <= MAX_SHIFT_DIST ) ) ) {
          continue;
        }
        ok = true;

        // Only move when the reference side has an error as well.
        bool any_rerr = false;
        for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) {
          if ( rerr[moveto+i] ) {
            any_rerr = true;
          }
        }
        if ( ! any_rerr ) {
          continue;
        }

        for ( int roff = -1; roff <= ( end - start ); roff++ ) {
          terShift topush;
          bool topushNull = true;
          if ( ( roff == -1 ) && ( moveto == 0 ) ) {
            // Landing before the first reference word.
            if ( PRINT_DEBUG ) {
              cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 01 : " << endl << "Consider making " << start << "..." << end << " moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl;
            }
            topush = terShift ( start, end, -1, -1 );
            topushNull = false;
          } else if ( ( start != ralign[moveto+roff] ) && ( ( roff == 0 ) || ( ralign[moveto+roff] != ralign[moveto] ) ) ) {
            const int newloc = ralign[moveto+roff];
            if ( PRINT_DEBUG ) {
              cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 02 : " << endl << "Consider making " << start << "..." << end << " moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl;
            }
            topush = terShift ( start, end, moveto + roff, newloc );
            topushNull = false;
          }

          if ( topushNull ) {
            continue;
          }
          topush.shifted = cand;
          topush.cost = shift_cost;
          if ( PRINT_DEBUG ) {
            cerr << "BEGIN DEBUG : terCalc::GatherAllPossShifts 02 : " << endl;
            cerr << "start : " << start << endl;
            cerr << "end : " << end << endl;
            cerr << "end - start : " << end - start << endl;
            cerr << "END DEBUG " << endl;
          }
          ( allshifts.at ( end - start ) ).push_back ( topush );
        }
      }
    }
  }

  return allshifts;
}
|
|
|
|
|
|
/**
 * Applies a shift described by a terShift: forwards its start, end and
 * newloc fields to the four-argument PerformShift overload.
 *
 * @param words token sequence to shift
 * @param s     shift to apply
 * @return      result of the four-argument overload
 */
alignmentStruct terCalc::PerformShift ( vector<string> words, terShift s )
{
  return this->PerformShift ( words, s.start, s.end, s.newloc );
}
|
|
|
|
|
|
alignmentStruct terCalc::PerformShift ( vector<string> words, int start, int end, int newloc )
|
|
{
|
|
int c = 0;
|
|
vector<string> nwords ( words );
|
|
vector<vecInt> spans ( ( int ) hypSpans.size() );
|
|
alignmentStruct toreturn;
|
|
// ON EST ICI
|
|
// if((int)hypSpans.size()>0) spans = new TERintpair[(int)hypSpans.size()];
|
|
// if(DEBUG) {
|
|
if ( PRINT_DEBUG ) {
|
|
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
cerr << "BEGIN DEBUG : terCalc::PerformShift :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl << "END DEBUG " << endl;
|
|
} else {
|
|
cerr << "BEGIN DEBUG : terCalc::PerformShift :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl << "END DEBUG " << endl;
|
|
}
|
|
}
|
|
// }
|
|
|
|
if ( newloc == -1 ) {
|
|
for ( int i = start; i <= end; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = 0; i <= start - 1; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
} else {
|
|
if ( newloc < start ) {
|
|
for ( int i = 0; i <= newloc; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = start; i <= end; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = newloc + 1; i <= start - 1; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
} else {
|
|
if ( newloc > end ) {
|
|
for ( int i = 0; i <= start - 1; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = end + 1; i <= newloc; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = start; i <= end; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = newloc + 1; i < ( int ) words.size(); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
} else {
|
|
// we are moving inside of ourselves
|
|
for ( int i = 0; i <= start - 1; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = start; i <= end; i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size(); i++ ) {
|
|
nwords.at ( c++ ) = words.at ( i );
|
|
if ( ( int ) hypSpans.size() > 0 ) {
|
|
spans.at ( c - 1 ) = hypSpans.at ( i );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
NUM_SHIFTS_CONSIDERED++;
|
|
|
|
toreturn.nwords = nwords;
|
|
toreturn.aftershift = spans;
|
|
return toreturn;
|
|
}
|
|
void terCalc::setDebugMode ( bool b )
|
|
{
|
|
PRINT_DEBUG = b;
|
|
}
|
|
|
|
}
|