mosesdecoder/mert/TER/tercalc.cpp
2015-02-19 12:27:23 +00:00

1042 lines
34 KiB
C++

/*********************************
tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation.
Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
This program and library are distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU Lesser General Public License
along with this library; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
**********************************/
//
// C++ Implementation: tercalc
//
// Description:
//
//
// Author: <>, (C) 2010
//
// Copyright: See COPYING file that comes with this distribution
//
//
#include "tercalc.h"
using namespace std;
using namespace TERCPPNS_Tools;
namespace TERCPPNS_TERCpp
{
terCalc::terCalc()
{
TAILLE_PERMUT_MAX = 10;
NBR_PERMUT_MAX = 10;
infinite = 99999.0;
shift_cost = 1.0;
insert_cost = 1.0;
delete_cost = 1.0;
substitute_cost = 1.0;
match_cost = 0.0;
NBR_SEGS_EVALUATED = 0;
NBR_PERMUTS_CONSID = 0;
NBR_BS_APPELS = 0;
TAILLE_BEAM = 10;
DIST_MAX_PERMUT = 25;
PRINT_DEBUG = false;
hypSpans.clear();
refSpans.clear();
CALL_TER_ALIGN=0;
CALL_CALC_PERMUT=0;
CALL_FIND_BSHIFT=0;
MAX_LENGTH_SENTENCE=10;
S = new vector < vector < double > >(MAX_LENGTH_SENTENCE, std::vector<double>(MAX_LENGTH_SENTENCE,0.0));
P = new vector < vector < char > >(MAX_LENGTH_SENTENCE, std::vector<char>(MAX_LENGTH_SENTENCE,' '));
}
terCalc::~terCalc()
{
delete(S);
delete(P);
}
terAlignment terCalc::WERCalculation ( vector< string >& hyp , vector< string >& ref )
{
return minimizeDistanceEdition ( hyp, ref, hypSpans );
}
terAlignment terCalc::TER ( vector< int >& hyp, vector< int >& ref )
{
stringstream s;
s.str ( "" );
string stringRef ( "" );
string stringHyp ( "" );
for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) {
if ( l_it == ref.begin() ) {
s << ( *l_it );
} else {
s << " " << ( *l_it );
}
}
stringRef = s.str();
s.str ( "" );
for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) {
if ( l_itHyp == hyp.begin() ) {
s << ( *l_itHyp );
} else {
s << " " << ( *l_itHyp );
}
}
stringHyp = s.str();
s.str ( "" );
vector<string> l_vref=stringToVector ( stringRef , " " );
vector<string> l_vhyp=stringToVector ( stringHyp , " " );
return TER ( l_vhyp , l_vref);
}
hashMapInfos terCalc::createConcordMots ( vector< string >& hyp, vector< string >& ref )
{
hashMap tempHash;
hashMapInfos retour;
for ( int i = 0; i < ( int ) hyp.size(); i++ ) {
tempHash.addHasher ( hyp.at ( i ), "" );
}
bool cor[ref.size() ];
for ( int i = 0; i < ( int ) ref.size(); i++ ) {
if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) {
cor[i] = true;
} else {
cor[i] = false;
}
}
for ( int start = 0; start < ( int ) ref.size(); start++ ) {
if ( cor[start] ) {
for ( int end = start; ( ( end < ( int ) ref.size() ) && ( end - start <= TAILLE_PERMUT_MAX ) && ( cor[end] ) ); end++ ) {
vector<string> ajouter = subVector ( ref, start, end + 1 );
string ajouterString = vectorToString ( ajouter );
vector<int> values = retour.getValue ( ajouterString );
values.push_back ( start );
if ( values.size() > 1 ) {
retour.setValue ( ajouterString, values );
} else {
retour.addValue ( ajouterString, values );
}
}
}
}
return retour;
}
bool terCalc::trouverIntersection ( vecInt& refSpan, vecInt& hypSpan )
{
if ( ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) ) ) {
return true;
}
return false;
}
terAlignment terCalc::minimizeDistanceEdition ( vector< string >& hyp, vector< string >& ref, vector< vecInt >& curHypSpans )
{
double current_best = infinite;
double last_best = infinite;
int first_good = 0;
int current_first_good = 0;
int last_good = -1;
int cur_last_good = 0;
int last_peak = 0;
int cur_last_peak = 0;
int i=0;
int j=0;
int ref_size=0 ;
ref_size=( int ) ref.size();
int hyp_size=0;
hyp_size=( int ) hyp.size();
double cost, icost, dcost;
double score;
delete(S);
delete(P);
S = new vector < vector < double > >(ref_size+1, std::vector<double>(hyp_size+1,-1.0));
P = new vector < vector < char > >(ref_size+1, std::vector<char>(hyp_size+1,'0'));
NBR_BS_APPELS++;
// cerr << "Appels : " << NBR_BS_APPELS << endl;
// for ( i = 0; i <= ref_size; i++ )
// {
// for ( j = 0; j <= hyp_size; j++ )
// {
// S->at(i).at(j) = -1.0;
// P->at(i).at(j) = '0';
// }
// }
S->at(0).at(0) = 0.0;
for ( j = 0; j <= hyp_size; j++ ) {
last_best = current_best;
current_best = infinite;
first_good = current_first_good;
current_first_good = -1;
last_good = cur_last_good;
cur_last_good = -1;
last_peak = cur_last_peak;
cur_last_peak = 0;
for ( i = first_good; i <= ref_size; i++ ) {
if ( i > last_good ) {
break;
}
if ( S->at(i).at(j) < 0 ) {
continue;
}
score = S->at(i).at(j);
if ( ( j < hyp_size ) && ( score > last_best + TAILLE_BEAM ) ) {
continue;
}
if ( current_first_good == -1 ) {
current_first_good = i ;
}
if ( ( i < ref_size ) && ( j < hyp_size ) ) {
if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) {
if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) {
cost = match_cost + score;
if ( ( S->at(i+1).at(j+1) == -1 ) || ( cost < S->at(i+1).at(j+1) ) ) {
S->at(i+1).at(j+1) = cost;
P->at(i+1).at(j+1) = 'A';
}
if ( cost < current_best ) {
current_best = cost;
}
if ( current_best == cost ) {
cur_last_peak = i + 1;
}
} else {
cost = substitute_cost + score;
if ( ( S->at(i+1).at(j+1) < 0 ) || ( cost < S->at(i+1).at(j+1) ) ) {
S->at(i+1).at(j+1) = cost;
P->at(i+1).at(j+1) = 'S';
if ( cost < current_best ) {
current_best = cost;
}
if ( current_best == cost ) {
cur_last_peak = i + 1 ;
}
}
}
}
}
cur_last_good = i + 1;
if ( j < hyp_size ) {
icost = score + insert_cost;
if ( ( S->at(i).at(j+1) < 0 ) || ( S->at(i).at(j+1) > icost ) ) {
S->at(i).at(j+1) = icost;
P->at(i).at(j+1) = 'I';
if ( ( cur_last_peak < i ) && ( current_best == icost ) ) {
cur_last_peak = i;
}
}
}
if ( i < ref_size ) {
dcost = score + delete_cost;
if ( ( S->at(i+1).at(j) < 0.0 ) || ( S->at(i+1).at(j) > dcost ) ) {
S->at(i+1).at(j) = dcost;
P->at(i+1).at(j) = 'D';
if ( i >= last_good ) {
last_good = i + 1 ;
}
}
}
}
}
int tracelength = 0;
i = ref.size();
j = hyp.size();
while ( ( i > 0 ) || ( j > 0 ) ) {
tracelength++;
if ( P->at(i).at(j) == 'A' ) {
i--;
j--;
} else if ( P->at(i).at(j) == 'S' ) {
i--;
j--;
} else if ( P->at(i).at(j) == 'D' ) {
i--;
} else if ( P->at(i).at(j) == 'I' ) {
j--;
} else {
cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " << P->at(i).at(j) << endl;
exit ( -1 );
}
}
vector<char> path ( tracelength );
i = ref.size();
j = hyp.size();
while ( ( i > 0 ) || ( j > 0 ) ) {
path[--tracelength] = P->at(i).at(j);
if ( P->at(i).at(j) == 'A' ) {
i--;
j--;
} else if ( P->at(i).at(j) == 'S' ) {
i--;
j--;
} else if ( P->at(i).at(j) == 'D' ) {
i--;
} else if ( P->at(i).at(j) == 'I' ) {
j--;
}
}
terAlignment to_return;
to_return.numWords = ref_size;
to_return.alignment = path;
to_return.numEdits = S->at(ref_size).at(hyp_size);
to_return.hyp = hyp;
to_return.ref = ref;
to_return.averageWords = ref_size;
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl;
}
return to_return;
}
void terCalc::minimizeDistanceEdition ( vector< string >& hyp, vector< string >& ref, vector< vecInt >& curHypSpans, terAlignment* to_return )
{
double current_best = infinite;
double last_best = infinite;
int first_good = 0;
int current_first_good = 0;
int last_good = -1;
int cur_last_good = 0;
int last_peak = 0;
int cur_last_peak = 0;
int i=0;
int j=0;
int ref_size=0 ;
ref_size=( int ) ref.size();
int hyp_size=0;
hyp_size=( int ) hyp.size();
double cost, icost, dcost;
double score;
delete(S);
delete(P);
S = new vector < vector < double > >(ref_size+1, std::vector<double>(hyp_size+1,-1.0));
P = new vector < vector < char > >(ref_size+1, std::vector<char>(hyp_size+1,'0'));
NBR_BS_APPELS++;
// cerr << "Appels : " << NBR_BS_APPELS << endl;
// for ( i = 0; i <= ref_size; i++ )
// {
// for ( j = 0; j <= hyp_size; j++ )
// {
// S->at(i).at(j) = -1.0;
// P->at(i).at(j) = '0';
// }
// }
S->at(0).at(0) = 0.0;
for ( j = 0; j <= hyp_size; j++ ) {
last_best = current_best;
current_best = infinite;
first_good = current_first_good;
current_first_good = -1;
last_good = cur_last_good;
cur_last_good = -1;
last_peak = cur_last_peak;
cur_last_peak = 0;
for ( i = first_good; i <= ref_size; i++ ) {
if ( i > last_good ) {
break;
}
if (S->at(i).at(j) < 0 ) {
continue;
}
score = S->at(i).at(j);
if ( ( j < hyp_size ) && ( score > last_best + TAILLE_BEAM ) ) {
continue;
}
if ( current_first_good == -1 ) {
current_first_good = i ;
}
if ( ( i < ref_size ) && ( j < hyp_size ) ) {
if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) {
if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) {
cost = match_cost + score;
if ( ( S->at(i+1).at(j+1) == -1 ) || ( cost < S->at(i+1).at(j+1) ) ) {
S->at(i+1).at(j+1) = cost;
P->at(i+1).at(j+1) = 'A';
}
if ( cost < current_best ) {
current_best = cost;
}
if ( current_best == cost ) {
cur_last_peak = i + 1;
}
} else {
cost = substitute_cost + score;
if ( ( S->at(i+1).at(j+1) < 0 ) || ( cost < S->at(i+1).at(j+1) ) ) {
S->at(i+1).at(j+1) = cost;
P->at(i+1).at(j+1) = 'S';
if ( cost < current_best ) {
current_best = cost;
}
if ( current_best == cost ) {
cur_last_peak = i + 1 ;
}
}
}
}
}
cur_last_good = i + 1;
if ( j < hyp_size ) {
icost = score + insert_cost;
if ( ( S->at(i).at(j+1) < 0 ) || ( S->at(i).at(j+1) > icost ) ) {
S->at(i).at(j+1) = icost;
P->at(i).at(j+1) = 'I';
if ( ( cur_last_peak < i ) && ( current_best == icost ) ) {
cur_last_peak = i;
}
}
}
if ( i < ref_size ) {
dcost = score + delete_cost;
if ( ( S->at(i+1).at(j) < 0.0 ) || ( S->at(i+1).at(j) > dcost ) ) {
S->at(i+1).at(j) = dcost;
P->at(i+1).at(j) = 'D';
if ( i >= last_good ) {
last_good = i + 1 ;
}
}
}
}
}
int tracelength = 0;
i = ref_size;;
j = hyp_size;
while ( ( i > 0 ) || ( j > 0 ) ) {
tracelength++;
if (P->at(i).at(j) == 'A' ) {
i--;
j--;
} else if (P->at(i).at(j) == 'S' ) {
i--;
j--;
} else if (P->at(i).at(j) == 'D' ) {
i--;
} else if (P->at(i).at(j) == 'I' ) {
j--;
} else {
cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " <<P->at(i).at(j) << endl;
exit ( -1 );
}
}
vector<char> path ( tracelength );
i = ref_size;
j = hyp_size;
while ( ( i > 0 ) || ( j > 0 ) ) {
path[--tracelength] =P->at(i).at(j);
if (P->at(i).at(j) == 'A' ) {
i--;
j--;
} else if (P->at(i).at(j) == 'S' ) {
i--;
j--;
} else if (P->at(i).at(j) == 'D' ) {
i--;
} else if (P->at(i).at(j) == 'I' ) {
j--;
}
}
// terAlignment to_return;
to_return->numWords = ref_size;
to_return->alignment = path;
to_return->numEdits = S->at(ref_size).at(hyp_size);
to_return->hyp = hyp;
to_return->ref = ref;
to_return->averageWords = ref_size;
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return->toString() << endl << "END DEBUG" << endl;
}
// return to_return;
}
terAlignment terCalc::TER ( vector<string>& hyp, vector<string>& ref )
{
hashMapInfos rloc = createConcordMots ( hyp, ref );
terAlignment cur_align = minimizeDistanceEdition ( hyp, ref, hypSpans );
vector<string> cur = hyp;
cur_align.hyp = hyp;
cur_align.ref = ref;
cur_align.aftershift = hyp;
double edits = 0;
// int numshifts = 0;
vector<terShift> * allshifts=new vector<terShift>(0);
bestShiftStruct * returns=new bestShiftStruct();
// cerr << "Initial Alignment:" << endl << cur_align.toString() <<endl;
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl;
}
while ( true ) {
returns=findBestShift ( cur, hyp, ref, rloc, cur_align );
// cerr << "****************************************************************** " << returns->getEmpty() << endl;
if ( returns->getEmpty()) {
break;
}
terShift bestShift = (*(returns->m_best_shift));
cur_align = (*(returns->m_best_align));
edits += bestShift.cost;
bestShift.alignment = cur_align.alignment;
bestShift.aftershift = cur_align.aftershift;
allshifts->push_back ( bestShift );
cur = cur_align.aftershift;
delete(returns);
}
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::TER : Final to return :" << endl << cur_align.toString() << endl << "END DEBUG" << endl;
}
terAlignment to_return;
to_return = cur_align;
to_return.allshifts = (*(allshifts));
to_return.numEdits += edits;
NBR_SEGS_EVALUATED++;
return to_return;
}
bestShiftStruct * terCalc::findBestShift ( vector<string>& cur, vector<string>& hyp, vector<string>& ref, hashMapInfos& rloc, terAlignment& med_align )
{
CALL_FIND_BSHIFT++;
// cerr << "CALL_FIND_BSHIFT " << CALL_FIND_BSHIFT <<endl;
// to_return->m_empty = new bool(false);
bool anygain = false;
vector <bool> * herr = new vector<bool>(( int ) hyp.size() + 1 );
vector <bool> * rerr = new vector<bool>( ( int ) ref.size() + 1 );
vector <int> * ralign = new vector<int>( ( int ) ref.size() + 1 );
int l_i,i,j,s;
for (i = 0 ; i< ( int ) hyp.size() + 1 ; i++) {
herr->at(i)=false;
}
for (i = 0 ; i< ( int ) ref.size() + 1 ; i++) {
rerr->at(i)=false;
ralign->at(i)=-1;
}
calculateTerAlignment ( med_align, herr, rerr, ralign );
vector<vecTerShift> * poss_shifts = new vector< vector<terShift> >(0) ;
terAlignment * cur_best_align = new terAlignment();
terShift * cur_best_shift = new terShift();
double cur_best_shift_cost = 0.0;
vector<string> shiftarr;
vector<vecInt> curHypSpans;
terShift * curshift = new terShift();
alignmentStruct shiftReturns;
terAlignment * curalign = new terAlignment() ;
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::findBestShift (after the calculateTerAlignment call) :" << endl;
cerr << "indices: ";
for (l_i=0; l_i < ( int ) ref.size() ; l_i++) {
cerr << l_i << "\t";
}
cerr << endl;
cerr << "hyp : \t"<<vectorToString(hyp ,"\t") << endl;
cerr << "cur : \t"<<vectorToString(cur ,"\t") << endl;
cerr << "ref : \t"<<vectorToString(ref ,"\t") << endl;
cerr << "herr : "<<vectorToString(herr,"\t",( int ) hyp.size()) << " | " << ( int ) hyp.size() <<endl;
cerr << "rerr : "<<vectorToString(rerr,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() <<endl;
cerr << "ralign : "<< vectorToString(ralign,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() << endl;
cerr << "END DEBUG " << endl;
}
poss_shifts = calculerPermutations ( cur, ref, rloc, med_align, herr, rerr, ralign );
double curerr = med_align.numEdits;
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
cerr << "Possible Shifts:" << endl;
for ( i = ( int ) poss_shifts->size() - 1; i >= 0; i-- ) {
for ( j = 0; j < ( int ) ( poss_shifts->at ( i ) ).size(); j++ ) {
cerr << " [" << i << "] " << ( ( poss_shifts->at ( i ) ).at ( j ) ).toString() << endl;
}
}
cerr << endl;
cerr << "END DEBUG " << endl;
}
// exit(0);
cur_best_align->set(med_align);
for ( i = ( int ) poss_shifts->size() - 1; i >= 0; i-- ) {
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
cerr << "Considering shift of length " << i << " (" << ( poss_shifts->at ( i ) ).size() << ")" << endl;
cerr << "END DEBUG " << endl;
}
/* Consider shifts of length i+1 */
double curfix = curerr - ( cur_best_shift_cost + cur_best_align->numEdits );
double maxfix = ( 2 * ( 1 + i ) );
if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
break;
} else {
for ( s = 0; s < ( int ) ( poss_shifts->at ( i ) ).size(); s++ ) {
curfix = curerr - ( cur_best_shift_cost + cur_best_align->numEdits );
if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
break;
} else {
curshift->set(( poss_shifts->at ( i ) ).at ( s ));
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
cerr << "cur : "<< join(" ",cur) << endl;
cerr << "shift size : "<< i << endl;
cerr << "shift number : "<< s << endl;
cerr << "size of shift size : "<< ( int ) ( poss_shifts->at ( i ) ).size() << endl;
cerr << "curshift : "<< curshift->toString() << endl;
}
// alignmentStruct shiftReturns;
shiftReturns.set(permuter ( cur, curshift ));
shiftarr = shiftReturns.nwords;
curHypSpans = shiftReturns.aftershift;
if ( PRINT_DEBUG ) {
cerr << "shiftarr : "<< join(" ",shiftarr) << endl;
cerr << "curHypSpans size : "<< (int)curHypSpans.size() << endl;
cerr << "END DEBUG " << endl;
}
// terAlignment tmp=minimizeDistanceEdition ( shiftarr, ref, curHypSpans );
minimizeDistanceEdition ( shiftarr, ref, curHypSpans, curalign );
// curalign->set(tmp);
curalign->hyp = hyp;
curalign->ref = ref;
curalign->aftershift = shiftarr;
double gain = ( cur_best_align->numEdits + cur_best_shift_cost ) - ( curalign->numEdits + curshift->cost );
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
cerr << "Gain for " << curshift->toString() << " is " << gain << ". (result: [" << curalign->join ( " ", shiftarr ) << "]" << endl;
cerr << "Details of gains : gain = ( cur_best_align->numEdits + cur_best_shift_cost ) - ( curalign->numEdits + curshift->cost )"<<endl;
cerr << "Details of gains : gain = ("<<cur_best_align->numEdits << "+" << cur_best_shift_cost << ") - (" << curalign->numEdits << "+" << curshift->cost << ")"<<endl;
cerr << "" << curalign->toString() << "\n" << endl;
cerr << "END DEBUG " << endl;
}
if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) {
anygain = true;
cur_best_shift->set(curshift);
cur_best_shift_cost = curshift->cost;
cur_best_align->set(curalign);
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
cerr << "Tmp Choosing shift: " << cur_best_shift->toString() << " gives:\n" << cur_best_align->toString() << "\n" << endl;
cerr << "END DEBUG " << endl;
}
}
}
}
}
}
bestShiftStruct * to_return=new bestShiftStruct();
if ( anygain ) {
to_return->setEmpty(false);
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
cerr << "Final shift chosen : " << cur_best_shift->toString() << " gives:\n" << cur_best_align->toString() << "\n" << endl;
cerr << "END DEBUG " << endl;
}
to_return->m_best_shift->set(cur_best_shift);
// terAlignment tmp=cur_best_align;
// cur_best_align->toString();
// to_return.m_best_align.toString();
// if ((int)cur_best_align->alignment.size() == 0)
// {
// to_return.m_best_align = cur_best_align;
// }
// else
// {
// cerr << "Warning: cur_best_align->alignment.size() = 0 !!!"<<endl;
//
// }
to_return->m_best_align->set(cur_best_align);
// to_return.m_best_align.toString();
} else {
to_return->setEmpty(true);
}
// // cerr << to_return->toString() << endl;
delete(poss_shifts);
delete(cur_best_align);
delete(cur_best_shift);
delete(curshift);
delete(curalign) ;
return to_return;
}
void terCalc::calculateTerAlignment ( terAlignment& align, vector<bool>* herr, vector<bool>* rerr, vector<int>* ralign )
{
int hpos = -1;
int rpos = -1;
CALL_TER_ALIGN++;
// cerr << "CALL_TER_ALIGN " << CALL_TER_ALIGN << endl;
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::calculateTerAlignment : " << endl << align.toString() << endl;
cerr << "END DEBUG " << endl;
}
// cerr << (int)herr->size() <<endl;
// cerr << (int)rerr->size() <<endl;
// cerr << ( int ) align.alignment.size() <<endl;
// for ( int i = 0; i < ( int ) align.alignment.size(); i++ )
// {
// herr->at(i) = false;
// rerr->at(i) = false;
// ralign->at(i) = -1;
// }
for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) {
char sym = align.alignment.at(i);
if ( sym == 'A' ) {
hpos++;
rpos++;
herr->at(hpos) = false;
rerr->at(rpos) = false;
ralign->at(rpos) = hpos;
} else if ( sym == 'S' ) {
hpos++;
rpos++;
herr->at(hpos) = true;
rerr->at(rpos) = true;
ralign->at(rpos) = hpos;
} else if ( sym == 'I' ) {
hpos++;
herr->at(hpos) = true;
} else if ( sym == 'D' ) {
rpos++;
rerr->at(rpos) = true;
ralign->at(rpos) = hpos+1;
} else {
cerr << "ERROR : terCalc::calculateTerAlignment : Invalid mini align sequence " << sym << " at pos " << i << endl;
exit ( -1 );
}
}
}
vector<vecTerShift> * terCalc::calculerPermutations ( vector< string >& hyp, vector< string >& ref, hashMapInfos& rloc, TERCPPNS_TERCpp::terAlignment& align, vector<bool>* herr, vector<bool>* rerr, vector<int>* ralign )
{
vector<vecTerShift> * allshifts = new vector<vecTerShift>(0);
// to_return.clear();
CALL_CALC_PERMUT++;
// cerr << "CALL_CALC_PERMUT " << CALL_CALC_PERMUT << endl;
if ( ( TAILLE_PERMUT_MAX <= 0 ) || ( DIST_MAX_PERMUT <= 0 ) ) {
return allshifts;
}
allshifts = new vector<vecTerShift>( TAILLE_PERMUT_MAX + 1 );
int start=0;
int end=0;
bool ok = false;
vector<int> mtiVec(0);
vector<int>::iterator mti;
int moveto=0;
vector<string> cand(0);
bool any_herr = false;
bool any_rerr = false;
int i=0;
int l_nbr_permuts=0;
// for (i=0; i< (int)ref.size() +1 ; i++) {cerr << " " << ralign[i] ;} cerr <<endl;
vector<int> movetoitVec(0);
string subVectorHypString="";
terShift * topush;
for ( start = 0; start < ( int ) hyp.size(); start++ ) {
subVectorHypString = vectorToString ( subVector ( hyp, start, start + 1 ) );
if ( ! rloc.trouve ( subVectorHypString ) ) {
continue;
}
ok = false;
mtiVec = rloc.getValue ( subVectorHypString );
mti = mtiVec.begin();
while ( mti != mtiVec.end() && ( ! ok ) ) {
moveto = ( *mti );
mti++;
if ( ( start != ralign->at(moveto) ) && ( ( ralign->at(moveto) - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign->at(moveto) - 1 ) <= DIST_MAX_PERMUT ) ) {
ok = true;
}
}
if ( ! ok ) {
continue;
}
ok = true;
for ( end = start; ( ok && ( end < ( int ) hyp.size() ) && ( end < start + TAILLE_PERMUT_MAX ) ); end++ ) {
/* check if cand is good if so, add it */
cand = subVector ( hyp, start, end + 1 );
ok = false;
if ( ! ( rloc.trouve ( vectorToString ( cand ) ) ) ) {
continue;
}
any_herr = false;
for ( i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) {
if ( herr->at(start+i) ) {
any_herr = true;
}
}
if ( any_herr == false ) {
ok = true;
continue;
}
movetoitVec = rloc.getValue ( ( string ) vectorToString ( cand ) );
// cerr << "CANDIDATE " << ( string ) vectorToString ( cand ) <<" PLACED : " << ( string ) vectorToString ( movetoitVec," ") << endl;
vector<int>::iterator movetoit;
movetoit = movetoitVec.begin();
while ( movetoit != movetoitVec.end() ) {
moveto = ( *movetoit );
movetoit++;
if ( ! ( ( ralign->at(moveto) != start ) && ( ( ralign->at(moveto) < start ) || ( ralign->at(moveto) > end ) ) && ( ( ralign->at(moveto) - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign->at(moveto) ) <= DIST_MAX_PERMUT ) ) ) {
continue;
}
ok = true;
/* check to see if there are any errors in either string
(only move if this is the case!)
*/
any_rerr = false;
for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) {
if ( rerr->at(moveto+i) ) {
any_rerr = true;
}
}
if ( ! any_rerr ) {
continue;
}
for ( int roff = -1; roff <= ( end - start ); roff++ ) {
topush = new terShift();
bool topushNull = true;
if ( ( roff == -1 ) && ( moveto == 0 ) ) {
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::calculerPermutations 01 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl;
}
// terShift t01 ( start, end, -1, -1 );
// topush = t01;
topush->start=start;
topush->end=end;
topush->moveto=-1;
topush->newloc=-1;
topushNull = false;
} else if ( ( start != ralign->at(moveto+roff) ) && ( ( roff == 0 ) || ( ralign->at(moveto+roff) != ralign->at(moveto) ) ) ) {
int newloc = ralign->at(moveto+roff);
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl;
}
// terShift t02 ( start, end, moveto + roff, newloc );
// topush = t02;
topush->start=start;
topush->end=end;
topush->moveto=moveto + roff;
topush->newloc=newloc;
topushNull = false;
}
if ( !topushNull ) {
topush->shifted = cand;
topush->cost = shift_cost;
l_nbr_permuts++;
if ( PRINT_DEBUG ) {
cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl;
cerr << "start : " << start << endl;
cerr << "end : " << end << endl;
cerr << "end - start : " << end - start << endl;
cerr << "nbr Permutations added: " << l_nbr_permuts << endl;
cerr << "END DEBUG " << endl;
}
if (l_nbr_permuts < NBR_PERMUT_MAX + 1) {
( allshifts->at ( end - start ) ).push_back ( (*(topush)) );
}
// else
// {
// break;
// }
}
delete(topush);
}
}
}
}
// to_return.clear();
// for ( int i = 0; i < TAILLE_PERMUT_MAX + 1; i++ )
// {
// to_return.push_back ( ( vecTerShift ) allshifts.at ( i ) );
// }
return allshifts;
}
alignmentStruct terCalc::permuter ( vector< string >& words, TERCPPNS_TERCpp::terShift& s )
{
return permuter ( words, s.start, s.end, s.newloc );
}
alignmentStruct terCalc::permuter ( vector< string >& words, TERCPPNS_TERCpp::terShift* s )
{
return permuter ( words, s->start, s->end, s->newloc );
}
alignmentStruct terCalc::permuter ( vector< string >& words, int start, int end, int newloc )
{
int c = 0;
vector<string> nwords ( words );
vector<vecInt> spans ( ( int ) hypSpans.size() );
alignmentStruct to_return;
if ( PRINT_DEBUG ) {
if ( ( int ) hypSpans.size() > 0 ) {
cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl ;
} else {
cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl ;
}
cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << join(" ",words) << " start: " << start << " end: " << end << " newloc "<< newloc << endl << "END DEBUG " << endl;
}
if (newloc >= ( int ) words.size()) {
if ( PRINT_DEBUG ) {
cerr << "WARNING: Relocation over the size of the hypothesis, replacing at the end of it."<<endl;
}
newloc = ( int ) words.size()-1;
}
// }
if ( newloc == -1 ) {
for ( int i = start; i <= end; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = 0; i <= start - 1; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
} else {
if ( newloc < start ) {
for ( int i = 0; i < newloc; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = start; i <= end; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = newloc ; i < start ; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
} else {
if ( newloc > end ) {
for ( int i = 0; i <= start - 1; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = end + 1; i <= newloc; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = start; i <= end; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = newloc + 1; i < ( int ) words.size(); i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
} else {
// we are moving inside of ourselves
for ( int i = 0; i <= start - 1; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = start; i <= end; i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size(); i++ ) {
nwords.at ( c++ ) = words.at ( i );
if ( ( int ) hypSpans.size() > 0 ) {
spans.at ( c - 1 ) = hypSpans.at ( i );
}
}
}
}
}
NBR_PERMUTS_CONSID++;
if ( PRINT_DEBUG ) {
cerr << "nwords" << join(" ",nwords) << endl;
// cerr << "spans" << spans. << endl;
}
to_return.nwords = nwords;
to_return.aftershift = spans;
return to_return;
}
void terCalc::setDebugMode ( bool b )
{
PRINT_DEBUG = b;
}
}