// moses/FF/OSM-Feature/osmHyp.cpp
#include "osmHyp.h"
2013-06-24 15:29:33 +04:00
#include <sstream>
2013-06-25 19:51:56 +04:00
using namespace std;
2013-06-25 20:08:15 +04:00
using namespace lm::ngram;
2013-06-25 19:51:56 +04:00
2013-06-24 15:29:33 +04:00
namespace Moses
{
2013-06-25 19:27:41 +04:00
// Creates a state that holds a copy of the language-model state `val`,
// with both position counters (j and E) starting at zero.
osmState::osmState(const State & val)
  : j(0)
  , E(0)
  , lmState(val)
{
}
2013-07-01 16:10:58 +04:00
void osmState::saveState(int jVal, int eVal, map <int , string> & gapVal)
2013-06-24 15:29:33 +04:00
{
2013-07-04 23:19:51 +04:00
gap.clear();
gap = gapVal;
j = jVal;
E = eVal;
2013-06-24 15:29:33 +04:00
}
int osmState::Compare(const FFState& otherBase) const
{
const osmState &other = static_cast<const osmState&>(otherBase);
if (j != other.j)
return (j < other.j) ? -1 : +1;
if (E != other.E)
return (E < other.E) ? -1 : +1;
if (gap != other.gap)
return (gap < other.gap) ? -1 : +1;
2013-06-25 19:27:41 +04:00
if (lmState.length < other.lmState.length) return -1;
2013-07-04 23:19:51 +04:00
2013-06-25 19:27:41 +04:00
if (lmState.length > other.lmState.length) return 1;
2013-06-24 15:29:33 +04:00
return 0;
}
// Returns the fixed label "done" for every state.
std::string osmState::getName() const
{
  return std::string("done");
}
//////////////////////////////////////////////////
// Starts a hypothesis with all feature counters and both position counters
// at zero and with an empty gap table.
osmHypothesis::osmHypothesis()
{
  // Feature values reported by populateScores().
  opProb = 0;
  gapWidth = 0;
  gapCount = 0;
  openGapCount = 0;
  deletionCount = 0;

  // Position bookkeeping.
  j = 0;
  E = 0;
  gap.clear();
}
void osmHypothesis :: setState(const FFState* prev_state)
{
2013-07-04 23:19:51 +04:00
if(prev_state != NULL) {
2013-06-24 15:29:33 +04:00
2013-07-04 23:19:51 +04:00
j = static_cast <const osmState *> (prev_state)->getJ();
E = static_cast <const osmState *> (prev_state)->getE();
gap = static_cast <const osmState *> (prev_state)->getGap();
lmState = static_cast <const osmState *> (prev_state)->getLMState();
}
2013-06-24 15:29:33 +04:00
}
// Packs the current LM state, j, E and gap table into a newly allocated
// osmState. The object is created with `new` and not retained here, so the
// caller is responsible for releasing it.
osmState * osmHypothesis::saveState()
{
  osmState * snapshot = new osmState(lmState);
  snapshot->saveState(j, E, gap);
  return snapshot;
}
// Tells whether operations[x] is something other than a reordering operation.
//
// Returns 0 when the operation string contains one of the markers
// "_JMP_BCK_", "_JMP_FWD_", "_CONT_CEPT_" or "_INS_GAP_", and 1 otherwise.
// `x` is used as an index into `operations` without a range check.
int osmHypothesis::isTranslationOperation(int x)
{
  static const char * const kReorderingMarkers[] = {
    "_JMP_BCK_",
    "_JMP_FWD_",
    "_CONT_CEPT_",
    "_INS_GAP_"
  };
  static const size_t kNumMarkers =
    sizeof(kReorderingMarkers) / sizeof(kReorderingMarkers[0]);

  const std::string & op = operations[x];
  for (size_t m = 0; m < kNumMarkers; ++m) {
    // find() reports "not found" as std::string::npos, not as a signed -1.
    if (op.find(kReorderingMarkers[m]) != std::string::npos)
      return 0;
  }
  return 1;
}
void osmHypothesis :: removeReorderingOperations()
{
2013-07-04 23:19:51 +04:00
gapCount = 0;
deletionCount = 0;
openGapCount = 0;
gapWidth = 0;
std::vector <std::string> tupleSequence;
for (int x = 0; x < operations.size(); x++) {
// cout<<operations[x]<<endl;
if(isTranslationOperation(x) == 1) {
tupleSequence.push_back(operations[x]);
}
}
operations.clear();
operations = tupleSequence;
2013-06-24 15:29:33 +04:00
}
// Scores the whole operation sequence with the operation model `ptrOp`.
//
// Starting from the stored LM state, every operation is scored in order and
// the results are summed into opProb (which is reset to 0 first). The LM
// state reached after the last operation replaces the stored one.
void osmHypothesis::calculateOSMProb(OSMLM& ptrOp)
{
  opProb = 0;

  State outState = lmState;
  for (int k = 0; k < operations.size(); ++k) {
    // The state produced by the previous step is the input of this one.
    State inState = outState;
    opProb += ptrOp.Score(inState, operations[k], outState);
  }

  lmState = outState;
}
2013-06-24 15:29:33 +04:00
// Returns the index of the first entry of `coverageVector` that equals 0,
// or -1 when no entry is 0 (including when the vector is empty).
int osmHypothesis::firstOpenGap(vector <int> & coverageVector)
{
  int pos = 0;
  while (pos < coverageVector.size() && coverageVector[pos] != 0) {
    ++pos;
  }
  return (pos < coverageVector.size()) ? pos : -1;
}
// Formats `num` as text by streaming it into a string stream.
string osmHypothesis::intToString(int num)
{
  std::ostringstream buffer;
  buffer << num;
  return buffer.str();
}
void osmHypothesis :: generateOperations(int & startIndex , int j1 , int contFlag , WordsBitmap & coverageVector , string english , string german , set <int> & targetNullWords , vector <string> & currF)
{
2013-07-04 23:19:51 +04:00
int gFlag = 0;
int gp = 0;
int ans;
if ( j < j1) { // j1 is the index of the source word we are about to generate ...
//if(coverageVector[j]==0) // if source word at j is not generated yet ...
if(coverageVector.GetValue(j)==0) { // if source word at j is not generated yet ...
operations.push_back("_INS_GAP_");
gFlag++;
gap[j]="Unfilled";
}
if (j == E) {
j = j1;
} else {
operations.push_back("_JMP_FWD_");
j=E;
}
}
if (j1 < j) {
// if(j < E && coverageVector[j]==0)
if(j < E && coverageVector.GetValue(j)==0) {
operations.push_back("_INS_GAP_");
gFlag++;
gap[j]="Unfilled";
}
j=closestGap(gap,j1,gp);
operations.push_back("_JMP_BCK_"+ intToString(gp));
//cout<<"I am j "<<j<<endl;
//cout<<"I am j1 "<<j1<<endl;
if(j==j1)
gap[j]="Filled";
}
if (j < j1) {
operations.push_back("_INS_GAP_");
gap[j] = "Unfilled";
gFlag++;
j=j1;
}
if(contFlag == 0) { // First words of the multi-word cept ...
if(english == "_TRANS_SLF_") { // Unknown word ...
operations.push_back("_TRANS_SLF_");
} else {
operations.push_back("_TRANS_" + english + "_TO_" + german);
}
//ans = firstOpenGap(coverageVector);
ans = coverageVector.GetFirstGapPos();
if (ans != -1)
gapWidth += j - ans;
} else if (contFlag == 2) {
operations.push_back("_INS_" + german);
ans = coverageVector.GetFirstGapPos();
if (ans != -1)
gapWidth += j - ans;
deletionCount++;
} else {
operations.push_back("_CONT_CEPT_");
}
//coverageVector[j]=1;
coverageVector.SetValue(j,1);
j+=1;
if(E<j)
E=j;
if (gFlag > 0)
gapCount++;
openGapCount += getOpenGaps();
//if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end())
2014-06-02 19:23:55 +04:00
if (j < coverageVector.GetSize()) {
if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end()) {
j1 = j;
german = currF[j1-startIndex];
english = "_INS_";
generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF);
}
2013-07-04 23:19:51 +04:00
}
2013-06-24 15:29:33 +04:00
}
void osmHypothesis :: print()
{
2013-07-04 23:19:51 +04:00
for (int i = 0; i< operations.size(); i++) {
cerr<<operations[i]<<" ";
}
2013-06-24 15:29:33 +04:00
2013-07-04 23:19:51 +04:00
cerr<<endl<<endl;
2013-06-24 15:29:33 +04:00
2013-07-04 23:19:51 +04:00
cerr<<"Operation Probability "<<opProb<<endl;
cerr<<"Gap Count "<<gapCount<<endl;
cerr<<"Open Gap Count "<<openGapCount<<endl;
cerr<<"Gap Width "<<gapWidth<<endl;
cerr<<"Deletion Count "<<deletionCount<<endl;
2013-06-24 15:29:33 +04:00
2013-07-04 23:19:51 +04:00
cerr<<"_______________"<<endl;
2013-06-24 15:29:33 +04:00
}
// Finds the "Unfilled" gap to jump back to for source position j1.
//
// The gap table is scanned from the highest position to the lowest while
// counting the "Unfilled" entries seen so far.
//   - If position j1 itself is "Unfilled", j1 is returned.
//   - Otherwise the nearest "Unfilled" position below j1 is returned, provided
//     it is less than kMaxDistance positions away.
//   - If neither exists (including when the table is empty), -1 is returned.
//
//   gap - gap table to search (taken by value)
//   j1  - source position we want to reach
//   gp  - output: running count of "Unfilled" entries at the moment the
//         returned position was reached; 0 when -1 is returned
int osmHypothesis::closestGap(map <int,string> gap, int j1, int & gp)
{
  // Distance limit kept from the original implementation.
  // NOTE(review): presumably an upper bound on sentence length - confirm.
  const int kMaxDistance = 1172;

  int bestDistance = kMaxDistance;
  int bestPosition = -1;
  int unfilledSeen = 0;
  gp = 0;

  // Reverse iteration is well defined on an empty table, unlike decrementing
  // end() inside a do/while loop.
  for (map <int,string>::reverse_iterator it = gap.rbegin(); it != gap.rend(); ++it) {
    if (it->second != "Unfilled")
      continue;

    ++unfilledSeen;

    if (it->first == j1) {
      gp = unfilledSeen;
      return j1;
    }

    // Only gaps to the left of j1 are candidates.
    if (it->first < j1 && (j1 - it->first) < bestDistance) {
      bestDistance = j1 - it->first;
      bestPosition = it->first;
      gp = unfilledSeen;
    }
  }

  return bestPosition;
}
int osmHypothesis :: getOpenGaps()
{
2013-07-04 23:19:51 +04:00
map <int,string> :: iterator iter;
2013-06-24 15:29:33 +04:00
2013-07-04 23:19:51 +04:00
int nd = 0;
for (iter = gap.begin(); iter!=gap.end(); iter++) {
if(iter->second == "Unfilled")
nd++;
}
2013-06-24 15:29:33 +04:00
2013-07-04 23:19:51 +04:00
return nd;
2013-06-24 15:29:33 +04:00
}
void osmHypothesis :: generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes)
{
2013-07-04 23:19:51 +04:00
operations.push_back("_DEL_" + english);
currTargetIndex++;
2013-06-24 15:29:33 +04:00
2013-07-04 23:19:51 +04:00
while(doneTargetIndexes.find(currTargetIndex) != doneTargetIndexes.end()) {
currTargetIndex++;
}
2013-06-24 15:29:33 +04:00
2013-07-04 23:19:51 +04:00
if (sourceNullWords.find(currTargetIndex) != sourceNullWords.end()) {
english = currE[currTargetIndex];
generateDeleteOperations(english,currTargetIndex,doneTargetIndexes);
}
2013-06-24 15:29:33 +04:00
}
2013-06-25 19:54:41 +04:00
void osmHypothesis :: computeOSMFeature(int startIndex , WordsBitmap & coverageVector)
2013-06-24 15:29:33 +04:00
{
2013-07-04 23:19:51 +04:00
set <int> doneTargetIndexes;
set <int> eSide;
set <int> fSide;
set <int> :: iterator iter;
string english;
string source;
int j1;
int start = 0;
int targetIndex = 0;
doneTargetIndexes.clear();
if (targetNullWords.size() != 0) { // Source words to be deleted in the start of this phrase ...
iter = targetNullWords.begin();
if (*iter == startIndex) {
j1 = startIndex;
source = currF[j1-startIndex];
english = "_INS_";
generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF);
}
}
if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) { // first word has to be deleted ...
english = currE[targetIndex];
generateDeleteOperations(english,targetIndex, doneTargetIndexes);
}
for (int i = 0; i < ceptsInPhrase.size(); i++) {
source = "";
english = "";
fSide = ceptsInPhrase[i].first;
eSide = ceptsInPhrase[i].second;
iter = eSide.begin();
targetIndex = *iter;
english += currE[*iter];
iter++;
for (; iter != eSide.end(); iter++) {
if(*iter == targetIndex+1)
targetIndex++;
else
doneTargetIndexes.insert(*iter);
english += "^_^";
english += currE[*iter];
}
iter = fSide.begin();
source += currF[*iter];
iter++;
for (; iter != fSide.end(); iter++) {
source += "^_^";
source += currF[*iter];
}
iter = fSide.begin();
j1 = *iter + startIndex;
iter++;
generateOperations(startIndex, j1, 0 , coverageVector , english , source , targetNullWords , currF);
for (; iter != fSide.end(); iter++) {
j1 = *iter + startIndex;
generateOperations(startIndex, j1, 1 , coverageVector , english , source , targetNullWords , currF);
}
targetIndex++; // Check whether the next target word is unaligned ...
while(doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end()) {
targetIndex++;
}
if(sourceNullWords.find(targetIndex) != sourceNullWords.end()) {
english = currE[targetIndex];
generateDeleteOperations(english,targetIndex, doneTargetIndexes);
}
}
//removeReorderingOperations();
//print();
2013-06-24 15:29:33 +04:00
}
// Grows eSide and fSide until they are closed under the two link maps.
//
//   eSide - keys into tS; must already contain the seed index
//   fSide - keys into sT; receives every index linked to eSide
//   tS    - for a key taken from eSide, the linked indexes that go into fSide
//   sT    - for a key taken from fSide, the linked indexes that go into eSide
//
// Each pass adds tS[e] for every e in eSide to fSide, then sT[f] for every f
// in fSide to eSide, and the passes repeat for as long as eSide keeps growing.
// Lookups use operator[], so a key missing from tS or sT is inserted there
// with an empty list.
void osmHypothesis::getMeCepts ( set <int> & eSide , set <int> & fSide , map <int , vector <int> > & tS , map <int , vector <int> > & sT)
{
  size_t sizeBefore;

  do {
    sizeBefore = eSide.size();

    for (set <int>::iterator e = eSide.begin(); e != eSide.end(); ++e) {
      const vector <int> & linked = tS[*e];
      fSide.insert(linked.begin(), linked.end());
    }

    for (set <int>::iterator f = fSide.begin(); f != fSide.end(); ++f) {
      const vector <int> & linked = sT[*f];
      eSide.insert(linked.begin(), linked.end());
    }
  } while (eSide.size() > sizeBefore);
}
// Groups the alignment points of a phrase into cepts and records the
// unaligned words on both sides.
//
//   align              - flat list of alignment points stored as consecutive
//                        (source, target) pairs; a trailing unpaired element
//                        is ignored
//   startIndex         - first source position covered by the phrase
//   endIndex           - last source position covered by the phrase (inclusive)
//   targetPhraseLength - number of target words in the phrase
//
// Results:
//   targetNullWords - gains every position in [startIndex, endIndex] whose
//                     phrase-relative index has no alignment point
//   sourceNullWords - gains every target index in [0, targetPhraseLength)
//                     that has no alignment point
//   ceptsInPhrase   - gains one (source indexes, target indexes) pair per
//                     group of alignment points linked through getMeCepts()
void osmHypothesis::constructCepts(vector <int> & align , int startIndex , int endIndex, int targetPhraseLength)
{
  std::map <int , vector <int> > sT;   // source index -> linked target indexes
  std::map <int , vector <int> > tS;   // target index -> linked source indexes
  std::set <int> eSide;
  std::set <int> fSide;
  std::set <int> :: iterator iter;
  std :: pair < set <int> , set <int> > cept;

  // Stop before a dangling last element so that align[i + 1] is always valid.
  for (size_t i = 0; i + 1 < align.size(); i += 2) {
    const int src = align[i];
    const int tgt = align[i + 1];
    tS[tgt].push_back(src);
    sT[src].push_back(tgt);
  }

  for (int i = startIndex; i <= endIndex; i++) { // What are unaligned source words in this phrase ...
    if (sT.find(i - startIndex) == sT.end()) {
      targetNullWords.insert(i);
    }
  }

  for (int i = 0; i < targetPhraseLength; i++) { // What are unaligned target words in this phrase ...
    if (tS.find(i) == tS.end()) {
      sourceNullWords.insert(i);
    }
  }

  // Repeatedly take the smallest remaining target index as a seed, collect
  // everything linked to it, and remove the collected points from both maps.
  while (!tS.empty() && !sT.empty()) {
    eSide.clear();
    fSide.clear();
    eSide.insert(tS.begin()->first);

    getMeCepts(eSide, fSide, tS, sT);

    // Erasing by key is a no-op for a missing key, whereas erasing the result
    // of a failed find() is undefined.
    for (iter = eSide.begin(); iter != eSide.end(); iter++) {
      tS.erase(*iter);
    }
    for (iter = fSide.begin(); iter != fSide.end(); iter++) {
      sT.erase(*iter);
    }

    cept = make_pair (fSide , eSide);
    ceptsInPhrase.push_back(cept);
  }

  /*
  cerr<<"Extracted Cepts "<<endl;
  for (int i = 0; i < ceptsInPhrase.size(); i++)
  {
    fSide = ceptsInPhrase[i].first;
    eSide = ceptsInPhrase[i].second;

    for (iter = eSide.begin(); iter != eSide.end(); iter++)
    {
      cerr<<*iter<<" ";
    }
    cerr<<"<---> ";

    for (iter = fSide.begin(); iter != fSide.end(); iter++)
    {
      cerr<<*iter<<" ";
    }

    cerr<<endl;
  }
  cerr<<endl;

  cerr<<"Unaligned Target Words"<<endl;

  for (iter = sourceNullWords.begin(); iter != sourceNullWords.end(); iter++)
    cerr<<*iter<<"<--->"<<endl;

  cerr<<"Unaligned Source Words"<<endl;

  for (iter = targetNullWords.begin(); iter != targetNullWords.end(); iter++)
    cerr<<*iter<<"<--->"<<endl;
  */
}
2013-08-25 16:23:42 +04:00
// Replaces the contents of `scores` with this hypothesis' feature values.
//
// The operation probability always comes first. Unless numFeatures is
// exactly 1, it is followed by gapWidth, gapCount, openGapCount and
// deletionCount, in that order.
void osmHypothesis::populateScores(vector <float> & scores , const int numFeatures)
{
  scores.clear();
  scores.push_back(opProb);

  if (numFeatures != 1) {
    scores.push_back(gapWidth);
    scores.push_back(gapCount);
    scores.push_back(openGapCount);
    scores.push_back(deletionCount);
  }
}
} // namespace